
    )`i                         d Z ddlZddlmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZmZ ddlmZ dd	lmZmZmZmZmZmZmZmZmZmZ d
ej        dej        dej        fdZ G d d          Z G d d          ZdS )a3  
Copyright (c) 2024 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)OptionalTupleUnion   )flashinfer_api)get_batch_decode_module)_compute_page_mask_indptrget_batch_prefill_module)segment_packbits)
MaskModePosEncodingModeTensorLayout_check_pos_encoding_modecheck_shape_dtype_device_get_cache_alibi_slopes_bufcanonicalize_torch_dtypedetermine_attention_backenddevice_support_pdl	is_float8maskindptrreturnc                 t   | j         \  }}}t          |          dz
  }t          j        ||z  |z  f| j        | j                  }t          |          D ]b}| ||         ||dz                                         dd                              d          |||         |z  |z  ||dz            |z  |z  <   c|S )a|  Convert mask from BSR data layout to flashinfer's flattened mask layout.

    Parameters
    ----------
    mask : torch.Tensor
        A boolean mask tensor with shape ``(nnz, R, C)``.
    indptr : torch.Tensor
        The indptr tensor in BSR format.

    Returns
    -------
    flattened_mask : torch.Tensor
        A flattenedd mask tensor with shape ``(nnz * R * C,)``.
    r   dtypedevicer   )	shapelentorchemptyr   r   range	transposereshape)r   r   nnzRCMBmask_flashinferis           e/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/sparse.pyconvert_bsr_mask_layoutr,   (   s     
ICA	VqBk37Q;.
4;WWWO2YY 
 
VAE]*+55a;;CCBGG 	q	A)F1q5MA,=,AABB     c            0          e Zd ZdZe	 d2dej        deddfd            Zdej        dej        ddfd	Z	e	 	 	 	 	 	 	 	 	 	 	 	 	 d3dej        dej        de
de
de
de
de
de
de
deej                 deej                 dedededee         dee         dee         dee         d eeej        f         d!eeeej        f                  d"eeej        f         d#eddf.d$            ZeZ	 	 	 	 	 	 	 	 	 d4d%ej        d&ej        d'ej        d(eej                 d)eej                 d*eej                 dededee         dee         dee         dee         dej        fd+Ze	 	 	 	 	 	 	 d5d%ej        d&ej        d'ej        d(eej                 d)eej                 d*eej                 d,eej                 d-eej                 d.ed/ee         deej        eej        ej        f         f         fd0            Zd6d1ZdS )7BlockSparseAttentionWrappera  Wrapper class for attention computation with a block-sparse matrix as attention mask.
    The definition of block sparse matrix can be found at
    `bsr_matrix <https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.bsr_matrix.html>`_
    in SciPy.

    This API supports any block size ``(R, C)``.

    Example
    -------
    >>> import torch
    >>> import flashinfer
    >>> num_qo_heads = 32
    >>> num_kv_heads = 8
    >>> head_dim = 128
    >>> # allocate 128MB workspace buffer
    >>> workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
    >>> bsr_wrapper = flashinfer.BlockSparseAttentionWrapper(workspace_buffer)
    >>> # sparse mask: [[0, 0, 1], [1, 0, 1], [0, 1, 1]]
    >>> M = 3
    >>> N = 3
    >>> indptr = torch.tensor([0, 1, 3, 5], dtype=torch.int32, device="cuda:0")
    >>> indices = torch.tensor([2, 0, 2, 1, 2], dtype=torch.int32, device="cuda:0")
    >>> bsr_wrapper.plan(
    ...     indptr,
    ...     indices,
    ...     M,
    ...     N,
    ...     1, # R(block_rows)=1
    ...     1, # C(block_columns)=1
    ...     num_qo_heads,
    ...     num_kv_heads,
    ...     head_dim,
    ... )
    >>> q = torch.randn((M, num_qo_heads, head_dim), dtype=torch.float16, device="cuda:0")
    >>> k = torch.randn((N, num_kv_heads, head_dim), dtype=torch.float16, device="cuda:0")
    >>> v = torch.randn((N, num_kv_heads, head_dim), dtype=torch.float16, device="cuda:0")
    >>> o = bsr_wrapper.run(q, k, v)
    >>> # use dense implementation with attention mask for comparison
    >>> mask = torch.tensor([[0, 0, 1], [1, 0, 1], [0, 1, 1]], dtype=torch.bool, device="cuda:0")
    >>> o_ref = flashinfer.single_prefill_with_kv_cache(q, k, v, custom_mask=mask)
    >>> torch.allclose(o, o_ref)
    True
    autofloat_workspace_bufferbackendr   Nc                 L   || _         |j        | _        |                                |                                z  | _        t          j        dt
          j        | j                  | _        t          j        dt
          j	        | j                  | _
        t          j        | j        j        t
          j        dd          | _        d| _        d| _        d	| _        d	| _        d	| _        d	| _        d	| _        d	| _        d	| _        d	| _        d	| _        d	| _        || _        d	S )
a  Constructs of :class:`BlockSparseAttentionWrapper`.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The user reserved float workspace buffer used to store intermediate attention results
            in the split-k algorithm. The recommended size is 128MB, the device of the workspace
            buffer should be the same as the device of the input tensors.
        backend : str
            The implementation backend, could be ``auto``/``fa2`` or ``fa3``. Defaults to ``auto``.
            If set to ``auto``, the function will automatically choose the backend based on the
            device architecture and kernel availability.
        i   r   i   Tcpur   
pin_memoryr   FNHDN)_float_workspace_bufferr   numelelement_size_workspace_sizer    r!   uint8_int_workspace_bufferint32_kv_lens_bufferr    _pin_memory_int_workspace_buffer_use_cuda_graph
_kv_layout
_qo_indptr_paged_kv_indptr_buf_paged_kv_indices_buf_paged_kv_last_page_len_packed_mask_buf_mask_indptr_bufr&   r'   MN_backendselfr1   r2   s      r+   __init__z$BlockSparseAttentionWrapper.__init__n   s!   & (>$,3"((**-C-P-P-R-RR 	 &+[ek$+&
 &
 &
"  %{EK 
  
  
 16&,+	1
 1
 1
-  %26<@!=A"?C$8<8< $ $ $ $r-   int_workspace_bufferc                     || _         || _        |                                |                                z  | _        t          j        | j        j        | j        j        d          | _	        dS a  Reset the workspace buffer.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The new float workspace buffer, the device of the new float workspace buffer should
            be the same as the device of the input tensors.

        int_workspace_buffer : torch.Tensor
            The new int workspace buffer, the device of the new int workspace buffer should
            be the same as the device of the input tensors.
        T)r   r8   N
r:   r?   r;   r<   r=   r    r!   r   r   rB   rO   r1   rQ   s      r+   reset_workspace_bufferz2BlockSparseAttentionWrapper.reset_workspace_buffer   s    " (>$%9""((**-C-P-P-R-RR 	 16&,,21
 1
 1
---r-   FNONEfloat16Tr   indicesrK   rL   r&   r'   num_qo_headsnum_kv_headshead_dimr   packed_maskcausalpos_encoding_modeuse_fp16_qk_reductionlogits_soft_capsm_scale
rope_scale
rope_thetaq_data_typekv_data_typeo_data_typenon_blockingc                 $
   t          |          }||}t          |          }t          |          | _        |d}t          |          dz
  }|t          j        |dz   t          j                  z  }||d<   |                    |j        |          }|                                	                                |z  |k    rt          d          t          j        |f|t          j        |j                  }|
|t          ||||          }|L|
Jt          |
|          }
t          |
                                                    d          |d	
          \  }}|                    | j        |          | _        |                    | j        |          | _        |                    | j        |          | _        |                    | j        |          | _        |T|                    | j        |          | _        |                    | j        |          | _        t.          j        j        }n2d| _        d| _        |rt.          j        j        nt.          j        j        }|| _        || _        || _        || _        || _         |                    d          }|||z  z  dk     r|t.          j        j        k    r|t          j!        t          j"        fvrd| _#        tI          ||| j        |j%        |	|	tL          |         j        d|dk    	  	        | _'        | j'        (                    | j)        | j*        | j+        |||||dd||	|	t          j,        d|          t          j,        d|                    | _-        ndd| _#        | j.        dk    rAt_          | j        tL          |         j        ||t.          j        j        k    ||          | _.        ||| j        |j%        |	|	tL          |         j        d|dk    |f
}ta          | j.        g|R  | _'        |dd         |dd         z
  | j         z  }| j1        dt          |                   2                    |           | j)        | j*        | j+        |||||||| j         d|	|	|dg} | j.        dk    r?| 3                    d           | 3                    d           | 3                    d            | j'        j(        |  | _-        || _4        || _5        || _6        || _7        || _8        || _9        dS )a  Create auxiliary data structures for block sparse attention.

        Parameters
        ----------
        indptr : torch.Tensor
            The block index pointer of the block-sparse matrix on row dimension, shape ``(MB + 1,)``,
            where ``MB`` is the number of blocks in the row dimension.
        indices: torch.Tensor
            The block indices of the block-sparse matrix on column dimension, shape ``(nnz,)``, where
            ``nnz`` is the number of non-zero blocks. The elements in ``indices`` array should be less then ``NB``:
            the number of blocks in the column dimension.
        M : int
            The number of rows of the block-sparse matrix, ``MB = ceil_div(M, R)``.
        N : int
            The number of columns of the block-sparse matrix, ``NB = N // C``, ``N`` should be divisible by ``C``.
        R : int
            The number of rows in each block.
        C : int
            The number of columns in each block.
        num_qo_heads : int
            The number of heads in the query/output tensor.
        num_kv_heads : int
            The number of heads in the key/value tensor.
        head_dim : int
            The dimension of each head.
        mask : torch.Tensor, optional
            The mask tensor with shape ``(nnz, R, C,)``, where nnz is the number of non-zero blocks.
            If every block is full, then we don't need to provide the mask tensor.
        packed_mask : torch.Tensor, optional
            The 1D packed mask tensor, if provided, the :attr:`custom_mask` will be ignored.
            The packed mask tensor is generated by :func:`flashinfer.quantization.packbits`.
        causal : bool
            Whether to apply causal mask to the attention matrix.
            This is only effective when :attr:`custom_mask` is not provided in
            :meth:`plan`.
        pos_encoding_mode : str, optional
            The position encoding applied inside attention kernels, could be
            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
            Default is ``NONE``.
        use_fp16_qk_reduction : bool
            Whether to use f16 for qk reduction (faster at the cost of slight precision
            loss).
        logits_soft_cap : Optional[float]
            The attention logits soft capping value (used in Gemini, Grok and Gemma-2, etc.), if not
            provided, will be set to ``0``. If greater than 0, the logits will be capped according to
            formula:
            :math:`\texttt{logits_soft_cap} \times \mathrm{tanh}(x / \texttt{logits_soft_cap})`,
            where :math:`x` is the input logits.
        sm_scale : Optional[float]
            The scale used in softmax, if not provided, will be set to
            ``1.0 / sqrt(head_dim)``.
        rope_scale : Optional[float]
            The scale used in RoPE interpolation, if not provided, will be set to
            ``1.0``.
        rope_theta : Optional[float]
            The theta used in RoPE, if not provided, will be set to ``1e4``.
        q_data_type : str, optional
            The data type of the query tensor.
        kv_data_type : Optional[Union[str, torch.dtype]]
            The data type of the key/value tensor. If None, will be set to :attr:`q_data_type`.
        o_data_type : str, optional
            The data type of the output tensor. Default is ``half``. As output dtype cannot
            be inferred by input dtype in quantization
        non_blocking : bool
            Whether to copy the input tensors to the device asynchronously, defaults to ``True``.


        The :meth:`plan` method should be called before any :meth:`run` or
        :meth:`run_return_lse` calls, auxiliary data structures will be created
        during this call and cached for multiple kernel runs.

        The ``num_qo_heads`` must be a multiple of ``num_kv_heads``. If ``num_qo_heads``
        is not equal to ``num_kv_heads``, the function will use
        `grouped query attention <https://arxiv.org/abs/2305.13245>`_.
        N        r   r   r   ri   zindices out of boundr   little)bitorderr6      Fr   Tr0   fa2):r   _o_dtyper   r    aranger@   tor   maxitem
ValueErrorfullr	   r,   r   
contiguousviewrE   rF   rG   rH   rI   rJ   r   CUSTOMvalueCAUSAL
NON_CAUSAL
_mask_moderK   rL   r&   r'   float8_e4m3fnfloat8_e5m2_use_tensor_coresr   r   r   _cached_moduleplanr:   r?   rB   r!   
_plan_inforM   r   r
   rA   copy_append_pos_encoding_mode_use_fp16_qk_reduction_logits_soft_cap	_sm_scale_rope_scale_rope_theta)!rO   r   rZ   rK   rL   r&   r'   r[   r\   r]   r   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   num_blocks_rowqo_indptr_host	qo_indptrlast_block_lenmask_indptr	mask_modekv_indptr_hostget_module_argskv_lens_arr_hostargss!                                    r+   r   z BlockSparseAttentionWrapper.plan   sh   J /{;;&L/==0=="!OVqU\.1*<EKPPPPr"%%fm,%OO	;;==!#a''3444qFM
 
 
 {63	 K 4#3*488D'7!!&&r**K(( ( ($K $,,t{,NN$*IIdkI$U$U!%,ZZ,Z%W%W"'5'8'8Kl (9 (
 (
$ "$/NN, %3 % %D! %0NN, %3 % %D! !-II$(D!$(D!17V--X=P=VI#5)) -.22X_222E$79J#KKK &+D""9 128!#
# 
#D #166,*5A[111A\222 DOO& &*D"}&& ;K#$56<)!66 ! !  128!#%O #;# /# # #D !/qrr 2^CRC5H HDFR !83'7#8#8!89??   
 ,*5 !D$ }%%BE"""A6d16DO #4&;# /!%%r-   qkvscale_qscale_kscale_vc                     || _         || _        |	| _        |
| _        || _        || _        |                     ||||||          S zCWarning: This method is deprecated, please use :meth:`run` instead.r   r   r   r   r   r   run)rO   r   r   r   r   r   r   r`   ra   rb   rc   rd   re   s                r+   forwardz#BlockSparseAttentionWrapper.forward  sO      #4&;# /!%%xx1a'7;;;r-   outlse
return_lse
enable_pdlc                    |
t          |j                  }
| j        }| j        }| j        }| j        }| j        }t          |           |d}|*dt          j	        |
                    d                    z  }|d}|d} |j        d| j        g|j        dd         R  } |j        d| j        g|j        dd         R  }|	r|Ot          j        |
                    d          |
                    d          ft          j        |j        	          }nJt#          ||
                    d          |
                    d          ft          j        |j        d
           |t          j        || j                  }n"t#          ||j        | j        |j        d           t)          |          r|j        |j        cxk    r|j        k    sn J |j        d         |j        d         cxk    r|j        d         k    sn J | j        dk    r| j        sJ |1t          j        |j        d         t          j        |j        	          }|1t          j        |j        d         t          j        |j        	          }|1t          j        |j        d         t          j        |j        	          }| j        r | j        j        g | j        | j        | j        |||| j        | j        | j         | j!        ||| j"        tF          | j$                 j%        d|
| j&        | j'        tQ          |j        d         | j                  ddd|||||||d| j)        R   n}| j        *                    | j        | j        | j        |||| j        | j         | j!        ||tF          | j$                 j%        d|
tQ          |j        d         | j                  ||||           |	r||fn|S )a7  Compute block-sparse attention between Q/K/V tensors.

        Parameters
        ----------
        q : torch.Tensor
            The query tensor with shape ``(M, num_qo_heads, head_dim)``.
        k : torch.Tensor
            The key tensor with shape ``(N, num_kv_heads, head_dim)``.
        v : torch.Tensor
            The value tensor with shape ``(N, num_kv_heads, head_dim)``.
        scale_q : Optional[torch.Tensor]
            The scale tensor for query, per-head quantization with shape: ``[num_qo_heads]``.
            Used with FP8 Quantization. If not provided, will be set to ``1.0``.
        scale_k : Optional[torch.Tensor]
            The scale tensor for key, per-head quantization with shape: ``[num_kv_heads]``.
            Used with FP8 Quantization. If not provided, will be set to ``1.0``.
        scale_v : Optional[torch.Tensor]
            The scale tensor for value, per-head quantization with shape: ``[num_kv_heads]``.
            Used with FP8 Quantization. If not provided, will be set to ``1.0``.
        out : Optional[torch.Tensor]
            The output tensor, if not provided, will be allocated internally.
        lse : Optional[torch.Tensor]
            The log-sum-exp of attention logits, if not provided, will be allocated internally.
        return_lse : bool
            Whether to return the log-sum-exp of attention logits
        enable_pdl : bool
            Whether to enable Programmatic Dependent Launch (PDL). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#programmatic-dependent-launch-and-synchronization
            Only supported for >= sm90, and currently only for FA2 and CUDA core decode.

        Returns
        -------
        Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
            If :attr:`return_lse` is ``False``, the attention output, shape: ``[M, num_qo_heads, head_dim]``.
            If :attr:`return_lse` is ``True``, a tuple of two tensors:

            * The attention output, shape: ``[M, num_qo_heads, head_dim]``.
            * The logsumexp of attention output, shape: ``[M, num_qo_heads]``.
        Nrk         ?r        @r   r   r   r   rl   r   fa3)+r   r   r   r   r   r   r   r   mathsqrtsizer$   r'   r   r    r!   float32r   
empty_likerr   r   r   rM   r   onesr   	paged_runr:   r?   r   rE   rF   rG   rH   r   r   rD   r|   rI   rJ   r   r=   r   )rO   r   r   r   r   r   r   r   r   r   r   r`   rb   rc   rd   re   s                   r+   r   zBlockSparseAttentionWrapper.run  s   h +AH55J 3/>%
%
 !2333"!OTYqvvbzz222HJJAIb$&017233<000AIb$&017233<000 	{kVVAYYq		*%-   )!&&))QVVAYY/%   ;"1DM:::CC$S!'4=!(ERRRQ<< 
	W7ag000000000072;!'"+<<<<<<<<<<=E))d.D))D*QWQZu}QXVVV*QWQZu}QXVVV*QWQZu}QXVVV! 9	)D) !,!*! ! 	!
 ! ! ! )! *! ,! ! ! ! T_-3! !  !!$ %%!& %'!( ,AGAJDD)!* +!, -!. /!0  1!2 3!4 5!6 7!8 9!: ;!< =!> ?!@ $A! ! ! ! !F ##,*)*,T_-3+AGAJDD)  . (0SzzS0r-   c                     dS )z5Warning: This method is deprecated and has no effect.N )rO   s    r+   end_forwardz'BlockSparseAttentionWrapper.end_forward  s    r-   r0   )NNFrX   FNNNNrY   NrY   T)	NNNrX   FNNNN)NNNNNFN)r   N)__name__
__module____qualname____doc__r   r    TensorstrrP   rV   intr   boolfloatr   r   r   begin_forwardr   r   r   r   r   r-   r+   r/   r/   A   s4       * *X  0  0  %0  0  
	0  0  0  ^0 d
 %
 $l
 
	
 
 
 
8  (,.2!'&++/$(&*&*/8:>/8!/J& J&J& J& 	J&
 J& J& J& J& J& J& u|$J& el+J& J& J&  $J&  "%!J&" 5/#J&$ UO%J&& UO'J&( 3+,)J&* uS%+%567+J&, 3+,-J&. /J&0 
1J& J& J& ^J&X M +/*.*.!'&++/$(&*&*< <<< << <	<
 %,'< %,'< %,'< <  $< "%< 5/< UO< UO< 
< < < <0  +/*.*.&*&* %)]1 ]1<]1 <]1 <	]1
 %,']1 %,']1 %,']1 el#]1 el#]1 ]1 TN]1 
u|U5<#=>>	?]1 ]1 ]1 ^]1~     r-   r/   c            %          e Zd ZdZe	 d(dej        deddfd            Zdej        dej        ddfd	Z	e	 	 	 	 	 	 	 	 	 	 d)dej        dej        dej        de
de
de
dedededee         dee         dee         dee         dedeeej        f         deeeej        f                  ddf"d            Z	 	 	 	 	 	 d*dej        d ej        d!ej        dededee         dee         dee         dee         dej        fd"Ze	 	 	 	 d+dej        d ej        d!ej        d#eej                 d$eej                 d%ed&ee         deej        eej        ej        f         f         fd'            ZdS ),#VariableBlockSparseAttentionWrappera  Wrapper class for attention computation with a block-sparse matrix as attention mask.
    This API supports variable block sizes provided by ``block_row_sz`` and ``block_col_sz``.
    Besides, each ``kv_head_idx`` can specify its own sparse patterns without using the same mask.

    Example
    -------
    >>> import torch
    >>> import flashinfer
    >>> num_qo_heads = 1
    >>> num_kv_heads = 1
    >>> head_dim = 128
    >>> seq_len = 6 # This corresponds to the `block_row_sz` and `block_col_sz`
    >>> # allocate 128MB workspace buffer
    >>> workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
    >>> wrapper = flashinfer.VariableBlockSparseAttentionWrapper(workspace_buffer)
    >>> block_mask_map = torch.tensor([[[0, 0, 1], [1, 0, 1], [0, 1, 1]]], dtype=torch.bool, device="cuda:0")
    >>> block_row_sz = torch.tensor([[1, 2, 3]], dtype=torch.int32, device="cuda:0")
    >>> block_col_sz = torch.tensor([[3, 1, 2]], dtype=torch.int32, device="cuda:0")
    >>> wrapper.plan(
    ...     block_mask_map,
    ...     block_row_sz,
    ...     block_col_sz,
    ...     num_qo_heads,
    ...     num_kv_heads,
    ...     head_dim,
    ... )
    >>> q = torch.randn((num_qo_heads, seq_len, head_dim), dtype=torch.float16, device="cuda:0")
    >>> k = torch.randn((num_kv_heads, seq_len, head_dim), dtype=torch.float16, device="cuda:0")
    >>> v = torch.randn((num_kv_heads, seq_len, head_dim), dtype=torch.float16, device="cuda:0")
    >>> o = wrapper.run(q, k, v)
    r0   r1   r2   r   Nc                    || _         |j        | _        |                                |                                z  | _        t          j        dt
          j        | j                  | _        t          j        dt
          j	        | j                  | _
        t          j        | j        j        t
          j        dd          | _        d| _        d| _        d	| _        d	| _        d	| _        d	| _        || _        d	S )
a  Constructs of :class:`VariableBlockSparseAttentionWrapper`.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The user reserved float workspace buffer used to store intermediate attention results
            in the split-k algorithm. The recommended size is 128MB, the device of the workspace
            buffer should be the same as the device of the input tensors.
        backend : str
            The implementation backend, could be ``auto``/``fa2`` or ``fa3``. Defaults to ``auto``.
            If set to ``auto``, the function will automatically choose the backend based on the
            device architecture and kernel availability.
        r4   r   r5   Tr6   r7   Fr9   N)r:   r   r;   r<   r=   r    r!   r>   r?   r@   rA   r   rB   rC   rD   rE   rF   rG   rH   rM   rN   s      r+   rP   z,VariableBlockSparseAttentionWrapper.__init__  s    & (>$,3"((**-C-P-P-R-RR 	 &+[ek$+&
 &
 &
"  %{EK 
  
  
 16&,+	1
 1
 1
-  %26<@!=A"?C$r-   rQ   c                     || _         || _        |                                |                                z  | _        t          j        | j        j        | j        j        d          | _	        dS rS   rT   rU   s      r+   rV   z:VariableBlockSparseAttentionWrapper.reset_workspace_buffer  rW   r-   FrX   TrY   block_mask_mapblock_row_szblock_col_szr[   r\   r]   r_   r`   ra   rb   rc   rd   re   ri   rf   rg   c                 	   t          |          }||}t          |          }|| _        |
d}
|j        d         }|j        d         }t          j        t          j        dt          j        |j                  t          j        |	                                dt          j                  gd          }|
                    d	|
          }t          j        ||z  fdt          j        |j                  }dt          j        dt          j        dt          t          j        t          j        f         fd} |||          \  }}|
                    d	|
          }|
                    d	|
          }|
                    | j        |
          | _        |
                    | j        |
          | _        |
                    | j        |
          | _        |
                    | j        |
          | _        t          j                                         |rt(          j        j        nt(          j        j        | _        ||z  dk    s
J d            ||z  dz   |j        d         k    sJ |d                                         |j        d         k    s2J |d                                          d|j        d                      ||j        d         k    sJ ||j        d         k    sJ ||j        d         k    sJ ||j        d         k    sJ ||j        d         k    sJ | j        dk    rFt7          | j        t8          |         j        |	| j        t(          j        j        k    ||          | _        ||| j        |j        ||t8          |         j        d|
dk    |	f
}t?          | j        g|R  | _         |dd         |dd         z
  }| j!        dtE          |                   #                    |           | j$        | j%        | j&        ||||d                                         ||z  ||z  ddd|||dg}| j        dk    r?|'                    d           |'                    d           |'                    d            | j         j(        | | _)        || _*        |	| _+        |
| _,        || _-        || _.        || _/        || _0        ||z  | _1        dS )a
  Create auxiliary data structures for block sparse attention.

        Parameters
        ----------
        block_mask_map : torch.Tensor
            The block mask map (boolean), shape ``(num_kv_heads, MB, NB)``, where ``MB`` is the number of blocks in the row dimension,
            ``NB`` is the number of blocks in the column dimension.
        block_row_sz : torch.Tensor
            The block row size, shape ``(num_kv_heads, MB,)``.
        block_col_sz : torch.Tensor
            The block column size, shape ``(num_kv_heads, NB,)``.
        num_qo_heads : int
            The number of heads in the query/output tensor.
        num_kv_heads : int
            The number of heads in the key/value tensor. Note that a group of ``qo_heads`` shares the same sparse pattern of ``kv_heads``.
        head_dim : int
            The dimension of each head.
        causal : bool
            Whether to apply causal mask to the attention matrix.
        pos_encoding_mode : str, optional
            The position encoding applied inside attention kernels, could be
            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
            Default is ``NONE``.
        use_fp16_qk_reduction : bool
            Whether to use f16 for qk reduction (faster at the cost of slight precision
            loss).
        logits_soft_cap : Optional[float]
            The attention logits soft capping value (used in Gemini, Grok and Gemma-2, etc.), if not
            provided, will be set to ``0``. If greater than 0, the logits will be capped according to
            formula:
            :math:`\texttt{logits_soft_cap} \times \mathrm{tanh}(x / \texttt{logits_soft_cap})`,
            where :math:`x` is the input logits.
        sm_scale : Optional[float]
            The scale used in softmax, if not provided, will be set to
            ``1.0 / sqrt(head_dim)``.
        rope_scale : Optional[float]
            The scale used in RoPE interpolation, if not provided, will be set to
            ``1.0``.
        rope_theta : Optional[float]
            The theta used in RoPE, if not provided, will be set to ``1e4``.
        non_blocking : bool
            Whether to copy the input tensors to the device asynchronously, defaults to ``True``.


        The :meth:`plan` method should be called before any :meth:`run` or
        :meth:`run_return_lse` calls, auxiliary data structures will be created
        during this call and cached for multiple kernel runs.

        The ``num_qo_heads`` must be a multiple of ``num_kv_heads``. If ``num_qo_heads``
        is not equal to ``num_kv_heads``, the function will use
        `grouped query attention <https://arxiv.org/abs/2305.13245>`_.
        Nrk   r   r   r   r   )dimr   r   r6   rm   r   r   r   c                    | j         }t          j        }| |dddddf         z                      d          }t          j        t          j        d||          t          j        |                                d          gd          }t          j        |                    |          d          |z
  }|                    d|          }t          j        |d          |z
  }| 	                    d	          \  }	}
}||	|f                             |          }||	         ||	|f         z   }t          j        |d          }t          j
        ||z
  |          }t          j        |d         |
          |z
  }t          j
        ||          |z   }|                    ||          |                    ||          fS )ug  
            Args:
                block_mask_map:  bool/int  [num_kv_heads, num_blocks_row, num_blocks_col]
                block_col_sz:    int32/64  [num_kv_heads, num_blocks_col]
            Returns:
                kv_indptr:  [H*R + 1]  int32  —  CSR indptr
                kv_indices: [nnz]      int32  —  token indices per (head, row)
            Nr   r   r   r   r   rl   T)as_tuple)r   )r   r    r@   sumcatzeroscumsumflattenrt   nonzerorepeat_interleavers   )r   r   r   dtype_irow_lengths	kv_indptr
col_offsethead_lenhead_offseth_idxr_idxc_idxlengthsbasecumstartsoffsets_within
kv_indicess                     r+   #_block_mask_map_to_expanded_indiceszUVariableBlockSparseAttentionWrapper.plan.<locals>._block_mask_map_to_expanded_indices]  s    $*FkG *LD!!!,DDII"MMK	K@@@L!4!4!6!6::   I \__W55q99LH  $'''99H,x33h>K #1"8"8$"8"G"GE5%"5%<033G<<Gu%
5%<(@@D ,w**C,S7]GDDF"\#b'&AAAFJN0w??.PJ<<gf<==z}}f @M @ @  r-   z/num_qo_heads must be a multiple of num_kv_headsz !=    r0   Frq   )2r   rr   r   r    r   r   r@   r   r   r   rt   rx   r   r   rE   rF   rG   rH   cudasynchronizer   r}   r|   r~   r   rv   rM   r   r   r{   r   r
   r   rA   r   r   r:   r?   rB   r   r   r   r   r   r   r   r   r   _num_kv_heads_gqa_group_size)rO   r   r   r   r[   r\   r]   r_   r`   ra   rb   rc   rd   re   ri   rf   rg   r   num_blocks_colr   r   r   r   r   r   r   kv_indices_hostr   r   r   s                                 r+   r   z(VariableBlockSparseAttentionWrapper.plan  sC   P /{;;&L/==#"!O &+B/%+B/ IAU[9LMMM\1133%+NNN 
 
 
	 #e,GGl*,+!(	
 
 
-	!L-	,-	 5<-.-	 -	 -	 -	^ !D CL!
 !
	: #e,GG$--L-II#,,t{,NN$-LL<L$X$X!%/]]Kl &3 &
 &
" (6'8'8Kl (9 (
 (
$ 	
   39X(///x?R?X l*a///= 0// ,q0N4H4KKKKKb!&&((O,A!,DDDDb!&&((HHo.CA.FHH EDD ~3A66666|1!44444|1!44444!5a!88888!5a!88888=F""7 128%8?#88 DM M -.4a!
 7t}WWWW)!""-ss0CC4s#34445;;	
 	
 	

 (&12##%%\)L(!
$ =E!!KKOOOKKKKNNN2$-2
 #4&;# /!%%)+|;r-   r   r   r   c
                     || _         || _        || _        || _        || _        |	| _        |                     |||          S r   r   )
rO   r   r   r   r`   ra   rb   rc   rd   re   s
             r+   r   z+VariableBlockSparseAttentionWrapper.forward  sI     #4&;# /!%%xx1a   r-   r   r   r   r   c                    ddl }|t          |j                  }| j        }	| j        }
| j        }| j        }| j        }t          |	           |
d}
|*dt          j
        |                    d                    z  }|d}|d}|                    |d| j                                                  }|                    |d	                                          }|                    |d	                                          }|r|Ot          j        |                    d          |                    d
          ft          j        |j                  }nJt%          ||                    d          |                    d
          ft          j        |j        d           |t          j        || j                  }n"t%          ||j        | j        |j        d            | j        j        g | j        | j        | j        |||| j        | j        | j        | j        ||| j        t@          | j!                 j"        d|dddddd|
|ddd||d| j#        R   |                    |d| j                                                  }|r/|                    |d| j                                                  }|r||fn|S )a  Compute block-sparse attention between Q/K/V tensors.

        Parameters
        ----------
        q : torch.Tensor
            The query tensor with shape ``(num_qo_heads, qo_len, head_dim)``.
        k : torch.Tensor
            The key tensor with shape ``(num_kv_heads, kv_len, head_dim)``.
        v : torch.Tensor
            The value tensor with shape ``(num_kv_heads, kv_len, head_dim)``.
        out : Optional[torch.Tensor]
            The output tensor, if not provided, will be allocated internally.
        lse : Optional[torch.Tensor]
            The log-sum-exp of attention logits, if not provided, will be allocated internally.
        return_lse : bool
            Whether to return the log-sum-exp of attention logits
        enable_pdl : bool
            Whether to enable Programmatic Dependent Launch (PDL). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#programmatic-dependent-launch-and-synchronization
            Only supported for >= sm90, and currently only for FA2 and CUDA core decode.

        Returns
        -------
        Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
            If :attr:`return_lse` is ``False``, the attention output, shape: ``[M, num_qo_heads, head_dim]``.
            If :attr:`return_lse` is ``True``, a tuple of two tensors:

            * The attention output, shape: ``[M, num_qo_heads, head_dim]``.
            * The logsumexp of attention output, shape: ``[M, num_qo_heads]``.
        r   Nrk   r   r   r   z^(num_kv_heads gqa_group_size) qo_len head_dim -> (num_kv_heads qo_len) gqa_group_size head_dim)r\   zBnum_kv_heads kv_len head_dim -> (num_kv_heads kv_len) 1 1 head_dimr   r   r   rl   r   z^(num_kv_heads qo_len) gqa_group_size head_dim -> (num_kv_heads gqa_group_size) qo_len head_dimzL(num_kv_heads qo_len) gqa_group_size -> (num_kv_heads gqa_group_size) qo_len)$einopsr   r   r   r   r   r   r   r   r   r   r   	rearranger   ry   r    r!   r   r   r   rr   r   r   r   r:   r?   r   rE   rF   rG   rH   r   r   rD   r|   r=   )rO   r   r   r   r   r   r   r   r   r`   rb   rc   rd   re   s                 r+   r   z'VariableBlockSparseAttentionWrapper.run   s   R 	+AH55J 3/>%
%
 !2333"!OTYqvvbzz222HJJ
 l+  
 
 *,,	 	
 P
 
 *,, 	
 P
 
 *,, 	

  	{kVVAYYq		*%-   )!&&))QVVAYY/%   ;"1DM:::CC$S!'4=!(ERRR%% "	
("	
&"	
 O"	
 	"	

 "	
 "	
 O"	
 %"	
 &"	
 ("	
 "	
 "	
 O"	
 )/"	
 "	
  !"	
& '"	
( )"	
* +"	
, -"	
. /"	
0 1"	
2 3"	
4 5"	
6 7"	
8 9"	
: ;"	
< ="	
> ?"	
@ A"	
B  C"	
 "	
 "	
 "	
J l+  
 
 *,,	 	  	""^!/ #   jll	  (0SzzS0r-   r   )
FrX   FNNNNTrY   N)rX   FNNNN)NNFN)r   r   r   r   r   r    r   r   rP   rV   r   r   r   r   r   r   r   r   r   r   r   r-   r+   r   r     s<        @  *  *  %*  *  
	*  *  *  ^* X
 %
 $l
 
	
 
 
 
8  !'&++/$(&*&*!/8:>#u< u<u< lu< l	u<
 u< u< u< u< u<  $u< "%u< 5/u< UOu< UOu< u<  3+,!u<" uS%+%567#u<$ 
%u< u< u< ^u<x "(&++/$(&*&*! !<! <! <	!
 !  $! "%! 5/! UO! UO! 
! ! ! !*  '+&* %)O1 O1<O1 <O1 <	O1
 el#O1 el#O1 O1 TNO1 
u|U5<#=>>	?O1 O1 O1 ^O1 O1 O1r-   r   )r   r   typingr   r   r   r    api_loggingr   decoder   prefillr	   r
   quantizationr   utilsr   r   r   r   r   r   r   r   r   r   r   r,   r/   r   r   r-   r+   <module>r      s      ) ) ) ) ) ) ) ) ) )  ' ' ' ' ' ' + + + + + + H H H H H H H H * * * * * *                       %,      2E	 E	 E	 E	 E	 E	 E	 E	PG1 G1 G1 G1 G1 G1 G1 G1 G1 G1r-   