
    )`i4             2       
   d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
mZmZ ddlZddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZmZmZmZmZmZmZmZm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ ejA        d             ZBejA        deCdefd            ZDejA        d             ZEejA        d             ZFedddddedejG        dejG        dejG        deCdeHdeIfd            ZJejA        d             ZKe	 	 	 	 	 	 	 	 	 	 	 	 d`dejG        dejG        dejG        deCd!eCd"eId#e	eL         d$e	eL         d%e	eL         deHd&e	eL         d'e	eL         d(e	eL         d)e	eL         ded         d*ejG        f d+            ZMe	 	 	 	 	 	 	 	 	 	 	 	 dadejG        dejG        dejG        deCd!eCd"eId#e	eL         d$e	eL         d%e	eL         deHd&e	eL         d'e	eL         d(e	eL         d)e	eL         ded,         d*e
ejG        ejG        f         f d-            ZMe	 	 	 	 	 	 	 	 	 	 	 	 d`dejG        dejG        dejG        deCd!eCd"eId#e	eL         d$e	eL         d%e	eL         deHd&e	eL         d'e	eL         d(e	eL         d)e	eL         deId*eejG        e
ejG        ejG        f         f         f d.            ZM G d/ d0          ZN G d1 d2eN          ZO G d3 d4          ZP G d5 d6          ZQejA        d7             ZRe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dbd;ejG        d<eejG        e
ejG        ejG        f         f         d=ejG        d>ejG        d?ejG        d@eHdAeeLejG        f         dBeeLejG        f         deHdCe	eejG        e*f                  dDe	eejS        eCf                  dEe	eL         dFe	eH         dGe	eejG                          deCdHe	eI         dIeCdJe	eH         dKe	eL         dLe	ejG                 dMe	eH         dNe	ejG                 d*eejG        e*f         f.dO            ZTe	 	 	 	 	 	 	 	 	 	 dcd;ejG        d<eejG        e
ejG        ejG        f         f         d=ejG        d>ejG        d?ejG        d@eHdAeeLejG        f         dBeeLejG        f         deHdCe	ejG                 dGe	ejG                 deCdHeIdJe	eH         dKe	eL         dLe	ejG                 d*ejG        f"dP            ZU	 	 	 	 	 	 	 	 	 	 	 	 	 dddQejG        dRejG        dSejG        dTeHdUeHdVeHdWeHd!eCdeHd&e	eL         dXe	eeCejS        f                  dYe	eeCejS        f                  dZe	eeCejS        f                  d'e	eL         d(e	eL         d)e	eL         d[eId\e	eH         d]eId^e	ejG                 d*df*d_ZVdS )ea3  
Copyright (c) 2023 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)SimpleNamespace)AnyListLiteralOptionalTupleUnionoverload   )flashinfer_api)%trtllm_batch_decode_with_kv_cache_mla"xqa_batch_decode_with_kv_cache_mla)xqaxqa_mla) cudnn_batch_decode_with_kv_cache)
gen_batch_decode_mla_modulegen_batch_decode_module!gen_customize_batch_decode_module"gen_customize_batch_prefill_modulegen_single_decode_moduleget_batch_decode_uriget_batch_prefill_uriget_single_decode_urisetup_cubin_loadergen_trtllm_gen_fmha_module)get_seq_lens)get_batch_prefill_jit_moduleget_batch_prefill_moduleget_single_prefill_module)log2e	FP4TensorMaskModePosEncodingModeTensorLayout_check_cached_qkv_data_type_check_kv_layout_check_pos_encoding_modecheck_shape_dtype_device_get_cache_alibi_slopes_buf_get_cache_buf_get_range_buf_unpack_paged_kv_cachecanonicalize_torch_dtypedetermine_attention_backenddevice_support_pdlget_device_sm_count	is_float8register_custom_opregister_fake_opceil_divround_upget_compute_capabilityGPUArchitectureErrorc                  N   t          |  }t          |                                  }|j        t	          d| dd          dt
          j        dt
          j        dt
          j        dt
          j        d	t
          j        d
t          t
          j                 dt          t
          j                 dt          dt          dt          dt          dt          dt          dd ffd            }t          d| d          dt
          j        dt
          j        dt
          j        dt
          j        d	t
          j        d
t          t
          j                 dt          t
          j                 dt          dt          dt          dt          dt          dt          dd fd            }t          |          S )Nflashinfer::_run)tmpomutates_argsqkvr;   r<   	maybe_lsealibi_slopeskv_layout_codewindow_leftlogits_soft_capsm_scale
rope_scale
rope_thetareturnc                 B     | |||||||||	|
d|z  d|z             d S N      ? )r?   r@   rA   r;   r<   rB   rC   rD   rE   rF   rG   rH   rI   run_funcs                e/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/decode.pyrun_single_decodez3get_single_decode_module.<locals>.run_single_decodeW   sT      	**	
 	
 	
 	
 	
    c                     d S NrN   )r?   r@   rA   r;   r<   rB   rC   rD   rE   rF   rG   rH   rI   s                rP   _fake_run_single_decodez9get_single_decode_module.<locals>._fake_run_single_decodew   s	      	rR   )run)r   r   build_and_loadrV   r2   torchTensorr   intfloatr3   r   )argsurimodulerQ   rU   rO   s        @rP   get_single_decode_moduler_   O   s   

&C%t,;;==FzH 0s000|LLL
<
<
 <
 \	

 <
 EL)
 u|,
 
 
 
 
 
 
 

 
 
 
 
 ML
> .S...//<< < \	
 < EL) u|,       
   0/$ 01111rR   module_name
jit_modulec                    |j         }|j        t          d|  dd          dt          j        dt          j        dt
          t                   dt          j        d	t          t          j                 d
t          t          j                 dt          j        dt          j        dt          j        dt          j        dt          t          j                 dt          dt          dt          dd ffd            }t          d|  d          dt          j        dt          j        dt
          t                   dt          j        d	t          t          j                 d
t          t          j                 dt          j        dt          j        dt          j        dt          j        dt          t          j                 dt          dt          dt          dd fd            }t          ||          S )Nr9   r:   float_workspace_bufferint_workspace_bufferpaged_k_cachepaged_v_cacher<   rB   r=   rd   re   plan_info_vecr?   rf   rg   paged_kv_indptrpaged_kv_indicespaged_kv_last_page_lenr<   rB   rD   rE   
enable_pdlrJ   c                 4     | |||||||||	|
|||g|R   d S rT   rN   )rd   re   rh   r?   rf   rg   ri   rj   rk   r<   rB   rD   rE   rl   r\   rO   s                  rP   run_batch_decodez5get_batch_decode_jit_module.<locals>.run_batch_decode   s\    8 	" "	
 	
 	
 	
 	
 	
 	
rR   c                     d S rT   rN   )rd   re   rh   r?   rf   rg   ri   rj   rk   r<   rB   rD   rE   rl   r\   s                  rP   _fake_run_batch_decodez;get_batch_decode_jit_module.<locals>._fake_run_batch_decode   s	    $ 	rR   planrV   )rr   rV   r2   rX   rY   r   rZ   r   boolr3   r   )r`   ra   	plan_funcrn   rp   rO   s        @rP   get_batch_decode_jit_moduleru      s%   I~H({(((

 
 
!
 %!
#l!
 Cy!
 <	!

  -!
  -!
 !
  ,!
 !&!
 <!
 EL)!
 !
 !
 !
  
!!
 !
 !
 !
 !

 
!
F 6[66677 %#l Cy <	
  -  -   , !& < EL)     
!   87&    rR   c            )         t          |  }t          |                                  }|j        }|j        t          d| dd          dt          j        dt          j        dt          t                   dt          j        d	t          t          j                 d
t          t          j                 dt          j        dt          j        dt          j        dt          j        dt          t          j                 dt          dt          dt          dt          t          j                 dt          dt          dt          dt          dd f(fd            }t          d| d          dt          j        dt          j        dt          t                   dt          j        d	t          t          j                 d
t          t          j                 dt          j        dt          j        dt          j        dt          j        dt          t          j                 dt          dt          dt          dt          t          j                 dt          dt          dt          dt          dd f(d            }t          ||          S )Nr9   r:   rc   r=   rd   re   rh   r?   rf   rg   ri   rj   rk   r<   rB   rD   rE   rl   rC   rF   rG   rH   rI   rJ   c                 N     | |||||||||	|
||||||d|z  d|z             d S rL   rN   )rd   re   rh   r?   rf   rg   ri   rj   rk   r<   rB   rD   rE   rl   rC   rF   rG   rH   rI   rO   s                      rP   rn   z1get_batch_decode_module.<locals>.run_batch_decode   sg    @ 	" "**'	
 	
 	
 	
 	
rR   c                     d S rT   rN   )rd   re   rh   r?   rf   rg   ri   rj   rk   r<   rB   rD   rE   rl   rC   rF   rG   rH   rI   s                      rP   rp   z7get_batch_decode_module.<locals>._fake_run_batch_decode  s	    , 	rR   rq   )r   r   rW   rr   rV   r2   rX   rY   r   rZ   r   rs   r[   r3   r   )r\   r]   modrt   rn   rp   rO   s         @rP   get_batch_decode_modulerz      s   

%C
!4
(
7
7
9
9CIwH  s   

 
 
)
 %)
#l)
 Cy)
 <	)

  -)
  -)
 )
  ,)
 !&)
 <)
 EL))
 )
 )
 )
 u|,)
  !)
" #)
$ %)
& ')
( 
))
 )
 )
 )
 )

 
)
V .S...// %#l Cy <	
  -  -   , !& < EL)    u|,  !" #$ %& '( 
)   0/6    rR   c                      t                      } |                                 }t          |                                            |S rT   )r   rW   r   get_library_path)ry   ops     rP   get_trtllm_gen_fmha_moduler~   ;  s=    
$
&
&C					Bs++--...IrR   NHDF)	kv_layoutrE   
return_lser?   r@   rA   r   rE   r   c          
         |j         }t          dd|          }	t          j        |          }
|r5t          j        |                    d          t          j        |          }nd } | j        ||||	|
|t          |         j	        |g|R   |
S )Nsingle_decode_with_kv_cache_tmp   r   dtypedevice)
r   r*   rX   
empty_likeemptysizefloat32rV   r$   value)ra   r?   r@   rA   r   rE   r   r\   r   r;   r<   lses               rP   +single_decode_with_kv_cache_with_jit_moduler   C  s     XF
:<Lf
U
UCA k166!99U]6JJJJN				Y%
 

 
 
 
 HrR   c                  8    t          |                                  S rT   )r   rW   )r\   s    rP   get_batch_decode_mla_moduler   c  s    &-<<>>>rR   NONEpos_encoding_modeuse_tensor_coresq_scalek_scalev_scalerF   rG   rH   rI   rJ   c                     d S rT   rN   r?   r@   rA   r   r   r   r   r   r   rE   rF   rG   rH   rI   r   s                  rP   single_decode_with_kv_cacher   h  s	    " 3rR   Tc                     d S rT   rN   r   s                  rP   r   r   |  s	    " ),rR   c                 L   t          |           t          |           t          dd| j                  }| j        d         }|
d}
|dt          j        |          z  }|||z  }|||z  }|d}|d}| j        d         }d}|r't          j        |ft          j	        | j        	          }|rt          j
        |                     d                    }t          d
| j        |j        | j        ||t          |         j        |	dk    |
dk    d
  
                            |                     d          ||||||                    d          ndt"          j        j        t&          |         j        |	dt)          || j                  |
|ddd||           |                    d          }|r|                    d          }nt          j
        |           }t-          | j        |j        | j        ||t          |         j        |	dk    |
dk                                  | |||||t)          || j                  t&          |         j        |	|
|||           |F|j        dk    r6|                    t2                    |z                      |j                  }n||z  }|r||fS |S )a,  Decode attention with KV Cache for single request, return attention output.

    Parameters
    ----------
    q : torch.Tensor
        The query tensor, shape: ``[num_qo_heads, head_dim]``.
    k : torch.Tensor
        The key tensor, shape: ``[kv_len, num_kv_heads, head_dim]`` if :attr:`kv_layout`
        is ``NHD``, or ``[num_kv_heads, kv_len, head_dim]`` if :attr:`kv_layout` is
        ``HND``.
    v : torch.Tensor
        The value tensor, shape: ``[kv_len, num_kv_heads, head_dim]`` if
        :attr:`kv_layout` is ``NHD``, or ``[num_kv_heads, kv_len, head_dim]`` if
        :attr:`kv_layout` is ``HND``.
    kv_layout : str
        The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
    pos_encoding_mode : str
        The position encoding applied inside attention kernels, could be
        ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
        Defaults to ``NONE``.
    use_tensor_cores: bool
        Whether to use tensor cores for the computation. Will be faster for large group
        size in grouped query attention. Defaults to ``False``.
    q_scale : Optional[float]
        The calibration scale of query for fp8 input, if not provided, will be set to ``1.0``.
    k_scale : Optional[float]
        The calibration scale of key for fp8 input, if not provided, will be set to ``1.0``.
    v_scale : Optional[float]
        The calibration scale of value for fp8 input, if not provided, will be set to ``1.0``.
    window_left : int
        The left (inclusive) window size for the attention window, when set to ``-1``, the window
        size will be set to the full length of the sequence. Defaults to ``-1``.
    logits_soft_cap : Optional[float]
        The attention logits soft capping value (used in Gemini, Grok and Gemma-2, etc.), if not
        provided, will be set to ``0``. If greater than 0, the logits will be capped according to
        formula:
        :math:`\texttt{logits_soft_cap} \times \mathrm{tanh}(x / \texttt{logits_soft_cap})`,
        where :math:`x` is the input logits.
    sm_scale : Optional[float]
        The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
    rope_scale : Optional[float]
        The scale used in RoPE interpolation, if not provided, will be set to ``1.0``.
    rope_theta : Optional[float]
        The theta used in RoPE, if not provided, will be set to ``1e4``.
    return_lse : bool
        Whether to return the log sum exp value of the attention logits.

    Returns
    -------
    Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
        If :attr:`return_lse` is ``False``, the attention output, shape: ``[qo_len, num_qo_heads, head_dim_vo]``.
        If :attr:`return_lse` is ``True``, a tuple of two tensors:

        * The attention output, shape: ``[num_qo_heads, head_dim_vo]``.
        * The log sum exp value, shape: ``[num_qo_heads]``.

    Examples
    --------

    >>> import torch
    >>> import flashinfer
    >>> kv_len = 4096
    >>> num_qo_heads = 32
    >>> num_kv_heads = 32
    >>> head_dim = 128
    >>> q = torch.randn(num_qo_heads, head_dim).half().to("cuda:0")
    >>> k = torch.randn(kv_len, num_kv_heads, head_dim).half().to("cuda:0")
    >>> v = torch.randn(kv_len, num_kv_heads, head_dim).half().to("cuda:0")
    >>> o = flashinfer.single_decode_with_kv_cache(q, k, v)
    >>> o.shape
    torch.Size([32, 128])

    Note
    ----
    The ``num_qo_heads`` must be a multiple of ``num_kv_heads``. If ``num_qo_heads`` is
    not equal to ``num_kv_heads``, the function will use
    `grouped query attention <https://arxiv.org/abs/2305.13245>`_.
    r   r   r   N        rM        @r   r   fa2Fr   )r'   r&   r*   r   shapemathsqrtrX   r   r   r   	unsqueezer   r   r#   r   rV   r"   
NON_CAUSALr$   r)   squeezer_   itemsizetor[   )r?   r@   rA   r   r   r   r   r   r   rE   rF   rG   rH   rI   r   r;   head_dimnum_qo_headsr   outs                       rP   r   r     s   @ .///Y
:<Lah
W
WCwr{H8,,,GG

71:L
C Qk</qxPPP =
q{{1~~..!GGG-.42a	
 	
 #KKNN #CMM!T%#)'ah??%
 
 
( kk!nn 	!++a..Cq!! GGG-.42a		
 		
 #'ah??#)
 
 
  <166%==7*..sy99CC7NC Cx
rR   c            0          e Zd ZdZe	 	 	 	 	 	 	 	 dEdej        deded	ed
e	ej                 de	ej                 de	ej                 dede	e
e                  ddfd            Zedefd            Zedefd            Zdej        dej        ddfdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dFdej        dej        dej        dedededed ed!ed"e	e         d#e	eeej        f                  d$e	eeej        f                  d%e	eeej        f                  d&e	eeej        f                  d'e	e         d(e	e         d)e	e         d*ed+e	ej                 d,e	ej                 d-e	e         d.eddf.d/            ZeZ	 	 	 	 	 	 	 	 	 dGd0ej        d1eej        eej        ej        f         f         d ed2e	e         d3e	e         d4e	e         d!ed"e	e         d'e	e         d(e	e         d)e	e         dej        fd5Zeddddddddd6d0ej        d1eej        eej        ej        f         f         d2e	e         d3e	e         d4e	e         d7e	ej                 d8e	ej                 d9ed         d:e	e         d!e	e         dej        fd;            Zeddddddddd6d0ej        d1eej        eej        ej        f         f         d2e	e         d3e	e         d4e	e         d7e	ej                 d8e	ej                 d9ed         d:e	e         d!e	e         deej        ej        f         fd<            Zedddddddddd=d>
d0ej        d1eej        eej        ej        f         f         d2e	e         d3e	e         d4e	e         d7e	ej                 d8e	ej                 d9ed:e	e         d!e	e         d?e	ej                 d@e	e         deej        eej        ej        f         f         fdA            Z	 	 	 	 	 	 	 	 	 dGd0ej        d1ej        d ed2e	e         d3e	e         d4e	e         d!ed"e	e         d'e	e         d(e	e         d)e	e         deej        ej        f         fdBZ ej        edC          ZdHdDZ dS )I"BatchDecodeWithPagedKVCacheWrapperaf	  Wrapper class for decode attention with paged kv-cache (first proposed in
    `vLLM <https://arxiv.org/abs/2309.06180>`_) for batch of requests.

    Check :ref:`our tutorial<kv-layout>` for page table layout.

    Examples
    --------
    >>> import torch
    >>> import flashinfer
    >>> num_layers = 32
    >>> num_qo_heads = 64
    >>> num_kv_heads = 8
    >>> head_dim = 128
    >>> max_num_pages = 128
    >>> page_size = 16
    >>> # allocate 128MB workspace buffer
    >>> workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
    >>> decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
    ...     workspace_buffer, "NHD"
    ... )
    >>> batch_size = 7
    >>> kv_page_indices = torch.arange(max_num_pages).int().to("cuda:0")
    >>> kv_page_indptr = torch.tensor(
    ...     [0, 17, 29, 44, 48, 66, 100, 128], dtype=torch.int32, device="cuda:0"
    ... )
    >>> # 1 <= kv_last_page_len <= page_size
    >>> kv_last_page_len = torch.tensor(
    ...     [1, 7, 14, 4, 3, 1, 16], dtype=torch.int32, device="cuda:0"
    ... )
    >>> kv_cache_at_layer = [
    ...     torch.randn(
    ...         max_num_pages, 2, page_size, num_kv_heads, head_dim, dtype=torch.float16, device="cuda:0"
    ...     ) for _ in range(num_layers)
    ... ]
    >>> # create auxiliary data structures for batch decode attention
    >>> decode_wrapper.plan(
    ...     kv_page_indptr,
    ...     kv_page_indices,
    ...     kv_last_page_len,
    ...     num_qo_heads,
    ...     num_kv_heads,
    ...     head_dim,
    ...     page_size,
    ...     pos_encoding_mode="NONE",
    ...     data_type=torch.float16
    ... )
    >>> outputs = []
    >>> for i in range(num_layers):
    ...     q = torch.randn(batch_size, num_qo_heads, head_dim).half().to("cuda:0")
    ...     kv_cache = kv_cache_at_layer[i]
    ...     # compute batch decode attention, reuse auxiliary data structures for all layers
    ...     o = decode_wrapper.run(q, kv_cache)
    ...     outputs.append(o)
    ...
    >>> outputs[0].shape
    torch.Size([7, 64, 128])

    Note
    ----
    To accelerate computation, FlashInfer's batch decode attention creates some
    auxiliary data structures, these data structures can be reused across multiple
    batch decode attention calls (e.g. different Transformer layers). This wrapper class
    manages the lifecycle of these data structures.
    r   FNautord   r   use_cuda_graphr   paged_kv_indptr_bufferpaged_kv_indices_bufferpaged_kv_last_page_len_bufferbackendjit_argsrJ   c
                    t          |           |	p|r9t          |	d         t          |g|	R                                            | _        n<t          |	d         t          |	                                           | _        nd| _        || _        || _        |j	        | _	        t          j        dt          j        | j	                  | _        t          j        dt          j        dd          | _        d| _        |dk    r+t          j        d	t          j        | j	                  | _        |rt          j        |          st%          d
          t          j        |          st%          d          t          j        |          st%          d          t'          |          | _        t'          |          | j        dz   k    rt%          d          nd| _        || _        || _        || _        |p|dk    | _        || _        |r5|r3t          j        | j        dz   t          j        |j	                  | _        || _        dS )a	  Constructor of :class:`BatchDecodeWithPagedKVCacheWrapper`.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor. Must be initialized to 0 for its first use.
            The user reserved float workspace buffer used to store intermediate attention results
            in the split-k algorithm. The recommended size is 128MB, the device of the workspace
            buffer should be the same as the device of the input tensors.

        kv_layout : str
            The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.

        use_cuda_graph : bool
            Whether to enable CUDAGraph for batch decode attention, if enabled, the
            auxiliary data structures will be stored as the provided buffers. The ``batch_size``
            cannot change during the lifecycle of this wrapper when CUDAGraph is enabled.

        use_tensor_cores : bool
            Whether to use tensor cores for the computation. Will be faster for large group
            size in grouped query attention. Defaults to ``False``.

        paged_kv_indptr_buffer : Optional[torch.Tensor]
            The user reserved buffer on GPU to store the indptr of the paged kv cache, the size
            of the buffer should be ``[batch_size + 1]``.
            Only needed when ``use_cuda_graph`` is ``True``.

        paged_kv_indices_buffer : Optional[torch.Tensor]
            The user reserved buffer on GPU to store the page indices of the paged kv cache,
            should be large enough to store the maximum number of page indices
            (``max_num_pages``) during the lifecycle of this wrapper.
            Only needed when ``use_cuda_graph`` is ``True``.

        paged_kv_last_page_len_buffer : Optional[torch.Tensor]
            The user reserved buffer on GPU to store the number of entries in the last page, the
            size of the buffer should be ``[batch_size]``.
            Only needed when ``use_cuda_graph`` is ``True``.

        backend : str
            The implementation backend, could be ``auto``/``fa2``/``fa3`` or ``trtllm-gen``. Defaults to ``auto``.
            If set to ``auto``, the wrapper will automatically choose the backend based on the
            device architecture and kernel availability.

        jit_args : Optional[List[Any]]
            If provided, the wrapper will use the provided arguments to create the JIT module,
            otherwise, the wrapper will use default attention implementation.
        Nr      r   Tcpur   
pin_memoryr   
trtllm-gen)i   Apaged_kv_indptr_buffer should be a torch.Tensor in cudagraph modeBpaged_kv_indices_buffer should be a torch.Tensor in cudagraph modeHpaged_kv_last_page_len_buffer should be a torch.Tensor in cudagraph moder   ;The size of paged_kv_indptr_buffer should be batch_size + 1)r&   r   r   rW   _jit_moduleru   r   
_kv_layout_float_workspace_bufferr   rX   r   uint8_int_workspace_buffer _pin_memory_int_workspace_buffer_kv_lens_bufferint32	is_tensor
ValueErrorlen_fixed_batch_size_paged_kv_indptr_buf_paged_kv_indices_buf_paged_kv_last_page_len_buf_use_tensor_cores_use_cuda_grapharange_qo_indptr_buf_backend)
selfrd   r   r   r   r   r   r   r   r   s
             rP   __init__z+BatchDecodeWithPagedKVCacheWrapper.__init__  s   v 	### #?QK6"*  $n&&	$ $   $?QK5x@OOQQ$ $  
  $D#'=$,3%*[ek$+&
 &
 &
" 16+	1
 1
 1
- 8<l""#(;DK$ $ $D   	'?#9::  W   ?#:;;  X   ?#@AA  ^   &))F%G%GD")**d.Dq.HHH Q   I
 &'D"$:!%<"+H(!1!LW5L- 	 &+l*Q.+18' ' '#
  rR   c                     | j         S rT   r   r   s    rP   r   z3BatchDecodeWithPagedKVCacheWrapper.use_tensor_cores      %%rR   c                     | j         S rT   r   r   s    rP   is_cuda_graph_enabledz8BatchDecodeWithPagedKVCacheWrapper.is_cuda_graph_enabled      ##rR   re   c                     || _         || _        t          j        | j        j        | j        j        dd          | _        dS a  Reset the workspace buffer.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The new float workspace buffer, the device of the new float workspace buffer should
            be the same as the device of the input tensors.

        int_workspace_buffer : torch.Tensor
            The new int workspace buffer, the device of the new int workspace buffer should
            be the same as the device of the input tensors.
        r   T)r   r   r   Nr   r   rX   r   r   r   r   r   rd   re   s      rP   reset_workspace_bufferz9BatchDecodeWithPagedKVCacheWrapper.reset_workspace_buffer  K     (>$%9"05&,,2	1
 1
 1
---rR   r   r   float16Tindptrindiceslast_page_lenr   num_kv_headsr   	page_sizer   rE   rF   q_data_typekv_data_typeo_data_type	data_typerG   rH   rI   non_blockingblock_tablesseq_lensfixed_split_sizedisable_split_kvc                 \   | j                                         | j                                         z  | _        t	          |          }|
d}
t          |dz   d          }| j        r|| j        k    r(t          d	                    || j                            t	          |          t	          | j
                  k    rt          d          | j                            ||           | j                            ||           | j
        dt	          |                                       ||j        | j        k    o|           n|                    | j        |          | _        |                    | j        |          | _
        |                    | j        |          | _        |                    | j        |          | _        |                    d          }|                    d          }|||}||}t#          |          }||}t#          |          }||}t#          |          }|| j        st          d          |d	}|| _        || _        || _        || _        || _        || _        || _        d| _        |t7          ||          }n|                                }| j        d
k    rO|
dk    sJ t=          |                                          | _        | j         dt	          |                                       ||           | j        fd|D             }t=          |          }tC          j"        ||ftB          j#        | j                  | _        |d         }tI          |          D ]-}||         } | j
        ||| z            | j        |d| f<   || z  }.tK          ||||j&        ||tN          |         j(        |	dk    |
dk    d
  
        | _)        | j)        *                                | _+        n| j        r8t=          |                                          | _        | j,        | j,        | _)        ny| j        dk    r.t[          | j        tN          |         j(        dd||          | _        t]          | j        ||||j&        ||tN          |         j(        |	d	k    |
dk    d          | _)        | j         | j/        | j0        |||||||| j        ||d|	g}!| j        dk    r?|!1                    |           |!1                    |           |!1                    d            | j)        j*        |! | _+        n| j,        | j,        | _)        n9te          ||||j&        ||tN          |         j(        |	d	k    |
dk    	  	        | _)        | j)        *                    | j         | j/        | j0        ||||| j        |	|
||tC          j3        d|          tC          j3        d|                    | _+        || _4        |	| _5        |
| _6        || _7        || _8        || _9        dS )a|  Plan batch decode for given problem specification.

        Parameters
        ----------
        indptr : torch.Tensor
            The indptr of the paged kv cache, shape: ``[batch_size + 1]``, dtype: ``torch.int32``
        indices : torch.Tensor
            The page indices of the paged kv cache, shape: ``[kv_indptr[-1]]``, dtype: ``torch.int32``
        last_page_len : torch.Tensor
            The number of entries in the last page of each request in the paged kv
            cache, shape: ``[batch_size]``, dtype: ``torch.int32``
        num_qo_heads : int
            The number of query/output heads
        num_kv_heads : int
            The number of key/value heads
        head_dim : int
            The dimension of the heads
        page_size : int
            The page size of the paged kv cache
        pos_encoding_mode : str
            The position encoding applied inside attention kernels, could be
            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
            Defaults to ``NONE``.
        window_left : int
            The left (inclusive) window size for the attention window, when set to ``-1``, the window
            size will be set to the full length of the sequence. Defaults to ``-1``.
        logits_soft_cap : Optional[float]
            The attention logits soft capping value (used in Gemini, Grok and Gemma-2, etc.), if not
            provided, will be set to ``0``. If greater than 0, the logits will be capped according to
            formula:
            :math:`\texttt{logits_soft_cap} \times \mathrm{tanh}(x / \texttt{logits_soft_cap})`,
            where :math:`x` is the input logits.
        q_data_type : Optional[Union[str, torch.dtype]]
            The data type of the query tensor, defaults torch.float16.
        kv_data_type : Optional[Union[str, torch.dtype]]
            The data type of the key/value tensor. If None, will be set to
            ``q_data_type``. Defaults to ``None``.
        o_data_type : Optional[Union[str, torch.dtype]]
            The data type of the output tensor. If None, will be set to :attr:`q_data_type`.
            For FP8 inputs, this should typically be set to torch.float16 or torch.bfloat16.
        data_type: Optional[Union[str, torch.dtype]]
            The data type of both the query and key/value tensors. Defaults to torch.float16.
            data_type is deprecated, please use q_data_type and kv_data_type instead.
        non_blocking : bool
            Whether to copy the input tensors to the device asynchronously, defaults to ``True``.
        seq_lens: Optional[torch.Tensor]
            A uint32 1D tensor indicating the kv sequence length of each prompt. shape: ``[batch_size]``.
        block_tables: Optional[torch.Tensor]
            A uint32 2D tensor indicating the block table of each prompt. shape: ``[batch_size, max_num_blocks_per_seq]``.
        fixed_split_size : Optional[int],
            The fixed split size for FA2 split-kv decode, in pages. Only supported by tensor core decode for now. Recommend setting to the average sequence length of your workload.
            When enabled, will lead to deterministic softmax score reduction in the merge_states kernel, and therefore
            batch-size invariant outputs. See https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/
            Note that compatibility with CUDA graph is NOT guaranteed, as even when bs is fixed, kv seq len can change
            and lead to a varied number of launched CTAs.
        disable_split_kv : bool,
            Whether to disable the split-kv for determinism in CUDA Graph, defaults to ``False``.
        Note
        ----
        The :meth:`plan` method should be called before any :meth:`run` or
        :meth:`run_return_lse` calls, auxiliary data structures will be created
        during this call and cached for multiple run calls.

        The ``num_qo_heads`` must be a multiple of ``num_kv_heads``. If ``num_qo_heads``
        is not equal to ``num_kv_heads``, the function will use
        `grouped query attention <https://arxiv.org/abs/2305.13245>`_.

        The :meth:`plan` method cannot be used in Cuda Graph or in ``torch.compile``.
        Nr   r   r   The batch size should be fixed in cudagraph mode, the runtime batch size {}  mismatches the batch size set during initialization {}HThe size of indices should be less than or equal to the allocated bufferr   zAfixed_split_size is only supported by tensor core decode for now.r   r   c                 &    g | ]}|z   d z
  z  S )r   rN   ).0seq_lenr   s     rP   
<listcomp>z;BatchDecodeWithPagedKVCacheWrapper.plan.<locals>.<listcomp>  s8     " " " y(1,:" " "rR   r   r   Fr   r   r   ):r   numelelement_size_workspace_sizer   r+   r   r   r   formatr   r   copy_r   r   r   r   r-   r   _cached_q_data_type_cached_kv_data_type_cached_o_data_type_batch_size_num_qo_heads_num_kv_heads_block_tables_max_kv_lenr   r   r   maxitemr   rX   zerosrZ   rangeget_trtllm_gen_decode_moduler   r#   r   _cached_modulerr   
_plan_infor   r.   r   r   r   appendrz   r   _pos_encoding_mode_window_left_logits_soft_cap	_sm_scale_rope_scale_rope_theta)"r   r   r   r   r   r   r   r   r   rE   rF   r   r   r   r   rG   rH   rI   r   r   r   r   r   
batch_sizeqo_indptr_hostindptr_hostlast_page_len_hostkv_lens_arr_hostblocks_per_seqmax_num_blocks_per_seqblock_idinum_blocks_neededr\   s"          `                          rP   rr   z'BatchDecodeWithPagedKVCacheWrapper.plan7  s<   @ (..00*7799: 	
 ''
"!O'
Q>>% 	T333 NNTf"D$:O O   7||c$"<==== ^   %++F+NNN,22L 3    &~W~6<<w~'D&V, =     )/		, )2 ) )D% *1, *4 * *D& 0=/?/?, 0@ 0 0D, #1"3"3, #4 # #D ii&&*--e44 "'#(.{;;&L/==%K.{;;'0E'S   #!#. $0!#. %))5A*.+K9KYWW'||~~=L(("c))))"#34499;;D !83'7#8#8!89?? | @    !)" " " "#3" " " *-^)<)<&%*[!78);& & &"
 "!9z** 2 2A(6q(9%2$x2C'CC &q*<+<*<'<=
  11HH"> 128q !## #D #16688DOO" T	"#34499;;D+&*&6##=F**$?'(9:@#$% %DM '?M L#$56<2%#a'' '# ,*5 *!D$ }%%,---,---A6d16DOO +&*&6##&= L#$56<2%#a'
' 
'# #166,*5*A[111A\222 DO$ #4' /!%%rR   r?   paged_kv_cacher   r   r   c                     || _         || _        || _        |	| _        |
| _        || _        |                     |||||          S )zEWarning: this function is deprecated, please use :meth:`run` instead.)r   r   r   r  r  r  r  r  r  rV   r   r?   r   r   r   r   r   rE   rF   rG   rH   rI   s               rP   forwardz*BatchDecodeWithPagedKVCacheWrapper.forwardl  sY     #4' /!%%xx~w  
 
 	
rR   )r   r   r   r   r   r   rl   rE   r   r   r   rl   c                    d S rT   rN   r   r?   r   r   r   r   r   r   r   rl   rE   r\   s               rP   rV   z&BatchDecodeWithPagedKVCacheWrapper.run  s	     srR   c                    d S rT   rN   r&  s               rP   rV   z&BatchDecodeWithPagedKVCacheWrapper.run  s	     -0CrR   r   )
r   r   r   r   r   r   rl   rE   sinksq_len_per_reqr(  r)  c       
         
   |	t          |j                  }	t          || j                  \  }}| j        dk    r|j        d         }n|j        d         }t          ||| j        | j                   | j        dk    r7| j        dk    r,|	                    dd          }|	                    dd          }| j
        }|
| j        n|
}
| j        dk    r|
| j        k    sJ | j        }| j        }| j        }| j        }t!          |           |d}|$|j        d	         }d
t#          j        |          z  }|||z  }|||z  }|d
}|d}|r|Ot'          j        |                    d          |                    d          ft&          j        |j                  }nJt/          ||                    d          |                    d          ft&          j        |j        d           |Rt1          | dd          p|j        }t'          j        |j        dd	         |j        d	d         z   ||j                  }n5t1          | dd          p|j        }t/          ||j        ||j        d           | j        dk    rT|                    |                    d          |z  ||                    d          |                    d                    }| j        r+| j        | j        | j        |||| j        | j         | j!        | j"        ||tF          j$        j%        tL          | j                 j%        |
|	g}| j'        #|(                    tS          |                     nd}d}d}tU          |          r+tW          |          dk    r|d         }|d         }|d         }|ddtY          |j        d         |j                  ddd|||||||d| j-        || j.        | j/        | j0        | j1        || j2        |gz  } | j3        j4        |  n| j        dk    r
| j        g }n| j        }|
J d            | j        | j        | j        |||| j         | j!        | j"        ||tL          | j                 j%        |
|	g}| j'        #|(                    tS          |                     n(|tY          |j        d         |j                  ||||gz  } | j3        j5        |  tm          |tn                    o|d
k    }|Q|sOtU          |          r;|8                    t&          j                  |z  8                    |j                  }n||z  }|r||fn|S )a
  Compute batch decode attention between query and paged kv cache.

        Parameters
        ----------
        q : torch.Tensor
            The query tensor, shape: ``[batch_size, num_qo_heads, head_dim]``
        paged_kv_cache : Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
            The paged KV-Cache stored as a tuple of tensors or a single tensor:

            * a tuple ``(k_cache, v_cache)`` of 4-D tensors, each with shape:
              ``[max_num_pages, page_size, num_kv_heads, head_dim]`` if :attr:`kv_layout` is ``NHD``,
              and ``[max_num_pages, num_kv_heads, page_size, head_dim]`` if :attr:`kv_layout` is ``HND``.

            * a single 5-D tensor with shape:
              ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
              :attr:`kv_layout` is ``NHD``, and
              ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
              :attr:`kv_layout` is ``HND``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
              ``paged_kv_cache[:, 1]`` is the value-cache.
        *args
            Additional arguments for the custom kernel.
        q_scale : Optional[float]
            The calibration scale of query for fp8 input, if not provided, will be set to ``1.0``.
        k_scale : Optional[float]
            The calibration scale of key for fp8 input, if not provided, will be set to ``1.0``.
        v_scale : Optional[float]
            The calibration scale of value for fp8 input, if not provided, will be set to ``1.0``.
        out : Optional[torch.Tensor]
            The output tensor, if not provided, will be allocated internally.
        lse : Optional[torch.Tensor]
            The log-sum-exp of attention logits, if not provided, will be allocated internally.
        return_lse : bool
            Whether to return the logsumexp of attention scores, defaults to ``False``.
        enable_pdl : bool
            Whether to enable Programmatic Dependent Launch (PDL). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#programmatic-dependent-launch-and-synchronization
            Only supported for >= sm90, and currently only for FA2 and CUDA core decode.
        q_len_per_req : int
            The number of query tokens per request, if not provided, will be set to ``1``.
        Returns
        -------
        Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
            If :attr:`return_lse` is ``False``, the attention output, shape: ``[batch_size, num_qo_heads, head_dim]``.
            If :attr:`return_lse` is ``True``, a tuple of two tensors:

            * attention output, shape: ``[batch_size, num_qo_heads, head_dim]``
            * logsumexp of attention scores, shape: ``[batch_size, num_qo_heads]``.
        Nr   r      r   r   r   rM   r   r   r   r   r  r      zplan info is not initialized)9r/   r   r,   r   r   r%   r   r  r   	transposer  r  r  r  r  r  r'   r   r   rX   r   r   r   r(   getattrr   viewr   r   r   r  r   r   r   r   r"   r   r   r$   r   extendlistr1   r   r)   r   r  r  r  r   r  r  	paged_runrV   
isinstancer[   r   )r   r?   r   r   r   r   r   r   r   rl   rE   r(  r)  r\   k_cachev_cacher   r   rF   rG   rH   rI   r   	out_dtyperun_argsfp8_scale_qfp8_scale_kfp8_scale_v	plan_infois_float_ones                                 rP   rV   z&BatchDecodeWithPagedKVCacheWrapper.run  s   @ +AH55J1.$/RR?e##a(IIa(I#w0$2K	
 	
 	

 =L((T_-E-E''B//G''B//G 3+6+>d''K=L(( $"33333/>%
%
 !2333"!Owr{HTYx000HHHJJ 	{kVVAYYq		*%-   )!&&))QVVAYY/%   ;&;TBBMagI+w}RSS1118  CC  &;TBBMagI$S!'9ahNNN=L((qvvayyM1=!&&))QVVTUYYWWA  ^	/,*#)*0#)T_-3!H& +T

++++ #""Q<< *CIINN"&q'K"&q'K"&q'K/
AHEE#("&&&($/ 4 *D)8444 },,1H')		 O	((*H((( ,*)*0T_-3H" +T

++++/
AHEE#  $D#X..!'511Dgn|~~ vvem,,w6::39EEw'0SzzS0rR   c                     || _         || _        || _        |	| _        |
| _        || _        |                     |||||d          S )zPWarning: this function is deprecated, please use :meth:`run_return_lse` instead.T)r   r   r   r   r"  r#  s               rP   forward_return_lsez5BatchDecodeWithPagedKVCacheWrapper.forward_return_lse  s`     #4' /!%%xx  
 
 	
rR   r   c                     dS )z7Warning: this function is deprecated and has no effect.NrN   r   s    rP   end_forwardz.BatchDecodeWithPagedKVCacheWrapper.end_forward  s    rR   )r   FFNNNr   N)r   r   Nr   NNNNNNTNNNF)	r   NNNr   NNNNrJ   N)!__name__
__module____qualname____doc__r   rX   rY   strrs   r   r   r   r   propertyr   r   r   rZ   r[   r	   r   rr   begin_forwardr   r$  r
   r   rV   r@  	functoolspartialmethodrun_return_lserC  rN   rR   rP   r   r   Q  sX       ? ?B  $!&9=:>@D(,A  A  %A  A  	A 
 A  !) 6A  "*%,!7A  (0'=A  A  49%A  
A  A  A  ^A F &$ & & & X& $t $ $ $ X$
&+l
JO,
	
 
 
 
0  "(+/9B:>9=7;$(&*&*!/3+/*.!&/p& p&p& p& |	p&
 p& p& p& p& p& p& "%p& eC$456p& uS%+%567p& eC$456p& E#u{"234p&  5/!p&" UO#p&$ UO%p&& 'p&( u|,)p&* 5<(+p&, #3--p&. /p&0 
1p& p& p& ^p&d	 M "(#'#'#'+/$(&*&*
 
<
 elE%,2L,MMN
 	

 %
 %
 %
 
 "%
 5/
 UO
 UO
 

 
 
 
2  $(#'#'&*&*%*%)%)  < elE%,2L,MMN
 % % % el# el# EN TN c] 
   X  $(#'#'&*&*$(%)%)0 0 0<0 elE%,2L,MMN0
 %0 %0 %0 el#0 el#0 DM0 TN0 c]0 
u|U\)	*0 0 0 X0  $(#'#'&*&* %)%)(,'(h1 h1 h1<h1 elE%,2L,MMNh1
 %h1 %h1 %h1 el#h1 el#h1 h1 TNh1 c]h1 %h1  }h1 
u|U5<#=>>	?h1 h1 h1 ^h1\ "(#'#'#'+/$(&*&*
 
<
 
 	

 %
 %
 %
 
 "%
 5/
 UO
 UO
 
u|U\)	*
 
 
 
< -Y,STBBBN     rR   r   c                   n     e Zd ZdZ	 	 ddej        dej        dej        dej        ded	ed
df fdZ xZ	S )+CUDAGraphBatchDecodeWithPagedKVCacheWrappera|  CUDAGraph-compatible Wrapper class for decode attention with paged kv-cache (first
    proposed in `vLLM <https://arxiv.org/abs/2309.06180>`_) for batch of requests.

    Note that this wrapper may not be as efficient as :class:`BatchDecodeWithPagedKVCacheWrapper`
    because we won't dispatch to different kernels for different batch sizes/sequence lengths/etc
    to accommodate the CUDAGraph requirement.

    Check :ref:`our tutorial<kv-layout>` for page table layout.

    Note
    ----
    The :meth:`plan` method could not be captured by CUDAGraph.

    See Also
    --------
    :class:`BatchDecodeWithPagedKVCacheWrapper`
    r   Fworkspace_bufferindptr_bufferindices_bufferlast_page_len_bufferr   r   rJ   Nc           	      X    t                                          ||d||||           dS )a  Constructor of :class:`BatchDecodeWithPagedKVCacheWrapper`.

        Parameters
        ----------
        workspace_buffer : torch.Tensor
            The user reserved workspace buffer on GPU used to store auxiliary data structures,
            recommended size is 128MB, the device of the workspace buffer should be the
            same as the device of the input tensors.

        indptr_buffer : torch.Tensor
            The user reserved buffer on GPU to store the indptr of the paged kv cache, should
            be large enough to store the indptr of maximum batch size (``[max_batch_size + 1]``)
            during the lifecycle of this wrapper.

        indices_buffer : torch.Tensor
            The user reserved buffer on GPU to store the page indices of the paged kv cache,
            should be large enough to store the maximum number of page indices
            (``max_num_pages``) during the lifecycle of this wrapper.

        last_page_len_buffer : torch.Tensor
            The user reserved buffer on GPU to store the number of entries in the last page,
            should be large enough to store the maximum batch size (``[max_batch_size]``)
            during the lifecycle of this wrapper.

        use_tensor_cores : bool
            Whether to use tensor cores for the computation. Will be faster for large group
            size in grouped query attention. Defaults to ``False``.

        kv_layout : str
            The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
        T)r   r   r   r   r   N)superr   )r   rQ  rR  rS  rT  r   r   	__class__s          rP   r   z4CUDAGraphBatchDecodeWithPagedKVCacheWrapper.__init__  sF    P 	-#0$2*> 	 	
 	
 	
 	
 	
rR   )r   F)
rE  rF  rG  rH  rX   rY   rI  rs   r   __classcell__)rW  s   @rP   rP  rP    s         0 !&0
 0
,0
 |0
 	0

 $l0
 0
 0
 
0
 0
 0
 0
 0
 0
 0
 0
 0
 0
rR   rP  c                      e Zd ZdZe	 	 	 	 	 d.dej        dededeej                 deej                 d	eej                 d
dfd            Z	e
d
efd            Ze
d
efd            Zdej        dej        d
dfdZe	 	 	 	 	 	 d/dej        dej        dej        dedededededee         deeej        f         deeeej        f                  dee         dee         d
dfd            Ze	 	 	 	 	 	 	 d0d ej        d!ej        d"ej        d#ej        d$ee         d%ee         d&ee         d'eej                 d(eej                 d)ed*ed
eej        eej        ej        f         f         fd+            Z ej        ed,-          ZdS )1%BatchDecodeMlaWithPagedKVCacheWrappera  Warning: this class is deprecated and will be removed in a future release.
    Please use :class:`flashinfer.mla.BatchMLAPagedAttentionWrapper` instead, which provides
    a more efficient and general MLA implementation that supports decode and incremental prefill.
    FNrd   r   r   r   r   r   rJ   c                 x   || _         |j        | _        t          j        dt          j        | j                  | _        t          j        dt          j        dd          | _        |rt          j        |          st          d          t          j        |          st          d          t          j        |          st          d          t          |          | _
        t          |          | j
        d	z   k    rt          d
          nd| _
        || _        || _        || _        || _        || _        dS )a'  Constructor of :class:`BatchDecodeWithPagedKVCacheWrapper`.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The user reserved float workspace buffer used to store intermediate attention results
            in the split-k algorithm. The recommended size is 128MB, the device of the workspace
            buffer should be the same as the device of the input tensors.

        use_cuda_graph : bool
            Whether to enable CUDAGraph for batch decode attention, if enabled, the
            auxiliary data structures will be stored as the provided buffers. The ``batch_size``
            cannot change during the lifecycle of this wrapper when CUDAGraph is enabled.

        use_tensor_cores : bool
            Whether to use tensor cores for the computation. Will be faster for large group
            size in grouped query attention. Defaults to ``False``.

        paged_kv_indptr_buffer : Optional[torch.Tensor]
            The user reserved buffer on GPU to store the indptr of the paged kv cache, the size
            of the buffer should be ``[batch_size + 1]``.
            Only needed when ``use_cuda_graph`` is ``True``.

        paged_kv_indices_buffer : Optional[torch.Tensor]
            The user reserved buffer on GPU to store the page indices of the paged kv cache,
            should be large enough to store the maximum number of page indices
            (``max_num_pages``) during the lifecycle of this wrapper.
            Only needed when ``use_cuda_graph`` is ``True``.

        paged_kv_last_page_len_buffer : Optional[torch.Tensor]
            The user reserved buffer on GPU to store the number of entries in the last page, the
            size of the buffer should be ``[batch_size]``.
            Only needed when ``use_cuda_graph`` is ``True``.
        r   r   Tr   r   r   r   r   r   r   r   N)r   r   rX   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   rd   r   r   r   r   r   s          rP   r   z.BatchDecodeMlaWithPagedKVCacheWrapper.__init__  sl   X (>$,3%*[ek$+&
 &
 &
" 16+	1
 1
 1
-  	'?#9::  W   ?#:;;  X   ?#@AA  ^   &))F%G%GD")**d.Dq.HHH Q   I
 &'D"!1$:!%<"+H(-rR   c                     | j         S rT   r   r   s    rP   r   z;BatchDecodeMlaWithPagedKVCacheWrapper.is_cuda_graph_enabledT  r   rR   c                     | j         S rT   r   r   s    rP   r   z6BatchDecodeMlaWithPagedKVCacheWrapper.use_tensor_coresX  r   rR   re   c                     || _         || _        t          j        | j        j        | j        j        dd          | _        dS r   r   r   s      rP   r   z<BatchDecodeMlaWithPagedKVCacheWrapper.reset_workspace_buffer\  r   rR   r   r   r   r   r   r   head_dim_compressed_kvr   rG   rE   rF   r   r   rH   rI   c                    t          |          }|	d}	| j        r|| j        k    r(t          d                    || j                            t          |          t          | j                  k    rt          d          | j                            |           || j        dt          |          <   | j                            |           n]|	                    | j
                  | _        |	                    | j
                  | _        |	                    | j
                  | _        t          |
          }
|s|
}t          |          }|	                    d          }t          ||
||j        |||dk    |	dk    | j        	  	        | _        | j                            | j        | j        | j        ||||| j                  | _        || _        || _        |	| _        || _        || _        dS )a  Plan batch decode for given problem specification.

        Parameters
        ----------
        indptr : torch.Tensor
            The indptr of the paged kv cache, shape: ``[batch_size + 1]``, dtype: ``torch.int32``
        indices : torch.Tensor
            The page indices of the paged kv cache, shape: ``[qo_indptr[-1]]``, dtype: ``torch.int32``
        last_page_len : torch.Tensor
            The number of entries in the last page of each request in the paged kv
            cache, shape: ``[batch_size]``, dtype: ``torch.int32``
        num_qo_heads : int
            The number of query/output heads
        head_dim_compressed_kv : int
            The dimension of the compressed kv, is also kv_lora_rank
        page_size : int
            The page size of the paged kv cache
        sm_scale : float
            The scale of softmax, should be ``1 / sqrt(qk_nope_head_dim + qk_rope_head_dim)``
        window_left : int
            The left (inclusive) window size for the attention window, when set to ``-1``, the window
            size will be set to the full length of the sequence. Defaults to ``-1``.
        logits_soft_cap : Optional[float]
            The attention logits soft capping value (used in Gemini, Grok and Gemma-2, etc.), if not
            provided, will be set to ``0``. If greater than 0, the logits will be capped according to
            formula:
            :math:`\texttt{logits_soft_cap} \times \mathrm{tanh}(x / \texttt{logits_soft_cap})`,
            where :math:`x` is the input logits.
        data_type : Union[str, torch.dtype]
            The data type of the paged kv cache. Defaults to ``float16``.
        q_data_type : Optional[Union[str, torch.dtype]]
            The data type of the query tensor. If None, will be set to
            ``data_type``. Defaults to ``None``.

        Note
        ----
        The :meth:`plan` method should be called before any :meth:`run` or
        :meth:`run_return_lse` calls, auxiliary data structures will be created
        during this call and cached for multiple run calls.
        Nr   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r-   r   r   r   r  rr   r   r   r   r  r  r  r  r  r  )r   r   r   r   r   r_  r   rG   rE   rF   r   r   rH   rI   r  r  s                   rP   rr   z*BatchDecodeMlaWithPagedKVCacheWrapper.plant  s   r ''
"!O% 	MT333 NNTf"D$:O O   7||c$"<==== ^   %++F3339@D&~W~6,22=AAAA(.		$+(>(>D%)0DK)@)@D&/</?/?/L/LD,,Y77	 	$#K.{;;ii&&9L"2a"

 

 -22(&1&	
 	
 "' /%%rR   q_nopeq_pepaged_ckv_cachepaged_kpe_cacher   r   r   r   r   r   rl   c                    t          |j                  \  }}|dz  |z   }|dk    rt          d| d          | j        }| j        }| j        }| j        }| j        }|d}|||z  }|||z  }|d}|d}| j        }|t          j	        ||	          }n"t          ||j        |j        |j        d
           |
r|	Jt          j        |                    d          |                    d          ft          j        |          }	nEt          |	|                    d          |                    d          f|j        |j        d           | j                            | j        | j        | j        ||||| j        | j        | j        |||||||	|           |
r||	gn|g}||dxx         |z  cc<   |
rt1          |          n|d         S )a  Compute batch decode attention between query and paged kv cache.

        Parameters
        ----------
        q_nope : torch.Tensor
            The query tensor not related to ROPE, shape: ``[batch_size, num_qo_heads, head_dim_ckv]``
        q_pe : torch.Tensor
            The query tensor related to ROPE, shape: ``[batch_size, num_qo_heads, head_dim_kpe]``
        paged_ckv_cache : torch.Tensor
            The paged compressed-KV-Cache stored as a single tensor:
            * 3-D tensors, each with shape: ``[max_num_pages, page_size, head_dim_ckv]``.
        paged_kpe_cache : torch.Tensor
            The paged k-pe-Cache stored as a single tensor:
            * 3-D tensors, each with shape: ``[max_num_pages, page_size, head_dim_kpe]``.
        q_scale : Optional[float]
            The calibration scale of query for fp8 input, if not provided, will be set to ``1.0``.
        k_scale : Optional[float]
            The calibration scale of key for fp8 input, if not provided, will be set to ``1.0``.
        v_scale : Optional[float]
            The calibration scale of value for fp8 input, if not provided, will be set to ``1.0``.
        out : Optional[torch.Tensor]
            The output tensor, if not provided, will be allocated internally.
        lse : Optional[torch.Tensor]
            The log-sum-exp of attention logits, if not provided, will be allocated internally.
        return_lse : bool
            Whether to return the logsumexp of attention scores, defaults to ``False``.
        enable_pdl : bool
            Whether to enable Programmatic Dependent Launch (PDL). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#programmatic-dependent-launch-and-synchronization
            Only supported for >= sm90, and currently only for FA2 and CUDA core decode.
        Returns
        -------
        Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
            If :attr:`return_lse` is ``False``, the attention output, shape: ``[batch_size, num_qo_heads, head_dim]``.
            If :attr:`return_lse` is ``True``, a tuple of two tensors:

            * attention output, shape: ``[batch_size, num_qo_heads, head_dim]``
            * logsumexp of attention scores, shape: ``[batch_size, num_qo_heads]``.
        
   P   z2MLA decode kernel is not supported on this GPU (SMz ). Supported architecture: SM80.Nr   rM   r   )r   r   r   r   r   r   )r6   r   r7   r  r  r  r  r  rX   r   r(   r   r   r   r   r   r  rV   r   r   r  r   r   r   tuple)r   ra  rb  rc  rd  r   r   r   r   r   r   rl   majorminordevice_archrE   rF   rG   rH   rI   r   s                        rP   rV   z)BatchDecodeMlaWithPagedKVCacheWrapper.run  sC   n .fm<<ubj5("&0[ 0 0 0   '/>%
%
"!OHHJJ;"6&999CC$V\6<    	{k[[^^V[[^^4-!   )[[^^V[[^^4LM   	(&O%&,%	
 	
 	
( '1sCjjSEFFFgFFF'3uSzzzSV3rR   TrA  )FFNNN)r   Nr   NNN)NNNNNFF)rE  rF  rG  rH  r   rX   rY   rs   r   r   rJ  r   r   r   rZ   r[   r	   rI  r   rr   r   rV   rL  rM  rN  rN   rR   rP   rZ  rZ    sS        
   %!&9=:>@DP. P. %P. P. 	P.
 !) 6P. "*%,!7P. (0'=P. 
P. P. P. ^P.d $t $ $ $ X$ &$ & & & X&
&+l
JO,
	
 
 
 
0  +/-69=&*&*q& q&q& q& |	q&
 q& !$q& q& q& q& "%q& ek)*q& eC$456q& UOq& UOq& 
q& q& q& ^q&f  $(#'#'&*&*  |4 |4|4 l|4 	|4
 |4 %|4 %|4 %|4 el#|4 el#|4 |4 |4 
u|U5<#=>>	?|4 |4 |4 ^|4| -Y,STBBBNNNrR   rZ  c                      e Zd ZddZ	 	 	 	 ddej        dej        dej        dej        d	ej        d
ej        dedeeej        f         deeej        f         dedede	de
ej                 de
ej                 dej        fdZd ZdS )TrtllmGenDecodeModulerJ   Nc                     d | _         t                      | _        | j                                        | _        ddlm}  || j                                                   d S )Nr   )r   )	_sm_countr   _modrW   _opflashinfer.jit.cubin_loaderr   r|   )r   r   s     rP   r   zTrtllmGenDecodeModule.__init__k  sb    (,.00	9++--BBBBBB49557788888rR   r   queryr6  r7  rQ  r   r   max_seq_len
bmm1_scale
bmm2_scaleworkspace_sizerE   rl   r   r(  c                    |t          j        |          }| j        t          |j                  | _        t          |t           j                  r!|j        t           j        k    sJ |t          z  }t          |	t           j                  r|	j        t           j        k    sJ t          |                                          dk    sJ |                    d          }|                    d          }|                    dd          }| j                            |d ||||||||||	ddd||d| j        ||
|d            |S )N   r   r   r   )rX   r   ro  r0   r   r5  rY   r   r   r    r   r   flattenrq  trtllm_paged_attention_decode)r   rs  r6  r7  rQ  r   r   rt  ru  rv  rw  rE   rl   r   r(  r  	max_q_lens                    rP   
_paged_runz TrtllmGenDecodeModule._paged_runs  sJ   " ;"5))C>!0>>DNj%,// 	,#u}4444#e+Jj%,// 	5#u}44445::<<  A%%%%ZZ]]
JJqMM	a##..N/	
 	
 	
2 
rR   c                     d S rT   rN   )r   r\   kwargss      rP   _planzTrtllmGenDecodeModule._plan  s    rR   rD  )r   NNN)rE  rF  rG  r   rX   rY   rZ   r	   r[   rs   r   r}  r  rN   rR   rP   rm  rm  j  s'       9 9 9 9( &*(,: :|: : 	:
  ,: l: ,: : %-.: %-.: : : : el#: %:  
!: : : :x    rR   rm  c            R      	   t          dg| R  }t                      t          d| dd          	 	 	 	 	 	 	 	 d2dt          j        dt          j        dt
          t                   d	t          j        d
t          j        dt          j        dt          j        dt          j        dt          j        dt          j        dt          j        dt          t          j                 dt          dt          dt          dt          dt          t          j                 dt          t          j                 dt          t          j                 dt          t          j                 dt          t          j                 dt          t          j                 dt          dt          dt          t          j                 dt          t          j                 d t          t          j                 d!t          d"t          d#t          d$t          d%t          t          j                 d&t          t                   d't          t                   d(t          t          j                 d)t          t          j                 d*t          t                   d+t          t                   d,t          t          j                 d-d fPfd.            }t          d| d/          	 	 	 	 	 	 	 	 d2dt          j        dt          j        dt
          t                   d	t          j        d
t          j        dt          j        dt          j        dt          j        dt          j        dt          j        dt          j        dt          t          j                 dt          dt          dt          dt          dt          t          j                 dt          t          j                 dt          t          j                 dt          t          j                 dt          t          j                 dt          t          j                 dt          dt          d!t          d"t          d#t          d%t          t          j                 d&t          t                   d't          t                   d(t          t          j                 d)t          t          j                 d*t          t                   d+t          t                   d,t          t          j                 d-d fHd0            }t          j        |1          S )3Nr   r9   _ragged_run)rd   re   r<   rB   r=   rd   re   rh   r?   rf   rg   	qo_indptrri   rj   rk   r<   rB   	mask_modelayoutrE   rl   maybe_custom_maskmaybe_mask_indptrmaybe_alibi_slopesmaybe_prefix_len_ptrmaybe_token_pos_in_items_ptrmaybe_max_item_len_ptrrF   rG   scale_qscale_kscale_vrH   rI   token_pos_in_items_lenrw  r   r   r   r   kv_lens_bufferr   
max_kv_lenr(  rJ   c'                     |J |J | J |!J |"J |#J |$J |%J |J |dk    s
J d            '                     |                                ||| |"|#|%|d||||
|&          }
d S )Nr   z%workspace_size must be greater than 0rM   )r   r(  )r}  
contiguous)(rd   re   rh   r?   rf   rg   r  ri   rj   rk   r<   rB   r  r  rE   rl   r  r  r  r  r  r  rF   rG   r  r  r  rH   rI   r  rw  r   r   r   r   r  r   r  r(  r^   s(                                          rP   r4  z/get_trtllm_gen_decode_module.<locals>.paged_run  s    d    )))''''''''')))$$$%%%%%%!!!#J!!!LLNN"  
 
rR   r}  c#                     d S rT   rN   )#rd   re   rh   r?   rf   rg   r  ri   rj   rk   r<   rB   r  r  rE   rl   r  r  r  r  r  r  rF   rG   rH   rI   r  r   r   r   r   r  r   r  r(  s#                                      rP   _fake_paged_runz5get_trtllm_gen_decode_module.<locals>._fake_paged_run  s
    L 	rR   )rr   r4  )NNNNNNNN)r   rm  r2   rX   rY   r   rZ   r   rs   r[   r3   r   r  )r\   r]   r4  r  r^   s       @rP   r  r    sC   

4t
4
4
4C"$$F's'''
  R 26&*&*/315#'$((,OB
 B
 %B
#lB
 CyB
 <	B

 |B
 |B
 <B
 B
  ,B
 !&B
 <B
 EL)B
 B
 B
 B
  !B
" $EL1#B
$ $EL1%B
& %U\2'B
( 'u|4)B
* '/u|&<+B
, !) 6-B
. /B
0 1B
2 %,'3B
4 %,'5B
6 %,'7B
8 9B
: ;B
< !$=B
> ?B
@ !.AB
B smCB
D smEB
F u|,GB
H !.IB
J C=KB
L SMMB
N %OB
P 
QB
 B
 B
 B
 B
 B
H 4S44455: 26&*&*/315#'$((,G% % %%#l% Cy% <	%
 |% |% <% %  ,% !&% <% EL)% % % %  !%" $EL1#%$ $EL1%%& %U\2'%( 'u|4)%* '/u|&<+%, !) 6-%. /%0 1%2 3%4 5%6 !$7%8 !.9%: sm;%< sm=%> u|,?%@ !.A%B C=C%D SME%F %G%H 
I% % % 65%V \   rR   rM   HNDr   rs  kv_cacherQ  r   r   rt  ru  rv  r   r8  
o_sf_scaleo_sf_vec_sizer(  rl   r   r)  o_scalemaskr|  cum_seq_lens_qc                    |t          | j                  n|}t          |t                    r|\  }}nJ|j        d         dk    r||}}n4|j        d         dk    s
J d            |                    d          \  }}|dk    r"t          | j                  d         dk    rd	nd
}|d
k    r|
dk    s|
$t          |	t                    rt          d          ||t          d          ||t          d          |
|	|	j	        n| j	        }
|	t          j        | |
          }	t          d5i d| d||fd|d|d|d|d|d|d|d|	d|d|d|d|d|d|S |d	k    rr|d k    r,|                    d!d"          }|                    d!d"          }t                      j        }t!          | j                  }|
dk    s|
Jt          |	t                    r4| j	        t          j        k    s
J d#            |J |d$v s
J d%            |pd&}| j        dd'         t%          | j        d'         d          fz   }t          |	t                    rX|	j        j        d         t)          | j        d         | j        d         z  |z  d(          f}|	j        }|	j        }|	j        }	|
pd}
n|	t)          | j        d         d)          t)          | j        d         | j        d         z  |z  d(          f}t          j        |t          j        | j        *          }d}t          j        |t          j        | j        *          }	nt          d+|	           |
dk    sJ t          |	t          j                  sJ t5          |	|t          j        | j        d           t5          ||t          j        | j        d,           |dk     s||	j        d         z   |j        d         k    r.t          d-| d.|	j        d          d/|j        d                    nt          |
t          j	                  s|
|J |J d}d}|
|	|	j	        n| j	        }
|	|	nt          j        | |
          }	|
| j	        t          j        t          j        fvrt          d0|
           t5          |	| j        |
| j        d           nt          d1|
           t          |t          j                  r!|j	        t          j        k    sJ |t<          z  }t          |t          j                  r|j	        t          j        k    sJ ||}|                     d          |z  }n|J |                    d          dz
  } ||	|| ||||||||||pd2|pd'|||d|||                                 |!                                z  ||           |
dk    r|	nt          |	||| j                  S tE          d3| d4          )6a  
    Parameters
    ----------
    query : torch.Tensor
        query tensor with shape [num_tokens, num_heads, head_dim], num_tokens = total query tokens in the batch.

    kv_cache : Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
        If kv_cache is a single tensor, it should be a tensor with shape [num_pages, 1 or 2, num_kv_heads, page_size, head_dim] if :attr:`kv_layout` is ``HND``,
        or [num_pages, 1 or 2, page_size, num_kv_heads, head_dim] if :attr:`kv_layout` is ``NHD``.
        If kv_cache is a tuple of two tensors, it should be a tuple of two tensors with shape [num_pages, num_kv_heads, page_size, head_dim] if :attr:`kv_layout` is ``HND``,
        or [num_pages, page_size, num_kv_heads, head_dim] if :attr:`kv_layout` is ``NHD``.
        The first tensor is the key cache, and the second tensor is the value cache.

    workspace_buffer : torch.Tensor. Must be initialized to 0 for its first use.
        workspace

    block_tables : torch.Tensor
        page_table of kv cache, [batch_size, num_pages]

    seq_lens : torch.Tensor
        A uint32 1D tensor indicating the kv sequence length of each prompt. shape: ``[batch_size]``

    max_seq_len : int
        max sequence length for kv_cache

    bmm1_scale : Union[float, torch.Tensor]
        fused scale for bmm1 input.
        when using trtllm-gen backend, it can be a torch.Tensor with dtype torch.float32.

    bmm2_scale : Union[float, torch.Tensor]
        fused scale for bmm2 input.
        when using trtllm-gen backend, it can be a torch.Tensor with dtype torch.float32.

    window_left : int = -1
        The left (inclusive) window size for the attention window, when set to ``-1``, the window
        size will be set to the full length of the sequence. Defaults to ``-1``.

    out :  Optional[Union[torch.Tensor, FP4Tensor]] = None
        output tensor, if not provided, will be allocated with ``out_dtype``, if ``out_dtype`` is not provided, will use the type of ``query``.

    out_dtype : Optional[Union[torch.dtype, str]] = None
        output dtype, if not provided, will use the type of ``out``. For nvfp4, use string ``nvfp4``.

    o_sf_scale : Optional[float] = None
        scale for nvfp4 output tensor scale factor.

    o_sf_vec_size : Optional[int] = None
        vector size for nvfp4 output tensor scale factor.

    sinks : Optional[List[torch.Tensor]] = None
        additional value per head in the denominator of the softmax.

    kv_layout : str = "HND"
        The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
        Defaults to ``HND``.

    enable_pdl : Optional[bool] = None
        Whether to enable Programmatic Dependent Launch (PDL). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#programmatic-dependent-launch-and-synchronization
        When set to ``None``, the backend will be chosen based on the device architecture and kernel availability.

    backend : str = "auto"
        The implementation backend, could be ``auto``/``xqa`` or ``trtllm-gen``. Defaults to ``auto``.
        When set to ``auto``, the backend will be chosen based on the device architecture and kernel availability.
        For sm_100 and sm_103 (blackwell architecture), ``auto`` will choose ``trtllm-gen`` backend.
        For sm_90 (hopper architecture) and sm_120 (blackwell architecture), ``auto`` will choose ``xqa`` backend.

    o_scale : Optional[float] = 1.0
        output scale factor for xqa fp8 output.

    mask : Optional[torch.Tensor] = None
        causal attention mask for xqa speculative decoding.

    max_q_len: Optional[int] = None
        The maximum query sequence length across all requests when using variable-length queries.
        Only supported by trtllm-gen backend. Must be provided together with ``cum_seq_lens_q``.
        When None, all requests use uniform query length specified by ``q_len_per_req``.

    cum_seq_lens_q : Optional[torch.Tensor] = None
        Cumulative query sequence lengths for variable-length query support, shape: ``[batch_size + 1]``, dtype: ``torch.int32``.
        Only supported by trtllm-gen backend. Must be provided together with ``max_q_len``.
        When None, all requests use uniform query length specified by ``q_len_per_req``.

    Returns
    -------
    out : Union[torch.Tensor, FP4Tensor]
        output torch.Tensor or FP4Tensor.
    Nr   r+  EWhen kv_cache is a single tensor, the second dimension must be 1 or 2dimr   r   rf  r   r   nvfp4z)xqa backend does not support nvfp4 outputz8xqa backend does not support o_sf_scale or o_sf_vec_sizez+xqa backend does not support cum_seq_lens_qr   rs  r  rQ  r   r   rt  ru  rv  rE   r   r(  r   rl   r)  r  r  r   r,  r-  z*query must be fp8 when out_dtype is nvfp4.)N   z$only o_sf_vec_size = 16 is supportedr  r   ry     r   zInvalid out: out_scale_factorzQo_sf_start_index is out of the valid range of out_scale_factor. o_sf_start_index=z, out.shape[0]=z, out_scale_factor.shape[0]=zUnsupported out_dtype: zInvalid out_dtype: g      zBackend z not supportedrN   )#r/   r   r5  rh  r   unbindr6   r!   r   r   rX   r   xqa_batch_decode_with_kv_cacher/  r~   r{  r0   float8_e4m3fnr4   scaler5   scale_start_indexdatar   r   rY   r(   r   bfloat16r   r    r   r   r   KeyError)rs  r  rQ  r   r   rt  ru  rv  rE   r   r8  r  r  r(  r   rl   r   r)  r  r  r|  r  r6  r7  rO   sm_countfp4_out_shapefp4_out_scale_shaper  o_sf_start_indexr  s                                  rP   !trtllm_batch_decode_with_kv_cacher  7  s   ` 6@5G#EL111ZJ(E"" 6#>!!!'WGG>!$)))W *))
  (155GW&25<@@CrIILLu 	 %I$5*S):T:T$5HIII!]%>WXXX N$>JKKK %(_		%+I;"5	:::C . 
 
 
%
w''
 .-
 &	

 X
 $
 "z
 "z
 $
 
 %
  i
 "z
 (-
 G
  !
 	
$ 
L	 	 ''B//G''B//G-//M&u|44I$5*S):T:T$5;%"5555< 655 ))) J...0V...)/RM!K,R!0L0L/NNM#y)) 8IOA&U[^ek!n<MqQQ'# $'9 #&#8 h%0		U[^S11U[^ek!n<MqQQ'# $);'u/B5<$ $ $  $% k-u{5<XXX !6!6!6777''''c5<00000 %]EKu   % ##"   !1$$#cil25E5KA5NNN M(8M MILSTM M1A1G1JM M   O 	5;// 	@93D%%% (((#  ),CIIek	##e.>uI.V.V.VCemU^ LLL !F9!F!FGGG$S%+y%,PUVVVV>9>>???j%,// 	,#u}4444#e+Jj%,// 	5#u}4444$%IA-7JJ(((',,Q//!3J$R""$$'7'D'D'F'FF/	
 	
 	
8 G## C3 02BEKPP	
 9'999:::rR   c                     |t          | j                  n|}t          |t                    r|\  }}nJ|j        d         dk    r||}}n4|j        d         dk    s
J d            |                    d          \  }}t          | j                  }|dk    r(|j        d         }|j        d         }|j        d         }n'|j        d         }|j        d         }|j        d         }|                    t          j	                  }|dd         }|dd         }||z  }||z  |d	z  z  }|dk    r>| j        d
         |z  }|                     ||| j        d         | j        d                   } | 
                    d          }|
                    d          }|
|
                    |d          nd}|	t          j        |           }	|	
                    d          }t          ||||||||||||||d
k    r|dz   nd
|||d|z  ||           |	S )ar	  
    Parameters
    ----------
    query : torch.Tensor
        query tensor with shape [num_tokens, num_heads, head_dim], num_tokens = batch_size * q_len_per_request

    kv_cache : Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
        If kv_cache is a single tensor, it should be a tensor with shape [num_pages, 1 or 2, page_size, num_kv_heads, head_dim] if :attr:`kv_layout` is ``NHD``,
        or [num_pages, 1 or 2, num_kv_heads, page_size, head_dim] if :attr:`kv_layout` is ``HND``.
        If kv_cache is a tuple of two tensors, it should be a tuple of two tensors with shape [num_pages, page_size, num_kv_heads, head_dim] if :attr:`kv_layout` is ``NHD``,
        or [num_pages, num_kv_heads, page_size, head_dim] if :attr:`kv_layout` is ``HND``.

    workspace_buffer : torch.Tensor. Must be initialized to 0 for its first use.
        workspace

    block_tables : torch.Tensor
        page_table of kv cache, [batch_size, num_pages]

    seq_lens : torch.Tensor
        A uint32 1D tensor indicating the kv sequence length of each prompt. shape: ``[batch_size]``

    max_seq_len : int
        max sequence length for kv_cache

    bmm1_scale : Union[float, torch.Tensor]
        fused scale for bmm1 input.

    bmm2_scale : Union[float, torch.Tensor]
        fused scale for bmm2 input.

    window_left : int = -1
        The left (inclusive) window size for the attention window, when set to ``-1``, the window
        size will be set to the full length of the sequence. Defaults to ``-1``.

    out :  Optional[torch.Tensor] = None
        output tensor, if not provided, will be allocated with ``query.dtype``.

    sinks : Optional[torch.Tensor] = None
        additional value per head in the denominator of the softmax.

    kv_layout : str
        The layout of the kv cache. Can be either ``NHD`` or ``HND``. Defaults to ``NHD``.

    enable_pdl : bool
        Whether to enable Programmatic Dependent Launch (PDL). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#programmatic-dependent-launch-and-synchronization
        Only supported for >= sm90, and currently only for FA2, CUDA core, and trtllm-gen decode.

    o_scale : Optional[float] = 1.0
        output scale factor for fp8 output.

    mask : Optional[torch.Tensor] = None
        causal attention mask for xqa speculative decoding.

    Returns
    -------
    out : torch.Tensor
        output torch.Tensor.
    Nr   r+  r  r  r   r.  r   g      ?r   r   rM   )
r(  r   kv_scalesliding_win_sizer   r  rl   rcp_out_scale	q_seq_lenr  )r/   r   r5  rh  r   r  r0   r1  rX   r   r   reshaper   r   ) rs  r  rQ  r   r   rt  ru  rv  rE   r   r(  r   rl   r)  r  r  r6  r7  r  r   r   r   workspace_u8	semaphorescratchkv_scale_valueq_scale_valuer  	query_newseq_lens_new	sinks_newout_4ds                                    rP   r  r  b	  sj   Z 6@5G#EL111ZJ(E"" 6#>!!!'WGG>!$)))W *))
  (155GW"5<00H EM!$	}Q'=# }Q'M!$	=##((55L../I?,,-G')N/8S=AMq[^}4


:}ek!nekRSnUU""I%%a((L383DlB///$I {u%%]]1F,71,<,<q!Gm)   . JrR   r   r   r   r   r   r   r   r   r   r   r   r   r   global_override_indptr_cpuc                    t          |          }|
d}
|	||}||}n|d}||}| j        rt          |dz   d          }|d}| j        rh|| j        k    r(t          d                    || j                            t          |          t          | j                  k    rt          d          n=|| _        || _        || _	        | j        r!|
                    | j        |	          | _        t          j        d
t          |t                     rt#          t          |          n|| j                  }t          j        d
t          |t                     rt#          t          |          n|| j                  }||n|                                }t          j                            | j                  5  | j        r|dk    r#t          j        |ft          j        d          }n|                                }t-          |||          }	 | j        | j        | j        ||||||||| j        ||d|	g}| j        dk    r?|                    |           |                    |           |                    d
            | j        j        | | _        n# t>          $ r}tA          d|           |d}~ww xY w	 | j                            | j        | j        | j        |||||| j        |	|
||||          | _        n%# t>          $ r}tA          d|           |d}~ww xY wddd           n# 1 swxY w Y   || _!        |	| _"        |
| _#        || _$        || _%        || _&        dS )a  
    A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for FlashInferMultiStepDraftBackend.
    Modifications:
    - Remove unnecessary device-to-device copy for the cuda graph buffers.
    - Remove unnecessary host-to-device copy for the metadata buffers.
    Nr   r   r   r   r   r   r   r   r   r   Fr   zError in standard plan: )'r   r   r+   r   r   r   r   r   r   r   r   r   r   rX   r   r5  rI  r0  r   cudaonesr   r   r   r   r   r   r  r  rr   r  	ExceptionRuntimeErrorr  r  r  r  r  r  )r   r   r   r   r   r   r   r   r   rE   rF   r   r   r   rG   rH   rI   r   r   r   r  r  r  empty_q_dataempty_kv_cacher  r  r  r\   es                                 rP   fast_decode_planr  	  s   : ]##J #K$L		" "'
Q>>#!! ///JJP& 6K K   w<<#d89999Z   :
 %+!%,"+8(  	"0"3"3, #4 # #D
 ;	+5k3+G+GXGE;'''[{  L [	 ,,,GE<((({  N &1 	#"ZZ\\  
		4;	'	' @J @J  ?	JA~~ &+ZMU& & &"" &3%6%6%8%8"+K9KYWWJ 0.9"$  .!$ =E))KK 0111KK 0111KKNNN":$"5":#  J J J"#Aa#A#ABBIJJ"&"5":":0.9  .# "# #"  J J J"#Aa#A#ABBIJ@J @J @J @J @J @J @J @J @J @J @J @J @J @J @JD 0D#D+DDN!D!Ds]   9AL.BJL.
J7J22J77L.;AK=<L.=
LLLL..L25L2)r   r   FNNNr   NNNNF)r   r   FNNNr   NNNNT)rM   rM   r   NNNNNr  Nr   r   rM   NNN)
rM   rM   r   NNr   Nr   rM   N)r   r   NNNNNNNTNFN)WrH  rL  r   typesr   typingr   r   r   r   r   r	   r
   rX   api_loggingr   mlar   r   r   r   cudnnr   jitr   r   r   r   r   r   r   r   r   r   pager   prefillr   r   r   utilsr    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   cacher_   rI  ru   rz   r~   rY   rZ   rs   r   r   r[   r   r   rP  rZ  rm  r  r   r  r  r  rN   rR   rP   <module>r     s          ! ! ! ! ! ! G G G G G G G G G G G G G G G G G G  ' ' ' ' ' '        ) ( ( ( ( ( ( ( W W W W W W                                      
                                                   8 :2 :2 :2z IS Ic I I I IX ] ] ]@       | | |	      > ? ? ? 

 #"###'+ $"&"&!& || | 	
   e_ e_ e_  e_ uo     \!   
& 

 #"###'+ $"&"& $, ,|,|, |, 	,
 , , e_, e_, e_, , e_, uo, , , ,  5<%&!, , , 
,& 
 #"###'+ $"&"&} }|}|} |} 	}
 } } e_} e_} e_} } e_} uo} } } }  5<u|U\9::;!} } } }@a a a a a a a aHC
 C
 C
 C
 C
2T C
 C
 C
LlC lC lC lC lC lC lC lC^F F F F F F F FR @ @ @F  .1-04837"&#'*.!%#$"#'#-1-f; f;<f;EL%el(B"CCDf; lf; ,	f;
 lf; f; eU\)*f; eU\)*f; f; 
%i/0	1f; ek3./0f; f; C=f; D&'f; f;  !f;" #f;$ C=%f;& e_'f;( 5<
 )f;* }+f;, U\*-f;. 5<"#/f; f; f; f;T	  .1-0"&$(#$"#'!R R<REL%el(B"CCDR lR ,	R
 lR R eU\)*R eU\)*R R 
%,	R EL!R R R C=R e_R  5<
 !R" \#R R R R| $'+596:37 $"&"&&*"9=+h" h"Lh" \h" <	h"
 h" h" h" h" h" h" e_h" %U[ 012h" 5ek!123h" c5;./0h" uoh"  !h"" #h"$ %h"& sm'h"( )h"* !) 6+h", 
-h" h" h" h" h" h"rR   