
    )`iA}              $       8   d Z ddlZddlmZmZmZmZmZmZ ddl	Z	ddl
mZ ddlmZmZmZ ddlmZ ddlmZmZmZmZmZmZmZ dd	lmZ d
 Zd Zej        d             Zej        d             Z ej        d             Z! G d d          Z"e	 	 	 	 	 	 	 d&de	j#        de	j#        de	j#        de$de$de$de	j#        de	j#        de$de$dee	j#                 dee%e	j#        f         dee%e	j#        f         d eee	j#                          d!e&d"e'd#e	j#        f"d$            Z(e	 	 	 	 	 d'de	j#        de	j#        de	j#        de$de$de$de	j#        de	j#        de$dee	j#                 dee%e	j#        f         dee%e	j#        f         d eee	j#                          d!e&d#e	j#        fd%            Z)dS )(a3  
Copyright (c) 2023 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)ListLiteralOptionalTupleUnionoverload   )flashinfer_api)gen_batch_mla_modulegen_trtllm_gen_fmha_modulesetup_cubin_loader)gen_mla_module)MaskModecheck_shape_dtype_devicedetermine_mla_backenddevice_support_pdlget_compute_capabilityget_device_sm_countlog2e)xqa_mlac                 d   | j         dk    rt          d| j                    |j         dk    rt          d|j                    |j         dk    rt          d|j                    |j         dk    rt          d|j                    | j        \  }}}|j        d         }|dk    rt          d	|           ||k    s|d
k    rt          d| d|           |j        \  }}	|j        d         }
||k    rt          d| d| d|           |	d|
z  z  dk    rt          d|	d|
          d S )N   z"Expected q_nope_pe.ndim == 3, got z&Expected ckv_kpe_cache.ndim == 3, got r	   zExpected kv_len.ndim == 1, got    z#Expected page_table.ndim == 2, got    z&Expected 128 heads for q_nope_pe, got @  z;Expected head dim 576 for q_nope_pe and ckv_kpe_cache, got  and Expected batch size z$ for q_nope_pe and block_table, got r   <Expected block_num % (128 / block_size) == 0, got block_num= and block_size=)ndim
ValueErrorshape)	q_nope_peckv_kpe_cachekv_len
page_tableB_qHD_qD_ckvB_block_table	block_num
block_sizes              b/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/mla.py_check_cutlass_shaper/   %   s   ~NinNNOOOQV-BTVVWWW{aH6;HHIII!PzPPQQQ/KCC"ECxxE!EEFFF
e||sczz[#[[TY[[
 
 	
  */M9$Q'J
me3eeCeeVcee
 
 	
 C*$%**___R\__
 
 	
 +*    c                    | j         dk    rt          d| j                    |j         dk    r|                    d          }n"|j         dk    rt          d|j                    |dk    rt          d|           |dk    rt          d	|           |d
k    rt          d|           | j        \  }}	}
}|j        d         }||k    s|dk    rt          d| d|           |dk    r#|j        }|||	|fk    rt          d|           nK|j        \  }}|}||k    rt          d| d| d|           |d|z  z  dk    rt          d|d|          |S )N   zExpected query.ndim == 4, got r   r	   z&Expected kv_cache.ndim == 3 or 4, got r   z&Expected qk_nope_head_dim == 128, got i   z"Expected kv_lora_rank == 512, got @   z%Expected qk_rope_head_dim == 64, got r   z2Expected head dim 576 for query and kv_cache, got r   r   zAExpected page_table.shape == (B_q, Q_len, sparse_mla_top_k), got r   z  for query and block_table, got r   r   )r    r!   	unsqueezer"   )querykv_cacheqk_nope_head_dimkv_lora_rankqk_rope_head_dimsparse_mla_top_kr&   	page_sizer'   Q_lenr(   r)   r*   page_table_shaper+   r,   r-   s                    r.   _check_trtllm_gen_mla_shaper>   B   s    zQF%*FFGGG }%%a((	!		Q(-QQRRR3TBRTTUUUsLlLLMMM2SAQSSTTTC3N1E e||sczzRRR5RR
 
 	
 !%+U,<===fTdff   >
 $.#3 y
-eseeCeeVcee   j()Q..cYccV`cc   Or0   c                      t                      } |                                 }t          |                                            |S N)r   build_and_loadr   get_library_path)modops     r.   get_trtllm_gen_fmha_modulerE   |   s=    
$
&
&C					Bs++--...Ir0   c                  B    t                                                      S r@   )r   rA    r0   r.   get_mla_modulerH      s    **,,,r0   c                 @    t          | g|R                                  S r@   )r   rA   )backendargss     r.   get_batch_mla_modulerL      s$    /$///>>@@@r0   c                      e Zd ZdZe	 	 	 	 	 	 d'dej        dedeej                 deej                 d	eej                 d
eej                 de	ddfd            Z
e	 d(dej        dej        d	ej        d
ej        dededededededej        dej        deddfd            Ze	 	 	 	 	 	 	 d)dej        dej        dej        dej        deej                 deej                 ded         deej                 d eej                 d!eej                 d"edej        fd#            Ze	 	 	 	 	 	 	 d*dej        dej        dej        dej        deej                 deej                 ded$         deej                 d eej                 d!eej                 d"edeej        ej        f         fd%            Ze	 	 	 	 	 	 	 d)dej        dej        dej        dej        deej                 deej                 dedeej                 d eej                 d!eej                 d"edeej        eej        ej        f         f         fd&            ZdS )+BatchMLAPagedAttentionWrappera
  Wrapper class for MLA (`Multi-head Latent Attention <https://arxiv.org/abs/2405.04434>`_)
    PagedAttention on DeepSeek models. This kernel can be used in decode, and incremental prefill
    and should be used together with `Matrix Absorption trick
    <https://github.com/madsys-dev/deepseekv2-profile/blob/main/workspace/blog/optimizing-mla.md>`_:
    where :math:`W_{UQ}` is absorbed with :math:`W_{UK}`, and :math:`W_{UV}` is
    absorbed with :math:`W_{O}`.
    For MLA attention without Matrix Absorption (``head_dim_qk=192`` and ``head_dim_vo=128``, which is
    used in prefilling self-attention stage), please use
    :class:`flashinfer.prefill.BatchPrefillWithRaggedKVCacheWrapper`.

    More information about The Paged KV-Cache layout in MLA is explained in our tutorial
    :ref:`MLA Page Layout <mla-page-layout>`.

    For more details about the MLA computation, Matrix Absorption and FlashInfer's MLA implementation,
    please refer to our `blog post <http://flashinfer.ai/2025/02/10/flashinfer-deepseek-mla.html>`_.

    Example
    -------
    >>> import torch
    >>> import flashinfer
    >>> num_local_heads = 128
    >>> batch_size = 114
    >>> head_dim_ckv = 512
    >>> head_dim_kpe = 64
    >>> page_size = 1
    >>> mla_wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(
    ...     torch.empty(128 * 1024 * 1024, dtype=torch.int8).to(0),
    ...     backend="fa2"
    ... )
    >>> q_indptr = torch.arange(0, batch_size + 1).to(0).int() # for decode, each query length is 1
    >>> kv_lens = torch.full((batch_size,), 999, dtype=torch.int32).to(0)
    >>> kv_indptr = torch.arange(0, batch_size + 1).to(0).int() * 999
    >>> kv_indices = torch.arange(0, batch_size * 999).to(0).int()
    >>> q_nope = torch.randn(
    ...     batch_size * 1, num_local_heads, head_dim_ckv, dtype=torch.bfloat16, device="cuda"
    ... )
    >>> q_pe = torch.zeros(
    ...     batch_size * 1, num_local_heads, head_dim_kpe, dtype=torch.bfloat16, device="cuda"
    ... )
    >>> ckv = torch.randn(
    ...     batch_size * 999, 1, head_dim_ckv, dtype=torch.bfloat16, device="cuda"
    ... )
    >>> kpe = torch.zeros(
    ...     batch_size * 999, 1, head_dim_kpe, dtype=torch.bfloat16, device="cuda"
    ... )
    >>> sm_scale = 1.0 / ((128 + 64) ** 0.5)  # use head dimension before matrix absorption
    >>> mla_wrapper.plan(
    ...     q_indptr,
    ...     kv_indptr,
    ...     kv_indices,
    ...     kv_lens,
    ...     num_local_heads,
    ...     head_dim_ckv,
    ...     head_dim_kpe,
    ...     page_size,
    ...     False,  # causal
    ...     sm_scale,
    ...     q_nope.dtype,
    ...     ckv.dtype,
    ... )
    >>> o = mla_wrapper.run(q_nope, q_pe, ckv, kpe, return_lse=False)
    >>> o.shape
    torch.Size([114, 128, 512])
    FNautofloat_workspace_bufferuse_cuda_graph	qo_indptr	kv_indptr
kv_indices
kv_len_arrrJ   returnc                    || _         |j        | _        |dk    r	|| _        dS t          j        dt          j        | j                  | _        t          j        | j        j        | j        j        dd          | _	        || _
        || _        || _        || _        || _        |dk    rt          | j                  | _        dS || _        dS )	an  Constructor for BatchMLAPagedAttentionWrapper.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The user reserved workspace buffer used to store intermediate attention results in
            split-k algorithm. The recommended size is 128MB, the device of the workspace buffer
            should be the same as the device of the input tensors.
        use_cuda_graph : bool, optional
            Whether to enable CUDA graph capture for the prefill kernels, if enabled, the
            auxiliary data structures will be stored in provided buffers. The ``batch_size``
            cannot change during the lifecycle of this wrapper when CUDAGraph is enabled.
        qo_indptr_buf : Optional[torch.Tensor]
            The user reserved buffer to store the ``qo_indptr`` array, the size of the buffer
            should be ``[batch_size + 1]``.
            This argument is only effective when ``use_cuda_graph`` is ``True``.
        kv_indptr_buf : Optional[torch.Tensor]
            The user reserved buffer to store the ``kv_indptr`` array, the size of the buffer
            should be ``[batch_size + 1]``.
            This argument is only effective when ``use_cuda_graph`` is ``True``.
        kv_indices_buf : Optional[torch.Tensor]
            The user reserved buffer to store the ``kv_indices`` array.
            This argument is only effective when ``use_cuda_graph`` is ``True``.
        kv_len_arr_buf : Optional[torch.Tensor]
            The user reserved buffer to store the ``kv_len_arr`` array, the size of the buffer
            should be ``[batch_size]``.
            This argument is only effective when ``use_cuda_graph`` is ``True``.
        backend : str
            The implementation backend, could be ``auto``/``fa2`` or ``fa3``. Defaults to ``auto``.
            If set to ``auto``, the function will automatically choose the backend based on the
            device architecture and kernel availability. If ``cutlass`` is provided, the MLA
            kernels will be generated by CUTLASS and only float_workspace_buffer is required and
            other arguments are ignored.
        cutlassN)   dtypedeviceTcpu)r[   
pin_memoryr\   rO   )_float_workspace_bufferr\   _backendtorchemptyuint8_int_workspace_bufferr"   r[    _pin_memory_int_workspace_buffer_use_cuda_graph_qo_indptr_buf_kv_indptr_buf_kv_indices_buf_kv_len_arr_bufr   )selfrP   rQ   rR   rS   rT   rU   rJ   s           r.   __init__z&BatchMLAPagedAttentionWrapper.__init__   s    Z (>$,3i#DMF%*[ek$+&
 &
 &
" 16&,,2	1
 1
 1
-  .''))f1$+>>DMMM#DMMMr0   	num_headshead_dim_ckvhead_dim_kper;   causalsm_scaleq_data_typekv_data_typeuse_profilerc                    t          | j        ||||j        |||          | _        |                    d          }|                    d          }|                    d          }| j        r| j                            |d           | j                            |d           | j	        dt          |                                       |d           | j                            |d           n|                    | j        d          | _        |                    | j        d          | _        |                    | j        d          | _	        |                    | j        d          | _        |	| _        || _        |
| _        || _        | j                            | j        | j        | j        ||||||		  	        | _        dS )a  Plan the MLA attention computation.

        Parameters
        ----------
        qo_indptr : torch.IntTensor
            The indptr of the query/output tensor, shape: ``[batch_size + 1]``.
            For decoding attention, the length of each query is 1, and the content
            of the tensor should be ``[0, 1, 2, ..., batch_size]``.
        kv_indptr : torch.IntTensor
            The indptr of the paged kv-cache, shape: ``[batch_size + 1]``.
        kv_indices : torch.IntTensor
            The page indices of the paged kv-cache, shape: ``[kv_indptr[-1]]`` or larger.
        kv_len_arr : torch.IntTensor
            The query length of each request, shape: ``[batch_size]``.
        num_heads : int
            The number of heads in query/output tensor.
        head_dim_ckv : int
            The head dimension of compressed-kv.
        head_dim_kpe : int
            The head dimension for rope k-cache.
        page_size : int
            The page size of the paged kv-cache.
        causal : bool
            Whether to use causal attention.
        sm_scale : float
            The scale factor for softmax operation.
        q_data_type : torch.dtype
            The data type of the query tensor.
        kv_data_type : torch.dtype
            The data type of the kv-cache tensor.
        use_profiler : bool, optional
            Whether to enable intra-kernel profiler, default is False.
        r]   T)non_blockingN)rL   r`   r[   _cached_moduletorf   rg   copy_rh   ri   lenrj   r\   _causal
_page_size	_sm_scale_use_profilerplanr_   rd   re   
_plan_info)rk   rR   rS   rT   rU   rm   rn   ro   r;   rp   rq   rr   rs   rt   qo_indptr_hostkv_indptr_hostkv_len_arr_hosts                    r.   r   z"BatchMLAPagedAttentionWrapper.plan  s   d 3MO	
 	
 #e,,"e,,$--.. 		Q%%id%CCC%%id%CCC !23z??!2399*SW9XXX &&z&EEEE"+,,t{,"N"ND"+,,t{,"N"ND#-==4=#P#PD #-==4=#P#PD #!)-22(&1

 

r0   q_nopeq_pe	ckv_cache	kpe_cacheoutlse
return_lseprofiler_bufferr%   r&   return_lse_base_on_ec                     d S r@   rG   rk   r   r   r   r   r   r   r   r   r%   r&   r   s               r.   runz!BatchMLAPagedAttentionWrapper.runr  s	     sr0   Tc                     d S r@   rG   r   s               r.   r   z!BatchMLAPagedAttentionWrapper.run  s	     -0Cr0   c                    | j         dk    r|rt          d          |t          d          t                      | _        |t	          j        |          }n"t          ||j        |j        |j	        d           t	          j
        ||gd          }t	          j
        ||gd          }t          |||	|
           t	          j        dt          j        | j	        	          }| j                            | j        |||||	|
           |S || j        rt          d
          |j        d         }| j        }| j        }| j        }|rt(          j        j        nt(          j        j        }| j	        }|t	          j        |          }n"t          ||j        |j        |j	        d           |r`|/t	          j        |j        dd         t          j        |	          }n/t          ||j        dd         t          j        |j	        d           | j        r|fnd} | j        j        | j        | j        | j        ||||| j        |||||||g|R   |r||fn|S )a  Run the MLA attention computation.

        Parameters
        ----------
        q_nope : torch.Tensor
            The query tensor without rope, shape: ``[batch_size, num_heads, head_dim_ckv]``.
        q_pe : torch.Tensor
            The rope part of the query tensor, shape: ``[batch_size, num_heads, head_dim_kpe]``.
        ckv_cache : torch.Tensor
            The compressed kv-cache tensor (without rope), shape: ``[num_pages, page_size, head_dim_ckv]``.
            ``head_dim_ckv`` is 512 in DeepSeek v2/v3 models.
        kpe_cache : torch.Tensor
            The rope part of the kv-cache tensor, shape: ``[num_pages, page_size, head_dim_kpe]``.
            ``head_dim_kpe`` is 64 in DeepSeek v2/v3 models.
        out : Optional[torch.Tensor]
            The output tensor, if not provided, will be allocated internally.
        lse : Optional[torch.Tensor]
            The log-sum-exp of attention logits, if not provided, will be allocated internally.
        return_lse : bool, optional
            Whether to return the log-sum-exp value, default is False.
        profiler_buffer : Optional[torch.Tensor]
            The buffer to store the profiler data.
        kv_len : Optional[torch.Tensor]
            The query length of each request, shape: ``[batch_size]``. Required when ``backend`` is ``cutlass``.
        page_table : Optional[torch.Tensor]
            The page table of the paged kv-cache, shape: ``[batch_size, num_pages]``. Required when ``backend`` is ``cutlass``.
        rX   z4return_lse does not support cutlass backend for now.Nz9profiler_buffer does not support cutlass backend for now.r   )dimr   rZ   z5Profiler is enabled, profiler_buffer must be providedr	   r   r   rG   )r`   r!   rH   rw   ra   
empty_liker   r"   r[   r\   catr/   rb   float32cutlass_mla_paged_attentionr_   r~   r|   r}   r{   r   CAUSALvalue
NON_CAUSALr   rd   r   ri   )rk   r   r   r   r   r   r   r   r   r%   r&   r   r#   r$   rm   r;   rq   rp   	mask_moder\   profiler_argss                        r.   r   z!BatchMLAPagedAttentionWrapper.run  s   T =I%% Y !WXXX* O   #1"2"2D{&v..(v|V]E   	64.b999I!Iy)&<"EEEM M6:NNN+au}T[IIIC;;,   J"!  K   LO	O	>-3RHO))9L9R	;"6**CC$V\6<    	{k&,rr"2%-PVWWW(bqb)5=&-   /3.@H**b(&O  	
  !	
 	
 	
 	
& (0SzzS0r0   )FNNNNrO   )F)NNFNNNF)NNTNNNF)__name__
__module____qualname____doc__r
   ra   Tensorboolr   strrl   intfloatr[   r   r   r   r   r   r   rG   r0   r.   rN   rN      s       ? ?B   %,0,0-1-1D$ D$ %D$ D$ EL)	D$
 EL)D$ U\*D$ U\*D$ D$ 
D$ D$ D$ ^D$L  #X
 X
<X
 <X
 L	X

 LX
 X
 X
 X
 X
 X
 X
 [X
 kX
 X
 
X
 X
 X
 ^X
t  '+&*%*26)--1%*  l <	
 < el# el# EN "%,/ & U\* # 
   X  '+&*$(26)--1%*0 00 l0 <	0
 <0 el#0 el#0 DM0 "%,/0 &0 U\*0 #0 
u|U\)	*0 0 0 X0  '+&* 26)--1%*s1 s1s1 ls1 <	s1
 <s1 el#s1 el#s1 s1 "%,/s1 &s1 U\*s1 #s1 
u|U5<#=>>	?s1 s1 s1 ^s1 s1 s1r0   rN         ?rO   r5   r6   workspace_bufferr7   r8   r9   block_tablesseq_lensmax_seq_lenr:   r   
bmm1_scale
bmm2_scalesinks
enable_pdlrJ   rV   c                    |dk    r"t          | j                  d         dk    rdnd}t          |t          j                  r!|j        t          j        k    sJ |t          z  }t          |t          j                  r|j        t          j        k    sJ |dk    rt          | j                  d         dk    s*| j        t          j        k    s|j        t          j        k    rt          d| j         d|j                   |t          d
          | 
                    d          dk    r%t          d| 
                    d                     t          | |||||||||
||||          S |dk    r|t          | j                  n|}t                      j        }t          | j                  }|
                    d          }|dk    r|dk    rt          d|           t!          | |||||	||          }|
:| j        d	d         |fz   }t          j        |t          j        | j                  }
n1| j        \  }}}}t)          |
|||gt          j        | j        d           | 
                    d          }| 
                    d          }|                     dd          }  ||
d	| |||||||||ddd|d|	|||                                |                                z  |d	           |
S t          d| d          )a
  
    Parameters
    ----------
    query: [batch_size, q_len_per_request, num_heads, head_dim_qk], head_dim_qk = qk_nope_head_dim (kv_lora_rank) + qk_rope_head_dim, should be concated q_nope + q_rope; q_len_per_request is the MTP query length.
    kv_cache: [num_pages, page_size, head_dim_ckv + head_dim_kpe] or [num_pages, 1, page_size, head_dim_ckv + head_dim_kpe], should be concated ckv_cache + kpe_cache. Both 3D and 4D formats are supported for backward compatibility.
    workspace_buffer: [num_semaphores, 4], used for multi_block mode. Must be initialized to 0 for its first use.
    qk_nope_head_dim: qk_nope_head_dim, must be 128
    kv_lora_rank: kv_lora_rank, must be 512
    qk_rope_head_dim: qk_rope_head_dim, must be 64
    sparse_mla_top_k: sparse MLA top k, must be 0 for non-sparse MLA.
    block_tables: page_table of kv cache, [batch_size, num_pages]
    seq_lens: query_len
    max_seq_len: max sequence length for kv_cache
    out: output tensor, if not provided, will be allocated internally
    bmm1_scale: fused scale for mla bmm1 input.
        when using trtllm-gen backend, it can be a torch.Tensor with dtype torch.float32.
    bmm2_scale: fused scale for mla bmm2 input.
        when using trtllm-gen backend, it can be a torch.Tensor with dtype torch.float32.
    sinks: additional value per head in the denominator of the softmax.
    backend : str = "auto"
        The implementation backend, could be ``auto``/``xqa`` or ``trtllm-gen``. Defaults to ``auto``.
        When set to ``auto``, the backend will be chosen based on the device architecture and kernel availability.
        For sm_100 and sm_103 (blackwell architecture), ``auto`` will choose ``trtllm-gen`` backend.
        For sm_120 (blackwell architecture), ``auto`` will choose ``xqa`` backend.

    Note
    ----
    In MLA, the actual BMM1 and BMM2 scales applied would be fused as:
    bmm1_scale = q_scale * k_scale * sm_scale / (head_dim_qk ** 0.5)
    bmm2_scale = v_scale * o_scale
    or,
    bmm1_scale = torch.Tensor([q_scale * k_scale * sm_scale / (head_dim_qk ** 0.5))
    bmm2_scale = torch.Tensor([v_scale * o_scale])

    The two scale factors should be static constant for cuda graph capture.
    Either (bmm1_scale, bmm2_scale) or (bmm1_scale_log2_tensor, bmm2_scale_tensor) should be provided.

    For static constant scale factors, the scale factors should be provided as float.
        - (bmm1_scale, bmm2_scale)
    For on-device fused scale tensors, which could dynamically change, the scale factors should be provided as torch.Tensor.
        - (bmm1_scale_log2_tensor, bmm2_scale_tensor)
        - Currently, only fp8 tensor core operation supports this mode.
    When both are provided, the dynamic scale factor tensors will be used.
    rO   r   
   z
trtllm-genxqa   z7XQA MLA only supports fp8 operation on SM120 GPUs, got r   NXQA MLA does not support sinksr	   2XQA MLA only supports q_len_per_request == 1, got     r3   z(Supported block_size are 32 and 64, got r   rZ   r   zBackend z not supported)r   r\   
isinstancera   r   r[   r   r   float8_e4m3fnr!   size"xqa_batch_decode_with_kv_cache_mlar   rE   trtllm_paged_attention_decoder   r>   r"   rb   bfloat16r   flattennumelelement_size)r5   r6   r   r7   r8   r9   r   r   r   r:   r   r   r   r   r   rJ   run_funcsm_countr-   	out_shape
batch_size_num_q_heads	max_q_lens                           r.   %trtllm_batch_decode_with_kv_cache_mlar   	  s   ~ &25<@@CrIILLu 	 *el++ (5=0000%'
*el++ 15=0000%"5<003r99{e111~!444l%+ll\d\jll   =>>>::a==ATUZZPQ]]TT   2
 
 	
  
L	 	 0:0Bu|,,,
 	 .//M&u|44 ]]2&&
"r!1!1T
TTUUU /	
 	
 ;CRC(L?:I+iu~elSSSCC,1K)J;$[,7   ZZ]]
JJqMM	a##""$$'7'D'D'F'FF/	
 	
 	
4 
;G;;;<<<r0   c                    |t          | j                  n|}t          | j                  }|                    d          }|                     d          }|dk    rt	          d|           | j        t          j        k    s|j        t          j        k    rt	          d| j         d|j                   |t	          d          t          | ||||d||          }|	:| j	        dd	         |fz   }t          j
        |t          j        | j        
          }	n1| j	        \  }}}}t          |	|||gt          j        | j        d           |                    t          j                  }|dd         }|dd         }|                    d                              d          }|                    d          }t#          | |||||	||||
|||           |	S )a  
    Parameters:
    query: [batch_size, q_len_per_request, num_heads, head_dim_qk], head_dim_qk = qk_nope_head_dim (kv_lora_rank) + qk_rope_head_dim, should be concated q_nope + q_rope; q_len_per_request is the MTP query length.
    kv_cache: [num_pages, page_size, head_dim_ckv + head_dim_kpe] or [num_pages, 1, page_size, head_dim_ckv + head_dim_kpe], should be concated ckv_cache + kpe_cache. Both 3D and 4D formats are supported for backward compatibility.
    workspace_buffer: torch.Tensor. Must be initialized to 0 for its first use.
    qk_nope_head_dim: qk_nope_head_dim, must be 128
    kv_lora_rank: kv_lora_rank, must be 512
    qk_rope_head_dim: qk_rope_head_dim, must be 64
    block_tables: page_table of kv cache, [batch_size, num_pages]
    seq_lens: query_len
    max_seq_len: max sequence length for kv_cache
    out: output tensor, if not provided, will be allocated internally
    bmm1_scale: fused scale for mla bmm1 input. Can be a float or a torch.Tensor.
    bmm2_scale: fused scale for mla bmm2 input. Can be a float or a torch.Tensor.
    sinks: additional value per head in the denominator of the softmax.

    Note:
    In MLA, the actual BMM1 and BMM2 scales applied would be fused as:
    bmm1_scale = q_scale * k_scale * sm_scale / (head_dim_qk ** 0.5)
    bmm2_scale = v_scale * o_scale

    The two scale factors should be static constant for cuda graph capture.
    Either (bmm1_scale, bmm2_scale) or (bmm1_scale_log2_tensor, bmm2_scale_tensor) should be provided.

    For static constant scale factors, the scale factors should be provided as float.
        - (bmm1_scale, bmm2_scale)
    For on-device fused scale tensors, which could dynamically change, the scale factors should be provided as torch.Tensor.
        - (bmm1_scale_log2_tensor, bmm2_scale_tensor)
        - Currently, only fp8 tensor core operation supports this mode.
    When both are provided, the dynamic scale factor tensors will be used.
    Nr   r	   r   z5XQA MLA only supports fp8 tensor core operation, got r   r   r   r   rZ   r   rY   r   )q_scalekv_scaler   r   )r   r\   r   r   r!   r[   ra   r   r>   r"   rb   r   r   viewrc   squeezer4   r   )r5   r6   r   r7   r8   r9   r   r   r   r   r   r   r   r   r   r-   q_len_per_requestr   r   r   r   workspace_u8	semaphorescratchkv_cache_newseq_lens_news                             r.   r   r     s-   ` 6@5G#EL111ZJ"5<00H r""J

1ATARTT
 
 	
 {e)))X^u?R-R-RfEKffV^Vdff
 
 	
 9::: +		 	H {K$6	k)5>%,OOO(-%
A{A l3NL	
 	
 	
 $((55L../I?,,-G##A&&0033L%%a((L     Jr0   )r   Nr   r   NNrO   )Nr   r   NN)*r   	functoolstypingr   r   r   r   r   r   ra   api_loggingr
   jitr   r   r   jit.mlar   utilsr   r   r   r   r   r   r   r   r   r/   r>   cacherE   rH   rL   rN   r   r   r   r   r   r   r   rG   r0   r.   <module>r      s         B B B B B B B B B B B B B B B B  ' ' ' ' ' ' U U U U U U U U U U # # # # # #                       
 
 
:7 7 7t    - - - A A Ax1 x1 x1 x1 x1 x1 x1 x1v  "&-0-0*.!m= m=<m=lm= lm= 	m=
 m= m= ,m= lm= m= m= 
%,	m= eU\)*m= eU\)*m= D&'m= m=  !m=" \#m= m= m= m=`  #'-0-0*.p p<plp lp 	p
 p p ,p lp p 
%,	p eU\)*p eU\)*p D&'p p \p p p p p pr0   