§
    )`ƒiê	  ã            	       óŒ   — d Z ddlZddlZddlmZ ej        d„ ¦   «         Zedej        dej        dej        d	dfd
„¦   «         ZdS )a3  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
é    Né   )Úflashinfer_apic                  óF   — ddl m}   | ¦   «                              ¦   «         S )Nr   ©Úgen_concat_mla_module)Újit.dsv3_optimizationsr   Úbuild_and_loadr   s    úi/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/concat_ops.pyÚget_concat_mla_moduler      s/   € à=Ð=Ð=Ð=Ð=Ð=à Ð Ñ"Ô"×1Ò1Ñ3Ô3Ð3ó    ÚkÚk_nopeÚk_ropeÚreturnc                 óL   — t          ¦   «                              | ||¦  «         dS )a  Concatenate k_nope and k_rope tensors for MLA attention.



    This function efficiently concatenates:
      - k_nope: per-head nope values
      - k_rope: shared rope values (broadcast to all heads)

    Key optimizations:
      - Warp-based processing with software pipelining
      - Vectorized memory access (int2 for nope, int for rope)
      - L2 prefetching for next row while processing current
      - Register reuse for rope values across all heads in a chunk

    Parameters
    ----------
    k : torch.Tensor
        Output tensor, shape: ``[num_tokens, num_heads, nope_dim + rope_dim]``.
        Modified in-place.
    k_nope : torch.Tensor
        The nope part of k, shape: ``[num_tokens, num_heads, nope_dim]``.
    k_rope : torch.Tensor
        The rope part of k (shared), shape: ``[num_tokens, 1, rope_dim]``.
        This is broadcast to all heads.

    Example
    -------
    >>> import torch
    >>> import flashinfer
    >>> num_tokens = 2048
    >>> num_heads = 128
    >>> nope_dim = 128
    >>> rope_dim = 64
    >>> k = torch.empty(num_tokens, num_heads, nope_dim + rope_dim, dtype=torch.bfloat16, device="cuda")
    >>> k_nope = torch.randn(num_tokens, num_heads, nope_dim, dtype=torch.bfloat16, device="cuda")
    >>> k_rope = torch.randn(num_tokens, 1, rope_dim, dtype=torch.bfloat16, device="cuda")
    >>> flashinfer.concat_ops.concat_mla_k(k, k_nope, k_rope)

    Note
    ----
    This kernel is specifically optimized for:
    - ``num_heads = 128``
    - ``nope_dim = 128``
    - ``rope_dim = 64``
    N)r   Úconcat_mla_k)r   r   r   s      r
   r   r      s)   € õf ÑÔ×(Ò(¨¨F°FÑ;Ô;Ð;Ð;Ð;r   )	Ú__doc__Ú	functoolsÚtorchÚapi_loggingr   Úcacher   ÚTensorr   © r   r
   ú<module>r      s©   ððð ð  Ð Ð Ð à €€€à 'Ð 'Ð 'Ð 'Ð 'Ð 'ð „ð4ð 4ñ „ð4ð ð2<Ø„|ð2<àŒLð2<ð ŒLð2<ð 
ð	2<ð 2<ð 2<ñ „ð2<ð 2<ð 2<r   