"""
Copyright (c) 2024 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)OptionalTuple   )flashinfer_api)gen_rope_module)register_custom_opregister_fake_opc                  B    t                                                      S N)r   build_and_load     c/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/rope.pyget_rope_moduler      s    ++---r   zflashinfer::apply_rope)q_ropek_rope)mutates_argsqkr   r   indptroffsets
rotary_dim
interleave
rope_scale
rope_thetareturnc
                 Z    t                                          | |||||||||	
  
         d S r   )r   
apply_rope
r   r   r   r   r   r   r   r   r   r   s
             r   _apply_roper        sJ       		    r   c
                     d S r   r   r   s
             r   _fake_apply_roper"   ;   s	     	Dr   zflashinfer::apply_llama31_ropelow_freq_factorhigh_freq_factorold_context_lenc                 `    t                                          | |||||||||	|
||           d S r   )r   apply_llama31_roper   r   r   r   r   r   r   r   r   r   r#   r$   r%   s                r   _apply_llama31_roper)   K   sS      ((		    r   c                     d S r   r   r(   s                r   _fake_apply_llama31_roper+   l   s	      	Dr   zflashinfer::apply_rope_pos_idspos_idsc	                 X    t                                          | ||||||||	  	         d S r   )r   apply_rope_pos_ids	r   r   r   r   r,   r   r   r   r   s	            r   _apply_rope_pos_idsr0      sG     ((		
 
 
 
 
r   c	                     d S r   r   r/   s	            r   _fake_apply_rope_pos_idsr2      s	     	Dr   zflashinfer::rope_quantize)
q_rope_out
k_rope_out
q_nope_out
k_nope_out	q_rope_in	k_rope_in	q_nope_in	k_nope_incos_sin_cacher3   r4   r5   r6   quant_scale_qquant_scale_kv
enable_pdlc                 b    t                                          | |||||||	|||
|||           dS )zCustom operator that routes to the CUDA kernel implementation.

    Converts is_neox parameter to interleave format and dispatches to the underlying
    CUDA kernel via the JIT-compiled module.
    N)r   rope_quantizer7   r8   r9   r:   r;   r,   r3   r4   r5   r6   r<   r=   r   r>   s                 r   _rope_quantizerB      sV    2 ##    r   c                     d S r   r   rA   s                 r   _fake_rope_quantizerD      s	    " 	Dr   z/flashinfer::rope_quantize_append_paged_kv_cache)r3   r5   k_cachev_cache	ckv_cache	kpe_cachev_inrE   rF   rG   rH   
kv_indices	kv_indptrbatch_indices	positionskv_layout_code	page_sizec                 t    t                                          | |||||||||	|
||||||||||||           dS )a%  Custom operator that routes to the CUDA kernel implementation.

    Fuses RoPE application, FP8 quantization, and paged KV cache append into a single kernel.

    Converts is_neox parameter to interleave format and dispatches to the underlying
    CUDA kernel via the JIT-compiled module.
    N)r   #rope_quantize_append_paged_kv_cacher7   r8   r9   r:   rI   r3   r5   r;   r,   rE   rF   rG   rH   rJ   rK   rL   rM   rN   rO   r<   r=   r   r>   s                          r   (_rope_quantize_fp8_append_paged_kv_cacherS      sr    V 99/    r   c                     d S r   r   rR   s                          r   -_fake_rope_quantize_fp8_append_paged_kv_cacherU   ,  s	    4 	Dr   z,flashinfer::apply_rope_pos_ids_cos_sin_cachec           	      T    t                                          | ||||||           d S r   )r    apply_rope_pos_ids_cos_sin_cacher   r   r   r   r;   r,   r   s          r   !_apply_rope_pos_ids_cos_sin_cacherY   I  sA     66		    r   	cos_cache	sin_cachec                     d S r   r   )r   r   r   r   rZ   r[   r,   r   s           r   &_fake_apply_rope_pos_ids_cos_sin_cacher]   `  s	     	Dr   z&flashinfer::apply_llama31_rope_pos_idsc                 ^    t                                          | |||||||||	|
|           d S r   )r   apply_llama31_rope_pos_idsr   r   r   r   r,   r   r   r   r   r#   r$   r%   s               r   _apply_llama31_rope_pos_idsra   n  sP    " 00		    r   c                     d S r   r   r`   s               r    _fake_apply_llama31_rope_pos_idsrc     s	     	Dr   F     @c                 d    ||                      d          }t          | || |||||||
  
         dS )a  Apply rotary embedding to a batch of queries/keys (stored as RaggedTensor) inplace.
    cos/sin values are computed on the fly inside the kernel.

    We use :attr:`indptr` to denote the start pointer of each segment in the batch: the query
    of the i-th segment is ``q[indptr[i]:indptr[i+1]]`` and the key of the
    i-th segment is ``k[indptr[i]:indptr[i+1]]``, the first element of :attr:`indptr` is always
    0 and the last element of :attr:`indptr` is the total number of queries/keys in the batch.
    Please see :ref:`Ragged Tensor tutorial <kv-layout>` for more details about the
    ragged tensor.

    Parameters
    ----------
    q : torch.Tensor
        Query ragged tensor, shape: ``(nnz, num_q_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    k : torch.Tensor
        Key ragged tensor, shape: ``(nnz, num_k_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    indptr : torch.Tensor
        Indptr tensor, shape: ``(batch_size + 1)``.
    offsets : torch.Tensor
        The relative position offsets of each query in the batch, shape: ``(batch_size)``.
    rotary_dim : Optional[int]
        The dimensions to apply RoPE, if ``None``, we apply RoPE to the entire head dimension,
        otherwise, we apply RoPE to the first ``rotary_dim`` dimensions, default: ``None``.
    interleave : bool
        Whether to use interleaved layout in the last dimension, default: ``False``.

        * If ``True``, the last dimension of the query/key tensor is interleaved, i.e.,
          we rotate the even dimensions ``([..., ::2])`` and odd dimensions ``([..., 1::2])``.

        * If ``False``, the last dimension of the query/key tensor is not interleaved, i.e.,
          we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half
          dimensions ``([..., head_dim//2:])``.

    rope_scale : float
        The scaling factor used in the rope embedding, default: ``1``.
    rope_theta : float
        The theta value used in the rope embedding, default: ``1e4``.

    Examples
    --------
    >>> import torch
    >>> import flashinfer
    >>> batch_size = 128
    >>> qkv_len = 1024
    >>> num_qo_heads = 32
    >>> num_kv_heads = 32
    >>> head_dim = 128
    >>> nnz = batch_size * qkv_len
    >>> qkv_packed = torch.randn(
    >>>    nnz,
    >>>    (num_qo_heads + 2 * num_kv_heads) * head_dim,
    >>>    dtype=torch.float16,
    >>>    device="cuda:0",
    >>> )
    >>> q = qkv_packed[:, : num_qo_heads * head_dim].reshape(nnz, num_qo_heads, head_dim)
    >>> k = qkv_packed[
    ...    :, num_qo_heads * head_dim : (num_qo_heads + num_kv_heads) * head_dim
    ... ].reshape(nnz, num_kv_heads, head_dim)
    >>> indptr = torch.tensor(
    ...    [i * qkv_len for i in range(batch_size + 1)], dtype=torch.int32, device="cuda:0"
    >>> )
    >>> offsets = torch.full((batch_size,), 10, dtype=torch.int32, device="cuda:0")
    >>> flashinfer.apply_rope_inplace(q, k, indptr, offsets)

    See Also
    --------
    apply_rope
    """
    if rotary_dim is None:
        rotary_dim = q.size(-1)
    _apply_rope(
        q, k, q, k, indptr, offsets, rotary_dim, interleave, rope_scale, rope_theta
    )


@flashinfer_api
def apply_rope_pos_ids_inplace(
    q: torch.Tensor,
    k: torch.Tensor,
    pos_ids: torch.Tensor,
    rotary_dim: Optional[int] = None,
    interleave: bool = False,
    rope_scale: float = 1,
    rope_theta: float = 1e4,
) -> None:
    """Apply rotary embedding to a batch of queries/keys (stored as RaggedTensor) inplace.
    cos/sin values are computed on the fly inside the kernel.

    We use :attr:`indptr` to denote the start pointer of each segment in the batch: the query
    of the i-th segment is ``q[indptr[i]:indptr[i+1]]`` and the key of the
    i-th segment is ``k[indptr[i]:indptr[i+1]]``, the first element of :attr:`indptr` is always
    0 and the last element of :attr:`indptr` is the total number of queries/keys in the batch.
    Please see :ref:`Ragged Tensor tutorial <kv-layout>` for more details about the
    ragged tensor.

    Parameters
    ----------
    q : torch.Tensor
        Query ragged tensor, shape: ``(nnz, num_q_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    k : torch.Tensor
        Key ragged tensor, shape: ``(nnz, num_k_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    pos_ids : torch.Tensor
        Position indices, shape: ``(nnz)``.
    rotary_dim : Optional[int]
        The dimensions to apply RoPE, if ``None``, we apply RoPE to the entire head dimension,
        otherwise, we apply RoPE to the first ``rotary_dim`` dimensions, default: ``None``.
    interleave : bool
        Whether to use interleaved layout in the last dimension, default: ``False``.

        * If ``True``, the last dimension of the query/key tensor is interleaved, i.e.,
          we rotate the even dimensions ``([..., ::2])`` and odd dimensions ``([..., 1::2])``.

        * If ``False``, the last dimension of the query/key tensor is not interleaved, i.e.,
          we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half
          dimensions ``([..., head_dim//2:])``.

    rope_scale : float
        The scaling factor used in the rope embedding, default: ``1``.
    rope_theta : float
        The theta value used in the rope embedding, default: ``1e4``.
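
    Examples
    --------
    A minimal usage sketch; shapes are illustrative and a CUDA device is assumed:

    >>> import torch
    >>> import flashinfer
    >>> nnz = 2048
    >>> num_qo_heads, num_kv_heads, head_dim = 32, 8, 128
    >>> q = torch.randn(nnz, num_qo_heads, head_dim, dtype=torch.float16, device="cuda:0")
    >>> k = torch.randn(nnz, num_kv_heads, head_dim, dtype=torch.float16, device="cuda:0")
    >>> pos_ids = torch.arange(nnz, dtype=torch.int32, device="cuda:0")
    >>> flashinfer.apply_rope_pos_ids_inplace(q, k, pos_ids)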

    See Also
    --------
    apply_rope_pos_ids
    """
    if rotary_dim is None:
        rotary_dim = q.size(-1)
    _apply_rope_pos_ids(
        q, k, q, k, pos_ids, rotary_dim, interleave, rope_scale, rope_theta
    )


@flashinfer_api
def apply_llama31_rope_inplace(
    q: torch.Tensor,
    k: torch.Tensor,
    indptr: torch.Tensor,
    offsets: torch.Tensor,
    rotary_dim: Optional[int] = None,
    interleave: bool = False,
    rope_scale: float = 8,
    rope_theta: float = 5e5,
    low_freq_factor: float = 1,
    high_freq_factor: float = 4,
    old_context_len: int = 8192,
) -> None:
    """Apply Llama 3.1 style rotary embedding to a batch of queries/keys (stored as
    RaggedTensor) inplace. cos/sin values are computed on the fly inside the kernel.

    We use :attr:`indptr` to denote the start pointer of each segment in the batch: the query
    of the i-th segment is ``q[indptr[i]:indptr[i+1]]`` and the key of the
    i-th segment is ``k[indptr[i]:indptr[i+1]]``, the first element of :attr:`indptr` is always
    0 and the last element of :attr:`indptr` is the total number of queries/keys in the batch.
    Please see :ref:`Ragged Tensor tutorial <kv-layout>` for more details about the
    ragged tensor.

    Parameters
    ----------
    q : torch.Tensor
        Query ragged tensor, shape: ``(nnz, num_q_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    k : torch.Tensor
        Key ragged tensor, shape: ``(nnz, num_k_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    indptr : torch.Tensor
        Indptr tensor, shape: ``(batch_size + 1)``.
    offsets : torch.Tensor
        The relative position offsets of each query in the batch, shape: ``(batch_size)``.
    rotary_dim : Optional[int]
        The dimensions to apply RoPE, if ``None``, we apply RoPE to the entire head dimension,
        otherwise, we apply RoPE to the first ``rotary_dim`` dimensions, default: ``None``.
    interleave : bool
        Whether to use interleaved layout in the last dimension, default: ``False``.

        * If ``True``, the last dimension of the query/key tensor is interleaved, i.e.,
          we rotate the even dimensions ``([..., ::2])`` and odd dimensions ``([..., 1::2])``.

        * If ``False``, the last dimension of the query/key tensor is not interleaved, i.e.,
          we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half
          dimensions ``([..., head_dim//2:])``.

    rope_scale : float
        The scaling factor used in the rope embedding, default: ``8``.
    rope_theta : float
        The theta value used in the rope embedding, default: ``5e5``.
    low_freq_factor : float
        The low frequency factor used in Llama 3.1 RoPE, default: ``1``.
    high_freq_factor : float
        The high frequency factor used in Llama 3.1 RoPE, default: ``4``.
    old_context_len : int
        The old context length used in Llama 3.1 RoPE, default: ``8192``.

    Examples
    --------
    >>> import torch
    >>> import flashinfer
    >>> batch_size = 128
    >>> qkv_len = 1024
    >>> num_qo_heads = 32
    >>> num_kv_heads = 32
    >>> head_dim = 128
    >>> nnz = batch_size * qkv_len
    >>> qkv_packed = torch.randn(
    >>>    nnz,
    >>>    (num_qo_heads + 2 * num_kv_heads) * head_dim,
    >>>    dtype=torch.float16,
    >>>    device="cuda:0",
    >>> )
    >>> q = qkv_packed[:, : num_qo_heads * head_dim].reshape(nnz, num_qo_heads, head_dim)
    >>> k = qkv_packed[
    ...    :, num_qo_heads * head_dim : (num_qo_heads + num_kv_heads) * head_dim
    ... ].reshape(nnz, num_kv_heads, head_dim)
    >>> indptr = torch.tensor(
    ...    [i * qkv_len for i in range(batch_size + 1)], dtype=torch.int32, device="cuda:0"
    >>> )
    >>> offsets = torch.full((batch_size,), 10, dtype=torch.int32, device="cuda:0")
    >>> flashinfer.apply_llama31_rope_inplace(q, k, indptr, offsets)

    See Also
    --------
    apply_llama31_rope
    """
    if rotary_dim is None:
        rotary_dim = q.size(-1)
    _apply_llama31_rope(
        q, k, q, k, indptr, offsets, rotary_dim, interleave, rope_scale,
        rope_theta, low_freq_factor, high_freq_factor, float(old_context_len),
    )


@flashinfer_api
def apply_llama31_rope_pos_ids_inplace(
    q: torch.Tensor,
    k: torch.Tensor,
    pos_ids: torch.Tensor,
    rotary_dim: Optional[int] = None,
    interleave: bool = False,
    rope_scale: float = 8,
    rope_theta: float = 5e5,
    low_freq_factor: float = 1,
    high_freq_factor: float = 4,
    old_context_len: int = 8192,
) -> None:
    """Apply Llama 3.1 style rotary embedding to a batch of queries/keys (stored as
    RaggedTensor) inplace. cos/sin values are computed on the fly inside the kernel.

    We use :attr:`indptr` to denote the start pointer of each segment in the batch: the query
    of the i-th segment is ``q[indptr[i]:indptr[i+1]]`` and the key of the
    i-th segment is ``k[indptr[i]:indptr[i+1]]``, the first element of :attr:`indptr` is always
    0 and the last element of :attr:`indptr` is the total number of queries/keys in the batch.
    Please see :ref:`Ragged Tensor tutorial <kv-layout>` for more details about the
    ragged tensor.

    Parameters
    ----------
    q : torch.Tensor
        Query ragged tensor, shape: ``(nnz, num_q_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    k : torch.Tensor
        Key ragged tensor, shape: ``(nnz, num_k_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    pos_ids : torch.Tensor
        Position indices, shape: ``(nnz)``.
    rotary_dim : Optional[int]
        The dimensions to apply RoPE, if ``None``, we apply RoPE to the entire head dimension,
        otherwise, we apply RoPE to the first ``rotary_dim`` dimensions, default: ``None``.
    interleave : bool
        Whether to use interleaved layout in the last dimension, default: ``False``.

        * If ``True``, the last dimension of the query/key tensor is interleaved, i.e.,
          we rotate the even dimensions ``([..., ::2])`` and odd dimensions ``([..., 1::2])``.

        * If ``False``, the last dimension of the query/key tensor is not interleaved, i.e.,
          we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half
          dimensions ``([..., head_dim//2:])``.

    rope_scale : float
        The scaling factor used in the rope embedding, default: ``8``.
    rope_theta : float
        The theta value used in the rope embedding, default: ``5e5``.
    low_freq_factor : float
        The low frequency factor used in Llama 3.1 RoPE, default: ``1``.
    high_freq_factor : float
        The high frequency factor used in Llama 3.1 RoPE, default: ``4``.
    old_context_len : int
        The old context length used in Llama 3.1 RoPE, default: ``8192``.
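
    Examples
    --------
    A minimal usage sketch; shapes are illustrative and a CUDA device is assumed:

    >>> import torch
    >>> import flashinfer
    >>> nnz = 2048
    >>> num_qo_heads, num_kv_heads, head_dim = 32, 8, 128
    >>> q = torch.randn(nnz, num_qo_heads, head_dim, dtype=torch.float16, device="cuda:0")
    >>> k = torch.randn(nnz, num_kv_heads, head_dim, dtype=torch.float16, device="cuda:0")
    >>> pos_ids = torch.arange(nnz, dtype=torch.int32, device="cuda:0")
    >>> flashinfer.apply_llama31_rope_pos_ids_inplace(q, k, pos_ids)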

    See Also
    --------
    apply_llama31_rope_pos_ids
    """
    if rotary_dim is None:
        rotary_dim = q.size(-1)
    _apply_llama31_rope_pos_ids(
        q, k, q, k, pos_ids, rotary_dim, interleave, rope_scale, rope_theta,
        low_freq_factor, high_freq_factor, float(old_context_len),
    )


@flashinfer_api
def apply_rope(
    q: torch.Tensor,
    k: torch.Tensor,
    indptr: torch.Tensor,
    offsets: torch.Tensor,
    rotary_dim: Optional[int] = None,
    interleave: bool = False,
    rope_scale: float = 1,
    rope_theta: float = 1e4,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Apply rotary embedding to a batch of queries/keys (stored as RaggedTensor).
    cos/sin values are computed on the fly inside the kernel.

    We use :attr:`indptr` to denote the start pointer of each segment in the batch: the query
    of the i-th segment is ``q[indptr[i]:indptr[i+1]]`` and the key of the
    i-th segment is ``k[indptr[i]:indptr[i+1]]``, the first element of :attr:`indptr` is always
    0 and the last element of :attr:`indptr` is the total number of queries/keys in the batch.
    Please see :ref:`Ragged Tensor tutorial <kv-layout>` for more details about the
    ragged tensor.

    Parameters
    ----------
    q : torch.Tensor
        Query ragged tensor, shape: ``(nnz, num_q_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    k : torch.Tensor
        Key ragged tensor, shape: ``(nnz, num_k_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    indptr : torch.Tensor
        Indptr tensor, shape: ``(batch_size + 1)``.
    offsets : torch.Tensor
        The relative position offsets of each query in the batch, shape: ``(batch_size)``.
    rotary_dim : Optional[int]
        The dimensions to apply RoPE, if ``None``, we apply RoPE to the entire head dimension,
        otherwise, we apply RoPE to the first ``rotary_dim`` dimensions, default: ``None``.
    interleave : bool
        Whether to use interleaved layout in the last dimension, default: ``False``.

        * If ``True``, the last dimension of the query/key tensor is interleaved, i.e.,
          we rotate the even dimensions ``([..., ::2])`` and odd dimensions ``([..., 1::2])``.

        * If ``False``, the last dimension of the query/key tensor is not interleaved, i.e.,
          we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half
          dimensions ``([..., head_dim//2:])``.

    rope_scale : float
        The scaling factor used in the rope embedding, default: ``1``.
    rope_theta : float
        The theta value used in the rope embedding, default: ``1e4``.

    Returns
    -------
    q_rope : torch.Tensor
        The rotated query tensor, shape: ``(nnz, num_q_heads, head_dim)``.
    k_rope : torch.Tensor
        The rotated key tensor, shape: ``(nnz, num_k_heads, head_dim)``.

    Examples
    --------
    >>> import torch
    >>> import flashinfer
    >>> batch_size = 128
    >>> qkv_len = 1024
    >>> num_qo_heads = 32
    >>> num_kv_heads = 32
    >>> head_dim = 128
    >>> nnz = batch_size * qkv_len
    >>> qkv_packed = torch.randn(
    >>>    nnz,
    >>>    (num_qo_heads + 2 * num_kv_heads) * head_dim,
    >>>    dtype=torch.float16,
    >>>    device="cuda:0",
    >>> )
    >>> q = qkv_packed[:, : num_qo_heads * head_dim].reshape(nnz, num_qo_heads, head_dim)
    >>> k = qkv_packed[
    ...    :, num_qo_heads * head_dim : (num_qo_heads + num_kv_heads) * head_dim
    ... ].reshape(nnz, num_kv_heads, head_dim)
    >>> indptr = torch.tensor(
    ...    [i * qkv_len for i in range(batch_size + 1)], dtype=torch.int32, device="cuda:0"
    >>> )
    >>> offsets = torch.full((batch_size,), 10, dtype=torch.int32, device="cuda:0")
    >>> q_rope, k_rope = flashinfer.apply_rope(q, k, indptr, offsets)
    >>> q_rope.shape
    torch.Size([131072, 32, 128])
    >>> k_rope.shape
    torch.Size([131072, 32, 128])

    See Also
    --------
    apply_rope_inplace
    """
    q_rope = torch.empty_like(q)
    k_rope = torch.empty_like(k)
    if rotary_dim is None:
        rotary_dim = q.size(-1)
    _apply_rope(
        q, k, q_rope, k_rope, indptr, offsets, rotary_dim, interleave,
        rope_scale, rope_theta,
    )
    return q_rope, k_rope


@flashinfer_api
def apply_rope_pos_ids(
    q: torch.Tensor,
    k: torch.Tensor,
    pos_ids: torch.Tensor,
    rotary_dim: Optional[int] = None,
    interleave: bool = False,
    rope_scale: float = 1,
    rope_theta: float = 1e4,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Apply rotary embedding to a batch of queries/keys (stored as RaggedTensor).
    cos/sin values are computed on the fly inside the kernel.

    We use :attr:`indptr` to denote the start pointer of each segment in the batch: the query
    of the i-th segment is ``q[indptr[i]:indptr[i+1]]`` and the key of the
    i-th segment is ``k[indptr[i]:indptr[i+1]]``, the first element of :attr:`indptr` is always
    0 and the last element of :attr:`indptr` is the total number of queries/keys in the batch.
    Please see :ref:`Ragged Tensor tutorial <kv-layout>` for more details about the
    ragged tensor.

    Parameters
    ----------
    q : torch.Tensor
        Query ragged tensor, shape: ``(nnz, num_q_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    k : torch.Tensor
        Key ragged tensor, shape: ``(nnz, num_k_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    pos_ids : torch.Tensor
        Position indices, shape: ``(nnz,)``.
    rotary_dim : Optional[int]
        The dimensions to apply RoPE, if ``None``, we apply RoPE to the entire head dimension,
        otherwise, we apply RoPE to the first ``rotary_dim`` dimensions, default: ``None``.
    interleave : bool
        Whether to use interleaved layout in the last dimension, default: ``False``.

        * If ``True``, the last dimension of the query/key tensor is interleaved, i.e.,
          we rotate the even dimensions ``([..., ::2])`` and odd dimensions ``([..., 1::2])``.

        * If ``False``, the last dimension of the query/key tensor is not interleaved, i.e.,
          we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half
          dimensions ``([..., head_dim//2:])``.

    rope_scale : float
        The scaling factor used in the rope embedding, default: ``1``.
    rope_theta : float
        The theta value used in the rope embedding, default: ``1e4``.

    Returns
    -------
    q_rope : torch.Tensor
        The rotated query tensor, shape: ``(nnz, num_q_heads, head_dim)``.
    k_rope : torch.Tensor
        The rotated key tensor, shape: ``(nnz, num_k_heads, head_dim)``.
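
    Examples
    --------
    A minimal usage sketch; shapes are illustrative and a CUDA device is assumed:

    >>> import torch
    >>> import flashinfer
    >>> nnz = 2048
    >>> num_qo_heads, num_kv_heads, head_dim = 32, 8, 128
    >>> q = torch.randn(nnz, num_qo_heads, head_dim, dtype=torch.float16, device="cuda:0")
    >>> k = torch.randn(nnz, num_kv_heads, head_dim, dtype=torch.float16, device="cuda:0")
    >>> pos_ids = torch.arange(nnz, dtype=torch.int32, device="cuda:0")
    >>> q_rope, k_rope = flashinfer.apply_rope_pos_ids(q, k, pos_ids)
    >>> q_rope.shape
    torch.Size([2048, 32, 128])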

    See Also
    --------
    apply_rope_inplace
    """
    q_rope = torch.empty_like(q)
    k_rope = torch.empty_like(k)
    if rotary_dim is None:
        rotary_dim = q.size(-1)
    _apply_rope_pos_ids(
        q, k, q_rope, k_rope, pos_ids, rotary_dim, interleave, rope_scale, rope_theta
    )
    return q_rope, k_rope


@flashinfer_api
def apply_llama31_rope(
    q: torch.Tensor,
    k: torch.Tensor,
    indptr: torch.Tensor,
    offsets: torch.Tensor,
    rotary_dim: Optional[int] = None,
    interleave: bool = False,
    rope_scale: float = 8,
    rope_theta: float = 5e5,
    low_freq_factor: float = 1,
    high_freq_factor: float = 4,
    old_context_len: int = 8192,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Apply Llama 3.1 style rotary embedding to a batch of queries/keys (stored as
    RaggedTensor). cos/sin values are computed on the fly inside the kernel.

    We use :attr:`indptr` to denote the start pointer of each segment in the batch: the query
    of the i-th segment is ``q[indptr[i]:indptr[i+1]]`` and the key of the
    i-th segment is ``k[indptr[i]:indptr[i+1]]``, the first element of :attr:`indptr` is always
    0 and the last element of :attr:`indptr` is the total number of queries/keys in the batch.
    Please see :ref:`Ragged Tensor tutorial <kv-layout>` for more details about the
    ragged tensor.

    Parameters
    ----------
    q : torch.Tensor
        Query ragged tensor, shape: ``(nnz, num_q_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    k : torch.Tensor
        Key ragged tensor, shape: ``(nnz, num_k_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    indptr : torch.Tensor
        Indptr tensor, shape: ``(batch_size + 1)``.
    offsets : torch.Tensor
        The relative position offsets of each query in the batch, shape: ``(batch_size)``.
    rotary_dim : Optional[int]
        The dimensions to apply RoPE, if ``None``, we apply RoPE to the entire head dimension,
        otherwise, we apply RoPE to the first ``rotary_dim`` dimensions, default: ``None``.
    interleave : bool
        Whether to use interleaved layout in the last dimension, default: ``False``.

        * If ``True``, the last dimension of the query/key tensor is interleaved, i.e.,
          we rotate the even dimensions ``([..., ::2])`` and odd dimensions ``([..., 1::2])``.

        * If ``False``, the last dimension of the query/key tensor is not interleaved, i.e.,
          we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half
          dimensions ``([..., head_dim//2:])``.

    rope_scale : float
        The scaling factor used in the rope embedding, default: ``8``.
    rope_theta : float
        The theta value used in the rope embedding, default: ``5e5``.
    low_freq_factor : float
        The low frequency factor used in Llama 3.1 RoPE, default: ``1``.
    high_freq_factor : float
        The high frequency factor used in Llama 3.1 RoPE, default: ``4``.
    old_context_len : int
        The old context length used in Llama 3.1 RoPE, default: ``8192``.

    Returns
    -------
    q_rope : torch.Tensor
        The rotated query tensor, shape: ``(nnz, num_q_heads, head_dim)``.
    k_rope : torch.Tensor
        The rotated key tensor, shape: ``(nnz, num_k_heads, head_dim)``.

    Examples
    --------
    >>> import torch
    >>> import flashinfer
    >>> batch_size = 128
    >>> qkv_len = 1024
    >>> num_qo_heads = 32
    >>> num_kv_heads = 32
    >>> head_dim = 128
    >>> nnz = batch_size * qkv_len
    >>> qkv_packed = torch.randn(
    >>>    nnz,
    >>>    (num_qo_heads + 2 * num_kv_heads) * head_dim,
    >>>    dtype=torch.float16,
    >>>    device="cuda:0",
    >>> )
    >>> q = qkv_packed[:, : num_qo_heads * head_dim].reshape(nnz, num_qo_heads, head_dim)
    >>> k = qkv_packed[
    ...    :, num_qo_heads * head_dim : (num_qo_heads + num_kv_heads) * head_dim
    ... ].reshape(nnz, num_kv_heads, head_dim)
    >>> indptr = torch.tensor(
    ...    [i * qkv_len for i in range(batch_size + 1)], dtype=torch.int32, device="cuda:0"
    >>> )
    >>> offsets = torch.full((batch_size,), 10, dtype=torch.int32, device="cuda:0")
    >>> q_rope, k_rope = flashinfer.apply_llama31_rope(q, k, indptr, offsets)
    >>> q_rope.shape
    torch.Size([131072, 32, 128])
    >>> k_rope.shape
    torch.Size([131072, 32, 128])

    See Also
    --------
    apply_llama31_rope_inplace
    Nrf   )ru   rv   rg   r)   rp   )r   r   r   r   r   r   r   r   r#   r$   r%   r   r   s                r   r'   r'     s    J a  Fa  FVVBZZ
		o   6>r   c
                     t          j        |           }
t          j        |          }||                     d          }t          | ||
||||||||t	          |	                     |
|fS )a	  Apply Llama 3.1 style rotary embedding to a batch of queries/keys (stored as
    RaggedTensor). cos/sin values are computed on the fly inside the kernel.

    We use :attr:`indptr` to denote the start pointer of each segment in the batch: the query
    of the i-th segment is ``q[indptr[i]:indptr[i+1]]`` and the key of the
    i-th segment is ``k[indptr[i]:indptr[i+1]]``, the first element of :attr:`indptr` is always
    0 and the last element of :attr:`indptr` is the total number of queries/keys in the batch.
    Please see :ref:`Ragged Tensor tutorial <kv-layout>` for more details about the
    ragged tensor.

    Parameters
    ----------
    q : torch.Tensor
        Query ragged tensor, shape: ``(nnz, num_q_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    k : torch.Tensor
        Key ragged tensor, shape: ``(nnz, num_k_heads, head_dim)``, where ``nnz`` is the last
        element of ``indptr``.
    pos_ids : torch.Tensor
        Position indices, shape: ``(nnz)``.
    rotary_dim : Optional[int]
        The dimensions to apply RoPE, if ``None``, we apply RoPE to the entire head dimension,
        otherwise, we apply RoPE to the first ``rotary_dim`` dimensions, default: ``None``.
    interleave : bool
        Whether to use interleaved layout in the last dimension, default: ``False``.

        * If ``True``, the last dimension of the query/key tensor is interleaved, i.e.,
          we rotate the even dimensions ``([..., ::2])`` and odd dimensions ``([..., 1::2])``.

        * If ``False``, the last dimension of the query/key tensor is not interleaved, i.e.,
          we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half
          dimensions ``([..., head_dim//2:])``.

    rope_scale : float
        The scaling factor used in the rope embedding, default: ``8``.
    rope_theta : float
        The theta value used in the rope embedding, default: ``5e5``.
    low_freq_factor : float
        The low frequency factor used in Llama 3.1 RoPE, default: ``1``.
    high_freq_factor : float
        The high frequency factor used in Llama 3.1 RoPE, default: ``4``.
    old_context_len : int
        The old context length used in Llama 3.1 RoPE, default: ``8192``.

    Returns
    -------
    q_rope : torch.Tensor
        The rotated query tensor, shape: ``(nnz, num_q_heads, head_dim)``.
    k_rope : torch.Tensor
        The rotated key tensor, shape: ``(nnz, num_k_heads, head_dim)``.
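
    Examples
    --------
    A minimal usage sketch; shapes are illustrative and a CUDA device is assumed:

    >>> import torch
    >>> import flashinfer
    >>> nnz = 2048
    >>> num_qo_heads, num_kv_heads, head_dim = 32, 8, 128
    >>> q = torch.randn(nnz, num_qo_heads, head_dim, dtype=torch.float16, device="cuda:0")
    >>> k = torch.randn(nnz, num_kv_heads, head_dim, dtype=torch.float16, device="cuda:0")
    >>> pos_ids = torch.arange(nnz, dtype=torch.int32, device="cuda:0")
    >>> q_rope, k_rope = flashinfer.apply_llama31_rope_pos_ids(q, k, pos_ids)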

    See Also
    --------
    apply_llama31_rope_pos_ids_inplace
    """
    q_rope = torch.empty_like(q)
    k_rope = torch.empty_like(k)
    if rotary_dim is None:
        rotary_dim = q.size(-1)
    _apply_llama31_rope_pos_ids(
        q, k, q_rope, k_rope, pos_ids, rotary_dim, interleave, rope_scale,
        rope_theta, low_freq_factor, high_freq_factor, float(old_context_len),
    )
    return q_rope, k_rope


@flashinfer_api
def apply_rope_with_cos_sin_cache(
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    head_size: int,
    cos_sin_cache: torch.Tensor,
    is_neox: bool = True,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary embedding to keys and queries with precomputed cos/sin values.
    This is designed to be compatible with the SGL/vLLM implementation.

    Parameters
    ----------
    positions : torch.Tensor
        Position indices, shape: ``(nnz)``.
    query : torch.Tensor
        Query tensor, shape: ``(nnz, num_q_heads * head_size)``.
    key : torch.Tensor
        Key tensor, shape: ``(nnz, num_k_heads * head_size)``.
    cos_sin_cache : torch.Tensor
        Cosine and Sine cache tensor, shape: ``(max_seq_len, rotary_dim)``.
        Cosine is the first half and Sine is the second half on rotary_dim.
    is_neox : bool
        Whether to use Neox style RoPE, default: ``True``.

        * If ``True``, the last dimension of the query/key tensor is not interleaved, i.e.,
          we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half
          dimensions ``([..., head_dim//2:])``.

        * If ``False``, the last dimension of the query/key tensor is interleaved, i.e.,
          we rotate the even dimensions ``([..., ::2])`` and odd dimensions ``([..., 1::2])``.

    Returns
    -------
    query_out : torch.Tensor
        The rotated query tensor, shape: ``(nnz, num_q_heads * head_size)``.
    key_out : torch.Tensor
        The rotated key tensor, shape: ``(nnz, num_k_heads * head_size)``.

    Note
    ----
    The rotary dimension is determined by the cosine cache and sine cache.
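
    Examples
    --------
    A minimal usage sketch. The cache construction below is one common recipe and is
    illustrative only; a CUDA device is assumed:

    >>> import torch
    >>> import flashinfer
    >>> num_heads, head_size, nnz, max_seq_len = 32, 128, 1024, 4096
    >>> inv_freq = 1.0 / (1e4 ** (torch.arange(0, head_size, 2, dtype=torch.float32, device="cuda:0") / head_size))
    >>> t = torch.arange(max_seq_len, dtype=torch.float32, device="cuda:0")
    >>> freqs = torch.outer(t, inv_freq)
    >>> cos_sin_cache = torch.cat([freqs.cos(), freqs.sin()], dim=-1)
    >>> positions = torch.arange(nnz, dtype=torch.int64, device="cuda:0")
    >>> query = torch.randn(nnz, num_heads * head_size, dtype=torch.float16, device="cuda:0")
    >>> key = torch.randn(nnz, num_heads * head_size, dtype=torch.float16, device="cuda:0")
    >>> q_out, k_out = flashinfer.apply_rope_with_cos_sin_cache(
    ...     positions, query, key, head_size, cos_sin_cache
    ... )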
    """
    if cos_sin_cache.dtype != torch.float32:
        raise ValueError("cos_sin_cache should be float32")

    query_out = torch.empty_like(query)
    key_out = torch.empty_like(key)
    _apply_rope_pos_ids_cos_sin_cache(
        query.view(query.shape[0], -1, head_size),
        key.view(key.shape[0], -1, head_size),
        query_out.view(query_out.shape[0], -1, head_size),
        key_out.view(key_out.shape[0], -1, head_size),
        cos_sin_cache,
        positions,
        (not is_neox),
    )
    return query_out, key_out


@flashinfer_api
def apply_rope_with_cos_sin_cache_inplace(
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    head_size: int,
    cos_sin_cache: torch.Tensor,
    is_neox: bool = True,
) -> None:
    """
    Apply rotary embedding to keys and queries with precomputed cos/sin values.
    This is designed to be compatible with the SGL/vLLM implementation.
    The result is inplace applied to the input tensors.

    Parameters
    ----------
    positions : torch.Tensor
        Position indices, shape: ``(nnz)``.
    query : torch.Tensor
        Query tensor, shape: ``(nnz, num_q_heads * head_size)``.
    key : torch.Tensor
        Key tensor, shape: ``(nnz, num_k_heads * head_size)``.
    cos_sin_cache : torch.Tensor
        Cosine and Sine cache tensor, shape: ``(max_seq_len, rotary_dim)``.
        Cosine is the first half and Sine is the second half on rotary_dim.
    is_neox : bool
        Whether to use Neox style RoPE, default: ``True``.

        * If ``True``, the last dimension of the query/key tensor is not interleaved, i.e.,
          we rorate the first half dimensions ``([..., :head_dim//2])`` and the second half
          dimensions ``([..., head_dim//2:])``.

        * If ``False``, the last dimension of the query/key tensor is interleaved, i.e.,
          we rotate the even dimensions ``([..., ::2])`` and odd dimensions ``([..., 1::2])``.
    Note
    ----
    The rotary dimension is determined by the cosine cache and sine cache.
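
    Examples
    --------
    A minimal usage sketch (see :func:`apply_rope_with_cos_sin_cache` for one way to
    build the cache); shapes are illustrative and a CUDA device is assumed:

    >>> import torch
    >>> import flashinfer
    >>> num_heads, head_size, nnz = 32, 128, 1024
    >>> freqs = torch.rand(4096, head_size // 2, dtype=torch.float32, device="cuda:0")
    >>> cos_sin_cache = torch.cat([freqs.cos(), freqs.sin()], dim=-1)
    >>> positions = torch.arange(nnz, dtype=torch.int64, device="cuda:0")
    >>> query = torch.randn(nnz, num_heads * head_size, dtype=torch.float16, device="cuda:0")
    >>> key = torch.randn(nnz, num_heads * head_size, dtype=torch.float16, device="cuda:0")
    >>> flashinfer.apply_rope_with_cos_sin_cache_inplace(
    ...     positions, query, key, head_size, cos_sin_cache
    ... )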
**U[^R
3
3
((39Q<Y
/
/zz%+a."i88xx	!b)44#K     r         ?q_nopek_nopequantize_dtypec                 <    t          | |||||||||	|
||||          S r   )rope_quantize_fp8)r   r   r   r   r;   r,   r}   r   r<   r=   r3   r4   r5   r6   r>   s                  r   mla_rope_quantize_fp8r     sC    $   r   c                    |j         t          j        k    rt          d          | j        d         }| j        d         }|j        dk    }|rdn|j        d         }|#t          j        ||d| j         | j                  }|H|r#t          j        |d|j         |j                  }n#t          j        ||d|j         |j                  }| |
|||fD ]}|	|j         } nt          j        }|
|
nt          j	        | |          }
||nt          j	        ||          }||nt          j	        ||          }||nt          j	        ||          }t          | ||||||
|||||	| |           |
|||fS )aI  Apply RoPE (Rotary Positional Embeddings) and quantize to FP8 format.

    This function takes pre-split query/key tensors (rotary and non-rotary dimensions separated),
    applies RoPE to the rotary dimension tensors, and quantizes both rotary and non-rotary
    tensors to FP8 format. Supports MLA, GQA, and MHA architectures.

    Parameters
    ----------
    q_rope : torch.Tensor
        Query tensor (rotary dimensions), shape: ``(nnz, num_qo_heads, rope_dim)``.
        Must be float16 or bfloat16.
    k_rope : torch.Tensor
        Key tensor (rotary dimensions). For GQA/MHA: ``(nnz, num_kv_heads, rope_dim)``.
        For MLA: ``(nnz, rope_dim)``. Must be float16 or bfloat16.
    q_nope : Optional[torch.Tensor]
        Query tensor (non-rotary dimensions), shape: ``(nnz, num_qo_heads, no_rope_dim)``.
        If ``None``, treated as zero-dim: a size-0 tensor will be created internally.
    k_nope : Optional[torch.Tensor]
        Key tensor (non-rotary dimensions). For GQA/MHA: ``(nnz, num_kv_heads, no_rope_dim)``.
        For MLA: ``(nnz, no_rope_dim)``. If ``None``, treated as zero-dim and created internally.
    cos_sin_cache : torch.Tensor
        Precomputed cosine and sine values, shape: ``(max_seq_len, rope_dim)``.
        First half contains cosine values, second half contains sine values. Must be float32.
    pos_ids : torch.Tensor
        Position indices for each token, shape: ``(nnz,)``.
    is_neox : bool
        RoPE layout style. If ``True`` (default), use non-interleaved layout (first/second half).
        If ``False``, use interleaved layout (even/odd dimensions).
    quantize_dtype : Optional[torch.dtype]
        Target quantization dtype. If ``None``, inferred from output tensors or defaults to
        ``torch.float8_e4m3fn``. Must be ``torch.float8_e4m3fn`` or ``torch.float8_e5m2``.
    quant_scale_q : float
        Quantization scaling factor for query tensors, default: ``1.0``.
    quant_scale_kv : float
        Quantization scaling factor for key tensors, default: ``1.0``.
    q_rope_out : Optional[torch.Tensor]
        Pre-allocated output tensor for quantized query (rotary). If ``None``, allocated automatically.
    k_rope_out : Optional[torch.Tensor]
        Pre-allocated output tensor for quantized key (rotary). If ``None``, allocated automatically.
    q_nope_out : Optional[torch.Tensor]
        Pre-allocated output tensor for quantized query (non-rotary). If ``None``, allocated automatically.
    k_nope_out : Optional[torch.Tensor]
        Pre-allocated output tensor for quantized key (non-rotary). If ``None``, allocated automatically.
    enable_pdl : bool
        Whether to enable PDL (Programmatic Dependent Launch). Default: ``False``.

    Returns
    -------
    Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
        Quantized tensors: (q_rope_out, k_rope_out, q_nope_out, k_nope_out).
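
    Examples
    --------
    A minimal MLA-style sketch; dimensions are illustrative and a CUDA device with
    FP8 support is assumed:

    >>> import torch
    >>> import flashinfer
    >>> nnz, num_qo_heads, rope_dim, no_rope_dim = 1024, 16, 64, 512
    >>> q_rope = torch.randn(nnz, num_qo_heads, rope_dim, dtype=torch.float16, device="cuda:0")
    >>> k_rope = torch.randn(nnz, rope_dim, dtype=torch.float16, device="cuda:0")
    >>> q_nope = torch.randn(nnz, num_qo_heads, no_rope_dim, dtype=torch.float16, device="cuda:0")
    >>> k_nope = torch.randn(nnz, no_rope_dim, dtype=torch.float16, device="cuda:0")
    >>> freqs = torch.rand(4096, rope_dim // 2, dtype=torch.float32, device="cuda:0")
    >>> cos_sin_cache = torch.cat([freqs.cos(), freqs.sin()], dim=-1)
    >>> pos_ids = torch.arange(nnz, dtype=torch.int32, device="cuda:0")
    >>> q_rope_fp8, k_rope_fp8, q_nope_fp8, k_nope_fp8 = flashinfer.rope_quantize_fp8(
    ...     q_rope, k_rope, q_nope, k_nope, cos_sin_cache, pos_ids
    ... )
    >>> q_rope_fp8.dtype
    torch.float8_e4m3fn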
    """
    if cos_sin_cache.dtype != torch.float32:
        raise ValueError("cos_sin_cache should be float32")

    nnz = q_rope.shape[0]
    num_qo_heads = q_rope.shape[1]
    is_mla = k_rope.ndim == 2
    num_kv_heads = 1 if is_mla else k_rope.shape[1]

    # Missing non-rotary tensors are treated as zero-dim placeholders.
    if q_nope is None:
        q_nope = torch.empty(
            nnz, num_qo_heads, 0, dtype=q_rope.dtype, device=q_rope.device
        )
    if k_nope is None:
        if is_mla:
            k_nope = torch.empty(nnz, 0, dtype=k_rope.dtype, device=k_rope.device)
        else:
            k_nope = torch.empty(
                nnz, num_kv_heads, 0, dtype=k_rope.dtype, device=k_rope.device
            )

    # Infer the quantization dtype from any provided output tensor,
    # falling back to float8_e4m3fn.
    if quantize_dtype is None:
        for out in (q_rope_out, k_rope_out, q_nope_out, k_nope_out):
            if out is not None:
                quantize_dtype = out.dtype
                break
        if quantize_dtype is None:
            quantize_dtype = torch.float8_e4m3fn

    q_rope_out = (
        q_rope_out
        if q_rope_out is not None
        else torch.empty_like(q_rope, dtype=quantize_dtype)
    )
    k_rope_out = (
        k_rope_out
        if k_rope_out is not None
        else torch.empty_like(k_rope, dtype=quantize_dtype)
    )
    q_nope_out = (
        q_nope_out
        if q_nope_out is not None
        else torch.empty_like(q_nope, dtype=quantize_dtype)
    )
    k_nope_out = (
        k_nope_out
        if k_nope_out is not None
        else torch.empty_like(k_nope, dtype=quantize_dtype)
    )

    _rope_quantize(
        q_rope, k_rope, q_nope, k_nope, cos_sin_cache, pos_ids,
        q_rope_out, k_rope_out, q_nope_out, k_nope_out,
        quant_scale_q, quant_scale_kv, (not is_neox), enable_pdl,
    )
    return q_rope_out, k_rope_out, q_nope_out, k_nope_out


@flashinfer_api
def rope_quantize_fp8_append_paged_kv_cache(
    q_rope: torch.Tensor,
    k_rope: torch.Tensor,
    q_nope: Optional[torch.Tensor],
    k_nope: Optional[torch.Tensor],
    v: Optional[torch.Tensor],
    cos_sin_cache: torch.Tensor,
    pos_ids: torch.Tensor,
    paged_kv_cache: Tuple[torch.Tensor, torch.Tensor],
    kv_indices: torch.Tensor,
    kv_indptr: torch.Tensor,
    batch_indices: torch.Tensor,
    positions: torch.Tensor,
    is_neox: bool = True,
    quantize_dtype: Optional[torch.dtype] = None,
    quant_scale_q: float = 1.0,
    quant_scale_kv: float = 1.0,
    page_size: int = 16,
    kv_layout: str = "NHD",
    q_rope_out: Optional[torch.Tensor] = None,
    q_nope_out: Optional[torch.Tensor] = None,
    enable_pdl: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Apply RoPE (Rotary Positional Embeddings), quantize to FP8, and append K/V to paged cache.

    This fused function applies RoPE to query/key (Q/K) rotary dimension tensors, quantizes all Q/K tensors
    (and V for GQA/MHA) to FP8 format, and directly appends the quantized K/V to a paged KV cache.
    It returns quantized Q tensors for use in attention computation. Supports MLA, GQA, and MHA
    architectures with automatic detection based on input tensor shapes.

    Parameters
    ----------
    q_rope : torch.Tensor
        Query tensor (rotary dimensions), shape: ``(nnz, num_qo_heads, rope_dim)``.
        Must be float16 or bfloat16.
    k_rope : torch.Tensor
        Key tensor (rotary dimensions). For GQA/MHA: ``(nnz, num_kv_heads, rope_dim)``.
        For MLA: ``(nnz, rope_dim)``. Must be float16 or bfloat16.
    q_nope : torch.Tensor
        Query tensor (non-rotary dimensions), shape: ``(nnz, num_qo_heads, no_rope_dim)``.
        Must be float16 or bfloat16.
    k_nope : torch.Tensor
        Key tensor (non-rotary dimensions). For GQA/MHA: ``(nnz, num_kv_heads, no_rope_dim)``.
        For MLA: ``(nnz, no_rope_dim)``. Must be float16 or bfloat16.
    v : Optional[torch.Tensor]
        Value tensor for GQA/MHA: ``(nnz, num_kv_heads, head_dim)``. Must be float16 or bfloat16.
        For MLA: pass ``None`` (MLA does not use separate V; K non-RoPE acts as compressed KV).
    cos_sin_cache : torch.Tensor
        Precomputed cosine and sine values, shape: ``(max_seq_len, rope_dim)``.
        First half contains cosine values, second half contains sine values. Must be float32.
    pos_ids : torch.Tensor
        Position indices for each token, shape: ``(nnz,)``.
    paged_kv_cache : Tuple[torch.Tensor, torch.Tensor]
        For MLA: ``(ckv_cache, kpe_cache)`` where:
            - ckv_cache: ``(max_pages, page_size, no_rope_dim)`` in FP8
            - kpe_cache: ``(max_pages, page_size, rope_dim)`` in FP8
        For GQA/MHA: ``(k_cache, v_cache)`` where:
            - k_cache: ``(max_pages, page_size, num_kv_heads, head_dim)`` or
              ``(max_pages, num_kv_heads, page_size, head_dim)`` depending on layout, in FP8
            - v_cache: same shape as k_cache, in FP8
    kv_indices : torch.Tensor
        Page indices mapping, shape: ``(total_pages,)``. Typically ``torch.arange(total_pages)``.
    kv_indptr : torch.Tensor
        Page indptr array for each request, shape: ``(batch_size + 1,)``.
        ``kv_indptr[i]`` is the starting page index for request ``i``.
    batch_indices : torch.Tensor
        Batch index for each token, shape: ``(nnz,)``. Maps each token to its request.
    positions : torch.Tensor
        Position within each request's sequence for each token, shape: ``(nnz,)``.
    is_neox : bool
        RoPE layout style. If ``True`` (default), use non-interleaved layout (first/second half).
        If ``False``, use interleaved layout (even/odd dimensions).
    quantize_dtype : Optional[torch.dtype]
        Target quantization dtype. If ``None``, inferred from output tensors or defaults to
        ``torch.float8_e4m3fn``. Must be ``torch.float8_e4m3fn`` or ``torch.float8_e5m2``.
    quant_scale_q : float
        Quantization scaling factor for query tensors, default: ``1.0``.
    quant_scale_kv : float
        Quantization scaling factor for key/value tensors, default: ``1.0``.
    page_size : int
        Number of entries per page in the paged cache, default: ``16``.
    kv_layout : str
        Cache memory layout for GQA/MHA. Options: ``"NHD"`` (page, seq, head, dim) or
        ``"HND"`` (page, head, seq, dim). Default: ``"NHD"``. Ignored for MLA.
    q_rope_out : Optional[torch.Tensor]
        Pre-allocated output tensor for quantized query (rotary). If ``None``, allocated automatically.
    q_nope_out : Optional[torch.Tensor]
        Pre-allocated output tensor for quantized query (non-rotary). If ``None``, allocated automatically.
    enable_pdl : bool
        Whether to enable PDL (Programmatic Dependent Launch). Default: ``False``.

    Returns
    -------
    Tuple[torch.Tensor, torch.Tensor]
        Quantized query tensors: (q_rope_out, q_nope_out).
        K/V are written directly to the paged cache and not returned.

    Notes
    -----
    - Architecture detection: Automatically distinguishes MLA (2D K tensors) from GQA/MHA (3D K tensors).
    - MLA writes K-RoPE to ``kpe_cache`` and K-noRoPE to ``ckv_cache``; V is not used.
    - GQA/MHA writes full K (RoPE+noRoPE) to ``k_cache`` and V to ``v_cache``.
    - The ``batch_indices`` and ``positions`` tensors are typically obtained from
      ``flashinfer.get_batch_indices_positions()``.
    - Cache tensors must already be allocated in the target FP8 dtype.
    r   r   r   r   Nr   r   z'MLA should not have V input (pass None)z+paged_kv_cache must be a tuple of 2 tensorsz#MLA cache dtype mismatch: expected z
, got ckv=z, kpe=   z;MLA cache must be 3D: (max_pages, page_size, dim), got ckv=zD, kpe=DzbGQA/MHA expects a V tensor, but got None. Only MLA uses None for V (compressed KV representation).z'GQA/MHA cache dtype mismatch: expected z, got k=z, v=rm   z GQA/MHA cache must be 4D, got k=zD, v=)TensorLayout)r   ru   r   r   r   r   r   r   r   rv   	unsqueezelenutilsr   valueintrS   )!r   r   r   r   r   r;   r,   r   rJ   rK   rL   rM   r}   r   r<   r=   rO   r   r3   r5   r>   r   r   r   r   cache_0cache_1rG   rH   rE   rF   r   rN   s!                                    r   'rope_quantize_fp8_append_paged_kv_cacher     s   V em++:;;; [AF ,q/C<?L~qV]
    """
    if cos_sin_cache.dtype != torch.float32:
        raise ValueError("cos_sin_cache should be float32")

    # MLA is detected from a 2D key tensor; GQA/MHA keys are 3D.
    is_mla = k_rope.ndim == 2
    nnz = q_rope.shape[0]
    num_qo_heads = q_rope.shape[1]

    if q_nope is None:
        q_nope = torch.empty(
            nnz, num_qo_heads, 0, dtype=q_rope.dtype, device=q_rope.device
        )
    if k_nope is None:
        if is_mla:
            k_nope = torch.empty(nnz, 0, dtype=k_rope.dtype, device=k_rope.device)
        else:
            num_kv_heads = k_rope.shape[1]
            k_nope = torch.empty(
                nnz, num_kv_heads, 0, dtype=k_rope.dtype, device=k_rope.device
            )

    if quantize_dtype is None:
        if q_rope_out is not None:
            quantize_dtype = q_rope_out.dtype
        elif q_nope_out is not None:
            quantize_dtype = q_nope_out.dtype
        else:
            quantize_dtype = torch.float8_e4m3fn

    if q_rope_out is None:
        q_rope_out = torch.empty_like(q_rope, dtype=quantize_dtype)
    if q_nope_out is None:
        q_nope_out = torch.empty_like(q_nope, dtype=quantize_dtype)

    if is_mla:
        if k_rope.ndim == 2:
            k_rope = k_rope.unsqueeze(1)
        if k_nope.ndim == 2:
            k_nope = k_nope.unsqueeze(1)
        if v is None:
            v = torch.empty(0, dtype=q_rope.dtype, device=q_rope.device)
        else:
            raise ValueError("MLA should not have V input (pass None)")
        if len(paged_kv_cache) != 2:
            raise ValueError("paged_kv_cache must be a tuple of 2 tensors")
        ckv_cache, kpe_cache = paged_kv_cache
        if ckv_cache.dtype != quantize_dtype or kpe_cache.dtype != quantize_dtype:
            raise ValueError(
                f"MLA cache dtype mismatch: expected {quantize_dtype}, "
                f"got ckv={ckv_cache.dtype}, kpe={kpe_cache.dtype}"
            )
        if ckv_cache.ndim != 3 or kpe_cache.ndim != 3:
            raise ValueError(
                "MLA cache must be 3D: (max_pages, page_size, dim), "
                f"got ckv={ckv_cache.ndim}D, kpe={kpe_cache.ndim}D"
            )
        # GQA/MHA caches are unused in the MLA path; pass empty placeholders.
        k_cache = torch.empty(0, dtype=quantize_dtype, device=q_rope.device)
        v_cache = torch.empty(0, dtype=quantize_dtype, device=q_rope.device)
    else:
        k_cache, v_cache = paged_kv_cache
        if v is None:
            raise ValueError(
                "GQA/MHA expects a V tensor, but got None. "
                "Only MLA uses None for V (compressed KV representation)."
            )
        if k_cache.dtype != quantize_dtype or v_cache.dtype != quantize_dtype:
            raise ValueError(
                f"GQA/MHA cache dtype mismatch: expected {quantize_dtype}, "
                f"got k={k_cache.dtype}, v={v_cache.dtype}"
            )
        if k_cache.ndim != 4 or v_cache.ndim != 4:
            raise ValueError(
                f"GQA/MHA cache must be 4D, got k={k_cache.ndim}D, v={v_cache.ndim}D"
            )
        # MLA caches are unused in the GQA/MHA path; pass empty placeholders.
        ckv_cache = torch.empty(0, dtype=quantize_dtype, device=q_rope.device)
        kpe_cache = torch.empty(0, dtype=quantize_dtype, device=q_rope.device)

    from .utils import TensorLayout

    kv_layout_code = TensorLayout[kv_layout].value
    kv_indices = kv_indices.contiguous()
    kv_indptr = kv_indptr.contiguous()
    batch_indices = batch_indices.contiguous()
    positions = positions.contiguous()

    _rope_quantize_fp8_append_paged_kv_cache(
        q_rope, k_rope, q_nope, k_nope, v,
        q_rope_out, q_nope_out, cos_sin_cache, pos_ids,
        k_cache, v_cache, ckv_cache, kpe_cache,
        kv_indices, kv_indptr, batch_indices, positions,
        kv_layout_code, page_size, quant_scale_q, quant_scale_kv,
        (not is_neox), enable_pdl,
    )
    return q_rope_out, q_nope_out