
    )`iH              ,       l   d Z ddlZddlmZ ddlmZmZ ddlZddlm	Z	 ddl
mZmZ ddlmZ dd	lmZmZmZmZmZ ej        d
ej        dej        dededededej        defd            Ze		 	 	 	 	 	 	 	 	 	 d,dej        dej        dej        dej        dej        dej        dej        dej        dededeej                 deeej        f         d eeej        f         d!ed"ed#ee         d$ee         d%eded&eej                 d'df*d(            Zej        	 d-d
ej        dej        dedededefd*            Ze		 	 	 	 d.dej        dej        dej        dej        dej        dej        dej        dej        dedeeej        f         d eeej        f         d#ee         d$ee         d'dfd+            ZdS )/a3  
Copyright (c) 2024 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)SimpleNamespace)OptionalUnion   )flashinfer_api)gen_xqa_modulegen_xqa_module_mla)filename_safe_dtype_map)get_device_sm_countregister_custom_opregister_fake_opget_compute_capabilitydevice_support_pdlinput_dtypekv_cache_dtype	page_sizehead_dimhead_group_ratiouse_sliding_windowoutput_dtype	q_seq_lenc           -         t          | |||||||                                          |dk    rd}nd}t          dt          |           dt          |          dt          |          d| d| d	| d
| d| d| d          dt          dt
          dt
          dt
          dt          t          t          j	        f         dt          j	        dt          dt          j	        dt          t          j	                 dt          j	        dt          j	        dt          j	        dt
          dt          j	        dt
          dt          t          t          j	        f         dt          j	        d t          j	        d!t          d"t
          d#t          t          j	                 d$d f,fd%            }	t          dt          |           dt          |          dt          |          d| d| d	| d
| d| d|           dt          dt
          dt
          dt
          dt          t          t          j	        f         dt          j	        dt          dt          j	        dt          t          j	                 dt          j	        dt          j	        dt          j	        dt
          dt          j	        dt
          dt          t          t          j	        f         dt          j	        d t          j	        d!t          d"t
          d#t          t          j	                 d$d f,d&            }
t          |	'          S )(Nr   TFzflashinfer::xqa_input_
_kv_cache__output__page_size_
_head_dim__head_group_ratio__use_sliding_window__use_spec_dec__spec_q_seq_len_outputworkspace_buffermutates_argsrun_sm90_fp8_mhasm_countnum_kv_headssliding_win_sizeq_scaler"   rcp_out_scaleqsinksk_cachev_cache
page_tablemax_seq_lenseq_lens
batch_sizekv_scale
semaphoresr#   
enable_pdlr   maskreturnc                 *                        | |||t          |t          j                  rdn|t          |t                    rd n||||||	|
||||t          |t          j                  rdn|t          |t                    rd n||||||           d S N      ?)xqa_wrapper
isinstancetorchTensorfloat)r&   r'   r(   r)   r*   r"   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r#   r6   r   r7   modules                        b/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/xqa.pyxqazget_xqa_module.<locals>.xqa=   s    6 	gu|44ACC'w..;DDGh55CCC8x//=DDX/	
 	
 	
 	
 	
    c                     d S N )r&   r'   r(   r)   r*   r"   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r#   r6   r   r7   s                        rB   	_fake_xqaz!get_xqa_module.<locals>._fake_xqar   s	    4 	rD   )rC   )r   build_and_loadr   r
   boolintr   r@   r>   r?   r   r   r   )r   r   r   r   r   r   r   r   use_spec_decrC   rH   rA   s              @rB   get_xqa_modulerM   "   sz    	 	 n  1}} 	W!8!E  	W  	WQhiwQx  	W  	W  CZ  [g  Ch  	W  	W  u~  	W  	W  JR  	W  	W  fv  	W  	W  L^  	W  	W  nz  	W  	W  LU  	W  	W3  /
/
/
 /
 	/

 uel*+/
 /
 /
 </
 %/
 /
 /
 L/
 /
 ,/
 /
  u|+,!/
" L#/
$  ,%/
& '/
( )/
* u|$+/
, 
-/
 /
 /
 /
 /
	 /
b  	W!8!E  	W  	WQhiwQx  	W  	W  CZ  [g  Ch  	W  	W  u~  	W  	W  JR  	W  	W  fv  	W  	W  L^  	W  	W  nz  	W  	W  LU  	W  	W   	
 uel*+   < %   L  ,   u|+,!" L#$  ,%& '( )* u|$+, 
-   2    rD   r;   NHDr,   r.   r/   r0   r2   r"   r#   r5   r(   r-   r*   r4   r)   	kv_layoutr'   r6   r+   r7   r8   c                    |t          | j                  }||nt          | j                  }| j        d         }| j        d         }| j        d         }||z  }|j        d         }||	z  }|dk    }|j        |j        k    s
J d            |j        t
          j        k    r |j        t
          j        k    s
J d            n|j        | j        k    s
J d            |dk    r,|                    d	d          }|                    d	d          }|j        t
          j        k    r/t          t          j        d
                    d         dk    rd}nd}t          t          j        d
                    d         dvrt          d          t          | j        |j        |	||||j        |          }|dk    r|
J d            |
d}|                    ||||r|nd|||| |
||||||||||||           dS )a  Apply attention with paged KV cache using XQA kernel.
    Parameters
    ----------
    q : torch.Tensor
        Query tensor with shape ``[batch_size, beam_width, num_q_heads, head_dim]`` if not using speculative decoding,
        or ``[batch_size, beam_width, q_seq_len, num_q_heads, head_dim]`` if using speculative decoding. ``q_seq_len`` is the number of speculative decoding tokens.
        Data type should be torch.float16 or torch.bfloat16.
        Now only beam_width 1 is supported.
    k_cache: torch.Tensor
        Paged K cache tensor with shape ``[num_pages, page_size, num_kv_heads, head_dim]`` if :attr:`kv_layout` is ``NHD``,
        or ``[num_pages, num_kv_heads, page_size, head_dim]`` if :attr:`kv_layout` is ``HND``.
        Data type should match query tensor or be torch.float8_e4m3fn, in which case xqa will run fp8 calculation.
        Should be the same data type as v_cache.
    v_cache: torch.Tensor
        Paged V cache tensor with shape ``[num_pages, page_size, num_kv_heads, head_dim]`` if :attr:`kv_layout` is ``NHD``,
        or ``[num_pages, num_kv_heads, page_size, head_dim]`` if :attr:`kv_layout` is ``HND``.
        Data type should match query tensor or be torch.float8_e4m3fn, in which case xqa will run fp8 calculation.
        Should be the same data type as k_cache.
    page_table : torch.Tensor
        Page table tensor with shape ``batch_size, nb_pages_per_seq``.
        Data type should be torch.int32.
        K and V share the same table.
    seq_lens : torch.Tensor
        Sequence lengths tensor with shape ``[batch_size, beam_width]``.
        Data type should be torch.uint32.
    output : torch.Tensor
        Output tensor with shape that matches the query tensor.
        Data type should match query tensor or kv tensor. This tensor will be modified in-place.
    workspace_buffer : torch.Tensor
        Workspace buffer for temporary computations.
        Data type should be torch.uint8.
    semaphores : torch.Tensor
        Semaphore buffer for synchronization.
        Data type should be torch.uint32.
    num_kv_heads : int
        Number of key-value heads in the attention mechanism.
    page_size : int
        Size of each page in the paged KV cache. Must be one of [16, 32, 64, 128].
    sinks : Optional[torch.Tensor], default=None
        Attention sink values with shape ``[num_kv_heads, head_group_ratio]``.
        Data type should be torch.float32.
        If None, no attention sinks are used.
    q_scale : Union[float, torch.Tensor], default=1.0
        Scale factor for query tensor.
    kv_scale : Union[float, torch.Tensor], default=1.0
        Scale factor for KV cache.
    sliding_win_size : int, default=0
        Sliding window size for attention. If 0, no sliding window is used.
    kv_layout : str, default="NHD"
        The layout of the KV cache. Can be either ``NHD`` or ``HND``.
    sm_count : Optional[int], default=None
        Number of streaming multiprocessors to use.
        If None, will be inferred from the device.
    enable_pdl : Optional[bool], default=None
        Whether to enable PDL (Persistent Data Loader) optimization.
        If None, will be set to True if hardware supports it.
    rcp_out_scale : float, default=1.0
        Reciprocal of output scale factor.
    q_seq_len : int, default=1
        Query sequence length. When > 1, enables speculative decoding mode.
    mask : Optional[torch.Tensor], default=None
        Causal attention mask for speculative decoding mode (when ``q_seq_len > 1``).
        Shape: ``[batch_size, q_seq_len, mask_size_per_row]`` where
        ``mask_size_per_row = ((q_seq_len + 31) // 32) * 2``.
        Data type should be torch.uint16 (bit-packed format, aligned to 32 bits).

    Note
    ----
    The function automatically infers several parameters from tensor shapes:
    - batch_size from q.shape[0]
    - num_q_heads from q.shape[-2]
    - head_dim from q.shape[-1]
    - input_dtype from q.dtype
    - kv_cache_dtype from k.dtype
    - head_group_ratio from num_q_heads // num_kv_heads
    - max_seq_len from page_table.shape[-1] * page_size
    Nr   &K and V cache must have the same dtypez'KV cache must be fp8 when output is fp8z)Output and query must have the same dtypeHNDcudadevice	   TF)rY   
      z0XQA is only supported on SM90, SM100, SM120 GPUsr   z)Mask is required for speculative decoding)r   rX   r   shapedtyper>   float8_e4m3fn	transposer   RuntimeErrorrM   rC   )r,   r.   r/   r0   r2   r"   r#   r5   r(   r   r-   r*   r4   r)   rO   r'   r6   r+   r   r7   r3   num_q_headsr   r   num_pages_per_seqr1   r   r&   
xqa_modules                                rB   rC   rC      sp   J &qx00)5;Mah;W;WJ J'"+Kwr{H #l2 #(,#i/K *A-=GM)))+S)))|u***} 33335 4333 |qw&&&(S&&& E##B++##B++ 	,,,"5<v#>#>#>??BaGG el&999::1=[PPMNNN		 	J 1}}!L$NN.5A	+    rD   Fc                 d   t          | |||||                                          t          dt          |           dt          |          d| d| d| d| d          d	t          d
t
          t          t          j        f         dt          j        dt          j        dt          j        dt          j        dt          j        dt          dt          j        dt          dt
          t          t          j        f         dt          j        dt          j        dt          dd ffd            }t          dt          |           dt          |          d| d| d| d|           d	t          d
t
          t          t          j        f         dt          j        dt          j        dt          j        dt          j        dt          j        dt          dt          j        dt          dt
          t          t          j        f         dt          j        dt          j        dt          dd fd            }t          |          S )Nzflashinfer::xqa_mla_input_r   r   r   r   r   r!   r$   r'   r*   r"   r,   r.   r/   r0   r1   r2   r3   r4   r5   r#   r6   r8   c                                         | t          |t          j                  rdn|t          |t                    rd n|||||||||	t          |
t          j                  rdn|
t          |
t                    rd n|
|||           d S r:   )xqa_wrapper_mlar=   r>   r?   r@   )r'   r*   r"   r,   r.   r/   r0   r1   r2   r3   r4   r5   r#   r6   rA   s                 rB   xqa_mlaz#get_xqa_module_mla.<locals>.xqa_mlaa  s    ( 	gu|44ACC'w..;DDGh55CCC8x//=DDX!	
 	
 	
 	
 	
rD   c                     d S rF   rG   )r'   r*   r"   r,   r.   r/   r0   r1   r2   r3   r4   r5   r#   r6   s                 rB   _fake_xqa_mlaz)get_xqa_module_mla.<locals>._fake_xqa_mla  s	    & 	rD   )rg   )r	   rI   r   r
   rK   r   r@   r>   r?   rJ   r   r   )	r   r   r   r   r   r   rg   ri   rA   s	           @rB   get_xqa_module_mlarj   O  s)       n   	u%<[%I  	u  	uUlm{U|  	u  	u  JS  	u  	u  _g  	u  	u  {K  	u  	u  as  	u  	u3  !
!
uel*+!
 !
 <	!

 !
 !
 L!
 !
 ,!
 !
 u|+,!
 L!
  ,!
 !
 
!
 !
 !
 !
 !
	 !
F  	u%<[%I  	u  	uUlm{U|  	u  	u  JS  	u  	u  _g  	u  	u  {K  	u  	u  as  	u  	u uel*+  <	
   L  ,  u|+, L  ,  
   $    rD   c                    |t          | j                  }||nt          | j                  }| j        d         }| j        d         }d}|j        d         }||z  }|j        |j        k    s
J d            t          t          j        d                    d         dvrt          d	          t          | j        |j        |||d
          }|	                    ||	|| |||||||
|||           dS )a  Apply attention with paged KV cache using XQA MLA (Multi-Head Latent Attention) kernel.
    Parameters
    ----------
    q : torch.Tensor
        Query tensor with shape ``[batch_size, beam_width, num_q_heads, head_dim]``.
        Data type should be torch.float8_e4m3fn.
        Now only beam_width 1 is supported.
    k_cache: torch.Tensor
        Paged K cache tensor with shape ``[total_num_cache_heads, head_dim]``.
        Data type should be torch.float8_e4m3fn
    v_cache: torch.Tensor
        Paged V cache tensor with shape ``[total_num_cache_heads, head_dim]``.
        Data type should be torch.float8_e4m3fn
    page_table : torch.Tensor
        Page table tensor with shape ``batch_size, nb_pages_per_seq``.
        Data type should be torch.int32.
        K and V share the same table.
    seq_lens : torch.Tensor
        Sequence lengths tensor with shape ``[batch_size, beam_width]``.
        Data type should be torch.uint32.
    output : torch.Tensor
        Output tensor with shape ``[batch_size, beam_width, num_q_heads, head_dim]``.
        Data type should be torch.bfloat16. This tensor will be modified in-place.
    workspace_buffer : torch.Tensor
        Workspace buffer for temporary computations.
        Data type should be torch.uint8.
    semaphores : torch.Tensor
        Semaphore buffer for synchronization.
        Data type should be torch.uint32.
    page_size : int
        Size of each page in the paged KV cache. Must be one of [16, 32, 64, 128].
    q_scale : Union[float, torch.Tensor], default=1.0
        Scale factor for query tensor.
    kv_scale : Union[float, torch.Tensor], default=1.0
        Scale factor for KV cache.
    sm_count : Optional[int], default=None
        Number of streaming multiprocessors to use.
        If None, will be inferred from the device.
    enable_pdl : Optional[bool], default=None
        Whether to enable PDL (Persistent Data Loader) optimization.
        If None, will be set to True if hardware supports it.

    Note
    ----
    The function automatically infers several parameters from tensor shapes:
    - batch_size from q.shape[0]
    - head_dim from q.shape[-1]
    - head_group_ratio is fixed to 128 for MLA
    - max_seq_len from page_table.shape[-1] * page_size
    Nr   rR      rS   rV   rW   )r[   z'XQA MLA is only supported on SM120 GPUsF)
r   rX   r   r\   r]   r   r>   r`   rj   rg   )r,   r.   r/   r0   r2   r"   r#   r5   r   r*   r4   r'   r6   r3   r   r   rb   r1   rc   s                      rB   rg   rg     s2   F &qx00)5;Mah;W;WJ Jwr{H  #(,#i/K=GM)))+S)))el&999::1=TIIDEEE#	 J 	    rD   )
Nr;   r;   r   rN   NNr;   r   N)F)r;   r;   NN) __doc__	functoolstypesr   typingr   r   r>   api_loggingr   jit.xqar   r	   	jit.utilsr
   utilsr   r   r   r   r   cacher]   rK   rJ   rM   r?   r@   strrC   rj   rg   rG   rD   rB   <module>rw      s=         ! ! ! ! ! ! " " " " " " " "  ' ' ' ' ' ' 7 7 7 7 7 7 7 7 . . . . . .              mmKm m 	m
 m m +m m m m m`  %)*-+."!%#')x x|x\x \x 	x
 lx Lx lx x x x EL!x 5%,&'x E5<'(x x x  sm!x" #x$ %x& 'x( 5<
 )x* 
+x x x xv   %O OOKO O 	O
 O O O O Od  +.+."!%n n|n\n \n 	n
 ln Ln ln n n 5%,&'n E5<'(n smn n 
n n n n n nrD   