
    )`i%                         d Z ddlZddlZddlmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ej        d             Z G d d          Z G d de          ZdS )a3  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)OptionalTupleUnion   )flashinfer_api)gen_batch_attention_module)MaskModePosEncodingModeTensorLayout_check_kv_layout_unpack_paged_kv_cachedetermine_attention_backend)#BatchPrefillWithPagedKVCacheWrapper)attention_sink_decl)filename_safe_dtype_mapc                  8    t          |                                  S )N)r   build_and_load)argss    h/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/attention.pyget_holistic_attention_moduler   &   s    %t,;;===    c            "          e Zd Ze	 	 d"dedefd            Zedddej        ej        dfdej        d	ej        d
ej        dej        de	de	de	de	de	de
dedee         dej        dej        de
ddf d            Ze	 	 	 	 	 	 d#dej        deej        eej        ej        f         f         deej                 deej                 deej                 deej                 ded eej                 deej        ej        f         fd!            ZdS )$BatchAttentionNHDcuda	kv_layoutdevicec                    t          |           || _        t          j        dt          j        t          j        |                    | _        t          j        dt          j        t          j        |                    | _        t          j        dt          j        t          j        d          d          | _        d S )Ni   )dtyper   i   cpuT)r   r   
pin_memory)	r   
_kv_layouttorchemptyuint8r   float_workspace_bufferint_workspace_buffer page_locked_int_workspace_buffer)selfr   r   s      r   __init__zBatchAttention.__init__,   s     	####&+k+<'''
 '
 '
#
 %*K+<''%
 %
 %
!
 16+<&&	1
 1
 1
---r   FN	qo_indptr	kv_indptr
kv_indices
kv_len_arrnum_qo_headsnum_kv_headshead_dim_qkhead_dim_vo	page_sizecausalsm_scalelogits_soft_capq_data_typekv_data_typeuse_profilerreturnc                    |d}|| _         ||||j        ||t          d         j        |dk    |f	}t	          | | _        |                    t          j        d          d          }|                    t          j        d          d          }|                    t          j        d          d          }t          j	        
                                 |j        d         }|	| _        || _        |
rt          j        j        nt          j        j        | _        || _        || _        |	| _        || _        || _        | j                            | j        | j        | j        ||||||||
          | _        d S )N        NONEr    T)non_blockingr   )_logits_soft_capr   r
   valuer   moduletor#   r   r   synchronizeshape
_page_size	_sm_scaler	   CAUSAL
NON_CAUSAL
_mask_mode_num_qo_heads_num_kv_heads_use_profiler_kv_indicesplanr&   r'   r(   
_plan_info)r)   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   get_module_argsqo_indptr_hostkv_indptr_hostkv_len_arr_host
batch_sizes                        r   rN   zBatchAttention.planF   sp   & "!O / OF#)c!

 4_E"el5&9&9MM"el5&9&9MM$--U(;(;$-OO
   %a(
#!39X(///x?R?X))#) &+**'%1
 
r   r<   qkv_cacheoutlsek_scalev_scaleprofiler_bufferc	                    || j         rt          d          |dk    r| j        dk    rt          d          t          || j                  \  }	}
|t          j        |          }|=t          j        |j        d         |j        d         |j	        t
          j
                  }|j        d         }| j        }|dt          j        |          z  }|||z  }|d}| j         r|fnd	} | j        j        | j        | j        | j        ||	|
| j        ||| j        t*          | j                 j        | j        | j        | j        |||g|R   ||fS )
Nz5Profiler is enabled, profiler_buffer must be providedr<   zhlogits_soft_cap used in kernel run but not provided in plan(). This will cause template deduction error.r   r   )r   r      g      ? )rL   
ValueErrorr?   r   r"   r#   
empty_liker$   rD   r   float32rF   mathsqrtrA   runr&   r'   rO   rM   rI   r   r@   rJ   rK   rE   )r)   rU   rV   rW   rX   rY   rZ   r6   r[   k_cachev_cacher1   r5   profiler_argss                 r   rd   zBatchAttention.run   s    "!  K   S  T%:c%A%Az   2(DOLL;"1%%C;+
AGAJqxu}  C gaj>TY{333HH?G.2.@H**b'%OO)/O#	
( )	
 	
 	
 	
. Cxr   )r   r   )NNNNr<   N)__name__
__module____qualname__r   strr*   r#   bfloat16Tensorintboolfloatr   r   rN   r   r   rd   r^   r   r   r   r   +   s0        
 

 
 
 
 ^
2  +/#(>$)N"!A
 A
<A
 <A
 L	A

 LA
 A
 A
 A
 A
 A
 A
 A
 "%A
 [A
 kA
  !A
" 
#A
 A
 A
 ^A
F 
 '+&**.*.!$26? ?<? eEL%,,F&GGH? el#	?
 el#? %,'? %,'? ? "%,/? 
u|U\)	*? ? ? ^? ? ?r   r   c            %       D    e Zd ZdZdddddddddddej        ej        dddfd	ej        d
edede	ej                 de	ej                 de	ej                 de	ej                 de	ej                 de	ej                 dedededej
        dej
        dedededdf$ fdZ xZS )&BatchAttentionWithAttentionSinkWrappera  
    Wrapper for prefill and decode attention with paged KV-cache that adds support for
    attention sinks. This class extends `BatchPrefillWithPagedKVCacheWrapper`, providing
    a convenient interface for using attention sinks during prefill or decode attention.
    r   FNautor=      r&   r   use_cuda_graphqo_indptr_bufpaged_kv_indptr_bufpaged_kv_indices_bufpaged_kv_last_page_len_bufcustom_mask_bufmask_indptr_bufbackendpos_encoding_modeuse_fp16_qk_reductionr7   r8   r1   r2   window_leftr:   c                    |
dv sJ |
dk    r+t          |j        t          |         j        ||d u||          }
dt          |          d|dk     d|
 |||t
          j        ||dgdgd	gd
gdt          |
         g}|dk    |t          |         j        d}t                      	                    |||||||||	|
||           d S )N)fa2fa3rs   rs   batch_prefill_attention_sink__swa_r   _sinkrp   r5   doubleAttentionSink)use_sliding_windowr   r~   )r&   r   rv   rw   rx   ry   rz   r{   r|   r}   jit_args
jit_kwargs)
r   r   r
   r@   r   r#   int32r   superr*   )r)   r&   r   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r7   r8   r1   r2   r   r   r   	__class__s                       r   r*   z/BatchAttentionWithAttentionSinkWrapper.__init__   s0   * 00000f1&- 128%t+ G t,CK,PssWbfgWgssjqssKHILJ(
  #."2%:!01B!C!I
 

 	#9)' 3!5'A++! 	 	
 	
 	
 	
 	
r   )rh   ri   rj   __doc__r#   rl   rm   rk   ro   r   r   rn   r*   __classcell__)r   s   @r   rr   rr      s         $046:7;=A2626!'&+#(>$)N%C
 C
 %C
 C
 	C

  -C
 &el3C
 'u|4C
 %-U\$:C
 "%,/C
 "%,/C
 C
 C
  $C
 [C
 kC
  !C
" #C
$ %C
& 
'C
 C
 C
 C
 C
 C
 C
 C
 C
 C
r   rr   )r   	functoolsrb   typingr   r   r   r#   api_loggingr   jitr   utilsr	   r
   r   r   r   r   prefillr   jit.attention.variantsr   	jit.utilsr   cacher   r   rr   r^   r   r   <module>r      s          ) ) ) ) ) ) ) ) ) )  ' ' ' ' ' ' + + + + + +                9 8 8 8 8 8 7 7 7 7 7 7 . . . . . . > > >_ _ _ _ _ _ _ _DJ
 J
 J
 J
 J
-P J
 J
 J
 J
 J
r   