
    .`i*              3          d Z ddlZddlmZ ddlmZ ddlZddlZddl	m
Z
 ddlmZmZmZmZmZ ddlmZmZmZ ddlmZ dd	lmZ  e            rdd
lmZmZmZmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddlm-Z-m.Z.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4  e&e5          Z6 G d de          Z7e G d d                      Z8dede9e:e;e;f         dz           fdZ< G d de.e8                   Z= G d de          Z>d e;d!ej?        d"e;d#e;d$e@d%e@d&e@d'e;d(e;de@fd)ZA	 	 	 	 	 	 dBd*ejB        d+ejB        d,ejB        d-ejB        d.ejB        d/e;d0ejB        d1ejB        d2ejB        d3e;d4eCd5ejB        dz  d6e:e;e;f         d7eCd8ejB        d e;d9e;d:e;d;ejB        dz  d<ejB        dz  d=ejB        dz  d>ejB        dz  d?ejB        dz  d@ejB        dz  dejB        f2dAZDdS )Cz$Attention layer with FlashAttention.    N)	dataclass)ClassVar)	Attention)AttentionBackendAttentionImplAttentionType
MultipleOfis_quantized_kv_cache)flash_attn_supports_fp8get_flash_attn_version#is_flash_attn_varlen_func_available)cp_lse_ag_out_rs)merge_attn_states)flash_attn_supports_sinksflash_attn_varlen_funcget_scheduler_metadatareshape_and_cache_flash)
VllmConfigget_current_vllm_configget_layers_from_vllm_config)
CacheDTypeget_dcp_group)init_logger)vllm_is_batch_invariantDeviceCapability)cdiv)AttentionCGSupportAttentionMetadataBuilderCommonAttentionMetadata)get_dcp_local_seq_lensget_kv_cache_layout)AttentionSpecc                      e Zd ZU dZeed<   ej        ej        gZ	e
eej                          ed<   edeeez           fd            ZdZeed<   edefd            Zed	edefd
            Zeded         fd            Zeded         fd            Ze	 d'dedededededeedf         fd            Ze	 d(dedeedf         fd            Zededej        fd            Zededefd            Zededz  defd            Zedefd            Z ede!defd             Z"eded!ej        dedz  dedz  d"ed#ed$ed%e!dedz  fd&            Z#dS ))FlashAttentionBackendTaccept_output_buffersupported_dtypesreturnc                      t                      } | j        }| j        }|r!|j        r|j        dk    s|j        dk    rg dS t          d          gS )Nfloat32)       @   r,   )r   model_configcache_config	is_hybridmamba_ssm_cache_dtypemamba_cache_dtyper	   )vllm_configr/   r0   s      y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/flash_attn.py get_supported_kernel_block_sizesz6FlashAttentionBackend.get_supported_kernel_block_sizes>   si    -//"/"/	 &	  2i??1Y>>  <<2    F forward_includes_kv_cache_updatec                      dS )N
FLASH_ATTN r;   r7   r5   get_namezFlashAttentionBackend.get_nameT   s    |r7   	attn_typec                 b    |t           j        t           j        t           j        t           j        fv S )z,FlashAttention supports all attention types.)r   DECODERENCODERENCODER_ONLYENCODER_DECODER)clsr=   s     r5   supports_attn_typez(FlashAttentionBackend.supports_attn_typeX   s.     !!&)	
 
 	
r7   FlashAttentionImplc                      t           S N)rE   r;   r7   r5   get_impl_clsz"FlashAttentionBackend.get_impl_clsb   s    !!r7   FlashAttentionMetadataBuilderc                      t           S rG   )rI   r;   r7   r5   get_builder_clsz%FlashAttentionBackend.get_builder_clsf   s    ,,r7   auto
num_blocks
block_sizenum_kv_heads	head_sizecache_dtype_str.c                 @    |dz  dk    rt          d          d| |||fS )Nr,   r   z$Block size must be a multiple of 16.   )
ValueError)rM   rN   rO   rP   rQ   s        r5   get_kv_cache_shapez(FlashAttentionBackend.get_kv_cache_shapej   s3     ?aCDDD:z<CCr7   include_num_layers_dimensionc                     t                      }|dk    r| rdS |dk    rd}n&|dk    r| rdS |dk    rd}nt          d| d          |S )	NNHD)rS   r               )r   rY   rS   rZ   r[   HND)rS   r[   r   rY   rZ   r\   )r   rY   rZ   rS   r[   zUnknown cache layout format .)r#   rT   )rV   cache_layoutstride_orders      r5   get_kv_cache_stride_orderz/FlashAttentionBackend.get_kv_cache_stride_orderv   s     +,,5  %A %%U""*LLU""'C"%%U""*LLKLKKKLLLr7   kv_cache_dtypec                 F    | dv rt           j        S t          d|            )N)fp8fp8_e4m3zUnrecognized FP8 dtype: )torchfloat8_e4m3fnrT   )rb   s    r5   get_fp8_dtype_for_flashattnz1FlashAttentionBackend.get_fp8_dtype_for_flashattn   s.    000&&HHHIIIr7   c                      |dz  dk    o|dk    S )N   r      r;   )rC   rP   s     r5   supports_head_sizez(FlashAttentionBackend.supports_head_size   s    1}!6i3&66r7   Nc                 X    |dS |                     d          rt                      S |dv S )NTrd   )rL   bfloat16)
startswithr   )rC   rb   s     r5   supports_kv_cache_dtypez-FlashAttentionBackend.supports_kv_cache_dtype   s;    !4$$U++ 	-*,,,!555r7   c                 >    t                      sdS t                      S )NF)r   r   )rC   s    r5   supports_sinkz#FlashAttentionBackend.supports_sink   s!    244 	5(***r7   
capabilityc                 *    |t          dd          k    S )Nrj   r   r   )rC   rs   s     r5   supports_compute_capabilityz1FlashAttentionBackend.supports_compute_capability   s    -a3333r7   dtypeuse_mlahas_sink
use_sparsedevice_capabilityc	                 6    |r|t          dd          k     rdS d S )N	   r   z.sink not supported on compute capability < 9.0r   )	rC   rP   rv   rb   rN   rw   rx   ry   rz   s	            r5   supports_combinationz*FlashAttentionBackend.supports_combination   s.      	D),<Q,B,BBBCCtr7   )rL   F)$__name__
__module____qualname__r'   bool__annotations__rf   float16rn   r(   r   listrv   staticmethodintr	   r6   r8   strr<   classmethodrD   typerH   rK   tuplerU   ra   rh   rl   r   rp   rr   r   ru   r}   r;   r7   r5   r&   r&   :   se        !%$%%%5:]EN4ShtEK01SSS d33C.D       \ & .3$d222c    \ 
3 
4 
 
 
 [
 "$34 " " " \" -T"AB - - - \-   &	D 	D	D	D 	D 		D
 	D 
sCx	D 	D 	D \	D -2 &*	sCx   \( JC JEK J J J \J 73 74 7 7 7 [7 6Z$5F 64 6 6 6 [6 +d + + + [+
 45E 4$ 4 4 4 [4  { #T)	
 $J    , 
t   [  r7   r&   c                   j   e Zd ZU eed<   eed<   ej        ed<   eed<   ej        ed<   ej        ed<   ej        ed<   eed<   eed	<   ej        d
z  ed<   ej        d
z  ed<   ej        d
z  ed<   d
Zed
z  ed<   d
Z	ej        d
z  ed<   d
Z
ej        d
z  ed<   d
Zej        d
z  ed<   dZeed<   dZeed<   d
S )FlashAttentionMetadatanum_actual_tokensmax_query_lenquery_start_locmax_seq_lenseq_lensblock_tableslot_mappinguse_cascadecommon_prefix_lenNcu_prefix_query_lensprefix_kv_lenssuffix_kv_lensmax_dcp_context_kv_lendcp_context_kv_lensscheduler_metadataprefix_scheduler_metadatar   max_num_splitsTcausal)r   r   r   r   r   rf   Tensorr   r   r   r   r   r   r   r;   r7   r5   r   r      sD         \!!!l, ,----L4''''L4'''' *.C$J---/3,333 /3t+22259u|d2999NCFDr7   r   r4   r)   c                     t                      }t          | t                    }|                                D ]=}t	          |j        t                    sJ |                    |j        j                   >|S )z<Get the set of all sliding window configs used in the model.)	setr   r   values
isinstanceimplrE   addsliding_window)r4   sliding_window_configslayerslayers       r5   _get_sliding_window_configsr      so     ;>%%(i@@F > >%*&899999""5:#<====!!r7   c            	           e Zd ZU  e            dk    rej        nej        ZdZe	e
d<   edddddefd	            Zded
ee         dedej        f fdZ	 ddedede	defdZdedej        dej        defdZde	fdZ xZS )rI   rZ   Tsupports_update_block_tabler4   r   kv_cache_specr$   r)   c                     |j         j        r8t                      dk    r&t                              d           t
          j        S | j        S )NrS   zFlashAttention2 does not support CUDA graphs with encoder-decoder models due to accuracy issues reported in #33091. Disabling CUDA graph.)r/   is_encoder_decoderr   loggerwarning_oncer   NEVER_cudagraph_support)rC   r4   r   s      r5   get_cudagraph_supportz3FlashAttentionMetadataBuilder.get_cudagraph_support  sT     $7		,&((A--(  
 &++%%r7   layer_namesdevicec                    t                                          ||||           |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        | j                            | j                  | _        | j        	                    | j                  | _
        |j        | _        | j                                        | _        |j        | _        d| _        t#                      dk    | _        	 ddlm}  |            j        | _         |            j        | _        n# t2          $ r d| _        d| _        Y nw xY w| j        j        | _        | j        j                                        | _        | j        j        | _        | j        rP| j        rItA          j!        |j"        j#        dz   t@          j$        | j%                  | _&        | j        j'        | _        d | _(        d S )Nr   rZ   r   rY   rv   r   ))super__init__r/   parallel_configr0   compilation_configattention_configget_num_attention_headsnum_heads_qget_num_kv_headsnum_heads_kvrv   rb   get_head_sizeheaddimrN   r   r   aot_schedulevllm.distributed.parallel_stater   
world_sizedcp_world_sizerank_in_groupdcp_rankAssertionErrorcp_kv_cache_interleave_sizecudagraph_modehas_full_cudagraphsuse_full_cuda_graphmax_cudagraph_capture_sizemax_cudagraph_sizerf   zerosscheduler_configmax_num_seqsint32r   r   (flash_attn_max_num_splits_for_cuda_graphaot_sliding_window)selfr   r   r4   r   r   	__class__s         r5   r   z&FlashAttentionMetadataBuilder.__init__  s    	[&III'4*:'4"-"@ + <,DD 
 
 !->>t?STT+1(6688'22449	EEEEEE"/-//"<D)MOO9DMM 	 	 	"#DDMMM	  < 	(
 #2FFHH 	  #'"9"T# 	(9 	&+k,9A=k{' ' 'D# %N  ;?s   ?.D. .E	E	Fr   common_attn_metadata
fast_buildc                 .    |j         }|j        }|j        }|j        }|j        }|j        }	|j        }
|j        }|j        } j	        o|  j
        jd _
        rat           j                  }t          |          dk    r|                                }|| _
        nt          |          dk    r	d _	        dd j        r j        | j        k    r j        t%                      rd fd}|dk    }d}d}d}d}d}d} j        dk    ro|dd         |dd         z
  }|	|z
  }t)          | j         j         j                  } j         j        z  }||z   dz
  |z   j        z  } ||||||d          }n|rt/          j        d|gt.          j         j        	          }t/          j        |gt.          j         j        	          }|	d|         |z
  } |d||||d          } |||||||z
  d
          }n |||||	||          } j        r6|4|j        d         }| j        d|<   d j        |d<    j        d|         }t;          di d|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d|d|dd|}|S )zu
        fast_build disables AOT scheduling, used when there will be few
        iterations i.e. spec-decode
        Nr   rY   Fr   c                    
j         j        }|                    d          rt                              |          }n
j        }r=t          | ||
j        
j        z  
j	        
j
        |||
j        |
j        	          S d S )Nrd   )
batch_sizemax_seqlen_qmax_seqlen_kr   r   r   cache_seqlens	qkv_dtypecu_seqlens_q	page_sizer   window_size
num_splits)r0   cache_dtypero   r&   rh   rb   r   r   r   r   r   rN   r   )r   cu_query_lensr   seqlensr   r   r   r   r   r   r   s           r5   schedulez5FlashAttentionMetadataBuilder.build.<locals>.schedule  s     +7K%%e,, 01MM 		 !/	 -)!.!, $ 043F F!%!2 L")'!."o! $ 7-    4r7   r   )r   r   r   r   r   r   r   Tr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r;   )num_reqsr   r   r   r   r   block_table_tensorr   r   r   r   r   r4   lenpopr   r   r   r   r   r"   r   r   rf   tensorr   r   shaper   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   sliding_window_configr   r   r   r   r   r   r   r   query_kv_lensnum_partitionsr   nattn_metadatar   r   s   `                           @@r5   buildz#FlashAttentionMetadataBuilder.buildU  sw    (00B,:*6.>'01D+8%, (;^"*&.D#
  ))DTEU)V)V&-..!33,B,F,F,H,H),82G//00144(-D%#(L$		1'3!T%<<< "0N"$$ 	N	 	 	 	 	 	 	8 (!+!""#$(!""+ABB//#2#2FFM"*]":"8##0	# # "043SSN~-1nD0&1" "*#-++2" " "  !	#(<%&ek$+$ $ $  #\"#5;t{  N &ixi03DDN(02/&-) ) )% "*#-+&'*;;" " " "*#-+ '" " " # 	=(:(F"(+A*<D#BQB'
 +,D#ABB'!%!8!!<. 
 
 
//
'-
 ,O
 $	

 X
 +*
 &
 $:#9
 !4 3
 $
 0/
  21
 "6!5
 *>
 *>
  '@&?!
" *>#
$ 6%
( r7   metadata	blk_tabler   c                 J    t          j         |          }||_        ||_        |S rG   )copyr   r   )r   r   r   r   new_metadatas        r5   update_block_tablez0FlashAttentionMetadataBuilder.update_block_table  s)     y**#, $0!r7   c                     t          |i |S rG   )use_cascade_attention)r   argskwargss      r5   r   z3FlashAttentionMetadataBuilder.use_cascade_attention  s    $d5f555r7   r~   )r   r   r   r   r   ALWAYSUNIFORM_BATCHr   r   r   r   r   r   r$   r   r   r   rf   r   r   r   r!   r   r   r   r   r   __classcell__)r   s   @r5   rI   rI      s        * "!##q(( 	!!- 
 )-,,,&!& '& 
	& & & [&&;?$;? #Y;?  	;?
 ;? ;? ;? ;? ;? ;?B !	w ww 6w 	w
 
 w w w wr	(	 <	 l		
 
 	 	 	 	6 6 6 6 6 6 6 6 6r7   rI   c                      e Zd ZU dZeed<   dej        ddfdedede	dede
e	         dz  d	edz  d
ede	dz  dededz  dej        dz  ddfdZ	 	 	 d$dej        j        dej        dej        dej        dej        dedej        dz  dej        dz  dej        dz  dej        fdZdej        j        dej        dej        dej        dej        ddfdZ	 	 	 d$dej        dej        dej        dej        dej        dej        dedej        dz  d ej        dz  d!ej        dz  dej        fd"Zdej        dej        dej        dej        dedej        j        dej        fd#ZdS )%rE   Tcan_return_lse_for_decodeN	num_headsrP   scalerO   alibi_slopesr   rb   logits_soft_capr=   kv_sharing_target_layer_namesinksr)   c                    || _         || _        t          |          | _        || _        | t          j        |t
          j                  }|| _        |d| _	        n,|	t          j        k    r|dz
  |dz
  f| _	        n|dz
  df| _	        || _        |d}|| _        |
| _        | j         | j        z  | _        |	| _        t#                      | _        t'                      | _        t+          | j                  rt-                      st/          d          || _        | j        8t3                      s
J d            | j        j        d         |k    s
J d            d| _        | j        | j        d	k    nd
| _        d S )N)rv   r   rY   r   z<FlashAttention does not support fp8 kv-cache on this device.z,Sinks are only supported in FlashAttention 3zLSinks must have the same number of heads as the number of heads in the layerTrZ   F)r  rP   floatr	  rO   rf   r   r+   r
  r   r   rA   rb   r  r  num_queries_per_kvr=   r   vllm_flash_attn_versionr   batch_invariant_enabledr
   r   NotImplementedErrorr  r   r   supports_quant_query_inputsupports_per_head_quant_scales)r   r  rP   r	  rO   r
  r   rb   r  r=   r  r  s               r5   r   zFlashAttentionImpl.__init__   s    #"5\\
(# <EMJJJL(!"*D-444#1A#5~7I"JD#1A#5q"9D,"O.,H)"&.D4E"E"'='?'?$'>'@'@$ !455 	>U>W>W 	%N   
:!,..  > . :#A&)333% 433
 +/' +7 (A-- 	+++r7   r   querykeyvaluekv_cacher   outputoutput_scaleoutput_block_scalec
                    |
J d            | j         
J d            ||	t          d          ||                    d          S | j        }
|j        }|
t
          j        t
          j        fv r:|                     |d|         |d|         |d|         |d|         ||          S |	                    d          \  }}| j
                            d          rIt                              | j
                  }|                    |          }|                    |          }|j        sv|j        }|j        }|j        }|j        }|j        }|j        }|j        d         dz
  | j        f}|j                            |          }|j                            |          }|j                            |          }| j        dk    rA|                     |d|         |d|         |d|         |||d|         ||||
  
         |S | j        t?          | j                  nd}tA          d*i d	|d|         d
|d|d|d|         d|d|d|d|d| j!        d|j"        d| j#        d|d|d| j$        d|d| j         d|d|d|d|j%        d| j&         |S tO          |d|         |d|         ||fi d|j        d|j        d |j(        d!|j)        d"|j*        d#|j        d| j!        d| j#        d$| j        d%| j$        d|j        d&|j+        d'|j%        d| j         d(|j,        d)|j        d|j        d|j        d|j        d| j&         |S )+a  Forward pass with FlashAttention.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [2, num_blocks, block_size, num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NOTE: FP8 quantization, flash-attn expect the size of
              {q,k,v}_descale to be (num_sequences, num_kv_heads).
              We use torch's .expand() to avoid duplicating values
        NzOutput tensor must be provided.$FlashAttention version not detected.zEfused output quantization is not yet supported for FlashAttentionImplr   rd   rY   )	q_descale	k_descale	v_descaleqkvoutr   r   	seqused_kr   softmax_scaler   r
  r   r   softcapr   
fa_versionr  r   r!  r   s_auxr   r   r   r   r   
max_kv_lenr   r  r   r   r   suffix_scheduler_metadatar;   )-r  r  fill_r=   r   r   rA   r@   _forward_encoder_attentionunbindrb   ro   r&   rh   viewr   r   r   r   r   r   r   r   rO   _q_scaleexpand_k_scale_v_scaler   _forward_with_dcpr   r   r   r	  r   r
  r  r   r  cascade_attentionr   r   r   r   r   )r   r   r  r  r  r  r   r  r  r  r=   r   	key_cachevalue_cacherv   r   r&  r   r   r   r   descale_shaper  r   r!  sliding_window_sizes                             r5   forwardzFlashAttentionImpl.forward_  sg   6 !!#D!!!+772 877 #'9'E%W    <<??"N	 *; 3]5JKKK 22((()&&&'((())))*   "*!3!3	;))%00 	2)EE# E "u--I%**511K( 9	(8L%.I(6L(4L'3K!.!A)/2Q68IJM--m<<I--m<<I--m<<I"Q&&&&,,,-***+,,,----.!''' '     *6 ,--- $
 '   ...//i "k 1 1122	
 ". ". (i ". #'** )// "&!2!2 !4 3 !, !00 (:'9   $;;!" (i#$ (i%& (i'(  -;;)* **+ .  	%%%&$$$%		
 	
 	

 (77	
 (55	
 "/!C!C	
 )77	
 )77	
 %00	
 **	
 **	
  ..	
 !00	
 &11	
  ,==!	
" )77#	
$ 33%	
& '4&M&M'	
( '4&F&F)	
* nn+	
, nn-	
. nn/	
0 **1	
 	
4 r7   r   c           
          | j         t          j        t          j        fv rd S | j        ||d S |                    d          \  }}t          |||||| j        |j        |j	                   d S )Nr   )
r=   r   rA   r@   r  r/  r   rb   r3  r4  )r   r   r  r  r  r   r7  r8  s           r5   do_kv_cache_updatez%FlashAttentionImpl.do_kv_cache_update  s     >m8-:OPPP F -9{}F!)!3!3	; 	 NN		
 		
 		
 		
 		
r7   r7  r8  r  r   r!  c                    | j         
J d            |j        }|j        }|j        }|                                }t                                          |d          }| j        t          | j                  nd }t          di d|d|d|dd d|d	|d
|j
        d|j        d| j        ddd| j        d|d|d| j        ddd|j        d| j         d|d|	d|
\  }}t!          ||                    dd          t                      d          \  }}|                    dd                                          }t          di d|d|d|dd d|d	|d|d|d| j        d|j        d| j        d|d| j        ddd| j         d|d|	d|
\  }}|j        |j        k    sJ |j        |j        k    sJ t)          |||||           d S )Nr  rY   )dimr"  r#  r$  r%  r   r   r&  r   r'  r   Fr
  r   r   r(  return_softmax_lseTr   r)  r  r   r!  r   )
return_lsecu_seqlens_kr;   )r  r   r   r   
contiguousr   
all_gatherr   r   r   r   r   r	  r
  r  r   r   	transposer   r   r   )r   r  r  r  r7  r8  r  r   r  r   r!  r   r   r   query_across_dcpr:  context_attn_outcontext_lsecontext_attn_out_corcontext_lse_corquery_attn_out	query_lses                         r5   r5  z$FlashAttentionImpl._forward_with_dcp0  sd    +772 877 %4$2#/  ""(??55e5CC)-)<)HD$%%%d 	 )? )
 )
 )
)
i)
 k)
 	)

 &)
 &)
 $77)
 '==)
 **)
 5)
 **)
 ,+)
 $)
 (()
  $t)
   -??!)
" 33#)
$  i%)
&  i')
(  i))
%+. 1A!!!Q''OO	1
 1
 1
-o *33Aq99DDFF$: %
 %
 %
e%
c%
 e%
 	%

 &%
 &%
 &%
 &%
 **%
 !''%
 **%
 ,+%
 ((%
  $t%
 33%
   i!%
"  i#%
$  i%%
!	( $)^-AAAAA$	7777 	
 	
 	
 	
 	
r7   c                 d   | j         
J d            | j                            d          rt          d          |j        }|j        }|j        }	|j        }
|j        d         dz
  | j        f}| j        t          | j                  nd}t          di d|d|d	|d
|d|d|d|	d|
d| j        ddd| j        d|d| j        d| j         d|j                            |          d|j                            |          d|j                            |          d| j        rdnd |S )a  Forward pass for encoder attention without KV cache.

        Args:
            query: shape = [num_encoder_tokens, num_heads, head_size]
            key: shape = [num_encoder_tokens, num_kv_heads, head_size]
            value: shape = [num_encoder_tokens, num_kv_heads, head_size]
            output: shape = [num_encoder_tokens, num_heads, head_size]
            attn_metadata: Encoder attention metadata
            layer: The attention layer
        Nr  rd   z3quantization is not supported for encoder attentionr   rY   r"  r#  r$  r%  r   rB  r   r   r'  r   Fr
  r   r(  r)  r  r   r!  r   r;   )r  rb   ro   r  r   r   r   rO   r   r   r   r	  r
  r  r1  r2  r3  r4  r  )r   r  r  r  r  r   r   r   rB  r   r   r9  r:  s                r5   r.  z-FlashAttentionImpl._forward_encoder_attention  s   & +772 877
 ))%00 	%E  
 %4$4$2$2 q!A%
 *.)<)HD$%%%d 	 	 	
 	
 	
e	
c	
 e	
 		

 &	
 &	
 &	
 &	
 **	
 5	
 **	
 ,+	
 ((	
 33	
 n++M:::	
  n++M:::!	
" n++M:::#	
$ !8?qqa%	
 	
* r7   )NNN)r   r   r   r  r   r   r   r?   r   r  r   r   rf   r   r   nnModuler   r;  r=  r5  r.  r;   r7   r5   rE   rE     s/        &*t*** )-#0#837%)=
 =
=
 =
 	=

 =
 5kD(=
 d
=
 =
 =
 !=
 '*Dj=
 |d"=
 
=
 =
 =
 =
N '+,026d dxd |d \	d
 |d ,d .d t#d lT)d "L4/d 
d d d dL)
x)
 \)
 |	)

 ,)
 l)
 
)
 )
 )
 )
h *.)-)-U
 U
|U
 \U
 |	U

 <U
 \U
 U
 .U
 <$&U
 <$&U
 <$&U
 
U
 U
 U
 U
nA|A \A |	A
 A .A xA 
A A A A A Ar7   rE   r   
query_lensnum_query_headsrO   	use_alibiuse_sliding_windowuse_local_attentionnum_smsr   c	                    | dk     rdS |s|s|rdS t          |          }	|	dk     rdS |dk    rdS ||z  }
|
dk    o| o| ot          j        |dk              }|sdS |	}d}d}t          | |          }|t          ||          z  }t          ||          }||z  }|	|z  t          |
|          z  }||z  }t          ||          }||k     S )zDecide whether to use cascade attention.

    This function 1) checks whether cascade attention is supported with the
    given configuration, and 2) heuristically decides whether using cascade
    attention can improve performance.
    rk   Frj   rY   T   )r   npallr   )r   rP  rQ  rO   rR  rS  rT  rU  r   r   r  use_flash_decoding
num_tokensq_tile_sizekv_tile_sizenum_prefix_tilescascade_ctascascade_wavescascade_timeflash_decoding_ctasflash_decoding_times                        r5   r   r     sR   * 3u & *= u :H!||uu
 )L8 	Q 	$""	$M	$ F:?##	   t J KL-|<<"T*k%B%BBLw//M #33L 	<$'9;"G"GG  ++2G<< ---r7   r  r  r7  r8  r   r   r   r   r   r+  r'  r
  r   r  r   r   r)  r   r,  r  r   r!  r*  c           	         |
J d            |dk    s
J d            |j         d         }|j         d         }||z  dk    sJ ||z  }|dk    sJ |j         d         dz
  |j         d         f}t          di d|d	|d
|d|d|d|d|d|
dddt          |          d|d d         d|ddd|d|d||                    |          nd d||                    |          nd d||                    |          nd d|dt	                      rdn|\  }}|j         d         dz
  |j         d         f}t          di d|d	|d
|d|d|d|d|	|z
  d|
dddt          |          d|d d |d f         d|ddd|d|d||                    |          nd d||                    |          nd d||                    |          nd dt	                      rdn|\  }}t          | ||||           d S )Nz)Cascade attention does not support ALiBi.r   z2Cascade attention does not support sliding window.r   rY   r"  r#  r$  r   r&  r   r   r'  r   Fr   r   r(  r@  Tr   r)  r  r   r!  r*  r   r;   )r   r   r   r2  r   r   ) r  r  r7  r8  r   r   r   r   r   r+  r'  r
  r   r  r   r   r   r)  r   r,  r  r   r!  r*  r[  rN   num_common_kv_blocksr9  prefix_output
prefix_lsesuffix_output
suffix_lses                                    r5   r6  r6    s   4 !LX%%%< &%% QJ$Jz)Q....,
:!####)/2Q6	8KLM !7 ! ! !
%!
)! +! *)	!
 !.!  Z! '&! $m! u! (((!  OO!  !  4! 54! :!  6?5J)""=111PT!!" 6?5J)""=111PT#!$ 6?5J)""=111PT%!* e+!, 011E11~-!M:2 #(+a/1DEM !7 ! ! !
%!
)! +! #]	!
 !.! #]!  "333! $m! t! (((!  #7#8#8 899!  !  4! 54! :!  6?5J)""=111PT!!" 6?5J)""=111PT#!$ 6?5J)""=111PT%!& 011E11~'!M:. fmZ
SSSSSr7   )NNNNNN)E__doc__r   dataclassesr   typingr   numpyrX  rf   vllm.attention.layerr   vllm.v1.attention.backendr   r   r   r	   r
   #vllm.v1.attention.backends.fa_utilsr   r   r   vllm.v1.attention.ops.commonr   'vllm.v1.attention.ops.merge_attn_statesr   r   r   r   r   vllm.configr   r   r   vllm.config.cacher   r   r   vllm.loggerr   *vllm.model_executor.layers.batch_invariantr   vllm.platforms.interfacer   vllm.utils.math_utilsr   r   r    r!    vllm.v1.attention.backends.utilsr"   r#   vllm.v1.kv_cache_interfacer$   r   r   r&   r   r   r   r   r   rI   rE   ndarrayr   r   r   r  r6  r;   r7   r5   <module>r~     s   + *  ! ! ! ! ! !            * * * * * *                      
 : 9 9 9 9 9 E E E E E E&&((             Y X X X X X X X X X ( ( ( ( ( ( 9 9 9 9 9 9 # # # # # #      6 5 5 5 5 5 & & & & & &         
        5 4 4 4 4 4	X		| | | | |, | | |~ ! ! ! ! ! ! ! !H	"	"sCx4	 	" 	" 	" 	"p6 p6 p6 p6 p6$<=S$T p6 p6 p6f	k k k k k k k k\K.K.
K. K. 	K.
 K. K. K. K. K. 
K. K. K. K.B 6:59%)%)%)!%1[T [TL[T<[T |[T 	[T
 <[T [T  ,[T L[T L[T [T [T ,%[T #s(O[T [T [T  ![T" #[T$ %[T&  %|d2'[T(  %|d2)[T* |d"+[T, |d"-[T. |d"/[T0 <$1[T2 \3[T [T [T [T [T [Tr7   