
    .`iz             -       F   d Z ddlmZ ddlmZ ddlZddlZddlm	Z	m
Z
mZmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? ddl@mAZAmBZBmCZCmDZDmEZEmFZF ddlGmHZH ddlImJZJ ddlKmLZL ddlMmNZN dZO e)jP                    ZQejR        ZS e!eT          ZUdaVd ZWe.jX        d e-jY        d!e-jY        fd"            ZZd#ej[        d$ej[        d%ej[        d&ej[        d'ej\        d(e]ej[        ej[        f         fd)Z^ G d* d+          Z_ G d, d-e9          Z`e G d. d/                      Zae G d0 d1                      Zbe G d2 d3                      Zce G d4 d5                      Zde G d6 d7                      Ze G d8 d9e<ee                   Zf G d: d;e;          Zg	 	 	 	 	 	 	 	 	 	 	 	 	 dYdAej[        dBej[        dCej[        dDej[        dEehdFehdGehdHehdIeidJehdKejdz  dLeiej\        z  dz  dMeiej\        z  dz  dNeiej\        z  dz  dOeiej\        z  dz  dPejdz  dQejdz  dRejdz  dSekdTehdUekd(df,dVZle.jX        dWe-jY        fdX            ZmdS )Zz Attention layer with FlashInfer.    )	dataclass)ClassVarN)"BatchDecodeWithPagedKVCacheWrapper#BatchPrefillWithPagedKVCacheWrapper$BatchPrefillWithRaggedKVCacheWrapper!MultiLevelCascadeAttentionWrapper)_get_range_buf!trtllm_batch_decode_with_kv_cache)"trtllm_batch_context_with_kv_cache)	FP4Tensor)override)envs)CUDAGraphMode
VllmConfigget_current_vllm_config)
CacheDType)get_dcp_group)init_logger)vllm_is_batch_invariant)QuantKeykFp8StaticTensorSymkNvfp4Dynamiccurrent_platformDeviceCapability)tltriton)can_use_trtllm_attentionuse_trtllm_attention)cdiv)is_pin_memory_available)is_strictly_contiguous)AttentionBackendAttentionCGSupportAttentionImplAttentionMetadataBuilderAttentionTypeCommonAttentionMetadata
MultipleOf)KVCacheLayoutTypeget_dcp_local_seq_lensget_kv_cache_layoutget_per_layer_parametersinfer_global_hyperparameterssplit_decodes_and_prefills)cp_lse_ag_out_rs)merge_attn_states)AttentionSpec)CpuGpuBufferl        c                  t    t           +t          j        t          j        t          j        d          a t           S )Ncudadtypedevice)trtllm_gen_workspace_buffertorchzerosr   %VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZEuint8     y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/flashinfer.py _get_trtllm_gen_workspace_bufferrB   L   s4    "*&+k6ekRX'
 '
 '
# '&r@   K_CACHE_STRIDEKV_CACHE_STRIDEc                 V   t          j        d                              t           j                  }t          j        d                              t           j                  }	t          j        |||z  z   |	z                                 t           j                  }
|
dk    rd S |j        j        }t          j        |          }|
|z  t          j        d|          z   }t          j        | |z             }|                    t           j                  |z  }||z  |	z   dz   |z  t          j        d|          z   }|                    |          }t          j	        ||z   |           t          j        |          }|
|z  |z   t          j        d|          z   }t          j        | |z             }|                    t           j                  |z  }||z  |	z   dz   |z  |z   t          j        d|          z   }|                    |          }t          j	        ||z   |           d S )Nr      )
r   
program_idtoint64loadr8   
element_tyarangefloat32store)kv_cache_ptrblock_tables_prefill_ptrblock_table_stridemock_kv_cache_ptrk_scale_ptrv_scale_ptrrC   rD   	batch_idxmock_block_table_idxorig_page_numdequant_dtypek_scale_valoffsetfp8_valsdequantized_valsmock_cache_offsetv_scale_vals                     rA   "_trtllm_prefill_attn_kvfp8_dequantr_   U   s$    a  ##BH--I=++..rx88G 9/A#AADXX bll  %+6M '+&&K_,ryN/K/KKFw|f,--H{{2:..<&&)==A)A~667 (**=99H!224DEEE '+&&K'.829Q;W;WW  w|f,--H{{2:..<	'	'*>	>	BoU
	
)A~
&
&	' 
 (**=99H!224DEEEEEr@   kv_cacheblock_tables_prefillk_scalev_scalerX   returnc           
         |j         \  }}| j         }|d         dk    sJ |t          j        t          j        fv sJ |d         |d         z  |d         z  }||d         z  }	||z  dz   |d         |d         |d         |d         f}
t          j        |
|| j                  }t          j        d||z  dz   t          j        |j                                      ||          }||f}t          |         | |||||||	           ||fS )NrF            r7   )startendr8   r9   )
shaper;   bfloat16float16emptyr9   rL   int32reshaper_   )r`   ra   rb   rc   rX   
batch_sizenum_of_page_per_tokensk_cache_stridekv_cache_stridenew_smock_kv_cachemock_block_tablegrids                 rA   !trtllm_prefill_attn_kvfp8_dequantrz      s@    )=(B%J%AQ419999U^U];;;;;qTAaD[1Q4'N$qt+O//!3QqT1Q41qtLEK]8?SSSM|..2k#*	  
 gj/00  -.D&t,	 	 	 ***r@   c                       e Zd Z	 ddej        dz  fdZdej        dej        dej        dej        ded	ed
ededededededz  dej        dej        dede	f dZ
dej        j        dej        dej        dej        dej        dej        fdZdS )BatchDCPPrefillWrapperNworkspace_bufferc                     t          |t                                | _        t          |t                                | _        d S N)r   r-   _contextr   _new_tokensselfr}   s     rA   __init__zBatchDCPPrefillWrapper.__init__   sH     <133
 
 @133
 
r@   qo_indptr_cpupaged_kv_indptr_cpupaged_kv_indicespaged_kv_last_page_len_cpu	page_sizenum_qo_headsdcp_world_sizenum_kv_headshead_dimsm_scalewindow_leftlogits_soft_capq_data_typekv_cache_dtypeprefill_fixed_split_sizedisable_split_kvc                     | j                             ||||||z  ||	|d|
||||||           | j                            |||||	|	d|
|||           dS )z1Plan the prefill operation with given parameters.F)causalr   r   r   r   kv_data_typefixed_split_sizer   T)	qo_indptr	kv_indptrr   r   head_dim_qkhead_dim_vor   r   r   r   r   N)r   planr   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s                    rA   r   zBatchDCPPrefillWrapper.plan   s    ( 	&>)#+#'5-! 	 	
 	
 	
$ 	##%%  #+# 	 	
 	
 	
 	
 	
r@   layerprefill_querykv_cache_permutekeyvalueoutc                    t                                          |                                d          }| j                            |||j        |j        d          \  }}	t          ||	t                      dd          \  }
}|                    dd                                          }| j	                            |||d          \  }}|                    dd                                          }t          ||
|||           |S )	NrF   dimT)rb   rc   
return_lseF)r   is_lse_base_on_er   )r   )r   
all_gather
contiguousr   run_k_scale_float_v_scale_floatr1   	transposer   r2   )r   r   r   r   r   r   r   prefill_query_across_dcpoutput_context_tmplse_context_tmpoutput_contextlse_contextoutput_query	lse_querys                 rA   r   zBatchDCPPrefillWrapper.run   s8    $1??#=#=$$&&A $> $
 $
  /3m.?.?$(( /@ /
 /
+O '7OO"'
 '
 '
# "++Aq11<<>>"&"2"6"6	 #7 #
 #
i ''1--88::		
 	
 	
 
r@   r   )__name__
__module____qualname__r;   Tensorr   intfloatr8   boolr   nnModuler   r?   r@   rA   r|   r|      s`        15	
 	
,-	
 	
 	
 	
2
|2
 #\2
  ,	2

 %*L2
 2
 2
 2
 2
 2
 2
 2
 2
 [2
 2
  #&!2
" #2
 2
 2
 2
h+x+ |+  ,	+
 \+ |+ \+ + + + + +r@   r|   c                   R   e Zd ZU dZeed<   ej        ej        gZ	e
eej                          ed<   g dZe
ee                  ed<   edeeez           fd            Zedefd            Zeded	         fd
            Zeded         fd            Ze	 d dedededededeedf         fd            Ze	 d!dedeedf         fd            Zededej        fd            Zedee         fd            Zededefd            Zedefd            Z ede!dz  fd            Z"dS )"FlashInferBackendTaccept_output_buffersupported_dtypes)autorl   fp8fp8_e4m3fp8_e5m2supported_kv_cache_dtypesrd   c                  
    g dS )N)       @   r?   r?   r@   rA    get_supported_kernel_block_sizesz2FlashInferBackend.get_supported_kernel_block_sizes"  s     ||r@   c                      dS )N
FLASHINFERr?   r?   r@   rA   get_namezFlashInferBackend.get_name(  s    |r@   FlashInferImplc                      t           S r   )r   r?   r@   rA   get_impl_clszFlashInferBackend.get_impl_cls,  s    r@   FlashInferMetadataBuilderc                      t           S r   )r   r?   r@   rA   get_builder_clsz!FlashInferBackend.get_builder_cls0  s    ((r@   r   
num_blocks
block_sizer   	head_sizecache_dtype_str.c                     | d|||fS Nrf   r?   )r   r   r   r   r   s        rA   get_kv_cache_shapez$FlashInferBackend.get_kv_cache_shape4  s     Az<CCr@   Finclude_num_layers_dimensionc                     t                      }|dk    r| rdS |dk    rd}n&|dk    r| rdS |dk    rd}nt          d| d          |S )	NNHD)rF   r   rf   rg   rh      )r   rF   rf   rg   rh   HND)rF   rf   rh   r   rg   r   )r   rF   rg   rf   rh   zUnknown cache layout format .)r-   
ValueError)r   cache_layoutstride_orders      rA   get_kv_cache_stride_orderz+FlashInferBackend.get_kv_cache_stride_order>  s     +,,5  %A %%U""*LLU""'C"%%U""*LLKLKKKLLLr@   r   c                 j    | dv rt           j        S | dk    rt           j        S t          d|            )N)r   r   r   zUnrecognized FP8 dtype: )r;   float8_e4m3fnfloat8_e5m2r   )r   s    rA   get_fp8_dtype_for_flashinferz.FlashInferBackend.get_fp8_dtype_for_flashinferS  sB    000&&z))$$HHHIIIr@   c                 
    g dS )N)r         r?   )clss    rA   get_supported_head_sizesz*FlashInferBackend.get_supported_head_sizes\  s     ~~r@   
capabilityc                 R    |t          dd          k    o|t          dd          k    S )N   r      rF   r   )r   r   s     rA   supports_compute_capabilityz-FlashInferBackend.supports_compute_capabilitya  s:    -a333 

FVG
 G
 9
 	
r@   c                 B    ddl m}m}  |            du rdS  |            S )zEFlashInfer supports sinks when TRTLLM attention is available (SM100).r   )force_use_trtllm_attentionsupports_trtllm_attentionF)vllm.utils.flashinferr   r   )r   r   r   s      rA   supports_sinkzFlashInferBackend.supports_sinkg  sV    	
 	
 	
 	
 	
 	
 	
 	
 &%''5005 )(***r@   Nc                 N    ddl m}  |j                    }||j        dk    rdS d S )Nr   r   
   r   )vllm.platformsr   get_device_capabilitymajor)r   r   r   s      rA   get_required_kv_cache_layoutz.FlashInferBackend.get_required_kv_cache_layoutw  sB    333333;%;==
!j&6"&<&<5tr@   )r   F)#r   r   r   r   r   __annotations__r;   rm   rl   r   r   listr8   r   r   staticmethodr   r*   r   strr   typer   r   tupler   r   r   classmethodr   r   r   r   r+   r   r?   r@   rA   r   r     s        !%$%%%5:]EN4ShtEK01SSS= = =xZ(89    d33C.D    \
 c    \ $/0    \ )T"=> ) ) ) \)   &D DDD D 	D
 D 
sCxD D D \D -2 &*	sCx   \( JS JU[ J J J \J c    [ 
5E 
$ 
 
 
 [

 +d + + + [+ ->-E    [  r@   r   c                   $    e Zd ZU dZeez  ed<   dS )	FIPrefillz@Metadata for the native FlashInfer prefill pathway (non-TRTLLM).wrapperN)r   r   r   __doc__r   r|   r  r?   r@   rA   r	  r	    s*         JJ03IIIIIIIr@   r	  c                       e Zd ZU dZeed<   dS )FIDecodez?Metadata for the native FlashInfer decode pathway (non-TRTLLM).r
  N)r   r   r   r  r   r  r?   r@   rA   r  r    s$         II//////r@   r  c                   ~    e Zd ZU dZej        ed<   	 ej        ed<   	 ej        ed<   ej        ed<   eed<   	 eed<   dS )	TRTLLMPrefillz(Metadata for the TRTLLM prefill pathway.block_tablesseq_lenscum_seq_lens_qcum_seq_lens_kv	max_q_lenmax_seq_lenNr   r   r   r  r;   r   r  r   r?   r@   rA   r  r    sz         22,
 l
 L   \!!!NNN 33r@   r  c                   J    e Zd ZU dZej        ed<   	 ej        ed<   	 eed<   dS )TRTLLMDecodez'Metadata for the TRTLLM decode pathway.r  r  r  Nr  r?   r@   rA   r  r    sN         11,
 l
 33r@   r  c                       e Zd ZU eed<   	 ej        ed<   	 ej        ed<   eed<   eed<   eed<   eed<   ee	z  dz  ed	<   	 e
ez  dz  ed
<   	 eed<   	 edz  ed<   dS )FlashInferMetadatanum_actual_tokensslot_mappingr   num_decodesnum_decode_tokensnum_prefillsnum_prefill_tokensNprefilldecodeuse_cascadecascade_wrapper)r   r   r   r   r  r;   r   r8   r	  r  r  r  r   r   r?   r@   rA   r  r    s         B,I&----
 |#d**** 
 7======r@   r  c                       e Zd ZU dZeed<   dedee         de	de
j        f fdZe
j        dd	ee
j        z  d
e
j        defdZeeded          de	dedefd                        Zd Zde
j        fdZdeez  fdZd#dedefdZd Zde j!        de j!        de
j        dedede
j        fdZ"	 d#dede#d ede$fd!Z%defd"Z& xZ'S )$r   rF   reorder_batch_thresholdkv_cache_speclayer_namesvllm_configr9   c                    t                                          ||||           |j        | _        |j        | _        |j        | _        d | _        d | _        d | _        t                      rd| _	        d| _
        d| _        nd| _	        d| _
        d| _        |j        | _        t          | j        j        | j        j                  }|j        j        }||z  }|j        }||j        nd}	| j        j                                        t.          j        k    | _        | j        rDi | _        d|	z   |z  | _        | j        j        $t;          | j        | j        j                  | _        	 t=                      j        | _         t=                      j!        | _"        |j#        j$        | _$        n%# tJ          $ r d| _         d| _"        d| _$        Y nw xY w| j         dk    | _&        | j        '                    | j(        j#                  | _)        | j        j*        | _*        | j        j+        | _,        | j        j        | _-        | j        j.        | _.        | j.        /                    d          r%t`          1                    | j.                  | _2        n-| j        j3        | j        j3        k    sJ | j        j3        | _2        ti          | j)        | j*                  }
|
r|j        j5        s| j2        | _6        n| j        j3        | _6        |
| _7        | 8                    d|
	           d | _9        tu          tw          ||tx                              | _=        | j=        j>        | _>        | j=        j?        | _?        | j=        j@        | _@        | j=        jA        | _A        | jA        r|
st          d
          t          jD         ot                      | _F        | G                    |dz             | _H        t          jJ        | jH        jK        | jF                  | _L        | G                    |          | _M        | G                    |          | _N        | j,        dk    r)t          jP        d          r|j        dk    sJ d            d S d S d S )Ni   i   TFr   rF   r   )supports_spec_as_decodeFlashInfer backend currently does not support attention sinks, please use trtllm on blackwell or flash attention on earlier GPUs.)
pin_memoryr   d   r   zThere is a bug in FlashInfer block_size 16 head size 256 support. Please avoid this combination by passing --block-size 32 or --block-size 64.)Qsuperr   cache_configmodel_configattention_config_workspace_buffer_prefill_wrapper_decode_wrapperr   decode_fixed_split_sizer   r   compilation_configr!   max_model_lenr'  r   scheduler_configmax_num_seqsspeculative_confignum_speculative_tokenscudagraph_modedecode_moder   FULLenable_cuda_graph_decode_wrappers_cudagraph_decode_cudagraph_max_bsmax_cudagraph_capture_sizeminr   
world_sizer   rank_in_groupdcp_rankparallel_configdcp_kv_cache_interleave_sizeAssertionErroruse_dcpget_num_attention_headsr)  r   r   r   r   r   cache_dtype
startswithr   r   r   r8   r   !disable_flashinfer_q_quantizationr   use_trtllm_decode_attention_init_reorder_batch_threshold_cascade_wrapperr/   r.   r   global_hyperparametersr   r   r   	has_sinksNotImplementedErrorr   VLLM_USE_V2_MODEL_RUNNERr"   r.  _make_bufferpaged_kv_indptrr;   
zeros_likecpupaged_kv_indptr_cpu_bufferr   paged_kv_last_page_lenr   is_device_capability_family)r   r'  r(  r)  r9   max_num_pages_per_reqmax_num_reqsmax_num_pagesr<  num_spec_tokenscan_use_trtllm	__class__s              rA   r   z"FlashInferMetadataBuilder.__init__  s    	[&III'4'4 + <!%  	  $"$$ 	*+/D(,0D)$(D!!+-D(,.D)$)D!"-"@ $+T-?-J!
 !
 #3@$'<<(; "- 55 	 #2>>@@MDVV 	 ! 	
  + ./-@L,PD)&AM031+F1 1-
	2"///"<D)OO9DM+H --  	2 	2 	2"#DDM01D---		2
 *Q. -EE,
 
 !.;*4+6,8&&u-- 	;"3"P"P # #D %+t/@/FFFFF"&"4":D 2$2CTEVWW	70R	7  $2D#06D ,:(**1n*UUU $ 'C$[+~NN'
 '
# 3<6B#:J4>> 	. 	%    --K2I2K2K 	  $001ABB*/*: $+
 +
 +
' !% 1 1- @ @&*&7&7&E&E#=C$4$PQT$U$U !+r111> 211   21s   (AF* *GGr8   sizer8   rd   c                4    t          ||| j        | j        ddS )NT)r8   r9   r.  
with_numpy)r4   r9   r.  )r   r8   rf  s      rA   rX  z&FlashInferMetadataBuilder._make_bufferu  s.     ;
 
 
 	
r@   r   c                     t          |j                            |j                  |j                  }|rt
          j        S t
          j        S )N)r   r   )r   r2  rM  rI  r   r%   UNIFORM_BATCHUNIFORM_SINGLE_TOKEN_DECODE)r   r)  r'  has_trtllm_supports       rA   get_cudagraph_supportz/FlashInferMetadataBuilder.get_cudagraph_support  sZ     6$1II+  '3	
 
 
  	B%33%AAr@   c                     | j         Lt          j        }t                      rt          }t          j        |t
          j        | j                  | _         | j         S )Nr7   )	r4  r   r=   r   0FLASHINFER_WORKSPACE_BUFFER_SIZE_BATCH_INVARIANTr;   r<   r>   r9   )r   buffer_sizes     rA   _get_workspace_bufferz/FlashInferMetadataBuilder._get_workspace_buffer  sV    !)DK&(( ON%*[5;t{& & &D" %%r@   r}   c                     || _         d S r   )r4  r   s     rA   set_workspace_bufferz.FlashInferMetadataBuilder.set_workspace_buffer  s    !1r@   c                     | j         b| j        r(t          |                                           | _         n3t	          |                                 t                                | _         | j         J | j         S )N)r}   )r5  rL  r|   rq  r   r-   r   s    rA   _get_prefill_wrapperz.FlashInferMetadataBuilder._get_prefill_wrapper  s      (| (>%)%?%?%A%A) ) )%% )L..002E2G2G) )% $000$$r@   Frq   use_cudagraphc           	      d   |r| j                             |d           }n| j        }||r8| j        j        d |dz            }| j        j        }| j        j        d |         }nd }d }d }t          |                                 t                      ||||d          }|r|| j         |<   n|| _        |S )NrF   T)use_cuda_graphpaged_kv_indptr_bufferpaged_kv_indices_bufferpaged_kv_last_page_len_bufferuse_tensor_cores)
rB  getr6  rY  gpur   r]  r   rq  r-   )r   rq   rw  decode_wrapperrY  r   r]  s          rA   _get_decode_wrapperz-FlashInferMetadataBuilder._get_decode_wrapper  s     	2!<@@TRRNN!1N! ."&"6":;KZ!^;K"L#'#8#< )-)D)H*)U&&"&#' )-&?**,,#%%,'6(8.D "&  N  6>L/
;;'5$r@   c                     | j         4t          d|                                 t                                | _         | j         S r   )rS  r   rq  r-   ru  s    rA   _get_cascade_wrapperz.FlashInferMetadataBuilder._get_cascade_wrapper  sB     ($E4--//1D1F1F% %D! $$r@   num_blocks_npseq_lens_npblock_table_tensornum_reqsr   c                 H   t          j        |t           j        | j        j         d|dz                       | j        j        d|dz            | j        d|dz   <   | j        j        d|dz            }|                    | j        d|dz            d           | j        j         |         }| j        j        d|         }t          |f         |||
                    d          |d           ||z  }	t          j        |	dk    |dk    z  ||	          | j        j         d|<   |S )	a=  
        Compute paged_kv_indptr, paged_kv_indices, paged_kv_last_page_len for FlashInfer
        attention.

        Results are stored in self.paged_kv_indptr,
        self.paged_kv_indices, self.paged_kv_last_page_len buffers.

        Returns paged_kv_indices, a GPU tensor with shape [num_actual_pages].
        rF   )r8   r   NTnon_blockingr   i   )
BLOCK_SIZE)npcumsumro   rY  r[  r\  r  copy_r   _copy_page_indices_kernelstridewherer]  )
r   r  r  r  r  r   rY  num_actual_pagesr   paged_kv_last_page_len_nps
             rA   _compute_flashinfer_kv_metadataz9FlashInferMetadataBuilder._compute_flashinfer_kv_metadata  sh   $ 		($'HqL(89	
 	
 	
 	
 ;?:N:RhlN;
'(Q,7 .2>X\>B+NhlN;$ 	 	
 	
 	

  /28<045F6F5FG!8+.%%a((	
 	
 	
 	
 %0)$;!46H&!+q0@A%5
 5
#&yy1
  r@   common_prefix_lencommon_attn_metadata
fast_buildc                    |j         }|j        }t          || j        d          \  }}}}	| j        }
|j        }|j        }|j        }|j        }|j	        }|dk    }| j        dk    }t          | j        | j        |	|| j        | j        | j        d| j        j
        | j        |          }| j        o
| j        dk    }|dk    s|o|dk    p|}|dk    o|dk    o|}|sX| j        rt'          d          | j        j        st-          d          | j        j        s
J d            | j        j        | _        t5          ||j        | j        ||||	|d d d 	          }| j        p|p| }|r|j                                        nd }||                                nd }|||
dz
  z   |
z  nd }| j        re|J |dk    r:||d          ||         z
  }|dd          |d d
         z
  }||d          |z
  ||d <   t?          || j        | j         | j!                  }|r|J ||
z  dk    sJ ||
z  }||z  }|p| }|r"|J |J | "                    |||||
          }nd }|r3||
z  }tG          j$        d|gtF          j%        d          } tG          j$        d|gtF          j%        d          }!|dd |f         }"tG          j$        |
gtF          j%        d          }#|d d |d f         }||z  }|J | j&        j        d d|z            }$| j'        j        d |         }%| (                                |_)        |j)        *                    | |g|!|$g|"|g|#|%g| j        | j        | j+        | j        d| j,        | j-        | j.        | j        | j/                   |S |dk    rF|}&||&d          ||&         z
  }|j0        d         |dz   k    sJ |r||&d          ||&         z
  }'| j&        j1        |&|dz            }(|dd          |d d
         z
  }te          |3                                4                                          })tk          ||&d          ||&d          |'|(|)|          |_6        n{| 7                                }*| j'        j        |&|         }+|+j0        d         |k    sJ | j&        j        |&|dz            },|,j0        d         |dz   k    sJ | j        rytq          |*tr                    sJ |**                    ||,||+| j        | j        | j        | j        | j+        | j,        | j-        | j.        | j        | j/        | j:        | j;                   n~tq          |*tx                    sJ |**                    ||,||+| j        | j        | j+        | j        d| j,        | j-        | j.        | j        | j/        | j        j        | j:        | j;                   t{          |*          |_6        |dk    r|r;||z  dk    s
J d            t}          |d |         |d |         |          |_?        n|dk    }-| j@        o|-o
|| jA        k    }.|}/| B                    |/|.          }0t          |0| j&        j        d |/dz            || j'        j        d |/         |d |/         | j        | j        z  | j        | j+        | j        d| j,        | j-        | j.        | j        | j/        | j        j        | jD        | j;                   t          |0          |_?        |S )NT)decode_thresholdrequire_uniformr   rF   )
is_prefillforce_use_trtllmrU  has_specr-  zcWindow left is not the same for all layers. One potential fix is to set disable_sliding_window=TruezFlashInfer backend currently only supports models in which all layers share the same values for the following hyperparameters: `window_left`, `logits_soft_cap`, `sm_scale`.)r  r  r   r  r  r  r   r#  r!  r"  r$  r+  r[  r7   )r   r   r   r   r   r   )r  r  r  r  r  r  )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )	r   r   r   r   r   r   o_data_typer   r   )r
  z9TRTLLM decode requires uniform query lengths per request.)r  r  r  NONE)	pos_encoding_moder   r   r   r   r   r  r   r   )Fr  r  r0   r&  r   r  r  r  query_start_locquery_start_loc_cpur    r   r   r   rN  r   r3  rU  rQ  rV  rT  has_same_window_leftsr   has_same_all_paramsr2  r8   r  r  rL  r[  numpyr,   rH  rJ  r  r;   tensorro   rY  r]  r  r$  r   r   r   r   r   r   rk   r  r   maxitemr  r!  rv  
isinstancer|   r   r   r   r	  r  r"  rA  rC  r  fast_plan_decoder7  r  )1r   r  r  r  r  r  r  r  r  r   r   r  r  r  r   r   r#  uses_spec_reorderprefill_use_trtllmdecode_use_trtllmall_uses_trtllmis_only_trtllm_decodeattn_metadataneeds_seq_lens_cpuseq_lens_cpur  r  qo_indptr_prefill_cpuquery_lens_prefill_cpunum_common_kv_blocksneeds_paged_kv_indicesr   shared_qo_indptr_cpushared_kv_page_indptr_cpushared_kv_page_indices_cpushared_kv_last_page_len_cpur   r   prefill_startqo_indptr_prefill_gpupaged_kv_indptr_prefill_gpumax_q_len_prefillprefill_wrapper"paged_kv_last_page_len_prefill_cpupaged_kv_indptr_prefill_cpupure_decoderw  num_input_tokensr  s1                                                    rA   buildzFlashInferMetadataBuilder.build  s	    (00B&$!%!= $   	I\#46H N	*6'01D(8	,@ (!+ 81<1!2Gn&
 
 
 ,I1D1I 	 (1,B0B 
11 1 	 !- 1 !
!O1 1 	  	7~ )'   .D  N  
 .B   B  $06D
 +/-:(#/%1# 
 
 
  "\U[U@U<U3EO )--///4 	 /;.Fl((***D & IM*y88 	 < 	+++a!+,,/-2LL & *!""-0Ecrc0JJ ' !.1GG [\\* 2#1	 L  	2 ,,,$y0A5555#4	#A 11M "-!I4I0I! 	$ ,,,***#CC"     $  )	!#4	#A  $)<%&ek%$ $ $  ).()U) ) )% *<A?T@T?T<T)U&*/,5;u+ + +'
 "4AAA7K7L7L4L!M11M#///"&"6":>Q\>"J)-)D)H()S&,0,E,E,G,GM))..%}5*,?@+-=>,.HI!! , $ 4 ,!0 /     !  !'Mmnn-m0LL " ).q1\A5EEEEE! LK mnn-	-0HH & /3.B.F!HqL0/+
 *!""-0Ecrc0JJ ' %((>(B(B(D(D(I(I(K(K$L$L!(5!3MNN!C%mnn5#8$?/ +) ) )%% #'";";"="=595P5T!(*62 :?BlRRRR.2.B.F!HqL0/+ 38;|a?OOOOO< +%o7MNNNNN#((&;,G)93U"&.%)%6'+':%)%6!%!%$($4(,(<$($4'+':151N)-)>! )    & &';     $((-3(:))#!%$($4(,(<$($4%)%8$($5$;)-)F)-)># )   & )2/(J(J(J% ??  -H(;6!;;;O <;; (4!3L[L!A%l{l3 +( ( ($$ +a/* K#K)T-JJ 
 $5 !%!9!9$m" " !"(,-C/?!/C-CD$/34E5E4EF !2"2!23%(;;%MN&,!] $ 0$($8 $ 0!%!4 $ 1 7%)%A%)%:'   * (0'G'G'G$r@   c                 H    | j         j        | j        j        j        k    rdS dS )NF)r'  r8   r)  r2  )r   argskwargss      rA   use_cascade_attentionz/FlashInferMetadataBuilder.use_cascade_attention\  s*    #t'7'D'JJJ 5 ur@   r   )(r   r   r   r&  r   r  r3   r  r  r   r;   r9   r   ro   SymIntr8   r4   rX  r   r  r  r%   rm  rq  r   rs  r   r|   rv  r   r  r  r  ndarrayr  r)   r  r  r  __classcell__)rd  s   @rA   r   r     so        #$S$$$K$K #YK  	K
 K K K K K K\ ?Dk	
 	
 	
5<'	
05	
		
 	
 	
 	
 B-.BB %B 
	B B B [ XB & & &2U\ 2 2 2 2%	,/E	E% % % %" "c "$ " " " "H% % %4 z4  Z4  "L	4 
 4  4  
4  4  4  4 t !	K KK 6K 	K
 
K K K KZ
        r@   r   c                   n   e Zd ZU dZeed<   dej        ddfdedede	dede
e	         dz  d	edz  d
ede	dz  dededz  dej        dz  ddfdZdefdZdej        fdZ	 	 	 ddej        j        dej        dej        dej        dej        dedej        dz  dej        dz  dej        dz  dej        fdZdS ) r   Tcan_return_lse_for_decodeN	num_headsr   scaler   alibi_slopessliding_windowr   r   	attn_typekv_sharing_target_layer_namesinksrd   c                    || _         || _        t          |          | _        || _        | t          j        |t
          j                  }|| _        |d| _	        n|dz
  df| _	        | j	        | j	        d         nd| _
        || _        || _        |
| _        | j         | j        z  | _        |	t          j        k    rt#          d          d | _        |9|j        d         |k    r!t)          d| d|j        d          d	          || _        t+          ||          | _        t/                      }| j        o|j        j         | _        d | _        d | _        d | _        d S )
Nre  )r+  r+  rF   r   r+  zaEncoder self-attention and encoder/decoder cross-attention are not implemented for FlashInferImplzWSinks must have the same number of heads as the number of heads in the layer. Expected z
, but got r   )r  r   r   r  r   r;   r  rM   r  r  r   r   r   r  num_queries_per_kvr(   DECODERrV  r  rk   r   r   support_trtllm_attnr   r3  rP  supports_quant_query_input
bmm1_scale
bmm2_scale
o_sf_scale)r   r  r   r  r   r  r  r   r   r  r  r  r)  s                rA   r   zFlashInferImpl.__init__i  s    #"5\\
(# <EMJJJL(!"*D#1A#5q"9D&*&9&ED""2 	 -.,H)"&.D4E"E---%!   +/
{1~** )4=) ){1~) ) )  
 DJ#;I|#T#T -//$ S0RR 	' )-(,(,r@   	quant_keyc                 d    | j         o)| j                            d          o|t          t          fv S )Nr   )r  r   rO  r   r   )r   r  s     rA   fused_output_quant_supportedz+FlashInferImpl.fused_output_quant_supported  s;    $ B#..u55B1=AA	
r@   	act_dtypec                     | j         E| j         j        t          j        k    r-| j                             t          j                  | _         d S d S d S r   )r  r8   r;   rM   rH   )r   r  s     rA   process_weights_after_loadingz,FlashInferImpl.process_weights_after_loading  sD    :!dj&6%-&G&Gu}55DJJJ "!&G&Gr@   r   queryr   r   r`   r  outputoutput_scaleoutput_block_scalec
                    |
J d            ||                     d          S |j        |j        k    sJ d|j         d|j                     | j        |j        |j        z  | j        z  | _        | j        |j        | _        t          |j
        t                    }
t          |j        t                    }||	
J d            n|j        t          k    s
J d            |j        dk    s|
r|j        dk    s|s
J d            |j        t          k    r|	
J d	            n4|j        t"          k    r|	
J d
            nt%          d|j                   |j        l|                                                                |_        |j        t          k    r| j        |j        z  | _        n|j        t"          k    r|j        | _        |j        }| j        t2          j        j                            |||dddf         |dddf         |j        | j        |j        |j                    | j        !                    d          r4tD          #                    | j                  }|$                    |          }|d|         }|d|         }|d|         }|}|d|         }|j%        r9|j&        J |'                    |j&        (                    ||                     |S |j)        }|j*        }tD          +                                } |j,        | }| j-        dk    }|dk    r||d         }|j.        d         |k    sJ |
st          |j
        t^                    sJ |j
        j0        }|J |rt          |tb                    sJ |j2        j3        | j4        k    sJ |j2        j5        | j6        pdk    sJ |j2        j7        | j        k    sJ |j2        j8        rJ |j9        j3        | j4        k    sJ |j9        j5        | j6        pdk    sJ |j9        j7        | j        k    sJ |j9        j8        sJ |(                    |||||d         ||d         ||d                    n~t          |tt                    sJ |j3        | j4        k    sJ |j5        | j6        pdk    sJ |j7        | j        k    sJ |j8        sJ |(                    |||j        |j        ||d                    nt          |j
        t                    sJ |;                                <                    |j.                  }t{                      }|j
        j>        }|j
        j?        }t                      dk    sJ t          |          sJ t          |          sJ t          |          sJ t          |          sJ t          |          sJ |j        t"          k    r*| j        J t          ||d         |	||j.                  }n| j        J ||d         }|j        t          k    r@| j        !                    d          r&t          |||j        |j         |j                  \  }}n|}|}t          d+i d|d|d|d|d|d|j
        jE        d|j
        jF        d| j        d| j        d|j        d|j
        jG        d|j
        jH        d| j4        d | jI        d!| j        d"| |dk    r|d|         }|j.        d         |k    sJ |swt          |j        t                    sJ |j        j0        }|J |j3        | j4        k    sJ |j5        | j6        pdk    sJ |j7        | j        k    sJ |rt                      L                    |;                                d#$          }t3          jM        |          }t3          jN        |O                    d          |O                    d          ft2          jP        |jQ        %          }|(                    |||j        |j        ||d&'           t          ||t                      d()          |d|<   n|(                    |||j        |j        |d|                    n~t          |j        t                    sJ |;                                <                    |j.                  }t{                      }|j        j>        } |j        j?        }!t                      dk    sJ t          |          sJ t          |          sJ t          |          sJ t          |           sJ t          |!          sJ |j        t"          k    r*| j        J t          |d|         |	d|j.                  }n| j        J |d|         }||j        z  dk    rd}"n
||j        z  }"t          |||| |!|j        jF        | j        | j        | j4        | jI        | j        ||"*           |S ),aM  Forward pass with FlashInfer.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: KV cache tensor with different possible shapes:
                - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
                - HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NzOutput tensor must be provided.r   zQuery dtype mismatch: expected z, got z@output_block_scale is not supported when fusion has not happenedz2Query must be FP8 when attn+quant fusion happened.zMust use TRT-LLM attnz8output_block_scale should not be provided for fp8 outputz/output_block_scale is required for nvfp4 outputzUnsupported output dtype: rF   r           )r   )rb   rc   r   r   )datar  scale_start_indexoriginal_shaper  r`   r}   r  r  r  
max_kv_lenr  r  rq   r  r  r   r  r  r   r   r7   T)rb   rc   r   lser   F)r   )r  r`   r}   r  r  r  r  r  r   r  r  r   q_len_per_reqr?   )Tfill_r   r8   r  _q_scale_floatr   r  r  r   r  r!  r  r"  r  	FP8_DTYPEr  r  	FP4_DTYPEr   _o_scale_floatr[  r  r  r  r  r;   ops_C_cache_opsreshape_and_cache_flashr  r   _k_scale_v_scalerO  r   r   viewr#  r$  r  r   r  r   r   permuter   rk   r	  r
  r|   r   _window_leftr   _logits_soft_capr   	_sm_scale_causalr   r   r   rp   rB   r  r  r-   r#   r   rz   r   r  r  r  r  r  r  r   r   
empty_likern   rf  rM   r9   r1   r
   )#r   r   r  r   r   r`   r  r  r  r  r  r  r  torch_dtypeoutput_paddedr  r   r   r   rL  r   r  r}   ra   seq_lens_prefillr   rw   rx   decode_queryr  
output_tmpr  block_tables_decodeseq_lens_decoder  s#                                      rA   forwardzFlashInferImpl.forward  s   2 !!#D!!! <<??" (EK777!m.G ! !;! ! 877
 ?"#2U5IIDJVDO?"#2DO'(=}MM&}';\JJ %--R .--- !,	999D :99 ".!337I3)Q..2C..& /. |y(()11N 2111 **)55E 6555 !!Lfl!L!LMMM
 #+'3'7'7'9'9'>'>'@'@$<9,,&*o8L&LDOO\Y..&+&:DO *;,4 I"::AA*#	 	 	 "--e44 6/LL'  $==55 ((()$$$%((()***+$ 	 0<<<LL6::5(KKLLLM *;*=(BBDD+8+\:%) !!!"3"4"45M &q)-?????% o!-"7CCCCC"/"7"?&222 '%o7MNNNNN*3@DDTTTTT*3D,3    +3=KKKK.7????*6CtGWWWWW*6G,3    +6@DJNNNN*6>>>>#''%(-..//001"#4#5#56 (     &')L     +74;KKKKK*;,3    +4
BBBB*2222#''%( % 4 % 4"#4#5#56 (     "-"7GGGGG
 !. 8 8 : : B B=CV W W#C#E#E '4'<'I$#0#8#A  +,,5555-m<<<<<-.>?????-.>?????-.BCCCCC-.>?????<9,,?666##$5$6$670*;'4':	  CC  ?222 !2!3!34C "-::+66u== ; 7X(,%17 73M#3#3 %5M';$2   '-*] &6%5 "2!1	
 .- ,3==  -4@@  $  $  -99 $1#8#G#G %2$9$I$I !% 0 0 **  $  ! & q   !3"3!34L%a(,=====$ ]!-"6AAAAA!.!5!=%111%2d6FFFFF%64;O;VSVWWWW%/4:====  #0??#=#=$//11r $> $ $L "'!1,!?!?J+%**1--|/@/@/C/CD#m+2  C
 #&&$( % 4 % 4&#' '    2B"%).	2 2 2F---.. #&&$( % 4 % 4"#5$5#56 '     "-"6EEEEE  ,6688@@ASTT#C#E#E &3&:&G#"/"6"? +,,5555-l;;;;;-.>?????-.>?????-.ABBBBB-o>>>>><9,,?666##$6%6$670*+'3'9	  CC  ?222 !3"3!34C$}'@@AEE %&MM$59R$RM1&-%5!4, - 4 @## $ 0*#"/    r@   )NNN)r   r   r   r  r   r  r(   r  r   r   r  r  r;   r   r   r   r  r8   r  r   r   r  r  r?   r@   rA   r   r   f  s        &*t*** )-#0#837%)<- <-<- <- 	<-
 <- 5kD(<- d
<- <- <- !<- '*Dj<- |d"<- 
<- <- <- <-|
h 
 
 
 
6u{ 6 6 6 6 '+,026f fxf |f \	f
 |f ,f *f t#f lT)f "L4/f 
f f f f f fr@   r   r  r+  rm   TF
indptr_cpuindiceslast_page_len_cpur  r   r   r   r   r  r   r   r   r   r  	data_typer   
rope_scale
rope_thetar  r   r   c                    | j         rt          | dd          r3|                     ||||||||	|
|||||||||dd||           d| _        dS | j         s
J d            t	          |          }|d}|	||}||}n|d}||}t          |t                    rt          t          |          n|}t          |t                    rt          t          |          n|}|| j        k    r(t          d
                    || j                            t	          |          t	          | j                  k    rt          d	          | j                            |d
           | j                            |d
           t          |dz   d          }	 | j        | j        | j        ||||||||| j         ||d|
g}| j        dk    r?|                    |           |                    |           |                    d            | j        j        | | _        n%# t.          $ r}t1          d|           |d}~ww xY w|	| _        |
| _        || _        || _        || _        || _        dS )ag  
    A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for
    cudagraph capture/replay, while the no cudagraph version turns back
    to the original plan.
    using original plan after passing host-side buffers:
    - only host-to-device copy of indptr and last_page_len buffers
    Modifications for cudagraph:
    - only host-to-device copy of indptr and last_page_len buffers.
    - avoid device-to-device copy of indices buffer.

    Part of the code get inspiration from the original plan from FlashInfer repo
    and the implementation of fast_decode_plan for FlashInfer in SGlang repo.
    vllm_first_callTNFzShould be cudagraph only herer  rm   zThe batch size should be fixed in cudagraph mode, the runtime batch size {} mismatches the batch size set during initialization {}zHThe size of indices should be less than or equal to the allocated bufferr  rF   r[  fa2r   zError in tensor core plan: )is_cuda_graph_enabledgetattrr   r  lenr  r  r;   _fixed_batch_sizer   format_paged_kv_indices_buf_paged_kv_indptr_bufr  _paged_kv_last_page_len_bufr	   _float_workspace_buffer_int_workspace_buffer _pin_memory_int_workspace_buffer_backendappend_cached_module
_plan_info	ExceptionRuntimeError_pos_encoding_moder  r  r  _rope_scale_rope_theta)r   r	  r
  r  r  r   r   r   r   r  r   r   r   r   r  r  r   r  r  r  r   r   rq   qo_indptr_hostr  es                             rA   r  r    s   P % 7H$)O)O 		-	
 	
 	
0  %%FF'FFF%&''J #K$L		"'1+s'C'CT{###  )3<(E(EW|$$$<  T+++  &z43I J J
 
 	

 7||c$45555V
 
 	

 	##JT#BBB$**+<4*PPP#JNE::NE (&1&!
$ =E!!KK()))KK()))KKNNN2$-2
  E E E<<<==1DE 0D#D+DDN!D!Ds   BH 
H5H00H5r  c                    t          j        d          }|||z  z   }t          j        ||z             }t          j        ||z   dz             }||z
  }	t          j        d|          }
t          j        d|	|          D ]L}t          j        ||z   |
z   ||
z   |	k               }t          j        | |z   |z   |
z   |||
z   |	k                Md S )Nr   rF   )mask)r   rG   rJ   rL   rangerN   )page_indicesblock_tablerQ   cu_num_blocksr  req_idxrow_ptr	start_idxend_idxr   rZ   i	block_idss                rA   r  r    s     mAGG&888G/00Igmg-122G9$JYq*%%FXaZ00 
 
GGaK&0q6zJ7NOOO	
9$q(61Vj(	
 	
 	
 	
 	

 
r@   )r  r+  Nrm   NNNNNNTr+  F)nr  dataclassesr   typingr   r  r  r;   
flashinferr   r   r   r   flashinfer.decoder	   r
   flashinfer.prefillr   flashinfer.utilsr   typing_extensionsr   vllmr   vllm.configr   r   r   vllm.config.cacher   vllm.distributed.parallel_stater   vllm.loggerr   *vllm.model_executor.layers.batch_invariantr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   r   r   r   r   vllm.platforms.interfacer   vllm.triton_utilsr   r   r   r   r    vllm.utils.math_utilsr!   vllm.utils.platform_utilsr"   vllm.utils.torch_utilsr#   vllm.v1.attention.backendr$   r%   r&   r'   r(   r)   r*    vllm.v1.attention.backends.utilsr+   r,   r-   r.   r/   r0   vllm.v1.attention.ops.commonr1   'vllm.v1.attention.ops.merge_attn_statesr2   vllm.v1.kv_cache_interfacer3   vllm.v1.utilsr4   ro  	fp8_dtyper  r>   r  r   loggerr:   rB   jit	constexprr_   r   r8   r  rz   r|   r   r	  r  r  r  r  r   r   r   r  r   r   r  r  r?   r@   rA   <module>rQ     s   ' & ! ! ! ! ! !                       P O O O O O O O A A A A A A & & & & & & & & & & & &       J J J J J J J J J J ( ( ( ( ( ( 9 9 9 9 9 9 # # # # # #              
 , + + + + + 5 5 5 5 5 5 ( ( ( ( ( ( ( (        ' & & & & & = = = = = = 9 9 9 9 9 9                                 : 9 9 9 9 9 E E E E E E 4 4 4 4 4 4 & & & & & &3E 0&&((	K		X		" ' ' ' +F L+F \+F +F +F +F\"+l"+,"+ \"+ \	"+
 ;"+ 5<%&"+ "+ "+ "+Jk k k k k k k k\g g g g g( g g gT J J J J J J J J 0 0 0 0 0 0 0 0 4 4 4 4 4 4 4 46 4 4 4 4 4 4 4 4& "> "> "> "> "> "> "> ">J~	 ~	 ~	 ~	 ~	 89K L ~	 ~	 ~	Bs s s s s] s s s@ $$(,5-1,0*.!##"-Q" Q"Q" \Q" |	Q"
 ,Q" Q" Q" Q" Q" Q" Q" T\Q" u{"T)Q" #d*Q" u{"T)Q"  U[ 4'!Q"" dl#Q"$ %Q"& 'Q"( )Q"* +Q", -Q". 
/Q" Q" Q" Q"h 

 
 
 
 
 
 
r@   