
    .`i                        d dl mZ d dlmZmZmZ d dlZd dlZd dl	m
Z d dlmZmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZmZm Z m!Z!m"Z"m#Z#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2 erd dl3m4Z4  ee5          Z6dZ7	  G d de          Z8e G d de!                      Z9ej:        dej;        dej;        dej;        dej;        fd            Z<	 	 	 	 	 	 d.d ej=        d!ej=        d"ej=        de>d#e>de>d$e?d%ej=        dz  d&ej=        dz  fd'Z@d(e>fd)ZA G d* d+e"e9                   ZB G d, d-ee9                   ZCdS )/    )	dataclass)TYPE_CHECKINGClassVarOptionalN)_custom_ops)
VllmConfigget_current_vllm_config)
CacheDType)init_logger)MLACommonBaseImplget_mla_dims)current_platform)DeviceCapability)tltriton)AttentionBackendAttentionCGSupportAttentionLayerAttentionMetadataAttentionMetadataBuilderCommonAttentionMetadata
MultipleOf)#reshape_attn_output_for_spec_decodereshape_query_for_spec_decodesplit_decodes_and_prefillssplit_prefill_chunks)FlashMLASchedMetaflash_mla_sparse_fwdflash_mla_with_kvcacheget_mla_metadata)AttentionSpec)current_workspace_manager)Indexer    c                      e Zd ZU dZeed<   ej        gZe	e
ej                          ed<   g dZe	e
e                  ed<   ede
eez           fd            Zedefd            Zeded	         fd
            Zeded         fd            Zede
e         fd            Zedefd            Zedefd            Zededefd            Ze	 ddedededededeedf         fd            ZdS )FlashMLASparseBackendTaccept_output_buffersupported_dtypes)autobfloat16
fp8_ds_mlasupported_kv_cache_dtypesreturnc                      dgS )N@    r0       /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/mla/flashmla_sparse.py get_supported_kernel_block_sizesz6FlashMLASparseBackend.get_supported_kernel_block_sizesU   s	    tr1   c                      dS )NFLASHMLA_SPARSEr0   r0   r1   r2   get_namezFlashMLASparseBackend.get_nameY   s      r1   FlashMLASparseMetadataBuilderc                      t           S N)r7   r0   r1   r2   get_builder_clsz%FlashMLASparseBackend.get_builder_cls]   s    ,,r1   FlashMLASparseImplc                      t           S r9   )r;   r0   r1   r2   get_impl_clsz"FlashMLASparseBackend.get_impl_clsa   s    !!r1   c                     dgS )Ni@  r0   clss    r2   get_supported_head_sizesz.FlashMLASparseBackend.get_supported_head_sizese   s	    ur1   c                     dS NTr0   r?   s    r2   is_mlazFlashMLASparseBackend.is_mlai       tr1   c                     dS rC   r0   r?   s    r2   	is_sparsezFlashMLASparseBackend.is_sparsem   rE   r1   
capabilityc                     |j         dv S )N)	   
   )major)r@   rH   s     r2   supports_compute_capabilityz1FlashMLASparseBackend.supports_compute_capabilityq   s    7**r1   r)   
num_blocks
block_sizenum_kv_heads	head_sizecache_dtype_str.c                 "    |dk    r| |dfS | ||fS )Nr+   i  r0   )rN   rO   rP   rQ   rR   s        r2   get_kv_cache_shapez(FlashMLASparseBackend.get_kv_cache_shapeu   s+     l** 
C00
I66r1   N)r)   )__name__
__module____qualname__r'   bool__annotations__torchr*   r(   r   listdtyper,   r
   staticmethodintr   r3   strr6   typer:   r=   classmethodrA   rD   rG   r   rM   tuplerT   r0   r1   r2   r&   r&   L   s:        !%$%%%5:^4DhtEK01DDD= = =xZ(89    d33C.D    \ !c ! ! ! \! -T"AB - - - \- "$34 " " " \" c    [ t    [ $    [ +5E +$ + + + [+   &7 777 7 	7
 7 
sCx7 7 7 \7 7 7r1   r&   c                       e Zd ZU eed<   eed<   eed<   eed<   ej        ed<   ej        ed<   ej        ed<   ej        ed<   d	Zeed
<   dZeed<   e	 G d d                      Z
e	 G d d                      ZdZee
z  dz  ed<   dZeed<   dS )FlashMLASparseMetadatanum_reqsmax_query_lenmax_seq_lennum_actual_tokensquery_start_locslot_mappingblock_tablereq_id_per_tokenr/   rO      topk_tokensc                   B    e Zd ZU eed<   ej        ed<   ej        ed<   dS )(FlashMLASparseMetadata.FP8KernelMetadatascheduler_metadatadummy_block_table
cache_lensN)rU   rV   rW   r   rY   rZ   Tensorr0   r1   r2   FP8KernelMetadatarp      s:         ---- <'''L     r1   ru   c                       e Zd ZU e G d d                      Ze G d d                      ZdZeed<   dZ	eed<   dZ
eed<   dZeed	<   d
Zed
z  ed<   d
Zed
z  ed<   d
S )/FlashMLASparseMetadata.FP8SeparatePrefillDecodec                   $    e Zd ZU ded<   eed<   dS )6FlashMLASparseMetadata.FP8SeparatePrefillDecode.Decoderp   kernel_metadatadecode_query_lenN)rU   rV   rW   rY   r^   r0   r1   r2   Decodery      s*         GGGG!!!!!!r1   r|   c                       e Zd ZU ej        ed<   ej        ed<   ej        ed<   e G d d                      Zee         ed<   dS )7FlashMLASparseMetadata.FP8SeparatePrefillDecode.Prefillseq_lensrequest_idsworkspace_startsc                   n    e Zd ZU dZej        ed<   eed<   ej        ed<   eed<   ej        ed<   eed<   dS )	=FlashMLASparseMetadata.FP8SeparatePrefillDecode.Prefill.ChunkzMetadata for a chunk of prefill requests.

                Prefill requests may be chunked to fit within the fixed workspace size.
                r   tokens_slicerk   req_start_idxr   chunk_tot_seqlenN)	rU   rV   rW   __doc__rZ   rt   rY   slicer^   r0   r1   r2   Chunkr      sl          
  ,&&&####"\)))"""""',..."%%%%%%r1   r   chunksN)	rU   rV   rW   rZ   rt   rY   r   r   r[   r0   r1   r2   Prefillr~      s          l"""
 %%% $l***& & & & & & & Y& Kr1   r   r   num_prefillsnum_decodesnum_prefill_tokensnum_decode_tokensNdecodeprefill)rU   rV   rW   r   r|   r   r   r^   rY   r   r   r   r   r   r0   r1   r2   FP8SeparatePrefillDecoderw      s         		" 	" 	" 	" 	" 	" 	" 
	" 
	  	  	  	  	  	  	  
	 @ cS"#C###!"3""" $$$$"&4&&&&&r1   r   Nfp8_extra_metadataFfp8_use_mixed_batch)rU   rV   rW   r^   rY   rZ   rt   rO   rn   r   ru   r   r   r   rX   r0   r1   r2   rd   rd      s0        MMM\!!!,l"""JK! ! ! ! ! ! ! Y!
 -' -' -' -' -' -' -' Y-'^ OS03DDtKRRR %%%%%%r1   rd   max_num_blocks_per_req
BLOCK_SIZEBLOCK_NHAS_PREFILLc                    t          j        d          }t          j        d          }||z  t          j        d|          z   }t          j        | |z             }|||z  z   ||z  z   }t          j        |          }|dk     }d}|	rt          j        ||z             }|dk    }||z  }||z  }||k     |dk    z  }|||
z  z   ||z  z   }|| z  }t          j        ||| z  d          }||z  |z   }|	r5t          j        ||z   |d          }||z   } t          j        || |          }t          j        |d|          }|||z  z   ||z  z   }!t          j        |!|           d S )Nr      F)maskother)r   
program_idarangeloadwherestore)"
req_id_ptrblock_table_ptrtoken_indices_ptrout_ptrprefill_request_id_ptrworkspace_starts_ptrr   r   r   r   
bt_stride0
bt_stride1
ti_stride0
ti_stride1out_stride0out_stride1token_idtile_id	indice_idreqti_ptrtokis_invalid_tok
is_prefillprefill_req_idblock_idinblock_offvalid_blockbt_ptrbaseout_valworkspace_startprefill_out
out_ptr_ijs"                                     r2   )_convert_req_index_to_global_index_kernelr      s   . }QHmAG '!BIa$9$99I '*x'
(
(C J!66Z9OOF
'&//C 1WNJ )!7(!BCC#q(
j H
"K 44QGKsZ//(Z2GGF{l"N76zk 9CCCDZ+-G  =' >1
!
 
 
 &+(:{G<<h~r733G 8k11I4KKJHZ!!!!!r1   r/   rm      Freq_idrk   token_indicesNUM_TOPK_TOKENSHAS_PREFILL_WORKSPACEprefill_workspace_request_idsprefill_workspace_startsc	                 ~   | j         t          j        k    sJ |j         t          j        k    sJ |j         t          j        k    sJ |j        d         |k    sJ ||z  dk    sJ d| d| d            |r6|J |J |j         t          j        k    sJ |j         t          j        k    sJ | j        d         }	|j        d         }
||z  }|                                 }|                                }|                                }t          j        |          }|                                \  }}|                                \  }}|                                \  }}|r4|J |J |                                sJ |                                sJ |	|f}t          |         |||||||
|||||||||           |S )a:  
    out[token_id, indice_id] =
        block_table[req_id[token_id],
            token_indices[token_id, indice_id] // BLOCK_SIZE] * BLOCK_SIZE
        + token_indices[token_id, indice_id] % BLOCK_SIZE

    Only when token_indices[token_id, indice_id] == -1 do we output -1.
    For safety, we also output -1 if the derived block_id would be
        out-of-bounds.

    When HAS_PREFILL_WORKSPACE is True, prefill tokens are mapped to workspace offsets
    instead of global cache slots. prefill_workspace_request_ids and
    prefill_workspace_starts must be provided.

    prefill_workspace_request_ids: int32 [num_tokens], -1 for decode else
        prefill request index (maps to prefill_workspace_starts)
    prefill_workspace_starts: int32 [num_prefills], 0-indexed workspace
        starts for each prefill request
    r   r   zNUM_TOPK_TOKENS (z ) must be divisible by BLOCK_N ())	r\   rZ   int32shape
contiguous
empty_likestrideis_contiguousr   )r   rk   r   r   r   r   r   r   r   
num_tokensr   tiles_per_rowreq_id_cblock_table_ctoken_indices_coutr   r   r   r   r   r   grids                          r2   (triton_convert_req_index_to_global_indexr     s4   < <5;&&&&++++%+----q!_4444W$)))WOWWWWWW *))  =,888'333,2ekAAAA'-<<<<aJ(.q1#w.M   ""H**,,M#..00O

?
+
+C +1133J
,3355J
"zz||K  8,888'333,::<<<<<'5577777 &D-d3% %  ( Jr1   max_model_lenc                     | dz  S )N   r0   )r   s    r2   get_prefill_workspace_sizer   n  s     1r1   c            
           e Zd ZU ej        Zee         ed<   dede	e
         dedej        ddf
dZd	edd
fdZd	eddfdZ	 dded	ededefdZdS )r7   _cudagraph_supportkv_cache_speclayer_namesvllm_configdevicer-   Nc                    || _         || _        |j        }|| _        |j        | _        |j        }|| _        |                     dd           t          j	        
                    |          }|j        }| j                            |          | _        t          | j                  | _        t                               | j                  | _        |j        j        j        | _        |j        dk    | _        |j        j        }	t          j        |	f| j        |t          j                  | _        t          j        |	f| j        j        |t          j                  | _        t          j        |	dft          j        | j                  | _         | j        }
tC          j"        d          r|}n|tG          d|
dz            z  }t          j        |d	ft          j        |          | _$        t          j        |	dz   ft          j        |          | _%        t          j        |j        j&        ft          j        |          | _'        d S )
Nr   T)supports_spec_as_decoder+   )r   r\   r\   r   d   r/      )(r   r   cache_configr   model_configparallel_configr   _init_reorder_batch_thresholdrZ   cudaget_device_propertiesmulti_processor_countget_num_attention_heads	num_headsr   mla_dimsr;    _compute_fp8_decode_padded_headsfp8_decode_padded_heads	hf_config
index_topkrn   cache_dtypeuse_fp8_kv_cachescheduler_configmax_num_seqsfullr   topk_tokens_tensorr   max_model_len_tensoremptyrr   r   is_device_capability_familymaxtile_scheduler_metadata_buffernum_splits_buffermax_num_batched_tokensreq_id_per_token_buffer)selfr   r   r   r   r   r   propssm_countr   h_qmax_num_sm_partss               r2   __init__z&FlashMLASparseMetadataBuilder.__init__{  s?    '&"/*'4%5 	**1d*KKK
0088.*BB?SS$T%677 ??OO 	$ '3=H , 8L H"3@"'*OT-fEK#
 #
 #
 %*JO++	%
 %
 %
! "'1U["
 "
 "
 *7<< 	='  (3q#)+<+<<.3k q!+/
 /
 /
+ "'A+"
 "
 "

 (-{)@B+(
 (
 (
$$$r1   common_attn_metadatarp   c                     |j         }| j        }t          | j        dd         ||z  | j        |dd          \  }}t
                              || j        dd         | j        dd                   }|S )zBuild FP8 metadata treating all tokens as one mixed batch.

        This matches main branch's approach and avoids the BF16 prefill kernel
        which has head padding overhead when num_heads is small (high TP case).
        Nr   Tcache_seqlensnum_q_tokens_per_head_ktopknum_heads_qnum_heads_kis_fp8_kvcache)rq   rs   rr   )	rh   r   r    r   rn   rd   ru   r   rr   )r  r
  r   padded_headsrq   _fp8_metadatas          r2   _build_fp8_mixed_decode_prefillz=FlashMLASparseMetadataBuilder._build_fp8_mixed_decode_prefill  s     *;
 3 !11"1"5$.$=!$!
 !
 !
A .??10!4"4RaR8 @ 
 
 r1   rw   c                    |j         }t          || j        pdd          \  }}}}t          j        } |||||          }d }	d }
d }d }|dk    rY|j                                        }|j        }|j        }||d          }||d          }	t          j	        |fdt          j
        | j                  }
t          |          D ]!}||z   }||         }||dz            }||
||<   "t          j        |t          j
        d          }t          j        |d d         d	          |dd <   t          j        |t          j
        | j                  }t!          | j        j        j                  }t)          ||          }g }|D ]\  }}||                                         }|||xx         |z  cc<   |	||         }|||                                         }|||z                                            }|||z                                            }t/          ||          }|||         } |j        ||z   ||z            }!|                    |j                            |||!|| |
                     |                    |d           |                    |	|
||          |_        |dk    r|j        }|d         |d         z
                                  }"| j        }#t?          | j         d |         |"|#z  | j!        |#dd          \  }$}%t          "                    |$| j#        d |         | j$        d |                   }&|%                    |&|"          |_&        |S )Nr   T)decode_thresholdrequire_uniform)r   r   r   r   r   r   r   )r\   
pin_memorydim)r   r   rk   r   r   r   non_blocking)r   r   r   r   r  )rq   rr   rs   )rz   r{   )'rh   r   reorder_batch_thresholdrd   r   r   cpuquery_start_loc_cpurZ   r   r   r   rangezeroscumsumr   r   r   r   r   r   itemsumr   block_table_tensorappendr   r   copy_r   r   r    r   rn   ru   rr   r   r|   r   )'r  r
  r   r   r   r   r   FP8Metar  prefill_seq_lensprefill_request_idr   prefill_chunksseq_lens_cpur   r!  prefill_seq_lens_cpureq_idxglobal_req_idxreq_query_startreq_query_endprefill_workspace_starts_cpumax_prefill_buffer_sizechunk_boundschunk_start	chunk_endoffsetchunk_seq_lensr   token_start	token_endr   chunk_workspace_startschunk_block_tabler{   r  rq   r  kernel_metas'                                          r2   "_build_fp8_separate_prefill_decodez@FlashMLASparseMetadataBuilder._build_fp8_separate_prefill_decode  ss    *;
 '$!%!=!B $   	Kl$57I )Aw#%/1	
 
 
  !#'  !/8<<>>L+4H"6"J#/#= '5
 "'rT[" " " !.. L L!,w!6"5n"E 3NQ4F GDK"?=#@AA ,1;EKD, , ,( 05|$SbS)q0 0 0(,
 (-{EK( ( ($
 'A -;' '# 0$&= L  N*6  &Y 6kBGGII,[-BCCCvMCCC!1+i2G!H#7I8M#N#R#R#T#T 1+2KLQQSS/i0GHMMOO	$[)<< *B+iBW)X&$8$K+-i0GG%! %%O))!/%1$5&1)?)9 *  	 	 	 	 %**,4 +    $+??).!9%	 $3 $ $L  ??"6"J 3A 69LQ9O OUUWW  7L$4"5l{lC(8<(G%(#% % %! 1BB#5"&"8+"F4\k\B C  K
 #*.. +!1 #1 # #L
 r1   Fcommon_prefix_len
fast_buildc                     |}|j         }t          j        |j        t          j                  }t          j        |          }t          j        t          j        |j        d         t          j                  |          }| j	        
                    d           | j	        d |j        d                                      t          j        |          d           | j	        d |         }d }	| j        t          k     }
| j        r-|
r|                     |          }	n|                     |          }	t'          |j        |j        |j        |j         |j        |j        |j        || j        j        | j        |	|
          }|S )N)r\   r   Tr  )re   rf   rg   rh   ri   rj   rk   rl   rO   rn   r   r   )rh   npasarrayr!  r   diffrepeatr   r   r  fill_r)  rZ   
from_numpyr   MIN_HEADS_FOR_BF16_PREFILLr   r  r@  rd   re   rf   rg   ri   rj   r'  r   rO   rn   )r  rA  r
  rB  cmr   startsseg_lengthsrl   r   r   metadatas               r2   buildz#FlashMLASparseMetadataBuilder.build  s    ")
B2"(CCCgfoo9Ik'*"(;;;[
 
 	$**1---$%@'7'=a'@%@AGG-..T 	H 	
 	
 	
  7D 	 	
 #n/II  	Q" Q%)%I%I"%M%M""%)%L%LR%P%P")[* 2.--)4(1 3
 
 
 r1   )F)rU   rV   rW   r   UNIFORM_BATCHr   r   rY   r!   r[   r_   r   rZ   r   r	  r   r  r@  r^   rX   rd   rO  r0   r1   r2   r7   r7   x  s        7I7W!34WWWO
$O
 #YO
  	O

 O
 
O
 O
 O
 O
b5 
4   @Q5Q 
;Q Q Q Qn !	/ // 6/ 	/
 
 / / / / / /r1   r7   c                       e Zd Zededefd            Z	 	 d$dededededee         dz  d	edz  d
ededz  dededz  de	j
        dz  ded         ddf fdZde	j
        de	j
        de	j
        dede	j
        f
dZde	j
        de	j
        de	j
        dede	j
        f
dZde	j
        de	j
        de	j
        dede	j
        f
dZde	j
        de	j
        de	j
        dej        dee	j
        e	j
        f         f
dZde	j
        de	j
        de	j
        de	j
        fdZ	 	 	 d%dede	j
        de	j
        de	j
        de	j
        dedz  d e	j
        dz  d!e	j
        dz  d"e	j
        dz  de	j
        fd#Z xZS )&r;   r   r-   c                     | dk    rdndS )Nr/   r   r0   )r   s    r2   r   z3FlashMLASparseImpl._compute_fp8_decode_padded_heads  s     "__rr#-r1   NrQ   scalerP   alibi_slopessliding_windowkv_cache_dtypelogits_soft_cap	attn_typekv_sharing_target_layer_nametopk_indice_bufferindexerr#   c                     t                      j        |||||||||	|
f
i | || _        |J |j        | _        t	          j        d          rdnd| _        |                     |          | _        |dk    rvt                      }||j
        J t          |j
        j                  }||f| _        t                                          | j        t           j        f          \  | _        d S d S )Nr   r   r/   r+   )superr	  softmax_scaletopk_indices_bufferr   r   prefill_paddingr   r   r	   r   r   r   prefill_workspace_shaper"   get_simultaneousrZ   r*   prefill_bf16_workspace)r  r   rQ   rS  rP   rT  rU  rV  rW  rX  rY  rZ  r[  mla_argsr   prefill_workspace_size	__class__s                   r2   r	  zFlashMLASparseImpl.__init__  s8   " 	(	
 	
 	
 	
 	
 #"""8?8S  $?DDLCC" 	 (,'L'LY'W'W$\))133K*{/G/S/SS%?(6& &" -CI+ND()++<<15>B  +T((( *)r1   qkv_c_and_k_pe_cachetopk_indicesattn_metadatac                     t          |j        |j        ||j        |j        d                   }|                     |||          S )Nr   r   r   )r   rl   rk   rO   r   _bf16_flash_mla_kernel)r  rg  rh  ri  rj  s        r2   _forward_bf16_kvz#FlashMLASparseImpl._forward_bf16_kv  sT     @*%$/(.q1
 
 
 **1.A<PPPr1   c                     |j         t          t          j                  sJ j        d }d }d}j        j        j        }j        j        }d}t          |j	        |j
        ||j        |j        d         |||          }|j         t          t          j                  sJ dt          j        dt          j        dt          j        f fd}j        }	j        }
|	d	k    r|
d	k    rj        J  |||          }n|                    |j         j         j        f|j        |j        
          }|	d	k    r! ||d |	         |d |	                   |d |	<   j        J j        j        D ]} j        d |j                 }t7          j        ||j
        |j        |j        t=          |j
                             ||j                 }||j                 }                      |||          ||j        <   |S )NFTr   )r   r   r   r   r   rg  ri  r-   c                     t          |           } | j        d         }|                    |d          }j        J                     | |j        j                  \  }}t          |          S )Nr   r   rg  rh  ri  rz   )r   r   viewr   _fp8_flash_mla_kernelrz   r   )	rg  ri  seq_lenattn_outr  r  rh  r   r  s	        r2   _fp8_decodezOFlashMLASparseImpl._forward_fp8_kv_separate_prefill_decode.<locals>._fp8_decode'  s     .a==AgajG (,,['2FFL&22244$7) , 3 C	 5  KHa 7x@@@r1   r   r   )!r   
isinstancerd   r   r   r   r   r   r   rl   rk   rO   r   rZ   rt   r   r   r   	new_emptyrh   r   kv_lora_rankr\   r   r   rc  r   ops$cp_gather_and_upconvert_fp8_kv_cacher   lenr   rm  )r  rg  rh  ri  rj  prefill_request_idsr   has_prefill_workspacerv  r   r   ru  chunkchunk_workspacechunk_qchunk_topk_indices_workspacer  r   s   ` `             @@r2   '_forward_fp8_kv_separate_prefill_decodez:FlashMLASparseImpl._forward_fp8_kv_separate_prefill_decode   s    %7,(>(WXXXXX"."#'  %+"."6"B'3';'L$$(! @*%$/(.q1"7*=%=	
 	
 	
 %7,(>(WXXXXX	A5< 	Au| 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A& ):)< q  %71%<%<&222"{1l33HH {{0$.$BSTgx #  H !1$$/:{((()<8J9J8J+K0 0+++,  '333%-4  "&"=>V@V>V"W8'#%N*)**   E.//;E<N/O,/3/J/J#00 0+,, r1   c                 n   t          |j        |j        ||j        |j        d                   }|j        J t          |j        t          j                  sJ |j        }| 	                    |
                    d          ||
                    d          |          \  }}|                    d          S )a  Mixed batch FP8 forward path that treats all tokens as one batch.

        This is equivalent to main branch's approach and avoids the BF16
        prefill kernel which has head padding overhead when num_heads is small.
        Used when use_mixed_batch is True.
        r   rl  Nr   rq  )r   rl   rk   rO   r   r   rw  rd   ru   rs  	unsqueezesqueeze)r  rg  rh  ri  rj  r  	_attn_outr  s           r2   _forward_fp8_kv_mixed_batchz.FlashMLASparseImpl._forward_fp8_kv_mixed_batche  s     @*%$/(.q1
 
 
 /;;;,.D.V
 
 	
 	
 
 %711kk!nn 3%//22(	 2 
 
	1   ###r1   rz   c                 L   |                     d          }| j        }||k     rt                              d| d| d           |                    |                     d          |                     d          ||                     d          f          }||d d d d d |d d f<   |}t          ||                    t          j                  	                    d          |j
        d	|j        |j        d
|| j        	  	        \  }}	||k     r|d d d d d |d d f         }||	fS )N   Padding num_heads from  to z for FP8 sparse decode kernelr   r      i   T)	rg  k_cacherk   
head_dim_vr  tile_scheduler_metadatar  indicesr^  )sizer   loggerwarning_once	new_zerosr   rr  rZ   uint8r  rr   rs   rq   r^  )
r  rg  rh  ri  rz   actual_num_headspadded_num_headsq_paddedr   lses
             r2   rs  z(FlashMLASparseImpl._fp8_flash_mla_kernel  sn    66!997 ...C*: C C#C C C   {{AFF1IIqvvayy:JAFFSTII#VWWH34HQQQ,,,aaa/0A)',,U[99CCBGG'9)4$3$F ,

 

 

S ...aaa---qqq01CCxr1   c                 .   |j         d         }|                    dd|j         d                   }| j        | j        z  dk    r| j        | j        z  dk    sJ t                              d| j         d| j         d           |                    |j         d         | j        |j         d         f          }||d d d | j        d d f<   |}|                    |dd          }t          |||| j                  d         }|d d d | j        d d f         }|S )Nr   r   r   r  r  z for BF16 sparse prefill kernelr  )	r   rr  r   r`  r  r  rx  r   r^  )r  rg  rh  ri  r   r  outputs          r2   rm  z)FlashMLASparseImpl._bf16_flash_mla_kernel  s_    WQZ
166&,R0
 
 >D00A55'$.8A====I$. I I'I I I   {{AGAJ0Dagaj#QRRH/0HQQQ($.(!!!+,A#((Q;;%"L$2D
 

 +T^+QQQ./r1   layer
k_c_normedk_pekv_cacher  output_scaleoutput_block_scalec
                    |
J d            ||	t          d          ||                    d          S |j        }
|d |
df         }|d |
df         }|d |
df         }| j        J | j        d |
         }|                    | j        | j        gd          \  }}|                    dd          }t          j	        || j
                  }|                    dd          }| j        dk    }t          j        ||gd          }|                                dk    rNt          j        ||                    d          ||j                                        | j        |j        	           |s|                     ||||          }n8|j        r|                     ||||          }n|                     ||||          }|                     ||d |
         
           |S )NzOutput tensor must be provided.z@fused output quantization is not yet supported for MLACommonImplr   .r   r  r   r+   )rV  rS  )r   )NotImplementedErrorrH  rh   r_  splitqk_nope_head_dimqk_rope_head_dim	transposerZ   bmmW_UK_TrV  catnumelrz  concat_and_cache_mlar  rj   flatten_k_scalern  r   r  r  
_v_up_proj)r  r  rg  r  r  r  rj  r  r  r  num_actual_toksri  q_nopeq_peql_nopeuse_fp8_cacheru  s                    r2   forwardzFlashMLASparseImpl.forward  s?    !!#D!!!#'9'E%R    
 <<??"'9 #$ 0 0# 56
$_$c)*'333/0@0@Aww 5t7LMSUwVV!!!Q'')FDK00##Aq))+|;Iwo2... >>a$Q*2244#2n     		,,Q,VVHH. 	778\= HH CC8\= H 	f-=o-=&>???r1   )NN)NNN)rU   rV   rW   r]   r^   r   floatr[   r_   rZ   rt   r   r	  rd   rn  r  r  ru   rb   rs  rm  r   r  __classcell__)rf  s   @r2   r;   r;     sl       .C .C . . . \.$ 37'+3 33 3 	3
 3 5kD(3 d
3 3 3 3 '*Dj3 "L4/3 )$3  
!3 3 3 3 3 3jQ<Q #\Q l	Q
 .Q 
Q Q Q Q&c<c #\c l	c
 .c 
c c c cJ%$<%$ #\%$ l	%$
 .%$ 
%$ %$ %$ %$N%<% #\% l	%
 0A% 
u|U\)	*% % % %N< #\ l	
 
   L '+,026J JJ <J L	J
 lJ ,J .4J t#J lT)J "L4/J 
J J J J J J J Jr1   r;   )r/   rm   r   FNN)Ddataclassesr   typingr   r   r   numpyrD  rZ   vllmr   rz  vllm.configr   r	   vllm.config.cacher
   vllm.loggerr   2vllm.model_executor.layers.attention.mla_attentionr   r   vllm.platformsr   vllm.platforms.interfacer   vllm.triton_utilsr   r   vllm.v1.attention.backendr   r   r   r   r   r   r    vllm.v1.attention.backends.utilsr   r   r   r   vllm.v1.attention.ops.flashmlar   r   r   r    vllm.v1.kv_cache_interfacer!   vllm.v1.worker.workspacer"   &vllm.model_executor.models.deepseek_v2r#   rU   r  rJ  r&   rd   jit	constexprr   rt   r^   rX   r   r   r7   r;   r0   r1   r2   <module>r     s-   " ! ! ! ! ! 4 4 4 4 4 4 4 4 4 4      # # # # # # ; ; ; ; ; ; ; ; ( ( ( ( ( ( # # # # # #        , + + + + + 5 5 5 5 5 5 ( ( ( ( ( ( ( (                                        5 4 4 4 4 4 > > > > > > ?>>>>>>	X		   67 67 67 67 67, 67 67 67r E& E& E& E& E&. E& E& E&R ?" L?" ?" \?" ?" ?" ?" ?"L "'9=48Y YLYY <Y 	Y
 Y Y  Y $)<$#6Y $lT1Y Y Y Yxc    v v v v v$<=S$T v v vr	j j j j j*+AB j j j j jr1   