
    .`i                     @   d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
mZmZmZmZ d dlmZ d dlmZmZmZmZ d d	lmZ  ee          Zd
Z G d dee                   Z G d de
          Z ej        eej        d          Z G d dee                   Z dS )    )ClassVarN)%trtllm_batch_decode_with_kv_cache_mla)
CacheDType)init_logger)MLACommonBackendMLACommonImplMLACommonMetadataMLACommonMetadataBuilderQueryLenSupport)DeviceCapability)AttentionCGSupportAttentionLayerAttentionType
MultipleOf)KVCacheLayoutTypei   c                   X    e Zd ZU ej        Zee         ed<   ej	        Z
ee         ed<   dS )FlashInferMLAMetadataBuilder_cudagraph_supportquery_len_supportN)__name__
__module____qualname__r   UNIFORM_BATCHr   r   __annotations__r   UNIFORMr        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/mla/flashinfer_mla.pyr   r       sE         7I7W!34WWW3B3Jx0JJJJJr   r   c                      e Zd ZU ej        ej        gZeeej	                          e
d<   g dZeee                  e
d<   edeeez           fd            Zedefd            Zeded         fd            Zeded	         fd
            Zededefd            Zededej	        dedz  dededededededz  fd            Zedd            ZdS )FlashInferMLABackendsupported_dtypes)autobfloat16fp8fp8_e4m3supported_kv_cache_dtypesreturnc                  
    ddgS )N    @   r   r   r   r    get_supported_kernel_block_sizesz5FlashInferMLABackend.get_supported_kernel_block_sizes.   s    Bxr   c                      dS )NFLASHINFER_MLAr   r   r   r   get_namezFlashInferMLABackend.get_name2   s    r   FlashInferMLAImplc                      t           S N)r/   r   r   r   get_impl_clsz!FlashInferMLABackend.get_impl_cls6   s      r   r   c                      t           S r1   )r   r   r   r   get_builder_clsz$FlashInferMLABackend.get_builder_cls:   s    ++r   
capabilityc                     |j         dk    S )N
   )major)clsr5   s     r   supports_compute_capabilityz0FlashInferMLABackend.supports_compute_capability>   s    2%%r   	head_sizedtypekv_cache_dtypeN
block_sizeuse_mlahas_sink
use_sparsedevice_capabilityc	                     ddl m}	  |	            }
|
j        (|
j        j        }t	          |dd          }|dk    rd| S d S )Nr   )get_current_vllm_configqk_nope_head_dim      z@FlashInfer MLA kernel requires qk_nope_head_dim == 128, but got )vllm.configrD   model_confighf_text_configgetattr)r9   r;   r<   r=   r>   r?   r@   rA   rB   rD   vllm_configrJ   rE   s                r   supports_combinationz)FlashInferMLABackend.supports_combinationB   st     	877777--//#/(5DN&~7I1MM3&&2/2 2 tr   KVCacheLayoutType | Nonec                     dS )NHNDr   )r9   s    r   get_required_kv_cache_layoutz1FlashInferMLABackend.get_required_kv_cache_layout\   s    ur   )r'   rN   )r   r   r   torchfloat16r#   r!   r   listr<   r   r&   r   staticmethodintr   r+   strr.   typer2   r4   classmethodr   boolr:   rM   rQ   r   r   r   r    r    %   s        5:]EN4ShtEK01SSS= = =xZ(89    d33C.D    \  c       \  !$23 ! ! ! \! ,T"@A , , , \, &5E &$ & & & [&  { #T)	
     , 
t   [2    [  r   r    cuda)r<   devicec                        e Zd Zdededededee         dz  dedz  ded	edz  d
ededz  ddf fdZdej	        e
ej	        ej	        f         z  dej	        dedede
ej	        ej	        dz  f         f
dZ xZS )r/   	num_headsr;   scalenum_kv_headsalibi_slopesNsliding_windowr=   logits_soft_cap	attn_typekv_sharing_target_layer_namer'   c                     t                      j        |||||||||	|
f
i | |||g}t          |          rt          d          |	t          j        k    rt          d          t          | _        d | _        d | _	        d S )NzfFlashInferMLAImpl does not support one of the following: alibi_slopes, sliding_window, logits_soft_capzdEncoder self-attention and encoder/decoder cross-attention are not implemented for FlashInferMLAImpl)
super__init__anyNotImplementedErrorr   DECODERg_fi_workspace_workspace_buffer
bmm1_scale
bmm2_scale)selfr^   r;   r_   r`   ra   rb   r=   rc   rd   re   mla_argsunsupported_features	__class__s                r   rh   zFlashInferMLAImpl.__init__i   s     	(	
 	
 	
 	
 	
 !-noN#$$ 	%@  
 ---%$   "0(,(,r   qkv_c_and_k_pe_cacheattn_metadatalayerc                 .   |                                 dk    sJ |j        J t          |t                    r|\  }}t	          j        ||gd          }|j        |j        z  dk    r0t          	                    d           |
                    d          }n3|                    |j        d|j        d         |j        d                   }| j        |j        |j        z  | j        z  | _        | j        |j        | _        t'          ||
                    d          | j        | j        | j        | j        |j        j        |j        j        |j        | j        | j                  }|                    d|j        d         |j        d                   }|d fS )Nr   )dimzFlashInferMLAImpl got a query of uneven length.
                This usually indicates an issue in batch reordering
                or incorrect setup in dummy_run.rF   )querykv_cacheworkspace_bufferrE   kv_lora_rankqk_rope_head_dimblock_tablesseq_lensmax_seq_lenrn   ro   )numeldecode
isinstancetuplerR   catnum_decode_tokensnum_decodesloggerwarning_once	unsqueezeviewshapern   _q_scale_float_k_scale_floatr_   ro   _v_scale_floatr   rm   rE   r   r   block_tabler   r   )rp   rt   ru   rv   rw   q_nopeq_peos           r   _forward_decodez!FlashInferMLAImpl._forward_decode   s    #((**Q....#///a 	2LFD	64.b111A *]-FF!KK4  
 AAA}0"agbk172;OOA?"#2U5IIDJVDO?"#2DO1(22155!3!2*!2&-9")2%1
 
 
 FF2qwr{AGBK00 $wr   )r   r   r   rV   floatrT   rW   rh   rR   Tensorr   r	   r   r   __classcell__)rs   s   @r   r/   r/   h   s-       .-.- .- 	.-
 .- 5kD(.- d
.- .- .- .- '*Dj.- 
.- .- .- .- .- .-`1<%el :;;1 #\1 )	1
 1 
u|U\D00	11 1 1 1 1 1 1 1r   r/   )!typingr   rR   flashinfer.decoder   vllm.config.cacher   vllm.loggerr   2vllm.model_executor.layers.attention.mla_attentionr   r   r	   r
   r   vllm.platforms.interfacer   vllm.v1.attention.backendr   r   r   r    vllm.v1.attention.backends.utilsr   r   r   $FLASHINFER_MLA_WORKSPACE_BUFFER_SIZEr   r    zerosuint8rl   r/   r   r   r   <module>r      s          C C C C C C ( ( ( ( ( ( # # # # # #              6 5 5 5 5 5            ? > > > > >	X		'8 $K K K K K#;<M#N K K K
9 9 9 9 9+ 9 9 9x (
+  b b b b b&78 b b b b br   