
    .`i+                        d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZ d dlmZ d d	lmZ d d
lmZmZmZmZ d dlmZmZ d dlmZm Z m!Z!m"Z"m#Z#m$Z$ d dl%m&Z&  e
e'          Z( G d de          Z)e G d de                      Z*e G d dee*                               Z+ G d dee+                   Z, G d dee+                   Z-dS )    )	dataclass)ClassVarN)
VllmConfig)
CacheDType)init_logger)MLACommonBackendMLACommonDecodeMetadataMLACommonImplMLACommonMetadataMLACommonMetadataBuilderQueryLenSupport)vllm_is_batch_invariant)DeviceCapability)AttentionCGSupportAttentionLayerAttentionType
MultipleOf)#reshape_attn_output_for_spec_decodereshape_query_for_spec_decode)FlashMLASchedMetaflash_mla_with_kvcacheflash_mla_with_kvcache_fp8get_mla_metadataget_mla_metadata_dense_fp8is_flashmla_dense_supported)AttentionSpecc                      e Zd ZU ej        ej        gZeeej	                          e
d<   g dZeee                  e
d<   edeeez           fd            Zedefd            Zeded         fd            Zeded	         fd
            Zededefd            Zededej	        dedz  dededededededz  fd            ZdS )FlashMLABackendsupported_dtypes)autobfloat16fp8fp8_e4m3supported_kv_cache_dtypesreturnc                      dgS )N@    r(       {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/mla/flashmla.py get_supported_kernel_block_sizesz0FlashMLABackend.get_supported_kernel_block_sizes8   s	    tr)   c                      dS )NFLASHMLAr(   r(   r)   r*   get_namezFlashMLABackend.get_name<   s    zr)   FlashMLAMetadataBuilderc                      t           S N)r/   r(   r)   r*   get_builder_clszFlashMLABackend.get_builder_cls@   s    &&r)   FlashMLAImplc                      t           S r1   )r3   r(   r)   r*   get_impl_clszFlashMLABackend.get_impl_clsD   s    r)   
capabilityc                     |j         dv S )N)	   
   )major)clsr6   s     r*   supports_compute_capabilityz+FlashMLABackend.supports_compute_capabilityH   s    7**r)   	head_sizedtypekv_cache_dtypeN
block_sizeuse_mlahas_sink
use_sparsedevice_capabilityc	                 ^    |rddl m}	  |	            d         S ddl m}
  |
            d         S )Nr   )is_flashmla_sparse_supported   )r   )vllm.v1.attention.ops.flashmlarF   r   )r;   r=   r>   r?   r@   rA   rB   rC   rD   rF   r   s              r*   supports_combinationz$FlashMLABackend.supports_combinationL   sX      	4SSSSSS//11!44RRRRRR..0033r)   )__name__
__module____qualname__torchfloat16r!   r   r   listr>   __annotations__r$   r   staticmethodintr   r+   strr.   typer2   r5   classmethodr   boolr<   rI   r(   r)   r*   r   r   /   s        5:]EN4ShtEK01SSS= = =xZ(89    d33C.D    \ c    \ 'T";< ' ' ' \' $~.    \ +5E +$ + + + [+ 44 {4 #T)	4
 4 4 4 4 ,4 
t4 4 4 [4 4 4r)   r   c                       e Zd ZU eed<   dS )FlashMLADecodeMetadatascheduler_metadataN)rJ   rK   rL   r   rP   r(   r)   r*   rX   rX   b   s         ))))))r)   rX   c                       e Zd ZdS )FlashMLAMetadataN)rJ   rK   rL   r(   r)   r*   r[   r[   g   s        Dr)   r[   c                        e Zd ZU ej        Zee         ed<   ej	        Z
ee         ed<   dZeed<   dedee         dedej        f fd	Zd
ej        dej        dedej        dej        dedej        dz  defdZ xZS )r/   _cudagraph_supportquery_len_support   reorder_batch_thresholdkv_cache_speclayer_namesvllm_configdevicec                 h   t                                          ||||t                     |j                            |j                  | _        d | _        d | _        |j	        j
                            d          | _        t          j                            | j                  }|j        }| j        j                                        rgt          j        |df| j        t          j                  | _        t          j        |j        j        dz   | j        t          j                  | _        d S d S )Nr"      )rd   r>   rG   )super__init__r[   model_configget_num_attention_headsparallel_confignum_q_headscg_buf_tile_scheduler_metadatacg_buf_num_splitscache_configcache_dtype
startswithis_fp8_kvcacherM   cudaget_device_propertiesrd   multi_processor_countcompilation_configcudagraph_modehas_full_cudagraphszerosint32emptyscheduler_configmax_num_seqs)selfra   rb   rc   rd   device_propertiesnum_sms	__class__s          r*   rh   z FlashMLAMetadataBuilder.__init__r   s'    	;V=M	
 	
 	
 '3KK'
 
 /3+!%)6BMMeTT!J<<T[II#9"1EEGG 	27+ !{k3 3 3D/ &+[-:Q>{k& & &D"""	 	r)   block_table_tensorseq_lens_devicemax_seq_lenquery_start_loc_cpuquery_start_loc_devicenum_decode_tokensdcp_tot_seq_lens_deviceNr%   c                 @   |dd          |d d         z
  }|                                                                 }	|	| j        z  dz  }
t          ||
d| j                  \  }}| j        r"t          ||
d          \  }}||_        ||_        t          ||||          S )NrG   )rr   )block_tableseq_lensrY   dcp_tot_seq_lens)	maxitemrl   r   rr   r   tile_scheduler_metadata
num_splitsrX   )r~   r   r   r   r   r   r   r   query_lens_cpumax_query_lennum_q_tokens_per_head_krY   _r   r   s                  r*   _build_decodez%FlashMLAMetadataBuilder._build_decode   s     -QRR03Fss3KK&**,,1133"/$2B"Ba"G 0#.	!
 !
 !
A  	72L'3 3/#Z
 :Q6,6)%*$14	
 
 
 	
r)   )rJ   rK   rL   r   UNIFORM_BATCHr]   r   rP   r   UNIFORMr^   r`   rR   r   rO   rS   r   rM   rd   rh   TensorrX   r   __classcell__r   s   @r*   r/   r/   l   s!        7I7W!34WWW3B3Jx0JJJ#&S&&&"$" #Y"  	"
 " " " " " "H"
!L"
 "
 	"

 #\"
 !&"
 "
 "'!4"
 
 "
 "
 "
 "
 "
 "
 "
 "
r)   r/   c                       e Zd ZU dZeed<   dededededee         dz  d	edz  d
e	dedz  de	de	dz  ddf fdZ
dej        eej        ej        f         z  dej        dededeej        ej        dz  f         f
dZ xZS )r3   Tcan_return_lse_for_decode	num_headsr=   scalenum_kv_headsalibi_slopesNsliding_windowr?   logits_soft_cap	attn_typekv_sharing_target_layer_namer%   c                     t                      j        |||||||||	|
f
i | t                      \  }}|s
J |            |||g}t          |          rt	          d          |	t
          j        k    rt	          d          d S )NzaFlashMLAImpl does not support one of the following: alibi_slopes, sliding_window, logits_soft_capz_Encoder self-attention and encoder/decoder cross-attention are not implemented for FlashMLAImpl)rg   rh   r   anyNotImplementedErrorr   DECODER)r~   r   r=   r   r   r   r   r?   r   r   r   mla_argsis_supportedreasonunsupported_featuresr   s                  r*   rh   zFlashMLAImpl.__init__   s     	(	
 	
 	
 	
 	
  ;<<f##V##| ,noN#$$ 	%@  
 ---%   .-r)   qkv_c_and_k_pe_cacheattn_metadatalayerc                    |                                 dk    sJ |j        J t          |          t          u rt	          j        |d          }t          |t          j                  sJ |j        }t          ||          }|j        j
        }t                      r| j                            d          s|j        }t          j        }|j        d         }	|j        j        j        d         }
d}|
|z  dk    sJ d|
 d|             |
|z  }t	          j        d||	          }d|d
<   d|d<   |	dz
  |d<   ||d<   d|d<   t	          j        |	dz   f||	          }||_        ||_        | j                            d          rt+          ||                    d          |j        j        |j        j        | j        |j        |j        | j        d|j                            d          |j                            d                    \  }}nLt;          ||                    d          |j        j        |j        j        | j        || j        dd	  	        \  }}t=          |          }||fS )Nr   r   )dimr"   r'   ztopk (z) must be divisible by )rG   rf   )r>   rd   )r   r   )r   rG   rG   )r      )r      )r      T)r   k_cacher   cache_seqlens
head_dim_vr   r   softmax_scalecausal	descale_q	descale_kF)	r   r   r   r   r   r   r   r   rr   )numeldecoderT   tuplerM   cat
isinstancer   num_decodesr   rY   r   r?   rq   rd   rz   shaper   ry   r   r   r   	unsqueezer   kv_lora_rankr   _q_scalereshape_k_scaler   r   )r~   r   r   r   r   r   rY   rd   r>   BtopkB_TOPKend_block_idxr   r   olses                    r*   _forward_decodezFlashMLAImpl._forward_decode   s    #((**Q....#///77e	!$$$A !U\*****#/)![99*1D"$$ 	7T-@-K-KE-R-R 	7XFKE
A !'39"=DF&=A%%%'U'U'UV'U'U%%% FNM ',k&f&U&U&U#,-#D),-#D),-E#D),9#D),-#D)
 a!eXU6JJJJ9P6,6)))%00 	/+55b99)0<+2;,(:(R-8"j.0033.0033  FAss ,+55b99)0<+2;,(:"j$
 
 
FAs 022#vr)   )rJ   rK   rL   r   rV   rP   rR   floatrO   rS   rh   rM   r   r   r[   r   r   r   r   s   @r*   r3   r3      sM        &*t***-- - 	-
 - 5kD(- d
- - - - '*Dj- 
- - - - - -^P<%el :;;P #\P (	P
 P 
u|U\D00	1P P P P P P P Pr)   r3   ).dataclassesr   typingr   rM   vllm.configr   vllm.config.cacher   vllm.loggerr   2vllm.model_executor.layers.attention.mla_attentionr   r	   r
   r   r   r   *vllm.model_executor.layers.batch_invariantr   vllm.platforms.interfacer   vllm.v1.attention.backendr   r   r   r    vllm.v1.attention.backends.utilsr   r   rH   r   r   r   r   r   r   vllm.v1.kv_cache_interfacer   rJ   loggerr   rX   r[   r/   r3   r(   r)   r*   <module>r      s   " ! ! ! ! !        " " " " " " ( ( ( ( ( ( # # # # # #                     6 5 5 5 5 5                                  5 4 4 4 4 4	X		04 04 04 04 04& 04 04 04f * * * * *4 * * * 	 	 	 	 	()?@ 	 	 	L
 L
 L
 L
 L
67GH L
 L
 L
^B B B B B=!12 B B B B Br)   