
    .`i/                        d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZ d dlmZmZ d dlm Z  d dl!m"Z"m#Z#  e
e$          Z% G d de          Z&e G d de                      Z'e G d dee'                               Z( G d dee(                   Z) G d dee(                   Z*dS )    )	dataclass)ClassVarN)
VllmConfig)
CacheDType)init_logger)MLACommonBackendMLACommonDecodeMetadataMLACommonImplMLACommonMetadataMLACommonMetadataBuilderQueryLenSupport)vllm_is_batch_invariant)DeviceCapability)AttentionCGSupportAttentionLayerAttentionType
MultipleOfis_quantized_kv_cache)flash_attn_supports_mlaget_flash_attn_version)AttentionSpec)flash_attn_varlen_funcget_scheduler_metadatac                      e Zd ZU ej        ej        gZeeej	                          e
d<   ddgZeee                  e
d<   edeeez           fd            Zedefd            Zeded         fd	            Zeded
         fd            Zededefd            Zededej	        dedz  dededededededz  fd            ZdS )FlashAttnMLABackendsupported_dtypesautobfloat16supported_kv_cache_dtypesreturnc                  "    t          d          gS )N   )r        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/mla/flashattn_mla.py get_supported_kernel_block_sizesz4FlashAttnMLABackend.get_supported_kernel_block_sizes3   s    2r$   c                      dS )NFLASH_ATTN_MLAr#   r#   r$   r%   get_namezFlashAttnMLABackend.get_name7   s    r$   FlashAttnMLAMetadataBuilderc                      t           S N)r*   r#   r$   r%   get_builder_clsz#FlashAttnMLABackend.get_builder_cls;   s    **r$   FlashAttnMLAImplc                      t           S r,   )r.   r#   r$   r%   get_impl_clsz FlashAttnMLABackend.get_impl_cls?   s    r$   
capabilityc                     |j         dk    S )N	   )major)clsr1   s     r%   supports_compute_capabilityz/FlashAttnMLABackend.supports_compute_capabilityC   s    1$$r$   	head_sizedtypekv_cache_dtypeN
block_sizeuse_mlahas_sink
use_sparsedevice_capabilityc	                 &    t                      sdS d S )Nz/FlashAttention MLA not supported on this device)r   )	r5   r7   r8   r9   r:   r;   r<   r=   r>   s	            r%   supports_combinationz(FlashAttnMLABackend.supports_combinationG   s     '(( 	EDDtr$   )__name__
__module____qualname__torchfloat16r   r   r   listr8   __annotations__r   r   staticmethodintr   r&   strr)   typer-   r0   classmethodr   boolr6   r@   r#   r$   r%   r   r   ,   s        5:]EN4ShtEK01SSS=xZ(89   
  d33C.D       \   c       \  +T"?@ + + + \+  $12       \  %5E %$ % % % [%  { #T)	
     , 
t   [  r$   r   c                   d    e Zd ZU ej        ed<   eed<   eed<   dZej        dz  ed<   dZeed<   dS )FlashAttnMLADecodeMetadataquery_start_locmax_query_lenmax_seq_lenNscheduler_metadatar   max_num_splits)	rA   rB   rC   rD   TensorrG   rI   rS   rT   r#   r$   r%   rO   rO   X   s`         \!!!.2t+222NCr$   rO   c                       e Zd ZdS )FlashAttnMLAMetadataN)rA   rB   rC   r#   r$   r%   rW   rW   a   s        Dr$   rW   c                       e Zd ZU ej        Zee         ed<   ej	        Z
ee         ed<   dZeed<   dedee         dedej        f fd	Zd
 Zdej        dej        dedej        dej        dedej        dz  defdZ xZS )r*   _cudagraph_supportquery_len_supporti   reorder_batch_thresholdkv_cache_speclayer_namesvllm_configdevicec           	          |j         j        }t                                          ||||t          |dk               d| _        t                      dk    | _        | j        j	        
                                | _        | j        j        | _        | j        rP| j        rIt          j        |j        j        dz   t          j        | j                  | _        |j        j        | _        t/                      r	d| _        d S d S )N   )supports_dcp_with_varlenr      )r8   r_   )parallel_configcp_kv_cache_interleave_sizesuper__init__rW   rT   r   fa_aot_schedulecompilation_configcudagraph_modehas_full_cudagraphsuse_full_cuda_graphmax_cudagraph_capture_sizemax_cudagraph_sizerD   zerosscheduler_configmax_num_seqsint32r_   rS   attention_config(flash_attn_max_num_splits_for_cuda_graphr   )selfr\   r]   r^   r_   interleave_size	__class__s         r%   rg   z$FlashAttnMLAMetadataBuilder.__init__k   s    &5Q &5&: 	 	
 	
 	
  5771< #2FFHH 	  #'"9"T# 	(< 	&+k,9A=k{' ' 'D# ,U  #$$ 	$"#D	$ 	$r$   c                     | j         rLt          |||| j        | j        z  d| j        j        || j        j        | j        j        | j	        |||          S d S )Nra   )
batch_sizemax_seqlen_qmax_seqlen_knum_heads_qnum_heads_kvheaddimcache_seqlens	qkv_dtype	headdim_v	page_sizecu_seqlens_qcausal
num_splits)
rh   r   	num_headsdcp_world_sizemla_dimsqk_rope_head_dimr\   r8   kv_lora_rankr   )ru   num_reqscu_query_lensrQ   seqlensrR   r   rT   s           r%   _schedule_decodez,FlashAttnMLAMetadataBuilder._schedule_decode   sp      	)#*( NT-@@6%,2-4.*)    tr$   block_table_tensorseq_lens_devicerR   query_start_loc_cpuquery_start_loc_devicenum_decode_tokensdcp_tot_seq_lens_deviceNr    c           
      F   |dd          |d d         z
  }|                                                                 }	d}
| j        r| j        || j        k    r| j        }
t                      rd}
|                     |j        d         ||	||d|
          }| j        rn|l|j        d         }|| j        j        d         k    s"J d| d| j        j        d          z               || j        d |<   d| j        |d <   | j        d |         }t          ||||	|||
|          }|S )	Nra   r   T)r   r   rQ   r   rR   r   rT   zScheduler metadata size z exceeds buffer size )block_tableseq_lensrP   rQ   rR   rS   rT   dcp_tot_seq_lens)
maxitemrl   rn   rT   r   r   shaperS   rO   )ru   r   r   rR   r   r   r   r   query_lens_cpurQ   rT   rS   nmetadatas                 r%   _build_decodez)FlashAttnMLAMetadataBuilder._build_decode   s    -QRR03Fss3KK&**,,1133 $		1'3!T%<<< "0N"$$ 	N!22$*1-0'##) 3 
 
 # 	=(:(F"(+A/5a8888C1CCC,21578 988 +=D#BQB'
 +,D#ABB'!%!8!!<-*$2'#1)4	
 	
 	
 r$   )rA   rB   rC   r   UNIFORM_BATCHrY   r   rG   r   VARLENrZ   r[   rI   r   rF   rJ   r   rD   r_   rg   r   rU   rO   r   __classcell__rw   s   @r%   r*   r*   f   s@        7I7W!34WWW3B3Ix0III#&S&&&&$$&$ #Y&$  	&$
 &$ &$ &$ &$ &$ &$P  8@!L@ @ 	@
 #\@ !&@ @ "'!4@ 
$@ @ @ @ @ @ @ @r$   r*   c                       e Zd ZU dZeed<   dededededee         dz  d	edz  d
e	dedz  de	de	dz  ddf fdZ
dej        eej        ej        f         z  dej        dededeej        ej        dz  f         f
dZ xZS )r.   Tcan_return_lse_for_decoder   r7   scalenum_kv_headsalibi_slopesNsliding_windowr9   logits_soft_cap	attn_typekv_sharing_target_layer_namer    c                 H    t                      j        |||||||||	|
f
i | t                      s
J d            |||g}t          |          rt	          d          |	t
          j        k    rt	          d          t          | j                  rt	          d          d S )Nz,FlashAttnMLA is not supported on this devicezeFlashAttnMLAImpl does not support one of the following: alibi_slopes, sliding_window, logits_soft_capzcEncoder self-attention and encoder/decoder cross-attention are not implemented for FlashAttnMLAImplz3FlashAttnMLA V1 with FP8 KV cache not yet supported)	rf   rg   r   anyNotImplementedErrorr   DECODERr   r9   )ru   r   r7   r   r   r   r   r9   r   r   r   mla_argsunsupported_featuresrw   s                r%   rg   zFlashAttnMLAImpl.__init__   s     	(	
 	
 	
 	
 	
 '((XX*XXX( ,noN#$$ 	%@  
 ---%#   !!455 	%E  	 	r$   qkv_c_and_k_pe_cacheattn_metadatalayerc                 j   |                                 dk    sJ |j        J t          |          t          u r|\  }}n&t	          j        || j        | j        gd          \  }}| j        	                    d          rt          d          |dd | j        f         }|d| j        d f         }t          |j        j        d          }	t          di d|d	|                    d
          d|                    d
          d|d|	d|j        j        d|j        j        d|j        j        d|j        j        d| j        ddd| j        ddd|j        j        d|j        j        d| j        d| j        d|j        j        }
| j        r|
\  }}||                    dd          fS |
}|d fS )Nr   r   )dimfp8z(FP8 FlashAttention MLA not yet supported.ra   r   kvq_vrz   r   r{   	seqused_kr   softmax_scaler   Treturn_softmax_lse
fa_versionrc   rS   r   cp_world_sizecp_rankcp_tot_seqused_kr#   )numeldecoderK   tuplerD   splitr   r   r9   
startswithr   r   rQ   r   	unsqueezerP   rR   r   r   r   need_to_return_lse_for_decoderS   rT   r   dcp_rankr   	transpose)ru   r   r   r   r   q_nopeq_pe
kv_c_cache
k_pe_cacherz   attn_outolses                r%   _forward_decodez FlashAttnMLAImpl._forward_decode(  sj    #((**Q....#///77eLFDD ;D%t'<=2  LFD ))%00 	R%&PQQQ(.A0A.A)AB
(d.?.A.A)AB

 =/=qAA) 
 
 
d
""2&&&
 ""2&&&
 	

 &
 '-==
 '-99
 $*33
 &,88
 **
 4
  $AA
 q
  -3FF
 %+::
  --!
" MM#
$ +1BB%
* - 	FAscmmAq))))Ad7Nr$   )rA   rB   rC   r   rM   rG   rI   floatrF   rJ   rg   rD   rU   r   rW   r   r   r   r   s   @r%   r.   r.      s@        &*t***11 1 	1
 1 5kD(1 d
1 1 1 1 '*Dj1 
1 1 1 1 1 1f7<%el :;;7 #\7 ,	7
 7 
u|U\D00	17 7 7 7 7 7 7 7r$   r.   )+dataclassesr   typingr   rD   vllm.configr   vllm.config.cacher   vllm.loggerr   2vllm.model_executor.layers.attention.mla_attentionr   r	   r
   r   r   r   *vllm.model_executor.layers.batch_invariantr   vllm.platforms.interfacer   vllm.v1.attention.backendr   r   r   r   r   #vllm.v1.attention.backends.fa_utilsr   r   vllm.v1.kv_cache_interfacer   vllm.vllm_flash_attnr   r   rA   loggerr   rO   rW   r*   r.   r#   r$   r%   <module>r      s   " ! ! ! ! !        " " " " " " ( ( ( ( ( ( # # # # # #                     6 5 5 5 5 5                     5 4 4 4 4 4       
 
X		) ) ) ) )* ) ) )X     !8    	 	 	 	 	,-GH 	 	 	I I I I I":;O"P I I IXm m m m m}%9: m m m m mr$   