
    .`i"                     <   d dl Z d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZ d dlmZ d dlmZmZmZmZmZ  e
e          Z G d d	ee                   Z G d
 de          Z G d d          Z ed          ZdZ G d dee                   ZdS )    N)ClassVar)
CacheDType)init_logger)MLACommonBackendMLACommonImplMLACommonMetadataMLACommonMetadataBuilder)DeviceCapability)AttentionCGSupportAttentionLayerAttentionType
MultipleOfis_quantized_kv_cachec                   4    e Zd ZU ej        Zee         ed<   dS )CutlassMLAMetadataBuilder_cudagraph_supportN)__name__
__module____qualname__r   UNIFORM_SINGLE_TOKEN_DECODEr   r   __annotations__     ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/mla/cutlass_mla.pyr   r      s:          	6 !34     r   r   c                   <   e Zd ZU ej        ej        gZeeej	                          e
d<   g dZeee                  e
d<   edeeez           fd            Zedefd            Zeded         fd            Zeded	         fd
            Zededefd            ZdS )CutlassMLABackendsupported_dtypes)autobfloat16fp8fp8_e4m3supported_kv_cache_dtypesreturnc                      dgS )N   r   r   r   r    get_supported_kernel_block_sizesz2CutlassMLABackend.get_supported_kernel_block_sizes.   s	    ur   c                      dS )NCUTLASS_MLAr   r   r   r   get_namezCutlassMLABackend.get_name2   s    }r   CutlassMLAImplc                      t           S N)r*   r   r   r   get_impl_clszCutlassMLABackend.get_impl_cls6   s    r   r   c                      t           S r,   )r   r   r   r   get_builder_clsz!CutlassMLABackend.get_builder_cls:   s    ((r   
capabilityc                     |j         dk    S )N
   )major)clsr0   s     r   supports_compute_capabilityz-CutlassMLABackend.supports_compute_capability>   s    2%%r   N)r   r   r   torchfloat16r   r   r   listdtyper   r"   r   staticmethodintr   r&   strr)   typer-   r/   classmethodr
   boolr5   r   r   r   r   r   %   sJ        5:]EN4ShtEK01SSS= = =xZ(89    d33C.D    \ c    \ $/0    \ )T"=> ) ) ) \) &5E &$ & & & [& & &r   r   c                   *    e Zd Zd Zd ZdedefdZdS )SM100Workspacec                     t          j        |dt           j                  | _        d| _        t           j                            t          j        d                    }|j        | _	        d S )Ncuda)devicer9   r%   zcuda:0)
r6   emptyuint8_workspace_buf_block_sizerC   get_device_propertiesrD   multi_processor_count	_sm_count)selfinitial_workspace_size
propertiess      r   __init__zSM100Workspace.__init__D   s]    #k"6
 
 
  Z55el86L6LMM
#9r   c                     | j         S r,   )rG   )rL   s    r   get_bufzSM100Workspace.get_bufP   s    ""r   attn_metadatanum_kv_splitsc                     |j         }|j        }t          j        || j        z  || j        |          }| j        j        d         |k     r| j                            |           d S d S )N)rS   r   )	num_reqsmax_query_lenops$sm100_cutlass_mla_get_workspace_sizerH   rK   rG   shaperesize_)rL   rR   rS   
batch_sizemax_seq_lenworkspace_sizes         r   ensure_sizezSM100Workspace.ensure_sizeS   s|    "+
#1A$**N'	
 
 
 $Q'.88''77777 98r   N)r   r   r   rO   rQ   r   r;   r^   r   r   r   rA   rA   C   sU        
: 
: 
:# # #8): 83 8 8 8 8 8 8r   rA   i   r%   c                       e Zd ZU dZeed<   dededededee         dz  d	edz  d
e	dedz  de	de	dz  ddf fdZ
dej        dej        dej        dej        dej        dej        dededeej        ej        f         fdZdej        eej        ej        f         z  dej        dededeej        ej        dz  f         f
dZ xZS )r*   Tcan_return_lse_for_decode	num_heads	head_sizescalenum_kv_headsalibi_slopesNsliding_windowkv_cache_dtypelogits_soft_cap	attn_typekv_sharing_target_layer_namer#   c                     t                      j        |||||||||	|
f
dt          i| |||g}t          |          rt	          d          |	t
          j        k    rt	          d          t          j        	                    dd           }|r=t                              dt          |                     t          |          | _        nd| _        t          | _        d S )Nq_pad_num_headszcCutlassMLAImpl does not support one of the following: alibi_slopes, sliding_window, logits_soft_capzaEncoder self-attention and encoder/decoder cross-attention are not implemented for CutlassMLAImplFORCE_NUM_KV_SPLITSzForcing num_kv_splits to %d)superrO   	MAX_HEADSanyNotImplementedErrorr   DECODERosenvirongetlogger
debug_oncer;   _num_kv_splitsg_sm100_workspace
_workspace)rL   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   mla_argsunsupported_featuresforce_num_kv_splits	__class__s                 r   rO   zCutlassMLAImpl.__init__j   s     	(	
 	
 &	
 	
 	
 	
 !-noN#$$ 	%@  
 ---%!   !jnn-BDII 	%;SAT=U=UVVV"%&9":":D"$D ,r   q_nopeq_pekv_c_and_k_pe_cacheseq_lens
page_table	workspacesm_scalerS   c	                 .   |j         dk    sJ d|j                      |j         dk    sJ d|j                      |j         dk    s"J d                    |j                               |j        \  }	}
}|j        \  }}}|	|k    r|
|k    sJ |j        \  }}}d}d}||k    sJ ||k    sJ |||z   k    sJ d}|
|k    sJ d| d	|
             t          |j                  d
k    sJ |j        \  }}||	k    sJ |dk    sJ d|             |d|z  z  dk    sJ |j        t
          j        t
          j        t
          j        fv sJ d|j         d            |j        |j        cxk    r|j        k    sn J |j        t
          j	        k    sJ d|j         d            |j        t
          j	        k    sJ d|j         d            t          | j                  rt
          j        n|j        }|                    |	||f|          }| j        r(t          j        |	|ft
          j        |j                  nt          j                    }t%          j        ||||||||||
  
         |
|k     r%| j        r|d d d |
f         n|}|d d d |
f         }||fS )N   z$q_nope must be a 3D tensor, but got z"q_pe must be a 3D tensor, but got z3kv_c_and_k_pe_cache must be a 3D tensor, but got {}i   @   r%   zH must be <= z
, but got    r   z&block num must be greater than 0, got z6q_nope.dtype needs to be fp16 or bf16 or e4m3 but got .z)seq_lens.dtype needs to be int32 but got z+page_table.dtype needs to be int32 but got )r9   )r9   rD   )ndimformatrY   lenr9   r6   r7   r   float8_e4m3fnint32r   rg   	new_emptyneed_to_return_lse_for_decoderE   float32rD   TensorrW   sm100_cutlass_mla_decode)rL   r   r   r   r   r   r   r   rS   B_qHD_q_nopeB_q_2H_2D_q_pe_	PAGE_SIZED_ckvD_latentD_roperp   B_block_table	block_numr9   outlses                             r   _sm100_cutlass_mla_decodez(CutlassMLAImpl._sm100_cutlass_mla_decode   sc    {a!U!U!UyA~~~ODIOO~~~"'1,,,AHH#(  -,, "<Q!ZsFu1888,179e8####6)))))	I~~~GyGGAGG~~~:#$$))))#-#3 y####1}}}RyRR}}}C)O,1111|u~u?RSSSSTV\TTT TSS |tzFFFF-@-FFFFFFF~,,,IIII -,, 5;...M*:JMMM /.. %T%899ENN 	
 Y9GG 1 EKi(fmTTTT 	 	$	
 	
 	
 y== $ BK#aaa!e**Caaa!e*CCxr   qrR   layerc           
         |                                 dk    sJ |j        J t          |          t          u r|\  }}n&t	          j        || j        | j        gd          \  }}| j        	                    || j
                   |                     ||||j        j        |j        j        | j                                        | j        | j
                  \  }}|| j        r|nd fS )Nr   rn   )dim)numeldecoder=   tupler6   splitkv_lora_rankqk_rope_head_dimr{   r^   ry   r   r   block_tablerQ   rc   r   )	rL   r   r   rR   r   r   r   or   s	            r   _forward_decodezCutlassMLAImpl._forward_decode   s     #((**Q....#///77eLFDD ;D%t'<=2  LFD
 	##M43FGGG // ) ,O##%%J	
 	
3 $<F33$GGr   )r   r   r   r`   r?   r   r;   floatr8   r<   rO   r6   r   r   r   r   r   r   __classcell__)r   s   @r   r*   r*   g   s        &*t***8,8, 8, 	8,
 8, 5kD(8, d
8, 8, 8, 8, '*Dj8, 
8, 8, 8, 8, 8, 8,tQQ lQ #\	Q
 ,Q LQ <Q Q Q 
u|U\)	*Q Q Q Qf H<%el :;; H #\ H )	 H
  H 
u|U\D00	1 H  H  H  H  H  H  H  Hr   r*   ) rt   typingr   r6   vllm._custom_ops_custom_opsrW   vllm.config.cacher   vllm.loggerr   2vllm.model_executor.layers.attention.mla_attentionr   r   r   r	   vllm.platforms.interfacer
   vllm.v1.attention.backendr   r   r   r   r   r   rw   r   r   rA   rz   rp   r*   r   r   r   <module>r      s   
			              ( ( ( ( ( ( # # # # # #            6 5 5 5 5 5              
X		     89J K   & & & & &( & & &<8 8 8 8 8 8 8 8> #N#455 	pH pH pH pH pH]#45 pH pH pH pH pHr   