
    .`i;                         d dl mZ d dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZmZmZ d d	lmZ  ee          Z G d
 de          Z G d de	e
                   ZdS )    )ClassVarN)
CacheDType)init_logger)MLACommonBackendMLACommonImplMLACommonMetadata)vllm_is_batch_invariant)DeviceCapability)AttentionLayerAttentionTypeis_quantized_kv_cache)decode_attention_fwdc                       e Zd ZU ej        ej        gZeeej	                          e
d<   ddgZeee                  e
d<   edefd            Zeded         fd            Zed	edefd
            ZdS )TritonMLABackendsupported_dtypesautobfloat16supported_kv_cache_dtypesreturnc                      dS )N
TRITON_MLA r       }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/mla/triton_mla.pyget_namezTritonMLABackend.get_name$   s    |r   TritonMLAImplc                      t           S )N)r   r   r   r   get_impl_clszTritonMLABackend.get_impl_cls(   s    r   
capabilityc                     dS )NTr   )clsr   s     r   supports_compute_capabilityz,TritonMLABackend.supports_compute_capability,   s    tr   N)__name__
__module____qualname__torchfloat16r   r   r   listdtype__annotations__r   r   staticmethodstrr   typer   classmethodr
   boolr"   r   r   r   r   r      s         5:]EN4ShtEK01SSS=xZ(89   
 c    \ $/    \ 5E $    [  r   r   c                       e Zd ZU dZeed<   dededededee         dz  d	edz  d
e	dedz  de	de	dz  ddf fdZ
	 d fd	Zdej        eej        ej        f         z  dej        dededeej        ej        dz  f         f
dZ xZS )r   Tcan_return_lse_for_decode	num_heads	head_sizescalenum_kv_headsalibi_slopesNsliding_windowkv_cache_dtypelogits_soft_cap	attn_typekv_sharing_target_layer_namer   c                     t                      j        |||||||||	|
f
i | |||g}t          |          rt          d          |	t          j        k    rt          d          t          | j                  rt          d          d S )NzbTritonMLAImpl does not support one of the following: alibi_slopes, sliding_window, logits_soft_capz`Encoder self-attention and encoder/decoder cross-attention are not implemented for TritonMLAImplz0TritonMLA V1 with FP8 KV cache not yet supported)super__init__anyNotImplementedErrorr   DECODERr   r8   )selfr2   r3   r4   r5   r6   r7   r8   r9   r:   r;   mla_argsunsupported_features	__class__s                r   r>   zTritonMLAImpl.__init__4   s     	(	
 	
 	
 	
 	
 !-noN#$$ 	%@  
 ---%    !!455 	%B  	 	r   Fc                 B     t                      j        |||f||d|S )N)return_softmax_lsesoftmax_scale)r=    _flash_attn_varlen_diff_headdims)rB   qkvrG   rH   kwargsrE   s          r   rI   z.TritonMLAImpl._flash_attn_varlen_diff_headdimse   sE     8uww7
  2'
 
 
 
 	
r   rJ   kv_c_and_k_pe_cacheattn_metadatalayerc                 \   |                                 dk    sJ |j        J | j                            d          rt	          d          t          |          t          u rt          j        |d          }t          |t          j
                  sJ |j        d         }|j        d         }t          j        ||| j        |j        |j                  }t          j        |||j        |j                  }t!                      rdnd}	t          j        |||	| j        dz   ft          j        |j                  }
|                    d	          }|d
d | j        f         }|                    d          }t+          ||||||j        j        |j        j        |
|	| j        |           ||fS )Nr   fp8z FP8 Triton MLA not yet supported)dim   )r)   device      .)numeldecoder8   
startswithr@   r-   tupler&   cat
isinstanceTensorshapezeroskv_lora_rankr)   rV   r	   emptyfloat32	unsqueezesizer   block_tableseq_lensr4   )rB   rJ   rN   rO   rP   Bq_num_headsolsenum_kv_splitsattn_logits
kv_c_cache	PAGE_SIZEs                r   _forward_decodezTritonMLAImpl._forward_decodeq   s    #((**Q....#///))%00 	J%&HIII77e	!$$$A!U\*****GAJgajK{D-QWQX
 
 
 k![III 566=A k !A% -8
 
 
 2;;A>>(.A0A.A)AB
',,Q//	 	 , )J	
 	
 	
 #vr   )FN)r#   r$   r%   r1   r/   r*   intfloatr(   r,   r>   rI   r&   r_   r\   r   r   rq   __classcell__)rE   s   @r   r   r   1   se        &*t***// / 	/
 / 5kD(/ d
/ / / / '*Dj/ 
/ / / / / /d @D

 

 

 

 

 

=<%el :;;= #\= )	=
 = 
u|U\D00	1= = = = = = = =r   r   )typingr   r&   vllm.config.cacher   vllm.loggerr   2vllm.model_executor.layers.attention.mla_attentionr   r   r   *vllm.model_executor.layers.batch_invariantr	   vllm.platforms.interfacer
   vllm.v1.attention.backendr   r   r   -vllm.v1.attention.ops.triton_decode_attentionr   r#   loggerr   r   r   r   r   <module>r~      so          ( ( ( ( ( ( # # # # # #         
      6 5 5 5 5 5         
 O N N N N N	X		    '   (} } } } }M"34 } } } } }r   