
    .`ii              	          d dl mZmZ d dlmZmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZmZ d dlZd dlZd dlmZ e	r*d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z"  G d de#e          Z$ G d d          Z% G d de          Z& G d d          Z' ede'          Z(e G d d                      Z) ed          Z* G d de          Z+ G d deee*                   Z, G d d e          Z- G d! d"eee(                   Z. G d# d$e.e(         ee(                   Z/d%e#d&e0fd'Z1d(e#d)e2e&         d*e2e,e*                  d&e2e&         fd+Z3d(e#d)e2e&         d,e4e#e
f         d&e2e&         fd-Z5dS ).    )ABCabstractmethod)	dataclassreplace)Enum)TYPE_CHECKINGAnyClassVarGenericProtocolTypeVarget_argsN)
deprecated)
VllmConfig)
CacheDType)ColumnParallelLinear)QuantKey)DeviceCapability)KVCacheLayoutType)AttentionSpecc                   (    e Zd ZdZdZ	 dZ	 dZ	 dZdS )AttentionTypezO
    Attention type.
    Use string to be compatible with `torch.compile`.
    decoderencoderencoder_onlyencoder_decoderN)__name__
__module____qualname____doc__DECODERENCODERENCODER_ONLYENCODER_DECODER     m/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backend.pyr   r      s9         
 G9GM!L9'ODDr&   r   c                   &    e Zd ZU eed<   defdZdS )
MultipleOfbasec                     || _         d S N)r*   )selfr*   s     r'   __init__zMultipleOf.__init__*   s    			r&   N)r   r   r   int__annotations__r.   r%   r&   r'   r)   r)   '   s:         
IIIS      r&   r)   c                   P   e Zd ZU dZdZeed<   ej        ej	        gZ
eeej                          ed<   ddgZeed                  ed<   d	Zeed
<   edeeez           fd            Zeedefd                        Zeeded         fd                        Zeed                         Zee	 d7dedededededeedf         fd                        Ze	 d8dedeedf         fd            Zedeeef         fd            Zedee         fd            Zededefd            Z edej        defd            Z!edd defd!            Z"eded"z  defd#            Z#edefd$            Z$edefd%            Z%edefd&            Z&edefd'            Z'edefd(            Z(ed)edefd*            Z)ed+d,defd-            Z*ededej        dd ded.ed/ed0ed1d,ded"z  fd2            Z+ededej        dd ded.ed/ed0ed3ed1d,d)edee         fd4            Z,ed9d6            Z-d"S ):AttentionBackendz&Abstract class for attention backends.Faccept_output_buffersupported_dtypesautobfloat16r   supported_kv_cache_dtypesT forward_includes_kv_cache_updatereturnc                  "    t          d          gS N   )r)   r%   r&   r'    get_supported_kernel_block_sizesz1AttentionBackend.get_supported_kernel_block_sizes;   s    1r&   c                      t           r,   NotImplementedErrorr%   r&   r'   get_namezAttentionBackend.get_name?   
     "!r&   AttentionImplc                      t           r,   r?   r%   r&   r'   get_impl_clszAttentionBackend.get_impl_clsD   rB   r&   c                      t           r,   r?   r%   r&   r'   get_builder_clsz AttentionBackend.get_builder_clsI   rB   r&   
num_blocks
block_sizenum_kv_heads	head_sizecache_dtype_str.c                     t           r,   r?   )rH   rI   rJ   rK   rL   s        r'   get_kv_cache_shapez#AttentionBackend.get_kv_cache_shapeN   s
     "!r&   include_num_layers_dimensionc                     t           )av  
        Get the physical (memory layout) ordering of the kv cache dimensions.
        e.g. if the KV cache shape is
        [2, num_blocks, block_size, num_heads, head_size],
        and get_kv_cache_stride_order returns (1, 3, 0, 2, 4) then the physical
        ordering of dimensions is
        [num_blocks, num_heads, 2, block_size, head_size].

        If this function is unimplemented / raises NotImplementedError,
        the physical layout of the KV cache will match the logical shape.

        Args:
            include_num_layers_dimension: if True, includes an additional
                num_layers dimension, which is assumed to be prepended
                to the logical KV cache shape.
                With the above example, a return value (2, 4, 0, 1, 3, 5)
                corresponds to
                [num_blocks, num_heads, num_layers, 2, block_size, head_size].

                If an additional dimension is NOT included in the returned
                tuple, the physical layout will not include a layers dimension.

        Returns:
            A tuple of ints which is a permutation of range(len(shape)).
        r?   )rO   s    r'   get_kv_cache_stride_orderz*AttentionBackend.get_kv_cache_stride_orderY   s
    : "!r&   c                     | j         | j        fS r,   )r   r   clss    r'   full_cls_namezAttentionBackend.full_cls_namex   s     011r&   c                     g S r,   r%   rS   s    r'   get_supported_head_sizesz)AttentionBackend.get_supported_head_sizes|   s    	r&   c                 8    |                                  }| p||v S r,   )rW   )rT   rK   supported_head_sizess      r'   supports_head_sizez#AttentionBackend.supports_head_size   s(    ";;==((NY:N-NNr&   dtypec                     || j         v S r,   )r4   )rT   r[   s     r'   supports_dtypezAttentionBackend.supports_dtype   s    ,,,r&   kv_cache_dtypezCacheDType | Nonec                 ,    |dS | j          p|| j         v S NT)r7   )rT   r^   s     r'   supports_kv_cache_dtypez(AttentionBackend.supports_kv_cache_dtype   s+    !411 
c;;	
r&   Nc                     ddl m} |dS t          |          }||vrdS |                                 }|sdS |D ]*}t	          |t
                    r|j        }||z  dk    r dS +dS )Nr   )	BlockSizeTF)vllm.config.cacherc   r   r=   
isinstancer)   r*   )rT   rI   rc   valid_sizessupported_kernel_block_sizessupported_sizes         r'   supports_block_sizez$AttentionBackend.supports_block_size   s    //////4y))[((5'*'K'K'M'M$+ 	4: 	 	N.*55 5!/!4 N*a//tt 0ur&   c                     dS NFr%   rS   s    r'   is_mlazAttentionBackend.is_mla       ur&   c                     dS rk   r%   rS   s    r'   supports_sinkzAttentionBackend.supports_sink   rm   r&   c                     dS rk   r%   rS   s    r'   supports_alibi_sqrtz$AttentionBackend.supports_alibi_sqrt   rm   r&   c                     dS rk   r%   rS   s    r'   supports_mm_prefixz#AttentionBackend.supports_mm_prefix   rm   r&   c                     dS rk   r%   rS   s    r'   	is_sparsezAttentionBackend.is_sparse   rm   r&   	attn_typec                 "    |t           j        k    S )zCheck if backend supports a given attention type.

        By default, only supports decoder attention.
        Backends should override this to support other attention types.
        )r   r!   )rT   rv   s     r'   supports_attn_typez#AttentionBackend.supports_attn_type   s     M111r&   
capabilityr   c                     dS r`   r%   )rT   ry   s     r'   supports_compute_capabilityz,AttentionBackend.supports_compute_capability       tr&   use_mlahas_sink
use_sparsedevice_capabilityc	                     d S r,   r%   )	rT   rK   r[   r^   rI   r}   r~   r   r   s	            r'   supports_combinationz%AttentionBackend.supports_combination   s	     tr&   use_mm_prefixc           
      0   g }|                      |          s|                    d           |                     |          s|                    d           |                     |          s|                    d           |                     |          s|                    d           |r)|                                 s|                    d           ||                                 k    r-|r|                    d           n|                    d           |r)|                                 s|                    d           ||                                 k    r-|r|                    d	           n|                    d
           | 	                    |	          s|                    d           | 
                    |
          s|                    d|
 d           |                     ||||||||	          }||                    |           |S )Nzhead_size not supportedzdtype not supportedzkv_cache_dtype not supportedzblock_size not supportedz5partial multimodal token full attention not supportedzMLA not supportedznon-MLA not supportedzsink setting not supportedzsparse not supportedznon-sparse not supportedz compute capability not supportedzattention type z not supported)rZ   appendr]   ra   ri   rs   rl   ro   ru   r{   rx   r   )rT   rK   r[   r^   rI   r}   r~   r   r   r   rv   invalid_reasonscombination_reasons                r'   validate_configurationz'AttentionBackend.validate_configuration   sg    %%i00 	>""#<===!!%(( 	:""#8999**>:: 	C""#ABBB&&z22 	?""#=>>> 	!7!7!9!9 	""G   cjjll"" @&&':;;;;&&'>??? 	AC--// 	A""#?@@@(( C&&'=>>>>&&'ABBB../@AA 	G""#EFFF%%i00 	P""#NY#N#N#NOOO 55	
 	
 )""#5666r&   KVCacheLayoutType | Nonec                     d S r,   r%   rS   s    r'   get_required_kv_cache_layoutz-AttentionBackend.get_required_kv_cache_layout  r|   r&   )r5   F)r9   r   ).r   r   r   r    r3   boolr0   torchfloat16r6   r4   r
   listr[   r7   r8   staticmethodr/   r)   r=   r   strrA   typerE   rG   tuplerN   rQ   classmethodrU   rW   rZ   r]   ra   ri   rl   ro   rq   rs   ru   rx   r{   r   r   r   r%   r&   r'   r2   r2   .   sg        00
 "'$&&&5:]EN4ShtEK01SSS?Ez>Rx\(:;RRR .2$d111d33C.D    \ "c " " " ^ \" "$/ " " " ^ \" " " ^ \"   &" """ " 	"
 " 
sCx" " " ^ \" -2" "&*"	sCx" " " \"< 2eCHo 2 2 2 [2 c    [ O3 O4 O O O [O -5; -4 - - - [- 
5H 
T 
 
 
 [
 S4Z D    [0 t    [ d    [ D    [ 4    [ $    [ 23 24 2 2 2 [2 5G D    [  { ,	
     . 
t   [ 66 {6 ,	6
 6 6 6 6 6 .6 6 
c6 6 6 [6p    [  r&   r2   c                       e Zd ZdS )AttentionMetadataN)r   r   r   r%   r&   r'   r   r     s        Dr&   r   T)boundc                      e Zd ZU dZej        ed<   ej        ed<   	 ej        ed<   	 eed<   	 eed<   	 eed<   	 eed<   	 ej        ed	<   ej        ed
<   dZe	ed<   dZ
ej        dz  ed<   dZedz  ed<   dZej        dz  ed<   dZej        dz  ed<   dZej        dz  ed<   dZej        dz  ed<   	 dZej        dz  ed<   dZej        dz  ed<   dZej        dz  ed<   defdZdej        fdZd"dZe ed          dej        fd                        Ze ed          dej        fd                        Zdej        fdZded edd fd!ZdS )#CommonAttentionMetadataz
    Per-batch attention metadata, shared across layers and backends.
    AttentionMetadataBuilder instances use it to construct per-layer metadata.

    For many of the tensors we keep both GPU and CPU versions.
    query_start_locquery_start_loc_cpuseq_lensnum_reqsnum_actual_tokensmax_query_lenmax_seq_lenblock_table_tensorslot_mappingTcausalNlogits_indices_paddednum_logits_indicesencoder_seq_lensencoder_seq_lens_cpudcp_local_seq_lensdcp_local_seq_lens_cpu_seq_lens_cpu_num_computed_tokens_cpu_num_computed_tokens_cacher9   c                 &    | j         j        d         S )Nr   )r   shaper-   s    r'   
batch_sizez"CommonAttentionMetadata.batch_sizeO  s    }"1%%r&   c                 @    | j         dd         | j         dd         z
  S )zENaive because it assumes that query ends where the next query starts.r<   N)r   r   s    r'   naive_query_lensz(CommonAttentionMetadata.naive_query_lensR  s%    #ABB'$*>ss*CCCr&   c                     t          | fi |S r,   )r   )r-   kwargss     r'   r   zCommonAttentionMetadata.replaceV  s    t&&v&&&r&   z
    Prefer using device seq_lens directly to avoid implicit H<>D sync.
    If a CPU copy is needed, use `seq_lens.cpu()` instead.
    Will be removed in a future release (v0.15.0)
    c                 \    | j         | j                            d          | _         | j         S )Ncpu)r   r   tor   s    r'   seq_lens_cpuz$CommonAttentionMetadata.seq_lens_cpuY  s.     %!%!1!1%!8!8D!!r&   z
    Prefer using device seq_lens directly to avoid implicit H<>D sync which breaks full
    async scheduling. If a CPU copy is needed, it can be derived from 
    query_start_loc_cpu and seq_lens.
    Will be removed in a future release (v0.15.0)
    c                 z    | j         .| j        dd          | j        d d         z
  }| j        |z
  | _         | j         S )Nr<   r   )r   r   r   )r-   query_seq_lenss     r'   num_computed_tokens_cpuz/CommonAttentionMetadata.num_computed_tokens_cpuf  sM     (0(,t/G/LL  -1,=,ND),,r&   c                 z    | j         .| j        dd         | j        dd         z
  }| j        |z
  | _         | j         S )z>Compute num_computed_tokens on device (seq_lens - query_lens).Nr<   r   )r   r   r   )r-   
query_lenss     r'   compute_num_computed_tokensz3CommonAttentionMetadata.compute_num_computed_tokensw  sE    *2-abb1D4H"4MMJ.2mj.HD+..r&   num_actual_reqsc           	         fd}t          di d| j        d dz            d| j        d dz            d| j        d          d| j        | j        d          nd d| j        | j        d          nd dd	|d
| j        d| j        d| j        d          d| j	        d |         d| j
        d| j        d| j        d || j                  d || j                  d || j                  d || j                  S )Nc                      | 
| d          nd S r,   r%   )xr   s    r'   <lambda>z2CommonAttentionMetadata.unpadded.<locals>.<lambda>  s    AMQ'7'7%8%8t r&   r   r<   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r%   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r-   r   r   maybe_slice_reqss     ` r'   unpaddedz CommonAttentionMetadata.unpadded  s    TSSS& 
 
 
 01F?Q3F1FGG
 $ 89N?Q;N9N O O
 ]#3O#344

 !- ,-=o-=>>
 ,8 &*%BCSOCS%T%T
 %_
 0/
 ,,
 ((
  $67G7GHH
 *+=,=+=>>
  ;;!
" #'"<"<#
$  $66%
& .-d.CDDD'
( "2!1$2K!L!L!L)
*  0/0GHHH+
, $4#3D4O#P#P#P-
 	
r&   )r9   r   )r   r   r   r    r   Tensorr0   r/   r   r   r   r   r   r   npndarrayr   r   r   r   r   r   r   r   propertyr   r   r   r   r   r%   r&   r'   r   r     s          \!!!%%%OlGMMM) 8$$$,FD 265<$.555%)d
))) -1elT)000.2"*t+222.2t+22226EL4/666P *.M5<$&---48elT18886:t 3:::&C & & & &D%, D D D D' ' ' ' Z	 "el " " "  X"
 Z	 - - - -  X-/U\ / / / /
!$
7:
	"
 
 
 
 
 
r&   r   Mc                   (    e Zd ZdZdZ	 dZ	 dZ	 dZdS )AttentionCGSupportzConstants for the cudagraph support of the attention backend
    Here we do not consider the cascade attention, as currently
    it is never cudagraph supported.      r<   r   N)r   r   r   r    ALWAYSUNIFORM_BATCHUNIFORM_SINGLE_TOKEN_DECODENEVERr%   r&   r'   r   r     s>        ( ( FCM9 #$OEr&   r   c                      e Zd ZU ej        Zee         ed<   dZe	dz  ed<   dZ
eed<   edddee         d	d
dej        fd            Zeded          d	d
dddefd            Z	 	 	 d(de	dz  dededdfdZe	 d)de	dededefd            Zdedej        dej        defdZdedefdZdede	defdZde	dej        d e	d!e	d"ed#ed$ed%e	d&e	defd'ZdS )*AttentionMetadataBuilder_cudagraph_supportNreorder_batch_thresholdFsupports_update_block_tablekv_cache_specr   layer_namesvllm_configr   devicec                 >    || _         || _        || _        || _        d S r,   )r   r   r   r   )r-   r   r   r   r   s        r'   r.   z!AttentionMetadataBuilder.__init__  s'     +&&r&   rT   r9   c                     | j         S )z6Get the cudagraph support level of this builder class.)r   )rT   r   r   s      r'   get_cudagraph_supportz.AttentionMetadataBuilder.get_cudagraph_support  s     %%r&   r<   supports_spec_as_decodesupports_dcp_with_varlenc                     || _         | j         9|r7| j        j        }|)|j        "t	          | j         d|j        z             | _         | j        j        j        dk    r|sd| _         d S d S d S r;   )r   r   speculative_confignum_speculative_tokensmaxparallel_configdecode_context_parallel_size)r-   r   r   r   r   s        r'   _init_reorder_batch_thresholdz6AttentionMetadataBuilder._init_reorder_batch_threshold  s     (?$'38O3 "&!1!D".&=I/20*AA0 0, ,IAMM, N ,-D((( NMMMr&   common_prefix_lencommon_attn_metadata
fast_buildc                     t           )a  
        Central method that builds attention metadata.
        Some builders (MLA) require reorder_batch to be called prior to build.

        Args:
            common_prefix_len: The length of the common prefix of the batch.
            common_attn_metadata: The common attention metadata.
            fast_build: The meta-data will prioritize speed of building over
                then speed at execution. Can be used for spec-decode where the
                result of a build call may only be used for few layers/iters.
        r?   )r-   r   r   r   s       r'   buildzAttentionMetadataBuilder.build  s
    $ "!r&   metadata	blk_tabler   c                     t           )a  
        Update the block table for the attention metadata.
        Faster when theres multiple kv-cache groups that create virtually the
        same metadata but just with different block tables.

        Only needs to be implemented if supports_update_block_table is True.
        r?   )r-   r   r   r   s       r'   update_block_tablez+AttentionMetadataBuilder.update_block_table  
     "!r&   c                 0    |                      d|          S )z
        Build attention metadata for CUDA graph capture. Uses build by default.
        Subclasses that override this method should call self.build or
        super().build_for_cudagraph_capture.
        r   )r   r   r   )r-   r   s     r'   build_for_cudagraph_capturez4AttentionMetadataBuilder.build_for_cudagraph_capture  s%     zz6J  
 
 	
r&   draft_indexc                 2    |                      d|d          S )a  
        Build attention metadata for draft model. Uses build by default.

        Args:
            common_attn_metadata: The common attention metadata.
            draft_index: The index of the current draft operation.
                When speculating a chain of tokens, this index refers to the
                draft attempt for the i-th token.
                For tree-based attention, this index instead refers to the
                draft attempt for the i-th level in the tree of tokens.
        r   T)r   r   r   r   )r-   r   r   s      r'   build_for_draftingz+AttentionMetadataBuilder.build_for_drafting  s(      zz!5  
 
 	
r&   r   num_query_headsrJ   	use_alibiuse_sliding_windowuse_local_attentionnum_smsdcp_world_sizec
                     dS rk   r%   )
r-   r   r   r   rJ   r   r   r   r   r   s
             r'   use_cascade_attentionz.AttentionMetadataBuilder.use_cascade_attention3  s	     ur&   )r<   FFr   ) r   r   r   r   r   r   r
   r0   r   r/   r   r   r   r   r   r   r   r.   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r%   r&   r'   r   r     s         8J7O!34OOO +/S4Z... ).---
&
 #Y
 "	

 
 
 
 ^
 &,-&!& '& 
	& & & [& /0(-).	- -!$t- "&- #'	-
 
- - - -6 
 !	" "" 6" 	"
 
" " " ^"&"" <" l	"
 
" " " "

$;

	


 

 

 


5
 
 
	
 
 
 
, J 	
   ! "   
     r&   r   c                       e Zd ZU ej        ed<   ej        ed<   ej        ed<   eed<   eed<   eed<   ej        ed<   dej        d	ej        d
ej        dej        dedej        fdZdS )AttentionLayer_q_scale_k_scale_v_scale_q_scale_float_k_scale_float_v_scale_float_prob_scalequerykeyvaluekv_cacheattn_metadatar9   c                     d S r,   r%   )r-   r  r  r  r  r  s         r'   forwardzAttentionLayer.forwardK  s	     sr&   N)	r   r   r   r   r   r0   floatr   r  r%   r&   r'   r  r  B  s         lll| \ |	
 , ) 
     r&   r  c                       e Zd ZU eed<   eed<   eed<   dZeed<   dZeed<   dZ	eed<   dZ
eed<   dZeed	<   dZeed
<   eed<   eed<   eed<   eed<   eed<   eed<    fdZedddddej        dfdededededz  dee         dz  dedz  dededz  dededz  ddfd            Ze	 	 	 d,dedej        dej        d ej        d!ej        d"ed#ej        dz  d$ej        dz  d%ej        dz  dej        fd&            Zd-d)Zd*ej        fd+Z xZS ).rC   	num_headsrK   scaleFcan_return_lse_for_decodesupports_pcp0supports_mtp_with_cp_non_trivial_interleave_sizeneed_to_return_lse_for_decodesupports_quant_query_inputsupports_per_head_quant_scalesr   dcp_rankpcp_world_sizepcp_ranktotal_cp_world_sizetotal_cp_rankc                    t                                          |           }	 ddlm}  |            j        |_         |            j        |_        n# t          $ r d|_        d|_        Y nw xY w	 ddlm	}  |            j        |_
         |            j        |_        n# t          $ r d|_
        d|_        Y nw xY w|j
        |j        z  |_        |j        |j        z  |j        z   |_        |j        dk    o|j        |_        |S )Nr   )get_dcp_groupr<   )get_pcp_group)super__new__vllm.distributed.parallel_stater#  
world_sizer   rank_in_groupr  AssertionErrorr$  r  r  r   r!  r  r  )rT   argsr   r-   r#  r$  	__class__s         r'   r&  zAttentionImpl.__new__}  s:   wws##	EEEEEE"/-//"<D)MOO9DMM 	 	 	"#DDMMM		EEEEEE"/-//"<D)MOO9DMM 	 	 	"#DDMMM	 $(#69L#L !]T-@@4=P !#F(F 	* s#   .A A.-A.2.B! !B<;B<Nr5   rJ   alibi_slopessliding_windowr^   logits_soft_caprv   kv_sharing_target_layer_namer9   c                     t           r,   r?   )r-   r  rK   r  rJ   r-  r.  r^   r/  rv   r0  s              r'   r.   zAttentionImpl.__init__  s
     "!r&   layerr  r  r  r  r  outputoutput_scaleoutput_block_scalec
                     t           r,   r?   )
r-   r2  r  r  r  r  r  r3  r4  r5  s
             r'   r  zAttentionImpl.forward  r   r&   	quant_keyr   c                     dS )ab  
        Does this attention implementation support fused output quantization.
        This is used by the AttnFusionPass to only fuse output quantization
        onto implementations that support it.

        :param quant_key: QuantKey object that describes the quantization op
        :return: is fusion supported for this type of quantization
        Fr%   )r-   r7  s     r'   fused_output_quant_supportedz*AttentionImpl.fused_output_quant_supported  s	     ur&   	act_dtypec                     d S r,   r%   )r-   r:  s     r'   process_weights_after_loadingz+AttentionImpl.process_weights_after_loading  s    r&   NNN)r7  r   )r   r   r   r/   r0   r  r  r   r  r  r  r  r  r&  r   r   r!   r   r   r.   r  r   r   r   r  r9  r[   r<  __classcell__)r,  s   @r'   rC   rC   U  s        NNNNNNLLL ',t+++ L$ >C4dBBB +0!4/// (-,,,+0"D000MMMMMM    8  $(+/%)$(,&.37" "" " 	"
 Dj" 5kD(" d
" " " " '*Dj" 
" " " ^"  '+,026" "" |" \	"
 |" ," " t#" lT)" "L4/" 
" " " ^"	 	 	 	u{        r&   rC   c            (       P   e Zd Ze	 d!dededededee         dz  dedz  ded	edz  d
ededz  dedz  dedededededddedz  ddf&d            Z	e	 	 	 d"de
dej        dej        dej        dej        dedej        dz  dej        dz  dej        dz  dej        fd             ZdS )#MLAAttentionImplNr  rK   r  rJ   r-  r.  r^   r/  rv   r0  q_lora_rankkv_lora_rankqk_nope_head_dimqk_rope_head_dimqk_head_dim
v_head_dim	kv_b_projr   indexerr9   c                     t           r,   r?   )r-   r  rK   r  rJ   r-  r.  r^   r/  rv   r0  rA  rB  rC  rD  rE  rF  rG  rH  s                      r'   r.   zMLAAttentionImpl.__init__  s
    . "!r&   r2  hidden_states_or_cqkv_c_normedk_per  r  r3  r4  r5  c
                     t           r,   r?   )
r-   r2  rJ  rK  rL  r  r  r3  r4  r5  s
             r'   r  zMLAAttentionImpl.forward  r   r&   r,   r=  )r   r   r   r   r/   r  r   r   objectr.   r  r   r   r   r  r%   r&   r'   r@  r@    s       * "&)" "" " 	"
 " 5kD(" d
" " " " '*Dj" 4Z" " "  !"" #"$ %"& *'"( $)"* 
+" " " ^"0  '+,026" "" #\" \	"
 l" ," " t#" lT)" "L4/" 
" " " ^" " "r&   r@  r^   r9   c                 ,    |                      d          S )Nfp8)
startswith)r^   s    r'   is_quantized_kv_cacherR    s    $$U+++r&   name_prefixattention_backend_clsbuilder_clsc                 F    | |j         z   }t          ||fdfdi          S )zN
    Return a new subclass where `get_builder_cls` returns `builder_cls`.
    rG   c                       S r,   r%   )rU  s   r'   r   z,subclass_attention_backend.<locals>.<lambda>   s    K r&   r   r   )rS  rT  rU  names     ` r'   subclass_attention_backendrZ    s@     3<<D$&):<O<O<O<O(P  r&   	overridesc                 :    | |j         z   }t          ||f|          S r,   rX  )rS  rT  r[  rY  s       r'   )subclass_attention_backend_with_overridesr]    s(    
 3<<D,.	:::r&   )6abcr   r   dataclassesr   r   enumr   typingr   r	   r
   r   r   r   r   numpyr   r   typing_extensionsr   vllm.configr   rd   r   !vllm.model_executor.layers.linearr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.platforms.interfacer    vllm.v1.attention.backends.utilsr   vllm.v1.kv_cache_interfacer   r   r   r)   r2   r   r   r   r   r   r   r  rC   r@  r   rR  r   rZ  dictr]  r%   r&   r'   <module>rk     s	   $ # # # # # # # * * * * * * * *       U U U U U U U U U U U U U U U U U U      ( ( ( ( ( ( 9&&&&&&,,,,,,FFFFFFRRRRRR999999BBBBBB888888E E E E EC E E E        f f f f fs f f fR	 	 	 	 	 	 	 	 GC())) {
 {
 {
 {
 {
 {
 {
 {
| GCLL       "N N N N NsGAJ N N Nb    X   &o o o o oC o o od'" '" '" '" '"}Q' '" '" '"T,# ,$ , , , , 01 .q12 

	   ;; 01; CH~; 

	; ; ; ; ; ;r&   