
    -`i                     V   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8  ee9          Z:de dz  de;fdZ<dIdej=        de;ddfdZ>dej=        dedz  d e?ddfd!Z@ G d" d#ej=        e          ZA G d$ d%ej=        e          ZBd&ejC        d'ejC        d(ejC        d)e?ddf
d*ZDd&ejC        d'ejC        d(ejC        d)e?ddf
d+ZE e*d,eDg d-eE.           d)e?deFeGeHz  dz  eAeBz  ejC        f         fd/ZIed&ejC        d'ejC        d(ejC        d)e?dejC        f
d0            ZJd&ejC        d'ejC        d(ejC        d)e?dejC        f
d1ZK e*d2eJeK3           d'ejC        d(ejC        d)e?dejC        fd4ZLd'ejC        d(ejC        d)e?dejC        fd5ZM e*d6eLeMg 7           e	 	 	 dJd&ejC        d'ejC        d(ejC        d8ejC        d)e?d9ejC        dz  d:ejC        dz  d;ejC        dz  ddfd<            ZN	 	 	 dJd&ejC        d'ejC        d(ejC        d8ejC        d)e?d9ejC        dz  d:ejC        dz  d;ejC        dz  ddfd=ZO e*d>eNd8d:geO.           ed?ejC        d@ejC        dAejC        d)e?dejC        f
dB            ZPd?ejC        d@ejC        dAejC        d)e?dejC        f
dCZQ e*dDePg eQe(jR        E           e	 	 dKd?ejC        d@ejC        dAejC        d8ejC        d)e?d9ejC        dz  d:ejC        dz  ddfdF            ZS	 	 dKd?ejC        d@ejC        dAejC        d8ejC        d)e?d9ejC        dz  d:ejC        dz  ddfdGZT e*dHeSd8d:geTe(jR        E           dS )LzAttention layer.    )castN)validate_kv_sharing_target)maybe_transfer_kv_layer)CacheConfigget_current_vllm_config)
VllmConfig)ForwardContextget_forward_context)init_logger)AttentionLayerBase)vllm_is_batch_invariant)ColumnParallelLinearUnquantizedLinearMethod)QuantizationConfig)QuantizeMethodBase)QuantFP8)BaseKVCacheMethod)
GroupShape)current_platform)direct_register_custom_opkv_cache_dtype_str_to_dtype)AttentionBackendAttentionTypeMLAAttentionImpl)AttentionBackendEnum)get_attn_backend)FullAttentionSpecKVCacheSpecMLAAttentionSpecSlidingWindowSpecquant_methodreturnc                 6    | duot          | t                     S )zFReturns whether the quantization method should load quantized weights.N)
isinstancer   )r!   s    h/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/attention/layer.pyshould_load_quant_weightsr&   2   s+    t# J-- - )     Flayerregister_bufferc                    |r|                      dt          j        dt          j                             |                      dt          j        dt          j                             |                      dt          j        dt          j                             |                      dt          j        dt          j                             nh| j                            d           | j                            d           | j                            d           | j                            d           d| _	        d| _
        d| _        d| _        t          j        t          j        t          j                  | _        t          j        t          j        t          j                  | _        t          j        t          j        t          j                  | _        dS )z/Sets default quantization scales for the layer._k_scale      ?dtype_v_scale_q_scale_prob_scaleN)r)   torchtensorfloat32r+   fill_r/   r0   r1   _q_scale_float_k_scale_float_v_scale_float_prob_scale_floatenvsQ_SCALE_CONSTANTq_rangeK_SCALE_CONSTANTk_rangeV_SCALE_CONSTANTv_range)r(   r)   s     r%   set_default_quant_scalesrA   9   s{    	%j%,s%-*P*P*PQQQj%,s%-*P*P*PQQQj%,s%-*P*P*PQQQmU\#U]-S-S-STTTTS!!!S!!!S!!!$$$
 EEE!E L!6emLLLEML!6emLLLEML!6emLLLEMMMr'   quant_configprefixc                 h   |r|                     | |          nd}t          | d           d| _        |r|                     | |          nd}t          |          rTt	          |t
                    sJ | j        dk    rt          d          || _        | j        	                    |            dS dS )a  Initializes KV cache scaling factors and quantization method.

    This helper function sets up the KV cache quantization attributes that are
    shared between Attention and MLAAttention layers. It initializes scale
    tensors for query, key, value, and probability, and configures the
    quantization method if applicable.

    Args:
        layer: The attention layer instance to initialize.
        quant_config: Optional quantization configuration.
        prefix: Layer name prefix for quantization method lookup.
    rC   NTr)   fp8_e5m2z8fp8_e5m2 kv-cache is not supported with fp8 checkpoints.)
get_quant_methodrA   _o_scale_floatr&   r$   r   kv_cache_dtype
ValueErrorr!   create_weights)r(   rB   rC   r!   s       r%   _init_kv_cache_quantrM   T   s    $ @LU%%eF%;;;QU " UD9999  E @LU%%eF%;;;QU 
 !.. 1,(9::::: :--WXXX
 *))%000001 1r'   c            !           e Zd ZdZddddddddej        dddfdededededz  dee         dz  d	e	dz  d
e
dz  dedz  dedz  dedz  dedededz  dee         dz  dedz  ddf  fdZ	 d!dej        dej        dej        dej        dz  dej        f
dZd ZdefdZdej        fdZdee         fdZdedefd Z xZS )"	Attentionac  Attention layer.

    This class takes query, key, and value tensors as input. The input tensors
    can either contain prompt tokens or generation tokens.
    The class does the following:

    1. Store the input key and value tensors in the KV cache.
    2. Perform (multi-head/multi-query/grouped-query) attention.
    3. Return the output tensor.
    N 	num_heads	head_sizescalenum_kv_headsalibi_slopesuse_alibi_sqrtcache_configrB   logits_soft_capper_layer_sliding_windowrC   	attn_typekv_sharing_target_layer_nameattn_backendhead_size_vr"   c                 <   t                                                       |
|
}n||j        }nd}t                      }||j        }|j        }|j        }nd}d}d}t          |dd          d}d}|d|_        d|_        t          ||j	                  | _
        || _        || _        ||}||z  dk    sJ d| d	| d
            || _        || _        || _        || _        || j        n|| _        || _        || _        |                    d          du| _        |j	        }|duo|j        | _        t-          j                    }|'t1          ||||d| j        | j        |          | _        n|| _        | j                                        }|r|nd}|r,|s*t7          d| j                                         d          t;          |          | _        |r
| j        |d<   |r|j        rktA                      r]| j                                        dk    s| j                                        dk    r#tB          "                    dd           d|_        | j        #                                } |||||||||	||f
i || _$        tJ          | j                                                 | _&        || _'        tQ          j)                     | _*        | j        j+        | _,        |j-        }||j.        v rt7          d|           | |j.        |<   || _/        |ta          |||j.                   || _1        d te          |j3        j4                  D             | _5        tm          | ||           d| _7        | j$        j8        r| j        9                    d          rtu          | d          o!| j;        <                                | j        k    }| j        | j        z  | j        z  }t{          d|rt}          d|          nt|          j?                  | _7        dS dS dS )zg
        The KV cache is stored inside this class and is accessed via
        `self.kv_cache`.
        Nauto   Fkv_cache_schemefp8r   znum_heads (z$) is not divisible by num_kv_heads ()sinks)use_mlahas_sinkuse_mm_prefixrZ   z+use_alibi_sqrt is not supported by backend .rV   
FLASHINFER
TRITON_MLAzeDisabling prefix caching for FLASHINFER/TRITON_MLA with batch invariance, as it is not yet supported.localscopeDuplicate layer name: c                 6    g | ]}t          j        g           S  r2   r3   .0_s     r%   
<listcomp>z&Attention.__init__.<locals>.<listcomp>;  s2     
 
 
 L
 
 
r'   q_scaleT)staticgroup_shape)@super__init__sliding_windowr   cache_dtype
block_sizecalculate_kv_scalesgetattrr   model_configkv_cache_torch_dtyperJ   rB   
layer_namerQ   rR   r]   rT   getrf   is_mm_prefix_lmrg   r2   get_default_dtyper   r\   supports_alibi_sqrtrK   get_nameboolrV   enable_prefix_cachingr   loggerwarning_onceget_impl_clsimplr   backendr.   r   opaque_attention_opuse_direct_callaccept_output_buffer
use_outputcompilation_configstatic_forward_contextrZ   r   r[   rangeparallel_configpipeline_parallel_sizekv_cacherM   query_quantsupports_quant_query_input
startswithhasattrrv   numelr   r   
PER_TENSOR)selfrQ   rR   rS   rT   rU   rV   rW   rB   rX   rY   rC   rZ   r[   r\   r]   extra_impl_argsr|   vllm_configrJ   r~   r   r   r.   backend_supports_alibi_sqrtimpl_clsr   is_per_head	__class__s                               r%   r{   zAttention.__init__   s&   . 	#/5NN%)8NN!N-//#)5N%0J"."B#NJ"' <!2D99E"N"''+0(380$?K4%
 %
! -#6 $L<'1,,,X)XXXXX -,, ) ""-8-@4>>k(,'++G44D@ #/)5V,:V ')) 0"0#	! 	! 	!D !-D&*&7&K&K&M&M#+9Du 	"= 	3$--//3 3 3   #>22& 	D040CO,- $2 %')) % !**,,<<$--//<?? E     
 27L.$1133H(
 
 
 
	 ,D,=,F,F,H,HI
 $4#G#I#II+@(;'>>>>f>>???<@1&9"'3&,"9  
 -I)

 
;6MNN
 
 
 	T<888  9/ 	D4G4R4R5
 5
 	 i((VT\-?-?-A-ATEV-V  $.8D<MMJ'+Jr:666*	     D	 	 	 	r'   querykeyvalueoutput_shapec                 
   | j         r,t          j        j                            |||| j                   |j        }| j        5| j        dv sJ | j	        j
        r|                     || j                  \  }}| j        r|0|j        d         }t          j        || j        | j        z  f          }t          j        |||j                  }|d         }	|                    d| j        | j                  }|                    d| j        | j                  }|!|                    d| j        | j                  }|!|                    d| j        | j                  }| j        r?d}
| j        j        st3          ||| j                  }
t5          ||||| j        |
           nld}
| j        j        s/||+t          j        j                            ||| j                  }
t          j        j                            ||||| j        |
           |                    d|	          S | j        j        s
J d            | j        rt7          |||| j                  S t          j        j                            |||| j                  S )a_  
        The KV cache is stored inside this class and is accessed via
        `self.kv_cache`.

        Attention metadata (`attn_metadata`) is set using a context manager in
        the model runner's `execute_model` method. It is accessed via forward
        context using
        `vllm.forward_context.get_forward_context().attn_metadata`.
        N>   rb   fp8_e4m3r   r.   devicerw   )kv_cache_dummy_depzDSplit KV cache update not supported when output tensor not provided.)r   r2   opsvllmmaybe_calc_kv_scalesr   r.   r   rJ   r   r   r0   r   shapeSizerQ   r]   emptyr   viewrR   rT   r   r\    forward_includes_kv_cache_updateunified_kv_cache_updateunified_attention_with_outputunified_attention)r   r   r   r   r   output_dtypert   
num_tokensoutputhidden_sizer   s              r%   forwardzAttention.forwardS  s   & # 	TIN//sE4?SSS{' &*===== y3 B++E4=AAq? <	# #[^
$z$2B!BC    [\%,WWWF&r*K JJr4>4>BBE[[T^T5EFFFhhr4#4dnEE 

2t'8$:JKK# %)"(I )@UDO* *& .O'9     &*"(I Ou'8).)O)OUDO* *& 	<<O'9 =    ;;r;///$E  V E # (UDOLLLy~773t  r'   c                 l   | j                             t          j        |                                          | j        z             | j                            t          j        |                                          | j        z             | j                            t          j        |                                          | j	        z             | j         
                                | _        | j        
                                | _        | j        
                                | _        d| _        d S )NF)r0   copy_r2   absmaxr<   r+   r>   r/   r@   itemr6   r7   r8   r   )r   r   r   r   s       r%   calc_kv_scaleszAttention.calc_kv_scales  s    EIe,,0022T\ABBBEIcNN..004<?@@@EIe,,0022T\ABBB"m0022"m0022"m0022#(   r'   c                     d| j         j         }|d| j         j         z  }|d| j         j         z  }|d| j         j         z  }|d| j         j        j         z  }|S )Nz
head_size=z, num_heads=z, num_kv_heads=z, scale=z
, backend=)r   rR   rQ   rT   rS   r   __name__)r   ss     r%   
extra_reprzAttention.extra_repr  s{    .,..	1DI/111	7ty5777	)	)))	8$)-6888r'   	act_dtypec                     | j                             |           | j        r!| j                            | | j                  nd }t          |          st          | d           d S d S )NrE   FrF   )r   process_weights_after_loadingrB   rH   r   r&   rA   r   r   r!   s      r%   r   z'Attention.process_weights_after_loading  s    	//	:::  D..tDO.LLL 	
 )66 	B$T5AAAAAA	B 	Br'   c                     | j         S Nr\   r   s    r%   r   zAttention.get_attn_backend        r'   r   c                 "   |j         j        }| j        t          j        k    sJ | j        >|j        j        r
J d            t          || j	        | j
        | j        | j                  S t          || j	        | j
        | j        | j                  S )Nz&MLA is not supported for slidingwindow)r~   rT   rR   r.   r|   )r~   rT   rR   r]   r.   )rW   r~   rZ   r   DECODERr|   r   re   r    rT   rR   r   r   r]   )r   r   r~   s      r%   get_kv_cache_speczAttention.get_kv_cache_spec  s     -8
~!66666*"/7  8 7 %%!../#2    %%!.. ,/   r'   r   )r   
__module____qualname____doc__r   r   intfloatlistr   r   r   strtyper   r{   r2   Tensorr   r   r   r   r.   r   r   r   r   r   __classcell__r   s   @r%   rO   rO      sX       	 	  $(+/&*+/26(,/3&.376:"&!v vv v 	v
 Djv 5kD(v tv "D(v )4/v v #&*v v v '*Djv +,t3v  4Z!v$ 
%v v v v v v@ +/^ ^|^ \^ |	^ j4'^ 
^ ^ ^ ^@) ) )C    Bu{ B B B B!$'7"8 ! ! ! !Z K        r'   rO   c                   H    e Zd ZdZ	 	 	 	 	 ddedededed	ed
edz  dedededz  dedz  de	de
dedz  f fdZ	 d dej        dej        dej        dej        dz  dej        f
dZdej        fdZdej        dej        dej        ddfdZdee         fdZdedefdZ xZS )!MLAAttentiona2  Multi-Head Latent Attention layer.

    This class takes query, and compressed key/value tensors as input.
    The class does the following:

    1. Store the input key and value tensors in the KV cache.
    2. Perform (multi-head/multi-query/grouped-query) attention.
    3. Return the output tensor.
    NrP   FrQ   rS   qk_nope_head_dimqk_rope_head_dim
v_head_dimq_lora_rankkv_lora_rank	kv_b_projrW   rB   rC   
use_sparseindexerc                 N   t                                                       || _        || _        || _        || _        || _        || _        || _        ||z   | _	        || _
        |	|	j        }|	j        }|	j        }nd}d}d}|
| _        || _        || _        t!          | |
|           t#          j                    }t'          | j	        |||d|          | _        |	r|	j        rkt-                      r]| j                                        dk    s| j                                        dk    r#t0                              dd	
           d|	_        t5          t6          t8                   | j                                                  } |d!i d| j        d| j	        d| j        dddd dd d| j        dd dt<          j        dd d| j        d| j        d| j        d| j        d| j        | j        z   d| j        d|d||| _         tC          j"                     | _#        tI                      j%        }||j&        v rtO          d|           | |j&        |<   d tQ          tI                      j)        j*                  D             | _+        || _,        t#          j-        t\          j/        t"          j0                   | _1        t#          j-        t\          j2        t"          j0                   | _3        t#          j-        t\          j4        t"          j0                   | _5        d S )"Nr_   r`   FT)re   r   rj   ri   zgDisabling prefix caching for TRITON_MLA / FLASHINFER with batch invariance, as it is not yet supported.rk   rl   rQ   rR   rS   rT      rU   r|   rJ   rX   rZ   r[   r   r   r   r   qk_head_dimr   r   r   rn   c                 6    g | ]}t          j        g           S rp   rq   rr   s     r%   ru   z)MLAAttention.__init__.<locals>.<listcomp>_  s2     
 
 
 L
 
 
r'   r-   rp   )6rz   r{   rQ   rS   r   r   r   r   r   rR   r   r}   r~   r   rB   rJ   rM   r2   r   r   r\   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rK   r   r   r   r   r   r3   r:   r;   r4   r<   r=   r>   r?   r@   )r   rQ   rS   r   r   r   r   r   r   rW   rB   rC   r   r   r   rJ   r~   r   r.   r   r   r   s                        r%   r{   zMLAAttention.__init__  s   " 	"
 0 0$&(%(88 #)5N%0J"."B#NJ"'( -#6 T<888')),N!
 
 
 $2 %')) % !**,,<<$--//<?? E     
 27L.-.0A0N0N0P0PQQH 
 
 
nn
nn
 **
 	

 
  4
  ..
 !D
 $++
 *.
 ((
 **
 "22
 "22
  -0EEE!
" #
$  i%
& G)
 
	. $4#G#I#II466I'>>>>f>>???<@1&9
 
'))9P 
 
 
 % |D$9OOO|D$9OOO|D$9OOOr'   qkv_c_normedk_per   r"   c           	         | j         r,t          j        j                            |||| j                   | j        rt                      }|j        }t          |t                    r|| j                 }| j        |j                 }| j        j        rDt          j        ||j        |j                  }| j                            | ||||||           |S | j                            | |||||          S | j        j        rPt          j        ||j        |j                  }t          j        j                            ||||| j                   |S t          j        j                            |||| j                  S )Nr   )r   )r   r2   r   r   r   r   r   r
   attn_metadatar$   dictr   virtual_enginer\   r   r   r.   r   r   r   !unified_mla_attention_with_outputunified_mla_attention)	r   r   r   r   r   forward_contextr   self_kv_cacher   s	            r%   r   zMLAAttention.forwardm  s    # 	WIN//;doVVV (	.A.C.CO+9M-.. ? -do > M/*HIM 5 \RRR	!!!!! "    y((![$}    5 \RRR	@@O   y~;;O	  r'   r   c                     t          | j        d          r| j                            |           | j        r!| j                            | | j                  nd }t          |          st          | d           d S d S )Nr   rE   FrF   )r   r   r   rB   rH   r   r&   rA   r   s      r%   r   z*MLAAttention.process_weights_after_loading  s    49=>> 	?I33I>>>  D..tDO.LLL 	
 )66 	B$T5AAAAAA	B 	Br'   c                    t          | dt          j        d                    }t          | dt          j        d                    }t          | dt          j        d                    }| j                            t          j        |                                          |z             t          j        |                                          }| j                            ||z             | j                            ||z             | j        	                                | _
        | j        	                                | _        | j        	                                | _        d| _        dS )zOptional scale calculation for MLA inputs.

        Mirrors Attention.calc_kv_scales. Not all MLA backends require this
        r<   r,   r>   r@   FN)r   r2   r3   r0   r   r   r   r+   r/   r   r6   r7   r8   r   )r   r   r   r   r<   r>   r@   
kv_abs_maxs           r%   r   zMLAAttention.calc_kv_scales  s%    $	5<+<+<==$	5<+<+<==$	5<+<+<==EIaLL,,..8999Y{++//11
J0111J0111"m0022"m0022"m0022#(   r'   c                     | j         S r   r   r   s    r%   r   zMLAAttention.get_attn_backend  r   r'   r   c                     t          | j        |j                  }t          |j        j        d| j        ||j        j                  S )Nr   )r~   rT   rR   r.   cache_dtype_str)r   rJ   r   r   rW   r~   rR   r}   )r   r   rJ   s      r%   r   zMLAAttention.get_kv_cache_spec  sR    4!9
 
  "/:n '4@
 
 
 	
r'   )NNrP   FNr   )r   r   r   r   r   r   r   r   r   r   r   objectr{   r2   r   r   r   r.   r   r   r   r   r   r   r   r   r   r   s   @r%   r   r     s        ( ,026 !%pP pPpP pP 	pP
 pP pP 4ZpP pP (pP "D(pP )4/pP pP pP $pP pP pP pP pP pPn +/2 2<2 \2 l	2
 j4'2 
2 2 2 2hBu{ B B B B)),1L)@E)	) ) ) ),!$'7"8 ! ! ! !

Z 

K 

 

 

 

 

 

 

 

r'   r   r   r   r   r   c                 |    t                      }|j        |         }|j        sd S |                    | ||           d S r   )r
   no_compile_layersr   r   )r   r   r   r   r   r   s         r%   r   r     sL     ':&;&;O,Z8D # sE*****r'   c                     d S r   rp   r   r   r   r   s       r%   maybe_calc_kv_scales_faker    s	     Fr'   r   )r   r   r   )op_nameop_funcmutates_args	fake_implc                     t                      }|j        }t          |t                    r||          }|j        |          }|j        |j                 }|||fS )a  Extract attention context for a given layer.

    This helper function extracts the attention metadata, attention layer
    instance, and KV cache tensor for a specific layer.

    Args:
        layer_name: The name/identifier of the attention layer.

    Returns:
        A tuple containing:
        - attn_metadata: Attention metadata for this specific layer, or None if
            no metadata available
        - attn_layer: The attention layer instance (Attention or MLAAttention)
        - kv_cache: The KV cache tensor for current virtual engine

        Note: attn_metadata may be None, but attn_layer and kv_cache are always
        extracted from the forward context.
    )r
   r   r$   r   r  r   r   )r   r   r   
attn_layerr   s        r%   get_attention_contextr    s]    * ':&;&;O#1M-&& 2%j1+:+LZ+XJ"?#ABH*h..r'   c                 j    t          |          \  }}}|j                            || ||||          }|S r   r  r   r   )r   r   r   r   r   r   r   r   s           r%   r   r     s<     %:*$E$E!M4YtUC-PPFMr'   c                 N    t          j        |                                           S r   r2   
empty_like
contiguousr  s       r%   unified_attention_faker  #  s!     E""--///r'   r   )r  r  r	  c                    t                      }|j        |         }|j        |j                 }|j        }t          |t                    sJ dt          |           d            |                    |          }|Ot          |j
        d          sJ |j
        j        j         d            |j
                            || |||           t          j        d|j        |j                  S )z
    Returns a dummy that is passed to unified_attention to signal a side effect and
    the data dependency between them to ensure torch.compile preserves ordering.
    z(Expected slot_mapping to be a dict, got z. Ndo_kv_cache_updatez! does not support kv cache updater   r   r.   )r
   r  r   r   slot_mappingr$   r   r   r   r   r   r   r   r  r2   r   r   r.   )r   r   r   r   r  r   r  layer_slot_mappings           r%   r   r   3  s    *++O 2:>J"?#ABH"/LlD))  I43E3EIII ) &))*55%z(<== 	
 	
(1TTT	
 	
= 	**	
 	
 	
 ;qGGGGr'   c                 D    t          j        d| j        | j                  S )Nr   r  )r2   r   r   r.   )r   r   r   s      r%   unified_kv_cache_update_faker  T  s    
 ;q39====r'   r   )r  r  r	  r  r   output_scaleoutput_block_scaler   c                 t    ~t          |          \  }}	}
|	j                            |	| |||
||||	  	         d S N)r   r  r  r  )r   r   r   r   r   r  r  r   r   r   r   s              r%   r   r   d  s_     	$9*$E$E!M4I!-  
 
 
 
 
r'   c                     d S r   rp   )r   r   r   r   r   r  r  r   s           r%   "unified_attention_with_output_faker     s	     Fr'   r   r   r   r   c                 j    t          |          \  }}}|j                            || ||||          }|S r   r  )r   r   r   r   r   r   r   r   s           r%   r   r     s<     %:*$E$E!M4YtQT8]SSFMr'   c                 N    t          j        |                                           S r   r  )r   r   r   r   s       r%   unified_mla_attention_faker#    s!     A))+++r'   r   )r  r  r  r	  dispatch_keyc                 r    t          |          \  }}}	|j                            || |||	||||	  	         d S r  r  )
r   r   r   r   r   r  r  r   r   r   s
             r%   r   r     s\     %:*$E$E!M4I	!-  
 
 
 
 
r'   c                     d S r   rp   )r   r   r   r   r   r  r  s          r%   &unified_mla_attention_with_output_faker'    s	     Fr'   r   )F)NNN)NN)Ur   typingr   r2   torch.nnnn	vllm.envsr:   %vllm.attention.utils.kv_sharing_utilsr   &vllm.attention.utils.kv_transfer_utilsr   vllm.configr   r   vllm.config.vllmr   vllm.forward_contextr	   r
   vllm.loggerr   /vllm.model_executor.layers.attention_layer_baser   *vllm.model_executor.layers.batch_invariantr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.quantization.base_configr   7vllm.model_executor.layers.quantization.input_quant_fp8r   0vllm.model_executor.layers.quantization.kv_cacher   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.platformsr   vllm.utils.torch_utilsr   r   vllm.v1.attention.backendr   r   r   #vllm.v1.attention.backends.registryr   vllm.v1.attention.selectorr   vllm.v1.kv_cache_interfacer   r   r   r    r   r   r   r&   ModulerA   r   rM   rO   r   r   r   r  tupler   r   r  r   r  r   r  r   r   r   r#  r$  r   r'  rp   r'   r%   <module>rB     sY	                        L L L L L L J J J J J J < < < < < < < < ' ' ' ' ' ' D D D D D D D D # # # # # # N N N N N N N N N N N N        G F F F F F R R R R R R L L L L L L N N N N N N P P P P P P + + + + + +                
 E D D D D D 7 7 7 7 7 7            
X		,>,E $    M MBI M MQU M M M M681981$t+81 81 
	81 81 81 81v^ ^ ^ ^ ^	- ^ ^ ^Bc
 c
 c
 c
 c
290 c
 c
 c
L+<+	+ <+ 	+
 
+ + + +"<	 < 	
 
     " ***'	   //
4&=4\!95<GH/ / / /< 	<			 <	 		
 \	 	 	 	0<0	0 <0 	0
 \0 0 0 0  $   H	H<H H \	H H H HB>	><> > \	> > > >  %#*	     )-.2.2 <	 < L	
  ,% t+ t+ 
   F )-.2.2
 
<
	
 <
 L	

 
 ,%
 t+
 t+
 

 
 
 
  +)010	    	|		 ,	 		
 \	 	 	 	,|,, ,, 	,
 \, , , ,  #!(!.     )-.2 | , L	
  ,% t+ 
   : )-.2	 	|		 ,	 L		
 	 ,%	 t+	 
	 	 	 	  /-014!.     r'   