
from dataclasses import dataclass

import torch

from vllm.attention.layer import MLAAttention
from vllm.config import CacheConfig
from vllm.model_executor.custom_op import PluggableLayer
from vllm.model_executor.layers.quantization import QuantizationConfig


@dataclass
class MLAModules:
    """Modules used in MLA."""

    kv_a_layernorm: torch.nn.Module
    kv_b_proj: torch.nn.Module
    rotary_emb: torch.nn.Module
    o_proj: torch.nn.Module
    fused_qkv_a_proj: torch.nn.Module | None
    kv_a_proj_with_mqa: torch.nn.Module | None
    q_a_layernorm: torch.nn.Module | None
    q_b_proj: torch.nn.Module | None
    q_proj: torch.nn.Module | None
    indexer: torch.nn.Module | None
    is_sparse: bool
    topk_indices_buffer: torch.Tensor | None
    indexer_rotary_emb: torch.nn.Module | None = None
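

# Illustrative shape walkthrough (an added annotation, not from the original
# source): how the modules above are wired by the wrapper below, with
# dimension names taken from its __init__ arguments.
#
#   hidden_states: [num_tokens, hidden_size]
#   q (after q projection + view): [num_tokens, num_heads,
#                                   qk_nope_head_dim + qk_rope_head_dim]
#   kv_c_normed (compressed KV latent): [num_tokens, kv_lora_rank]
#   k_pe (rope key, broadcast over heads): [num_tokens, 1, qk_rope_head_dim]
#   attention output: [num_tokens, num_heads * v_head_dim], projected back
#   to [num_tokens, hidden_size] by o_proj.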


@PluggableLayer.register("multi_head_latent_attention")
class MultiHeadLatentAttentionWrapper(PluggableLayer):
    """Pluggable MLA layer which allows OOT backends to add custom
    implementations of the outer MLA layer (including rope & o_proj).
    Note that OOT platforms can currently still use CustomOp.register_oot
    to replace the MLA layer entirely, although we now use PluggableLayer
    to register this layer.

    This class takes positions and hidden_states as input.
    The input tensors can either contain prefill tokens or decode tokens.
    The class does the following:

    1. MLA preprocessing.
    2. Perform multi-head attention on prefill tokens and
       multi-query attention on decode tokens separately.
    3. Return the output tensor.
    """
    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        scale: float,
        qk_nope_head_dim: int,
        qk_rope_head_dim: int,
        v_head_dim: int,
        q_lora_rank: int | None,
        kv_lora_rank: int,
        mla_modules: MLAModules,
        cache_config: CacheConfig | None = None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.qk_nope_head_dim = qk_nope_head_dim
        self.qk_rope_head_dim = qk_rope_head_dim
        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.q_lora_rank = q_lora_rank
        self.kv_lora_rank = kv_lora_rank
        self.num_heads = num_heads

        self.fused_qkv_a_proj = mla_modules.fused_qkv_a_proj
        self.kv_a_proj_with_mqa = mla_modules.kv_a_proj_with_mqa
        self.q_a_layernorm = mla_modules.q_a_layernorm
        self.q_b_proj = mla_modules.q_b_proj
        self.q_proj = mla_modules.q_proj
        self.kv_a_layernorm = mla_modules.kv_a_layernorm
        self.kv_b_proj = mla_modules.kv_b_proj
        self.rotary_emb = mla_modules.rotary_emb
        self.o_proj = mla_modules.o_proj
        self.indexer = mla_modules.indexer
        self.indexer_rope_emb = mla_modules.indexer_rotary_emb
        self.is_sparse = mla_modules.is_sparse

        if self.indexer is not None:
            assert hasattr(self.indexer, "topk_tokens")
            self.topk_tokens = self.indexer.topk_tokens
            self.topk_indices_buffer = mla_modules.topk_indices_buffer

        self.mla_attn = MLAAttention(
            num_heads=self.num_heads,
            scale=scale,
            qk_nope_head_dim=self.qk_nope_head_dim,
            qk_rope_head_dim=self.qk_rope_head_dim,
            v_head_dim=self.v_head_dim,
            q_lora_rank=self.q_lora_rank,
            kv_lora_rank=self.kv_lora_rank,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
            kv_b_proj=self.kv_b_proj,
            use_sparse=self.is_sparse,
            indexer=self.indexer,
        )
        self.prefix = prefix

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        llama_4_scaling: torch.Tensor | None = None,
    ) -> torch.Tensor:
        q_c = None
        kv_lora = None

        if self.q_lora_rank is not None:
            assert self.fused_qkv_a_proj is not None, (
                "fused_qkv_a_proj is required when q_lora_rank is not None"
            )
            assert self.q_a_layernorm is not None, (
                "q_a_layernorm is required when q_lora_rank is not None"
            )
            assert self.q_b_proj is not None, (
                "q_b_proj is required when q_lora_rank is not None"
            )
            qkv_lora = self.fused_qkv_a_proj(hidden_states)[0]
            q_c, kv_lora = qkv_lora.split(
                [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
                dim=-1,
            )
            q_c = self.q_a_layernorm(q_c)
            q = self.q_b_proj(q_c)[0]
        else:
            assert self.kv_a_proj_with_mqa is not None, (
                "kv_a_proj_with_mqa is required when q_lora_rank is None"
            )
            assert self.q_proj is not None, (
                "q_proj is required when q_lora_rank is None"
            )
            kv_lora = self.kv_a_proj_with_mqa(hidden_states)[0]
            q = self.q_proj(hidden_states)[0]

        kv_c, k_pe = kv_lora.split(
            [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        kv_c_normed = self.kv_a_layernorm(kv_c)

        q = q.view(-1, self.num_heads, self.qk_head_dim)
        # Add a head dim of 1 to k_pe so it broadcasts across query heads.
        k_pe = k_pe.unsqueeze(1)

        # Apply rotary embedding to the rope portion of q and to k_pe.
        q[..., self.qk_nope_head_dim:], k_pe = self.rotary_emb(
            positions, q[..., self.qk_nope_head_dim:], k_pe
        )

        if self.indexer and self.is_sparse:
            _topk_indices = self.indexer(
                hidden_states, q_c, positions, self.indexer_rope_emb
            )

        if llama_4_scaling is not None:
            q *= llama_4_scaling

        attn_out = self.mla_attn(
            q,
            kv_c_normed,
            k_pe,
            output_shape=(hidden_states.shape[0],
                          self.num_heads * self.v_head_dim),
        )
        return self.o_proj(attn_out)[0]