
    .`i'G                        d dl mZ d dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZmZmZmZmZmZmZ d d	lmZ d d
lmZmZ  ee          Zej        ej        fZ  G d de          Z!e G d d                      Z" G d dee"                   Z# G d de          Z$dej%        dej&        dej%        de'ej%                 fdZ(dej%        de)de)dej&        de'ej%                 f
dZ*	 ddej&        de)de)dz  de+fdZ,dS )    )	dataclass)ClassVarN)_custom_ops)
VllmConfig)init_logger)CpuArchEnumcurrent_platform)AttentionBackendAttentionImplAttentionLayerAttentionMetadataBuilderAttentionTypeCommonAttentionMetadatais_quantized_kv_cache)split_decodes_and_prefills)AttentionSpecCrossAttentionSpecc                      e Zd ZU dZeed<   ej        ej        ej	        gZ
eeej                          ed<   edeej                 fd            Zedee         fd            Zedefd            Zededefd	            Zeded
         fd            Zeded         fd            Ze	 ddedededededeedf         fd            Zedefd            ZdS )CPUAttentionBackendTaccept_output_buffersupported_dtypesreturnc                 H    t           j        t           j        t           j        gS N)torchfloat16bfloat16float32clss    w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/cpu_attn.pyget_supported_dtypesz(CPUAttentionBackend.get_supported_dtypes'   s    u~u}==    c                 
    g dS )N)
    @   P   `   p                   r   s    r!   get_supported_head_sizesz,CPUAttentionBackend.get_supported_head_sizes+   s    ====r#   c                      dS )NCPU_ATTNr/   r/   r#   r!   get_namezCPUAttentionBackend.get_name/   s    zr#   	attn_typec                 b    |t           j        t           j        t           j        t           j        fv S )zSCPU attention supports decoder,
        encoder-only and encoder-decoder attention.)r   DECODERENCODERENCODER_ONLYENCODER_DECODER)r    r4   s     r!   supports_attn_typez&CPUAttentionBackend.supports_attn_type3   s.     !!&)	
 
 	
r#   CPUAttentionBackendImplc                      t           S r   )r;   r/   r#   r!   get_impl_clsz CPUAttentionBackend.get_impl_cls>   s    &&r#   CPUAttentionMetadataBuilderc                      t           S r   )r>   r/   r#   r!   get_builder_clsz#CPUAttentionBackend.get_builder_clsB   s    **r#   auto
num_blocks
block_sizenum_kv_heads	head_sizecache_dtype_str.c                     d| |||fS )N   r/   )rB   rC   rD   rE   rF   s        r!   get_kv_cache_shapez&CPUAttentionBackend.get_kv_cache_shapeF   s     *lJ	AAr#   c                      dS )NFr/   )argskwargss     r!   use_cascade_attentionz)CPUAttentionBackend.use_cascade_attentionP   s    ur#   N)rA   )__name__
__module____qualname__r   bool__annotations__r   r   r   r   r   r   listdtypeclassmethodr"   intr0   staticmethodstrr3   r:   typer=   r@   tuplerI   rM   r/   r#   r!   r   r      s        !%$%%%5htEK01    >T%+%6 > > > [> >c > > > [> c    \ 
3 
4 
 
 
 [
 '$89 ' ' ' \' +T"?@ + + + \+   &B BBB B 	B
 B 
sCxB B B \B $    \  r#   r   c                      e Zd ZU eed<   eed<   eed<   ej        ed<   eed<   ej        ed<   ej        ed<   ej        ed<   ej        d	z  ed
<   dZe	ed<   dZ
e	ed<   dZeed<   d	Zeej        d	z           d	z  ed<   d	Zej        d	z  ed<   d	S )CPUAttentionMetadataisanum_actual_tokensmax_query_lenquery_start_locmax_seq_lenseq_lensblock_tableslot_mappingNscheduler_metadataTcausalFuse_sdpa_prefillr   num_decode_tokenssdpa_attn_maskssdpa_start_loc)rN   rO   rP   rX   rR   rV   r   Tensorrf   rQ   rg   rh   ri   rS   rj   r/   r#   r!   r\   r\   U   s         	HHH\!!!l,t++++FD #d"""s8<OT%,-.5<<<*.NEL4'.....r#   r\   c            
       f     e Zd Zdedee         dedej        ddf
 fdZ		 dd	e
d
ededefdZ xZS )r>   kv_cache_speclayer_namesvllm_configdevicer   Nc                    t                                          ||||           d| _        d }t          j                    t
          vr	d}d| _        |                     |d           || _        || _        |j	        }|j
                            |          | _        |j
                            |          | _        |j        | _        |j
        j        | _        t%          |dd          | _        | j        d| _        |j        j        | _        t-          | j        | j        | j                  | _        t1          |t2                    | _        d S )NF   Tsliding_window)super__init__rg   r	   get_cpu_architecture_CPU_ARCH_PREFER_MIXED_BATCH_init_reorder_batch_thresholdrm   ro   parallel_configmodel_configget_num_kv_headsrD   get_num_attention_heads	num_headsrE   head_dimrT   getattrwindow_sizecache_configrC   _get_attn_isar]   
isinstancer   is_cross_attention)selfrm   rn   ro   rp   reorder_batch_thresholdrz   	__class__s          r!   rv   z$CPUAttentionMetadataBuilder.__init__j   s9    	[&III %"&022:VVV '(#$(D!**+BEJJJ*&%5'4EEoVV$1II
 
 &/ -3
"=2BBGG#!D%2= T_dmLL",]<N"O"Or#   Fcommon_prefix_lencommon_attn_metadata
fast_buildc                     |j         }|j        }|j        }|j        }|j        }|j        }	|j        }
|j        }| j        rdn|j	        }|}d}| j
        rW|rU| j        sJ t          || j        d          \  }}}}|}||d          |z
  }|	d |         }	|d |dz            }|
d |         }
t          j        || j        | j        | j        |	| j        ||| j        | j        d          }t+          | j        |||||	|
|||| j
        ||          }|S )NFr   T)decode_thresholdrequire_uniformrr   )num_reqsr~   rD   r   rb   rT   r`   rf   sliding_window_sizer]   enable_kv_split)r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   rj   )r   r^   r_   ra   r`   rb   block_table_tensorrd   r   rf   rg   r   r   opscpu_attn_get_scheduler_metadatar~   rD   r   rT   r   r]   r\   )r   r   r   r   r   r^   r_   ra   r`   rb   r   rd   rf   rj   rh   num_decodesnum_prefillsnum_prefill_tokenssheduler_metadataattn_metadatas                       r!   buildz!CPUAttentionMetadataBuilder.build   s    (00B,:*6.>'01D+81R7K7R(  	BV 	B////*(%)%A$(   O[,(9;M #H+KLL9<MMN-H-.?a.?@O!3L[L!A?n*]*+ $ 0 
 
 
 -/'+#*%0!2/)
 
 
  r#   )F)rN   rO   rP   r   rS   rX   r   r   rp   rv   rV   r   rQ   r\   r   __classcell__)r   s   @r!   r>   r>   i   s        #P$#P #Y#P  	#P
 #P 
#P #P #P #P #P #PR !	@ @@ 6@ 	@
 
@ @ @ @ @ @ @ @r#   r>   c                      e Zd Zdej        ddfdededededee         dz  dedz  ded	edz  d
ededz  de	j
        dz  ddfdZ	 	 	 ddede	j
        de	j
        de	j
        de	j
        dedz  de	j
        dz  de	j
        dz  de	j
        dz  de	j
        fdZde	j
        de	j
        de	j
        de	j
        ded
ede	j
        fdZdS )r;   Nr~   rE   scalerD   alibi_slopesrs   kv_cache_dtypelogits_soft_capr4   kv_sharing_target_layer_namesinksr   c                    |
| _         || _        || _        t          |          | _        |4|	t
          j        t
          j        fv rt          	                    d           |d}|| _
        || _        | t          j        |t          j                  }|| _        |d| _        n,|	t
          j        k    r|dz
  |dz
  f| _        n|dz
  df| _        || _        | j        | j        z  | _        t'          |          rt)          d          |	| _        || _        | j         | j        j        d         |k    sJ d            d S d S )NzbCPU_ATTN does not support logits softcap for ENCODER and ENCODER_ONLY, outputs may be slightly offr   rT   )rt   rt   rr   z'FP8 KV cache is unsupported in CPU_ATTNzLSinks must have the same number of heads as the number of heads in the layer)r   r~   rE   floatr   r   r7   r8   loggerwarning_oncer   rD   r   tensorr   r   rs   r   num_queries_per_kvr   NotImplementedErrorr4   r   shape)r   r~   rE   r   rD   r   rs   r   r   r4   r   r   s               r!   rv   z CPUAttentionBackendImpl.__init__   s}    -I)""5\\
&9!&9
 ,
 ,
 I   "O.(# <EMJJJL(!"*D-444#1A#5~7I"JD#1A#5q"9D,"&.D4E"E 00 	Q%&OPPP"
:!:#A&)333% 433 "!33r#   layerquerykeyvaluekv_cacher   outputoutput_scaleoutput_block_scalec
                 6   |
J d            ||	t          d          ||S |j        }
| j        t          j        t          j        fv r?|                     |d|
         |d|
         |d|
         |d|
         || j                  S |                    d          \  }}| j        '|%|#t          j
        |||||j        |j                   |j        rY| j        
J d            |j        }|                     |||
         |||
         |||
         |||
         || j                   |}
|
dk    rdt          j        |d|
         |||d|
         |j        |j        | j        |j        | j        | j        |j        | j        |j        | j                   |S )a  Forward pass for CPU attention backend.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [2, num_blocks, num_kv_heads, block_size, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NzOutput tensor must be provided.zJfused output quantization is not yet supported for CPUAttentionBackendImplr   z-Attention sink is unsupported in SDPA prefill)r   	key_cachevalue_cacher   r`   rb   r   rf   r   rs   rc   softcapre   s_aux)r   r^   r4   r   r8   r7   _run_sdpa_forwardunbindr   r   cpu_attn_reshape_and_cacherd   r]   rg   r   rh   cpu_attention_with_kv_cacher`   rb   r   rf   r   rs   rc   r   re   )r   r   r   r   r   r   r   r   r   r   r^   r   r   rh   s                 r!   forwardzCPUAttentionBackendImpl.forward	  s>   0 !!#D!!!#'9'E%/    M); >m8-:OPPP))((()&&&'((())))*   "*!3!3	; -5!**!   ) 	2:%%'V%%% - ?""'(99:%&778'(99:()::;   !2q  +.../#'0001 - =&/j$+!.#2)5,#0#Cj   " r#   c                 (   |j         }|| j        !t          | j        |j        |j                  }n| j        d         dk    s| j        d         dk    r<|j        J t          |j        | j        d         | j        d         |j                  }n!d g|j                            d          dz
  z  }||_         |	                    d|
                                dz
            }|	                    d|
                                dz
            }|	                    d|
                                dz
            }| j        | j        k    r8|                    | j        d          }|                    | j        d          }|t          j        k    }|j                                        }	t%          t'          |                    D ]}
||
         }|	|
         }|	|
dz            }t(          j        j                            |d d d ||d d f         |d d d ||d d f         |d d d ||d d f         |d|o|d u | j                                      d          	                    |
                                dz
  d          }||||d d d d f<   |S )	Nr   rt   rr   rH   )dimg        )	attn_mask	dropout_p	is_causalr   )ri   r   _make_alibi_biasrT   rj   rs   rb   _make_sliding_window_biassizemovedimr   rD   r~   repeat_interleaver   r   r6   numpyrangelenr   nn
functionalscaled_dot_product_attentionr   squeeze)r   r   r   r   r   r   r4   
attn_maskscausal_attnrj   imaskstart_qend_qsub_outs                  r!   r   z)CPUAttentionBackendImpl._run_sdpa_forwardp  s    #2
 ,-%K!0 


 $Q'2--1DQ1G21M1M$-9996!0'*'*K	 

 #V}'C'H'H'K'Ka'OP
,6M)aq11kk!SWWYY]++aq11..''(?R'HHC++D,C+LLE=#88&5;;==s:'' 	2 	2Aa=D$Q'G"1q5)E#@@$75=!!!34aaa12$75=!!!34"!):ddl* A   q!,,  +2F75=!!!QQQ&''r#   )NNN)rN   rO   rP   r   r6   rV   r   rS   rX   r   rk   rv   r   r\   r   r   r/   r#   r!   r;   r;      s        )-&.37%)4 44 4 	4
 4 5kD(4 d
4 4 4 4 '*Dj4 |d"4 
4 4 4 4| '+,026e ee |e \	e
 |e ,e ,d2e t#e lT)e "L4/e 
e e e eN:|: \: |	:
 : ,: : 
: : : : : :r#   r;   r   rT   rj   r   c                    g }|                     d          dz
  }|                                }t          |          D ]#}||dz            ||         z
  }t          j        ||          }|d d d f         |d d d f         z
  }| j        d         }|d d d f                             |ddf          }|                    | d d d d f                                       d           t          j	        d||f|j
                                      t          j                                       d          }	|                    ||	z                       |                     %|S )Nr   rr   r   diagonal)r   r   r   r   aranger   repeatmul_
unsqueeze_emptyrT   fill_inftriu_appendto)
r   rT   rj   attn_biasesseq_numr   seq_lenbiasr~   inf_masks
             r!   r   r     sj   
 ')K!!!$$q(G#))++N7^^ 8 8 Q'.*;;|G5111 D!!!G}tAAAtG}, &q)	D!!!G}##Y1$566		,qqq$}-..99!<<<KGW-TZ@@@UEI:UAU 	
 	D8O//667777r#   left_window_sizeright_window_sizec                    g }|                      d          dz
  }|                                 } t          |          D ]}| |dz            | |         z
  }t          j        d||fd|          }|dk    rt          j        ||          }|dk    rt          j        ||           }t          j        |          }|                    |           |S )Nr   rr   )
fill_valuerT   rt   r   )	r   r   r   r   fulltriltriulogr   )	rj   r   r   rT   r   r   r   r   r   s	            r!   r   r     s     ')K!!!$$q(G#))++N7^^ ! ! Q'.*;;z!
 
 
 "":d->???Dr!!:d.>->???Dy4    r#   rC   rE   c                    ||dz  dk    r|dz  dk    rdS t           j        j                                        }|r| t           j        fv r|dz  dk    rdS |dz  dk    r%t          j                    t          j        k    rdS dS dS )Nr%   r      vec16amxneonvec)	r   _C_cpu_is_amx_tile_supportedr   r	   rw   r   ARM)rT   rC   rE   supports_amxs       r!   r   r     s     R1!4!4R19L9Lw8=7799L %.!222zB!7K7Ku	bA		022koEE65wr#   r   )-dataclassesr   typingr   r   vllmr   r   vllm.configr   vllm.loggerr   vllm.platformsr   r	   vllm.v1.attention.backendr
   r   r   r   r   r   r    vllm.v1.attention.backends.utilsr   vllm.v1.kv_cache_interfacer   r   rN   r   X86r   rx   r   r\   r>   r;   rk   rT   rS   r   rV   r   rX   r   r/   r#   r!   <module>r     s   " ! ! ! ! !        # # # # # # " " " " " " # # # # # # 8 8 8 8 8 8 8 8                       I H H H H H H H	X		 +A 3 3 3 3 3* 3 3 3l / / / / / / / /&f f f f f":;O"P f f fRX X X X Xm X X Xv,; L 
%,	   >L  ;	
 
%,   8 BF ;$'47$J     r#   