
    .`iE                        d dl Z d dlmZmZmZ d dlmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ  ee          Z ed	
           G d d                      Z ed	d	           G d de                      Z ed	d	           G d de                      Z ed	d	           G d de                      Z ed	d	           G d de                      Z ed	d	           G d de                      Z ed	
           G d de                      Z ed	
           G d de                      Z ed	
           G d de                      Z ed	
           G d de                      Z ed	
           G d  d!e                      Ze G d" d#                      Ze G d$ d%                      Z e G d& d'                      Z!dS )(    N)	dataclassfieldsreplace)prod)Self)
VllmConfig)init_logger)cdiv)get_dtype_sizeT)frozenc                       e Zd ZU dZeed<   edefd            ZdedefdZ	dede
fdZedee
         de
fd	            Zd
S )KVCacheSpeczG
    A base class for specifying the KV cache format of one layer.
    
block_sizereturnc                     t           )zs
        The size of a page with `block_size` tokens in bytes.

        Returns:
            The page size
        NotImplementedErrorselfs    n/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/kv_cache_interface.pypage_size_byteszKVCacheSpec.page_size_bytes   s
     "!    vllm_configc                     t           )z
        The maximum possible memory usage of this KV cache in bytes.

        Returns:
            The KV cache size in bytes
        r   r   r   s     r   max_memory_usage_bytesz"KVCacheSpec.max_memory_usage_bytes&   s
     "!r   c                 $    t          | |          S )zR
        Create a new KVCacheSpec from self but replacing the block size.
        r   )r   )r   r   s     r   copy_with_new_block_sizez$KVCacheSpec.copy_with_new_block_size/   s     t
3333r   specsc                     t          fddd         D                       s
J d            t          j        d                   S )zW
        Merge a list of KVCacheSpec objects into a single KVCacheSpec object.
        c              3   0   K   | ]}|d          k    V  dS )r   N ).0specr    s     r   	<genexpr>z$KVCacheSpec.merge.<locals>.<genexpr>:   s,      ::458#::::::r      Nz7All layers in the same KV cache group must be the same.r   )allcopydeepcopy)clsr    s    `r   mergezKVCacheSpec.merge5   s]    
 ::::abb	::::: 	
 	
E	
 	
: }U1X&&&r   N)__name__
__module____qualname____doc__int__annotations__propertyr   r   r   r   r   classmethodlistr,   r#   r   r   r   r      s          
 OOO" " " " X""* " " " " "43 44 4 4 4 4 '$t* ' ' ' ' [' ' 'r   r   )r   kw_onlyc                       e Zd ZU eed<   eed<   ej        ed<   dZedz  ed<   edefd            Z	edefd            Z
dS )	AttentionSpecnum_kv_heads	head_sizedtypeNpage_size_paddedr   c                 J    | j         }| j        | j        |k    sJ | j        S |S N)real_page_size_bytesr<   )r   real_page_sizes     r   r   zAttentionSpec.page_size_bytesG   s6    2 ,(N::::((r   c                 `    d| j         z  | j        z  | j        z  t          | j                  z  S )N   )r   r9   r:   r   r;   r   s    r   r?   z"AttentionSpec.real_page_size_bytesO   sA     o  n TZ((	)	
r   )r-   r.   r/   r1   r2   torchr;   r<   r3   r   r?   r#   r   r   r8   r8   @   s         NNN;#'cDj'''    X 
c 
 
 
 X
 
 
r   r8   c                       e Zd ZU dZdZedz  ed<   dZedz  ed<   	 dZedz  ed<   d Z	de
defd	Zed
ee         dedz  fd            Zedee         defd            Zedefd            ZdS )FullAttentionSpeca  
    When hybrid allocator is disabled and the model contains both full
    attention layers and sliding window attention layers, sliding
    window attention are regarded as full attention in KV cache manager
    (blocks are allocated for all tokens), while computed as sliding window
    attention in model runner.
    In this case, we use FullAttentionSpec and record the sliding window size.
    Nhead_size_vsliding_windowattention_chunk_sizec                 Z    | j         #t                              | d| j                   d S d S )NrF   )rF   object__setattr__r:   r   s    r   __post_init__zFullAttentionSpec.__post_init__m   s3    #t]DNCCCCC $#r   r   r   c                     |j         j        }|j        j        }|j        j        }||z  dk    rt          |||z            }t          || j                  | j        z  S )Nr'   )model_configmax_model_lenparallel_configdecode_context_parallel_sizeprefill_context_parallel_sizer
   r   r   )r   r   rO   dcp_world_sizepcp_world_sizes        r   r   z(FullAttentionSpec.max_memory_usage_bytesq   sc    #0>$4Q$4R N*Q.. 0OPPMM4?33d6JJJr   window_sizesc                     t          |          dk    rd S t          |          dk    r|                                S t          d          )Nr   r'   zOAll attention layers in the same KV cache group must have the same window size.)lenpop
ValueError)r+   rU   s     r   merge_window_sizesz$FullAttentionSpec.merge_window_sizes{   sV    |!!4!####%%%$  r   r    c                    t          d |D                       s
J d            t          d |D                       }t          d |D                       }t          d |D                       r
J d             | |d         j        |d         j        |d         j        |d         j        |d         j        |d         j        | 	                    |          | 	                    |                    }|D ]O}t          t                    D ]8}t          ||j                  t          ||j                  k    s
J d	            9P|j        d
u|j        d
uz   dk    s
J d            |S )k
        Merge a list of FullAttentionSpec objects into a single
        FullAttentionSpec object.
        c              3   @   K   | ]}t          |t                    V  d S r>   
isinstancerE   r$   r%   s     r   r&   z*FullAttentionSpec.merge.<locals>.<genexpr>   -      II4:d$566IIIIIIr   JAll attention layers in the same KV cache group must be FullAttentionSpec.c              3   2   K   | ]}|j         	|j         V  d S r>   rG   r`   s     r   r&   z*FullAttentionSpec.merge.<locals>.<genexpr>   6       
 
$(T5H5TD5T5T5T5T
 
r   c              3   2   K   | ]}|j         	|j         V  d S r>   rH   r`   s     r   r&   z*FullAttentionSpec.merge.<locals>.<genexpr>   :       #
 #
(4 %4444#
 #
r   c              3   @   K   | ]}t          |t                    V  d S r>   r_   MLAAttentionSpecr`   s     r   r&   z*FullAttentionSpec.merge.<locals>.<genexpr>   -      LLdz$(899LLLLLLr   ;MLAAttentionSpec should be merged in MLAAttentionSpec.merger   )r   r9   r:   rF   r;   r<   rG   rH   RAll attention layers in the same KV cache group must have the same attention spec.Nr'   ZModel with both sliding window layers and chunked local attention layers is not supported.)r(   setanyr   r9   r:   rF   r;   r<   rZ   r   r8   getattrnamerG   rH   r+   r    rG   rH   merged_specr%   fs          r   r,   zFullAttentionSpec.merge   s    II5IIIII 	
 	
X	
 	
I  
 
,1
 
 
 
 
  # #
 #
#
 #
 #
  
  

 LLeLLLLL 	
 	
I	
 	
L cQx*q.Ah(a,(."1X611.AA!$!7!78L!M!M	
 	
 	
  	 	DM**  tQV,,QV0L0LLLL/ MLLL
 *$6,D8
  '   r   c                 j    | j         | j        z  | j        | j        z   z  t	          | j                  z  S r>   )r   r9   r:   rF   r   r;   r   s    r   r?   z&FullAttentionSpec.real_page_size_bytes   s?     O ~ 002 TZ(()	
r   )r-   r.   r/   r0   rF   r1   r2   rG   rH   rL   r   r   r4   rp   rZ   r5   r   r,   r3   r?   r#   r   r   rE   rE   Z   s0          #Kt"""!%NC$J%%% (,#*+++D D DK* K K K K K 	c#h 	3: 	 	 	 [	 *$t* * * * * [*X 
c 
 
 
 X
 
 
r   rE   c                   l    e Zd ZU dZedz  ed<   edefd            Ze	de
e         defd            ZdS )rk   Ncache_dtype_strr   c                     | j         dk    r
| j        dz  S | j        | j        z  | j        z  t	          | j                  z  S )N
fp8_ds_mlai  )ry   r   r9   r:   r   r;   r   s    r   r?   z%MLAAttentionSpec.real_page_size_bytes   sR    <// ?S((O n TZ(()	
r   r    c           	      h   t          d |D                       s
J d            t          d |D                       }t          |          dk    s
J d             | |d         j        |d         j        |d         j        |d         j        |d         j        |                                          S )Nc              3   @   K   | ]}t          |t                    V  d S r>   rj   r`   s     r   r&   z)MLAAttentionSpec.merge.<locals>.<genexpr>   s-      HH$:d$455HHHHHHr   zIAll attention layers in the same KV cache group must be MLAAttentionSpec.c              3   $   K   | ]}|j         V  d S r>   )ry   r`   s     r   r&   z)MLAAttentionSpec.merge.<locals>.<genexpr>   s%      !I!I4$"6!I!I!I!I!I!Ir   r'   zVAll attention layers in the same KV cache group must use the same quantization method.r   )r   r9   r:   r;   r<   ry   )	r(   rp   rW   r   r9   r:   r;   r<   rX   )r+   r    cache_dtype_str_sets      r   r,   zMLAAttentionSpec.merge   s    HH%HHHHH 	
 	
W	
 	
H "!I!I5!I!I!III&''1,,,# -,, sQx*q.Ah((."1X6/3355
 
 
 	
r   )r-   r.   r/   ry   strr2   r3   r1   r?   r4   r5   r   r,   r#   r   r   rk   rk      s          #'OS4Z&&&

c 

 

 

 X

 
$t* 
 
 
 
 [
 
 
r   rk   c                   *    e Zd ZU eed<   dedefdZdS )ChunkedLocalAttentionSpecrH   r   r   c                     |j         j        }|j        j        }t	          | j        |z   |          }t          || j                  | j        z  S r>   )	rN   rO   scheduler_configmax_num_batched_tokensminrH   r
   r   r   r   r   rO   r   
num_tokenss        r   r   z0ChunkedLocalAttentionSpec.max_memory_usage_bytes   sT    #0>!,!=!T %(>>
 

 J0043GGGr   Nr-   r.   r/   r1   r2   r   r   r#   r   r   r   r      sL         H* H H H H H H Hr   r   c                   *    e Zd ZU eed<   dedefdZdS )SlidingWindowSpecrG   r   r   c                     |j         j        dk    s
J d            |j        j        }|j        j        }t          | j        dz
  |z   |          }t          || j	                  dz   | j
        z  S )Nr'   zDCP not support sliding window.)rP   rQ   rN   rO   r   r   r   rG   r
   r   r   r   s        r   r   z(SlidingWindowSpec.max_memory_usage_bytes   s    *G1LLL- MLL $0>!,!=!T !#&<<m
 

 Z11A59MMMr   Nr   r#   r   r   r   r      sL         N* N N N N N N Nr   r   c                       e Zd ZU eeedf         df         ed<   eej                 ed<   dZedz  ed<   dZ	e
ed<   dZe
ed	<   d
Zeed<   edefd            ZdedefdZdS )	MambaSpec.shapesdtypesNr<   mamba2
mamba_typenonemamba_cache_moder   num_speculative_blocksr   c                     t          d t          | j        | j                  D                       }| j        | j        |k    sJ | j        S |S )Nc              3   Z   K   | ]&\  }}t          |          t          |          z  V  'd S r>   )r   r   )r$   shaper;   s      r   r&   z,MambaSpec.page_size_bytes.<locals>.<genexpr>  sM       
 
 KK.///
 
 
 
 
 
r   )sumzipr   r   r<   )r   	page_sizes     r   r   zMambaSpec.page_size_bytes  si     
 
"%dk4;"?"?
 
 
 
 
	  ,(I5555((r   r   c                     |j         j        dk    r)|j        j        }t	          || j                  | j        z  S |j         j        dk    r| j        d| j        z   z  S | j        d| j        z   z  S )Nr(   alignrB   r'   )cache_configr   rN   rO   r
   r   r   r   )r   r   rO   s      r   r   z MambaSpec.max_memory_usage_bytes%  su    #4=='4BMt77$:NNN%6'AA'1t/J+JKK'1t/J+JKKr   )r-   r.   r/   tupler1   r2   rC   r;   r<   r   r   r   r   r3   r   r   r   r#   r   r   r   r     s         %S/3&''''%+#'cDj'''J"c""""#C###    XL* L L L L L L Lr   r   c                       e Zd ZdedefdZdS )EncoderOnlyAttentionSpecr   r   c                     dS )Nr   r#   r   s     r   r   z/EncoderOnlyAttentionSpec.max_memory_usage_bytes1  s    qr   N)r-   r.   r/   r   r1   r   r#   r   r   r   r   /  s6        *       r   r   c                   "    e Zd ZdZdedefdZdS )CrossAttentionSpeczM
    KV cache spec for cross-attention layers in encoder-decoder models.
    r   r   c                 T    |j         j        }t          || j                  | j        z  S r>   )r   max_num_encoder_input_tokensr
   r   r   )r   r   max_encoder_lens      r   r   z)CrossAttentionSpec.max_memory_usage_bytes<  s)     &6SOT_558LLLr   N)r-   r.   r/   r0   r   r1   r   r#   r   r   r   r   6  sH         M* M M M M M M Mr   r   c                   P    e Zd ZU dZedz  ed<   edee         defd            Z	dS )SinkFullAttentionSpecNsink_lenr    r   c                    t          d |D                       s
J d            t          d |D                       }t          d |D                       }t          d |D                       r
J d             | |d         j        |d         j        |d         j        |d         j        |d         j        |d         j        |d         j	        | 
                    |          | 
                    |          	  	        }|D ]O}t          t                    D ]8}t          ||j                  t          ||j                  k    s
J d	            9P|j        d
u|j        d
uz   dk    s
J d            |S )r\   c              3   @   K   | ]}t          |t                    V  d S r>   r^   r`   s     r   r&   z.SinkFullAttentionSpec.merge.<locals>.<genexpr>M  ra   r   rb   c              3   2   K   | ]}|j         	|j         V  d S r>   rd   r`   s     r   r&   z.SinkFullAttentionSpec.merge.<locals>.<genexpr>Q  re   r   c              3   2   K   | ]}|j         	|j         V  d S r>   rg   r`   s     r   r&   z.SinkFullAttentionSpec.merge.<locals>.<genexpr>T  rh   r   c              3   @   K   | ]}t          |t                    V  d S r>   rj   r`   s     r   r&   z.SinkFullAttentionSpec.merge.<locals>.<genexpr>Y  rl   r   rm   r   )	r   r9   r:   rF   r   r;   r<   rG   rH   rn   Nr'   ro   )r(   rp   rq   r   r9   r:   rF   r   r;   r<   rZ   r   r8   rr   rs   rG   rH   rt   s          r   r,   zSinkFullAttentionSpec.mergeG  s    II5IIIII 	
 	
X	
 	
I  
 
,1
 
 
 
 
  # #
 #
#
 #
 #
  
  

 LLeLLLLL 	
 	
I	
 	
L cQx*q.Ah(a,1X&(."1X611.AA!$!7!78L!M!M

 

 

  	 	DM**  tQV,,QV0L0LLLL/ MLLL
 *$6,D8
  '   r   )
r-   r.   r/   r   r1   r2   r4   r5   r   r,   r#   r   r   r   r   C  sZ         HcDj+$t* + + + + [+ + +r   r   c                       e Zd ZU dZeeef         ed<   ede	fd            Z
dede	fdZedeeef         defd            Zedeeef         dedz  fd	            ZdS )
UniformTypeKVCacheSpecsa1  
    A KV cache spec for multiple layers with the same type of attention. Here,
    same types means always need the same number of token slots. For example,
    sliding window attentions with different window sizes are not the same type
    and should not be merged into one UniformTypeKVCacheSpecs.
    kv_cache_specsr   c                 b    t          d | j                                        D                       S )Nc              3   $   K   | ]}|j         V  d S r>   )r   r`   s     r   r&   z:UniformTypeKVCacheSpecs.page_size_bytes.<locals>.<genexpr>  s%      QQD4'QQQQQQr   )r   r   valuesr   s    r   r   z'UniformTypeKVCacheSpecs.page_size_bytes  s.    QQD4G4N4N4P4PQQQQQQr   r   c                 |    t          fd| j                                        D                       }|| j        z  S )Nc              3   h   K   | ],}t          |                              |j                  V  -d S r>   )r
   r   r   )r$   r%   r   s     r   r&   zAUniformTypeKVCacheSpecs.max_memory_usage_bytes.<locals>.<genexpr>  sR       
 
 ,,[994;OPP
 
 
 
 
 
r   )maxr   r   r   )r   r   max_num_pagess    ` r   r   z.UniformTypeKVCacheSpecs.max_memory_usage_bytes  sW     
 
 
 
+2244
 
 
 
 
 t333r   c                    t          d |                                D                       }t          |          dk    rdS t          t	          |                                                    t          t                    r+t          d |                                D                       S t          t                    r+t          d |                                D                       S t          t                    r-t          fd|                                D                       S t          t                    r-t          fd|                                D                       S t          t                    r-t          fd|                                D                       S t          d	t                               )
zI
        Whether all layers have the same type of KV cache spec.
        c              3   $   K   | ]}|j         V  d S r>   r   r`   s     r   r&   z:UniformTypeKVCacheSpecs.is_uniform_type.<locals>.<genexpr>  s$      NNd$/NNNNNNr   r'   Fc              3   @   K   | ]}t          |t                    V  d S r>   r^   r`   s     r   r&   z:UniformTypeKVCacheSpecs.is_uniform_type.<locals>.<genexpr>  s>        8<
4!233     r   c              3   @   K   | ]}t          |t                    V  d S r>   )r_   r   r`   s     r   r&   z:UniformTypeKVCacheSpecs.is_uniform_type.<locals>.<genexpr>  s>        9=
4!344     r   c              3   b   K   | ])}t          |t                    o|j        j        k    V  *d S r>   )r_   r   rG   r$   r%   one_specs     r   r&   z:UniformTypeKVCacheSpecs.is_uniform_type.<locals>.<genexpr>  sX          4!233 C'8+BB     r   c              3   b   K   | ])}t          |t                    o|j        j        k    V  *d S r>   )r_   r   rH   r   s     r   r&   z:UniformTypeKVCacheSpecs.is_uniform_type.<locals>.<genexpr>  sX          4!:;; O-1NN     r   c              3   b   K   | ])}t          |t                    o|j        j        k    V  *d S r>   )r_   r   r   r   s     r   r&   z:UniformTypeKVCacheSpecs.is_uniform_type.<locals>.<genexpr>  sW          4++ S/83RR     r   z Unsupported KV cache spec type: )rp   r   rW   nextiterr_   rE   r(   r   r   r   r   r   type)r+   r   block_sizesr   s      @r   is_uniform_typez'UniformTypeKVCacheSpecs.is_uniform_type  s7   
 NNn6K6K6M6MNNNNN{a5^22445566h 122 	  @N@U@U@W@W      "455 	  AOAVAVAXAX      "344 	     +1133     
 ";<< 	     +1133     
 ),, 
	     +1133      &C4>>CC  r   Nc                     |                      |          r@t          t          |                                                    j        } | ||          S dS )z
        Return a SameTypeKVCacheSpecs object if all layers have the same type
        of KV cache spec. Return None if not.
        )r   r   N)r   r   r   r   r   )r+   r   r   s      r   
from_specsz"UniformTypeKVCacheSpecs.from_specs  sX     ~.. 	d>#8#8#:#:;;<<GJ3*^LLLL4r   )r-   r.   r/   r0   dictr   r   r2   r3   r1   r   r   r   r4   boolr   r   r   r#   r   r   r   r   v  s           k)****R R R R XR4* 4 4 4 4 4 'T#{2B-C ' ' ' ' ['R 	S+-=(> 	4$; 	 	 	 [	 	 	r   r   c                   4    e Zd ZU dZeed<   ee         ed<   dS )KVCacheTensorzP
    A class for specifying how the workers should initialize the KV cache.
    size	shared_byN)r-   r.   r/   r0   r1   r2   r5   r   r#   r   r   r   r     s7           IIICyr   r   c                   4    e Zd ZU dZee         ed<   eed<   dS )KVCacheGroupSpecz
    Represents a group of model layers that share the same KV cache block table.
    These layers are regarded as one layer in the KV cache manager.
    layer_nameskv_cache_specN)r-   r.   r/   r0   r5   r   r2   r   r#   r   r   r   r     s:           cr   r   c                   N    e Zd ZU dZeed<   	 ee         ed<   	 ee         ed<   dS )KVCacheConfigz0
    The KV cache configuration of a model.
    
num_blockskv_cache_tensorskv_cache_groupsN)	r-   r.   r/   r0   r1   r2   r5   r   r   r#   r   r   r   r     sT           OOO'=))))P*++++ r   r   )"r)   dataclassesr   r   r   mathr   rC   typing_extensionsr   vllm.configr   vllm.loggerr	   vllm.utils.math_utilsr
   vllm.utils.torch_utilsr   r-   loggerr   r8   rE   rk   r   r   r   r   r   r   r   r   r   r   r#   r   r   <module>r      s    2 2 2 2 2 2 2 2 2 2        " " " " " " " " " " " " # # # # # # & & & & & & 1 1 1 1 1 1	X		 $)' )' )' )' )' )' )' )'X $%%%
 
 
 
 
K 
 
 &%
2 $%%%`
 `
 `
 `
 `
 `
 `
 &%`
F $%%%"
 "
 "
 "
 "
( "
 "
 &%"
J $%%%H H H H H H H &%H$ $%%%N N N N N N N &%N2 $L L L L L L L L: $    }    $	M 	M 	M 	M 	M 	M 	M 	M $/ / / / /- / / /d $I I I I Ik I I IX         	 	 	 	 	 	 	 	          r   