
     `i.                         d dl mZmZ d dlmZ d dlmZ d dlmZ ddl	m
Z
  G d de          Z G d	 d
e          Z G d de          ZdS )    )ABCabstractmethod)deque)ceil)Optional   )loggerc                   D   e Zd ZU dZeed<   eeee         f         ed<   e	dedede
e         dee         fd            Zdede
e         dd	fd
Ze	dedededee         fd            Ze	dedededee         fd            Ze	dedededeeef         fd            Zd	S )CacheAllocatorzAbstract base class for cache managers. Cache managers keep track of per-request cache allocations, determine
    when a new physical block needs to be allocated and compute physical indices for reading or writing to the cache._index_block_tablen_blocks
request_idfree_blocksreturnc                     dS )zxAllocates n_blocks for a given request_id. Returns the num of blocks allocated if successful and None
        otherwise.N selfr   r   r   s       /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/generation/continuous_batching/cache_manager.pyallocate_blockszCacheAllocator.allocate_blocks   s	     	    Nc                     || j         v r1| j                             |          }|                    |           dS t          j        d| j         d|            dS )z.Frees all blocks associated with a request_id.zCacheAllocator z7 attempted to free blocks for non-existent request_id: N)r   popextendr	   warningr   )r   r   r   blocks_to_frees       r   r   zCacheAllocator.free_blocks$   st    ***!.22:>>N~.....Nr$+rrfprr    r   past_lengthquery_lengthc                     dS )zUReturns the physical indices of where to read request_id's cache in the cache tensor.Nr   r   r   r   r   s       r   get_read_indiceszCacheAllocator.get_read_indices.   	     	r   c                     dS )zVReturns the physical indices of where to write request_id's cache in the cache tensor.Nr   r!   s       r   get_write_indicesz CacheAllocator.get_write_indices3   r#   r   c                     dS )gReturns the attention type of the cache allocator and the key sequence length for the given request_id.Nr   r!   s       r   get_seqlens_kzCacheAllocator.get_seqlens_k8   r#   r   )__name__
__module____qualname____doc__int__annotations__dictstrlistr   r   r   r   r   r"   r%   tupler(   r   r   r   r   r      s        y y KKKsDI~&&&&  5QT: ZbcfZg    ^
c c
 t     3 S PS X\]`Xa    ^ C c QT Y]^aYb    ^  # S UZ[^`c[cUd    ^  r   r   c            
           e Zd ZdZdededdfdZdeded	ee         dee         fd
Z	dededede
e         fdZdededede
e         fdZdedededeeef         fdZdS )FullAttentionCacheAllocatorz3Cache manager for a group of full attention layers.index
block_sizer   Nc                 0    || _         || _        i | _        dS )zInitializes the cache manager for a group of full attention layers.
        Args:
            - index: the index of the associated layer group
            - block_size: the size of the blocks in the cache
        N)r   r6   r   )r   r5   r6   s      r   __init__z$FullAttentionCacheAllocator.__init__A   s      $r   r   r   r   c                     t                    |k     rdS || j        vr
g | j        |<   | j        |                             fdt          |          D                        |S )zAllocate blocks for a given request_id. Returns the number of blocks allocated if successful and None
        otherwise. For group of full attention layers, we always allocate the number of requested blocks.Nc              3   @   K   | ]}                                 V  d S Npopleft.0_r   s     r   	<genexpr>z>FullAttentionCacheAllocator.allocate_blocks.<locals>.<genexpr>R   s/      ,\,\q[-@-@-B-B,\,\,\,\,\,\r   )lenr   r   ranger   s      `r   r   z+FullAttentionCacheAllocator.allocate_blocksK   su     {h&&4T...,.Dj)*%,,,\,\,\,\ERZOO,\,\,\\\\r   r   r   c                    | j                             |          }|t          d|           g }t          ||z             D ]>}|| j        z  }|| j        z  }||         | j        z  |z   }	|                    |	           ?|S )zReturns the physical indices of where to read request_id's cache. For a group of full attention layers, we
        first write the new cache to the cache tensor and then read the entire cache from the beginning to the end.N!No block table found for request r   get
ValueErrorrC   r6   append
r   r   r   r   block_tablephysical_indicesi	block_idxblock_offsetphysical_indexs
             r   r"   z,FullAttentionCacheAllocator.get_read_indicesU   s     '++J77MMMNNN{\122 	4 	4AT_,It.L(3doETN##N3333r   c                 
   | j                             |          }|t          d|           g }t          |||z             D ]>}|| j        z  }|| j        z  }||         | j        z  |z   }	|                    |	           ?|S )zReturns the physical indices for writing to the cache. For a group of full attention layers, we write the new
        cache as a continuation of the existing cache for the same request.NrE   rF   rJ   s
             r   r%   z-FullAttentionCacheAllocator.get_write_indicese   s     '++J77MMMNNN{K,$>?? 	4 	4AT_,It.L(3doETN##N3333r   c                     ||z   }d|fS )r'   full_attentionr   r   r   r   r   	seqlens_ks        r   r(   z)FullAttentionCacheAllocator.get_seqlens_kt   s    ,.	**r   r)   r*   r+   r,   r-   r8   r0   r   r   r   r1   r"   r%   r2   r(   r   r   r   r4   r4   >   s,       ==c s t      5QT: ZbcfZg     3  S  PS  X\]`Xa          C  c  QT  Y]^aYb        + +# +S +UZ[^`c[cUd + + + + + +r   r4   c            
           e Zd ZdZdedededdfdZded	ed
ee         dee         fdZ	d	ededede
e         fdZd	ededede
e         fdZd	edededeeef         fdZdS )SlidingAttentionCacheAllocatorz2Cache manager for sliding window attention layers.r5   r6   sliding_windowr   Nc                     || _         || _        || _        t          | j        | j        z            | _        i | _        dS )a  Initializes the cache manager for a group of sliding window attention layers.
        Args:
            - index: the index of the associated layer group
            - block_size: the size of the blocks in the cache
            - sliding_window: the size of the sliding window
        N)r   r6   rY   r   _max_blocks_per_requestr   )r   r5   r6   rY   s       r   r8   z'SlidingAttentionCacheAllocator.__init__}   sB     $,'+D,?$/,Q'R'R$r   r   r   r   c                 R   || j         vr
g | j         |<   t          | j         |                   }|| j        k    rdS t          ||z   | j                  }||z
  }t                    |k     rdS | j         |                             fdt          |          D                        |S )a  Allocate blocks for a given request_id. Returns the number of blocks allocated if successful and None
        otherwise. For group of sliding window attention layers, we only allocate up to the point where we can fit an
        entire sliding window in the cache tensor.r   Nc              3   @   K   | ]}                                 V  d S r;   r<   r>   s     r   rA   zASlidingAttentionCacheAllocator.allocate_blocks.<locals>.<genexpr>   s/      ,c,cq[-@-@-B-B,c,c,c,c,c,cr   )r   rB   r[   minr   rC   )r   r   r   r   already_allocatedafter_allocationactual_n_blockss      `   r   r   z.SlidingAttentionCacheAllocator.allocate_blocks   s     T...,.Dj) 1* =>> <<<108;T=YZZ*->>{o--4*%,,,c,c,c,cERaLbLb,c,c,ccccr   r   r   c                    | j                             |          }|t          d|           || j        k     rdn	|| j        z  }t	          || j        dz
            }g }t          |||z             D ]H}|| j        z  }|| j        z  }	|| j        z  }
||	         | j        z  |
z   }|                    |           I|dg|z  z   S )a  Returns the physical indices of where to read request_id's cache in the cache tensor.
        For a group of sliding window attention layers, we read from the cache tensor before writing on it, because the
        new cache can overwrite the old one. To form the cache + new key / values states, we read the at most
        sliding_window - 1 cache page and then manually add the new key / values states after. Hence the -1 indices
        which indicate where to store the new key or values indices.NrE   r   r   r   rG   rH   rY   r^   rC   r6   rI   )r   r   r   r   rK   start_indexcache_lengthrL   rM   rN   rO   rP   s               r   r"   z/SlidingAttentionCacheAllocator.get_read_indices   s     '++J77MMMNNN&)<<<aa+PTPcBc;(;a(?@@{K,$>?? 	4 	4A$$AT_,It.L(3doETN##N33332$"555r   c                    | j                             |          }|t          d|           || j        z  }t	          || j                  }||z
  }g }t          |||z             D ]H}	|	| j        z  }	|	| j        z  }
|	| j        z  }||
         | j        z  |z   }|                    |           I|dk    r	dg|z  |z   }|S )aB  Returns the physical indices of where to write request_id's cache in the cache tensor. For a group of
        sliding window attention layers, we write the new cache in rolling-buffer kind of way: if we reach the end of
        the allocated physical cache, we start writing from the beginning of the physical cache again.NrE   r   rc   rd   )r   r   r   r   rK   re   rf   padding_lengthrL   rM   rN   rO   rP   s                r   r%   z0SlidingAttentionCacheAllocator.get_write_indices   s    
 '++J77MMMNNN!D$77<)<==%4{K,$>?? 	4 	4A$$AT_,It.L(3doETN##N3333A "tn47GGr   c                 @    |t          || j        dz
            z   }d|fS )r'   r   sliding_attention)r^   rY   rT   s        r   r(   z,SlidingAttentionCacheAllocator.get_seqlens_k   s)     3{D4G!4K#L#LL	"I--r   rV   r   r   r   rX   rX   z   s3       <<c s C D      5QT: ZbcfZg    &63 6S 6PS 6X\]`Xa 6 6 6 6. C  c  QT  Y]^aYb        0. .# .S .UZ[^`c[cUd . . . . . .r   rX   N)abcr   r   collectionsr   mathr   typingr   requestsr	   r   r4   rX   r   r   r   <module>rp      s    $ # # # # # # #                        $ $ $ $ $S $ $ $N9+ 9+ 9+ 9+ 9+. 9+ 9+ 9+xU. U. U. U. U.^ U. U. U. U. U.r   