
    .`i"&              	       (   d dl Z d dlmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZ d dlmZ dej        d	ed
eej        ej        ej        f         fdZ G d de
          Ze G d de                      Z G d dee                   ZdS )    N)	dataclassreplace)
VllmConfig)cdiv)AttentionBackendCommonAttentionMetadata)BaseMambaAttentionMetadata!BaseMambaAttentionMetadataBuilder)AttentionSpecquery_start_loc
chunk_sizereturnc                 >   | j         dk    s
J d            t          | d                                                   dk    s
J d            | j        }|                     t
          j                  }|dd                                         }|dd                                         }t          |d                                                   }g }g }dgt          |          z  }	t          t          ||                    D ]\  }
\  }}||k    r|}||k     rr|||z  z
  }t          |||z
            }|                    t          |                     |                    |
           t          |          dz
  |	|
<   ||z  }||k     r|rrt          j        dgt          t          j        |                    z   |t
          j                  }t          |d                                                   |k    sJ n"t          j        dg|t
          j                  }t          |          dk    r!t          j        |	|t
          j                  n t          j        d|t
          j                  }t          j        ||t
          j                  }|||fS )	aa  
    Build chunk-aligned, variable-length metadata used by Mamba2 SSD kernels.

    Given per-sequence cumulative token starts `query_start_loc` of shape [B+1]
    and a physical `chunk_size`, returns three tensors on the same device:
      - cu_chunk_seqlens:  (nchunks+1,) int32   exclusive prefix-sum of
        logical-chunk lengths (each logical chunk never crosses a sequence or
        physical-chunk boundary).
      - last_chunk_indices: (B,)       int32   index of the last logical chunk
        for each sequence (=-1 for empty sequences).
      - seq_idx_chunks:     (nchunks,) int32   sequence index for each logical
        chunk in order.

    This is intentionally lightweight and CPU-side; it mirrors the metadata
    produced by the V1 Mamba2 meta-data builder and is exported so tests
    (and other callers) can avoid duplicating the logic.
       z!query_start_loc must be 1-D [B+1]r   zquery_start_loc[0] must be 0Ndevicedtype)r   )ndimintitemr   totorchint64tolistlen	enumeratezipminappendtensorlist	itertools
accumulateint32empty)r   r   r   qsl64startsendstotal
chunk_lensseq_idx_chunkslast_chunk_indicesbseposroomtakecu_chunk_seqlenslast_chunk_indices_tseq_idx_chunks_ts                      z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/mamba2_attn.pycompute_varlen_chunk_metadatar8      s   * 1$$$&I$$$q!&&(())Q...0N...#Fu{++E3B3Z  F9Db	  !!EJ "N%'D3v;;$6s640011  	6Aq66Aggz!12DtQW%%Dc$ii(((!!!$$$$'
OOa$7q!4KC Agg  	O <C$y+J77888+
 
 
 #B',,..//588888 <F%+NNN v;;?? 	'ekJJJJ[fEK@@@ 
 |N6UUU13CCC    c                   R    e Zd Zedefd            Zeded         fd            ZdS )Mamba2AttentionBackendr   c                      dS )NMAMBA2_ATTN r>   r9   r7   get_namezMamba2AttentionBackend.get_name[   s    }r9   Mamba2AttentionMetadataBuilderc                      t           S )N)r@   r>   r9   r7   get_builder_clsz&Mamba2AttentionBackend.get_builder_cls_   s    --r9   N)__name__
__module____qualname__staticmethodstrr?   typerB   r>   r9   r7   r;   r;   Z   sf        c    \ .T"BC . . . \. . .r9   r;   c                       e Zd ZU dZeed<   dZeed<   dZe	j
        dz  ed<   dZe	j
        dz  ed<   dZe	j
        dz  ed<   dS )	Mamba2AttentionMetadataFprep_initial_statesr   r   N	seq_idx_pcu_chunk_seqlen_plast_chunk_indices_p)rC   rD   rE   rK   bool__annotations__r   r   rL   r   TensorrM   rN   r>   r9   r7   rJ   rJ   d   s          %%%%J &*Iu|d")))
 .2u|d*111 15%,-44444r9   rJ   c                        e Zd ZeZdedee         dede	j
        f fdZdede	j        de	j        d	eee         ee         ee         f         fd
Z	 ddededed	efdZ xZS )r@   kv_cache_speclayer_namesvllm_configr   c                     t                                          ||||           |j                                        }|
J d            || _        d S )Nz@chunk_size needs to be set in the model config for Mamba2 models)super__init__model_configget_mamba_chunk_sizer   )selfrS   rT   rU   r   r   	__class__s         r7   rX   z'Mamba2AttentionMetadataBuilder.__init__z   s[     	[&III -BBDD
%%N &%%  *r9   num_prefillsnum_computed_tokens_p_cpuquery_start_loc_p_cpur   c                 0   g }g }g }d}t          |          D ]d}||                                         }	||dz                                            ||                                         z
  }
|	| j        z  dk    rd|                    |           |                    |           t	          |	| j                  | j        z  |	z
  }t          ||
          }||z  }|
|z  }
t	          |
| j                  }t          |          D ]K}|                    |           |                    |           t          | j        |
          }||z  }|
|z  }
L|
dk    sJ |                    t          |          dz
             f|                    |           |||fS )a^  
        Compute chunk-specific metadata for Mamba2.

        The code below carefully constructs the chunks such that:
        1. Chunks contain tokens from a *single* sequence only.
        2. For every sequence, we are guaranteed that we can
           retrieve the mamba state *every* chunk_size tokens.
        Constraint (1) dramatically simplifies the mamba2 kernels.
        Constraint (2) dramatically simplifies the implementation
        of prefix caching for mamba2 (wip). We need to take care
        of the interaction with chunked prefill in order to
        satisfy constraint (2).
        r   r   )ranger   r   r    r   r   r   )r[   r]   r^   r_   cu_chunk_seqlenseq_idxr-   
seqlen_posreq_idxthis_num_computedthis_new_tokens	chunk_lenn_chunkschunks                 r7   _compute_chunk_metadataz6Mamba2AttentionMetadataBuilder._compute_chunk_metadata   s   ( 
\** 	@ 	@G 9' B G G I I%gk27799'055778  !4?2a77w'''&&z222 *DO<<tN'( 
  	?;;	i'
9,OT_==Hx - -w'''&&z222AA	i'
9,"a''''%%c/&:&:Q&>????z***);;;r9   Fcommon_prefix_lencommon_attn_metadata
fast_buildc                    |                      |          }d }d }d }d}|j        dk    r.|j        +t          j        |j                                                  nd}|j        }	|j        }
|j        }|                                	                                }||	|
z
  |	         }|j
        |
 dz
  d          |z
  }|                     |
||          \  }}}t          j        ||j        j        t          j                  }t          j        ||j        j        t          j                  }t          j        ||j        j        t          j                  }t!          ||| j        |||          S )NFr   r   r   )rK   r   rL   rM   rN   )_compute_common_metadatar]   has_initial_states_pr   anyr   num_reqsnum_decode_tokenscompute_num_computed_tokenscpuquery_start_loc_cpurk   	as_tensorr   r   r%   r   r   )r[   rl   rm   rn   commonrL   rM   rN   rK   rs   r]   rt   num_computed_tokens_cpur^   r_   rb   rc   r-   s                     r7   buildz$Mamba2AttentionMetadataBuilder.build   s    ../CDD	 ## "" .: 	&566;;===   H!.L & 8 %@@BBFFHH $ )@<'(2)% %8,9J9L9LM#$ "
 <@;W;W)%< <8OW&8 +;Bk  I
 !&+;Bk! ! !
 $)?"+;Bk$ $ $   3/!5
 
 
 	
r9   )F)rC   rD   rE   rJ   metadata_clsr   r"   rG   r   r   r   rX   r   rQ   tuplerk   r   rO   r{   __classcell__)r\   s   @r7   r@   r@   u   s        +L*$* #Y*  	*
 * * * * * *<<<< $)<<<  %|	<<
 
tCy$s)T#Y.	/<< << << <<D !	A
 A
A
 6A
 	A

 
!A
 A
 A
 A
 A
 A
 A
 A
r9   r@   )r#   dataclassesr   r   r   vllm.configr   vllm.utils.math_utilsr   vllm.v1.attention.backendr   r   %vllm.v1.attention.backends.mamba_attnr	   r
   vllm.v1.kv_cache_interfacer   rQ   r   r}   r8   r;   rJ   r@   r>   r9   r7   <module>r      s       * * * * * * * *  " " " " " " & & & & & &               5 4 4 4 4 4BD\BDBD 5<u|34BD BD BD BDJ. . . . .- . . . 5 5 5 5 58 5 5 5 R
 R
 R
 R
 R
%&=>R
 R
 R
 R
 R
r9   