
    .`i:                        U d dl Z d dlmZ d dlmZmZmZmZ d dlm	Z	m
Z
mZmZmZ d dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZmZ e	rd d	lmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+m,Z,  e$e-          Z.ed         Z/da0e/dz  e1d<   dZ2de3de4fdZ5e j6        d             Z7de/fdZ8e G d d                      Z9dede:e3         de;d         de<e3e9f         fdZ=de<e3e9f         de9fd Z>	 dRd!e?d"e+d#e?de@e+eejA        gejA        f         f         fd$ZBd"e+de+fd%ZC	 dSd"e+d'e?de@e?e?e?e?e?e?f         fd(ZD	 	 dTd"e+d'e?d*e4de@e?e?e?e?f         fd+ZE	 dRd,ejA        d-e?d.e?de:e@e?e?f                  fd/ZF	 dSd0d1d2d3d'e?de4fd4ZGd5ejA        d6e?dejA        fd7ZHd8ejA        dejA        fd9ZId:e3d;e
d<e:e@e3e
e
f                  de
fd=ZJe G d> d?e                      ZKd@e3dAe;e(         de;e(         fdBZLdCejA        dDejM        fdEZN	 	 	 dUdFejA        dGe?dHe?dz  dIe?dejA        f
dJZOd"e+dKejA        dLejA        de+fdMZPdNejA        dFejA        dOedPe3dejA        f
dQZQdS )V    N)Callable)	dataclassfieldfieldsmake_dataclass)TYPE_CHECKINGAnyLiteralProtocolget_args)runtime_checkable)
VllmConfigget_layers_from_vllm_config)cdiv)KVCacheSpec	MambaSpec)SchedulerOutput)
InputBatch)get_kv_connector_cache_layout)init_logger)AttentionLayerBase)AttentionBackendAttentionImplAttentionMetadataCommonAttentionMetadatasubclass_attention_backend)NHDHND_KV_CACHE_LAYOUT_OVERRIDEvaluereturnc                 .    | t          t                    v S N)r   KVCacheLayoutType)r!   s    t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/utils.pyis_valid_kv_cache_layoutr'   /   s    H.////    c                      d } t           $t           } t                              d|            | S t          j        } | t                      } n,t          |           sJ t                              d|            | S )NzM`_KV_CACHE_LAYOUT_OVERRIDE` variable detected. Setting KV cache layout to %s.zT`VLLM_KV_CACHE_LAYOUT` environment variable detected. Setting KV cache layout to %s.)r   logger	info_onceenvsVLLM_KV_CACHE_LAYOUTr   r'   cache_layouts    r&   get_kv_cache_layoutr0   3   s    
 26L ,0-	
 	
 	

  ,L466'555557	
 	
 	

 r(   r/   c                 
    | a d S r$   )r   r.   s    r&   set_kv_cache_layoutr2   Q   s     ,r(   c                       e Zd ZU dZeed<   edz  ed<   eed<   dZeed<    e	dd          Z
edz  ed	<    e	dd          Zedz  ed
<   dS )PerLayerParametersa  
    Currently, FlashInfer backend only support models in which all layers share
    the same values for the following hyperparameters. Should not be used for
    trtllm-gen backend since it supports different values for the following
    hyperparameters.
    window_leftNlogits_soft_capsm_scaleF	has_sinks)defaultcomparehas_same_window_leftshas_same_all_params)__name__
__module____qualname____doc__int__annotations__floatr8   boolr   r;   r<    r(   r&   r4   r4   V   s           T\!!!OOOIt).tU)K)K)K4$;KKK',uT5'I'I'IIIIIIr(   r4   vllm_configlayer_namescls_r   c                 V   t          | t          |          }i }|                                D ]{\  }}|j        }t	          ||          sJ t          |dd          }||d         nd}	t          |dd          }
|j        }t          |dd          du}t          |	|
||          ||<   ||S )zc
    Scan layers in `layer_names` and determine some hyperparameters
    to use during `plan`.
    sliding_windowNr   r    r6   sinks)r   r   itemsimpl
isinstancegetattrscaler4   )rF   rG   rH   layersper_layer_paramskeylayerrM   window_sizer5   r6   r7   r8   s                r&   get_per_layer_parametersrV   h   s     ) F
 79llnn 
 

Uz$%%%%% d$4d;;(3(?k!nnR!$(94@@:D'400<	 2(I!
 !
 r(   rR   c                    t          |           dk    s
J d            t          |                                           }|d         t          fd|D                       _        t          fd|D                       _        S )ad  
    Currently, FlashInfer backend other than trtllm-gen
    only support models in which all layers share
    the same values for the following hyperparameters:
    - `window_left`
    - `logits_soft_cap`
    - `sm_scale`

    So this function asserts that all layers share the same values for these
    hyperparameters and returns the global values.
    r   z'No attention layers found in the model.c              3   8   K   | ]}|j         j         k    V  d S r$   )r5   .0paramsglobal_paramss     r&   	<genexpr>z/infer_global_hyperparameters.<locals>.<genexpr>   s?       . .<Bm77. . . . . .r(   c              3   $   K   | ]
}|k    V  d S r$   rE   rY   s     r&   r]   z/infer_global_hyperparameters.<locals>.<genexpr>   s9       , ,$*-, , , , , ,r(   )lenlistvaluesallr;   r<   )rR   
param_setsr\   s     @r&   infer_global_hyperparametersrd      s       1$$$&O$$$&--//00JqMM*- . . . .FP. . . + +M' ), , , , ,.8, , , ) )M% r(   attn_chunk_sizecommon_attn_metadata
block_sizec                     |j                                         }|j                                        }|j        }|j        j        }|dd          |d d         z
  }|j        d         }t          j        | ||z
  | z  z
  |          	                    t          j
                  }	| ||  z  z   }
dt          ||	z
  |           z   }t          j        |          }|d          t          j        ||z
  |          }t          j         t          j
                  |z
  }t          j        ||          |z
  dz
  }t          j        ||	z
  |          }|	||dk    <   t          j        || |dz
  z  z
  |           |dk             ||dk    <   t          j         dz   t          j
                  }t          j        ||dd                     d|d<   t          j        |d         | t          j
                  }|
||dz
  <   ||z
  }t          j        ||          || z  t          j        |
|          z   z
  }||z  }| |z  dk    sJ d|  d|             | |z  }|d d d f         t          j        |t          j
                  z   }|                    d                              |j        d         dz
            }t          j        t          j        |t          j
                  ||z            }t'          j        |          t'          j        |           fd	} ||          }t'          j        |          }t'          j        |          }t+          |                                          }t/          ||                    |d
          |                    |d
          t3          |          |j        |                                |||j        d
|t'          j        |                    |fS )N   r    r   dtype)outzattn_chunk_size z  is not divisible by block_size )maxc                 @    | f                              d          S )Nr    )view)block_tablebatch_indices_torchblock_indices_torchvirtual_batchess    r&   <lambda>z6make_local_attention_virtual_batches.<locals>.<lambda>Q  s'    ;00,
d?B r(   T)devicenon_blocking)query_start_loc_cpuquery_start_locseq_lensnum_reqsnum_actual_tokensmax_query_lenmax_seq_lenblock_table_tensorslot_mappingcausal_seq_lens_cpu_num_computed_tokens_cpu)rw   numpyseq_lens_cpur~   rx   ru   shapenpminimumastypeint32r   cumsumrepeatarangeemptyfullreshapecliptorch
from_numpyrA   rm   r   tor_   r{   r   )!re   rf   rg   query_start_loc_npseq_lens_nprp   ru   	q_seqlensactual_batch_sizeq_tokens_in_first_blocktokens_in_last_blocklocal_blockscu_num_blocksblock_offsetsr   rarangeseqlens_q_localcu_seqlens_q_localseqlens_k_localnum_computed_tokens_localk_seqstarts_absoluteblock_startspages_per_local_batchblock_indicesbatch_indicesmake_block_tableblock_table_localrw   r   r}   rq   rr   rs   s!                                 @@@r&   $make_local_attention_virtual_batchesr      sO   
 .AGGII&399;;K&9K!18F"122&);CRC)@@I#)!, !jK)3FG fRX  +k_<L.LMtI(??QQQL Il++M#B'OIml:LIIMYbh777-GFil33f<q@G i	,C C\RRO#:OFaK "$*/VaZ88/# #qj#OFQJ
 /A"5RXFFFIo#5abb#9::::q gmB/QQQO)=OMA%& // A9[,??/!BI.BL$Q$QQ (:5LZ'1,,,X?XXJXX -,, ,z9( !D)BIRX- - - M "))"--22{7H7Ka7O2PPMI
	#28444,, M  *=99*=99            )(55*+=>>#O44Ll&&(())K"/+..f4.PPTBB\"".@%))++,)6"!&!12K!L!L    r(   c                    | j         dk    r| S | j        J | j        J | j        }| j        }|d |         }| j        }| j        }t          j        ||dd          d          }t          j        ||          }t          j        |dz   |j	        |j
                  }d|d<   t          j        |d          |dd <   t          |                                                                          }	t          |                                                                          }
t!          ||                    dd	          | j        ||
|	| j        | j        | j        d| j        | j        
          } | S )Nri   T)right)	minlength)ru   rk   r   dimcpu)rv   )rx   rw   ry   rz   r{   r|   r}   r~   r   r   r   r   )r|   logits_indices_paddednum_logits_indicesrz   rx   r   	bucketizebincountr   ru   rk   r   rA   rm   itemsumr   r   ry   r}   r~   r   r   r   )rf   r   r   logits_indicesrz   rx   request_idsnum_decode_tokensdecode_query_start_locdecode_max_query_lentotal_num_decode_tokenss              r&   1make_kv_sharing_fast_prefill_common_attn_metadatar   j  s    )Q.. $#5AAA2>>>0F-@*+>,>+>?N#,H*:O /./!""2ETRRRK {hGGG #[1_3?;P   !"1!&.?Q!G!G!G12204466;;==>>!"3"7"7"9"9">">"@"@AA2.255e$5OO%.1*(4/B)6*8!5!N    r(   ri   decode_thresholdc                    | j         }| j        }| j        }| j        }| j        }||k    r|dd|ddfS |dd         |dd         z
  }||k    }||k    |z  }	|                                                    d                                          }
|	                                                    d                                          }|
}||
                                         }t          j	        |          s|dd|ddfS ||z
  }||z
  }t          j	        |	          s||d||dfS ||z
  }||z
  }|||         z
  }||z
  }||||||fS )a  
    Assuming a reordered batch, finds the boundary between prefill and decode
    requests.

    Args:
        common_attn_metadata: CommonAttentionMetadata object containing the
            batch metadata.
        decode_threshold: The maximum query length to be considered a decode.

    Returns:
        num_decodes: The number of decode requests.
        num_extends: The number of extend requests.
        num_prefills: The number of prefill requests.
        num_decode_tokens: The number of tokens in the decode requests.
        num_extend_tokens: The number of tokens in the extend requests.
        num_prefill_tokens: The number of tokens in the prefill requests.
    r   ri   Nr    r   )
r|   rz   r{   rw   r   rA   argmaxr   r   any)rf   r   r|   rz   
num_tokensrx   ry   
query_lensis_prefill_or_extend
is_prefillfirst_extendfirst_prefillnum_decodesr   num_prefills_or_extendsnum_prefill_or_extend_tokensnum_extendsnum_prefillsnum_prefill_tokensnum_extend_tokenss                       r&   "split_decodes_prefills_and_extendsr     s   * )6M#,H%7J*>O#0H(((Az1a// $ss';;J%(88j(,@@J'++--444<<AACCLNN$$+++3388::MK'5::<<9)** <Q#4a;;&4#-0A#A 9Z   
#(
 	
  +-Km+L#om&DD47II r(   Frequire_uniformc                    | j         }| j        }| j        }| j        }||k    r|r|dk    r|d|dfS |dd         |dd         z
  }|d                                         |k    rd|d|fS |rQt          j        ||d         k    |dk    z            r||d         z  |k    s
J d            |d|dfS ||d         k    }n||k    }t          j        |          s|d|dfS |                                	                    d                                          }	t          j        |d|	         |k              sJ |	}
||
z
  }||	                                         }||z
  }|
|||fS )a  
    Assuming a reordered batch, finds the boundary between prefill and decode
    requests.

    Args:
        common_attn_metadata: CommonAttentionMetadata object containing the
            batch metadata.
        decode_threshold: The maximum query length to be considered a decode.
        require_uniform: If True, requires that all decode requests have the
            same query length. When set, some queries may be considered prefills
            even if they are <= decode_threshold, in order to ensure uniformity.

    Returns:
        num_decodes: The number of decode requests.
        num_prefills: The number of prefill requests.
        num_decode_tokens: The number of tokens in the decode requests.
        num_prefill_tokens: The number of tokens in the prefill requests.
    ri   r   Nr    ztokens not padded correctlyr   )
r|   rz   r{   rw   r   r   rb   r   rA   r   )rf   r   r   r|   rz   r   rx   r   r   r   r   r   r   r   s                 r&   split_decodes_and_prefillsr     s   . )6M#,H%7J*>O((( )/144J)) $ss';;J!}...(Az)) 	3 9jJqM1jAoFGG 	.jm+z999;X999Q
A--:a=0

"22
9Z   *J))NN$$+++3388::M9Z/3CCDDDDDKk)L'6;;==#&77'8:LMMr(   r   workspace_sizerequest_offsetc                    g }dt          |           }}t          j        | |k                                              sJ ||k     r|d}}||k     rV|| |                                         x}z   |k    r3||z  }|dz  }||k     r#|| |                                         x}z   |k    3|                    ||z   ||z   f           ||k     |S )a  
    Split the prefill requests into chunks such that the total sequence length
    of each chunk is less than or equal to the workspace size.

    Args:
        seq_lens_cpu: The sequence lengths of the prefill requests on CPU.
        workspace_size: The maximum workspace size (in tokens) per chunk.
        request_offset: The offset to add to the request indices.
    Returns:
        A list of tuples of (reqs_start, reqs_end) representing chunk boundaries.
    r   ri   )r_   r   rb   r   append)	r   r   r   chunk_boundsinstartchunk_totalss	            r&   split_prefill_chunksr   %  s     Lc,qA9\^34499;;;;;
a%%{!ee\!_-A-A-C-C(CDWW1KFA !ee\!_-A-A-C-C(CDWW 	U^3Q5GHIII a%% r(   input_batchr   scheduler_outputr   c                    t          | j                  }fd| j        D             }t          j        |          }| j        d|         }|dk    }||k    | z  }||k    | z  }	t          j        |j        t          j                  }
d|
|	<   d|
|<   t          |	                                          }t          |		                                          }t          j        |t          j                  }d||||z   <   d|||z   d<   |
|k    }|
                                sdS t          j        |          d         }t          j        |
|         d	          }||         }d
 t          ||          D             }|D ]I}||         }||k    r9|                     ||           |                    ||          }|||<   |}||k    9JdS )z
    Reorders the batch to split into prefill and decode requests; places all
    requests with <= decode_threshold tokens at the front of the batch.

    Returns:
        True if the batch was modified, False otherwise.
    c                 *    g | ]}j         |         S rE   )num_scheduled_tokens)rZ   idr   s     r&   
<listcomp>z?reorder_batch_to_split_decodes_and_prefills.<locals>.<listcomp>U  s/       68-b1  r(   Nr   rj   ri      Fstable)kindc                 N    i | ]"\  }}t          |          t          |          #S rE   )rA   )rZ   srcdsts      r&   
<dictcomp>z?reorder_batch_to_split_decodes_and_prefills.<locals>.<dictcomp>u  s*    VVV83CHHc#hhVVVr(   T)r_   req_idsr   arraynum_computed_tokens_cpuzerosr   r   rA   r   r   whereargsortzipswap_statesget)r   r   r   rz   r   num_scheduled_tokens_npnum_computed_tokens_npr   	is_decode	is_extendreq_regionsr   r   target_regions
needs_swaporig_indicessorted_ordersrc_indicessrc_dest_mapr   r   next_dsts    `                    r&   +reorder_batch_to_split_decodes_and_prefillsr   @  s   ( ;&''H   <G<O   !h';<<(@(K'1,J(,<<*MI(+;;LI (9?"(;;;KK	K
immoo&&Kimmoo&&KXhbh777N>?N;{!::;23N;,../.J>> u 8J''*L:k*5HEEEL|,KVVs;7U7UVVVL  3Sjj##C---#''S11H #LC Sjj 4r(   query
batch_sizec                 ,   |                                  dk    s J d|                                   d            | j        d         }| j        d         }| j        d         }||z  dk    sJ d|d|            ||z  }|                     ||||          S )	z
    Reshapes the query tensor for the specified batch size, so that
    it has shape (batch_size, seq_len, num_heads, head_dim).
       zquery must be 3D, got Dr   ri   r   ztotal_tokens=z  is not divisible by batch_size=r   r   ro   )r  r  total_tokens	num_headshead_dimseq_lens         r&   reshape_query_for_spec_decoder    s    
 99;;!DeiikkDDD;q>LAI{1~H*$)))<<<<z<< *)) j(G::j'9h???r(   attn_outputc                 6   |                                  dk    r| S |                                  dk    s J d|                                   d            | j        d         | j        d         z  }|                     || j        d         | j        d                   S )zo
    Reshapes the attention output tensor, so that
    the batch_size and seq_len dimensions are combined.
    r     zattn_output must be 4D, got r  r   ri   r   r  )r  r  s     r&   #reshape_attn_output_for_spec_decoder    s    
 A??!!!#V+//BSBS#V#V#V!!!$Q'+*;A*>>LL+*;A*>@QRS@TUUUr(   name_prefixmetadata_clsr   c                 @    | |j         z   }t          |||f          }|S )zH
    Return a new subclass of `metadata_cls` with additional fields
    )bases)r=   r   )r  r  r   nameWrappeds        r&   subclass_attention_metadatar    s,     l33DT6,AAAGNr(   c                   B    e Zd ZU dZej        dz  ed<   dZedz  ed<   dS )KVSharingFastPrefillMetadataNr   r   )	r=   r>   r?   r   r   TensorrB   r   rA   rE   r(   r&   r  r    s?         155<$.555%)d
)))))r(   r  prefixunderlying_attn_backendc                 n    |                                 } G d d|          }t          | ||          }|S )Nc            	       4     e Zd Z	 ddedededef fdZ xZS )Gcreate_fast_prefill_custom_backend.<locals>.FastPrefillAttentionBuilderFcommon_prefix_lenrf   
fast_buildr"   c                     t          |          }t                                          |||          } G d d|j        t                    } |||          S )Nc                       e Zd Zd ZdS )|create_fast_prefill_custom_backend.<locals>.FastPrefillAttentionBuilder.build.<locals>.KVSharingFastPrefillAttentionMetadatac           	          t          |j                  D ]+}t          | |j        t	          ||j                             ,|j        | _        |j        | _        d S r$   )r   	__class__setattrr  rO   r   r   )selfmetadatarf   _fields       r&   __init__zcreate_fast_prefill_custom_backend.<locals>.FastPrefillAttentionBuilder.build.<locals>.KVSharingFastPrefillAttentionMetadata.__init__  se    "();"<"< S Sfk78V[3Q3QRRRR -B . /C.UD+++r(   N)r=   r>   r?   r*  rE   r(   r&   %KVSharingFastPrefillAttentionMetadatar#    s(        V V V V Vr(   r+  )r   superbuildr%  r  )r'  r  rf   r   new_common_attn_metadatar(  r+  r%  s          r&   r-  zMcreate_fast_prefill_custom_backend.<locals>.FastPrefillAttentionBuilder.build  s     BBVWW % ww}}!#;Z HV V V V V",V V V 98CWXXXr(   )F)	r=   r>   r?   rA   r   rD   r   r-  __classcell__)r%  s   @r&   FastPrefillAttentionBuilderr    s|        
  %		Y 	Y"	Y #:	Y 		Y
 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Yr(   r0  )r  attention_backend_clsbuilder_cls)get_builder_clsr   )r  r  underlying_builderr0  attn_backends        r&   "create_fast_prefill_custom_backendr6    sp     1@@BBY Y Y Y Y&8 Y Y Y< .5/  L r(   query_start_loc_p_cpuru   c          
         | j         j        dk    sJ |                                 }i }d }d }dD ]L}| |z   }i ||<   |||         d<   |                                                                ||         d<   t          j        t          j        t          j	        t          |                    |                    }|||         d<   t          ||         d                   }	|	||         d<   t          d|	          dz  }
g }t          |          D ]'\  }}|                    t          |                     (t          j        |t
          j        	          }|||         d
<   |Qt          j        |
ft$          t
          j        |          }t          j        |
ft$          t
          j        |          }nr|                                |
k     rZ|                    |
                              t$                     |                    |
                              t$                     |d|	                             |           |d|	                             |           |||         d<   |||         d<   N|||fS )Nr   )   numstotmlist	mlist_leni   r   rj   
offsetlistrk   ru   r   	batch_ptrtoken_chunk_offset_ptr)ru   typediffr   r   r   r   r   r   r   r_   rm   	enumerateextendrangetensorr   r   PAD_SLOT_IDnelementresize_fill_copy_)r7  ru   seqlens	nums_dictr@  rA  BLOCK_Mr:  r<  r=  MAX_NUM_PROGRAMSr>  idxnums                 r&   compute_causal_conv1d_metadatarS    s    !',5555#((**GII! $N $NW$%	'%)	'6"$(HHJJOO$5$5	'5! 29SYY+?+?!F!FGG&+	'7#	'*7344	*3	';'tY//!3
!$ 	* 	*HCeCjj))))\*EK@@@
+5	'<(
!#[F  I &+Z!#[F& & &"" !!##&666!!"23399+FFF&..$ %$$$!I+$$U+++iK	

%



*3	';'7M	'344i!777r(   ry   dcp_sizedcp_rankcp_kv_cache_interleave_sizec                 P   |                      d          }|Nt          j        |t          j        | j                                      d                              |d          }n(t          j        |ggt          j        | j                  }|                     t          j                                      d                              d|j	        d                   }||z  |z  |z  }|||z  z
  }t          j
        |||z  z
  d|          }||z   }	|	                    d          S )zWhile using dcp, kv_cache size stored on each rank may be different,
    use this function to calculate split decode seq_lens of each dcp rank.
    Only consider dcp now, we can extend the case of cp based on this.
    r   Nr?  ri   r    )sizer   r   r   ru   	unsqueezer   rG  r   r   r   squeeze)
ry   rT  rU  rV  num_requestsrank_offsetsseq_lens_tiledbase	remainderdcp_local_seq_lenss
             r&   get_dcp_local_seq_lensra    s3    ==##LLX_MMMYq\\VL!$$ 	 |ZLHO
 
 
 	EK  **2..55a9KA9NOO  	&	'	 &	& 	 0I
L#>>>	# I
 	)%%a(((r(   r   new_slot_mappingc           	      b   | }|j         |dt          |j                            z   }|j        t          j        t          |j                  t          j                  z   }|                    |||j        dz   |j        |	                                z   |j
        dz   |j        dz   |          }|S )a?  
    Creates a new CommonAttentionMetadata with all query lengths increased by 1.
    Also all seq lens are increased by 1.
    This is useful e.g. in speculative decoding with draft models, where we
    extend each sequence by 1 token.
    The slot mapping is computed externally, as it requires more information.
    Nrj   ri   )rx   rw   ry   r{   r|   r}   r   )rx   r_   rw   r   r   r   replacery   r{   r  r|   r}   )rf   r   rb  cadnew_query_start_locnew_query_start_loc_cpunew_cads          r&   extend_all_queries_by_1ri  <  s     C-7QS=P9Q9Q7Q0RR!5C#$$EK9 9 9  kk+3!/#..2B2BB'!+Oa'%  
 
G Nr(   rp   kv_cache_specmamba_cache_modec                     |dv r| S t          |t                    sJ t          j        |dz
  |j        z  d          }t          j        d|j        z   | j                  }|                    d          |z   }t          j	        | d|          S )an  
    Get the block table tensor for mamba kernels from the input
    common_attn_metadata.block_table_tensor given different mamba cache modes.

    - "all":   input  (#requests, cdiv(max_model_len, block_size));
               output (#requests, cdiv(max_model_len, block_size)).

    - "none":  input  (#requests, 1 + num_speculative_blocks);
               output (#requests, 1 + num_speculative_blocks).

    - "align": input  (#requests, cdiv(max_model_len, block_size));
               output (#requests, 1 + num_speculative_blocks), which are the last
               1 + num_speculative_blocks of each request.
    )rb   noneri   r   )min)ru   )
rN   r   r   clamprg   r   num_speculative_blocksru   rY  gather)rp   ry   rj  rk  start_indicesoffsetsindices_to_gathers          r&   mamba_get_block_table_tensorru  \  s    ( ?**-33333 \m66
 
 
 ,44[=O
 
 
 *33A66@|K,=>>>r(   )r   )ri   )ri   F)ri   Nri   )R	functoolscollections.abcr   dataclassesr   r   r   r   typingr   r	   r
   r   r   r   r   r   typing_extensionsr   vllm.configr   r   vllm.utils.math_utilsr   vllm.v1.kv_cache_interfacer   r   vllm.v1.core.sched.outputr   vllm.v1.worker.gpu_input_batchr   	vllm.envsr,   /vllm.distributed.kv_transfer.kv_connector.utilsr   vllm.loggerr   /vllm.model_executor.layers.attention_layer_baser   vllm.v1.attention.backendr   r   r   r   r   r=   r*   r%   r   rB   rH  strrD   r'   	lru_cacher0   r2   r4   r`   rB  dictrV   rd   rA   tupler  r   r   r   r   r   r   r  r  r  r  r6  ru   rS  ra  ri  ru  rE   r(   r&   <module>r     s        $ $ $ $ $ $ @ @ @ @ @ @ @ @ @ @ @ @                   / / / / / / ? ? ? ? ? ? ? ? & & & & & & = = = = = = = = :999999999999            $ # # # # # N N N N N N              
X		L) 6: ,t3 : : :0C 0D 0 0 0 0   :-&7 - - - -
 J J J J J J J J"*.s);?;P	#!
!"   B3 223   j L LL1L L "Hel^U\-I$JJK	L L L L^8 18 8  8  8  8 z @ @1@@ 3S#sC'(@ @ @ @J !:N :N1:N:N :N 3S#	:N :N :N :N| LM ,03EH	%S/   < @ @@'@ @ 
	@ @ @ @F@ @3 @5< @ @ @ @ 
VU\ 
Vel 
V 
V 
V 
V


 sC}%&
 		
 
 
 
 * * * * *8 * * *
**!"23* 

* * * *Z18 <18 L18 18 18 18l '(	%) %)l%)%) Dj%) "%	%)
 \%) %) %) %)P1L l 	   @"?"?l"? "? 	"?
 \"? "? "? "? "? "?r(   