
    .`ij                     (   d Z ddlZddlmZ ddlmZ ddlmZ ddlZddl	Zddl
mc mZ ddlmZmZmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z,  ee-          Z.dej/        j0        _1         ej2        edd          Z3 ej2        ed          Z4dej5        dej5        fdZ6dej5        de7de7fdZ8 G d de%          Z9dej5        dej5        d e7d!e7dej5        f
d"Z:d#dd#d$dej5        d%e7de7d&e7d'e7dej5        fd(Z;d)ej5        d*ej5        d+ej5        d,ej5        fd-Z<e G d. d/                      Z= G d0 d1e'e=                   Z> G d2 d3e&          Z?d4e@deAeBe7e@z  f         fd5ZCdS )6z#Attention layer with FlexAttention.    N)	dataclass)cached_property)ClassVar)	BlockMask_mask_mod_signature_score_mod_signature	and_maskscreate_block_maskflex_attentionor_masks)
VllmConfig)
CacheDType)init_logger)vllm_is_batch_invariant)current_platform)cdiv)is_torch_equal_or_newer)AttentionBackendAttentionImplAttentionMetadataBuilderAttentionTypeCommonAttentionMetadatais_quantized_kv_cache)AttentionSpec   Tzreduce-overhead)	fullgraphmode)r   offsetsreturnc                     | j         }| dd          | d d         z
  }t          j        t          j        t	          |          |t          j                  |          S )N   devicedtype)r$   torchrepeat_interleavearangelenint32)r   r$   countss      }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/flex_attention.py_offsets_to_doc_ids_tensorr-   3   sV    ^FQRR[73B3<'F"S[[u{CCCV      xmultipledimc                 >   || j         |         |z  z
  |z  }|dk    r| S |dk    r|n	| j        |z   }g }t          | j        dz
  |dz
  d          D ]7}||k    r|                    d|g            |                    ddg           8t	          j        | |dd          S )Nr   r!   r"   constant)r   value)shapendimrangeextendFpad)r/   r0   r1   
differencepad_listis         r,   pad_to_multipler>   ;   s    agclX56(BJQ##qv|CH16A:sQw++ $ $88OOQ
O,,,,OOQF####5H:Q7777r.   c                      e Zd ZU dZeed<   ej        ej        ej	        gZ
eeej                          ed<   ddgZeee                  ed<   edefd            Zed	edefd
            Zedefd            Zeded         fd            Ze	 ddedededededeedf         fd            Zeded         fd            Zedefd            Zedee         fd            ZdS )FlexAttentionBackendTaccept_output_buffersupported_dtypesautobfloat16supported_kv_cache_dtypesr   c                      dS )NFLEX_ATTENTION rH   r.   r,   get_namezFlexAttentionBackend.get_nameU   s    r.   	attn_typec                 6    |t           j        t           j        fv S )z?FlexAttention supports both decoder and encoder-only attention.)r   DECODERENCODER_ONLY)clsrJ   s     r,   supports_attn_typez'FlexAttentionBackend.supports_attn_typeY   s     ]2M4NOOOr.   c                     dS )z7FlexAttention supports full attention for image tokens.TrH   rN   s    r,   supports_mm_prefixz'FlexAttentionBackend.supports_mm_prefix^   s	     tr.   FlexAttentionImplc                      t           S N)rS   rH   r.   r,   get_impl_clsz!FlexAttentionBackend.get_impl_clsc   s      r.   
num_blocks
block_sizenum_kv_heads	head_sizecache_dtype_str.c                     d| |||fS )N   rH   )rW   rX   rY   rZ   r[   s        r,   get_kv_cache_shapez'FlexAttentionBackend.get_kv_cache_shapeg   s     :z<CCr.   FlexAttentionMetadataBuilderc                      t           S rU   )r_   rH   r.   r,   get_builder_clsz$FlexAttentionBackend.get_builder_clsq   s    ++r.   c                      dS NFrH   )argskwargss     r,   use_cascade_attentionz*FlexAttentionBackend.use_cascade_attentionu   s    ur.   c                     g S rU   rH   rQ   s    r,   get_supported_head_sizesz-FlexAttentionBackend.get_supported_head_sizesy   s    	r.   N)rC   )__name__
__module____qualname__rA   bool__annotations__r&   float16rD   float32rB   r   listr%   rE   r   staticmethodstrrI   classmethodrO   rR   typerV   inttupler^   ra   rf   rh   rH   r.   r,   r@   r@   L   s        !%$%%%5htEK01   
 >DZ<PxZ(89PPP c       \  P3 P4 P P P [P 4    [ !$23 ! ! ! \!   &D DDD D 	D
 D 
sCxD D D \D ,T"@A , , , \, $    \ c    [  r.   r@   block_tableseq_lensrX   total_blocksc                    | j         \  }}| j        }t          j        ||fdt          j        |          }t          ||          }t          j        ||          dddf         |dddf         k     }	t          j        |	| d          }
t          j        |	t          j        ||          dddf         d          }|                    d|
	                    t          j
                  |d           d|dddf<   |S )u:  
    Creates an inverse mapping from physical block locations to logical indices.

    The original block_table maps from logical blocks to physical locations:

    Logical to Physical (Original block_table):
    ┌───────────────────────────────────────────┐
    │ Request 0:                                │
    │                                           │
    │ Logical Blocks:  0  1  2  3  4  5  6  7   │
    │                  │  │  │  │  │  │  │  │   │
    │                  v  v  v  v  v  v  v  v   │
    │ Physical Blocks: 3  5  1  7  4  2  0  6   │
    └───────────────────────────────────────────┘

    This function creates the inverse mapping:

    Physical to Logical (Inverse mapping):
    ┌───────────────────────────────────────────┐
    │ Request 0:                                │
    │                                           │
    │ Physical Blocks: 0  1  2  3  4  5  6  7   │
    │                  │  │  │  │  │  │  │  │   │
    │                  v  v  v  v  v  v  v  v   │
    │ Logical Blocks:  6  2  5  0  4  1  7  3   │
    └───────────────────────────────────────────┘

    If multiple logical blocks map to the same physical block,
    this function returns the latest (maximum) logical block index.

    If a physical block is not mapped to by any logical block,
    its value in the result will be -1.

    IMPORTANT: Garbage Value Protection
    ────────────────────────────────────
    The block_table tensor may contain garbage values in unused positions
    (beyond the actual sequence length). For example, if a sequence only
    needs 3 blocks but the table has space for 8:

        block_table[0] = [10, 25, 7, 999, 1234, 888, ...]
                                    ^^^^^^^^^^^^^^^^^^^^
                                    garbage values

    These garbage values can cause issues because:
    1. They may map to valid physical blocks by coincidence
    2. The scatter_ operation will assign them logical indices
    3. Later attention computations may incorrectly access these blocks

    To prevent this, we use seq_lens and block_size to mask out unused
    entries, ensuring only valid block references are processed.

    IMPORTANT: Reused physical blocks (sliding-window / hybrid attention)
    ────────────────────────────────────────────────────────────────────
    For some attention types, physical cache blocks can be reused over time.
    This can cause the same physical block id to appear multiple times in a row
    of `block_table` at different logical block indices. In that case, only the
    latest logical block index corresponds to the current contents of that
    physical block. Therefore, the inverse mapping must pick the maximum logical
    block index for each physical block id.

    Args:
        block_table: Tensor of shape [max_reqs, max_num_blocks]
            mapping logical blocks to physical locations. May contain
            garbage values in unused positions.
        seq_lens: Tensor of sequence lengths for each request. Used to
            determine how many blocks are actually needed per sequence.
        block_size: Size of each block in tokens. Used with seq_lens to
            compute the number of valid blocks per sequence.
        total_blocks: Total number of physical blocks available

    Returns:
        A tensor of shape [max_reqs, total_blocks] where each entry
        physical_to_logical[req_id, physical_block] contains the logical
        block index for that physical block, or -1 if unused.
    r"   )r%   r$   r$   Nr   amaxreduce)r5   r$   r&   fulllongr   r(   wherescatter_reduce_toint64)rw   rx   rX   ry   max_reqsmax_num_blocksr$   physical_to_logicalnum_blocks_per_seqmaskvalid_block_tablevalid_logical_indicess               r,   physical_to_logical_mappingr      s,   b  +0HnF*	< "EJv  
 (,Hj'A'A^F333D!!!G<
QQQW
%	& 	
 D+q99!Kel>&999$'BA  ''
  --/DV (    !#1r.   r"   )r1   ignored_valpad_valMr   r   c                l   d|cxk    r|k    sn t          d          || j        z  }|                     |d          }|                                |j        d         z  |j        d         }}|                    ||          }| j        }	t          j        ||	          	                    ||          }
t          j
        ||dz   f||	          }|                    d||
d           ||k    |
|                    d|          k    z  }t          j        |                    t          j                  d          dz
  }t          j        ||          }t          j        |d	          \  }}|||f         |||||f         f<   |                    |j                                      d|          }|S )
u  
    - Keeps the first occurrence of each non-zero value while preserving order,
      then left-packs those uniques and fills the rest with `pad_val`.
    - Returns (packed, keep_mask) with the *same shape* as `x`.
    - Requires that all values be in the range [0, M]
    - Skips ignored_val

    Works on CPU or GPU, no Python loops, O(B·N) time / O(B·M) memory.

    Example:
    x =[3, 1, 0, 1, 2], M=3, ignored_val=0 => [3, 1, 2, -1, -1]
    r"   z`pad_val` must lie in [-1, M]r{   r!   aminr}   r1   T)as_tuple)
ValueErrorr6   movedimnumelr5   reshaper$   r&   r(   expandr   r   gathercumsumr   r   	full_likenonzero)r/   r   r1   r   r   x_permBNx_flatr$   idx	first_idxkeepdest_pospacked_flatrowssrc_colspackeds                     r,   unique_static_unsortedr      s   ( 'Q8999 ,CYYsBF<<>>V\"--v|B/?qA^^Aq!!FXF
,q
(
(
(
/
/1
5
5C 
Aq1u:q888IaV<<< k!cY-=-=a-H-H&HID |DGGEJ//Q777!;H/&'22K]4$777ND(28x2HKhtX~../   ..66r3??FMr.   bhq_idxkv_idxc                     ||k    S rU   rH   )r   r   r   r   s       r,   causal_mask_modr     s     F?r.   c                   (   e Zd ZU eed<   eed<   eed<   ej        ed<   eed<   ej        ed<   ej        ed<   ej        ed<   eed	<   eed
<   ej        dz  ed<   ej        dz  ed<   ej        dz  ed<   eed<   eed<   eed<   eed<   ej        ed<   ej        ed<   ej        ed<   dZeed<   dZ	dZ
edz  ed<   dZedz  ed<   eZeed<   dZej        dz  ed<   dZeed<   dZeed<   dZeed <   dZedz  ed!<   dZedz  ed"<   dZeeeeeef                  f         dz  ed#<   ed$             Zd%ej        d&ej        d'ej        d(eej        ej        ej        f         fd)Zd(efd*Zd(efd+Zd(efd,Z d(efd-Z!d. Z"d(edz  fd/Z#d(efd0Z$d(efd1Z%d2 Z&dS )3FlexAttentionMetadatacausalnum_actual_tokensmax_query_lenquery_start_locmax_seq_lenrx   rw   slot_mappinguse_cascadecommon_prefix_lenNcu_prefix_query_lensprefix_kv_lenssuffix_kv_lenstotal_cache_tokensrX   max_possible_sequence_lengthnum_reqsr   decode_offsetr   r   num_input_tokens
block_mask	score_modlogical_mask_moddoc_idsTdirect_buildr   q_block_sizekv_block_sizetransformed_score_modsliding_windowmm_prefix_rangec                     t          j        t          | j        | j                  | j        j        t           j                  S )Nr#   )r&   r(   r   r   rX   rw   r$   r   selfs    r,   logical_block_idsz'FlexAttentionMetadata.logical_block_idsO  s;    |!4?33#**
 
 
 	
r.   request_lookupr   physical_kv_idxr   c                    ||         }|| j         z  }|| j         z  }| j        ||f         }|| j         z  |z   }|dk    }	|| j        |         k     }
|dk    }|	|
z  |z  }|| j        |         z
  }|| j        |         z   }|||fS )zConvert physical indices to logical indices for both query and kv.

        NB is_within_lower_bound: do sequences start on block_boundaries?

        Returns:
            tuple of (is_valid, logical_q_idx, logical_kv_idx)
        r   )rX   r   rx   r   r   )r   r   r   r   q_reqphysical_kv_blockphysical_kv_offsetlogical_block_idxlogical_kv_idx
live_blockwithin_upper_boundwithin_lower_boundis_validlocal_q_idxlogical_q_idxs                  r,   _convert_physical_to_logicalz2FlexAttentionMetadata._convert_physical_to_logicalW  s     u% ,t>,t> 4U<M5MN*T_<?QQ '!+
+dmE.BB+q0 225GG d2599#d&8&??66r.   c           
            j         J dt          j        dt          j        dt          j        dt          j        dt          j        f
 fd}|S )a  Creates the mask_mod function for FlexAttention.

        This function creates the combined mask mod function that handles:
            1. The paged attention block mapping
            2. The mapping from packed query sequences to logical query entries

        It also by defaults adds the decoding offset to the query indices.
        With this info we create the "logical" indices that are passed to
        mask_mod functions. This allows mask mod functions to be agnostic to
        layout of the query and key/value tensors.
        Nr   r   r   r   r   c           	                               j        ||          \  }}}t          j        |                    | |||          d          S rc   )r   r   r&   r   r   )r   r   r   r   r   r   r   r   s          r,   final_mask_modzAFlexAttentionMetadata.get_causal_mask_mod.<locals>.final_mask_mod  sZ     11$,WW 6X}n ;%%aM>JJ  r.   r   r&   Tensor)r   r   s   ` r,   get_causal_mask_modz)FlexAttentionMetadata.get_causal_mask_mody  sq     |'''	|	|	 <	 #\		
 \	 	 	 	 	 	  r.   c           
          t          | j                  dt          j        dt          j        dt          j        dt          j        dt          j        f
fd}|S )zCreates the encoder mask_mod function for FlexAttention.

        Since the encoder bidirectional attention doesn't run with
        KV cache, this function creates a mask based on the
        packed query sequences.
        r   r   r   r   r   c                 (    |         |         k    S rU   rH   )r   r   r   r   r   s       r,   r   zHFlexAttentionMetadata.get_bidirectional_mask_mod.<locals>.final_mask_mod  s     "%(N6,BBBr.   )r-   r   r&   r   )r   r   r   s     @r,   get_bidirectional_mask_modz0FlexAttentionMetadata.get_bidirectional_mask_mod  s~     4D4HII	C|	C|	C <	C L		C
 \	C 	C 	C 	C 	C 	C r.   c           
      :     j         t          d          dt          j        dt          j        dt          j        dt          j        f fddt          j        dt          j        dt          j        dt          j        d	t          j        f
 fd
} j        r|nS )zCreates the sliding window mask_mod function for FlexAttention.

        Note that the sliding window mask here is bidirectional, we need
        to mask it with the bidirectional/causal mask for encoder/decoder.
        Nz7sliding_window must be set for sliding window attentionr   r   r   r   c                 D    t          j        ||z
            j        k     S rU   )r&   absr   )r   r   r   r   r   s       r,   sliding_window_mask_modzRFlexAttentionMetadata.get_sliding_window_mask_mod.<locals>.sliding_window_mask_mod  s!     9UV^,,t/BBBr.   r   r   c           	                               j        ||          \  }}}t          j        | | |||          d          S rc   r   r   r&   r   )	r   r   r   r   r   r   r   r   r   s	          r,   r   zIFlexAttentionMetadata.get_sliding_window_mask_mod.<locals>.final_mask_mod  sW     11$,WW 6X}n ;''1m^LL  r.   )r   r   r&   r   r   )r   r   r   s   ` @r,   get_sliding_window_mask_modz1FlexAttentionMetadata.get_sliding_window_mask_mod  s     &VWWW	C|	C %	C5:\	CKP<	C 	C 	C 	C 	C 	C
	|	|	 <	 #\		
 \	 	 	 	 	 	 	 "&I~~2IIr.   c           
      8     j         J  j         dt          j        dt          j        dt          j        dt          j        dt          j        f
 fddt          j        dt          j        dt          j        dt          j        d	t          j        f
 fd
}|S )z:Creates the prefix LM mask_mod function for FlexAttention.Nr   r   cu_q_idxr   r   c                     t          j        |t           j                  }j        pi                                 D ]>\  }}|         |k    }|D ]*\  }	}
||	k    ||
k    z  }||	k    ||
k    z  }|||z  |z  z  }+?|S )N)r%   )r&   
zeros_likerl   r   items)r   r   r   r   r   r   reqdoc_range_lstreq_maskstartend
doc_mask_qdoc_mask_kvr   r   s                r,   prefix_lm_mask_modzHFlexAttentionMetadata.get_prefix_lm_mask_mod.<locals>.prefix_lm_mask_mod  s     #E<<<D'+';'Ar&H&H&J&J H H"])(3s:"/ H HJE3"'5.Uc\!BJ#)U?v}"EK8j#8;#FGDDH Kr.   r   r   c           
                               j        ||          \  }}}t          j        | | ||||          d          S rc   r   )	r   r   r   r   r   r   r   r   r   s	          r,   r   zDFlexAttentionMetadata.get_prefix_lm_mask_mod.<locals>.final_mask_mod  sY     11$,WW 6X}n ;""1a~NN  r.   r   )r   r   r   r   s   ` @@r,   get_prefix_lm_mask_modz,FlexAttentionMetadata.get_prefix_lm_mask_mod  s     |'''	|	|	 l	 <		
 L	 	 	 	 	 	 	 	|	|	 <	 #\		
 \	 	 	 	 	 	 	 r.   c                    | j         r|                                 }n|                                 }| j        $|                                 }t          ||          }| j        r$|                                 }t          ||          }|S rU   )	r   r   r   r   r   r	   r   r   r   )r   mask_modr   r   s       r,   get_mask_modz"FlexAttentionMetadata.get_mask_mod  s     ; 	9//11HH6688H *&*&F&F&H&H# +BCCH 	>!%!<!<!>!>*<==Hr.   c                       j         dS t           j                   j         dt          j        dt          j        dt          j        dt          j        dt          j        dt          j        f fd}|S )	zCreates the transformed score_mod function for FlexAttention.

        This function wraps the user's score_mod to handle physical-to-logical
        index conversion, similar to how get_mask_mod works for mask functions.
        Nscorer   r   r   r   r   c                     	                     ||          \  }}}t          j        | 
| |||||          t          d                     S )N)
physical_qinf)r   r&   r   float)r  r   r   r   r   r   r   r   r   r   user_score_mods           r,   r   zNFlexAttentionMetadata.get_transformed_score_mod.<locals>.transformed_score_mod  sq     11"E?  6X}n ;1a5   u  r.   )r   r-   r   r&   r   )r   r   r   r  s   ` @@r,   get_transformed_score_modz/FlexAttentionMetadata.get_transformed_score_mod  s     >!4 4D4HII	<	|	 |	 <		
 #\	 \	 	 	 	 	 	 	 	* %$r.   c                    | j         | j        z  }|dk    r t          d| j         d| j          d          | j        | j        dt          | j        | j                  f         }| j        r| j        r|j	        }| j        J t          j        | j        j        d         |t          j                  }|| j        | j                 z
  | j        | j                 z   }t          j        || j        dz
  z
  d          }|| j        z  }| j        |dddf         k    }|                    | d           t'          || j        d	          }	|	                    |	j        d         | j        z  d
          }	|	|z  }	t-          |	                                | j                                      t          j                  }
|
dk                        d
                              t          j                  }| j        | j        f|d         |
d         dd| j        | j         f| j        d}t=          d          rd|d<   t?          j         di |S )a  Direct block mask construction for standard causal attention.

        This method constructs the block mask directly using
        BlockMask.from_kv_blocks which is much more efficient than the
        generic create_block_mask approach.

        The direct path works as follows:
        1. For each query token, fetch blocks from block_table using max_seq_len
           and exclude out of sliding window blocks if needed.
           (this fetches more blocks than needed for shorter sequences)
        2. Group query tokens into chunks of q_block_size
        3. For each group, deduplicate the blocks using unique_static_unsorted
        4. Create BlockMask using the deduplicated block indices

        Over-estimation occurs when a group of q_block_size tokens contains
        multiple sequence IDs (doc_ids). In this case, we fetch ALL blocks for
        each sequence represented in the group, even though individual query
        tokens may only need a subset of those blocks based on causal masking
        and their position.

        r!   z7FlexAttention currently requires the cache block size (z$) to be equal to the kv_block_size (z+). Please check your model's configuration.Nr   r#   )min)r0   r1   r"   )r   r   )NN)seq_lengthskv_num_blocks
kv_indicesfull_kv_num_blocksfull_kv_indices
BLOCK_SIZEr   
2.9.0.dev0Fcompute_q_blocksrH   )!r   rX   r   rw   r   r   r   r   r   r$   r&   r(   r5   r   r   r   clampr   masked_fill_r>   r   r   r   rW   r   r*   sumr   r   r   r   r   from_kv_blocks)r   page_to_block_ratio
used_pagesr$   token_indicesr   
min_kv_idxmin_block_idxsliding_maskused_pages_paddedr  r  block_mask_kwargss                r,   _build_block_mask_directz.FlexAttentionMetadata._build_block_mask_direct+  s   , #0DOC!##"O" "&" " "   %LCD!14?CCCC

  	64; 	6&F<+++!L"1%fEJ  M &t|45$T\23 
 ]d6IA6M%NTUVVVJ&$/9M1]111d75KKL##\M1555+!2
 
 
 .55#A&$*;;R
 
 .1DD+##%%$/
 
 

"U[// 	 $q--"-5588EE 2D4KL*:6$Z0"&#,d.@A
 
 #<00 	:4901'<<*;<<<r.   c           
          |                                  }| j        r| j        n| j        }t	          |d d | j        || j        j        | j        | j        f          S )N)r$   r  )	r   r   r   r   create_block_mask_compiledrw   r$   r   r   )r   r   kv_lens      r,   build_block_maskz&FlexAttentionMetadata.build_block_masky  sj    $$&&,0KS((T=S)"#*)4+=>
 
 
 	
r.   c                    | j         du s
J d            | j        dk    s
J d            | j        
J d            | j        
J d            | j        
J d            t          | j                  | _        | j        | j	        z  | _
        |                                 | _        |                                 | _        | j        r"| j        r|                                 | _        d S |                                 | _        d S )NFzNot implemented yet.r   )r   r   r   r   r   r-   r   r   r   rX   rW   r   r   r  r   r   r   r  r   r#  r   s    r,   __post_init__z#FlexAttentionMetadata.__post_init__  s   5(((*@(((%***,B***(002H000"**,B***"**,B***1$2FGG1T_D))++%)%C%C%E%E" 	6 	6";;==DOOO"3355DOOOr.   )'ri   rj   rk   rl   rm   ru   r&   r   r   rW   r   r   r   r   r   r   r   r   r   r   r   r   r   r   dictrp   rv   r   r   r   r   r   r   r   r   r  r  r#  r%  rH   r.   r,   r   r   %  sw        LLL\!!!l,,----L4''''L4'''' OOO"%%%%MMM%%%<$$$ c J#'J	D '''-1I#d*111,;);;;#'GU\D '''L$L#M39=/$6===!%NC$J%%%?COT#tE#s(O445<CCC
 
 _
 7 7 | 7 	 7
 
u|U\5<7	8 7  7  7  7D%8    @,?    (J-@ J J J J@%(; % % % %N  &"%+?$+F "% "% "% "%HL=) L= L= L= L=\
) 
 
 
 
6 6 6 6 6r.   r   c            	       n     e Zd Zdedee         dedej        f fdZ		 dde
ded	ed
efdZd
efdZ xZS )r_   kv_cache_speclayer_namesvllm_configr$   c                    t                                          ||||           |j        | _        |j        | _        |j        | _        | j                            | j                  | _        | j                            | j                  | _        | j        	                                | _
        |j        | _        || _        t          d          }|| _        |rdnd| _        |r| j        nd| _        d S )Nr  r      )super__init__model_configparallel_configcache_configget_num_attention_headsnum_heads_qget_num_kv_headsnum_heads_kvget_head_sizeheaddimrX   r(  r   r   r   r   )r   r(  r)  r*  r$   supports_small_blocks	__class__s         r,   r.  z%FlexAttentionMetadataBuilder.__init__  s     	[&III'4*:'4,DD 
 
 !->>t?STT(6688'2* 7 E E"7'<!E#5J"S$//PSr.   Fr   common_attn_metadata
fast_buildr   c                 d   |j         }|j        }|j        }|j        }|j        }|j        }	|j        }
|j        }t          |	| j	                  }|dk    }d }d }d }|rt          d          | j        j	        }| j        j        }| j        j        }|
J d            ||z  }t!          |
|	||          }|                                }t%          di d|j        d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|d|d|d|d|d| j        o|j        d| j        d| j        }|S )Nr   zNot yet my friendz/FlexAttention requires num_gpu_blocks to be setr   r   r   r   r   rx   rw   r   r   r   r   r   r   rX   r   r   r   r   r   r   r   r   r   rH   )r   r   r   r   r   rx   block_table_tensorr   r   rX   NotImplementedErrorr(  r/  max_model_lenr1  num_gpu_blocksr   compute_num_computed_tokensr   r   r   r   r   )r   r   r:  r;  r   r   r   r   r   rx   r=  r   r   r   r   r   r   rX   max_possible_seq_lenr@  r   inverse_block_tableoffset_tensorouts                           r,   buildz"FlexAttentionMetadataBuilder.build  sE    (00B,:*6.>'01D+8!(DO<<'!+# 	;%&9:::'2
#0>*9))= *)) ,j89*n
 
 -HHJJ# 
 
 
'..
//
 (-
 ,O	

 $
 X
 +*
 &
 $
 0/
 "6!5
 *>
 *>
 "z
 *>)=
  X!
" !4 3#
$  21%
& (-'
(  21)
0 +K0D0K1
2 **3
4 ,,5
8 
r.   c                     dS rc   rH   )r   rd   re   s      r,   rf   z2FlexAttentionMetadataBuilder.use_cascade_attention  s    ur.   )F)ri   rj   rk   r   rp   rr   r   r&   r$   r.  ru   r   rl   r   rF  rf   __classcell__)r9  s   @r,   r_   r_     s        T$T #YT  	T
 T T T T T T: !	C CC 6C 	C
 
C C C CJ        r.   r_   c                      e Zd ZU edz  ed<   ej        dz  ed<   edz  ed<   dZe	ee
eeef                  f         dz  ed<   dej        dfdededed	ede
e         dz  dedz  d
ededz  dededz  ddfdZedej        dej        fd            Z	 	 	 ddej        j        dej        dej        dej        dej        dedej        dz  dej        dz  dej        dz  dej        fdZdS )rS   Nr   alibi_slopeslogits_soft_capr   	num_headsrZ   scalerY   kv_cache_dtyperJ   kv_sharing_target_layer_namer   c                     || _         || _        t          |          | _        || _        |	| _        |	t          j        t          j        fvrt          d|	 d          |t          d          d | _
        || _        || _        || _        | j        t          d          | j         | j        z  dk    sJ | j         | j        z  | _        |
t          d          t          | j                  rt          d          d S )NzFlexAttention does not support z
 attentionz0FlexAttention does not support alibi slopes yet.z3FlexAttention does not support logits soft cap yet.r   z.FlexAttention does not support kv sharing yet.z6FlexAttention does not support quantized kv-cache. Yet)rL  rZ   r  rM  rY   rJ   r   rM   rL   r>  rJ  r   rN  rK  num_queries_per_kvr   )r   rL  rZ   rM  rY   rJ  r   rN  rK  rJ   rO  re   s               r,   r.  zFlexAttentionImpl.__init__  s7    #"5\\
("]79NOOO%G)GGG   #%B   !%D,,.+%E   ~ 11Q6666"&.D4E"E'3%&VWWW !455 	%H  	 	r.   tensorc                 Z    | j         dk    r| S | j         dk    sJ | dddddddf         S )zView a 3d tensor as 4D.      N)r6   )rR  s    r,   
view_as_4dzFlexAttentionImpl.view_as_4d6  sH     ;!M{adAAAqqq!!!m$$r.   layerquerykeyr4   kv_cacheattn_metadataoutputoutput_scaleoutput_block_scalec
           
      |    |
J d            ||	t          d           j         j        k    }
||                    d          S |j        }d}|j         j        k    r. j        |_        |j        r|                                |_        d} j	        t          |dd          k    r'|j	         _	        |                                |_        d}|rA|j        r!|j        r|                                |_        n|                                |_        |j        s j        t           j        k    sJ t%           fd|||f          \  }}}|ddddd|ddf         }|                    d	          |k    s|                    d	          |k    r(|ddddd|ddf         }|ddddd|ddf         }nމ j        t           j        k    sJ |                    d          \  }}t,          j        j                            |||||j         j        |j        |j                   |                    d
 j         j                  }|                    d
 j         j                  }t%           fd|||f          \  }}}|ddddd|ddf         }|j        J |j        j         \  }}tC          ||||j                  }tE          ||||j#        |j         j$        |
|          }|%                    dddd          &                    d          }|d|ddddf         '                    |           |S )a  Forward pass with FLexAttention.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [2, num_blocks, block_size, num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NzOutput tensor must be provided.zDfused output quantization is not yet supported for FlexAttentionImplr   FTr   c                 Z                         |                               dddd          S Nr   r]   r!   rU  rV  permuter/   r   s    r,   <lambda>z+FlexAttentionImpl.forward.<locals>.<lambda>}  '    $//!,,44Q1a@@ r.   r"   c                 Z                         |                               dddd          S ra  rb  rd  s    r,   re  z+FlexAttentionImpl.forward.<locals>.<lambda>  rf  r.   )
enable_gqakernel_optionsr]   r!   rU  )(r>  rY   rL  fill_r   r   r   r   r   r   getattrr   r  r   r#  rJ   r   rM   mapsizerL   unbindr&   ops_C_cache_opsreshape_and_cache_flashr   rN  _k_scale_v_scaleviewrZ   r  get_kernel_optionsflex_attention_compiledr   rM  rc  squeezecopy_)r   rW  rX  rY  r4   rZ  r[  r\  r]  r^  ri  r   needs_rebuild_block_mask
key_tensorvalue_tensor	key_cachevalue_cacheblock_mblock_nrj  rE  s   `                    r,   forwardzFlexAttentionImpl.forward>  s   0 !!#D!!!#'9'E%V   &$.8
 <<??" *;#( '4+>>>+/+>M() F)6)C)C)E)E&'+$7=:KT#R#RRR#0#@D %2%?%?%A%AM"'+$# 	L) Lm.B L+8+Q+Q+S+S((+8+I+I+K+K(# )	7>]%?????.1@@@@U#/ /+E:|
 !!!QQQ 2!2 2AAA56E##&777!!"%%(999
 (111.@/@.@!!!(CD
+AAAqqq2D3D2Daaa,GH >]%:::::%-__Q%7%7"I{I"::*#	 	 	 "r4+<dnMMI%**2t/@$.QQK.1@@@@	;// /+E:|
 !!!QQQ 2!2 2AAA56E
 '333(3>+7G]%?
 
 &/$J!)	
 	
 	
 kk!Q1%%--a00!!!111aaa'(..s333r.   )NNN)ri   rj   rk   ru   rm   r&   r   r  r   r&  rp   rv   r   rL   rr   r.  rq   rV  nnModuler   r  rH   r.   r,   rS   rS     s
        $J,%%%%T\!!!?COT#tE#s(O445<CCC )-#0#8372 22 2 	2
 2 5kD(2 d
2 2 2 !2 '*Dj2 
2 2 2 2h %5< %EL % % % \% '+,026} }x} |} \	}
 |} ,} -} t#} lT)} "L4/} 
} } } } } }r.   rS   use_direct_buildc                    ddi}dt           dt           dt           fd}t                      rd|d<   d|d	<   d
|d<   |S |r||d<   ||d	<   |S | j        t          j        k    rdnd}d} |||          } |||          }	t          j                                        rt          j                                        }
t          |
d          r|
j	        }n%t          j                    rd}nt          d          |dk     r: |t          d|dz            |          } |t          d|	dz            |          }	t          ||          }t          |	|          }	||d<   |	|d	<   |S )NFORCE_USE_FLEX_ATTENTIONT	candidaterX   r   c                     |dk    r| S t          | |          } | dk    r|S || z  dk    r| S t          j        | |          } | dk    r|S | S )z8Pick a kernel block size that divides the logical block.r   r!   )r
  mathgcd)r  rX   s     r,   ensure_divisiblez,get_kernel_options.<locals>.ensure_divisible  sk    ??	:..	>>	!Q&&HY
33	>>r.   r   BLOCK_MBLOCK_NFIS_DIVISIBLE    @   shared_memory_per_block_optini   z8Unable to determine shared memory size on this hardware.i @ r!   r]   )ru   r   r%   r&   ro   cudais_availableget_device_propertieshasattrr  r   is_rocmRuntimeErrormax)rX  r  r  r  rj  r  preferred_blockblock_lower_boundblock_m_candidateblock_n_candidatedevice_propsmax_shared_memorys               r,   rv  rv    s    	#D-NC S S        $&y!$&y!).~& %6$+y!$+y! %u} < <""",,_gFF,,_gFF:""$$ 	 :;;==L |%DEE $0$N!!!)++ $)!!"N   !:--$4$4,122G% %! %5$4,122G% %!   13DEE 13DEE$5y!$5y!r.   )D__doc__r  dataclassesr   	functoolsr   typingr   r&   torch._dynamo.decoratorstorch.nn.functionalr  
functionalr9   !torch.nn.attention.flex_attentionr   r   r   r	   r
   r   r   vllm.configr   vllm.config.cacher   vllm.loggerr   *vllm.model_executor.layers.batch_invariantr   vllm.platformsr   vllm.utils.math_utilsr   vllm.utils.torch_utilsr   vllm.v1.attention.backendr   r   r   r   r   r   vllm.v1.kv_cache_interfacer   ri   logger_dynamoconfigrecompile_limitcompiler!  rw  r   r-   ru   r>   r@   r   r   r   r   r_   rS   rl   r&  rr   rv  rH   r.   r,   <module>r     s~   * )  ! ! ! ! ! ! % % % % % %                                      # " " " " " ( ( ( ( ( ( # # # # # #      , + + + + + & & & & & & : : : : : :                5 4 4 4 4 4	X		')  $*U],=    (%-$GGG      8u| 8s 8 8 8 8 8"/ / / / /+ / / /fiili i 	i
 \i i i i` 1 1 1|1 1 
	1
 1 1 \1 1 1 1h|-2\CH<    p6 p6 p6 p6 p6 p6 p6 p6f` ` ` ` `#;<Q#R ` ` `F       DB/3B	#sTz/B B B B B Br.   