
    .`iȺ                     :   d Z ddlmZ ddlmZ ddlZddlmZ ddlm	Z	 ddl
mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZmZ ddlmZ ddlm Z  ddl!m"Z" dZ#dZ$ ej%                    rddl&Z&ddl'm(Z(m)Z) d Z*d Z+e)j,        de(j-        de(j-        de(j-        de(j-        fd            Z.dej/        dej/        dej/        dej/        dej/        dej/        d ej/        d!ej/        d"ej/        d#ej/        d$e0d%e1d&e2fd'Z3e)j,        de(j-        d(e(j-        d)e(j-        fd*            Z4dej/        dej/        dej/        dej/        d+ej/        d,e1dej/        d ej/        fd-Z5 ee6          Z7e G d. d/                      Z8e G d0 d1                      Z9e G d2 d3                      Z:e G d4 d5                      Z;e G d6 d7                      Z<e G d8 d9                      Z= G d: d;ee=                   Z> G d< d=e          Z? G d> d?e          Z@dS )@z)Attention layer with AiterFlashAttention.    )	dataclass)ClassVarN)rocm_aiter_ops)	Attention)
VllmConfigget_layers_from_vllm_config)init_logger)current_platform)cdiv)get_cu_count)AttentionBackendAttentionCGSupportAttentionImplAttentionMetadataBuilderAttentionTypeCommonAttentionMetadata
MultipleOf)"split_decodes_prefills_and_extends)merge_attn_states)AttentionSpec   i   )tltritonc                 p    t          d|                                 z  t          j        |                    S )Ni   )minelement_sizer   next_power_of_2)xhead_dims     |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/rocm_aiter_fa.py
block_sizer!   '   s,    5ANN,,,f.DX.N.NOOO    c                 :    t          | t                                S N)r   r   )total_tokenss    r    num_programsr&   *   s    <000r"   DEQUANT	PAGE_SIZECACHE_FORMAT
BLOCK_SIZEc                    t          j        d          }t          j        d          }t          j        d|          }|||z  |
z  z   ||z  z   }|||z  |
z  z   ||z  z   }t          j        ||z             }t          j        ||z             }t          j        ||z             }||z
  |z   }||z  }t          j        |||z  z   |z                                 t           j                  }||z  }|dk    r<| ||
z  |z  |z  z   ||
z  |z  z   ||z  z   }|||
z  |z  |z  z   ||
z  |z  z   ||z  z   }t          j        ||z             } t          j        ||z             }!|rt          j        |          }"t          j        |	          }#| j        }$|!j        }%|                     t           j                  |"z                      |$          } |!                    t           j                  |#z                      |%          }!t          j        ||z   |            t          j        ||z   |!           d S |dk    r| ||
z  |z  |z  z   ||z  |z  z   ||z  z   }|||
z  |z  |z  z   ||z  |z  z   ||z  |z  |z  z   ||z  z   }||z  |z  |z  ||z  z   }&||z  }'t          j        ||&z             } t          j        ||'z             }!|rHd}"d}#|                     t           j                  |"z  } |!                    t           j                  |#z  }!t          j        ||z   |            t          j        ||z   |!           d S d S )Nr      NHDSHUFFLE      ?)	r   
program_idarangeloadtoint64dtypefloat32store)(key_cache_ptrvalue_cache_ptrkey_ptr	value_ptrblock_table_ptrcu_seqlens_kv_ptrtoken_to_batch_ptrseq_start_ptrk_scale_ptrv_scale_ptr	num_heads	head_sizer   max_block_numr'   r(   r)   r*   token_idhead_idcol_offsetskey_ptr_offsetvalue_ptr_offset	batch_idxbatch_starttoken_startbatch_offsetblock_offsetblock_idslot_idkey_cache_ptr_offsetvalue_cache_ptr_offsetk_regv_regk_scalev_scalek_dtypev_dtypek_reg_offsetv_reg_offsets(                                           r    cp_mha_gather_cache_kernelr[   -   s   * =##-""i:.. h*Y6699LL 	 9,y887Y;NN 	 G.9::	gmi788g/);<<+-;#y07mi77,F
 

"RX,, 	 *5  
 Y&2Y>?I%	12 I%& !  Y&2Y>?I%	12 I%& # G0;>??EG2[@AAE E'+..'+..++"*--7;;GDD"*--7;;GDDH^k15999H%3U;;;;;Y&&
 Y&2Y>?I%	12 A+ !  Y&2Y>?I%	12 a<9,q01 A+	 # '!+i7!;kAoML&?LG0<?@@EG2\ABBE 7,,w6,,w6H^k15999H%3U;;;;;7 '&r"   	key_cachevalue_cachekeyvalueblock_tablesk_scalesv_scalescu_seqlens_kvtoken_to_batch
seq_startsdequantkv_cache_layoutr%   c                 X   |dv s
J d            |j         d         }d|                                 z  }|| j         d         k    s
J d            | j         d         }| j         d         fd}t          |         | |||||||	|||||                    d          |
|||	           d S )
N)r-   r.   z)kv_cache_layout only support NHD, SHUFFLE         zaWe assume your kv cache layout is [num_blocks, page_size, num_heads, head_dim], but got otherwiser,   c                     fS r$    )metarB   r%   s    r    <lambda>z%cp_mha_gather_cache.<locals>.<lambda>   s    \95 r"   )r'   r(   r)   r*   )shaper   r[   size)r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   r%   r   r   	page_sizegridrB   s               `    @r    cp_mha_gather_cachert      s    "44447 544 9Q<)((*** 9?1----A .-- OA&	OA&	55555"4(a  (%	
 	
 	
 	
 	
 	
r"   QUANTIS_FNUZc                 f   t          j        d          }t          j        d          }t          j        d|          }||z  ||z  z   }||	z  ||z  z   }t          j        ||z             }|dk     rd S ||
z  }||
z  }||z  |z  |
z  ||z  |
z  z   }|||z  |
z  |z  z   ||z  z   ||z  z   }|||z  |z  |z  z   ||z  z   ||z  z   }t          j        | |z   |z             }t          j        ||z   |z             }|rd}d}|j        j        }|j        j        } |                    t           j                  |z                      |          }|                    t           j                  |z                      |           }t          j        ||z   |           t          j        ||z   |           d S )Nr   r,   r/   )	r   r0   r1   r2   type
element_tyr3   r6   r7   )!r:   r;   r8   r9   slot_mapping_ptrr@   rA   r   	k_stride0	v_stride0r!   rC   num_kv_headsr*   ru   rv   tidrF   offsetsrc_offset_ksrc_offset_vrP   rO   rN   
dst_offsetdst_k_shuffle_offsetdst_v_shuffle_offsetk_valv_valrU   rV   rW   rX   s!                                    r     reshape_and_cache_shuffle_kernelr      s   & mA-""1j))Y9)<<Y9)<<'*S011Q;;Fj(+|#i/*<	!J./ 	
 1z1A55q8HH6TU:U 	 a)+a/0qj Q 	 ,.788	L069:: 	AGG#(3G%*5GXXbj))G377@@EXXbj))G377@@E
!55u===
#77?????r"   slot_mappingkv_cache_dtypec                 n   |j         d         }| j         \  }	}
}|j         \  }}}	}	d|                                z  }t          j        ||
||z  ||g|j        d          }t          j        ||
||z  ||g|j        d          }|                    |          }|                    |          }d}|                    d          rd}||
f}t          |         | ||||||||                     d          |                    d          |||
||t          j
                    t          j        k               d S )	Nr   rj   rn   r5   deviceFfp8T)r*   ru   rv   )rp   r   torchemptyr5   view_as
startswithr   strider
   	fp8_dtypefloat8_e4m3fnuz)r^   r_   r\   r]   r   r   ra   rb   
num_tokens_r}   rC   
num_blocksr!   r   k_cache_templatev_cache_templatenew_key_cachenew_value_cacheru   rs   s                        r     reshape_and_cache_shuffle_tritonr      s~    "'*
%(Y"<'0$
J1)((*** ;yA~z1E/
 
 

 !;zQ	1E#
 
 

 "))*:;;%--.>??$$U++ 	E
 	).JJqMMLLOO $.00E4II!	
 	
 	
 	
 	
 	
r"   c                   B    e Zd ZU eed<   eed<   eed<   ej        ed<   dS )!AiterFlashAttentionDecodeMetadatamax_query_lenmin_query_lenmax_seq_lenquery_start_locN__name__
__module____qualname__int__annotations__r   Tensorrm   r"   r    r   r   1  D         \!!!!!r"   r   c                   B    e Zd ZU eed<   eed<   eed<   ej        ed<   dS )"AiterFlashAttentionPrefillMetadatar   r   r   r   Nr   rm   r"   r    r   r   9  r   r"   r   c                       e Zd ZU ej        ed<   ej        ed<   ej        ed<   ej        ed<   eed<   eed<   ej        ed<   dS )	AiterChunkSlidingWindowMetadataswa_seqlensswa_cu_seqlensswa_seq_startsswa_token_to_batchswa_max_seqlensswa_total_tokensswa_workspaceN)r   r   r   r   r   r   r   rm   r"   r    r   r   A  sp         L   L   $$$<r"   r   c                       e Zd ZU ej        ed<   ej        ed<   ej        ed<   ej        ed<   ee         ed<   ee         ed<   ej        ed<   eed<   ee         ed	<   ed
z  ed<   d
S )AiterChunkContextMetadata	workspacecu_seq_lens_chunkchunk_startsrd   seq_totmax_seq_lensseq_lens
num_chunkstotal_token_per_batchNswa_metadata)	r   r   r   r   r   r   listr   r   rm   r"   r    r   r   L  s         ||###,L   #Ys)lOOO9$$$1D888888r"   r   c                   L    e Zd ZU eed<   eed<   eed<   ej        ed<   eed<   dS )'AiterFlashAttentionChunkPrefillMetadatar   r   r   r   chunk_context_metadataN)r   r   r   r   r   r   r   r   rm   r"   r    r   r   Z  sP         \!!!555555r"   r   c                   f   e Zd ZU eed<   eed<   eed<   ej        ed<   eed<   ej        ed<   ej        ed<   ej        ed<   eed	<   eed
<   eed<   eed<   eed<   eed<   edz  ed<   edz  ed<   e	dz  ed<   e
ed<   eed<   eed<   eeej        f         dz  ed<   eeej        f         dz  ed<   dS )AiterFlashAttentionMetadatanum_actual_tokensnum_actual_kv_tokensr   r   r   r   r   block_tablenum_decodesnum_decode_tokensnum_prefillsnum_prefill_tokensnum_extendsnum_extend_tokensNdecode_metadataprefill_metadataextend_metadatause_cascadecommon_prefix_lenr%   rU   rV   )r   r   r   r   r   r   r   r   r   r   booldictstrrm   r"   r    r   r   c  s[         \!!!l, 6====84????<tCCCC  #u|#$t++++#u|#$t++++++r"   r   c            	            e Zd ZU ej        ZdZeed<   de	de
e         dedej        f fdZdefd	Z	 ddedededdfdZdefdZ xZS )"AiterFlashAttentionMetadataBuilderr,   reorder_batch_thresholdkv_cache_speclayer_namesvllm_configr   c                    t                                          ||||           |j        | _        |j        | _        |j        | _        | j                            | j                  | _        | j                            | j                  | _        | j        	                                | _
        |j        | _        d | _        d| _        t                      }t          | j        t"                    }|                                D ]=}t'          |j        t*                    sJ |                    |j        j                   >t1          |          dk    rM|                                }|$|d         dk    r| j        
J d            || _        t1          |          dk    Mt5          j        dt8          | j        | j
        g| j        j        |          | _        t5          j        dgt4          j         | j!                  | _"        d S )Nr   z@Aiter Flash ATTENTION can only support one valid sliding window!ri   r   r/   )#super__init__model_configparallel_configcache_configget_num_attention_headsnum_heads_qget_num_kv_headsnum_heads_kvget_head_sizeheaddimr!   aot_sliding_windowr%   setr   r   r   values
isinstanceimplAiterFlashAttentionImpladdsliding_windowlenpopr   r   _CP_TOKENS_PER_ITER_ROCMr5   extend_workspacetensorfloatr   scale)
selfr   r   r   r   sliding_window_configslayerslayersliding_window_config	__class__s
            r    r   z+AiterFlashAttentionMetadataBuilder.__init__  s    	[&III'4*:'4,DD 
 
 !->>t?STT(6688'2 ;?!">Aee,T-=yII]]__ 	B 	BEej*ABBBBB"&&uz'@AAAA())A--$:$>$>$@$@!$05J15MQS5S5S.66V 766 +@' ())A-- !&($*;T\J#)!
 !
 !

 \3%u{4;OOO


r"   common_attn_metadatac                     | j         j        | j        j        j        z  | _        |                     d|          }d| _        |S )Nr   )r   r   )r   max_model_lenr   scheduler_configmax_num_partial_prefillsr%   build)r   r   ress      r    build_for_cudagraph_capturez>AiterFlashAttentionMetadataBuilder.build_for_cudagraph_capture  sL     +/HI 	 jj1CWjXX
r"   Fr   
fast_buildreturnr   c                    t          || j                  }t          j                    r| j                                        dk    r| j        j        j        	                    d          rt          | j        t                    }d |D             d         }| j        j        j        |         j        d         j        }|d         }t!          j        || j        | j        gt           j        | j                  | _        |\  }	}
}}}}|j        }|j                                        }|dd          |d d         z
  }d }|	dk    rt3          |d |	                                                                         |d |	                                                                         |d |	                                                                         |j        d |	dz                      }d }|dk    r||	|
z   d          }|j        |	|
z   d          }t=          |                                                                |                                                                ||	|
z   d                                                                          ||d         z
            }d }|
dk    rt?          |	|	|
z             }||         }||         }||z
  }d }| j         t!          j!        ||| j         d         z   dz             }t!          j"        |
dz   t           j#        |j                  }t!          j$        |d|j%        |dd          	           t!          j&        d|
t           j#        |j                  }t!          j'        ||          }|d                                         }t!          j(        d
|| j        | j)        f| j        j*        j%        | j                  } ||z
  }!|                                                                }"|d                                         }#tW          |,                    | j        d          |,                    | j        d          |!,                    | j        d          |,                    | j        d          |"|#|           }tZ          |
z  }$t]          |                                                                |$          }%t!          j&        |%t           j#                  /                    d          0                    d|
          |$z  }&t!          j        |/                    d          |&|$z             }'|'|&z
  1                    d          }(t!          j"        |%|
dz   gt           j#        d          })t!          j$        |(d|)d d dd f         t           j#                   |)d d df                                                                         }*t!          j&        |*t           j#                  d d d d f         }+|+|)d d dd f         d d d d d f         k    },|,2                    d          },t!          j$        |,d          }-tg          | j4        |),                    | j        d          |&,                    | j        d          |(2                    d          5                                |(                    d          j6        5                                |(|-,                    | j        d          |%|)d d df         5                                |
  
        }.|j        |	|	|
z   dz            }|j        |         }/t!          j"        |
dz   t           j#        |/j                  }t!          j$        |/d|j%        |dd          	           to          |                                                                |                                                                ||                                                                         ||d         z
  |.          }t!          j2        |                                          }0|dk    }1tq          d+i d|j9        d|0d|j:        d|j        d|j;        d|j        d|j<        d|j=        d|	d|d|d |d!|
d"|d#|d$|d%|d&|1d'|d(| j>        d)| j        d*| j        }2|2S ),N)decode_thresholdr,   r   c                     g | ]}|S rm   rm   ).0ks     r    
<listcomp>z<AiterFlashAttentionMetadataBuilder.build.<locals>.<listcomp>  s    222a222r"   r   r   r   )r   r   r   r   )dimr5   outri   T)non_blocking)r   r   r   r   r   r   r   r5   )r   )r5   
pin_memory)r  r  r5   )r  )
r   r   r   r   r   r   rd   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r%   rU   rV   rm   )?r   r   r   is_shuffle_kv_cache_enabledr   numelr   r   cache_dtyper   r   r   compilation_configstatic_forward_contextkv_cacherp   r   onesr   r!   r6   r   query_start_loc_cpur   cpur   maxitemr   r   r   slicer   minimumzerosint32cumsumr5   r1   repeat_interleaver   r   r   r   r3   r   r   	unsqueezeexpandclampsumr   r   tolistr   r   r   r   r   r   block_table_tensorr   r%   )3r   r   r   r  	split_retr   first_layer_namekv_cache_shaper   r   r   r   r   r   r   r  r   query_lens_cpur   r   query_lens_for_prefillquery_start_loc_devicer   num_extends_slicequery_lens_for_extendseq_lens_for_extendcomputed_kv_lensr   swa_seqlen_for_extendcu_seq_lenstoken_to_seqfetched_shaper   re   max_seqlen_kr%   max_context_chunkr   r   
chunk_endschunk_seq_lenscu_seq_lens_cpumax_cum_tokens	range_idxidx_to_batch_tensortoken_to_batch_tensorr   seq_lens_devicer   r   attn_metadatas3                                                      r    r  z(AiterFlashAttentionMetadataBuilder.build  s
    7 !9
 
 
	 688	
  ""a'' -9DDUKK ( 11A9MMF22622215 3J$ !   (*JT.@m{  DJ 	
 3F'04466,QRR03Fss3KK???,\k\:>>@@EEGG,\k\:>>@@EEGG$\k\26688==?? 4 DEV{UVEV W	  O  !%3K+4M4O4O%P"%9%Ik)++&"  B488::??AA488::??AA$[;%>%@%@AEEGGLLNN 69OPQ9R R	      ?? %k;3L M M$23D$E!"*+<"=25JJL&2(-')D,CA,FFJ) )% $k!O+.5  
 )%+#ABB	     %|+.5	       %6 "7    !,B 4 4 6 6 %t'8$,G*7=;! ! ! 13HH
488::??AA*23355> 5 8 8$ !9 ! ! $/>>$+D>#Q#Q#-==4=#P#P'3t{QU'V'V$0%1"/
  
  
  !9K G.224499;;=NOOJ Zu{;;;1K((#$   **1--|>O/O J )<7>> ?  N $k[1_-U[T  O LA?111abb5+A    -QQQU37799>>@@N^5;GGGdTUTUTUVI"+qqq!""u/EaaaDj/Q"Q"5"9"9 #: # # %*L1D!$L$L$L!%>/"1"4"4T[t"4"T"T)__T[t_LL&**q*1188::+//A/66=DDFF'477RV7WW%&5aaae&<&C&C&E&E)& & &" &:%IkK7!;;&" 3;<MNO+au{?;Q  K LQk.?[QRQSQS_    F37799>>@@37799>>@@$%67;;==BBDD 69OPQ9R R'=  O  %y227799'!+3 
 
 
2DD
!5!5
 /<<
 1@@	

 -88
 *22
 -??
 .::
 $
 0/
 &
  21
 $
 0/
 ,O
  .-!
" ,O#
$ $%
& 0/'
( **)
* JJ+
, JJ-
0 r"   c                     dS )NFrm   )r   argskwargss      r    use_cascade_attentionz8AiterFlashAttentionMetadataBuilder.use_cascade_attention  s    ur"   )F)r   r   r   r   UNIFORM_SINGLE_TOKEN_DECODE_cudagraph_supportr   r   r   r   r   r   r   r   r   r   r   r  r   r  rH  __classcell__)r   s   @r    r   r     s         ,G#$S$$$+P$+P #Y+P  	+P
 +P +P +P +P +P +PZ	$;	 	 	 	 !	W WW 6W 	W
 
'W W W Wr        r"   r   c                   l   e Zd ZU dZeed<   ej        ej        gZ	e
eej                          ed<   edeeez           fd            Zedee         fd            Zedefd            Zeded         fd	            Zeded
         fd            Ze	 ddedededededeedf         fd            ZdS )AiterFlashAttentionBackendTaccept_output_buffersupported_dtypesr	  c                  "    t          d          gS )Nrj   )r   rm   r"   r     get_supported_kernel_block_sizesz;AiterFlashAttentionBackend.get_supported_kernel_block_sizes  s    2r"   c                 
    g dS )N)@      r   rm   )clss    r    get_supported_head_sizesz3AiterFlashAttentionBackend.get_supported_head_sizes  s    ~~r"   c                      dS )N
FLASH_ATTNrm   rm   r"   r    get_namez#AiterFlashAttentionBackend.get_name  s    |r"   r   c                      t           S r$   )r   rm   r"   r    get_impl_clsz'AiterFlashAttentionBackend.get_impl_cls  s    &&r"   r   c                      t           S r$   )r   rm   r"   r    get_builder_clsz*AiterFlashAttentionBackend.get_builder_cls  s    11r"   autor   r!   r}   rC   cache_dtype_str.c                 @    |dz  dk    rt          d          d| |||fS )Nrj   r   z$Block size must be a multiple of 16.ri   )
ValueError)r   r!   r}   rC   r_  s        r    get_kv_cache_shapez-AiterFlashAttentionBackend.get_kv_cache_shape  s3     ?aCDDD:z<CCr"   N)r^  )r   r   r   rN  r   r   r   float16bfloat16rO  r   r   r5   staticmethodr   r   rQ  classmethodrV  r   rY  rx   r[  r]  tuplerb  rm   r"   r    rM  rM    s        !%$%%%5:]EN4ShtEK01SSS d33C.D       \  c    [ c    \ '$89 ' ' ' \' 2T"FG 2 2 2 \2   &	D 	D	D	D 	D 		D
 	D 
sCx	D 	D 	D \	D 	D 	Dr"   rM  c                   (   e Zd Zdej        dfdededededee         dz  dedz  ded	edz  d
ededz  ddfdZ	de
dej        dej        dej        dedej        dedefdZde
dej        dej        dej        dej        dej        dej        dej        dedededej        dej        dej        dej        fdZ	 	 	 d$dej        j        dej        dej        dej        d ej        de
dej        dz  d!ej        dz  d"ej        dz  dej        fd#ZdS )%r   NrB   rC   r   r}   alibi_slopesr   r   logits_soft_cap	attn_typekv_sharing_target_layer_namer	  c                    || _         || _        t          |          | _        || _        | t          j        |t
          j                  }|| _        |d| _	        n|dz
  df| _	        || _
        |d}|| _        |
| _        | j         | j        z  dk    sJ | j         | j        z  | _        |	t          j        t          j        fvrt#          d          d S )Nr  )r   r   r,   r           z@Encoder self-attention is not implemented for FlashAttentionImpl)rB   rC   r   r   r}   r   r   r6   ri  r   r   rj  rl  num_queries_per_kvr   DECODERENCODER_DECODERNotImplementedError)r   rB   rC   r   r}   ri  r   r   rj  rk  rl  s              r    r   z AiterFlashAttentionImpl.__init__  s     #"5\\
(# <EMJJJL(!"*D#1A#5q"9D,"!O.,H)~ 11Q6666"&.D4E"E]2M4QRRR%R   SRr"   rD  queryoutputcu_seqlens_qmax_seqlen_qr   rU   rV   c                    |j         J |j         j        J |j         j        }|j        }|J |j        }|j        }|j        }|j        }|j        }|j        d         |j        d         }}t          ||||||	|
|||| j
                            d          d|           t          j        |||||||dd| j        d| j        | j        d|	           d S )
Nr   r,   r   r-   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   r%   rn  TF)qr  vru  cu_seqlens_krv  r:  min_seqlen_q	dropout_psoftmax_scalecausalwindow_sizeri  
return_lser  )r   r   r   r   r   r   r   r   r   rt   r   r   aiterflash_attn_varlen_funcr   r   ri  )r   rD  rs  r\   r]   rt  ru  rv  r   rU   rV   chunked_metadatar   r   r   r   r   r   key_fetchedvalue_fetcheds                       r    extend_for_sliding_windowz1AiterFlashAttentionImpl.extend_for_sliding_window  s8    ,888,COOO(8O'4'''%4%4)<&6'8&q)&q) # 	#$(-%'22599!)	
 	
 	
 	
  	$%'%(*+*	
 	
 	
 	
 	
 	
r"   r^   r_   r\   r]   r:  r|  r   c                    | j         d         dk    r |                     |||||||	|||
  
         d S t          j        ||||||	|	|d| j        d| j         | j        d          \  }}|j        J |j        j        }|j        }|j	        }|j
        }|j        }|j        }|j        }|j        }|d         |d         }}d }d }t          |          D ]}t!          |||||||||         ||         ||         | j                            d          t'          j                    rdnd	||         
           t          j        ||||||         |	||         |d| j        d| j         | j        d          \  }} ||}| }t+          j        |          }!t+          j        |          }"t/          |!|"||||            |!}|"}t/          |||||           d S )Nr   r   rn  T)ry  r  rz  ru  r{  rv  r:  r|  r}  r~  r  r  ri  r  r,   r   r.   r-   rx  F)rt  
output_lseprefix_output
prefix_lsesuffix_output
suffix_lse)rt  r  r  r  r  )r   r  r  r  r   ri  r   r   r   r   r   r   r   rd   r   rangert   r   r   r   r  r   
empty_liker   )#r   rD  rs  r^   r_   r\   r]   rt  ru  rv  r:  r|  r   r   rU   rV   r  lser   r   r   rc   max_seqlensr   rd   r   r  r  chunked_outputchunked_lse	chunk_idxsuf_outsuf_lse
tmp_outputtmp_lses#                                      r    extend_forwardz&AiterFlashAttentionImpl.extend_forward2  s   $ q!R''**   F/%%%%%*+*
 
 
S  ,888!.!>!U+6
*4	.@,9-:/> 6 L%.q\9Q<]z** 2	& 2	&I#'#(  +I6-i8'	2+66u==!=??!		29=   $  %;)*95)(3)"j /!.     GW  %!(%"-c22
*3//!%&"0*")&    ",%("	
 	
 	
 	
 	
 	
r"   r   r  output_scaleoutput_block_scalec
                 ,   |
J d            ||	t          d          ||                    d          S |j        }
|                    d          \  }}| j                            d          rL|                    t          j                              }|                    t          j                              }| j	        ||}t          j                    r+t          |||||j        | j        |j        |j                   n?t           j        j                            |||||j        | j        |j        |j                   |d|
         }|
|d|
         }|
|d|
         }|d|
         }|j        }|j        }|j        }|j        }|j        }|j        s|dk    r|j        J |||z   d         }|||z   d         }|||z   d         }t;          j        ||||j        j        |j        j        |j        j         |j        j!        dd| j"        d| j#        | j$        |||z   d         	           |dk    r|j%        J tM          |||z             }||         }||         }||         }||         }|j        }|j        }t          j                    r|j        }|j        }| '                    ||||||||j%        j        |j%        j         |j%        j!        d|j(        |||z            |j        |||z            ||
           |dk    ru|j)        J | j#        d         dk    rt          j                    r
J d            ddl*m+} |j        d|         j,        d         dz
  |j,        d         f} |d(i d|d|         d|d|d|d|         d|j        d|         ddd|j-        d|         d|j!        d| j"        ddd| j$        d| j#        d|j(        d|         d| j.        ddd|j        /                    |          d|j        /                    |           dS |j)        J t          j                    r|j,        \  }} }!}"d |0                                z  }#t!          j1        ||!|"|#z  | |#g|j2        d!"          }$t!          j1        ||!| |#z  |"|#g|j2        d!"          }%|3                    |$          }&|3                    |%          }'t;          j4        |d|         |&|'|j(        d|         |j-        d|         |j(        d|         5                    d          |j        |j        |d|         #	  	         n=|j,        \  }(})}"t!          j6        |j2                  j7        d$z  }*|j-        j,        d         }+|j!        tp          z   dz
  tp          z  },t!          j1        |+|)z  |,z  |"z  |*z  d|+|)z  |,z  z  d%z  z   t           j9        |j:        "          }-t           j        j        ;                    |d|         |-|d|         ||| j"        |j(        d|         |j        d|         |j-        d|         |j!        | j$        | j        d&| j.        |j        |j        dtp                     nt          d'          |S ))a  Forward pass with AiterFlashAttention.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [2, num_blocks, block_size, num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NOTE: FP8 quantization, flash-attn expect the size of
              {q,k,v}_descale to be (num_sequences, num_kv_heads).
              We use torch's .expand() to avoid duplicating values
        NzOutput tensor must be provided.zEfused output quantization is not yet supported for FlashAttentionImplr   r   r,   rn  T)ry  r  rz  ru  r{  rv  r:  r|  r}  r~  r  r  ri  r  )rD  rs  r^   r_   r\   r]   rt  ru  rv  r:  r|  r   r   rU   rV   r   z8Sliding window with shuffle layout is not supported yet.)unified_attentionri   ry  r  rz  r  ru  rv  	seqused_kr:  r~  r  ri  r  r   softcap	q_descale	k_descale	v_descalerj   rn   r   )	QKVr`   context_lensblock_tables_stride0K_QScaleV_QScaleout_      r-   z3Cascade attention is not implemented for ROCM AITERrm   )<rr  fill_r   unbindr   r   viewr
   r   rl  r   r  r   r   rU   rV   r   ops_C_cache_opsreshape_and_cache_flash_k_scale_v_scaler   r   r   r   r   r   r   r  r  r   r   r   r   r   ri  r   r   r  r   r   "aiter.ops.triton.unified_attentionr  rp   r   rj  r'  r   r   r5   r   
pa_fwd_asmr   finfobits_PARTITION_SIZE_ROCMuint8r   paged_attention_v1).r   r   rs  r^   r_   r  rD  rt  r  r  r   r\   r]   output_actual_tokensr   r   r   r   r   prefill_queryprefill_keyprefill_valueextend_tokens_sliceextend_querysextend_keysextend_valuesextend_outputsrU   rV   r  descale_shaper   r!   r}   rC   r   r   r   r   r   r   rB   nbytes_per_qo_elemnum_seqsmax_num_partitionsworkspace_buffers.                                                 r    forwardzAiterFlashAttentionImpl.forward  s   6 !!#D!!!#'9'E%W    <<??" *;!)!3!3	; ))%00 	I!'7'A'C'CDDI%**+;+E+G+GHHK-5! 9;;  1!.'!)!)	 	 	 	 	&>>!.'NN	 	 	 ((()?((()C,,,-E%&8'8&89#/$1#/););( g	a$5AAA %&7:K&K&M&M N!"36G"G"I"IJ %&7:K&K&M&M N,#!#!.!?!O!.!?!O!.!?!M!.!?!K!"!"&* $ 3!%!2,->AR-R-T-TU   $ Q$4@@@&+%'8;L'L' '# !&&9 :!"56 %&9 :!'(;!<..!=?? 4+3G+3G##"/'#'' +)!.!>!N!.!>!L!.!>!J!" - 9#kK&??! "/!;#kK&??" $#' $   . Q$4@@@&q)R//-IKK  R K     
 &5l{lCI!LqP!*%M &%    2!2 233#) &+ ##5$5#566	
 &3%B<K<%P%P &'Q #0"8+"F"F &3%>%> '+jj  $t &*%6%6 %)$7$7 %2$=l{l$K$K !% 4 4 #'$  #(."7"7"F"F"F!" #(."7"7"F"F"F# & F$4@@@!=?? ?FOoCJ
L)i44666A',{#\9>:qQ'o%( ( ($
 (-{#\:?IqQ)/%( ( ($
 %.$5$56F$G$GM&1&9&9:J&K&KO$ 2!2 23')%2%>||%L%2%;L[L%I-:-F([L. &))!.!6!.!6#$6%6$67     /4k+Ay)).U[)A)A)F!)K&,5;A>H%14HH1L-*.& (-{!I-0BBYN,-x)36HHIAMN $k%}( ( ($ IO661 112(0001!#
%1,;,?%5l{lC%.||<%1)+,,%  * &E   r"   )NNN)r   r   r   r   rp  r   r   r   r   r   r   r   r   r  r  nnModuler  rm   r"   r    r   r     s        )-#0#837% %% % 	%
 % 5kD(% d
% % % !% '*Dj% 
% % % %N;
2;
 |;
 ;
 l;
 ;
 \;
 ;
 ;
 ;
 ;
 ;
zv
2v
 |v
 \	v

 |v
 <v
 \v
 v
 lv
 v
 v
 v
 \v
 lv
 v
  !v
 v
 v
 v
@ '+,026U UxU |U \	U
 |U ,U 3U t#U lT)U "L4/U 
U U U U U Ur"   r   )A__doc__dataclassesr   typingr   r   vllm._aiter_opsr   vllm.attention.layerr   vllm.configr   r   vllm.loggerr	   vllm.platformsr
   vllm.utils.math_utilsr   vllm.utils.platform_utilsr   vllm.v1.attention.backendr   r   r   r   r   r   r    vllm.v1.attention.backends.utilsr   'vllm.v1.attention.ops.merge_attn_statesr   vllm.v1.kv_cache_interfacer   r  r   is_rocmr  vllm.triton_utilsr   r   r!   r&   jit	constexprr[   r   r   r   r   rt   r   r   r   loggerr   r   r   r   r   r   r   rM  r   rm   r"   r    <module>r     su   0 / ! ! ! ! ! !        * * * * * * * * * * * * ? ? ? ? ? ? ? ? # # # # # # + + + + + + & & & & & & 2 2 2 2 2 2                       F E E E E E 4 4 4 4 4 4 $  I
LLL,,,,,,,,P P P1 1 1 Z_< _<  <!_<" l#_<$ L%_< _< _< Z_<B2
<2
\2
 \2
 |	2

 l2
 ,2
 ,2
 |2
 2
 L2
 2
 2
 2
 2
 2
 2
h Z3@ L3@ |3@  !3@ 3@ 3@ Z3@j2
\2
|2
 <2
 \	2

 l2
 2
 ,2
 ,2
 2
 2
 2
j 
X		 " " " " " " " " " " " " " " " "                 
9 
9 
9 
9 
9 
9 
9 
9 6 6 6 6 6 6 6 6 &, &, &, &, &, &, &, &,RX X X X X89X X Xv"D "D "D "D "D!1 "D "D "DJr r r r rm r r r r rr"   