
    .`i';                     ^   d Z ddlmZ ddlmZ ddlZddlmZ ddlm	Z	 ddl
mZmZ ddlmZ dd	lmZmZmZmZmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z   e	e!          Z"e G d d                      Z# G d dee#                   Z$ G d de          Z% G d de          Z&dS )z>Attention layer with PagedAttention and Triton prefix prefill.    )	dataclass)ClassVarN)
VllmConfig)init_logger)QuantKeykFp8StaticTensorSym)current_platform)AttentionBackendAttentionCGSupportAttentionImplAttentionMetadataBuilderAttentionTypeCommonAttentionMetadata
MultipleOf)FlashAttentionMetadata)chunked_prefill_paged_decode)PagedAttention)triton_reshape_and_cache_flash)AttentionSpecc                      e Zd ZU eed<   eed<   ej        ed<   eed<   ej        ed<   ej        ed<   ej        ed<   eed<   eed	<   ej        d
z  ed<   ej        d
z  ed<   ej        d
z  ed<   d
Zej        d
z  ed<   d
Z	ej        d
z  ed<   d
S )RocmAttentionMetadatanum_actual_tokensmax_query_lenquery_start_locmax_seq_lenseq_lensblock_tableslot_mappinguse_cascadecommon_prefix_lenNcu_prefix_query_lensprefix_kv_lenssuffix_kv_lensscheduler_metadataprefix_scheduler_metadata)
__name__
__module____qualname__int__annotations__torchTensorboolr$   r%        x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/rocm_attn.pyr   r   '   s          \!!!l, ,----L4''''L4'''' /3t+22259u|d299999r/   r   c            	            e Zd ZU ej        Zee         ed<   dede	e
         dedej        f fdZdedefd	Z	 ddedededefdZ xZS )RocmAttentionMetadataBuilder_cudagraph_supportkv_cache_speclayer_namesvllm_configdevicec                 $   t                                          ||||           |j        | _        |j        }|                    |j                  | _        |                    |j                  | _        |	                                | _
        d S N)super__init__
block_sizemodel_configget_num_attention_headsparallel_confignum_heads_qget_num_kv_headsnum_heads_kvget_head_sizeheaddim)selfr4   r5   r6   r7   r=   	__class__s         r0   r;   z%RocmAttentionMetadataBuilder.__init__H   s     	[&III'2"/'??'
 
 )99+:UVV#1133r/   common_attn_metadatareturnc                     |                      d|          }|j                            d           |j                                         |j                                         |S )Nr      )buildr   fill_r   zero_query_start_loc_cpu)rE   rG   attn_metadatas      r0   build_for_cudagraph_capturez8RocmAttentionMetadataBuilder.build_for_cudagraph_captureZ   sc     

1&:;; 	$$Q'''
 	,22444066888r/   Fr    
fast_buildc                    |j         }|j        }|j        }|j        }|j        }|j        }	|j        }
|dk    }|rt          j        d|gt          j	        | j
                  }t          j        |gt          j	        | j
                  }|j                                        |z
  }|                    | j
                  }nd }d }d }d }t          ||||||	|
||||||          }|S )Nr   )dtyper7   )r   r   r   r   r   r   r   r   r    r!   r"   r#   r%   )r   r   r   r   r   block_table_tensorr   r+   tensorint32r7   cputor   )rE   r    rG   rQ   r   r   r   r   r   rT   r   r   r!   r"   r#   r%   rO   s                    r0   rK   z"RocmAttentionMetadataBuilder.buildk   s*    1B,:*6.>'01D+8'!+ 	-#(<%&ek$+$ $ $  #\"#5;t{  N 2:>>@@CTTN+..t{;;NN#' !N!N(,%-/'+#*%#/!5))&?
 
 
 r/   )F)r&   r'   r(   r   ALWAYSr3   r   r*   r   liststrr   r+   r7   r;   r   r   rP   r)   r-   rK   __classcell__)rF   s   @r0   r2   r2   E   s         7I7P!34PPP4$4 #Y4  	4
 4 4 4 4 4 4$$;	   * !	/ // 6/ 	/
 
/ / / / / / / /r/   r2   c                      e Zd ZU dZeed<   ej        ej        ej	        gZ
eeej                          ed<   edeeez           fd            Zedee         fd            Zededdfd	            Zedefd
            Zeded         fd            Ze	 ddedededededeedf         fd            Zedefd            Zeded         fd            ZdS )RocmAttentionBackendTaccept_output_buffersupported_dtypesrH   c                  
    g dS )N)       i   r.   r.   r/   r0    get_supported_kernel_block_sizesz5RocmAttentionBackend.get_supported_kernel_block_sizes   s     }}r/   c                 
    g dS )N)rc   @   `                  r.   )clss    r0   get_supported_head_sizesz-RocmAttentionBackend.get_supported_head_sizes   s    4444r/   	head_sizeNc           	          |                      |          sE| j                            d          }t          d| d| d|                                  d          d S )NBackendz
Head size z is not supported by z. Supported head sizes are: zd. Set --attention-backend=FLEX_ATTENTION to use FlexAttention backend which supports all head sizes.)supports_head_sizer&   removesuffix
ValueErrorrn   )rm   ro   	attn_types      r0   validate_head_sizez'RocmAttentionBackend.validate_head_size   s    %%i00 	11)<<IGY G GY G G-0-I-I-K-KG G G  	 	r/   c                      dS )N	ROCM_ATTNr.   r.   r/   r0   get_namezRocmAttentionBackend.get_name   s    {r/   RocmAttentionImplc                      t           S r9   )rz   r.   r/   r0   get_impl_clsz!RocmAttentionBackend.get_impl_cls   s      r/   auto
num_blocksr<   num_kv_headscache_dtype_str.c                 @    |dz  dk    rt          d          d| |||fS )Nrb   r   z$Block size must be a multiple of 16.   )rt   )r~   r<   r   ro   r   s        r0   get_kv_cache_shapez'RocmAttentionBackend.get_kv_cache_shape   s3     ?aCDDD:z<CCr/   c                      dS )NFr.   )argskwargss     r0   use_cascade_attentionz*RocmAttentionBackend.use_cascade_attention   s    ur/   r2   c                      t           S r9   )r2   r.   r/   r0   get_builder_clsz$RocmAttentionBackend.get_builder_cls   s    ++r/   )r}   )r&   r'   r(   r_   r-   r*   r+   float16bfloat16float32r`   r   rZ   rS   staticmethodr)   r   rd   classmethodrn   rv   r[   ry   typer|   tupler   r   r   r.   r/   r0   r^   r^      s        !%$%%%5htEK01    d33C.D    \ 5c 5 5 5 [5 3 4    [ c    \ !$23 ! ! ! \!   &	D 	D	D	D 	D 		D
 	D 
sCx	D 	D 	D \	D $    \ ,T"@A , , , \, , ,r/   r^   c                   H   e Zd ZdefdZdej        ddfdedededede	e         dz  d	edz  d
e
dedz  dededz  dej        dz  ddfdZ	 	 	 ddej        j        dej        dej        dej        dej        dedej        dz  dej        dz  dej        dz  dej        fdZdS )rz   	quant_keyc                     |t           k    S r9   )r   )rE   r   s     r0   fused_output_quant_supportedz.RocmAttentionImpl.fused_output_quant_supported   s    ///r/   N	num_headsro   scaler   alibi_slopessliding_windowkv_cache_dtypelogits_soft_capru   kv_sharing_target_layer_namesinksrH   c                 J   || _         || _        t          |          | _        || _        | t          j        |t
          j                  }|| _        |d| _	        n|dz
  df| _	        || _
        |d}|| _        |
| _        | j         | j        z  | _        t                              |           |	t           j        t           j        fvrt'          d          t)          j                    | _        || _        |'|j        d         |k    sJ d|j         d| d            d S d S )	N)rS   )r   rJ   r   z?Encoder self-attention is not implemented for RocmAttentionImplz[Sinks must have the same number of heads as the number of heads in the layer. Sinks shape: z, num_heads: .)r   ro   floatr   r   r+   rU   r   r   r   r   r   r   num_queries_per_kvr^   rv   r   DECODERENCODER_DECODERNotImplementedErrorr	   	fp8_dtyper   shape)rE   r   ro   r   r   r   r   r   r   ru   r   r   s               r0   r;   zRocmAttentionImpl.__init__   sT    #"5\\
(# <EMJJJL(!"*D#1A#5q"9D,"O.,H)"&.D4E"E//	:::]2M4QRRR%Q   *355
;q>Y...+49K+ +'+ + + /.. ..r/   layerquerykeyvaluekv_cacherO   outputoutput_scaleoutput_block_scalec
           
          |
J d            |	t          d          ||                    d          S |j        du sJ |j        }
t	          j        || j        | j                  \  }}| j        {|j	        d         }|dk    o||dz
  z  dk    }|r0t	          j
        |||||j        | j        |j        |j                   n*t          |||||j        | j        |j        |j                   | j                            d          rI|                    | j                  }|                    | j                  }|j        d	k    s
J d
            |j        }|j        }|j        }|j        }|j        }t3          di d|d|
         d|d|
         d|d|
         d|d|
         d| j        d|d|d|d|d|d|d|d|j        d|j        d| j        d| j        d         d| j        d|d| j         |S )a  Forward pass with FlashAttention.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [2, num_blocks, block_size, num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NzOutput tensor must be provided.zPfused block_scale output quantization is not yet supported for RocmAttentionImplr   F   rJ   fp8g      ?z-A non 1.0 q_scale is not currently supported.r   r   r   r   r   	key_cachevalue_cacher   r   r   r   r   k_scalev_scaler   r   sm_scaler   r   r.   )r   rL   r   r   r   split_kv_cacher   ro   r   r   write_to_paged_cacher   r   _k_scale_v_scaler   
startswithviewr   _q_scale_floatr   r   r   r   r   r   r   r   r   r   )rE   r   r   r   r   r   rO   r   r   r   r   r   r   r<   is_pow2cu_seqlens_q	seqused_kmax_seqlen_qmax_seqlen_kr   s                       r0   forwardzRocmAttentionImpl.forward  s   0 !!#D!!!)%)  
  <<??"(E1111 *;!/!>d'"
 "
	; ,4 %*1-J 1nM*
Q*G1*LG 3!.'NN	 	 	 	 /!.'NN	 	 	 ))%00 	!t~66I%**4>::K'3...? /.. %4!*	$2$0#/ 	% 	
 	
 	
***++	
&&&''	
 ***++	
 ,,,--		

  ..	
  i	
 $	
 $	
 )L	
 Y	
 %	
 ',	
 NN	
 NN	
 **	
   .q11!	
" ZZ#	
$ &%	
& **'	
 	
, r/   )NNN)r&   r'   r(   r   r   r   r   r)   r   rZ   r[   r+   r,   r;   nnModuler   r   r.   r/   r0   rz   rz      s       0h 0 0 0 0 )-#0#837%)1 11 1 	1
 1 5kD(1 d
1 1 1 !1 '*Dj1 |d"1 
1 1 1 1v '+,026} }x} |} \	}
 |} ,} .} t#} lT)} "L4/} 
} } } } } }r/   rz   )'__doc__dataclassesr   typingr   r+   vllm.configr   vllm.loggerr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   r   vllm.platformsr	   vllm.v1.attention.backendr
   r   r   r   r   r   r   %vllm.v1.attention.backends.flash_attnr   2vllm.v1.attention.ops.chunked_prefill_paged_decoder    vllm.v1.attention.ops.paged_attnr   4vllm.v1.attention.ops.triton_reshape_and_cache_flashr   vllm.v1.kv_cache_interfacer   r&   loggerr   r2   r^   rz   r.   r/   r0   <module>r      s]   E D ! ! ! ! ! !        " " " " " " # # # # # #        , + + + + +                  I H H H H H      < ; ; ; ; ;      5 4 4 4 4 4	X		 : : : : : : : ::U U U U U#;<Q#R U U UpA, A, A, A, A,+ A, A, A,Ht t t t t t t t t tr/   