
    .`i\U                        d Z ddlmZ ddlmZ ddlZddlmZmZ ddl	m
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z%  ee&          Z'dZ(dZ)e G d d                      Z* G d dee*                   Z+ G d de          Z, G d de          Z-dS )z-High-Performance Triton-only Attention layer.    )	dataclass)ClassVarN)CUDAGraphMode
VllmConfig)
CacheDType)init_logger)QuantKeykFp8StaticTensorSym)current_platform)DeviceCapability)next_power_of_2)AttentionBackendAttentionCGSupportAttentionImplAttentionMetadataBuilderAttentionTypeCommonAttentionMetadata
MultipleOf)context_attention_fwd)triton_reshape_and_cache_flash)unified_attention)AttentionSpec      c                      e Zd ZU eed<   eed<   ej        ed<   eed<   ej        ed<   ej        ed<   ej        ed<   eed<   eed	<   ej        ed
<   ej        ed<   ej        ed<   eed<   eed<   ej        dz  ed<   ej        dz  ed<   ej        dz  ed<   dZej        dz  ed<   dZ	ej        dz  ed<   dZ
eeeeeef                  f         dz  ed<   edej        dz  fd            ZdS )TritonAttentionMetadatanum_actual_tokensmax_query_lenquery_start_locmax_seq_lenseq_lensblock_tableslot_mappingseq_threshold_3Dnum_par_softmax_segmentssoftmax_segm_outputsoftmax_segm_maxsoftmax_segm_expsumuse_cascadecommon_prefix_lenNcu_prefix_query_lensprefix_kv_lenssuffix_kv_lensscheduler_metadataprefix_scheduler_metadatamm_prefix_rangereturnc                 X     j         dS  j        j        d         } j        j         fdt	          |          D             }t          d |D                       rdS fd|D             }t          j                            |t          j	                  
                    d          S )zConvert mm_prefix_range dict to padded tensor for Triton kernel.

        Returns shape: (num_seqs, max_ranges, 2) with 0-padding for empty ranges.
        Empty ranges have start==end==0, which kernel skips via is_valid check.
        Nr   c                 N    g | ]!}j                             |d g          pd g"S )r   r   )r0   get).0iselfs     z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/triton_attn.py
<listcomp>zBTritonAttentionMetadata.mm_prefix_range_tensor.<locals>.<listcomp>`   sB     
 
 
BCD $$Q11=fX
 
 
    c              3   $   K   | ]}|d gk    V  dS )r4   N )r6   rs     r9   	<genexpr>zATritonAttentionMetadata.mm_prefix_range_tensor.<locals>.<genexpr>e   s(      22qVH}222222r;   c                 z    g | ]7}t          j        |t           j                                       dd          8S )dtypedevice   )torchtensorint32view)r6   r>   rC   s     r9   r:   zBTritonAttentionMetadata.mm_prefix_range_tensor.<locals>.<listcomp>i   sL     
 
 
 L%+f===BB2qII
 
 
r;   )layout)r0   r!   shaperC   rangeallrF   nestednested_tensorjaggedto_padded_tensor)r8   num_seqsrange_listsrange_tensorsrC   s   `   @r9   mm_prefix_range_tensorz.TritonAttentionMetadata.mm_prefix_range_tensorP   s     '4=&q)%
 
 
 
GLX
 
 

 22k22222 	4
 
 
 
 
 
 

 |))%, * 
 


1

	r;   )__name__
__module____qualname__int__annotations__rF   Tensorboolr.   r/   r0   dictlisttuplepropertyrU   r=   r;   r9   r   r   ,   s         \!!!l,!!!!%%%l"""%%% ,----L4''''L4'''' /3t+22259u|d2999?COT#tE#s(O445<CCCt(;    X  r;   r   c            	            e Zd ZU ej        Zee         ed<   dede	e
         dedej        f fdZdedefd	Z	 ddedededefdZ xZS )TritonAttentionMetadataBuilder_cudagraph_supportkv_cache_speclayer_namesvllm_configrC   c                     t                                          ||||           |j         _        |j        }|                    |j                   _        |                    |j                   _        |	                                 _
         j        j        j        t          j        t          j        t          j        fv  _        t&           j        z   _         j        r6 j        j        j        }|s
J d            t-          | fd           _        t.           _        t3           j
                  }t5          j         j         j         j        |ft4          j        |           _        t5          j         j         j         j        ft4          j        |           _        t5          j         j         j         j        ft4          j        |           _        d S )Nz3CUDA Graphs enabled but no capture sizes specified.c                 2    t          | j        z
            S N)absr$   )xr8   s    r9   <lambda>z9TritonAttentionMetadataBuilder.__init__.<locals>.<lambda>   s    c!d&;";<< r;   )keyrA   ) super__init__
block_sizemodel_configget_num_attention_headsparallel_confignum_heads_qget_num_kv_headsnum_heads_kvget_head_sizeheaddimrf   compilation_configcudagraph_moder   FULL_AND_PIECEWISEFULL_DECODE_ONLYFULLdecode_cudagraph_enabledMIN_LAUNCH_GRID_SIZE_2Dr$   cudagraph_capture_sizesminNUM_PAR_SOFTMAX_SEGMENTSr%   r   rF   emptyfloat32r&   r'   r(   )	r8   rd   re   rf   rC   rq   capture_sizesheaddim_padded	__class__s	   `       r9   ro   z'TritonAttentionMetadataBuilder.__init__v   s    	[&III'2"/'??'
 
 )99+:UVV#1133 />0." 	% !84;L L ( 
	 ,?WM WW"WWW=
 %(<<<<% % %D!
 )A%(66#(;% -	 -	$
 	$
 	$
  !&"D$4d6ST-!
 !
 !

 $);"D$4d6ST-$
 $
 $
   r;   common_attn_metadatar1   c                 f    |                      d|          }|j                            d           |S )Nr      )buildr!   fill_)r8   r   attn_metadatas      r9   build_for_cudagraph_capturez:TritonAttentionMetadataBuilder.build_for_cudagraph_capture   s7     

1&:;; 	$$Q'''r;   Fr*   
fast_buildc                 F   |j         }|j        }|j        }|j        }|j        }|j        }	|j        }
|dk    }|rt          j        d|gt          j	        | j
                  }t          j        |gt          j	        | j
                  }|j                                        |z
  }|                    | j
                  }nd }d }d }d }t          di d|d|d|d|d|d|	d	|
d
|d|d|d|d|d|d| j        d| j        d| j        d| j        d| j        }|S )Nr   rA   r   r   r   r    r!   r"   r#   r)   r*   r+   r,   r-   r/   r$   r%   r&   r'   r(   r=   )r   r   r    r   r!   block_table_tensorr#   rF   rG   rH   rC   cputor   r$   r%   r&   r'   r(   )r8   r*   r   r   r   r   r    r   r!   r   r#   r)   r+   r,   r-   r/   r   s                    r9   r   z$TritonAttentionMetadataBuilder.build   s    1B,:*6.>'01D+8'!+ 	-#(<%&ek$+$ $ $  #\"#5;t{  N 2:>>@@CTTN+..t{;;NN#' !N!N(,%/ 
 
 
//
'-
 ,O
 $	

 X
 +*
 &
 $
 0/
 "6!5
 *>
 *>
 '@&?
 "22
 &*%B%B
  !% 8 8!
" "22#
$ !% 8 8%
( r;   F)rV   rW   rX   r   ALWAYSrc   r   rZ   r   r^   strr   rF   rC   ro   r   r   r   rY   r\   r   __classcell__)r   s   @r9   rb   rb   s   s         7I7P!34PPPF
$F
 #YF
  	F

 F
 F
 F
 F
 F
 F
P$;	     !	4 44 64 	4
 
!4 4 4 4 4 4 4 4r;   rb   c                   ~   e Zd ZU dZeed<   ej        ej        ej	        gZ
eeej                          ed<   g dZeee                  ed<   edeeez           fd            Zedefd            Zeded	         fd
            Ze	 d"dedededededeedf         fd            Ze	 d#dedeedf         fd            Zedefd            Zeded         fd            Zededefd            Zedefd            Zedefd            Z ededefd            Z!edefd            Z"ede#defd             Z$d!S )$TritonAttentionBackendTaccept_output_buffersupported_dtypes)autobfloat16fp8fp8_e4m3fp8_e5m2supported_kv_cache_dtypesr1   c                  "    t          d          gS )Nr   )r   r=   r;   r9    get_supported_kernel_block_sizesz7TritonAttentionBackend.get_supported_kernel_block_sizes  s    2r;   c                      dS )NTRITON_ATTNr=   r=   r;   r9   get_namezTritonAttentionBackend.get_name  s    }r;   TritonAttentionImplc                      t           S ri   )r   r=   r;   r9   get_impl_clsz#TritonAttentionBackend.get_impl_cls  s    ""r;   r   
num_blocksrp   num_kv_heads	head_sizecache_dtype_str.c                 @    |dz  dk    rt          d          | d|||fS )Nr   r   z$Block size must be a multiple of 16.rE   )
ValueError)r   rp   r   r   r   s        r9   get_kv_cache_shapez)TritonAttentionBackend.get_kv_cache_shape  s3     ?aCDDDAz<CCr;   Finclude_num_layers_dimensionc                     | rdS dS )N)r   r   rE            )r   r   rE   r   r   r=   )r   s    r9   get_kv_cache_stride_orderz0TritonAttentionBackend.get_kv_cache_stride_order&  s     ( 	&%% r;   c                      dS )NFr=   )argskwargss     r9   use_cascade_attentionz,TritonAttentionBackend.use_cascade_attention3  s    ur;   rb   c                      t           S ri   )rb   r=   r;   r9   get_builder_clsz&TritonAttentionBackend.get_builder_cls7  s    --r;   c                     |dk    S )N    r=   )clsr   s     r9   supports_head_sizez)TritonAttentionBackend.supports_head_size;  s    Br;   c                     dS NTr=   r   s    r9   supports_mm_prefixz)TritonAttentionBackend.supports_mm_prefix?      tr;   c                     dS r   r=   r   s    r9   supports_sinkz$TritonAttentionBackend.supports_sinkC  r   r;   	attn_typec                 b    |t           j        t           j        t           j        t           j        fv S )z-TritonAttention supports all attention types.)r   DECODERENCODERENCODER_ONLYENCODER_DECODER)r   r   s     r9   supports_attn_typez)TritonAttentionBackend.supports_attn_typeG  s.     !!&)	
 
 	
r;   c                     dS r   r=   r   s    r9   supports_alibi_sqrtz*TritonAttentionBackend.supports_alibi_sqrtQ  r   r;   
capabilityc                     dS r   r=   )r   r   s     r9   supports_compute_capabilityz2TritonAttentionBackend.supports_compute_capabilityU  r   r;   N)r   r   )%rV   rW   rX   r   r\   rZ   rF   float16r   r   r   r   r^   rB   r   r   staticmethodrY   r   r   r   r   typer   r_   r   r   r   r   classmethodr   r   r   r   r   r   r   r=   r;   r9   r   r      s        !%$%%%5htEK01   
= = =xZ(89     d33C.D       \  c    \ #$45 # # # \#   &	D 	D	D	D 	D 		D
 	D 
sCx	D 	D 	D \	D -2
 
&*
	sCx
 
 
 \
 $    \ .T"BC . . . \. 3 4    [ 4    [ d    [ 
3 
4 
 
 
 [
 D    [ 5E $    [  r;   r   c                      e Zd ZdefdZdej        dddfdedededed	e	e         dz  d
edz  de
dedz  dededz  dej        dz  deddfdZ	 	 	 ddej        j        dej        dej        dej        dej        dedej        dz  dej        dz  dej        dz  dej        fdZdej        dej        dej        dej        dedej        j        dej        fdZdS )r   	quant_keyc                     |t           k    S ri   )r
   )r8   r   s     r9   fused_output_quant_supportedz0TritonAttentionImpl.fused_output_quant_supported[  s    ///r;   NF	num_headsr   scaler   alibi_slopessliding_windowkv_cache_dtypelogits_soft_capr   kv_sharing_target_layer_namesinksuse_alibi_sqrtr1   c                 `   || _         || _        t          |          | _        || _        | t          j        |t
          j                  }|| _        |d| _	        n6|	t          j        t          j        fv r|dz
  |dz
  f| _	        n|dz
  df| _	        || _        |d}|| _        |
| _        | j         | j        z  | _        |	| _        t%          j                    | _        || _        |'|j        d         |k    sJ d|j         d| d            || _        t%          j                    | _        d S )N)rB   )rD   rD   r   r   z[Sinks must have the same number of heads as the number of heads in the layer. Sinks shape: z, num_heads: .)r   r   floatr   r   rF   rG   r   r   r   r   r   r   r   r   r   num_queries_per_kvr   r   	fp8_dtyper   rK   r   is_cudasupports_quant_query_input)r8   r   r   r   r   r   r   r   r   r   r   r   r   s                r9   ro   zTritonAttentionImpl.__init__^  s]    #"5\\
(# <EMJJJL(!"*D=0-2LMMM#1A#5~7I"JD#1A#5q"9D,"O.,H)"&.D4E"E")355
;q>Y...+49K+ +'+ + + /..
 -*:*B*D*D'''r;   layerqueryrm   valuekv_cacher   outputoutput_scaleoutput_block_scalec
           
         |
J d            |	t          d          ||                    d          S |j        du sJ |j        }
| j        t
          j        t
          j        fv r:|                     |d|
         |d|
         |d|
         |d|
         ||          S |	                    d          \  }}| j
        ||z|x| j                            d          r4|                    | j                  }|                    | j                  }t          |||||j        | j        |j        |j                   | j                            d          rY|j        | j        k    r4|                    | j                  }|                    | j                  }|j        dk    s
J d	            |j        }|j        }|j        }|j        }|j        }|j        }|j        }|j        }|j        }|j        }|j        d         dz
  |j        d
         f}|j         }tC          d&i d|d|
         d|d|d|d|
         d|d|d|d|d| j"        ddd| j#        d| j$        d| j%        d|d| j&        ddd|j        '                    |          d|j        '                    |          d|d|d |d!|d"|d#| j(        d$|d%| |S )'a  Forward pass with Paged Attention impl. in Triton.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [num_blocks, 2, block_size, num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NzOutput tensor must be provided.zRfused block_scale output quantization is not yet supported for TritonAttentionImplr   Fr   r   g      ?z-A non 1.0 q_scale is not currently supported.rE   qkvoutcu_seqlens_qmax_seqlen_q	seqused_kmax_seqlen_ksoftmax_scalecausalTr   r   window_sizer"   softcap	q_descale	k_descale	v_descaler$   r%   r&   r'   r(   r   r   r0   r=   ))NotImplementedErrorr   r)   r   r   r   r   r   _forward_encoder_attentionunbindr   r   
startswithrI   r   r   r#   _k_scale_v_scalerB   _q_scale_floatr   r!   r   r    r"   r$   r%   r&   r'   r(   rK   rU   r   r   r   r   r   r   expandr   )r8   r   r   rm   r   r   r   r   r   r   r   	key_cachevalue_cacher   r   r   r   r"   r$   r%   r&   r'   r(   descale_shaperU   s                            r9   forwardzTritonAttentionImpl.forward  s!   0 !!#D!!!)%+  
  <<??"(E1111 *; >m8-:OPPP 22((()&&&'((())))*   "*!3!3	; -5! "--e44 ?%NN4>::	)..t~>> +*#	 	 	 ))%00 	$.00%NN4>::	)..t~>>'3...? /.. %4!*	$2$0#/(9#0#I +?(9+?%+A.2IOA4FG!.!E 	
 	
 	
&&&''	
i	
 k	
 )))**		

 &	
 &	
  i	
 &	
 **	
 4	
 **	
  ..	
 ++	
 $	
 ((	
  d!	
" n++M:::#	
$ n++M:::%	
& .-'	
( &>%=)	
* !4 3+	
, .--	
. !4 3/	
0 **1	
2 &3	
4 325	
 	
: r;   c                     | j                             d          rt          d          |j        }|j        }|j        }	t          |||||||	d| j        | j        d         | j        d                    |S )a  Forward pass for encoder attention without KV cache.

        Args:
            query: shape = [num_encoder_tokens, num_heads, head_size]
            key: shape = [num_encoder_tokens, num_kv_heads, head_size]
            value: shape = [num_encoder_tokens, num_kv_heads, head_size]
            output: shape = [num_encoder_tokens, num_heads, head_size]
            attn_metadata: Encoder attention metadata
            layer: The attention layer
        r   z3quantization is not supported for encoder attentionFr   r   )r   r   r   ob_start_loc	b_seq_lenmax_input_len	is_causalr   sliding_window_qsliding_window_k)	r   r  r  r   r!   r   r   r   r   )
r8   r   rm   r   r   r   r   r   r!   r   s
             r9   r  z.TritonAttentionImpl._forward_encoder_attention  s    ( ))%00 	%E  
 (7 )%3 	''*!03!03	
 	
 	
 	
 r;   )NNN)rV   rW   rX   r	   r   r   r   rY   r   r^   r   rF   r[   r\   ro   nnModuler   r  r  r=   r;   r9   r   r   Z  s*       0h 0 0 0 0 )-#0#837%)$0E 0E0E 0E 	0E
 0E 5kD(0E d
0E 0E 0E !0E '*Dj0E |d"0E 0E 
0E 0E 0E 0Et '+,026M MxM |M \	M
 |M ,M /M t#M lT)M "L4/M 
M M M M^,|, \, |	,
 , /, x, 
, , , , , ,r;   r   ).__doc__dataclassesr   typingr   rF   vllm.configr   r   vllm.config.cacher   vllm.loggerr   9vllm.model_executor.layers.quantization.utils.quant_utilsr	   r
   vllm.platformsr   vllm.platforms.interfacer   vllm.utils.math_utilsr   vllm.v1.attention.backendr   r   r   r   r   r   r   .vllm.v1.attention.ops.triton_prefill_attentionr   4vllm.v1.attention.ops.triton_reshape_and_cache_flashr   .vllm.v1.attention.ops.triton_unified_attentionr   vllm.v1.kv_cache_interfacer   rV   loggerr   r   r   rb   r   r   r=   r;   r9   <module>r)     s   4 3 ! ! ! ! ! !        1 1 1 1 1 1 1 1 ( ( ( ( ( ( # # # # # #        , + + + + + 5 5 5 5 5 5 1 1 1 1 1 1                  Q P P P P P      M L L L L L 4 4 4 4 4 4	X		    C C C C C C C CLI I I I I%=>U%V I I IXX X X X X- X X Xvq q q q q- q q q q qr;   