"""Attention layer with FlashAttention."""

import torch

from vllm.v1.attention.backend import AttentionType
from vllm.v1.attention.backends.fa_utils import (
    is_flash_attn_varlen_func_available)
from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
    triton_reshape_and_cache_flash_diffkv)

if is_flash_attn_varlen_func_available():
    # NOTE: the compiled module only records the symbol name for the varlen
    # kernel; importing it from fa_utils is a reconstruction (that module is
    # the only already-imported plausible re-export).
    from vllm.v1.attention.backends.fa_utils import flash_attn_varlen_func

from vllm.logger import init_logger
from vllm.v1.attention.backends.utils import get_kv_cache_layout

from .flash_attn import (FlashAttentionBackend, FlashAttentionImpl,
                         FlashAttentionMetadata, cascade_attention)

logger = init_logger(__name__)
eded         fd            Ze	 dd
edededede	deedf         fd            Ze	 ddedeedf         fd            ZdS )FlashAttentionDiffKVBackend   head_size_vreturnNc                     || _         d S N)r   )clsr   s     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/attention/backends/flash_attn_diffkv.pyset_head_size_vz+FlashAttentionDiffKVBackend.set_head_size_v    s    %    c                      dS )NFLASH_ATTN_DIFFKV r   r   r   get_namez$FlashAttentionDiffKVBackend.get_name$   s    ""r   r   c                      t           S r   )FlashAttentionDiffKVImplr   r   r   get_impl_clsz(FlashAttentionDiffKVBackend.get_impl_cls(   s    ''r   auto
num_blocks
block_sizenum_kv_heads	head_sizecache_dtype_str.c                 X    |dz  dk    rt          d          | |||t          j        z   fS )N   r   z$Block size must be a multiple of 16.)
ValueErrorr   r   )r!   r"   r#   r$   r%   s        r   get_kv_cache_shapez.FlashAttentionDiffKVBackend.get_kv_cache_shape.   sA     ?aCDDD3??	
 	
r   Finclude_num_layers_dimensionc                     t                      }|dk    r| rdS |dk    rd}n&|dk    r| rdS |dk    rd}nt          d| d          |S )	NNHD)r	   r            )r   r	   r-   r.   HND)r	   r.   r   r-   r/   )r   r-   r	   r.   zUnknown cache layout format .)r   r(   )r*   cache_layoutstride_orders      r   get_kv_cache_stride_orderz5FlashAttentionDiffKVBackend.get_kv_cache_stride_order?   s     +,,5  %A  #?U""'LLU""'C" #?U""'LLKLKKKLLLr   )r    )F)__name__
__module____qualname__r   int__annotations__classmethodr   staticmethodstrr   typer   tupler)   boolr4   r   r   r   r   r      sP        K&# &$ & & & [& #c # # # \# ($34 ( ( ( \(
   &
 


 
 	

 
 
sCx
 
 
 \
  -2 &*	sCx   \  r   r   c                       e Zd Z	 	 	 ddej        j        dej        dej        dej        dej        dedej        dz  d	ej        dz  d
ej        dz  dej        fdZdS )r   Nlayerquerykeyvaluekv_cacheattn_metadataoutputoutput_scaleoutput_block_scaler   c
                    |
J d            | j         
J d            ||	t          d          ||                    d          S | j        }
|j        }|
t
          j        t
          j        fv r:|                     |d|         |d|         |d|         |d|         ||          S |dd| j	        f         }|d| j	        df         }| j
        -|+|)t          ||||j        | j        |j        |j                   | j                            d          rCt#          j        | j                  }|                    |          }|                    |          }|j        s|j        }|j        }|j        }|j        }|j        }|j        }|j        d         dz
  | j        f}| j        dk    r|                     |d|         |d|         |d|         |||d|         ||j                             |          |j                             |          |j                             |          	
  
         |S | j!        tE          | j!                  nd}tG          d+i d
|d|         d|d|d|d|         d|d|d|d|d| j$        d|j%        d| j&        d|d|d| j'        d|d| j         d|j                             |          d|j                             |          d|j                             |          d|j(        d| j)         |S tU          |d|         |d|         ||fi d|j        d |j        d!|j+        d"|j,        d#|j-        d$|j        d| j$        d| j&        d%| j!        d&| j'        d|j        d'|j.        d(|j(        d| j         d)|j/        d*|j        d|j        d|j        d|j        d| j)         |S ),a  Forward pass with FlashAttention.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size_v]
            kv_cache: shape =
                [num_blocks, block_size, num_kv_heads, head_size + head_size_v]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size_v]
        NOTE: For FP8 quantization, flash-attn expects the size of
              {q,k,v}_descale to be (num_sequences, num_kv_heads).
              We use torch's .expand() to avoid duplicating values.
        """
        assert output is not None, "Output tensor must be provided."
        assert self.vllm_flash_attn_version is not None, (
            "FlashAttention version not detected.")

        if output_scale is not None or output_block_scale is not None:
            raise NotImplementedError(
                "fused output quantization is not yet supported"
                " for FlashAttentionImpl")

        if attn_metadata is None:
            # Profiling run.
            return output.fill_(0)

        attn_type = self.attn_type
        num_actual_tokens = attn_metadata.num_actual_tokens

        if attn_type in (AttentionType.ENCODER_ONLY, AttentionType.ENCODER):
            return self._forward_encoder_attention(
                query[:num_actual_tokens],
                key[:num_actual_tokens],
                value[:num_actual_tokens],
                output[:num_actual_tokens],
                attn_metadata,
                layer,
            )

        # The diffkv cache fuses K and V along the last dimension; split it
        # into per-tensor views (no copy).
        key_cache = kv_cache[..., :self.head_size]
        value_cache = kv_cache[..., self.head_size:]

        if (self.kv_sharing_target_layer_name is None and key is not None
                and value is not None):
            # Reshape the input keys and values and store them in the fused
            # cache via the diffkv Triton kernel.
            triton_reshape_and_cache_flash_diffkv(
                key,
                value,
                kv_cache,
                attn_metadata.slot_mapping,
                self.kv_cache_dtype,
                layer._k_scale,
                layer._v_scale,
            )

        if self.kv_cache_dtype.startswith("fp8"):
            dtype = FlashAttentionBackend.get_fp8_dtype_for_flashattn(
                self.kv_cache_dtype)
            key_cache = key_cache.view(dtype)
            value_cache = value_cache.view(dtype)

        if not attn_metadata.use_cascade:
            cu_seqlens_q = attn_metadata.query_start_loc
            seqused_k = attn_metadata.seq_lens
            max_seqlen_q = attn_metadata.max_query_len
            max_seqlen_k = attn_metadata.max_seq_len
            block_table = attn_metadata.block_table
            scheduler_metadata = attn_metadata.scheduler_metadata

            descale_shape = (cu_seqlens_q.shape[0] - 1, self.num_kv_heads)

            if self.dcp_world_size > 1:
                self._forward_with_dcp(
                    query[:num_actual_tokens],
                    key[:num_actual_tokens],
                    value[:num_actual_tokens],
                    key_cache,
                    value_cache,
                    output[:num_actual_tokens],
                    attn_metadata,
                    q_descale=layer._q_scale.expand(descale_shape),
                    k_descale=layer._k_scale.expand(descale_shape),
                    v_descale=layer._v_scale.expand(descale_shape),
                )
                return output

            # flash-attn expects the sliding window as a (left, right) list.
            sliding_window_size = (list(self.sliding_window)
                                   if self.sliding_window is not None
                                   else None)
            flash_attn_varlen_func(
                q=query[:num_actual_tokens],
                k=key_cache,
                v=value_cache,
                out=output[:num_actual_tokens],
                cu_seqlens_q=cu_seqlens_q,
                max_seqlen_q=max_seqlen_q,
                seqused_k=seqused_k,
                max_seqlen_k=max_seqlen_k,
                softmax_scale=self.scale,
                causal=attn_metadata.causal,
                alibi_slopes=self.alibi_slopes,
                window_size=sliding_window_size,
                block_table=block_table,
                softcap=self.logits_soft_cap,
                scheduler_metadata=scheduler_metadata,
                fa_version=self.vllm_flash_attn_version,
                q_descale=layer._q_scale.expand(descale_shape),
                k_descale=layer._k_scale.expand(descale_shape),
                v_descale=layer._v_scale.expand(descale_shape),
                num_splits=attn_metadata.max_num_splits,
                s_aux=self.sinks,
            )
            return output

        # Cascade attention (rare case).
        cascade_attention(
            output[:num_actual_tokens],
            query[:num_actual_tokens],
            key_cache,
            value_cache,
            cu_query_lens=attn_metadata.query_start_loc,
            max_query_len=attn_metadata.max_query_len,
            cu_prefix_query_lens=attn_metadata.cu_prefix_query_lens,
            prefix_kv_lens=attn_metadata.prefix_kv_lens,
            suffix_kv_lens=attn_metadata.suffix_kv_lens,
            max_kv_len=attn_metadata.max_seq_len,
            softmax_scale=self.scale,
            alibi_slopes=self.alibi_slopes,
            sliding_window=self.sliding_window,
            logits_soft_cap=self.logits_soft_cap,
            block_table=attn_metadata.block_table,
            common_prefix_len=attn_metadata.common_prefix_len,
            max_num_splits=attn_metadata.max_num_splits,
            fa_version=self.vllm_flash_attn_version,
            prefix_scheduler_metadata=attn_metadata.prefix_scheduler_metadata,
            suffix_scheduler_metadata=attn_metadata.scheduler_metadata,
            q_descale=layer._q_scale,
            k_descale=layer._k_scale,
            v_descale=layer._v_scale,
            s_aux=self.sinks,
        )
        return output
} } } } } }r   r   )__doc__r   vllm.v1.attention.backendr   #vllm.v1.attention.backends.fa_utilsr   4vllm.v1.attention.ops.triton_reshape_and_cache_flashr   r   vllm.loggerr    vllm.v1.attention.backends.utilsr   
flash_attnr
   r   r   r   r5   loggerr   r   r   r   r   <module>r      sZ   + *  3 3 3 3 3 3 S S S S S S      '&(( KJJJJJJ # # # # # # @ @ @ @ @ @            
X		8 8 8 8 8"7 8 8 8v~ ~ ~ ~ ~1 ~ ~ ~ ~ ~r   
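
# Illustrative sketch of what the stride orders returned by
# get_kv_cache_stride_order() mean (comments only; the permutation values in
# that method are themselves a reconstruction, see its NOTE). For a fused
# cache of logical shape (num_blocks, block_size, num_kv_heads,
# head_size + head_size_v):
#
#   NHD: memory order matches the logical order -> stride order (0, 1, 2, 3)
#   HND: heads are laid out before tokens       -> stride order (0, 2, 1, 3)
#
#   e.g. torch.empty(1024, 8, 16, 192).permute(0, 2, 1, 3) gives a
#   (1024, 16, 8, 192) view whose underlying memory is in HND order.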