
    0`iG                         d dl mZmZmZmZ d dlZd dlmZ 	 ddlm	Z	 dZ
dZn# e$ rZ ee          Z
dZY dZ[ndZ[ww xY w	 ddlmZ dZdZn# e$ rZ ee          ZdZY dZ[ndZ[ww xY w	 d dlmZ dZdZn# e$ rZ ee          ZdZY dZ[ndZ[ww xY wd	Zd d
eeee         f         fdZd d
eeee         f         fdZd d
eeee         f         fdZd ded
efdZd ded
ee         fdZd Zej        dddddd dddd dd fdej         deej                  deej                  deej                  dee         f
dZ!ddddddddddddddddddd eddd dfdeee                  dedefdZ"	 	 	 	 	 	 	 d!ddddZ#	 	 	 	 	 	 	 d!ddddZ$dS )"    )OptionalUnionTupleListN   )_vllm_fa2_CTF)_vllm_fa3_C)_flash_attn_fwd   returnc                     t           sddt           fS t          j                            |           d         dk     rdS dS )NFzFA2 is unavaible due to: r      )Fz=FA2 is only supported on devices with compute capability >= 8TN)FA2_AVAILABLEFA2_UNAVAILABLE_REASONtorchcudaget_device_capabilitydevices    }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/vllm_flash_attn/flash_attn_interface.py_is_fa2_supportedr   (   sQ     KJ2HJJJJz''//2Q66L L:    c                     t           sddt           fS t          j                            |           d         dk     s)t          j                            |           d         dk    rdS dS )NFzFA3 is unavaible due to: r   	   
   )Fz<FA3 is only supported on devices with compute capability 9.0r   )FA3_AVAILABLEFA3_UNAVAILABLE_REASONr   r   r   r   s    r   _is_fa3_supportedr   0   sq     KJ2HJJJJz''//2Q66:++F33A6"<<K K:r   c                     t           sddt           fS t          j                            |           d         dk    rdS dS )NFzFA4 is unavaible due to: r   r   )Fz>FA4 is only supported on devices with compute capability == 10r   )FA4_AVAILABLEFA4_UNAVAILABLE_REASONr   r   r   r   s    r   _is_fa4_supportedr#   9   sQ     KJ2HJJJJz''//2b88M M:r   
fa_versionc                     | dv sJ d|              | dk    rt          |          d         S | dk    rt          |          d         S | dk    rt          |          d         S d S )Nr         Unsupported FA version: r   r   r'   r(   r   r   r#   r$   r   s     r   is_fa_version_supportedr,   A   s    """$Kz$K$K"""Q ((++	q ((++	q ((++ 
r   c                     | dv sJ d|              | dk    rt          |          d         S | dk    rt          |          d         S | dk    rt          |          d         S d S )Nr&   r)   r   r   r'   r(   r*   r+   s     r   fa_version_unsupported_reasonr.   J   s    """$Kz$K$K"""Q ((++	q ((++	q ((++ 
r   c                 d    | -|                      d          dk    r|                                 n| S )Nr   )stride
contiguous)xs    r   maybe_contiguousr4   Z   s,    ]qxx||q/@/@1<<>>>aGr   r0   r0   cache_seqlenscu_seqlens_qcu_seqlens_k_newcache_leftpad	page_sizec                     t          |          }||}t          j        j                            | |||||||||	d |
d |||||d         |d         ||||          }|S )Nr   r   )r4   r   opsr	   get_scheduler_metadata)
batch_sizemax_seqlen_qmax_seqlen_knum_heads_qnum_heads_kvheaddimr6   	qkv_dtype	headdim_vr7   r8   r9   r:   max_seqlen_k_newcausalwindow_sizehas_softcap
num_splitspack_gqa	sm_marginscheduler_metadatas                        r   r=   r=   ^   s    " %]33M	.EEL,\7T]AA! & r           rH   rJ   c                    ||
J d            ||
J d            ||
J d            |
| j         d         dz  }
|d}n%t          |          dk    sJ |d	         |d
         f}d | ||fD             \  } }}t          j        |          }|dk    r||||t	          d          |t	          d          |d
k    rt	          d          t          j        j                            | ||||||n||d|||||	|
d||d	         |d
         ||o|	d	k    |d          \  }} n|dk    r|
J d            t          j        j        j	        g | ||dd||||dd||||ddddd||||
||d	         |d
         |d||dd	||||R  \  }} }!}!nt          d|           |r|| fn|S )aU  dropout_p should be set to 0.0 during evaluation
    Supports multi-query and grouped-query attention (MQA/GQA) by passing in K, V with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head
    0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Arguments:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (total, nheads, headdim).
        softmax_lse [optional, if return_softmax_lse=True]: (nheads, total_q_seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
    Nz*cu_seqlens_k or seqused_k must be providedz>cu_seqlens_k and seqused_k cannot be provided at the same timez5seqused_k must be provided if block_table is providedr0         r5   r   r   r   c                 ,    g | ]}t          |          S  r4   .0r3   s     r   
<listcomp>z*flash_attn_varlen_func.<locals>.<listcomp>   !    666q""666r   zHFA2 does not support scheduler_metadata, q_descale, k_descale, v_descalezFA2 does not support s_auxz#FA2 does not support num_splits > 1Fr'   zAlibi is not supported in FA3Tr)   )shapelenr   
empty_likeNotImplementedErrorr<   r   
varlen_fwdr	   fwd
ValueError)"qkvr?   r7   r@   cu_seqlens_k	seqused_kq_v	dropout_psoftmax_scalerG   rH   softcapalibi_slopesdeterministicreturn_attn_probsblock_tablereturn_softmax_lseoutrM   	q_descale	k_descale	v_descalerJ   r$   s_auxcp_world_sizecp_rankcp_tot_seqused_kreal_window_sizedummy_cu_seqlens_ksoftmax_lse_s"                                     r   flash_attn_varlen_funcry      s   l #y'<'<4 (='<<9#4#4H $5#44)"7"7? #8"77 - #;1$$$$'NKN;66Q1I666GAq!),77Q)i.C%)*?)+   %&BCCC>>%&KLLL 90;;q! #/"6LQQ09q=-
 
[[0 
q##%D###!&!6!: "
"
"
"
"
"
 "
 	"

 "
 "
 "
 "
 "
 "
 '"
 "
 "
 "
 "
 "
 "
 "
 !"
 #,"
 "
  !"
" Q#"
" "2!!4#"
$ %"
& '"
( )"
* +"
, -"
. /"
0 1"
2 3"
4 5"
6 7"
 "
 "
[!QQ< @J@@AAA!3<C<r   )rl   rm   c                    || j         d         dz  }d | ||fD             \  } }}t          j        j                            | |||||||||||	|
|o|dk    d          \  }}|r||fn|S )a  Compute attention with vertical and slash sparsity patterns.
    Most Arguments are the same with the flash_attn_func interface, except for 4 extra args:
    block_count and block_offset for slash sparsity patterns, and
    column_count and column_index for vertical sparsity patterns.
    For more details please refer to Appendix C.4.2 of paper https://arxiv.org/abs/2407.02490.

    Arguments:
        q: (batch_size, seqlen, nheads, headdim)
        k: (batch_size, seqlen, nheads_k, headdim)
        v: (batch_size, seqlen, nheads_k, headdim)
        block_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
        block_offset: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_S)
        column_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
        column_index: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_V)
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (batch_size, seqlen, nheads, headdim).
        softmax_lse [optional, if return_softmax_lse=True]: (batch_size, nheads, seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
    Nr0   rP   c                 ,    g | ]}t          |          S rR   rS   rT   s     r   rV   z$sparse_attn_func.<locals>.<listcomp>o  rW   r   r   )rX   r   r<   r   
fwd_sparse)r_   r`   ra   block_countblock_offsetcolumn_countcolumn_indexre   rf   rG   rg   rh   ri   rj   rl   rm   rw   s                    r   sparse_attn_funcr   9  s    f -66Q1I666GAq!y,77			+i!m C" "4<C<r   c                    || j         d         dz  }d | ||fD             \  } }}t          j        j                            | |||||||||d||	|
||d|||o|dk    d          \  }}|r||fn|S )al
  Compute attention with vertical and slash sparsity patterns.
    Most Arguments are the same with the flash_attn_varlen_func interface, except for 4 extra args:
    block_count and block_offset for slash sparsity patterns, and
    column_count and column_index for vertical sparsity patterns.
    For more details please refer to Appendix C.4.2 of paper https://arxiv.org/abs/2407.02490.
    
    Arguments:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        block_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
        block_offset: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_S)
        column_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
        column_index: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_V)
        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (total, nheads, headdim).
        softmax_lse [optional, if return_softmax_lse=True]: (nheads, total_q_seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
    Nr0   rP   c                 ,    g | ]}t          |          S rR   rS   rT   s     r   rV   z+sparse_attn_varlen_func.<locals>.<listcomp>  rW   r   Fr   )rX   r   r<   r   varlen_fwd_sparse)r_   r`   ra   r}   r~   r   r   r7   rb   r?   r@   re   rf   rG   rg   rh   ri   rj   rl   rm   rw   s                        r   sparse_attn_varlen_funcr     s    | -66Q1I666GAq!y,>>			+i!m+ C. "4<C<r   )N)rN   NFrN   NFF)%typingr   r   r   r   r   torch.nnnn r   r   r   ImportErrorestrr	   r   r   flash_attn.cute.interfacer
   r"   r!   DEFAULT_FA_VERSIONboolr   r   r#   intr,   r.   r4   bfloat16Tensorr=   ry   r   r   rR   r   r   <module>r      s   0 / / / / / / / / / / /       !MM    SVVMMMMMM!MM    SVVMMMMMM999999!MM    SVVMMMMMM   dHSM.A(B     dHSM.A(B     dHSM.A(B    , , ,t , , , ,, ,c ,}, , , , H H H n+//3,0#' '<'
 5<(' u|,' EL)' }' ' ' 'b '+(
An= n= $s)$n=4 5n=8 9n= n= n= n=r H=  #H= H= H= H= H=n %Y=( +Y= Y= Y= Y= Y= Y= Y=s@   
# ?:?
A A*A%%A*.
A9 9B>BB