
     `io                     8   d dl mZ d dlZddlmZ ddlmZ 	  e            r	d dlmZ eZ	n e
d          n # e$ rZ ee          Zd Z	Y dZ[ndZ[ww xY w	 	 	 	 	 	 	 dd	ej        j        d
ej        dej        dej        deej                 dedej        fdZdS )    )OptionalN   )PagedAttentionCache)is_flash_attn_2_available)flash_attn_varlen_funczFlash Attention 2 is not installed. Please refer to https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install itc                  0    t          dt                     )Nz)flash_attn_varlen_func is not available: )	Exceptionmsg)argskwargss     y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/integrations/flash_paged.pyFLASH_ATTN_VARLEN_FUNCr      s    ICIIJJJ    moduleqkvattention_maskcachereturnc           	         t          | dd          sdn| j        dz
  df}|dk    rdnd}| |j        ||| j        fi |\  }}t	          |t
                    r||         }|	|         }	|
t          |
d	          r|
j        }nt          }d
|v rd
|	                    d
          ini } ||
                    dd                              d                                          |                                |                                |                    t          j                  |                    t          j                                                  ||	f| j        d|d|}t	          |t$                    r|d         }|dfS )a  Perform the forward pass of attention with paged key-value cache.

    This function handles the cache updates and performs the attention computation
    using the flash_attn_varlen_func for efficient processing.

    Args:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full k
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full v
        cu_seq_lens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seq_lens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
    sliding_windowF)r      r   full_attentionsliding_attentionNr   s_auxr   T)softmax_scalecausalwindow_size)getattrr   update	layer_idx
isinstancedicthasattrr   r   get	transposesqueeze
contiguoustotorchint32clonescalingtuple)r   r   r   r   r   r   cu_seq_lens_qcu_seq_lens_kmax_seqlen_qmax_seqlen_kimplementationr   r   
layer_typer   custom_kwargsattn_outputs                    r   paged_attention_forwardr9      s   H &-V5Eu%M%MqXXTZTilmTmopSqN%3x%?%?!!EXJ u|Aq&"2==f==1 -&& 0%j1#J/!gn>V&W&W!!/!F!76=6G6GWfjj1122RM((	Aq!!!$$//11		%%%%++-- n"   K +u%% %!!nr   )NNNNNNN)typingr   r,   generation.continuous_batchingr   utilsr   
flash_attnr   r   RuntimeErrorr	   ereprr
   nnModuleTensorr9    r   r   <module>rE      s          @ @ @ @ @ @ - - - - - -K  "" 
555555!7l ]
 
 	
 	
  K K K
$q''CK K K K K K K KK .2!%F FHOF|F |F |	F
 U\*F F \F F F F F Fs   7 AAA