
    .`iQ                        d dl Z d dlmZ d dlmZmZ ej        dej        dej        dej        dej        dej        d	ej        fd
            Zej        dej        dej        dej        dej        dej        dej        dej        dej        d	ej        dej        fd            Z	ej        dej        dej        dej        dej        dej        dej        dej        fd            Z
ej        dej        dej        dej        dej        dej        dej        d	ej        dej        fd            Z G d de j        j                  Zej        Z	 	 d%de j        de j        de j        de j        dede j        dz  dee j        e j        f         fdZej        dej        dej        fd            Z	 d&de j        de j        de j        d!e j        d"e j        d#e j        dede j        fd$ZdS )'    N)	rearrange)tltritonbhdeBLOCKCBLOCKc           	         t          j        d          }||z  }||z  }t          j        d          }||z  }||z  |z  }||z  |	z  }||z  |	z  }||
z  }||z  }||	z  }||	z  }||z  }||z  }||	z  }| |z   |z   |z   t          j        d|          d d d f         |z  z   t          j        d|          d d d f         z   }||z   |z   t          j        d|          d d d f         |z  z   t          j        d|          d d d f         z   }||z   |z   t          j        d|          d d d f         |	z  z   t          j        d|	          d d d f         z   }||z   |z   |z   t          j        d|          d d d f         |	z  z   t          j        d|	          d d d f         z   }||z   } t          j        |           }!|}"t          j        d|          |"|z  z   }#t          j        |||#d d d f         z   |k     d                              t           j                  }$t          j        ||	gt           j                  }%t          |"dz             D ]B}&t          j        d|          |&|z  z   }'|#d d d f         |'d d d f         z
  }(|!|(z  })t          j        |(dk    |) t          d                    })t          j
        |)          }*t          j        |||'d d d f         z   |k     d                              t           j                  }+t          j        |||'d d d f         z   |k     d                              t           j                  },t          j        |$|+          |*z  }-|%t          j        |-|,          z  }%|||z  z  }|||	z  z  }Dt          j        ||%                    |j        j                  ||#d d d f         z   |k                d S )Nr              maskotherdtypez-infr   )r   
program_idarangeloadtofloat32zerosrangewherefloatexpdotstorer   
element_ty).QKVOutSr   r   nr   r	   r
   	NUM_BLOCKr   offoff_bh	off_block
off_cblockoff_h	qk_offsetv_offseto_offsetblock_offsetqk_block_offsetv_block_offseto_block_offsetcblock_offsetq_cblock_offseto_cblock_offsetQ_block_ptrK_trans_block_ptrV_block_ptrO_block_ptrS_block_ptrsiq_indexqqkvjkv_indexdiffs_indexdecayk_transvqks.                                                 }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/layers/lightning_attn.py_fwd_diag_kernelrK   
   s   & -

CIFiIq!!JQJE 
QIzA~HzA~H u$L"Q&O!A%N!A%N 'M#a'O#a'O 	

	
	 	 )Av

qqq$w
'!
+		,
 )Aq//$'
"	#  	

	
	 )Av

tQQQw
'!
+	, )Aq//!!!T'
"		#  	

	
	 )Av

qqq$w
'!
+	, )Aq//$'
"		#  	
	
	 	 )Av

qqq$w
'!
+		,
 )Aq//$'
"	#  e)K
AAi6""QZ/G 	,D1A"AA"ESQQQTT

	 	A
 (FA;bj
1
1
1C 1q5\\ " "9Q''!f*4qqq$w(47"33d((419whf>>w 'qqq 11A5
 
 
 "RZ..	 	
 GD 11A5
 
 
 "RZ..	 	
 VAw%' 	rvb!}} 	VaZ'vz! H{ +,,GAAAtG,,q0         D_FBLOCKE_FBLOCK
NUM_FBLOCK
NUM_CBLOCKc                    t          j        d          }t          j        d          }||z  }||	z  }||z  }||z  }||z  |z  }||z  |z  }||z  |z  }||
z  |z  |z  }| |z   |z   t          j        d|          d d d f         |z  z   t          j        d|          d d d f         z   }||z   |z   t          j        d|          d d d f         |z  z   t          j        d|          d d d f         z   }||z   |z   t          j        d|          d d d f         |z  z   t          j        d|          d d d f         z   }|||	z  z   t          j        d|          z   }t          j        d|          }t          j        ||gt           j                  }||
dz
  k    r||
dz
  |	z  z
  n|	} t          j        | |          |z  | z
  }!t          t          j        | |          |          }"|||"z
  |z  z  }t          |"          D ]}#d|#z
  |!z  }$t          j        ||!|z  z
  |d d d f         |$k    d          }%t          j        ||!|z  z
  |d d d f         |$k    d          }&t          j        |          }'|'d d d f         }'|t          j	        |%|'z  |&          z  }|||z  z  }|||z  z  }||z  }t          j
        ||                    |j        j                             d S )Nr   r   r   r   r   )r   r   r   r   r   cdivminr   r   r   r    r   r   r!   )(r#   r$   K_decayKVr   r   r'   r   r	   r
   r(   rM   rN   rO   r   rP   r*   r+   r-   r1   k_block_offsetr3   kv_block_offsetk_offsetr/   	kv_offsetr9   r:   KV_block_ptrk_decay_ptrrC   kvsplit_n
left_shift
num_blocksrB   
left_boundrG   rH   k_decays(                                           rJ   _fwd_kv_parallelrb      s   * ]1Fa  IQJEu$L "A%N!A%N!ma'O zA~HzA~H"Q&*I 	

	
	 )Av

tQQQw
'!
+	, )Ax
 
 D
)		*  	

	
	 )Av

qqq$w
'!
+	, )Ax
 
 qqq
)		*  	
	
	 )Ax
 
 D
)A
-	. )Ax
 
 qqq
)		*  EEM)BIa,@,@@KyF##H 
8X&bj	9	9	9B .7)a--G-Ga9q=E)))UG&))F2W<JRWWf--z::JJ+v55K :  !ez)
'
Q.$'"j0
 
 

 G*q.(!!!T'"j0
 
 
 '+&&
 $'"
bfWw&*** 	VaZ'vz!v H\255!3!>??@@@@@rL   c                    t          j        d          }||z  }||	z  |z  |z  }||z   t          j        d|
          d d d f         |z  z   t          j        d|          d d d f         z   }| |z   }t          j        |          }||z  |z  }||z   t          j        d|
          d d d f         |z  z   t          j        d|          d d d f         z   }t          j        |                              t           j                  }t          |	          D ]}t          |||z  z
  |          }t          j        |                    t           j                   |z            }t          j        |                              t           j                  }t          j	        ||                    |j
        j                             ||z  |z   }|||z  z  }t          j	        ||           d S )Nr   )r   r   r   r   r   r   r   rS   r   r    r   r!   )r&   rU   
KV_HISTORYr   r   r'   r   r	   r
   r(   rM   rN   r*   r-   rY   rZ   s_ptrsr=   kv_history_offsetKV_HISTORY_block_ptrkv_prer>   
block_sizeblock_decaykv_curs                            rJ   _fwd_kv_reducerl      s   " ]1FQJE"Q&*I 	
	
)Ax
 
 D
)A
-	. )Ax
 
 qqq
)	*  YF
A 
Q
	
)Ax
 
 D
)A
-	. )Ax
 
 qqq
)	*  W)**--bj99F 9  QY..
fadd2:...;<< &&))"*55
vyy);)FGGHHH v%.A H!6*****rL   c                 *   t          j        d          }||z  }t          j        d          }||z  }||z  }t          j        d          }||	z  }||z  }||z  }||z   }||z  |z  ||z   |z  z   }||z  |z  ||z   |z  z   |z   }||
z  |z  |z  ||z  |z  z   |z   }| |z   t          j        d|          d d d f         |z  z   t          j        d|          d d d f         z   }||z   t          j        d|          d d d f         |z  z   t          j        d|          d d d f         z   }||z   t          j        d|          d d d f         |z  z   t          j        d|          d d d f         z   }||z   }t          j        |          }t          j        d|          } t          j        |                              t           j                  }!|t          j        d|          z   }"t          j        ||"d d d f         |k     d                              t           j                  }#t          j        |                    t           j                   ||z  | d d d f         z   z            }$t          j        |#|!          |$z  }%t          j        ||"d d d f         |k     d                              t           j                  }&|&|%z   }'t          j        ||'                    |j	        j
                  |"d d d f         |k                d S )Nr   r      r   r   r   )r   r   r   r   r   r   r   r   r    r   r!   )(r"   r%   r&   rU   r   r   r'   r   r	   r
   r(   rN   r   rP   r*   r-   off_ncoff_noff_coff_en_offsetc_offsete_offsetr1   q_offsetr0   rY   r8   r;   rZ   r<   r=   c_arrayr\   r?   r@   q_decayqkv_none_diagqkv_diagrA   s(                                           rJ   _fwd_none_diag_kernelr{   9  sZ   ( ]1FQJE]1Fj EZEM!Eu}Hv~HxHh&L zA~H!4 99HzA~H!4 99HDH"Q&*UQY]:XEI 	
HryF++AAAtG4q8829Q??4QRQRQR7;SS  	
	
)Av

qqq$w
'!
+	, )Ax
 
 qqq
)	*  	Y1aD1A55	!X8N8NtUVUVUVw8WW 
 e)K
Ai6""G 
			!	!"*	-	-BRYq&111G 	'!!!T'"2Q"6cBBBEEbjQQA fadd2:&&&%&.7111d7;K*KLMMG F1bMMG+M w{D)9A)=SIIILLRZXXH ]
"C HSVVK-8994@PST@T     rL   c                   $    e Zd Zed             ZdS )
_attentionc                 R   |                                 }|                                 }|                                 }|                                 }t          j                                        }|d         dk     rt	          dd          |j        \  }}}	}
|j        d         }t          j        |||	|f|j        |j                  }d}t          j
        |	|          }d}||z  }||z  dk    s
J d	            t          j        d||j        
          dz   }t          j        | ||                    dd          z
  z            }||z  |z  |f}t          |         ||||||||	|
||||           d}|
|z  }|
|z  dk    sJ ||z  }||z  dk    sJ d}||z  }||z  dk    s
J d	            t          j        ||||
|ft          j        |j                  }||z  |f}t!          |         |||||||	|
||||||||           ||z  |f}t#          |         ||||||	|
|||||           ||z  ||z  f}t%          |         |||||||	|
||||||           |                     |||||           || _        |t          j        ||                    d          gd          fS )Nr      z(Flash attention currently only supportedzfor compute capability >= 80r   device       z"BLOCK must be a multiple of CBLOCK)r   r   )r
   r(   r   @   )r
   r(   rM   rN   rO   r   rP   )r
   r(   rM   rN   )r
   r(   rN   r   rP   rn   )dim)
contiguoustorchcudaget_device_capabilityRuntimeErrorshapeemptyr   r   r   rR   r   r   reshaperK   r   rb   rl   r{   save_for_backwardr
   cat	unsqueeze)ctxr@   krH   r=   
kv_history
capabilityr   r   r'   r   r	   or
   r(   r   rP   arrayra   gridrO   rM   rN   r\   s                           rJ   forwardz_attention.forward  s    LLNNLLNNLLNNLLNN Z5577
a=1:.   W
1aGBK KAq!AGAHEEE K5))	f_
v~"""$H""" Qah777!;)QB%%--2*>*>">?@@ A	!:.	
 	
 	
 	
" 

?:~""""
?:~""""f_
v~"""$H""" [!Q	1a0ahWWWAy!!!!	
 	
 	
 	
* Az"t	
 	
 	
 	
  Ay:-.d#!	
 	
 	
 	
$ 	aAq"---	%)R!5!5a!8!89qAAAAArL   N)__name__
__module____qualname__staticmethodr    rL   rJ   r}   r}     s7        ~B ~B \~B ~B ~BrL   r}   r   r@   r   rH   edri   r   returnc                    | j         d         }|j         d         }|                                dk    r|                    dddd          }|dk    rdnd|z  dk    sJ d| d d            fd	t          |z  dz             D             }|d         |k    r|                    |           t          |          }	d}
|At          j        | j         d         | j         d         ||ft          j        | j	                  }n&|
                                                                }t          |	dz
            D ]H}||         }||dz            }| d||f         }|d||f         }t          |||||          \  }}|
|z   }
I|
|fS )a$  
    Apply lightning attention algorithm
    to compute attention efficiently.

    Args:
        q: Query tensor of shape [batch, heads, seq_len, dim]
        k: Key tensor of shape [batch, heads, seq_len, dim]
        v: Value tensor of shape [batch, heads, seq_len, dim_v]
        ed: Decay rate tensor of shape [heads]
        block_size: Size of blocks for block-sparse attention
        kv_history: Optional key-value history from previous computations

    Returns:
        output: Attention output
        kv: Updated key-value history
    r   r      r   r   zDimension d (z) must be divisible by m ()c                     g | ]}|z  S r   r   ).0r>   ms     rJ   
<listcomp>z'lightning_attention.<locals>.<listcomp>4  s    
,
,
,Q1q5
,
,
,rL   Nr   .)r   r   viewr   appendlenr   r   r   r   cloner   lightning_attention_)r@   r   rH   r   ri   r   r   r	   arrr'   outputr>   r=   q1k1r   r\   r   s                    @rJ   lightning_attentionr     s   0 	
A	A	vvxx1}}WWQAq!! CxxRAq5A:::HqHHAHHH:::
,
,
,
,%Q
++
,
,
,C
2w!||

1CAF [WQZQ*%-
 
 


  %%''2244
 1q5\\  FAJsAaCx[sAaCx[$RQJ??2!2:rL   D
BLOCK_SIZEc                    t          j        d          }t          j        d          }t          j        d          }t          j        ||z                                 t           j                  }|dk    rdS |}|}t          j        ||z             }t          j        d|          }t          j        d|          ||z  z   }|dddf         |z  |dddf         |z  z   }||z  ||	z  z   }||z  ||	z  z   }||z  ||	z  z   }||
z  ||z  z   }||k     }||k     }t          j        | |z   |z   |d          }t          j        ||z   |z   |d          } t          j        ||z   |z   |d          }!| dddf         |!dddf         z  }"|dddf         |dddf         z  }#t          j        |           }||z   |z   }$t          j        |$|#d          }%|"||%z  z   }"|dddf                             t           j                  |"z  }&t          j        |&d          }&t          j	        |$|"|#	           t          j	        ||z   |z   |&|	           dS )
z
    Kernel for linear attention decoding with KV cache.

    This kernel computes attention for a single token using the KV cache.
    r   r   rn   r   Nr   r   )axisr   )
r   r   r   r   int64r   r   r   sumr    )'q_ptrk_ptrv_ptrkv_cache_ptr
slope_rateslot_idx
output_ptrr   qkv_b_strideqkv_h_stridecache_b_stridecache_h_stridecache_d0_stridecache_d1_strider   pid_bpid_hpid_dslot_idbatch_idhead_idratioqk_d_offsetsv_d_offsetscache_d_offsetsrv   rX   r/   cache_offsetqk_maskv_maskr@   r   rH   kv_outerkv_maskkv_ptrkv_cache_oldr   s'                                          rJ   _linear_attn_decode_kernelr   M  s   . M!EM!EM!E gh&''**2844G "}}HG GJ&''E 9Q??L)Az**UZ-??KQQQW/+dAAAg2F2XX 
 ,&<)??H,&<)??H,&<)??H^+g.FFL QG1_F 	 </gSIIIA
 </gSIIIA
 ;.V3GGGA DzAdAAAgJ&Haaagaaa0G FE6NNEL(?:F76s;;;L%,..H qqq$wZ]]2:&&1FVF###F HVXG,,,,HZ("[0&vFFFFFFrL   r   	kv_cachesr   r   c                 H   | j         \  }}}	}
|j         ||d|
fk    sJ |j         ||d|
fk    sJ t          j        |           }|||
|z  f}|                     d          }|                     d          }|                    d          }|                    d          }|                    d          }|                    d          }t	          |         | |||||||
|||||||           t          |d          }|                    d                                          S )a  
    Perform linear attention decoding using Triton kernels.

    Args:
        q: Query tensor of shape [B, H, 1, D]
        k: Key tensor of shape [B, H, 1, D]
        v: Value tensor of shape [B, H, 1, D]
        kv_caches: Key-value cache tensor
        slope_rate: Decay rate tensor
        slot_idx: Slot indices for batches
        BLOCK_SIZE: Size of blocks for processing

    Returns:
        output: Attention output tensor
    r   r   rn      )r   zb h n d -> b n (h d))r   r   
empty_likestrider   r   squeezer   )r@   r   rH   r   r   r   r   BH_r   r   r   r   r   r   r   r   r   s                      rJ   linear_decode_forward_tritonr     sT   0 JAq!Q7q!Ql""""7q!Ql"""" a  F q!z/"D 88A;;L88A;;L%%a((N%%a((N&&q))O&&q))O t$				   & v566F>>!'')))rL   )r   N)r   )r   einopsr   vllm.triton_utilsr   r   jit	constexprrK   rb   rl   r{   autogradFunctionr}   applyr   Tensorinttupler   r   r   r   rL   rJ   <module>r      s          ( ( ( ( ( ( ( ( } 
|} 
|} 
|} 
|} <} L} } } }@ iA
 
|iA 
|iA 
|iA 
|iA <iA liA liA iA LiA  !iA iA iA iAX =+ 
|	=+
 
|=+ 
|=+ 
|=+ <=+ l=+ l=+ =+ =+ =+@ O
 
|O 
|O 
|O 
|O <O lO LO O O O Od@B @B @B @B @B( @B @B @BH "'  &*7 7|7|7 |7 		7
 7 t#7 5<%&7 7 7 7t NG 
|NG NG NG NG NGp @* @*|@*|@* |@* |	@*
 @* l@* @* \@* @* @* @* @* @*rL   