
    `iV              #          d dl mZ ddlmZ ddlZddlmZmZ ddl	Z	ddl
Z
ddlmZ ddlmZ ddlmZ ddlmZ  G d	 d
e          ZddlZ ej        e          Zi Zddddej        dddddddfdej        dej        dej        deej                 deej                 dee         dee         dej        dededededeeef         dee         deej                 deej        ej        f         f d Z dS )!   )FineGrainedReductionQK    )driverN)TupleOptional)from_dlpack)_convert_to_cutlass_data_type)APIBasec            "           e Zd ZdZddddej        ddddddfdej        d	ej        d
ej        dej        dej        deej                 deej                 dee         dee         dej	        dededede
deeef         dee         f  fdZde
fdZd'deej                 ddfdZ	 	 	 	 d(dej        dej        d ej        d!ej        d"ej        d#eej                 d$eej                 d%e
deej                 ddfd&Z xZS ))TopKReductiona  
    Top-K Reduction for Native Sparse Attention.

    This class performs top-k reduction on attention scores to identify the most important
    key-value pairs for each query position.

    Note:
        The returned values calculated by the kernel exclude the first block and neighboring blocks from the reduction.
        As a result, it is expected to see rows of all -inf values and -1 values in the final topk_scores and topk_indices output tensors, respectively.
    N   @       T   r   sample_qsample_k
sample_lsesample_topk_scoressample_topk_indicessample_cum_seqlen_qsample_cum_seqlen_kmax_s_qmax_s_k	acc_dtypek_valueselection_block_sizecompress_stride	is_causalmma_tiler_mnscale_softmaxc                    t                                                       t          | _        | j                            d           | j                            d           || _        || _        || _	        || _
        || _        || _        || _        || _        |	| _        |
| _        || _        || _        || _        || _        || _        || _        d S )Nz$TopKReduction is an experimental APIzEntering __init__)super__init__r   _kernel_loggerwarningdebugr   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   )selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   	__class__s                    {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/cudnn/native_sparse_attention/top_k/api.pyr$   zTopKReduction.__init__   s    & 	-CDDD.///  $"4#6 #6 #6 "$8!."(*    returnc                    | j                             d           | j                             d           | j        | j        	d| _        nw| j        P| j        Hd| _        | j        j        dk    rM| j                             d           | j                            d          	                    dd	          | _        | j
        j        dk    rM| j                             d
           | j
                            d          	                    dd	          | _
        | j        j        d	k    rN| j                             d           | j                            d          	                    dd	          | _        nr| j        j        dk    rb| j                             d           |                     | j        d	d                              d          	                    dd	          | _        | j        j        dk    rM| j                             d           | j                            d          	                    dd	          | _        | j        j        dk    rM| j                             d           | j                            d          	                    dd	          | _        | j        j        dk    r| j                             d           t          | j        j        dz
            D ]!}| j                            d          | _        "| j        j        dk    rt#          d| j        j         d          | j        j        dk    r| j                             d           t          | j        j        dz
            D ]!}| j                            d          | _        "| j        j        dk    rt#          d| j        j         d          | j        b| j                             d           | j        dd          | j        d d         z
                                                                  | _        | j        b| j                             d           | j        dd          | j        d d         z
                                                                  | _        nt#          d| j         d| j                   | j        j        \  }}}}| j
        j        \  }}}}| j        j        ||||fk    r#t#          d||||f d| j        j                   | j
        j        ||||fk    r#t#          d||||f d| j
        j                   | j        j        |||dfk    r9| j                             d           | j                            d          | _        | j        j        |||fk    r"t#          d|||f d| j        j                   | j                            d          dk    r8| j                             d           | j                                        | _        | j        j        |||| j        fk    r(t#          d |||| j        f d| j        j                   | j        j        |||| j        fk    r(t#          d!|||| j        f d| j        j                   | j        dk    r|nt7          | j                  dz
  | _        |||c| _        | _        | _        | j        dk    r||c| _        | _        | j                             d"           | j        j         | j
        j         k    r)t#          d#| j        j          d| j
        j                    | j        j         | _         | j        j         | j!        k    r$t#          d$| j        j          d| j!                   | j        j         | j!        k    r$t#          d%| j        j          d| j!                   | j        j         tD          j#        k    rt#          d&| j        j                    | j        dk    r]| j        j         tD          j#        k    s| j        j         tD          j#        k    r)t#          d'| j        j          d| j        j                    | j                             d(           tD          j$        %                                stM          d)          tD          j$        '                                }tD          j$        (                    |          \  }	}
|	d*z  |
z   }|d+k     rtM          d,| d-|           |d.k    rtM          d/          d0| _)        | j                             d1           d0S )2NzEntering check_supportz+Checking shape normalization and validationzB,H,S,DT,H,D   (reshaping q_tensor from T,H,D to 1,H,T,Dr   r      (reshaping k_tensor from T,H,D to 1,H,T,Dz&reshaping lse_tensor from T,H to 1,T,H(reshaping lse_tensor from T,H,1 to 1,H,Tr   2reshaping topk_scores_tensor from T,H,D to 1,H,T,D3reshaping topk_indices_tensor from T,H,D to 1,H,T,DzGcum_seqlen_q must be 1D tensor. Attempting to squeeze last dimension(s)z$cum_seqlen_q must be 1D tensor, got DzGcum_seqlen_k must be 1D tensor. Attempting to squeeze last dimension(s)z$cum_seqlen_k must be 1D tensor, got z1max_s_q not provided, inferring from cum_seqlen_qz1max_s_k not provided, inferring from cum_seqlen_kzAcum_seqlen_q and cum_seqlen_k must be None or both not None, got  and z.Input shape mismatch: expected Q tensor shape z, got z.Input shape mismatch: expected K tensor shape z;reshaping lse_tensor from (b, h_q, s_q, 1) to (b, h_q, s_q)z0Input shape mismatch: expected LSE tensor shape xlse_tensor is expected to have leading stride in last dimension of shape (b, h_q, s_q), copying lse_tensor to contiguousz8Input shape mismatch: expected TopK Scores tensor shape z9Input shape mismatch: expected TopK Indices tensor shape zChecking dtypesz&Q and K must have the same dtype, got z2LSE and Accumulator must have the same dtype, got z:TopK Scores and Accumulator must have the same dtype, got z TopK Indices must be int32, got z9cum_seqlen_q and cum_seqlen_k tensors must be int32, got zChecking environmentzCUDA is not available
   d   z>TopKReduction requires SM100+ compute capability, but found SMz on device g   z/cuteDSL TopKReduction is not supported on SM103Tz$check_support completed successfully)*r&   r(   r   r   input_layoutr   ndiminfo	unsqueeze	transposer   r   _unpad_tensor_to_ndimr   r   rangesqueeze
ValueErrorr   r'   maxitemr   shapestride
contiguousr   len
batch_sizeh_qh_khead_dimdtyper   torchint32cudais_availableRuntimeErrorcurrent_deviceget_device_capability_is_supported)r)   _brN   s_qdrO   s_kdevicemajorminorcompute_capabilitys               r+   check_supportzTopKReduction.check_supportI   s
   3444 	HIII#+0H0P )D%1d6N6Z 'D}!Q&&!!"LMMM $ 7 7 : : D DQ J J}!Q&&!!"LMMM $ 7 7 : : D DQ J J#q((!!"JKKK"&/";";A">">"H"HA"N"N%**!!"LMMM"&"<"<T_aQ]"^"^"h"hij"k"k"u"uvwyz"{"{&+q00!!"VWWW*.*A*K*KA*N*N*X*XYZ\]*^*^'',11!!"WXXX+/+C+M+Ma+P+P+Z+Z[\^_+`+`(',11!!"klllt7<q@AA T TA/3/G/O/OPR/S/SD,,+0A55$%lDLdLi%l%l%lmmm',11!!"klllt7<q@AA T TA/3/G/O/OPR/S/SD,,+0A55$%lDLdLi%l%l%lmmm|#$$%XYYY $ 8 <t?WX[Y[X[?\ \aacchhjj|#$$%XYYY $ 8 <t?WX[Y[X[?\ \aacchhjj  [aeay  [  [  AE  AY  [  [  \  \  \,3Q,3Q=1c3"222yaQTVY[\M]yydhdqdwyyzzz=1c3"222yaQTVY[\M]yydhdqdwyyzzz? QS!$444L[\\\"o55b99DO? QSM11zPQSVX[}zzcgcrcxzz{{{?!!"%%**L    "\  ]  ]  ]"o88::DO"(QS$,,GGG  YXY[^`ceieqWr  Y  Yy}  zQ  zW  Y  Y  Z  Z  Z#)ac4<-HHH  [YZ\_adfjfrXs  [  [z~  {S  {Y  [  [  \  \  \ $ 1Y > >!!c$JbFcFcfgFg,/a)$(DM	))),c&DL$,,---=$-"555udmFYuu`d`m`suuvvv](
? DN22~RVRaRg~~nrn|~~"(DN::  OZ^ZqZw  O  O  C  M  O  O  P  P  P#)U[88`@X@^``aaa'''-<<@X@^bgbm@m@m  VPTPhPn  V  Vuy  vN  vT  V  V  
 	1222z&&(( 	86777**,,z77??u"RZ%/##   H`r   H   H  @F   H   H  I  I  I$$PQQQ!ABBBtr,   current_streamc                 (   | j                             d           |                     |          }|                                  g | j        | j        R }|                     t          | j                  t          | j	                  | j
        | j        | j        || j                  }| j        dt          j        | j                  z  n| j        }t          j        t          j                  }||z  }| j        | j        | j        | j        | j        | j        f}t1          | j        d                              d          }t1          | j        d                              d          }	t1          | j        d                              d          }
t1          | j        d                              d          }t1          | j        d                              d          }| j        d	k    r&t1          | j                                                   nd }| j        d	k    r&t1          | j!                                                  nd }tE          j#        ||||	|
||||||
          | _$        | j                             d           d S )NzEntering compileelement_dtyper   r   r   compress_block_sliding_stride	mma_tilerr         ?r   assumed_alignr0   leading_dimr2   r/   
problem_sizeQKLSETopk_scoresTopk_indicessoftmax_scale_log2_ecumulative_s_qcumulative_s_kstreamzKernel compiled successfully)%r&   r(   _get_default_stream_ensure_support_checkedr    rP   r%   r	   rQ   r   r   r   r   r   r!   mathsqrtlog2erM   r   r   rN   rO   r   r   mark_layout_dynamicr   r   r   r   r>   r   r   cutecompile_compiled_kernel)r)   rd   ri   topk_reductionr!   log2_erv   rp   sample_q_cutesample_k_cutesample_lse_cutesample_topk_scores_cutesample_topk_indices_cutesample_cum_seqlen_q_cutesample_cum_seqlen_k_cutes                  r+   r   zTopKReduction.compile   s   -...11.AA$$&&&7d'777	7
CC3DNCCL!%!:*.*>n & 
 
 ;?:L:Tdi6666Z^Zl46"",v5OLLHHM
 $DMDDDXXefXgg#DMDDDXXefXgg%doRHHH\\ij\kk"-d.EUW"X"X"X"l"lyz"l"{"{#.t/GWY#Z#Z#Z#n#n{|#n#}#} bfbsw~b~b~;t/G#H#H#\#\#^#^#^  EI bfbsw~b~b~;t/G#H#H#\#\#^#^#^  EI  $%/1!533!!
 !
 !
 	9:::::r,   Fq_tensork_tensor
lse_tensortopk_scores_tensortopk_indices_tensorcumulative_s_q_tensorcumulative_s_k_tensorskip_compilec
                 
   | j                             d           |                     |	          }	| j        dk    r||t	          d          |j        dk    rC| j                             d           |                    d                              dd          }|j        dk    rC| j                             d	           |                    d                              dd          }|j        dk    rD| j                             d
           |                    d                              dd          }nc|j        dk    rX| j                             d           | 	                    |dd                              d                              dd          }|j        dk    rC| j                             d           |                    d                              dd          }|j        dk    rC| j                             d           |                    d                              dd          }|j        dk    r/| j                             d           |
                    d          }|                    d          dk    r.| j                             d           |                                }t          |d                              d          }
t          |d                              d          }t          |d                              d          }t          |d                              d          }t          |d                              d          }| j        dk    r!t          |                                          nd }| j        dk    r!t          |                                          nd }| j        dt#          j        | j                  z  n| j        }t#          j        t"          j                  }||z  }| j        | j        | j        | j        | j        | j        f}|sk| j        t	          d          | j                             d           |                     ||
||||||||	
  
         | j                             d           d S | j                             d           |                     t;          | j                  t;          | j                  | j         | j!        | j"        g | j#        | j        R | j$                  } |||
||||||||	
  
         | j                             d           d S )NzEntering executer/   zTcumulative_s_q_tensor and cumulative_s_k_tensor are required when using T,H,D layoutr0   r1   r   r   r2   r3   z&reshaping lse_tensor from T,H to 1,H,Tr4   r   r5   r6      z1reshaping lse_tensor to remove trailing dimensionr7   r:   r   rk   rm   rj   z!TopKReduction kernel not compiledzExecuting with compiled kernelro   z*Executed with compiled kernel successfullyz'Executing without compiled kernel (JIT)rf   zExecuted successfully)%r&   r(   rz   r>   rF   r?   r@   rA   rB   rC   rE   rJ   r'   rK   r   r   r!   r|   r}   rP   r~   r   rM   r   r   rN   rO   r   r%   r	   rQ   r   r   r   r   r    r   )r)   r   r   r   r   r   r   r   r   rd   q_cutek_cutelse_cutetopk_scores_cutetopk_indices_cutecumulative_s_q_cutecumulative_s_k_cuter!   r   rv   rp   r   s                         r+   executezTopKReduction.execute   s    	-...11.AA''$,0E0M !wxxx}!!!!"LMMM#--a00::1a@@}!!!!"LMMM#--a00::1a@@!##!!"JKKK'11!44>>q!DD

A%%!!"LMMM!77
A|TT^^_`aakklmopqq
!&!++!!"VWWW%7%A%A!%D%D%N%NqRS%T%T""'1,,!!"WXXX&9&C&CA&F&F&P&PQRTU&V&V#?aLQRRR#++B//JR  A%%L    "\  ]  ]  ]#..00JXR888LLYZL[[XR888LLYZL[[z<<<PP]^P__&'9LLL``mn`oo'(;2NNNbbopbqqZ^ZkovZvZvk*?@@TTVVV  }AZ^ZkovZvZvk*?@@TTVVV  }A:>:L:Tdi6666Z^Zl46"",v5OLLHHM
  (	8$, !DEEEL?@@@!!),.%922% "    LKLLLLLLHIII!\\;DJGG7GG%)%>.2.B=D-=t}==. *  N N),.%922%    L677777r,   )N)NNFN)__name__
__module____qualname____doc__rR   float32Tensorr   intrQ   boolr   floatr$   rc   rT   CUstreamr   r   __classcell__)r*   s   @r+   r   r      sI       	 	$ 7;6:!%!%!&$&!(2)-#)+ )+,)+ ,)+ L	)+
 "L)+ #\)+ &el3)+ &el3)+ #)+ #)+ ;)+ )+ ")+ )+ )+  CHo!)+"  #)+ )+ )+ )+ )+ )+Zht h h h hT2; 2;ht}&= 2; 2; 2; 2; 2;v 9=8<"26g8 g8,g8 ,g8 L	g8
 "Lg8 #\g8  (5g8  (5g8 g8 !/g8 
g8 g8 g8 g8 g8 g8 g8 g8r,   r   r   r   r   Tr   r   r   r   cum_seqlen_q_tensorcum_seqlen_k_tensorr   r   r   r   r   r   r   r    r!   rd   r-   c                    t                               d           d\  }}|p|n|d                                         }|j        d         }t	          j        ||||| j                  }t	          j        |||t          j        | j                  }n||| j        \  }}}}|j        \  }}}}t	          j        |||||| j                                      dd          }t	          j        ||||t          j        | j                                      dd          }nt          d| d|           | j        |j        |j        ||j        nd ||j        nd | j
        |j
        |j
        ||j
        nd ||j
        nd |                                 |                                |                                ||                                nd ||                                nd |||||	|
|||f}|t          v rHt                               d	           t          |         }|                    | |||||||
           ||fS t          di d| d|d|d|d|d|d|d|d|d|d|d|	d|
d|d|d|}|                                sJ |                    |           |                    | |||||||
           |t          |<   ||fS )Nz7topk_reduction_wrapper: Entering topk_reduction_wrapper)NNr7   r   )rQ   r_   r2   zmcum_seqlen_q_tensor and cum_seqlen_k_tensor must either both be None (B,H,S,D) or both not None (T,H,D), got r9   zDtopk_reduction_wrapper: Using previously cached TopKReduction object)r   r   r   r   r   r   r   rd   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   )rd    )r&   r(   rH   rI   rR   emptyr_   rS   rB   rF   rQ   rJ   _cache_of_TopKReductionObjectsr   r   rc   r   )r   r   r   r   r   r   r   r   r   r   r   r   r    r!   rd   r   r   total_seq_len_qrO   r[   rZ   r\   	cache_keyr   s                           r+   topk_reduction_wrapperr   W  s   $ MMKLLL.8++&+>+J-b16688nQ"[#wi`h`oppp#k/3u{ckcrsss		$)<)D~1c1~31"[CgYW_Wfgggqqrsuvww#k!S#wekZbZijjjttuvxyzz l  }P  l  l  Wj  l  l
 
 	

 	%8%D!!$%8%D!!$%8%D!!$%8%D!!$(;(G""$$$T(;(G""$$$T1I6 222\]]]7	B!1 3"5"5) 	 		
 		
 		
 "#666& 
 
 
X
X
 "z
  21	

 !4 3
 !4 3
 !4 3
 G
 G
  i
 G
 "6!5
 ,O
  i
 &
  (-!
$ ++-----n===!1 3"5"5) 	 		
 		
 		
 5C&y1222r,   )!nsa_top_k_reduction_fwdr   cuda.bindingsr   rT   rR   typingr   r   r|   cutlasscutlass.cuter   cutlass.cute.runtimer   cudnn.datatypesr	   cudnn.api_baser
   r   logging	getLoggerr   r&   r   r   r   r   rQ   r   r   r   r   r   r,   r+   <module>r      s7   ; ; ; ; ; ; ( ( ( ( ( (  " " " " " " " "         , , , , , , 9 9 9 9 9 9 " " " " " "~8 ~8 ~8 ~8 ~8G ~8 ~8 ~8B
 
'
H
%
%!#  3726!!"] "$.%).2m3 m3lm3lm3 m3 "%,/	m3
 "%,/m3 c]m3 c]m3 {m3 m3 m3 m3 m3 S/m3 E?m3 T]+m3  5<%&!m3 m3 m3 m3 m3 m3r,   