
    `i	d              (       :   d dl mZmZ d dlZd dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ dd	lmZ d
dlmZ d
dlmZ  G d de          Zd dlZ ej        e          Zi Zddddej         ej         ddddddddfdej!        dej!        dej!        deej!                 deej!                 de"deej#                 dej#        dej#        dee$e$f         de"de%de%de%d e%d!ee%         d"eej&                 d#eej!        eej!                 f         f$d$Z'dS )%    )TupleOptionalN)driver)from_dlpack)Int32)APIBase)_convert_to_cutlass_data_type   )make_tensor_strided_like   )'BlackwellFusedMultiHeadAttentionForward)fmha_helpersc            "       r    e Zd Zdddej        ej        dddddddfdej        dej        dej        dej        d	eej                 d
eej                 deej                 dej        dej        dee	e	f         de
dededededee         f  fdZde
fdZd#deej                 ddfdZ	 	 	 	 	 	 	 	 	 	 d$dej        dej        dej        dej        deej                 deej                 d eej                 deej                 d!e
dee         dee         dee         dee         dee         ddfd"Z xZS )%CompressionAttentionN   r   F      ?sample_qsample_ksample_vsample_o
sample_lsesample_cum_seqlen_qsample_cum_seqlen_kqk_acc_dtypepv_acc_dtypemma_tiler_mnis_persistentscale_qscale_kscale_vinv_scale_oscale_softmaxc                    t                                                       t          | _        | j                            d           | j                            d           || _        || _        || _	        || _
        || _        |d u| _        || _        || _        || _        |	| _        |
| _        || _        || _        || _        || _        || _        || _        d | _        d | _        d | _        d | _        d | _        d | _        d | _        d | _        d | _         | j                            d!                    g d|j"         d|j"         d|j"         d|j"         d||j"        nd	 d
||j"        nd	 d||j"        nd	 d| d|	 d|
 d| d| d| d| d| d|                      d S )Nz+CompressionAttention is an experimental APIzEntering __init__ z'__init__ completed with args: sample_q z, sample_k z, sample_v z, sample_o z, sample_lse Nonez, sample_cum_seqlen_q z, sample_cum_seqlen_k z, qk_acc_dtype z, pv_acc_dtype z, mma_tiler_mn z, is_persistent z
, scale_q z
, scale_k z
, scale_v z, inv_scale_o z, scale_softmax )#super__init__r   _kernel_loggerwarningdebugr   r   r   r   r   
enable_lser   r   qk_acc_dtype_torchpv_acc_dtype_torchr   r   r   r    r!   r"   r#   
batch_sizes_qs_kh_qh_kh_rhead_dimproblem_size_compiled_kerneljoinshape)selfr   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   	__class__s                    /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/cudnn/native_sparse_attention/compression/api.pyr(   zCompressionAttention.__init__   s   & 	>JKKK.///    $$D0#6 #6  #/".(* &*   $ F
  F
  F
  F
  F
  F
hn  F
  F
  F
  F
QYQ_  F
  F
  F
  F
ltlz  F
  F
  F
  F
  HP  HV  F
  F
  F
  F
  yC  yO  eo  eu  eu  U[  F
  F
  F
  F
  Pc  Po  sF  sL  sL  u{  F
  F
  F
  F
  pC  pO  Sf  Sl  Sl  U[  F
  F
  F
  F
  lx  F
  F
  F
  F
  IU  F
  F
  F
  F
  fr  F
  F
  F
  F
  DQ  F
  F
  F
  F
  ]d  F
  F
  F
  F
  pw  F
  F
  F
  F
  C	J	  F
  F
  F
  F
  Z	e	  F
  F
  F
  F
  w	D
  F
  F
  F
  F
	
 	
 	
 	
 	
    returnc                    | j                             d           | j                             d           | j        j        dk    r
d| _        | j        j        \  }}}}| j        j        \  }}}}| j        j        \  }}}}| j        j        \  }}}}| j        j        ||||fk    r#t          d||||f d| j        j                   | j        j        ||||fk    r#t          d||||f d| j        j                   | j        j        ||||fk    r#t          d||||f d| j        j                   | j        j        ||||fk    r#t          d	||||f d| j        j                   | j
        r~|                     | j        d
d          | _        | j        j        |||fk    r"t          d|||f d| j        j                   | j                                        st          d          | j        | j        | j                             d           || _        || _        || _        || _        || _        ||z  | _        || _        nz| j        j        d
k    rMd| _        | j        j        \  }	}}| j        j        \  }
}}| j        j        \  }
}}| j        j        \  }	}}| j        j        |	||fk    r"t          d|	||f d| j        j                   | j        j        |
||fk    r"t          d|
||f d| j        j                   | j        j        |
||fk    r"t          d|
||f d| j        j                   | j        j        |	||fk    r"t          d	|	||f d| j        j                   | j
        rT|                     | j        dd          | _        | j        j        |	|fk    r!t          d|	|f d| j        j                   | j        | j        t          d| j         d| j                   |                     | j        dd          | _        |                     | j        dd          | _        | j        j        dk    s| j        j        dk    r*t          d| j        j         d| j        j         d          | j        j        t2          j        t2          j        hvs$| j        j        t2          j        t2          j        hvr)t          d| j        j         d| j        j                   t9          | j                  t9          | j                  k    r9t          dt9          | j                   dt9          | j                             t9          | j                  dz
  | _        d | _        d | _        || _        || _        ||z  | _        || _        nt          d| j        j                   ||k    rt          d          |dvrt          d          ||z  dk    rt          d           | j                             d!           | j        j        }| j        j        }| j        j        |k    s| j        j        |k    r,t          d"| j        j         d#| j        j         d$|           |t2          j        t2          j        t2          j        hvrt          d%|           |t2          j        t2          j        t2          j        hvrt          d&|           | j         t2          j!        hvrt          d'| j                    | j"        t2          j!        hvrt          d(| j"                   | j#        ;| j                             d)           d*tI          j%        | j                  z  | _#        | j                             d+           t2          j&        '                                stQ          d,          t2          j&        )                                }t2          j&        *                    |          \  }}|d-z  |z   }|d.k     rtQ          d/| d0|           |d1k    rtQ          d2          d3| _+        | j                             d4           d3S )5NzEntering check_supportz+Checking shape normalization and validation   B,H,S,Dz.Input shape mismatch: expected Q tensor shape z, got z.Input shape mismatch: expected K tensor shape z.Input shape mismatch: expected V tensor shape z/Output shape mismatch: expected O tensor shape    r   z1Output shape mismatch: expected LSE tensor shape zLSE tensor must be contiguouszJsample_cum_seqlen_q and sample_cum_seqlen_k are ignored for B,H,S,D layoutT,H,Dr
   zSsample_cum_seqlen_q and sample_cum_seqlen_k must be provided for T,H,D layout, got  and r   r   r   zDsample_cum_seqlen_q and sample_cum_seqlen_k must be 1D tensors, got zD and DzHsample_cum_seqlen_q and sample_cum_seqlen_k must be int32 or int64, got zKsample_cum_seqlen_q and sample_cum_seqlen_k must have the same length, got zOInvalid input layout: sample_q must be rank-3 (T,H,D) or rank-4 (B,H,S,D), got zD_qk must match D_v>       @   r   z*Head dimension D_qk must be 32, 64, or 128r   z1H_q must be divisible by H_k (GQA/MQA constraint)zChecking dtypesz'Inputs must have the same dtype, got K z, V z for Q z7Inputs must be Float16, BFloat16, or Float8E4M3FN, got z8Outputs must be Float16, BFloat16, or Float8E4M3FN, got z"qk_acc_dtype must be Float32, got z"pv_acc_dtype must be Float32, got z2No scale_softmax provided, using default 1/sqrt(d)r   zChecking environmentzCUDA is not available
   d   zECompressionAttention requires SM100+ compute capability, but found SMz on device g   z!cuteDSL is not supported on SM103Tz$check_support completed successfully),r*   r,   r   ndiminput_layoutr:   r   r   r   
ValueErrorr-   _unpad_tensor_to_ndimr   is_contiguousr   r   r+   r0   r1   s_kvr3   h_kvr5   r6   dtypetorchint32int64lenfloat16bfloat16float8_e4m3fnr.   float32r/   r#   mathsqrtcudais_availableRuntimeErrorcurrent_deviceget_device_capability_is_supported)r;   bh_qos_qod_qkrR   rQ   d_vr3   tt_kvin_dtype	out_dtypedevicemajorminorcompute_capabilitys                    r=   check_supportz"CompressionAttention.check_supportS   s
   3444 	HIII="" )D"&-"5AtT4"&-"5AtT4!%!4AtT3 $ 3AsD#}"q$d&;;;   "CRSUY[_aeQf  "C  "Cmqmz  nA  "C  "C  D  D  D}"q$d&;;;   "CRSUY[_aeQf  "C  "Cmqmz  nA  "C  "C  D  D  D}"q$c&:::   "BRSUY[_adQe  "B  "Blplyl  "B  "B  C  C  C}"q#tS&999   "BSTVY[_adRe  "B  "Blplyl  "B  "B  C  C  C F"&"<"<T_aQ]"^"^?(QTN::$  &EYZ\_aeXf  &E  &Emqm|  nC  &E  &E  F  F  F4466 F$%DEEE'3t7O7[$$%qrrr  DODHDIDHDId{DH DMM]1$$ 'D=.LAsD#}2D$"m1OD$--KAsC}"q#tn44 !{RSUXZ^Q_!{!{fjfsfy!{!{|||}"tT4&888 !RVX\^bQc!!jnjwj}!!  A  A  A}"tT3&777 !~RVX\^aQb!~!~imivi|!~!~}"q#sm33 !{STVY[^R_!{!{fjfsfy!{!{||| @"&"<"<T_aQ]"^"^?(QH44$%~YZ\_X`%~%~gkgvg|%~%~'/43K3S  djn  kC  d  d  JN  Jb  d  d   (,'A'A$BZ\]_t'u'uD$'+'A'A$BZ\]_t'u'uD$',11T5M5RVW5W5W  a[_[s[x  a  a  AE  AY  A^  a  a  a   '-6   )/U[7QQQ  e_c_w_}  e  e  EI  E]  Ec  e  e   4+,,D4L0M0MMM  fbefjf~bb  f  f  GJ  KO  Kc  Gd  Gd  f  f   "$":;;a?DODHDIDHDId{DH DMM   Doso|  pB  D  D  E  E  E3;;2333}$$IJJJ:??PQQQ,---=&M'	=(**dm.AX.M.M  Gt}GZ  G  G`d`m`s  G  G  }E  G  G  H  H  HEM5>5;NOOOaW_aabbbU]ENE<OPPPcXaccddd"5=/99[$BY[[\\\"5=/99[$BY[[\\\ %LSTTT!$ty'?'?!?D 	1222z&&(( 	86777**,,z77??u"RZ%/##   Ogy   O   O  GM   O   O  P  P  P$$BCCC!ABBBtr>   current_streamc           	      	   | j                             d           |                     |          }|                                  |                     t          | j                  t          | j                  g | j        | j	        R | j
        t          j        j                  }t          j        t          j        d                    }| j        | j        z  | j        z  }||z  }| j        | j        z  }| j        dk    r| j        n%t1          | j                                                  }| j        dk    r| j        n%t1          | j                                                  }| j        |||| j        | j        | j	        f| _         | j                             d           tC          j"        |fi dtG          | j$        d          j%        d	| j        dk    r-| j$        &                    d
d          '                                n9| j$        '                                d         g| j$        '                                R dtG          | j(        d          j%        d| j        dk    r-| j(        &                    d
d          '                                n9| j(        '                                d         g| j(        '                                R dtG          | j)        d          j%        d| j        dk    r-| j)        &                    d
d          '                                n9| j)        '                                d         g| j)        '                                R dtG          | j*        d          j%        d| j        dk    r-| j*        &                    d
d          '                                n9| j*        '                                d         g| j*        '                                R d| j         d| j        dk    rtG          | j        d          nd d| j        dk    rtG          | j        d          nd d| j+        rtG          | j,        d          j%        nd d| j        dk    r-| j,        &                    d
d          '                                ndg| j,        '                                R d|d|d|dd dt[          d          d|| _.        | j                             d           d S ) NzEntering compile	mask_typer   rB   z7Compiling CompressionAttention kernel with cute.compileq_iter   assumed_alignq_strider   r
   r   k_iterk_stridev_iterv_strideo_itero_strider7   cum_seqlen_qrD   cum_seqlen_klse_iter
lse_stridescale_softmax_log2r#   scale_outputwindow_size_leftwindow_size_rightstreamzKernel compiled successfully)/r*   r,   _get_default_stream_ensure_support_checkedr)   r	   r.   r/   r   r6   r   
fmha_utilsMaskTypeCOMPRESSED_CAUSAL_MASKr\   log2expr   r    r#   r!   r"   rM   r1   maxr   itemrQ   r   r0   r3   rR   r7   cutecompiler   r   iterator	transposestrider   r   r   r-   r   r   r8   )	r;   rr   fmha_kernellog2_er#   r   r   r1   rQ   s	            r=   r   zCompressionAttention.compile   s   -...11.AA$$&&&ll)$*ABB)$*ABB/d/// )@ # 
 
 48C==))t|3d6HH*V3|d&66+y88dhhc$BZ>[>[>`>`>b>b -::tyyDD\@]@]@b@b@d@dOHIM
 	TUUU $!
 !
 !
t}B???HH!
 AE@QU^@^@^dm--a33::<<<eiereyeye{e{|}e~  eY  BF  BO  BV  BV  BX  BX  eY  eY!
 t}B???HH	!

 AE@QU^@^@^dm--a33::<<<eiereyeye{e{|}e~  eY  BF  BO  BV  BV  BX  BX  eY  eY!
 t}B???HH!
 AE@QU^@^@^dm--a33::<<<eiereyeye{e{|}e~  eY  BF  BO  BV  BV  BX  BX  eY  eY!
 t}B???HH!
 AE@QU^@^@^dm--a33::<<<eiereyeye{e{|}e~  eY  BF  BO  BV  BV  BX  BX  eY  eY!
 **!
 VZUfjqUqUq+d&>bQQQQw{!
 VZUfjqUqUq+d&>bQQQQw{!
 RVQ`jk$/DDDMMfj!
 EIDUYbDbDb11!Q77>>@@@ij  iGmqm|  nD  nD  nF  nF  iG  iG!
  21!
  (-!!
" &#!
$ "T%!
& $Ahhh'!
( ">)!
, 	9:::::r>   q_tensork_tensorv_tensoro_tensor
lse_tensorcum_seqlen_q_tensorcum_seqlen_k_tensorskip_compilec                 j   | j                             d           |                     |          }| j        r0|t	          d          |                     ||j        dz
  d          }| j        dk    rG||t	          d| d|           |                     |dd          }|                     |dd	          }|
| j        n|
}
|| j	        n|}|| j
        n|}|| j        n|}|| j        n|}t          j        t          j                  }|
|z  |z  }||z  }||z  }|	s| j        t	          d
          | j                             d           |                     t#          | j        dk    r|                    dd          n|d          j        t#          | j        dk    r|                    dd          n|d          j        t#          | j        dk    r|                    dd          n|d          j        t#          | j        dk    r|                    dd          n|d          j        | j        | j        dk    rt#          |d          j        nd | j        dk    rt#          |d          j        nd | j        rt#          |d          j        nd |||d t+          d          |           | j                             d           d S | j                             d           |                     t/          | j                  t/          | j                  g | j        | j        R | j        t:          j        j                  } |d)i dt#          | j        dk    r|                    dd          n|d          j        d| j        dk    r(|                    dd                                           n/|                                 d         g|                                 R dt#          | j        dk    r|                    dd          n|d          j        d| j        dk    r(|                    dd                                           n/|                                 d         g|                                 R dt#          | j        dk    r|                    dd          n|d          j        d| j        dk    r(|                    dd                                           n/|                                 d         g|                                 R dt#          | j        dk    r|                    dd          n|d          j        d| j        dk    r(|                    dd                                           n/|                                 d         g|                                 R d| j        d| j        dk    rt#          |d          nd d| j        dk    rt#          |d          nd d | j        rt#          |d          j        nd d!| j        dk    r(|                    dd                                           ndg|                                 R d"|d#|d$|d%d d&t+          d          d'| | j                             d(           d S )*NzEntering executez\kernel was compiled with lse_tensor provided, but lse_tensor was not provided during executer   r   rD   zScum_seqlen_q_tensor and cum_seqlen_k_tensor must be provided for T,H,D layout, got rE   r   r   z(CompressionAttention kernel not compiledzExecuting with compiled kernelrB   r
   rw   rx   r   )rv   r{   r}   r   r7   r   r   r   r   r#   r   r   r   r   z*Executed with compiled kernel successfullyz'Executing without compiled kernel (JIT)rt   rv   rz   r{   r|   r}   r~   r   r   r7   r   r   r   r   r   r#   r   r   r   r   zExecuted successfully )!r*   r,   r   r-   rN   rO   rL   rM   r   r    r!   r"   r#   r\   r   er8   r   r   r   r7   r   r)   r	   r.   r/   r   r6   r   r   r   r   r   )r;   r   r   r   r   r   r   r   rr   r   r   r    r!   r"   r#   r   scale_softmax_valscale_softmax_log2_valscale_output_valr   s                       r=   executezCompressionAttention.execute  s   " 	-...11.AA? 	a! !  A  A  A33JPQ@QS_``J''"*.A.I  Zj}  Z  Z  EX  Z  Z   #'"<"<=PRSUj"k"k"&"<"<=PRSUj"k"k #*/$,,w")/$,,w")/$,,w*5*=d&&;.;.C**46""#g-=!2V!;"[0 K	8$, !KLLLL?@@@!!"151Bi1O1OX''1---U]"$   "151Bi1O1OX''1---U]"$   "151Bi1O1OX''1---U]"$   "151Bi1O1OX''1---U]"$   !.]a]nry]y]yk*=RPPPYY  @D]a]nry]y]yk*=RPPPYY  @DPTP_i+jCCCLLei#9/-!%"'((%5 "   8 LKLLLLLLHIII,,-d.EFF-d.EFF3$#3T]33"$-D '  K K      "151Bi1O1OX''1---U]"$   (	 
 @D?PT]?]?](,,Q2299;;;dldsdsduduvwdx  dN  |D  |K  |K  |M  |M  dN  dN  #151Bi1O1OX''1---U]"$   (  @D?PT]?]?](,,Q2299;;;dldsdsduduvwdx  dN  |D  |K  |K  |M  |M  dN  dN  #151Bi1O1OX''1---U]"$   (  @D?PT]?]?](,,Q2299;;;dldsdsduduvwdx  dN  |D  |K  |K  |M  |M  dN  dN   #151Bi1O1OX''1---U]"$   (' ( @D?PT]?]?](,,Q2299;;;dldsdsduduvwdx  dN  |D  |K  |K  |M  |M  dN  dN) * "..+ , UYTeipTpTpk*=RPPPPvz- . UYTeipTpTpk*=RPPPPvz/ 0 QUP_i+jCCCLLei1 2 DHCTXaCaCaJ00A66==???hi  hAlvl}l}ll  hA  hA3 4 $:#95 6 0/7 8 .-9 : "&; < #((((= > &~?   B L677777r>   )N)
NNNNFNNNNN)__name__
__module____qualname__rT   r[   Tensorr   rS   r   intboolfloatr(   rq   r^   CUstreamr   r   __classcell__)r<   s   @r=   r   r      s        .26:6:$)M$)M(2# )-#<
 <
,<
 ,<
 ,	<

 ,<
 U\*<
 &el3<
 &el3<
 k<
 k<
 CHo<
 <
 <
 <
 <
  !<
"  #<
 <
 <
 <
 <
 <
|Ct C C C CJ6; 6;ht}&= 6; 6; 6; 6; 6;| .26:6:26"#'#'#''+)-v8 v8,v8 ,v8 ,	v8
 ,v8 U\*v8 &el3v8 &el3v8 !/v8 v8 %v8 %v8 %v8 e_v8  v8  
!v8 v8 v8 v8 v8 v8 v8 v8r>   r   Fr   r   r   r   r   r   r   r-   o_dtyper   r   r   r   r   r    r!   r"   r#   r   r?   c                    t                               d           d\  }}||n| j        }| j        dk    rq| j        \  }}}}|j        \  }}}}t          | ||||f|| j                  }|r:t          j        |||t          j	        | j                  
                                }n| j        dk    r| j        \  }}}|j        \  }}}t          | |||f|| j                  }|rOt          j        d||t          j	        | j                  
                                                    ddd	          }nt          d
| j                   | j        |j        |j        ||j        nd||j        nd| j        |j        |j        ||j        nd||j        nd|                                 |                                |                                ||                                nd||                                nd|||||	|
|||||f}|t          v rEt                               d           t          |         }|                    | |||||||           nt                               d           t!          di d| d|d|d|d|d|d|d|d|d|	d|
d|d|d|d|d|}|                                sJ |                                 |                    | |||||||           |t          |<   ||fS )z
    Compression Attention Wrapper that returns output (and optionally LSE) tensors directly.

    Returns:
        tuple: (o_tensor, lse_tensor | None)
    zNcompression_attention_wrapper: Creating empty output tensor o and optional lse)NNNrA   )rS   rm   rC   r   r
   r   zOInvalid input layout: q_tensor must be rank-4 (B,H,S,D) or rank-3 (T,H,D), got zRcompression_attention_wrapper: Using previously cached CompressionAttention object)r   r   r   r   r   r   r   rr   z_compression_attention_wrapper: No cached object found, creating new CompressionAttention objectr   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r   )r*   r,   rS   rL   r:   r   rm   rT   emptyr[   
contiguouspermuterN   r   %_cache_of_CompressionAttentionObjectsr   r   rq   r   )r   r   r   r   r   r-   r   r   r   r   r   r   r    r!   r"   r#   r   r   r   rd   r3   r1   d_r4   r2   rh   ri   	cache_key	comp_attns                                 r=   compression_attention_wrapperr     s   2 MMbccc%Hj ,gg(.G}!3Q#>3S+Hq#sC6HPW`h`oppp 	lQSho^^^iikkJ	!		N	3n3+Hq#sm7[c[jkkk 	{QQemHO\\\ggiiqqrsuvxyzzJzkskxzz{{{ 	%8%D!!$%8%D!!$%8%D!!$%8%D!!$(;(G""$$$T(;(G""$$$T5I8 999jkkk9)D	! 3 3! 	 		
 		
 		
 		
 	wxxx( 
 
 
X
X
 X
 X	

 "z
 !4 3
 !4 3
 &
 &
 &
 (-
 G
 G
 G
 $
  (-!
	$ &&(((((! 3 3! 	 		
 		
 		
 <E-i8Zr>   )(typingr   r   r\   cuda.bindingsr   r^   rT   cutlasscutlass.cuter   cutlass.cute.runtimer   cutlass.cute.typingr   cudnn.api_baser   cudnn.datatypesr	   utilsr   fmhar   r%   r   r   r   logging	getLoggerr   r*   r   r[   r   r   rS   r   r   r   r   r   r>   r=   <module>r      s   " " " " " " " "  ( ( ( ( ( (         , , , , , , % % % % % % " " " " " " 9 9 9 9 9 9 , , , , , , 9 9 9 9 9 9 ( ( ( ( ( (r8 r8 r8 r8 r87 r8 r8 r8j 
'
H
%
%(* % 3726%) % %$.%)&*#y  y ly ly  ly  "%,/	y 
 "%,/y  y  ek"y  +y  +y  S/y  y  y  y  y  y   E?!y " T]##y $ 5<%,//0%y  y  y  y  y  y r>   