
    `i                    :   d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZm	Z	 d dl
Z
d dlmc mZ d dlmc mZ d dlZd dlmZ d dlmc mc mZ d dlmZ d dlmZ d dlm
Z d dlmc m Z! d dl"mc m#Z# d dl$m%Z% d dl&m'Z'm(Z(m)Z) d dl*m+Z, 	 de-fdZ. G d d	          Z/dS )
    N)TypeTupleUnionOptional)from_dlpack)Int32Int64Float32)fmha_helperssizec                 J    t          j        t           j        j        |           S )N)pipelineCooperativeGroupAgentThread)r   s    /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/cudnn/native_sparse_attention/compression/fmha.pymake_thread_cooperative_groupr   g   s    $X^%:DAAA    c            1       L   e Zd Zdeej                 deej                 deeeef         dede	j
        f
dZd Zej        dej        d	ej        eeeeef         g         d
ej        dej        eeeeef                  dej        dej        eeeeef                  dej        dej        eeeeef                  deeeeeeef         deej                 deej                 deej                 dej        eeeef                  dedededee         dee         dej        f&d            Zej        dej        dej        dej        dej        d ej        d!ej        d"ej        d#ej        d$ej        d%ej        deej                 deej                 d&eej                 dedededee         dee         d'ej        d(ej        d)ej        d*ej        d+ej        d,e	j        f0d-            Zej        d.ed/ed0ed1ed2ed3ed4ed5eeeej        j         ej!        ej        ej!        ej        f         fd6            Z"ej        d.ed7ed8edeej                 deej                 ded9ej#        j$        d:ej        d;ej        dee         dee         d<ej!        d=ej        d>ej!        d?ej        d,e	j        f d@            Z%ej        dAej#        j$        dBej        dCefdD            Z&ej        dAej#        j$        dBej        d&ej        dEz  dFej        dGedHed8edIededCedJej        fdK            Z'dES )L'BlackwellFusedMultiHeadAttentionForwardqk_acc_dtypepv_acc_dtype	mma_tileris_persistent	mask_typec                    || _         || _        d|d         z  |d         |d         f| _        || _        |d         |d         |d         f| _        d| _        || _        || _        d| _        d| _	        d| _
        d| _        d	| _        d
| _        d| _        d}|| _        d| _        | j        t#          g | j        | j	        | j
        | j        | j        | j        | j        R           z  | _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _         d}t#          g | j        | j	        R           |z  | _!        dS )a  Initializes the configuration for a Blackwell Fused Multi-Head Attention (FMHA) kernel.

        This configuration includes several key aspects:

        1.  Data Type Settings:
            - qk_acc_dtype: Data type for Q*K^T matrix multiplication accumulator
            - pv_acc_dtype: Data type for P*V matrix multiplication accumulator

        2.  MMA Instruction Settings:
            - mma_tiler: The (M, N, K) shape of the MMA instruction unit
            - qk_mma_tiler: MMA shape for Q*K^T computation
            - pv_mma_tiler: MMA shape for P*V computation

        3.  Kernel Execution Mode:
            - is_persistent: Boolean indicating whether to use persistent kernel mode
            - mask_type: Specifies the type of mask to use (no mask, residual mask, or causal mask)
            - window_size_left/right: Sliding window size for attention masking

        :param qk_acc_dtype: Data type for Q*K^T matrix multiplication accumulator
        :type qk_acc_dtype: Type[cutlass.Numeric]
        :param pv_acc_dtype: Data type for P*V matrix multiplication accumulator
        :type pv_acc_dtype: Type[cutlass.Numeric]
        :param mma_tiler: The (M, N, K) shape of the MMA instruction
        :type mma_tiler: Tuple[int, int, int]
        :param is_persistent: Whether to use persistent kernel mode
        :type is_persistent: bool
        :param mask_type: Type of mask to use
        :type mask_type: fmha_utils.MaskType
        :param window_size_left: Left-side sliding window size for attention masking
        :type window_size_left: int
        :param window_size_right: Right-side sliding window size for attention masking
        :type window_size_right: int
           r      )r   r   )r   r   r      )            )   	   
                  i             i        `   i   r    N)"r   r   	cta_tilerqk_mma_tilerpv_mma_tilercluster_shape_mnr   r   softmax0_warp_idssoftmax1_warp_idscorrection_warp_idsmma_warp_idload_warp_idepilogue_warp_idempty_warp_idtmem_alloc_colsthreads_per_warplenthreads_per_ctacta_sync_bar_idtmem_alloc_sync_bar_idtmem_s0_offsettmem_s1_offsettmem_o0_offsettmem_o1_offsettmem_p0_offsettmem_p1_offsettmem_vec0_offsettmem_vec1_offsetnum_regs_softmaxnum_regs_correctionnum_regs_otherbuffer_align_bytessoftmax_warpgroup_count)selfr   r   r   r   r   SM100_TMEM_CAPACITY_COLUMNSnum_warps_per_warpgroups           r   __init__z0BlackwellFusedMultiHeadAttentionForward.__init__l   s   T )(	!aLaL

 &aLaLaL

 !'*"!-!-#1  "&)#: "#4s'' )  	
 ! % " 
8
 
8
 
 
  !&'#!!! ! !" # ##%  "&"#'*+]T-C+]dF\+]+]'^'^by'y$$$r   c                     d| _         | j        j        dk    rdnd| _        d| _        d| _        d| _        d| _        d| _        dS )a  Set up configurations and parameters for the FMHA kernel operation.

        This method initializes and configures various attributes required for the
        execution of the fused multi-head attention kernel, mainly about the pipeline stages:

        - Sets up staging parameters for Q, K, V inputs and accumulator data
        - Configures pipeline stages for softmax, correction, and epilogue operations
        r   r$   r    r   r   N)	q_stageq_dtypewidthkv_stage	acc_stagesoftmax_corr_stagemma_corr_stagemma_softmax_stage	epi_stage)rP   s    r   _setup_attributesz9BlackwellFusedMultiHeadAttentionForward._setup_attributes   sO     !\/144!"#!"r   q_iterq_stridek_iterk_stridev_iterv_strideo_itero_strideproblem_sizecum_seqlen_qcum_seqlen_klse_iter
lse_stridescale_softmax_log2scale_softmaxscale_outputwindow_size_leftwindow_size_rightstreamc                 v   FGH |	\  }}}}}}}||z  }|
|n|d|z   z  }||n|d|z   z  }|
|nd}t          j        ||||f|ff|d         |d         |d         |d         |z  f|d         ff          }|
dn| |d         z  } t          j        || z   |          }!t          j        ||||f|ff|d         |d         d|d         f|d         ff          }"|dn| |d         z  }#t          j        ||#z   |"          }$t          j        ||||f|ffd|d         d|d         f|d         ff          }%|dn| |d         z  }&t          j        ||&z   |%          }'|
dn| |d         z  }(t          j        ||||f|ff|d         |d         |d         |d         |z  f|d         ff          })t          j        ||(z   |)          }*t          j        |du          rSt          j        |||f|ff|d         |d         ||d         z  f|d         ff          }+t          j        ||+          },nd},|!j         _        |$j         _        |'j         _        |*j         _	        t          j        t          j        ||||f|ff           j         j                  \   _        }-t           j                            |!                                           _        t           j                            |$                                           _        t           j                            |'                                           _        t           j                            |*           _        t          j         j        t0          j        j        k              rt7          d          t          j         j        t0          j        j        k              rt7          d          t          j         j        t0          j        j        k              rt7          d	          t          j         j         j        k              rt;          d
 j         d j                   t          j         j         j        k              rt;          d
 j         d j                                                     t0          j        j         }.t0          j!        j"        }/t0          j        j        }0tG          j$         j         j         j         j%        |. j&        dd                   }1tG          j$         j        |0 j         j'        |. j(        dd         |/          }2g  j)        dR  _*        t          j+        t          j         j*                  |1j,        j        f           _-         j(        dd          _.        tG          j/        |1 j&         j         j0                  HtG          j1        |1 j&         j         j2                  FtG          j/        |2 j(         j         j3                  }3tG          j1        |2 j(         j         j2                  }4tG          j4         j	         j         j.         j5                  Gt           j6        j7        8                    |.          }5t           j6        j7        9                                }6t          j:        Hg d          }7t           j6        ;                    |5|!|7 j&        |1 j-        j                  \  }8}9t          j:        Fg d          }:t           j6        <                    |5|$|: j&        |1 j-        j                  \  };}<t          j:        |4g d          }=t           j6        <                    |5|'|= j(        |2 j-        j                  \  }>}?t          j:        Gddg          }@t           j6        j7        =                    |6|*|@ j.                  \  }A}Bt          j>         j        |7          }Ct          j>         j        |:          }D|C _?        |D _@        t           jA         G FGH fdd                      }E|E _B         C                    |1|2|8|9|;|<|>|?|A|B|
||,|||||HF|3|4G j                  D                    |- jE        ddg j*        |d           dS )a%  Execute the Fused Multi-Head Attention operation on the provided tensors.

        This method prepares the input tensors for processing, validates their shapes and types,
        configures the computation parameters, and launches the CUDA kernel.

        The method handles:
        1. Tensor layout transformations for specific memory access patterns
        2. Validation of tensor shapes and data types
        3. Initialization of hardware-specific parameters and memory layouts
        4. Configuration of TMA (Tensor Memory Access) operations
        5. Grid and work scheduling computation
        6. Kernel launch with appropriate parameters

        :param q_iter: The query tensor pointer
        :type q_iter: cute.Pointer
        :param q_stride: The stride of the query tensor. (B, S, H, D) for bshd, (T, T, H, D) for thd (note that the T stride is duplicated)
        :type q_stride: cutlass.Constexpr[Tuple[int, int, int, int]]
        :param k_iter: The key tensor pointer
        :type k_iter: cute.Pointer
        :param k_stride: The stride of the key tensor. (B, S, H, D) for bshd, (T, T, H, D) for thd (note that the T stride is duplicated)
        :type k_stride: cutlass.Constexpr[Tuple[int, int, int, int]]
        :param v_iter: The value tensor pointer
        :type v_iter: cute.Pointer
        :param v_stride: The stride of the value tensor. (B, S, H, D) for bshd, (T, T, H, D) for thd (note that the T stride is duplicated)
        :type v_stride: cutlass.Constexpr[Tuple[int, int, int, int]]
        :param o_iter: The output tensor pointer
        :type o_iter: cute.Pointer
        :param o_stride: The stride of the output tensor. (B, S, H, D) for bshd, (T, T, H, D) for thd (note that the T stride is duplicated)
        :type o_stride: cutlass.Constexpr[Tuple[int, int, int, int]]
        :param problem_size: The problem size with shape [b, s_q, s_lse, s_k, h_q, h_k, d]. If cum_seqlen_q or cum_seqlen_k is not None, s_q and s_k are the max of the cumulative sequence length respectively.
        :type problem_size: Tuple[Int32, Int32, Int32, Int32, Int32, Int32]
        :param cum_seqlen_q: The cumulative sequence length tensor for query
        :type cum_seqlen_q: Optional[cute.Tensor]
        :param cum_seqlen_k: The cumulative sequence length tensor for key
        :type cum_seqlen_k: Optional[cute.Tensor]
        :param lse_stride: The stride of the log-sum-exp tensor. (B, S, H) for bshd, (0, T, H) for thd
        :type lse_stride: cutlass.Constexpr[Tuple[int, int, int]]
        :param scale_softmax_log2: The log2 scale factor for softmax
        :type scale_softmax_log2: Float32
        :param scale_softmax: The scale factor for softmax
        :type scale_softmax: Float32
        :param scale_output: The scale factor for the output
        :type scale_output: Float32
        :param window_size_left: Left-side sliding window size for attention masking.
        :type window_size_left: Optional[Int32]
        :param window_size_right: Right-side sliding window size for attention masking.
        :type window_size_right: Optional[Int32]
        :param stream: The CUDA stream to execute the kernel on
        :type stream: cuda.CUstream
        :raises TypeError: If tensor data types don't match or aren't supported
        :raises RuntimeError: If tensor layouts aren't in supported formats
        Nr   r   r   r   )stridez The layout of q is not supportedz The layout of k is not supportedz The layout of v is not supportedzType mismatch: z != )r   r   r   modec                      e Zd ZU ej        j        ej        dz  f         ed<   ej        j        ej	        dz  f         ed<   ej        j        ej
        dz  f         ed<   ej        j        ej
        dz  f         ed<   ej        j        ej        dz  f         ed<   ej        j        ej        dz  f         ed<   ej        j        ej        f         ed<   ej        j        ej        dz  f         ed	<   ej        j        ej        dz  f         ed
<   ej        j        edf         ed<   eed<   ej        j        ej        j        j         ej                  f         j        f         ed<   ej        j        ej        j        j         ej                  f         j        f         ed<   ej        j        ej        j        j         ej                   f         j        f         ed<   dS )GBlackwellFusedMultiHeadAttentionForward.__call__.<locals>.SharedStorager   load_q_mbar_ptrload_kv_mbar_ptrmma_s0_mbar_ptrmma_s1_mbar_ptrs0_corr_mbar_ptrs1_corr_mbar_ptrs0_s1_sequence_mbar_ptrcorr_epi_mbar_ptrmma_corr_mbar_ptrr   tmem_dealloc_mbar_ptrtmem_holding_bufsOsQsKN)__name__
__module____qualname__cutestructMemRanger	   rU   __annotations__rX   r\   rZ   rO   r]   r[   r   Aligno_dtypecosizerN   rV   k_dtype)k_smem_layout_stagedo_smem_layout_stagedq_smem_layout_stagedrP   s   r   SharedStoragerw     sS         "[1%9I2IJJJJ"k25$-!:K3KLLLL![1%9ORS9S2STTTT![1%9ORS9S2STTTT"k25$:QTU:U3UVVVV"k25$:QTU:U3UVVVV%)[%9%A]:]%^^^^#{3E4>A;M4MNNNN#{3E4;NQR;R4RSSSS#';#7q#AAAA####!$T\;4;?S3T3T%TU')    !$T\;4;?S3T3T%TU')    !$T\;4;?S3T3T%TU')     r   r   )gridblockclusterrq   min_blocks_per_mp)Fr   make_layoutmake_tensorcutlass
const_exprelement_typerV   r   v_dtyper   
fmha_utilscompute_gridshaper2   r   tile_sched_paramsutils
LayoutEnumfrom_tensormma_major_modeq_major_modek_major_modev_major_modeo_layouttcgen05OperandMajorModeKRuntimeErrorMN	TypeErrorr^   CtaGroupONEOperandSourceTMEMsm100_utilsmake_trivial_tiled_mmar   r3   r   r4   r5   cluster_shape_mnktiled_dividethr_idcluster_layout_vmnkepi_tilemake_smem_layout_arU   make_smem_layout_brX   rY   make_smem_layout_epir]   nvgpucpasyncCopyBulkTensorTileG2SOpCopyBulkTensorTileS2GOpselectmake_tiled_tma_atom_Amake_tiled_tma_atom_Bmake_tiled_tma_atomsize_in_bytestma_copy_q_bytestma_copy_kv_bytesr   shared_storagekernellaunchr@   )IrP   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   bs_qs_lses_kh_qh_kdh_rb_qob_kvb_lseq_layoutq_offsetqk_layoutk_offsetkv_layoutv_offsetvo_offsetr   o
lse_layoutlser   	cta_groupp_sourcep_major_modeqk_tiled_mmapv_tiled_mmap_tmem_layout_stagedv_smem_layout_stagedtma_load_optma_store_opq_smem_layout
tma_atom_qtma_tensor_qk_smem_layout
tma_atom_ktma_tensor_kv_smem_layout
tma_atom_vtma_tensor_vo_smem_layout
tma_atom_otma_tensor_oq_copy_sizek_copy_sizer   r   r   r   sI   `                                                                     @@@r   __call__z0BlackwellFusedMultiHeadAttentionForward.__call__   sq	   V +7'3sCaSj (qqcQUm (qqcQUm!)q #!sCj$'(1+x{S018A;?
 
 
 %,113$!2DVh.99#!sCj$'(QK!8A;/?!.MN
 
 
 %,113$!2DVh.99#sCj$'(x{a!%5x{$CD
 
 
 %,113$!2DVh.99$,113$!2D#!sCj$'(1+x{S018A;?
 
 
 Vh.99hd233 	)#sU+,qM mS:a=%89:a=I  J "8Z88CCC ~~~~'1'>JQ#sQ011N(
 (
$ ",88;;JJLL!,88;;JJLL!,88;;JJLL(44Q77d/73K3MMNN 	CABBBd/73K3MMNN 	CABBBd/73K3NNOO 	CABBB dldl:;; 	PNdlNNNNOOOdldl:;; 	PNdlNNNNOOO   $(	(-/1"9Lbqb!
 
 #9Lbqb!
 
 "=4#8!<!!<!<#'#4T344 &($
 $
 
 )"1"-*=LL	 
  
  +=LM	 
  
  +=LN	 
  
  +=LM	 
  
  +?LMMN	 
  
 j(@@KKz)AACC$8yyyIII#':#C#C$*$
 $
 
L $8yyyIII#':#C#C$*$
 $
 
L $8yyyIII#':#C#C$*$
 $
 
L $81vFFF#':#5#I#IM	$
 $
 
L (}EE(}EE +!,		 	 	 	 	 	 	 	 	 	 	 	 
	8 , 	     "1	
 	
2 &'A.*  
 
 
 
 
r   r   r   r   mQ_qdlr   mK_kdlr   mV_dklr   mO_qdlmLSEr   r   r   r   r   r   c                 |P   t           j                            t           j                                                  }t           j                                        \  }}}|| j        k    rt           j        j                            |           t           j        j                            |           t           j        j                            |           t           j        j                            |	           t          j
                    }|                    | j                  }t          j                            | j        t#          t%          | j        g                    t#          t%          | j        g                    | j        |j                                                                                  \  }}t          j                            | j        t#          t%          | j        g                    t#          t%          | j        g                    | j        |j                                                                                  \  } }!t          j                            | j        t#          t%          | j        g                    t#          | j        t%          | j                  z            |j                                                                                  \  }"}#t          j                            | j        t#          t%          | j        g                    t#          | j        t%          | j                   z            |j!                                                                                  \  }$}%t          j"                            | j#        t#          | j        t%          | j                  z            t#          | j        t%          | j$                  z            |j%                                                                                  \  }&}'t          j"                            | j#        t#          | j        t%          | j                   z            t#          | j        t%          | j$                  z            |j&                                                                                  \  }(})t          j"                            | j'        t#          | j        t%          | j$                  z            t#          | j        t%          | j(        g          z            |j)                                                                                  \  }*}+t          j                            | j*        t#          t%          | j        g                    t#          | j        t%          | j$                  z            |j+                                                                                  \  },}-t          j"                            dt#          | j        t%          | j                  z            t#          | j        t%          | j                   z            |j,                                                                                  \  }.}/|j-                                        }0|| j.        k    rKt           j        /                    |0| j        t%          g | j        | j         | j$        R           z             t           j        0                                 |j1        2                    |j3        |j4                  }1|j5        2                    |j3        |j4                  }2t          j6        |2j7        |j4                  }3t          j8        |3|j3                  }4|j9        2                    |j3        |j4                  }5|:                    d          }6|:                    d          }7|6;                    |1          }8|6<                    |2          }9|7<                    |4          }:|6=                    | j>        d         | j>        d         f          };|6?                    |;          }<|7=                    | j@        d         | j@        d         f          }=|7?                    |=          }>t          j8        |<j7        | jA        z   |<jB                  }?t          j8        |<j7        | jC        z   |<jB                  }@t          j8        |>j7        | jD        z   |>jB                  }At          j8        |>j7        | jE        z   |>jB                  }Bt          j8        |<j7        |j3                  }C|7;                    |C          d         }Dt          j8        |Dj7        | jF        jG        | jH        jG        z  | jI        z  z   |DjB                  }Et          j8        |Dj7        | jF        jG        | jH        jG        z  | jJ        z  z   |DjB                  }Ft           j        K                    | jL        | jM                   || j.        k    r$t           j        N                    | jO                   || j        k    rt           j        N                    | jO                   t          jQ        |t           j        R                                t           j        S                                          }G|GT                                }H|HjU        rl|HjV        }I|Id         d         }Jd	}Kt          d          }L|jX        d         }Mt          jZ        |d
u          rI||J         }L||Jdz            |Lz
  }Mt          j[        \                    | j]        d         |Id         |M           }K|Ks|}N|}O|}P|jX        d         }Q|I}R|I}St          jZ        |d
u          rY|jX        d         |Mz
  dd|L|Mz   ff}Tt          j^        |T|          }N|Id         |Id         |Id         d         t          d          ff}Rt          jZ        |d
u          r||J         }U||Jdz            |Uz
  }Q|jX        d         |Qz
  dd|U|Qz   ff}Vd|jX        d         |Qz
  d|U|Qz   ff}Wt          j^        |V|          }Ot          j^        |W|          }P|Id         |Id         |Id         d         t          d          ff}St          j_        |Nt          j`        | j>        ddg                    }X|6a                    |X          }Yt           j        j        b                    |dt          jc        d          t          jd        |1dd          t          jd        |Ydd                    \  }Z}[|[d
d
d|Rd         f         }\t          j_        |Ot          j`        | j>        ddg                    }]|6e                    |]          }^t           j        j        b                    |dt          jc        d          t          jd        |2dd          t          jd        |^dd                    \  }_}`|`d
d
d|Sd         f         }at          j_        |Pt          j`        | j@        ddg                    }b|7e                    |b          }ct           j        j        b                    |dt          jc        d          t          jd        |4dd          t          jd        |cdd                    \  }d}e|ed
dd
|Sd         f         }fd|Rd         z  }g|f                                }ht          jg        ||\d
|gf         |Zd
|hjh        f         |hjK                   t          ji        j                    | jk        |I| j]        |M|Q|          }i|i}j| f                                }kt          jg        ||ad
|jf         |_d
|kjh        f         |kjK                   |gdz   }l|f                                }mt          jg        ||\d
|lf         |Zd
|mjh        f         |mjK                   | f                                }nt          jg        ||fd
|jf         |dd
|njh        f         |njK                   |jdz  }jt          ji        l                    | jk        |I| j]        |M|Q||          dz
  }ot          jm        d|odd          D ]}p| f                                }kt          jg        ||ad
|jf         |_d
|kjh        f         |kjK                   | f                                }nt          jg        ||fd
|jf         |dd
|njh        f         |njK                   |jdz  }j|Gn                                 |Go                                }H|HjU        l|| j        k    	rt           j        N                    | jO                   t          | jp                  }qt           j        q                    |q|jr                   t           j        K                    | js        | j                   t          jQ        |t           j        R                                t           j        S                                          }G|GT                                }H|HjU        r.|HjV        }I|Id         d         }Jd	}K|jX        d         }Mt          jZ        |d
u          rI||J         }L||Jdz            |Lz
  }Mt          j[        \                    | j]        d         |Id         |M           }K|Ksx|jX        d         }Qt          jZ        |d
u          r||J         }U||Jdz            |Uz
  }Q|t                                }h|8d
d
d
|hjh        f         }r|!t                                }k|9d
d
d
|kjh        f         }s|"f                                }tt          ju        |rdg          }ut          jm        |ud          D ]T}vd
d
|vf}w|v                    t          jx        jy        |vdk               t          jz        ||?|r|w         |s|w         |?           U|t{                                 |t                                }m|8d
d
d
|mjh        f         }x|$f                                }yt          ju        |xdg          }ut          jm        |ud          D ]T}vd
d
|vf}w|v                    t          jx        jy        |vdk               t          jz        ||@|x|w         |s|w         |@           U|y{                                 |k|                                 |!t                                }n|:d
d
d
|njh        f         }z|,f                                }{|"f                                }tt          ju        |Edg          }ut          jm        |ud          D ]T}vd
d
|vf}w|v                    t          jx        jy        |vdk               t          jz        ||A|E|w         |z|w         |A           U|{{                                 t          ji        l                    | jk        |I| j]        |M|Q||          dz
  }od	}|t          jm        d|odd          D ]}p|!t                                }k|9d
d
d
|kjh        f         }}t          ju        |rdg          }~t          jm        |~d          D ]T}vd
d
|vf}w|v                    t          jx        jy        |vdk               t          jz        ||?|r|w         |}|w         |?           U|t{                                 |,f                                }|$f                                }yt          ju        |Edg          }~t          jm        |~d          D ]R}vd
d
|vf}w|v                    t          jx        jy        ||           t          jz        ||B|F|w         |z|w         |B           d}|S|{                                 |n|                                 t          ju        |xdg          }~t          jm        |~d          D ]T}vd
d
|vf}w|v                    t          jx        jy        |vdk               t          jz        ||@|x|w         |}|w         |@           U|y{                                 |k|                                 |!t                                }n|:d
d
d
|njh        f         }z|,f                                }{|"f                                }tt          ju        |Edg          }~t          jm        |~d          D ]P}vd
d
|vf}w|v                    t          jx        jy        d           t          jz        ||A|E|w         |z|w         |A           Q|{{                                 |h|                                 |m|                                 |,f                                }|$f                                }yt          ju        |Fdg          }ut          jm        |ud          D ]R}vd
d
|vf}w|v                    t          jx        jy        ||           t          jz        ||B|F|w         |z|w         |B           d}|S|{                                 |n|                                 |t{                                 |y{                                 |Gn                                 |Go                                }H|HjU        .t           j        }                                 t           j        ~                    |0d           t          | jp                  }qt           j                            t           d|jr                  }t           j                            ||q           || j(        k    rt           j        N                    | jO                   t          jQ        |t           j        R                                t           j        S                                          }G|GT                                }H|HjU        r|HjV        }I|Id         d         }Jd	}Kt          d          }L|jX        d         }Mt          jZ        |d
u          rI||J         }L||Jdz            |Lz
  }Mt          j[        \                    | j]        d         |Id         |M           }K|Ks4|I}|
}t          jZ        |d
u          rL|jX        d         |Mz
  dd|L|Mz   ff}t          j^        ||          }|Id         |Id         |Id         d         dff}d|d         z  }|dz   }t          j_        |t          j`        | j@        ddg                    }|d
d
d
d|d         f         }t           j        j        b                    |	dt          jc        d          t          jd        |5dd          t          jd        |dd                    \  }}|+t                                }{t          jg        |	|d         |d
|f                    t           j                                         |+t                                }t          jg        |	|d         |d
|f                    t           j                                         t           j                            dd           |{|                                 t           j                            dd           ||                                 |Gn                                 |Go                                }H|HjU        || j         d         k     r~t           j                            | j                   |                     d|jX        d         |jX        d         ||||6|<|?|||#|&|/|.|           t           j                            |0           || j$        d         k     r|| j         d         k    r~t           j                            | j                   |                     d|jX        d         |jX        d         ||||6|<|@|||%|(|/|.|           t           j                            |0           || j$        d         k    r|| j        k     rt           j        N                    | j                   t          j        | j>        d         | j>        d         f          }|6                    |          }t          j        |<jB        t          jc        d                    }t          j8        |<j7        | j        z   |          }t          j8        |<j7        | j        z   |          }t          j        |jB        t          jc        d                    }t          j8        |j7        |          }t          j        t          jg                            t          jg                            d                    | jF                  }t          j        ||          }|| j        t%          | j$                  z  z  }|:                    |          }|                    |          }|                    |          }|                    |          }t          jQ        |t           j        R                                t           j        S                                          }G|GT                                }H|HjU        r|HjV        }I|I}|Id         d         }J|jX        d         }Qd	}Kt          d          }L|jX        d         }Mt          jZ        |d
u          rh||J         }L||Jdz            |Lz
  }M|Id         |Id         |Id         d         dff}t          j[        \                    | j]        d         |Id         |M           }K|Ks|Id         | j]        d         z  |d         d         z   }t          jZ        |d
u          r||J         }U||Jdz            |Uz
  }Q|'t                                }||                                 |)t                                }t          ji        l                    | jk        |I| j]        |M|Q||          dz
  }ot          jm        d|odd          D ]}p|'t                                }t          j        |jX        | jF                  }t          jg        |||           ||d         |d         z
  z  }t           j                            |d          }|-t                                }{|                     |7|A|           ||                                 t           j                                         |{|                                 |)t                                }t          jg        |||           ||d         |d         z
  z  }t           j                            |d          }|-t                                }|                     |7|B|           ||                                 t           j                                         ||                                 ||                                 |'t                                }t          j        |jX        | jF                  }t          jg        |||           t           j                                         ||                                 |-t                                }{|*f                                }|                     |7|A||||L|M||||d         z  |5d                    |{|                                 |{                                 |)t                                }t          jg        |||           t           j                                         ||                                 |-t                                }|*f                                }|| j>        d         z  }|                     |7|B||||L|M||||d         z  |5d                    ||                                 |{                                 |Gn                                 |Go                                }H|HjU        t           j                            |0           d
S )a  The device kernel implementation of the Fused Multi-Head Attention.

        This kernel coordinates multiple specialized warps to perform different phases of the FMHA computation:
        1. Load warp: Loads Q, K, V data from global memory to shared memory using TMA
        2. MMA warp: Performs matrix multiplications (Q*K^T and P*V)
        3. Softmax warps: Compute softmax normalization on attention scores
        4. Correction warps: Apply adjustments to intermediate results
        5. Epilogue warp: Handles final output transformation and storage

        The kernel implements a complex pipeline with overlapping computation and memory operations,
        using tensor memory access (TMA) for efficient data loading, warp specialization for different
        computation phases, and optional attention masking.

        :param qk_tiled_mma: Tiled MMA for Q*K^T
        :type qk_tiled_mma: cute.TiledMma
        :param pv_tiled_mma: Tiled MMA for P*V
        :type pv_tiled_mma: cute.TiledMma
        :param tma_atom_q: TMA copy atom for query tensor
        :type tma_atom_q: cute.CopyAtom
        :param mQ_qdl: Partitioned query tensor
        :type mQ_qdl: cute.Tensor
        :param tma_atom_k: TMA copy atom for key tensor
        :type tma_atom_k: cute.CopyAtom
        :param mK_kdl: Partitioned key tensor
        :type mK_kdl: cute.Tensor
        :param tma_atom_v: TMA copy atom for value tensor
        :type tma_atom_v: cute.CopyAtom
        :param mV_dkl: Partitioned value tensor
        :type mV_dkl: cute.Tensor
        :param tma_atom_o: TMA copy atom for output tensor
        :type tma_atom_o: cute.CopyAtom
        :param mO_qdl: Partitioned output tensor
        :type mO_qdl: cute.Tensor
        :param scale_softmax_log2: The log2 scale factor for softmax
        :type scale_softmax_log2: Float32
        :param scale_output: The scale factor for the output
        :type scale_output: Float32
        :param window_size_left: Left-side sliding window size for attention masking.
        :type window_size_left: Optional[Int32]
        :param window_size_right: Right-side sliding window size for attention masking.
        :type window_size_right: Optional[Int32]
        :param q_smem_layout_staged: Shared memory layout for query tensor
        :type q_smem_layout_staged: cute.ComposedLayout
        :param k_smem_layout_staged: Shared memory layout for key tensor
        :type k_smem_layout_staged: cute.ComposedLayout
        :param p_tmem_layout_staged: Tensor memory layout for probability matrix
        :type p_tmem_layout_staged: cute.ComposedLayout
        :param v_smem_layout_staged: Shared memory layout for value tensor
        :type v_smem_layout_staged: cute.ComposedLayout
        :param o_smem_layout_staged: Shared memory layout for output tensor
        :type o_smem_layout_staged: cute.ComposedLayout
        :param tile_sched_params: Scheduling parameters for work distribution
        :type tile_sched_params: fmha_utils.FmhaStaticTileSchedulerParams
        )
num_stagesproducer_groupconsumer_grouptx_countbarrier_storage)r  r  r  r  r   )swizzler   )NNNr   )
barrier_idnumber_of_threadsr   FNrt   r   )tma_bar_ptrunrollT)unroll_full   )	alignmentptr_to_buffer_holding_addr)Nr   )Nr   )read)stageseqlen_kseqlen_qrh   ri   rl   
qk_thr_mmatStStStSiro   rp   mma_si_consumersi_corr_producers0_s1_sequence_consumers0_s1_sequence_producerr   r-   r   fastmath)NNr   )NNr   )r   archmake_warp_uniformwarp_idx
thread_idxr:   r   r   prefetch_descriptorr   SmemAllocatorallocater   r   PipelineTmaUmmacreaterU   r   r?   r9   r   rx   data_ptrmake_participantsrX   r   ry   PipelineUmmaAsyncr\   r>   r6   rz   r7   r{   PipelineAsyncrZ   r8   r|   r}   r]   r;   r   r[   r   r~   r   r<   mbarrier_initmbarrier_init_fencer   
get_tensorouterinnerr   
recast_ptriteratorr   r   	get_slicemake_fragment_Amake_fragment_Bpartition_shape_Cr3   make_fragment_Cr4   rC   layoutrD   rE   rF   r   rW   rV   rG   rH   barrierrA   r@   warpgroup_reg_deallocrM   r   !create_fmha_static_tile_scheduler	block_idxgrid_diminitial_work_tile_infois_valid_tiletile_idxr   r   r   r   FmhaStaticTileSchedulercheck_valid_work_for_seqlen_qr2   domain_offsetflat_divider   partition_Atma_partitionr   group_modespartition_Bacquire_and_advancecopyindex	FusedMaskget_trip_startr   get_trip_countrangeadvance_to_next_workget_current_workr=   
alloc_tmemr   rB   wait_and_advancer   setr   Field
ACCUMULATEgemmcommitreleaserelinquish_tmem_alloc_permitmbarrier_waitretrieve_tmem_ptrr
   dealloc_tmemcp_async_bulk_commit_groupcp_async_bulk_wait_groupwarpgroup_reg_allocrK   softmaxmbarrier_arriverL   make_identity_tensorpartition_CcompositionrI   rJ   make_copy_atom
Ld32x32bOp
Repetitionmake_tmem_copypartition_Spartition_Dmake_rmem_tensormathexp2correction_rescalefence_view_async_tmem_storefence_view_async_tmem_loadcorrection_epilog)rP   r   r   r   r   r   r   r   r   r   r   rh   ri   r  rl   rm   rn   ro   rp   r   r   r   r   r   r   r"  tidx_smemstorageload_q_producerload_q_consumerload_kv_producerload_kv_consumermma_s0_producermma_s0_consumermma_s1_producermma_s1_consumers0_corr_producers0_corr_consumers1_corr_producers1_corr_consumercorr_epi_producercorr_epi_consumermma_corr_producermma_corr_consumerr  r  r   r   r   sV_ptrsVr   r  
pv_thr_mmatSrQtSrKtOrVqk_acc_shaper  pv_acc_shapetOtOtStS0tStS1tOtO0tOtO1tPtOrPtOrP0tOrP1
tile_sched	work_tilecurr_block_coordbatch_coordcontinue_cond
cuseqlen_qr  mQ_qdl_mK_kdl_mV_dkl_r  curr_block_coord_qcurr_block_coord_kvlogical_offset_mQ
cuseqlen_klogical_offset_mKlogical_offset_mVgQ_qdltSgQ_qdltQsQtQgQ_qdltQgQgK_kdltSgK_kdltKsKtKgK_kdltKgKgV_dkltSgV_dkltVsVtVgV_dkltVgVq0_coord	q0_handleseqlen_kv_loop_startkv_coordk_handleq1_coord	q1_handlev_handleseqlen_kv_loop_stepsir=   tSrQ0tSrK0	s0_handlenum_kphases
kphase_idxkphase_coordtSrQ1	s1_handletOrVi	o0_handlepv_whether_acctSrKiinner_num_kphases	o1_handletmem_ptrcurr_block_coord_omO_qdl_logical_offset_mOo0_coordo1_coordgO_qdlgOtOsOtOgOcStScStStS_vec_layout	tStS_vec0	tStS_vec1tScS_vec_layouttScS_vectmem_load_v_atomtiled_tmem_load_vecr#  thr_tmem_load_vectTMEM_LOAD_VECtS0tTMEM_LOAD_VECtS1tTMEM_LOAD_VECcScurr_block_coord_lserow_idxvec0_handlevec1_handletTMEM_LOAD_VECrSscale_scaleo0_final_handleo1_final_handles                                                                                                                                                                    r   r   z.BlackwellFusedMultiHeadAttentionForward.kernel8  s'   d 9..ty/A/A/C/CDDY))++
a
 t(((J22:>>>J22:>>>J22:>>>J22:>>> "$$-- 344+3+C+J+J|8d>O=P9Q9QRR8d>N=O9P9PQQ*#3<<>> ,K ,
 ,
 


 	) .6-E-L-L}8d>O=P9Q9QRR8d>N=O9P9PQQ+#4==?? .M .
 .
 


 	+* ,4+E+L+L-8d>N=O9P9PQQ89NQTUYUkQlQl9lmm#3<<>>	 ,M ,
 ,

 


 	) ,4+E+L+L-8d>N=O9P9PQQ89NQTUYUkQlQl9lmm#3<<>>	 ,M ,
 ,

 


 	) .6-C-J-J.89NQTUYUkQlQl9lmm89NQTUYUmQnQn9noo#4==??	 .K .
 .

 


 	+* .6-C-J-J.89NQTUYUkQlQl9lmm89NQTUYUmQnQn9noo#4==??	 .K .
 .

 


 	+* 08/E/L/L~89NQTUYUmQnQn9noo89NQTVZVkUlQmQm9mnn#5>>@@	 0M 0
 0

 


 	-, 08/I/P/P*8d>N=O9P9PQQ89NQTUYUmQnQn9noo#5>>@@	 0Q 0
 0

 


 	-, <D;Q;X;X89NQTUYUkQlQl9lmm89NQTUYUkQlQl9lmm#;DDFF	 <Y <
 <

 


 	9!8 !( = F F H H t)))I##%%// 1  
 
 
 		%%''' Z""#7#=G[Ga"bbZ""#7#=G[Ga"bb .B.HIIf&:&@AAZ""#7#=G[Ga"bb!++A..
!++A..
))"--))"--))"--!33T5Fq5I4K\]^K_4`aa)),77!33T5Fq5I4K\]^K_4`aa)),77 1D!DdkRR 1D!DdkRR 1D!DdkRR 1D!DdkRRdm-A-GHH))"--.AB MD-3t|7IIDL___K
 
  MD-3t|7IIDL___K
 
 		+"2 	 	
 	
 	
 t)))I++D,?@@@
 t(((I++D,?@@@#EFWY]YbYlYlYnYnptpy  qC  qC  qE  qE  F  FJ"99;;I) j:#,#5 .q1!4 %"1XX
!<?%l$&>?? !-k!:J+K!O<zIH(2(J(h(hq)(+ ) ) %M
 % X&$G$G$G%|AH)9&*:'),d*BCC "LOh6
X 56-)
 #'"45F"O"O,Q/,Q/-a03U1XX>.* ),d*BCC %1+%>
#/a#@:#M"LOh6
X 56-) "LOh6
X 56-)
 #'"45F"O"O"&"45F"O"O,Q/,Q/-a03U1XX>/+ "-gt{4CT\]_`[a7b7b7bccF)55f==H%)Z%7%E%E"(++(Q22(1a88& &ND( $D$3Ea3H$HID!-gt{4CT\]_`[a7b7b7bccF)55f==H%)Z%7%E%E"(++(Q22(1a88& &ND( $D$3Fq3I$IJD!-gt{4CT\]_`[a7b7b7bccF)55f==H%)Z%7%E%E"(++(Q22(1a88& &ND( $D!T3Fq3I$IJD  !#5a#88H / C C E EII"T8^,T9?23$-$5	    ,6+?+N+N(  (, ,(  4H/CCEEHI"T8^,T8>12$,$4	     (!|H / C C E EII"T8^,T9?23$-$5	     0CCEEHI"T8^,T8>12$,$4	    MH #,;; N, N$$,-  	 ) %]1.BAaPPP & &#3#G#G#I#I	& x0 x~!56(0(8	    $4#G#G#I#I	& x0 x~!56(0(8	    !A //111&7799	U ) j:` t'''I++D,?@@@ $D$899OI  '2JKKKI6"&"7     $EFWY]YbYlYlYnYnptpy  qC  qC  qE  qE  F  FJ"99;;I) m:#,#5 .q1!4 %!<?%l$&>?? !-k!:J+K!O<zIH(2(J(h(hq)(+ ) ) %M % ['%|AH),d*BCC N%1+%>
#/a#@:#M !0 @ @ B BI tT9?!BCE/@@BBH tT8>!ABE / C C E EI"&)E"<"<"<K&-mKT&R&R&R 	 	
(,dJ'?$(()A:QR?SSS	(!!,/!,/!    $$&&&
 !0 @ @ B BI tT9?!BCE / C C E EI"&)E"<"<"<K&-mKT&R&R&R 	 	
(,dJ'?$(()A:QR?SSS	(!!,/!,/!    $$&&&$$&&&  0@@BBH tT8>!ABE !2 E E G GI !0 C C E EI"&)E"<"<"<K&-mKT&R&R&R 	 	
(,dJ'?$(()A:QR?SSS	(!!,/!,/!    $$&&& #,;; N, N$$,-  	 ) &+N$]1.BAaPPP T+ T+ $4#D#D#F#F $T4x~%E F,0Ie1#,F,F,F)*1-8IW[*\*\*\ 	 	J,0$
+CL(,,W]-EzUVWWW I , % %l 3 %l 3 %    "((***
 %6$I$I$K$K	$3$G$G$I$I	,0Ie1#,F,F,F)*1-8IW[*\*\*\ 
2 
2J,0$
+CL(,,W]-E~VVV I , % %l 3 %l 3 %   .2NN!((*** ((***
 -1Ie1#,F,F,F)*1-8IW[*\*\*\ 	 	J,0$
+CL(,,W]-EzUVWWW I , % %l 3 %l 3 %    "((*** ((***
 $4#D#D#F#F $T4x~%E F$5$I$I$K$K	$3$G$G$I$I	,0Ie1#,F,F,F)*1-8IW[*\*\*\ 	 	J,0$
+CL(,,W]-EtLLL I , % %l 3 %l 3 %    "((****
 %%'''%%''' !2 E E G GI / C C E EI"&)E"<"<"<K&-mKT&R&R&R 
. 
.
(,dJ'?$(()A>RRR	(!!,/!,/!   *.$$&&&$$&&& $$&&&$$&&& //111&7799	[ ) m:b I22444I##$91===#D$899Oy22+2+C 3  H
 I""8_===
 t,,,I++D,?@@@#EFWY]YbYlYlYnYnptpy  qC  qC  qE  qE  F  FJ"99;;I) C:#,#5 .q1!4 %"1XX
!<?%l$&>?? !-k!:J+K!O<zIH(2(J(h(hq)(+ ) ) %M
 % 0()9&$G),d*BCC #M!,x7
X 56-)
 #'"45F"P"P,Q/,Q/-a03Q7.*  !#5a#88H'!|H!-gt{4CT\]_`[a7b7b7bccFdD!5G5J JKB!%!3!A!A"(++(Q22(Q22" "JD$ !2 B B D DIIj$w-dHn9MNNNI88::: !2 B B D DIIj$w-dHn9MNNNI88::: I66qt6DDD%%'''I66qt6DDD%%''' //111&7799	G ) C:R d,Q///I))$*?@@@LLaa))#5%!1"3 /!1(?(?"3!    $ I%%&;<<<
 d.q111h$BXYZB[6[6[I))$*?@@@LLaa))#5%!1"3 /!1(?(?"3!    $ I%%&;<<<
 t/222x$BR7R7RI++D,DEEE*D,=a,@$BSTUBV+WXXB))"--D".t{D<LX<V<VWWO(9N)NP_``I(9N)NP_``I".t{D<LX<V<VWWO'GGH#2''(?(?(B(BCC!   
 #*"89I9"U"U!6T=U9V9V!VWJ 3 = =j I I 1 = =i H H 1 = =i H H0<<XFF#EFWY]YbYlYlYnYnptpy  qC  qC  qE  qE  F  FJ"99;;I) @:#,#5 '7$.q1!4!<? %"1XX
!<?%l$&>?? !-k!:J+K!O<zIH )+(+)!,Q/3,(
 )3(J(h(hq)(+ ) ) %M % e-.q1DN14EEHXYZH[\]H^^G),d*BCC N%1+%>
#/a#@:#M"2"C"C"E"EK'')))"2"C"C"E"EK #,;; N, N$$,-  	 ) %]1.BAaPPP , ,&6&G&G&I&I+/+@AQAWY]Yj+k+k(	"57HJZ[[[!37G7JM]^_M`7`!a $	v E E$5$F$F$H$H	//
E5III#++---	==???!))+++ '7&G&G&I&I	"57HJZ[[[!37G7JM]^_M`7`!a $	v E E$5$F$F$H$H	//
E5III#++---	==???!))++++''))) #3"C"C"E"EK'+'<=M=SUYUf'g'g$I13DFVWWWI88:::''))) 1 B B D DI&7&K&K&M&MO**"(" ,%$'7'::=)   %%'''#**,,, #3"C"C"E"EKI13DFVWWWI88:::''))) 1 B B D DI&7&K&K&M&MOt033G**"(" ,%$'7'::=)   %%'''#**,,,//111&7799	A ) @:D I%%&;<<<r   r  need_apply_mask	iter_args
value_argspipeline_args	atom_argstensor_argsreturnc           	         |\  }}	}
}|\  }}}}}|\  }}}}|\  }}}}}}}|\  }}}| j         d         t          j        z  | j        j        z  }|                    |          } t          j        | j        t          j        d                    }!t          j	        | j
        |!          }"t          j        | j        t          j        d|f                    }#t          j	        | j
        |#          }$|                    |           }%|                    |"          }&|                    |$          }'|                                }(t          j        |%j        | j                  })t          j        |||)           |r| j        t&          j        j        k    r,t&          j                            | j        |)|%||||           na||z  }*|%d         d         }+|%d         d         },t1          |dz
  |+dz   |*z  dz
  |,t          j        |%          z             }-t1          |dz
  |+t
          j                                        d         dz  z
  |*z  dz
            }.t1          |dz
  |+dz   t
          j                                        d         dz  z
  |*z  dz
            }/|.| d         d         z
  dk     r1t9          j        dd          D ]}0|0|,z   |-k    rt          j         |)|0<   t9          j        dt          j        |)                    D ]}0|0|,z   |-k    rt          j         |)|0<   |	}1|)                                                     t
          j!        j"        |	d          }	|	}2|	t8          j        j         k    rd}2t          j        |&j        | j                  }3|1|3d<   |2|3d<   t          j        ||3|           t
          j        #                                 |$                                 t          j        |'j        | j                  }4t          j	        t          j%        |4j
        | j&                  |)j                  }5|}6d|2z
  |6z  }7t9          j'        |dk              r|(                                }8n|                                }9d	}:t          j        |)          |:z  };t          j)        |)t          j        |;                    }<t          j)        |5t          j        |;                    }=t9          j        |:          D ].}>t9          j        dt          j        |<dg
          d          D ]}?t
          j        *                    |<|?|>f         |<|?dz   |>f         f|6|6f|7|7f          \  |<|?|>f<   |<|?dz   |>f<   t
          j+        ,                    |<|?|>f         d          |<|?|>f<   t
          j+        ,                    |<|?dz   |>f         d          |<|?dz   |>f<   |<d|>f                                         }@|=d|>f         -                    |@.                    | j&                             0t9          j'        |dk              r|8$                                 n|9/                                 t          j        ||4|           t
          j        #                                 |(/                                 |(                                }|6|1|2z
  z  }At
          j+        ,                    |Ad          dz  }B|
|Bz  }
|
|
f}Cd}Dd}Ed}Fd	}Gt          j        |)          |Gz  };t          j)        |)t          j        |;                    }<t9          j0        dt          j        |<dg
          d          D ]}>t
          j        1                    |C|<|>df         |<|>dz   df         f          }Ct
          j        1                    |D|<|>df         |<|>dz   df         f          }Dt
          j        1                    |E|<|>df         |<|>dz   df         f          }Et
          j        1                    |F|<|>df         |<|>dz   df         f          }Ft
          j        1                    |C|D          }Ct
          j        1                    |E|F          }Et
          j        1                    |C|E          }C|Cd         |Cd         z   }
|	|
|||||fS )a  Perform a single step of the softmax computation on a block of attention scores.

        This method processes one block of the attention matrix, computing numerically stable
        softmax by first finding the row maximum, subtracting it from all elements, applying
        exponential function, and then normalizing by the sum of exponentials. It also handles
        optional masking of attention scores.

        The method involves several key operations:
        1. Loading attention scores from tensor memory
        2. Applying optional masking based on position
        3. Computing row-wise maximum values for numerical stability
        4. Transforming scores using exp2(x*scale - max*scale)
        5. Computing row sums for normalization
        6. Coordinating pipeline synchronization between different processing stages

        :param stage: Processing stage (0 for first half, 1 for second half)
        :type stage: int
        :param need_apply_mask: Whether to apply attention masking
        :type need_apply_mask: bool
        :param iter_args: Tuple containing the counting tensor, row_max, row_sum, and vector buffer's handle for current iteration
        :type iter_args: tuple
        :param value_args: Tuple containing seqlen_k, seqlen_q, and scale_softmax_log2
        :type value_args: tuple
        :param pipeline_args: Tuple containing pipeline related arguments for MMA, correction, and sequence synchronization
        :type pipeline_args: tuple
        :param atom_args: Tuple containing mma & copy atoms
        :type atom_args: tuple
        :param tensor_args: Tuple containing softmax related tensors
        :type tensor_args: tuple
        :param fused_mask: Compute trip counts and apply masking for attention blocks
        :type fused_mask: fmha_utils.FusedMask
        :return: Updated state values (row_max, row_sum, and pipeline related arguments)
        :rtype: tuple
        r   r  r-   r   r,   @           )dtyper    rt   r   Tr  Ng      ?)r  r  r   )2r3   r
   rW   r   re  r   rf  r9  r   r   r3  rl  rk  rT  rm  r   r   rK  r   r   MaskTypeCOMPRESSED_CAUSAL_MASKrM  
apply_maskminr   r   r#  r   rP  infloadreduceReductionOpMAXrq  rY  r2  rV   r   rJ  logical_dividefma_packed_f32x2rn  ro  storetorZ  range_constexpradd_packed_f32x2)HrP   r  r  r  r  r  r  r  r  row_maxrow_sumvec_i_handler  r  rl   ro   rp   r  r  r  r  r  tiled_tmem_loadtiled_tmem_storetiled_tmem_store_vecthr_tmem_loadthr_tmem_storethr_tmem_store_vectTMEM_LOADtStTMEM_STORE_VECtStTMEM_STOREtS_x4tilePlikeFP32r  r  r  tScS_P_layouttScS_PtTMEM_LOADcStTMEM_STORE_VECcStTMEM_STOREcS	si_handletTMEM_LOADrScompression_factorindex_qindex_k0largestUnmaskedKsmallestUnmaskedKInWarplargestUnmaskedKInWarpr  old_row_maxrow_max_safetTMEM_STORE_VECrStTMEM_STORErS_x4tTMEM_STORErS_x4_er  minus_row_max_scalesequence_producer_handlesequence_consumer_handlefrg_cntfrg_tiletTMEM_LOADrS_frgtTMEM_STORErS_x4_e_frgjr   s_vec
acc_scale_	acc_scalelocal_row_sum_0local_row_sum_1local_row_sum_2local_row_sum_3reduction_unrollsH                                                                           r   softmax_stepz4BlackwellFusedMultiHeadAttentionForward.softmax_step  s"	   j .7*GWlV`S(.0@BS 	
## 	
  		
 )!,=@RR%%b))*4;8H8R8RSS#DM?CC(d6F]G[6\6\]]!$-??$0066.::8DD&226:: $4466	,\-?ARSS	/<>>> (	7~!4!KKK$//N  $%    &.%9"&q/!,'?1-#&qLq[%77!;ty666$ $  +.qL	 4 4 6 6q 9B >?DVVYZZ+ +'
 *-qLr\TY%9%9%;%;A%>%CDI[[^__* *&
 +T!WQZ7"<<$]1b11 ; ;x<*:::/6{lLO r49\+B+BCC 7 7A8|&666+2;,Q##%%,,T-=-A7ANNw****L 12C2I4K\]]*!+!	&(9;LMMM	--///01DdFWXX!-O,5T\JJJ
 

 #"\1U: eqj)) 	R'>'R'R'T'T$$'>'O'O'Q'Q$9\**g5.|T=Mh=W=WXX!%!45GIYZbIcIc!d!dw'' 	J 	JA]1di0@s&K&K&KQOO 
g 
gEIYE_E_%ad+-=a!eQh-GHEN(*=>F FB A&(8Q(B *.8HA8NY])^)^ A&-1Y^^<LQQRUTUX<Vae^-f-f Q**$T1W-2244E"47+11%((4<2H2HIIIIeqj)) 	/$++----$,,...	"$46FGGG	--///';;==kL89
INN:N==C	9"G,$$$9\**.>>.|T=Mh=W=WXX(DI6FaS,Q,Q,QSTUU 	@ 	@A"i88K[\]_`\`Kacstuxyty{|t|c}J~O"i88K[\]_`\`Kacstuxyty{|t|c}J~O"i88K[\]_`\`Kacstuxyty{|t|c}J~O"i88K[\]_`\`Kacstuxyty{|t|c}J~OO)44_oVV)44_oVV)44_oVV!!$q'99 ##
 	
r   r  r  r  r  r  r  r  r  r  c           
         t           j                                        \  }}}|| j        |dk    rt	          | j                  nt	          | j                  z  z  }t          j        | j        d         | j        d         f          }| j        d         dz  | j	        j
        z  }|                    |          }t          j        |j        t          j        d                    }|dk    r| j        n| j        }t          j        |j        |z   |          }t          j        |j        t          j        d                    }t          j        |j        |          }t          j        |j        t          j        d|f                    }|dk    r| j        n| j        }t          j        |j        |z   |          }t          j        t,          j                            t,          j                            d                    | j                  }t-          j        ||	          } || j        |dk    rt	          | j                  nt	          | j                  z  z  }|                     |          }!|!                    |	          }"t          j        t,          j                            t,          j                            d                    | j                  }#t-          j        |#|          }$|$                    |          }%|%                    |          }&|%                    |          }'t          j        t,          j                            t,          j                            d                    | j                  }(t-          j        |(|          })|)                    |          }*|*                    |          }+tA          j!        |t           j        "                                t           j        #                                          },|,$                                }-|-j%        rP|-j&        }.|.d         d         }/|}0d}1tO          d          }2|}3tQ          j)        |du          rI||/         }2||/dz            |2z
  }3t@          j*        +                    | j,        d         |.d         |3           }1|1stQ          j)        |du          r||/         }4||/dz            |4z
  }0tZ          j.         }5d	}6|0|3||
|f}7|| |)|$|!|*|%f}8|"|&|+f}9|.d         | j,        d         z  || j        d         z  z   df}:t          j/        |:|          };|0                                }<t@          j1        2                    | j3        |.| j,        |3|0|
          }=t@          j1        4                    | j3        |.| j,        |3|0|
|          }>tQ          j5        |=|=|>z   dd
          D ]V}?t          j/        d|?| j        d         z  f|;          }@|@|5|6|<f}A||||f}B| 6                    |d|A|7|B|8|9          \  }5}6}<}}}}Wt@          j1        7                    | j3        |.| j,        |3|0|
|          }CtQ          j5        |=|>z   |=|>z   |Cz   dd
          D ]V}?t          j/        d|?| j        d         z  f|;          }@|@|5|6|<f}A||||f}B| 6                    |d|A|7|B|8|9          \  }5}6}<}}}}Wt@          j1        8                    | j3        |.| j,        |3|0|
|          }DtQ          j5        |=|>z   |Cz   |=|>z   |Cz   |Dz   dd
          D ]V}?t          j/        d|?| j        d         z  f|;          }@|@|5|6|<f}A||||f}B| 6                    |d|A|7|B|8|9          \  }5}6}<}}}}W|9                                }Et          j:        |'j;        | j                  }F|6d	k    rd}6|5tP          j-        j.         k    rd	}5|6|Fd<   |5|Fd<   t          j        |$|F|&           t           j        <                                 |<=                                 |>                                 |E?                                 |,@                                 |,A                                }-|-j%        NdS dS )a'
  Compute softmax on attention scores from QK matrix multiplication.

        This method handles the softmax computation for either the first or second half of the
        attention matrix, depending on the 'stage' parameter. It calculates row-wise maximum
        and sum values needed for stable softmax computation, applies optional masking, and
        transforms raw attention scores into probability distributions.

        The implementation uses specialized memory access patterns and efficient math operations
        for computing exp(x) using exp2 functions. It also coordinates pipeline
        synchronization between MMA, correction, and sequence processing stages.

        :param stage: Processing stage (0 for first half, 1 for second half of attention matrix)
        :type stage: int
        :param seqlen_k: Length of the key sequence
        :type seqlen_k: Int32
        :param seqlen_q: Length of the query sequence
        :type seqlen_q: Int32
        :param cum_seqlen_q: Cumulative sequence lengths for queries
        :type cum_seqlen_q: cute.Tensor | None
        :param cum_seqlen_k: Cumulative sequence lengths for keys
        :type cum_seqlen_k: cute.Tensor | None
        :param scale_softmax_log2: Log2 scale factor for softmax operation
        :type scale_softmax_log2: Float32
        :param qk_thr_mma: Thread MMA operation for QK matrix multiplication
        :type qk_thr_mma: cute.core.ThrMma
        :param tStS: Shared tensor for softmax input/output
        :type tStS: cute.Tensor
        :param tStSi: Input tensor containing attention scores
        :type tStSi: cute.Tensor
        :param window_size_left: Left-side sliding window size for attention masking.
        :type window_size_left: Optional[Int32]
        :param window_size_right: Right-side sliding window size for attention masking.
        :type window_size_right: Optional[Int32]
        :param mma_si_pipeline: Pipeline for synchronizing with MMA operations
        :type mma_si_pipeline: pipeline.PipelineAsync
        :param si_corr_pipeline: Pipeline for synchronizing with correction operations
        :type si_corr_pipeline: pipeline.PipelineAsync
        :param s0_s1_sequence_pipeline: Pipeline for synchronizing between stage 0 and 1
        :type s0_s1_sequence_pipeline: pipeline.PipelineAsync
        :param tile_sched_params: Parameters for tile scheduling
        :type tile_sched_params: fmha_utils.FmhaStaticTileSchedulerParams
        :param fused_mask: Compute trip counts and apply masking for attention blocks
        :type fused_mask: fmha_utils.FusedMask
        r   r   r,   r  r-   r   FNr  r  Tg      ?)Br   r   r#  r>   r?   r6   r7   rd  r3   r   rW   re  rf  r9  r   rI   rJ   r   r3  rG   rH   rg  r   rK  rh  ri  r   rj  r4  rk  
St32x32bOprl  r   r<  r=  r>  r?  r@  rA  r   r   r   rB  rC  r2   r
   r  rD  rJ  rM  rN  r   get_masked_leading_countrP  r8  get_unmasked_trip_countget_masked_trailing_countrT  rm  r   rq  rY  acquirerZ  rQ  rR  )GrP   r  r  r  rh   ri   rl   r  r  r  ro   rp   r  r  r  r  r   rt  ru  r#  cS_baser  r  r  tmem_vec_offsettStS_vecr  r  tStS_P_layouttmem_p_offsettStS_Ptmem_load_atomr  r  r  tmem_store_vec_atomr  r  r  r  tmem_store_atomr  r  r  r  r  r  r  	seqlen_k_r  r  	seqlen_q_r  r	  r
  r  r  r  logical_offsetr  r  start_countleading_mask_countr  cS_iterr  r  unmask_counttrailing_mask_countr  r%  sG                                                                          r   rb  z/BlackwellFusedMultiHeadAttentionForward.softmax  s}	   @ Y))++
aT2UZ^_U_U_c$:P6Q6Q6Qehimi  fA  fA  B  C
+T->q-A4CTUVCW,XYY)!,2T\5GG%%g..*4;8H8R8RSS38A::$//4CX#DMO$C_UU*4;8H8R8RSS#DM?CC(d6F]G[6\6\]]/4zz++t?R!$--"?OO,L##GL$;$;B$?$?@@
 
 "0GGT2UZ^_U_U_c$:P6Q6Q6Qehimi  fA  fA  B  C
'11*==$0077"1L##GL$;$;A$>$>??
 
  '56I8TT1;;JGG.::8DD.::8DD-L##GL$;$;B$?$?@@
 
 #1/6JJ)33J??)55f==ABSUYU^UhUhUjUjlplul~l~  mA  mA  B  B
5577	% F	6(1*1-a0K I!MqJ I!,d":;; )+6
(q9JF	$.$F$d$dN1%$Q'% % ! ! r$%l$&>?? K!-k!:J ,[1_ =
 JI";,&$%
 #$(!"&	 !%$ %Q'$.*;;edFWXYFZ>ZZ" '@@/CCEE(2AAN$N$  &0%9%R%RN$N$%& &" !{KBT4TVW`abbb  A"0!Q9J19M5M1NPRSSG!('7L II'(//	%M ))!"%!# $'(///  *3KKN$N$%    !"44"44|C	    A #0!Q9J19M5M1NPRSSG!('7L II'(//	%M ))!"%!# $'(/// '1&:&T&TN$N$%' '# !"44|C"44|CFYY	    A #0!Q9J19M5M1NPRSSG!('7L II'(//	%M ))!"%!# $'(/// ,<<>>	$($9:K:QSWSd$e$e!c>>!Gw2222!G'.!!$'.!!$	.0ACTUUU	55777##%%% ((***!!### ++---"3355IM % F	6 F	6 F	6 F	6 F	6r   thr_mmar  r  c                    | j         d         | j         d         f}t          j        |          }|                    |          }d}t          j        t
          j                            t
          j                            |                    | j	                  }t          j        t
          j        
                    t
          j                            |                    | j	                  }	t          j        |j        t          j        d|f                    }
t          j        |j        t          j        d|f                    }t          j        |j        |
          }t          j        |j        |          }t          j        ||          }t          j        |	|          }t          j                                        \  }}}|| j        t)          | j                  z  z  }|                    |          }|                    |          }|                    |          }|                    |          }|                    |          }t          j        |j        d|z  f| j	                  }t7          j        | j        d         |z            D ]>}|d|f         }t          j        |j        t          j        |j        d                             }t          j        |j        |          }t          j        |j        ||z  z   |j                  }t          j        |j        ||z  z   |j                  }t          j        |||           t7          j        dt          j        |          d          D ]A}t          j                            ||         ||dz            f||f          \  ||<   ||dz   <   Bt          j        |||           @dS )a  Rescale intermediate attention results based on softmax normalization factor.

        This method performs a crucial correction step in the attention computation pipeline.
        When processing attention in blocks, the softmax normalization factors may change
        as new blocks are processed. This method rescales previously computed partial
        output values to account for updated normalization factors.

        The implementation uses efficient tensor memory operations to:
        1. Load existing partial attention output from tensor memory
        2. Apply the scaling factor to all elements
        3. Store the rescaled results back to tensor memory

        :param thr_mma: Thread MMA operation for the computation
        :type thr_mma: cute.core.ThrMma
        :param tOtO: Tensor representing partial attention output to be rescaled
        :type tOtO: cute.Tensor
        :param scale: Scaling factor to apply to the partial results
        :type scale: Float32
        r   r   r  r-   r   N) r4   r   rd  re  rg  r   rK  rh  ri  r   r:  rf  r9  r   r   r3  rj  r   r#  r>   r?   r8   r4  rk  rl  rm  r   r   rP  r2   r   mul_packed_f32x2) rP   rP  r  r  pv_tiled_mma_shapecOtOcOcorr_tile_sizerE  rG  tOtO_i_layouttOcO_i_layouttOtO_itOcO_ir  r  rt  ru  r#  r  r  tTMEM_LOADtOtTMEM_LOADcOtTMEM_STOREtOtTMrOr  tTMrO_i_tTMrO_i_layouttTMrO_itTMEM_LOADtO_itTMEM_STOREtO_ir/  s                                    r   rp  z:BlackwellFusedMultiHeadAttentionForward.correction_rescale  sr   8 a a 
 &'9::""2&&,L##GL$;$;N$K$KLL
 
 -L##GL$;$;N$K$KLL
 

 (d6F^G\6]6]^^(d6F^G\6]6]^^!$-??!$-??!0HH"1/6JJY))++
aT2S9Q5R5RRS
'11*==)33J??$0088$0088&226::%|'93.;P&QSWSdeet~a0NBCC 	B 	BAT1W~H!-hot?OPUP[\]P^?_?_``N&x'8.IIG!-l.Ca.FX.XZfZmnnN".}/ENHZ/Z\i\pqqOIo~w???]1di&8&8!<<  -1Y-G-GQZQ0EN. .*
GAENN I&AAAA	B 	Br   Nr  r  r  	blk_coordr   c                 J   | j         d         | j         d         f}t          j        |          }d| j        j        z  }|                    |          }|                    |          }t          j        |t          j        d|f                    }t          j        |t          j        d|f                    }t          j        |t          j        d|f                    }t          j        	                                \  }}}|| j
        t          | j                  z  z  }| j        d         |f}t          j        | j         | j        | j        | j        |d          }t%          j        ||d                   }|                    |          }t          j        | j        | j        | j        |          }t          j        ||          }|                    |d                   }|                    |d                   }|                    |d                   }t3          j        | j        d	         |z            D ]:} |d
dd| f         }!|d
dd| f         }"t          j        |d
dd| f         j        | j                  }#t          j        ||!|#           t3          j        dt          j        |#          d	          D ]A}$t          j                             |#|$         |#|$dz            f|
|
f          \  |#|$<   |#|$dz   <   Bt          j        |#j        | j                  }%|#!                                }&|%"                    |&#                    | j                             t          j        ||%|"           <t3          j$        |d
u          rIt          j%        &                    |d         d          |	|d         z  z   }'||k     r|'|||z   |d	         f<   t          j        '                    t          j        j(        j)        t          j        j*        j+                   d
S )al  Apply final scaling and transformation to attention output before writing to global memory.

        This correction_epilog function handles the final processing step for attention output values.
        It applies a scaling factor to the accumulated attention results and prepares the
        data for efficient transfer back to global memory.

        The method performs:
        1. Loading of accumulated attention results from tensor memory
        2. Application of the final output scaling factor
        3. Type conversion if necessary (typically from higher precision accumulator to output precision)
        4. Reorganization of data for optimal memory access patterns
        5. Preparation for efficient TMA store operations

        :param thr_mma: Thread MMA operation for the computation
        :type thr_mma: cute.core.ThrMma
        :param tOtO: Tensor containing accumulated attention output
        :type tOtO: cute.Tensor
        :param mLSE: Tensor containing log-sum-exp values for LSE calculation
        :type mLSE: cute.Tensor | None
        :param tTMEM_LOAD_VECrS: Tensor containing row sum and max values for softmax calculation
        :type tTMEM_LOAD_VECrS: cute.Tensor
        :param row_idx: Index of the current row being processed
        :type row_idx: Int32
        :param cuseqlen_q: Cumulative sequence length of the current query
        :type cuseqlen_q: Int32
        :param seqlen_q: Sequence length of the current query
        :type seqlen_q: Int32
        :param blk_coord: Coordinate of the current block being processed
        :type blk_coord: Int32
        :param scale_softmax: Scaling factor for softmax calculation
        :type scale_softmax: Float32
        :param scale: Final scaling factor to apply to the output
        :type scale: Float32
        :param sO: Shared memory tensor for the final output
        :type sO: cute.Tensor
        r   r   r.   r-   F)use_2cta_instrs)NNr   )rg  Nr   NTr  )space),r4   r   rd  r   rW   re  r  r   r   r#  r>   r?   r8   r   r   get_tmem_load_opr   r   r   rj  r4  get_smem_store_opmake_tiled_copy_Drk  rl  r   rP  r2   rm  r   rK  r   rR  r  r  r  r   rn  logfence_proxy	ProxyKindasync_sharedSharedSpace
shared_cta)(rP   rP  r  r  r  r  r  r  rd  rm   r  r   rS  rT  rV  r  rU  rY  rZ  tOsO_irt  ru  r#  epi_subtiletmem_copy_atomr  r  smem_copy_atomtiled_smem_storer[  tTMEM_LOADsOtTMEM_LOADoOr  rb  tTMEM_LOADsO_ir^  r/  tSMrOo_vecr   s(                                           r   rs  z9BlackwellFusedMultiHeadAttentionForward.correction_epiloga  s   j a a 
 &'9::4<#55""2&&""2&&$T4+;S.<Q+R+RSS$T4+;S.<Q+R+RSS$T4+;S.<Q+R+RSSY))++
aT2S9Q5R5RRS
}Q'8$5ML!
 
 
 "0AXYY'11*==$6t}dlTXTegvww1./RR$008J1KLL$008J1KLL$008J1KLLt~a0NBCC 	? 	?A)$1a-8N)$1a-8N),tQ1}*E*KTM^__EIo~u===]1di&6&6::  )-)C)C1XuQU|,EN* *&a%A,,
 )%+t|DDEJJLLEKK..///I&~>>>>d$.// 	?)-- 0 3d-CCmVfghViFiiC!!;>Wz)9Q<78 		I,)'2 	 	
 	
 	
 	
 	
r   )(r   r   r   r   r   Numericr   intboolr   r  rS   r^   r   jitPointer	Constexprr   r   Tensorr
   cudaCUstreamr   r   TiledMmaCopyAtomComposedLayoutFmhaStaticTileSchedulerParamstupler   PipelineProducerImmutableResourceHandlePipelineConsumerr8  coreThrMmarb  rp  rs   r   r   r   r   k   sY       gz7?+gz 7?+gz c3'	gz
 gz &gz gz gz gzR  $ 
XM
M
 #U3S#+=%>$?@M
 	M

 #E#sC*<$=>M
 M
 #E#sC*<$=>M
 M
 #E#sC*<$=>M
 E5%uDEM
 t{+M
 t{+M
 4<(M
 %eCcM&:;M
 $M
  !M
" #M
$ #5/%M
& $E?'M
( )M
 M
 M
 XM
`
 
[BmB mB M	B
 B MB B MB B MB B t{+B t{+B t{#B $B  !B" #B$ #5/%B& $E?'B( #1)B* #1+B, #1-B. #1/B0 #11B2 &C3B B B [BH 
X`
`
 `
 	`

 `
 `
 `
 `
 
!9!!!!	#
`
 `
 `
 X`
F 
Xn6n6 n6 	n6
 t{+n6 t{+n6 $n6 I$n6 kn6 {n6 #5/n6 $E?n6 "2n6 #3n6 "*!:n6  "*!:!n6" &C#n6 n6 n6 Xn6b	 
XKB!KB kKB 	KB KB KB XKBZ 
Xq
!q
 kq
 kD 	q

 +q
 q
 q
 q
 q
 q
 q
 Kq
 q
 q
 Xq
 q
 q
r   r   )0argparsern  ossystimetypingr   r   r   r   torchtorch.nn.functionalnn
functionalFcuda.bindings.driverbindingsdriverr  r   cutlass.cuter   cutlass.cute.nvgpu.tcgen05r   r   cutlass.utilsr   cutlass.pipeliner   cutlass.torchcutlass_torchcutlass.utils.blackwell_helpersblackwell_helpersr   cutlass.cute.testingtestingcutlass.cute.runtimer   cutlass.cute.typingr   r	   r
   )cudnn.native_sparse_attention.compressionr   r   r}  r   r   r  r   r   <module>r     s  :   				 



  / / / / / / / / / / / /           # # # # # # # # #        , , , , , , , , , , , ,       # # # # # # % % % % % % 5 5 5 5 5 5 5 5 5 & & & & & & & & & , , , , , , 5 5 5 5 5 5 5 5 5 5 P P P P P P.bB B B B Bh!
 h!
 h!
 h!
 h!
 h!
 h!
 h!
 h!
 h!
r   