
    `i              	          d Z ddlmZmZ ddlmZmZmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZmZ ddlmZ ddlZd	ddd
deeef         deeef         dedefdZ	 	 	 	 d&ddddededefdZd	ddddededefdZddddedefdZ d'deeef         dedeeef         fdZ!d'deeef         dedeeef         fdZ" G d d          Z# G d  d!          Z$ G d" d#          Z% G d$ d%e%          Z&dS )(z
Shared utilities for grouped GEMM kernels.

This module contains the tile scheduler classes and helper functions used by both
the forward (grouped_gemm_swiglu) and backward (grouped_gemm_dswiglu) kernels.
    )TupleUnion)BooleanIntegerInt32minextract_mlir_valuesnew_from_mlir_valuesdsl_user_op
const_expr)ir)scfllvmnvvm)T)Float32r   NT)nanlocipabr   returnc                ,   |rd}nd}t          t          j        t          j                    t          |                               ||          t          |                              ||          g|dddt          j        j                            S )zCompute the minimum of two float32 values with NaN handling.

    :param a: First operand
    :param b: Second operand
    :param nan: If True, propagate NaN values
    :return: Minimum value
    zmin.NaN.f32 $0, $1, $2;zmin.f32 $0, $1, $2;r   r   z=f,f,fTFhas_side_effectsis_align_stackasm_dialect)r   r   
inline_asmr   f32ir_value
AsmDialectAD_ATT)r   r   r   r   r   	ptx_instrs         l/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/cudnn/grouped_gemm/utils.pyfminr&   <   s      *-		)	EGGQZZ  SR 00'!**2E2E#RT2E2U2UV! .	
 	
 	

 
 
        Fr   mask_and_clampabsc                &   t          |           }|                     ||          }t          |                              ||          }	d}
 |t          j        t          j                    ||	g|
dddt          j        j                            S )al  Perform a warp-level reduction synchronization for max with abs and NaN.

    :param value: Value to reduce
    :param kind: Reduction kind (unused, kept for API compatibility)
    :param mask_and_clamp: Warp mask and clamp value
    :param abs: Whether to use absolute value
    :param nan: Whether to handle NaN values
    :return: Reduced value across warp
    r   z&redux.sync.max.abs.NaN.f32 $0, $1, $2;z=f,f,iTFr   )	typer!   r   r   r   r   r    r"   r#   )valuekindr)   r*   r   r   r   
value_typevalue_irmask_irr$   s              r%   warp_redux_syncr2   \   s    & eJ~~#"~--HN##,,,<<G8I:EGGw! .	
 	
 	

 
 
r'   )positive_onlyr   r   r-   r3   c                |   t          j        t          j                    |                    ||          ||          }t          j        t          j                    t          j        j	        j        j
        j        | |||          }t          t          j        t          j                    |||                    S )a\  Perform atomic max operation on a float32 value in global memory.

    This implementation works correctly for non-negative values (>= 0) using direct bitcast.

    :param ptr: Pointer to the memory location
    :param value: The float32 value to compare and potentially store (should be >= 0)
    :return: The old value at the memory location
    r   resopptrr   r   r   )r   bitcastr   i32r!   r   	atomicrmwcutlass_mlirdialectsAtomicOpKindMAXr   r    )r8   r-   r3   r   r   	value_intold_value_ints          r%   atomic_max_float32rC      s      QUWWennn&D&D#RTUUUINEGG=!&37
  M 4<CBGGGHHHr'   c          	          t          j        t          j                    t          j        j        j         j        j        | |	                    ||          ||          }t          |          S )zPerform atomic add operation on a float32 value in global memory.

    :param ptr: Pointer to the memory location
    :param value: The float32 value to add
    :return: The old value at the memory location
    r   r5   )r   r;   r   r    r<   r=   r>   r?   FADDr!   r   )r8   r-   r   r   	old_values        r%   atomic_add_float32rG      s`     EGG=!&38
..SR.
(
(  I 9r'   fastmathc                     t           j                            dt           j                            |  |          z             S )zCompute the sigmoid function: 1 / (1 + exp(-a)).

    :param a: Input value
    :param fastmath: Whether to use fast math approximations
    :return: Sigmoid of input
    g      ?rH   )cutearch
rcp_approxmathexpr   rH   s     r%   sigmoid_f32rQ      s3     9dimmQBm&J&J JKKKr'   c                 *    | t          | |          z  S )zCompute the SiLU (Swish) activation: a * sigmoid(a).

    :param a: Input value
    :param fastmath: Whether to use fast math approximations
    :return: SiLU of input
    rJ   )rQ   rP   s     r%   silu_f32rS      s     {1x00000r'   c                       e Zd ZdZdej        defdZdee	j
                 fdZdee	j
                 dd fdZedefd	            Zedej        fd
            ZdS )WorkTileInfozA class to represent information about a work tile.

    :ivar tile_idx: The index of the tile.
    :type tile_idx: cute.Coord
    :ivar is_valid_tile: Whether the tile is valid.
    :type is_valid_tile: Boolean
    tile_idxis_valid_tilec                 <    || _         t          |          | _        d S N)	_tile_idxr   _is_valid_tile)selfrV   rW   s      r%   __init__zWorkTileInfo.__init__   s    !%m44r'   r   c                 |    t          | j                  }|                    t          | j                             |S rY   )r	   rV   extendrW   r\   valuess     r%   __extract_mlir_values__z$WorkTileInfo.__extract_mlir_values__   s4    $T]33)$*<==>>>r'   ra   c                     t          |          dk    sJ t          | j        |d d                   }t          | j        |d         g          }t	          ||          S )N   )lenr
   rZ   r[   rU   )r\   ra   new_tile_idxnew_is_valid_tiles       r%   __new_from_mlir_values__z%WorkTileInfo.__new_from_mlir_values__   s\    6{{a+DNF3B3KHH01DvbzlSSL*;<<<r'   c                     | j         S )zCheck latest tile returned by the scheduler is valid or not.

        Any scheduling requests after all tasks completed will return an invalid tile.

        :return: The validity of the tile.
        :rtype: Boolean
        )r[   r\   s    r%   rW   zWorkTileInfo.is_valid_tile   s     ""r'   c                     | j         S )zgGet the index of the tile.

        :return: The index of the tile.
        :rtype: cute.Coord
        )rZ   rk   s    r%   rV   zWorkTileInfo.tile_idx   s     ~r'   N)__name__
__module____qualname____doc__rK   Coordr   r]   listr   Valuerb   ri   propertyrW   rV    r'   r%   rU   rU      s         5 5G 5 5 5 5bh    
=tBH~ =. = = = = #w # # # X# $*    X  r'   rU   c                       e Zd ZdZe	 	 dddddej        dej        ded	efd
            Z	d Z
d Zeddddedeeeef         fd            ZdS )PersistentTileSchedulerParamsa  A class to represent parameters for a persistent tile scheduler.

    This class is designed to manage and compute the layout of clusters and tiles
    in a batched gemm problem.

    :ivar cluster_shape_mn: Shape of the cluster in (m, n) dimensions (K dimension cta count must be 1).
    :type cluster_shape_mn: tuple
    :ivar problem_layout_ncluster_mnl: Layout of the problem in terms of
        number of clusters in (m, n, l) dimensions.
    :type problem_layout_ncluster_mnl: cute.Layout
    T   Nr   problem_shape_ntile_mnlcluster_shape_mnkraster_along_mswizzle_sizec                v   |d         dk    rt          d|d                    |dk     rt          d|           || _        || _        |dd         | _        || _        || _        || _        t          j        t          j	        | j        |dd         ||          ||          | _
        |dk    rt          j        | j
        j        |rd|dfn|ddf          }|rUt          j        |d         ||d         |z  f|d         f|d||d         z  f|d         |d         z  f||          | _
        nTt          j        ||d         |z  f|d         |d         fd||d         z  f||d         |d         z  f||          | _
        |dk    rt          j        | j
        ||          }| j
        j        d         }	| j
        j        d         }
t          j        |||          | _        t          j        |	||          | _        t          j        |
||          | _        dS d| _        d| _        d| _        dS )	a  Initializes the PersistentTileSchedulerParams with the given parameters.

        :param problem_shape_ntile_mnl: The shape of the problem in terms of
            number of CTA (Cooperative Thread Array) in (m, n, l) dimensions.
        :type problem_shape_ntile_mnl: cute.Shape
        :param cluster_shape_mnk: The shape of the cluster in (m, n) dimensions.
        :type cluster_shape_mnk: cute.Shape
        :param swizzle_size: Swizzling size in the unit of cluster. 1 means no swizzle
        :type swizzle_size: int
        :param raster_along_m: Rasterization order of clusters. Only used when swizzle_size > 1.
            True means along M, false means along N.
        :type raster_along_m: bool

        :raises ValueError: If cluster_shape_k is not 1.
           rx   zunsupported cluster_shape_k z"expect swizzle_size >= 1, but get Nr   r   )strider   r   )
ValueErrorry   _cluster_shape_mnkcluster_shape_mnr|   _raster_along_m_locrK   make_layoutceil_divproblem_layout_ncluster_mnlround_upshapesizefast_divmod_create_divisor	batch_fddcluster_shape_m_fddcluster_shape_n_fdd)r\   ry   rz   r{   r|   r   r   problem_shape_ncluster_mnlproblem_layout_sizecluster_count_mcluster_count_ns              r%   r]   z&PersistentTileSchedulerParams.__init__  s   6 Q1$$R<Ma<PRRSSS!P,PPQQQ'>$"3 1"1" 5(-	 ,0+;M$68I"1"8MSV[]^^^,
 ,
 ,
( !)-06(6PL!$$\1a<P* *&
  373C215%'A!'D'TU215 %L+Ea+HHI2158RST8UU
 4 4 400 483C%'A!'D'TU215215 L+Ea+HHI$2158RST8UU
 4 4 40" 1"&)D,LRUZ\"]"]"]">DQGO">DQGO "<=PVY^`aaaDN (,'F\_df'g'g'gD$ (,'F\_df'g'g'gD$$$ "DN'+D$'+D$$$r'   c                    g g c}| _         | j        | j        | j        | j        fD ]=}t          |          }||z  }| j                             t          |                     >g }g }t          d| j	        fd| j
        fd| j        fg          D ]C\  }\  }}|9t          |          }	|                    |	           |                    |           D||z  }| j                             t          |                     || _        |S )Nr   r   r   )_values_posry   r   r   r|   r	   appendrf   	enumerater   r   r   r_   _fastdivmod_indices)
r\   ra   obj
obj_valuesfastdivmod_valuesfastdivmod_indicesifdd_namefdd_obj
fdd_valuess
             r%   rb   z5PersistentTileSchedulerParams.__extract_mlir_values__u  s?   #%r  (# 	
 	5 	5C -S11Jj F##C
OO4444 &/dn-&(@A&(@A'
 '
 	- 	-"A"' "099
!((444"))!,,,##$6 7 7888#5 r'   c           	      z   g }t          |          }t          | j        | j        | j        | j        g| j        d d                   D ]:\  }}|                    t          ||d |                              ||d          };t          t          |          d| j        i}g d}t          | d          rt          | j                  dk    rot          | j                  D ]Z\  }}	||	         }
t!          | |
          }|;|t          |          k     r(t          |||         g          }t#          ||
|           [|S )Nre   r   )r   r   r   r   r   )rr   zipry   r   r   r|   r   r   r
   rw   tupler   hasattrrf   r   r   getattrsetattr)r\   ra   obj_listvalues_copyr   n_items
new_params	fdd_namesjoriginal_indexr   original_fddreconstructed_fdds                r%   ri   z6PersistentTileSchedulerParams.__new_from_mlir_values__  so   6ll  ,'$!	 SbS!
 
 
	0 
	0LC OO0k(7(6KLLMMM%ghh/KK 3U8__U49UU
 POO	4.// 		EC8P4Q4QTU4U4U%.t/G%H%H E E!>$^4&tX66+C4D4D0D0D(<\KXYNK[(\(\%J2CDDDr'   max_active_clustersr   c                H   t          d t          | j        j        | j                  D                       | j        j        d         fz   }t          j        |||          }t          j        | j        ||          }||z  }t          ||          }||z  }	g | j        |	R S )a  Computes the grid shape based on the maximum active clusters allowed.

        :param max_active_clusters: The maximum number of active clusters that
            can run in one wave.
        :type max_active_clusters: Int32

        :return: A tuple containing the grid shape in (m, n, persistent_clusters).
            - m: self.cluster_shape_m.
            - n: self.cluster_shape_n.
            - persistent_clusters: Number of persistent clusters that can run.
        c              3   J   K   | ]\  }}t          j        |          |z  V  d S rY   )rK   r   ).0xys      r%   	<genexpr>z?PersistentTileSchedulerParams.get_grid_shape.<locals>.<genexpr>  s3      }}$!QTYq\\A-}}}}}}r'   r~   r   )r   r   r   r   r   rK   r   r   )
r\   r   r   r   num_ctas_mnlnum_ctas_in_problemnum_ctas_per_clusternum_ctas_per_wavenum_persistent_ctasnum_persistent_clusterss
             r%   get_grid_shapez,PersistentTileSchedulerParams.get_grid_shape  s     }}#d>^>dfjf{:|:|}}}}},215A
 
 #i#"EEE#y)>CBOOO/2FF!"57HII"59M"M@&@(?@@@r'   )Trx   )rm   rn   ro   rp   r   rK   Shapeboolintr]   rb   ri   r   r   r   r   ru   r'   r%   rw   rw      s
       
 
 
  $g, g, g, g,!%g,  :g, 	g,
 g, g, g, [g,R" " "H# # #J @D A A A% ARWX_ahjqXqRr A A A [A A Ar'   rw   c                      e Zd ZdZdedededej        def
dZde	e
j                 fd	Zd
e	e
j                 dd fdZeeddddedeeeef         deeeef         fd                        Zeddddededeeeef         fd            ZddddedefdZddddedeeeef         fdZeddddefd            Zeddddefd            Zedddddefd            Zedefd            ZdS )StaticPersistentTileSchedulera  A scheduler for static persistent tile execution in CUTLASS/CuTe kernels.

    :ivar params: Tile schedule related params, including cluster shape and problem_layout_ncluster_mnl
    :type params: PersistentTileSchedulerParams
    :ivar num_persistent_clusters: Number of persistent clusters that can be launched
    :type num_persistent_clusters: Int32
    :ivar cta_id_in_cluster: ID of the CTA within its cluster
    :type cta_id_in_cluster: cute.Coord
    :ivar _num_tiles_executed: Counter for executed tiles
    :type _num_tiles_executed: Int32
    :ivar _current_work_linear_idx: Current cluster index
    :type _current_work_linear_idx: Int32
    paramsr   current_work_linear_idxcta_id_in_clusternum_tiles_executedc                 L    || _         || _        || _        || _        || _        dS )a  Initializes the StaticPersistentTileScheduler with the given parameters.

        :param params: Tile schedule related params, including cluster shape and problem_layout_ncluster_mnl.
        :type params: PersistentTileSchedulerParams
        :param num_persistent_clusters: Number of persistent clusters that can be launched.
        :type num_persistent_clusters: Int32
        :param current_work_linear_idx: Current cluster index.
        :type current_work_linear_idx: Int32
        :param cta_id_in_cluster: ID of the CTA within its cluster.
        :type cta_id_in_cluster: cute.Coord
        :param num_tiles_executed: Counter for executed tiles.
        :type num_tiles_executed: Int32
        N)r   r   _current_work_linear_idxr   _num_tiles_executed)r\   r   r   r   r   r   s         r%   r]   z&StaticPersistentTileScheduler.__init__  s1    * '>$(?%!2#5   r'   r   c                 f   t          | j                  }|                    t          | j                             |                    t          | j                             |                    t          | j                             |                    t          | j                             |S rY   )r	   r   r_   r   r   r   r   r`   s     r%   rb   z5StaticPersistentTileScheduler.__extract_mlir_values__  s    $T%ABB)$*GHHIII)$*@AABBB)$*BCCDDD 	)$+66777r'   ra   c                 r   t          |          dk    sJ t          | j        |d         g          }t          | j        |d         g          }t          | j        |dd                   }t          | j        |d         g          }|dd          }t          | j        |          }t          |||||          S N   r   rx   r~      )rf   r
   r   r   r   r   r   r   r\   ra   new_num_persistent_clustersnew_current_work_linear_idxnew_cta_id_in_clusternew_num_tiles_executedparams_valuesr   s           r%   ri   z6StaticPersistentTileScheduler.__new_from_mlir_values__  s    6{{a&:4;WZ`abZcYd&e&e#&:4;X[abc[dZe&f&f# 4T5KVTUVWTW[ Y Y!5d6NQWXYQZP[!\!\ qrr
)$+}EE
,''!"
 
 	
r'   Nr   	block_idxgrid_dimc                h   t          j        |||          t          j        | j        ||          z  }|\  }}}t          |          }	t          || j        d         z            t          || j        d         z            t          d          f}
t          d          }t	          | ||	|
|          S )a   Initialize the static persistent tile scheduler.

        :param params: Parameters for the persistent tile scheduler.
        :type params: PersistentTileSchedulerParams
        :param block_idx: The 3d block index in the format (bidx, bidy, bidz).
        :type block_idx: Tuple[Integer, Integer, Integer]
        :param grid_dim: The 3d grid dimensions for kernel launch.
        :type grid_dim: Tuple[Integer, Integer, Integer]

        :return: A StaticPersistentTileScheduler object.
        :rtype: StaticPersistentTileScheduler
        r   r   rx   )rK   r   r   r   r   )r   r   r   r   r   r   bidxbidybidzr   r   r   s               r%   createz$StaticPersistentTileScheduler.create%  s    2 #')H#""E"E"ESYSjpsxzI{I{I{"{$dD #(++ $03344$03344!HH
 #1XX,##
 
 	
r'   r   c                2    |                      |||          S )a  Calculates the grid shape to be launched on GPU using problem shape,
        threadblock shape, and active cluster size.

        :param params: Parameters for grid shape calculation.
        :type params: PersistentTileSchedulerParams
        :param max_active_clusters: Maximum active clusters allowed.
        :type max_active_clusters: Int32

        :return: The calculated 3d grid shape.
        :rtype: Tuple[Integer, Integer, Integer]
        r   )r   )r   r   r   r   s       r%   r   z,StaticPersistentTileScheduler.get_grid_shapeV  s     ( $$%8cb$IIIr'   c                   |t          j        | j        j        ||          k     }| j        j        dk    r|                     |||          }n"| j        j                            |||          }t          d t          || j	        g | j        j
        t          d          R           D                       }t          ||          S )h  Compute current tile coord given current_work_linear_idx and cta_id_in_cluster.

        :param current_work_linear_idx: The linear index of the current work.
        :type current_work_linear_idx: Int32

        :return: An object containing information about the current tile coordinates
            and validity status.
        :rtype: WorkTileInfo
        r   rx   c              3   |   K   | ]7\  }}}t          |          t          |          z  t          |          z   V  8d S rY   )r   )r   r   r   zs       r%   r   zQStaticPersistentTileScheduler._get_current_work_for_linear_idx.<locals>.<genexpr>  sX       
 
1a !HHuQxx%((*
 
 
 
 
 
r'   )rK   r   r   r   r|   %_get_cluster_work_idx_with_fastdivmodget_flat_coordr   r   r   r   r   rU   )r\   r   r   r   is_validcur_cluster_coordcur_tile_coords          r%    _get_current_work_for_linear_idxz>StaticPersistentTileScheduler._get_current_work_for_linear_idxm  s     +TYt{7^dgln-o-o-oo ;#q(( $ J JKbhkpr J s s !% G V VWntw|~ V    
 
!&9$+.9a99 
 
 
 
 
 NH555r'   c                    t          || j        j                  \  }}t          || j        j                  \  }}t          || j        j                  \  }}	||	|fS )a  FastDivmod optimized CLUSTER coordinate calculation.

        CRITICAL: This should mimic problem_layout_ncluster_mnl.get_hier_coord()
        which returns CLUSTER coordinates, not tile coordinates!

        :param current_work_linear_idx: Linear index in the work space
        :type current_work_linear_idx: Int32
        :return: Cluster coordinates (m, n, l) or None if FastDivmod not available
        :rtype: Tuple[Int32, Int32, Int32] or None
        )divmodr   r   r   r   )
r\   r   r   r   work_iterationwork_unit_idcluster_n_batch	cluster_mbatch_l	cluster_ns
             r%   r   zCStaticPersistentTileScheduler._get_cluster_work_idx_with_fastdivmod  sa     (..Et{G\']']$ &,L$+:Y%Z%Z" $OT[5TUU9g..r'   c                <    |                      | j        ||          S Nr   )r   r   r\   r   r   s      r%   get_current_workz.StaticPersistentTileScheduler.get_current_work  s"    44T5RX[`b4cccr'   c                0    |                      ||          S r   )r   r   s      r%   initial_work_tile_infoz4StaticPersistentTileScheduler.initial_work_tile_info  s    $$$444r'   rx   )advance_countr   r   r   c                    | xj         t          |          t          | j                  z  z  c_         | xj        t          d          z  c_        d S )Nrx   )r   r   r   r   )r\   r   r   r   s       r%   advance_to_next_workz2StaticPersistentTileScheduler.advance_to_next_work  sN    %%})=)=dFb@c@c)cc%%  E!HH,    r'   c                     | j         S rY   )r   rk   s    r%   r   z0StaticPersistentTileScheduler.num_tiles_executed  s    ''r'   )rm   rn   ro   rp   rw   r   rK   rq   r]   rr   r   rs   rb   ri   staticmethodr   r   r   r   r   rU   r   r   r   r   r   r   rt   r   ru   r'   r%   r   r     s        6-6 "'6 "'	6
  :6 "6 6 6 66	bh 	 	 	 	
tBH~ 
Ba 
 
 
 
&  ,
 ,
 ,
-,
'723,
 '12,
 ,
 ,
 [ \,
^ 
 J J J-J"J 
w(	)J J J \J, W[_c 6 6 6 6ht 6 6 6 6B \`dh / / /U /mrsxz  BG  tG  nH / / / /6 &*t d d d d d d [d ,0T 5 5 5l 5 5 5 [5 ;<$4 - - -S - - - [- (E ( ( ( X( ( (r'   r   c                        e Zd ZdZ	 ddedededej        dedef fd	Z	d
e
ej                 dd fdZee	 dddddedeeeef         deeeef         defd                        ZddddedefdZ xZS )$StaticPersistentRuntimeTileSchedulera  A scheduler for static persistent runtime tile execution in CUTLASS/CuTe kernels.

    This scheduler will always launch all the SMs and the scheduler will generate
    the real tile info for each SM.

    :ivar params: Tile schedule related params, including cluster shape and problem_layout_ncluster_mnl
    :type params: PersistentTileSchedulerParams
    :ivar num_persistent_clusters: Number of persistent clusters that can be launched
    :type num_persistent_clusters: Int32
    :ivar cta_id_in_cluster: ID of the CTA within its cluster
    :type cta_id_in_cluster: cute.Coord
    :ivar _num_tiles_executed: Counter for executed tiles
    :type _num_tiles_executed: Int32
    :ivar _current_work_linear_idx: Current cluster index
    :type _current_work_linear_idx: Int32
    rx   r   r   r   r   r   
inner_modec                     t                                          |||||           |dvrt          d|           || _        dS )aF  Initializes the StaticPersistentRuntimeTileScheduler with the given parameters.

        :param params: Tile schedule related params, including cluster shape and problem_layout_ncluster_mnl.
        :type params: PersistentTileSchedulerParams
        :param num_persistent_clusters: Number of persistent clusters that can be launched.
        :type num_persistent_clusters: Int32
        :param current_work_linear_idx: Current cluster index.
        :type current_work_linear_idx: Int32
        :param cta_id_in_cluster: ID of the CTA within its cluster.
        :type cta_id_in_cluster: cute.Coord
        :param num_tiles_executed: Counter for executed tiles.
        :type num_tiles_executed: Int32
        :param inner_mode: The inner mode along which the linear index will be decomposed first.
        :type inner_mode: int
        )r   rx   z;inner_mode must be 0(for M mode) or 1(for N mode), but got N)superr]   r   r   )r\   r   r   r   r   r   r   	__class__s          r%   r]   z-StaticPersistentRuntimeTileScheduler.__init__  s`    0 	##	
 	
 	
 V##g[egghhh$r'   ra   r   c                 ~   t          |          dk    sJ t          | j        |d         g          }t          | j        |d         g          }t          | j        |dd                   }t          | j        |d         g          }|dd          }t          | j        |          }t          |||||| j                  S r   )	rf   r
   r   r   r   r   r   r   r   r   s           r%   ri   z=StaticPersistentRuntimeTileScheduler.__new_from_mlir_values__  s    6{{a&:4;WZ`abZcYd&e&e#&:4;X[abc[dZe&f&f# 4T5KVTUVWTW[ Y Y!5d6NQWXYQZP[!\!\ qrr
)$+}EE
3''!"O
 
 	
r'   Nr   r   r   c                j   t          j        |||          t          j        | j        ||          z  }|\  }}}	t          |	          }
t          || j        d         z            t          || j        d         z            t          d          f}t          d          }t	          | ||
|||          S )a  Initialize the static persistent tile scheduler.

        :param params: Parameters for the persistent tile scheduler.
        :type params: PersistentTileSchedulerParams
        :param block_idx: The 3d block index in the format (bidx, bidy, bidz).
        :type block_idx: Tuple[Integer, Integer, Integer]
        :param grid_dim: The 3d grid dimensions for kernel launch.
        :type grid_dim: Tuple[Integer, Integer, Integer]
        :param inner_mode: The inner mode along which the linear index will be decomposed first.
        :type inner_mode: int

        :return: A StaticPersistentRuntimeTileScheduler object.
        :rtype: StaticPersistentRuntimeTileScheduler
        r   r   rx   )rK   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   s                r%   r   z+StaticPersistentRuntimeTileScheduler.create  s    8 #')H#""E"E"ESYSjpsxzI{I{I{"{$dD #(++ $03344$03344!HH
 #1XX3##
 
 	
r'   c                |   | j         j        j        }d}t          | j        dk              r't          j        ||d         f|d         df          }n&t          j        |d         |fd|d         f          }|                    |          }|d         |d         t          d          f}d}	t          ||	          S )r   irx   )r   r   T)
r   r   r   r   r   rK   r   get_hier_coordr   rU   )
r\   r   r   r   ntile_shapeint_maxntile_layoutcluster_tile_coord_mnr   r   s
             r%   r   zEStaticPersistentRuntimeTileScheduler._get_current_work_for_linear_idx9  s     k=Cdo*++ 	c+Wk!n,E{[\~_`NabbbLL+[^W,EqR]^_R`NabbbL , ; ;<S T T!!$!!$!HH
 NH555r'   )rx   )rm   rn   ro   rp   rw   r   rK   rq   r   r]   rr   r   rs   ri   r   r   r   r   r   rU   r   __classcell__)r   s   @r%   r   r     sq        0 !% !%-!% "'!% "'	!%
  :!% "!% !% !% !% !% !% !%F
tBH~ 
Bh 
 
 
 
( 
 	0
 0
 0
 0
-0
'7230
 '120
 	0
 0
 0
 [ \0
f W[_c 6 6 6 6ht 6 6 6 6 6 6 6 6r'   r   )Nr(   FN)F)'rp   typingr   r   cutlass.cutlass_dslr   r   r   r   r	   r
   r   r   cutlass._mlirr   cutlass._mlir.dialectsr   r   r   r   cutlass.cute.typingr   	CuteInt32cutlass.cuterK   r<   floatr   r&   r   r2   rC   rG   rQ   rS   rU   rw   r   r   ru   r'   r%   <module>r     s  :          	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	       2 2 2 2 2 2 2 2 2 2 ! ! ! ! ! ! ; ; ; ; ; ; ; ;          UG^UG^ 
	    D 
$" 	" " " " 
	"
 
" " " "R I I II 	I I I I ID 	      2L L5( LD LU5RY>EZ L L L L1 1eWn% 1 1%wBW 1 1 1 1* * * * * * * *Z\A \A \A \A \A \A \A \A~[( [( [( [( [( [( [( [(|X6 X6 X6 X6 X6+H X6 X6 X6 X6 X6r'   