
    `i	                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZmZ d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlmc mZ d dl m!Z! d dl"m#Z#m$Z$ d d	l%m&Z& d d
l'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@mAZA ddlBmCZC ddlDmEZE ddlFmGZG ddlHmIZImJZJmKZKmLZL ddlMmNZNmOZO ddlPmQZQmRZRmSZSmTZT ddlmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZb ddlcmdZemfZfmgZgmhZh ddlimjZj ddlkmlZl ddlmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{m|Z| dd l}m~Z~mZmZmZmZmZ dd!lmZmZmZmZmZ dd"lmZ er/d d#lmZ d d$lmZ d d%lmZ dd&l9mZ dd'lmmZ dd(lmZ  ed)          Z ej        e          Zej                            ed*          Zej                            ed+          Zej                            ed,          Z e<            Z; G d- d.          Z ed          dpd1            Z ed          dpd2            Z G d3 d4          Zej         G d5 d6                      Zej         G d7 d8                      Zej         G d9 d:e                      Zej         G d; d<e                      ZdqdAZ G dB dCew          Z e            j        ZdrdFZdrdGZdsdHZdrdIZdtdKZdudNZ G dO dPer          ZdvdRZdwdxdVZ G dW dXev          Ze                    dY            G dZ d[e          Z G d\ d]          Zej         G d^ d_                      Z G d` da          Zej         G db dc                      Z G dd deeqeeeeeef         f         f                   Zej         G df dg                      Z G dh diee                   Z G dj dke          ZdydoZdS )z    )annotationsN)IterableSequence)	lru_cache)AnyCallablecastOptionalTYPE_CHECKINGUnion)
PRECEDENCE)get_interface_for_device)identitypreserve_rng_state)is_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)has_triton_packagehas_triton_stable_tma_api   )free_symbol_is_type
prefix_strsymbol_is_typeSymT)ValueRanges   )configirmetrics)AsyncCompile)	code_hashget_pathPyCodeCachewrite_atomic)DefaultHandler)triton_heuristics)benchmarker)AutotuneHintDevicePropertiesTRITON_MAX_BLOCKTRITON_MAX_RSPLIT)get_max_y_gridnext_power_of_2)BaseSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfDelayReplaceLineget_bounds_index_exprget_fused_kernel_nameget_kernel_metadatais_welford_reductionPlaceholderprefix_is_reduction	sympy_dotsympy_product
sympy_substriton_typetriton_version_uses_attrs_dictupcast_compute_type)_opsReductionType	StoreModeV)"get_kernel_category_by_source_code   )BlockPatternMatcher)ArgNameBackendFeatureConstexprArgCSECSEVariableDeferredLineIndentedBufferInplacedBufferOpOverridesPythonPrinter
RemovedArgSizeArg	TensorArgWorkspaceArgWorkspaceZeroMode)constant_reprIterationRangesIterationRangesEntryIterationRangesRoot
SIMDKernelSIMDScheduling)	config_ofequal_1_arg_indicesnon_constexpr_signatureshould_unwrap_unspec_argsignature_to_meta)SymbolicCallArg)
ModuleType)TypeVarDtypePropagationOpsHandler)IRNode)BlockShapeType)SIMDKernelFeatures_T
perf_hintsschedulefusionc                  H    e Zd ZU dZi Zded<   i Zded<   edd            ZdS )OpDtypeSupportz
    Some Triton ops such as libdevice and tl.math only support float32 and float64.
    This class records which dtypes are supported by specific IR ops.
    z"dict[str, OrderedSet[torch.dtype]]supported_dtypeszdict[str, bool]convert_outputsfuncCallable[..., str]convert_outputboolreturnNonec                    |j         }t          t          j        t          j        g          | j        |<   || j        |<   d S N)__name__r   torchfloat32float64rq   rr   )clsrs   ru   op_names       r/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_inductor/codegen/triton.pyregister_upcastzOpDtypeSupport.register_upcast   s:    -(2EM5=3Q(R(RW%'5G$$$    N)rs   rt   ru   rv   rw   rx   )	r{   
__module____qualname____doc__rq   __annotations__rr   classmethodr    r   r   rp   rp   x   sc          
 <>====')O))))6 6 6 [6 6 6r   rp   rw   strc                 f    t                      sdS ddl} t          | j        j        d          rdS dS )zd
    import AttrsDescriptor if the triton version is new enough to have this
    class defined.
     r   NAttrsDescriptorz4from triton.compiler.compiler import AttrsDescriptor)r   triton.compiler.compilerhasattrcompiler)tritons    r   gen_attr_descriptor_importr      sH      r#### v'):;; EErr   c                     t                      } |                     d           t                      x}r|                     |           |                     d           |                                 S )NzD
        import triton
        import triton.language as tl
        a  
        from torch._inductor.runtime import triton_helpers, triton_heuristics
        from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
        from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
        )rO   splicer   	writelinegetvalue)imports	attr_descs     r   gen_common_triton_importsr      s    GNN	   /000y %)$$$NN	   r   c                      e Zd ZdZ eej        ej        g          Z eej	        ej
        ej        ge          Zd eD             Zd eD             Zedd            Zedd	            Zd
S )TritonSymbolszU
    Stores sympy.Symbol instances and constants associated with triton codegen.
    c                Z    i | ](}|t          j        t          |          d dd          )S )offsetTintegernonnegative)sympySymbolr   .0symts     r   
<dictcomp>zTritonSymbols.<dictcomp>   sJ        	elj.666RVWWW  r   c                ~    i | ]:}|t          j        t          |                                          d dd          ;S )BLOCKTr   positive)r   r   r   upperr   s     r   r   zTritonSymbols.<dictcomp>   s^         	el$%%''...t
 
 
  r   treerY   rw   sympy.Symbolc                &    | j         |j                 S rz   )block_sizesr   r   r   s     r   get_block_sizezTritonSymbols.get_block_size   s    ty))r   c                &    | j         |j                 S rz   )block_offsetsr   r   s     r   get_block_offsetzTritonSymbols.get_block_offset   s     ++r   N)r   rY   rw   r   )r{   r   r   r   r   r   R0_INDEXR1_INDEXreduction_typesXBLOCKYBLOCKZBLOCKblock_typesr   r   r   r   r   r   r   r   r   r      s          !j$-!?@@O*dk4;VoVWWK   M
   	  K * * * [* , , , [, , ,r   r   c                      e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   ded<   ddZddZddZddZddZe	dd            Z
dS )IndexingOptionsr   	index_strOrderedSet[str]	mask_varszOptional[str]
expand_strrv   _has_rindex
sympy.Exprindexz#Optional[Sequence[Union[int, str]]]expand_shaperw   c                *    t          | j                  S rz   )rv   r   selfs    r   has_maskzIndexingOptions.has_mask   s    DN###r   c                @    t          | j        t          j                  S rz   )r   r   r   TMPr   s    r   has_indirectzIndexingOptions.has_indirect   s    "4:tx888r   c                    | j         S rz   )r   r   s    r   
has_rindexzIndexingOptions.has_rindex   s    r   c                >    t          d | j        D                       S )Nc              3  Z   K   | ]&}t          |                              d           V  'dS )tmpNr   
startswithr   masks     r   	<genexpr>z.IndexingOptions.has_tmpmask.<locals>.<genexpr>   s6      JJ43t99''..JJJJJJr   anyr   r   s    r   has_tmpmaskzIndexingOptions.has_tmpmask   s!    JJ4>JJJJJJr   c                >    t          d | j        D                       S )Nc              3  Z   K   | ]&}t          |                              d           V  'dS )rNr   r   s     r   r   z,IndexingOptions.has_rmask.<locals>.<genexpr>   s6      HH3t99'',,HHHHHHr   r   r   s    r   	has_rmaskzIndexingOptions.has_rmask   s!    HHHHHHHHr   c                    | j         r:d                    t          t          t          | j                                       ndS )N & rx   )r   joinsortedmapr   r   s    r   mask_strzIndexingOptions.mask_str   s:     =ANVEJJvc#t~6677888PV	
r   Nrw   rv   rw   r   )r{   r   r   r   r   r   r   r   r   propertyr   r   r   r   r   r      s         NNN5555$ $ $ $9 9 9 9       K K K KI I I I 
 
 
 X
 
 
r   r   c                  0   e Zd ZU dZded<   ded<   ded<   ded	<   d
ed<   ded<   d
ed<   dZded<   ed1d            Zed1d            Zed1d            Z	ed1d            Z
ed2d            Zd3d!Zd4d"Zd5d$Zd6d%Zd7d'Zd7d(Zd7d)Zd7d*Zd7d+Zd8d0ZdS )9BlockDescriptorOptionsz
    This is a base class that describes a block descriptor used in Triton kernels.
    It can be used to create either a tensor descriptor (with TensorDescriptorOptions)
    or a block pointer (with BlockPtrOptions).
    BlockParametersparamsr   constant_offset	list[int]orderr   r   Sequence[sympy.Expr]broadcast_shapez
list[bool]broadcasting_dimsfinal_shapeNzOptional[list[int]]_boundary_checkrw   list[sympy.Expr]c                    | j         j        S rz   )r   shaper   s    r   r   zBlockDescriptorOptions.shape  s    {  r   c                    | j         j        S rz   )r   block_shaper   s    r   r   z"BlockDescriptorOptions.block_shape  s    {&&r   c                    | j         j        S rz   )r   stridesr   s    r   r   zBlockDescriptorOptions.strides	      {""r   c                    | j         j        S rz   )r   offsetsr   s    r   r   zBlockDescriptorOptions.offsets  r   r   range_treeslist[IterationRangesRoot]get_max_blockCallable[[str], int]c                  t           j        j        dfd} ||j                  |_         ||j                  |_        fd|j        D             }fd|j        D             }t          |          rd|d	<   d
 t          |j        |          D             }	d t          ||          D             d t          ||          D             }fdt          di fdt          j
        |                                          D             }d |D             }
t           j        j        r(|d         j        dk    sJ |
                    d           t           j        j        }t           j        j        srt%          |j                  t%          t           j        j                  |z
  k    r;t           j        j                                        r|
t,          j        j        g|z  z  }
 | |t           j        j                            |          t5          t7          t9          t%          |j                                                ||
|	|          }|                    ||           |S )z2Helper to create a BlockDescriptorOptions instanceexprsIterable[sympy.Expr]rw   r   c                     fd| D             S )Nc                :    g | ]}                     |          S r   )lookup_precomputed_size)r   exprsizevarss     r   
<listcomp>zFBlockDescriptorOptions.create.<locals>.lookup_size.<locals>.<listcomp>   s'    MMMtH44T::MMMr   r   )r  r  s    r   lookup_sizez2BlockDescriptorOptions.create.<locals>.lookup_size  s    MMMMuMMMMr   c                <    g | ]}                     |d           S )r   statically_known_equals)r   strider  s     r   r	  z1BlockDescriptorOptions.create.<locals>.<listcomp>(  s6     
 
 
<BH,,VQ77
 
 
r   c                <    g | ]}                     |d           S )rG   r  )r   dimr  s     r   r	  z1BlockDescriptorOptions.create.<locals>.<listcomp>.  s6     
 
 
9<H,,S!44
 
 
r   Fc                    g | ]	\  }}||
S r   r   r   r  is_singletons      r   r	  z1BlockDescriptorOptions.create.<locals>.<listcomp>8  s2     
 
 
!\

 
 
r   c                ,    g | ]}t          |          S r   )r   )r   dimss     r   r	  z1BlockDescriptorOptions.create.<locals>.<listcomp>?  s    WWW#d))WWWr   c                    g | ]	\  }}||
S r   r   r  s      r   r	  z1BlockDescriptorOptions.create.<locals>.<listcomp>C  s2     
 
 
!\

 
 
r   c                8    d t          |           D             S )z@Removes any broadcasting or singleton dims from a given sequencec                    g | ]	\  }}||
S r   r   )r   itemis_removables      r   r	  zFBlockDescriptorOptions.create.<locals>.remove_dims.<locals>.<listcomp>K  s2       &D,#  r   )zip)itremovable_dimss    r   remove_dimsz2BlockDescriptorOptions.create.<locals>.remove_dimsI  s.     *-b.*A*A   r   c                .    i | ]\  }}| |          S r   r   )r   keyvalr  s      r   r   z1BlockDescriptorOptions.create.<locals>.<dictcomp>S  s)    XXXcsKK$$XXXr   c                B    g | ]}t                               |          S r   )r   r   r   r   s     r   r	  z1BlockDescriptorOptions.create.<locals>.<listcomp>W  s&    RRRd}33D99RRRr   r   x)r   r   r   r   r   r   r   )r  r  rw   r   r   )rE   graphr  r   r   r   allr  r   dataclassesasdictitemskernelno_x_dimprefixpopnum_reduction_dimsinside_reductionlennumelsfeaturesis_reductionr   SOner  listreversedrangecompute_boundary_check)r   r   r   r   r   r   r
  r   singleton_dimsr   r   reduction_ndimresultr  r  r  s                @@@r   createzBlockDescriptorOptions.create  s    7#	N 	N 	N 	N 	N 	N #{6<00$V^44
 
 
 
FLn
 
 

 
 
 
@F@R
 
 
 ~ 	'!&N2

 
%();^%L%L
 
 
 XWNDU0V0VWWW
 
%():N%K%K
 
 
	 	 	 	 	 ! 
 
XXXX[5G5O5O5U5U5W5WXXX
 

 SRkRRR8 	q>(C////OOA4)	:FN##s18?';';n'LLL!..00 M EGK=>99KG,DD_UUxc&,&7&7 8 899::#+/
 
 
 	%%m[AAAr   r  replacementr   r   c                J    t           j        |         }t          |||i          S zN
        Replaces instances of {symt}_offset with the new expression.
        r   r   r>   r   r  r?  r   roffsets        r   replace_offsetz%BlockDescriptorOptions.replace_offsetq  &      -d3$+ 6777r   c                v    t           j        D ]+}|                     |t          j        d          |          },|S Nr   r   r   rE  r   Integerr   r  r   s      r   remove_roffsetsz&BlockDescriptorOptions.remove_roffsetsz  >    !1 	E 	ED&&tU]1-=-=tDDDDr   rx   c                    t           j        j        fd|D             t          t	          t           j        j        |                     fdt          t           j	                            D              _
        dS )z6List of indices to pass to tl.load(boundary_check=...)c                r    i | ]3}t           j        |j                  t          |j                           4S r   )r   r   r   r   )r   tr   s     r   r   zABlockDescriptorOptions.compute_boundary_check.<locals>.<dictcomp>  sF     /
 /
 /
 %af-}}Z=O/P/P/
 /
 /
r   c           	        g | ]}                     j        |         t          j        j                  sՉr.t
          j        t          j                 j	        |         j
        v sf                    j        |         j	        |                   sy                    j        |         t          j	        |                             s?t          j        j        r,j	        |         t
          j        t          j                 k    |
S r   )r  r   r   r5  Zeror   r   r   r   r   free_symbolsstatically_known_multiple_ofr   r>   rE   r+  r,  r   )r   idxblock_to_maxneeds_overflow_gridr   r  s     r   r	  zABlockDescriptorOptions.compute_boundary_check.<locals>.<listcomp>  s     
  
  
44T\#5FUU	 
 , 
 *5dkB+C0=> > %AA JsOT-=c-B 	> !) E E JsO&t'7'<lKK! !> H%> (-1J4;1WWW- , XWWr   N)rE   r&  r  r   r   r+  needs_yz_grid_overflowr9  r1  r   r   )r   r   r   rV  rW  r  s   `` @@@r   r:  z-BlockDescriptorOptions.compute_boundary_check  s     7#/
 /
 /
 /
 /
 /
 /
 "#ah&E{"S"STT 
  
  
  
  
  
  
S__-- 
  
  
r   c                "    | j         J | j         S rz   )r   r   s    r   boundary_checkz%BlockDescriptorOptions.boundary_check  s    #///##r   rv   c                    dS NFr   r   s    r   r   z#BlockDescriptorOptions.has_indirect      ur   c                >    t          d | j        D                       S )Nc              3  J   K   | ]}t          |t          j                  V  d S rz   )r   r   r   )r   r  s     r   r   z4BlockDescriptorOptions.has_rindex.<locals>.<genexpr>  sC       
 
  m&CDD
 
 
 
 
 
r   )r   r   r   s    r   r   z!BlockDescriptorOptions.has_rindex  s3     
 
(
 
 
 
 
 	
r   c                *    |                                  S rz   )r   r   s    r   r   z BlockDescriptorOptions.has_rmask  s       r   c                    dS r\  r   r   s    r   r   z"BlockDescriptorOptions.has_tmpmask  r]  r   c                D    t          |                                           S rz   )rv   rZ  r   s    r   r   zBlockDescriptorOptions.has_mask  s    D''))***r   valuer   initial_shapeallow_implicitc                   d t          | j        | j                  D             }t          |||          }t          j        j        |oHt          |          t          |          k    o(t          fdt          ||          D                       }t          | j                  r-|s+d| dt          j
                            | j                   d}t          || j        |          }|S )z
        Generate a broadcast and a reshape for the block descriptor.
        This restores stride-0 dimensions which were removed from the block descriptor.
        c                >    g | ]\  }}|rt           j        j        n|S r   )r   r5  r6  )r   r  is_broadcastings      r   r	  zHBlockDescriptorOptions.codegen_broadcast_and_reshape.<locals>.<listcomp>  s9     
 
 
$_ +3EGKK
 
 
r   c              3  v   K   | ]3\  }}                     |d           p                     ||          V  4dS rG   Nr  )r   pre_dimpost_dimr  s      r   r   zGBlockDescriptorOptions.codegen_broadcast_and_reshape.<locals>.<genexpr>  sf         &GX 00!<< G33GXFF     r   tl.broadcast_to(, ))r  r   r   triton_reshaperE   r&  r  r1  r'  r   r+  index_to_str)r   rc  rd  r   re  pre_broadcast_shapesupports_implicit_broadcastr  s          @r   codegen_broadcast_and_reshapez4BlockDescriptorOptions.codegen_broadcast_and_reshape  s'   
 
(+$d&<) )
 
 
 um5HII 7#&4 '
#$$K(8(88      *--@+)N)N     	$ t%&& 	_/J 	_^u^^0E0EdFZ0[0[^^^E ud&:KHHr   rw   r   )r   r   r   r   r   r   r   r   r   r   rw   r   r  r   r?  r   r   r   rw   r   r  r   rw   r   )r   r   r   r   rw   rx   )rw   r   r   )
rc  r   rd  r   r   r   re  rv   rw   r   )r{   r   r   r   r   r   r   r   r   r   r   r   r>  rE  rL  r:  rZ  r   r   r   r   r   rt  r   r   r   r   r      s          ))))!!!!%%%%+/O////! ! ! X! ' ' ' X' # # # X# # # # X# ] ] ] []~8 8 8 8   
2
 2
 2
 2
h$ $ $ $   
 
 
 
! ! ! !   + + + +) ) ) ) ) )r   r   c                      e Zd ZdddZdS )	TensorDescriptorOptionsTnamer   rw   c                
   t           j        j        }| j        dk    r| d || j                   dn|d || j                   d || j                   d || j                   g}dd                    |           dS )	a  
        Codegen a call to tl.make_tensor_descriptor()

        Args:
            name: variable name for pointer
            roffset: unused, but kept for compatibility with BlockPtrOptions.format()

        Returns:
            "tl.make_tensor_descriptor(...)"
        r    + (ro  shape=strides=block_shape=ztl.make_tensor_descriptor(rn  )rE   r+  rq  r   r   r   r   r   )r   rz  rD  fargss        r   formatzTensorDescriptorOptions.format  s     H! '1,, 77QQt3447777$QQtz]]$$(qq((011T-..00	
 ?DIIdOO>>>>r   NTrz  r   rw   r   )r{   r   r   r  r   r   r   ry  ry    s-        ? ? ? ? ? ? ?r   ry  c                  0    e Zd ZddZddZdddZddZdS )BlockPtrOptionsr  r   r?  r   r   rw   c                J    t           j        |         }t          |||i          S rA  rB  rC  s        r   rE  zBlockPtrOptions.replace_offset  rF  r   c                v    t           j        D ]+}|                     |t          j        d          |          },|S rH  rI  rK  s      r   rL  zBlockPtrOptions.remove_roffsets  rM  r   Trz  r   c           	     |    t           j        j        }g  j        }|s fd|D             } j        dk    r| d | j                   dn|d | j                   d | j                   d | j                   d | j                   d	 ||           g}d
d	                    |           dS )a  
        Codegen a call to tl.make_block_ptr()

        Args:
            name: variable name for pointer
            roffset: should rn_offset be included in offsets=..., for use with tl.advance()

        Returns:
            "tl.make_block_ptr(...)"
        c                :    g | ]}                     |          S r   )rL  )r   r   r   s     r   r	  z*BlockPtrOptions.format.<locals>.<listcomp>0  s'    JJJt++F33JJJr   r   r|  ro  r}  r~  r  zorder=zoffsets=ztl.make_block_ptr(rn  )
rE   r+  rq  r   r   r   r   r   r   r   )r   rz  rD  r  r   r  s   `     r   r  zBlockPtrOptions.format"  s    H!!DL/ 	KJJJJ'JJJG '1,, 77QQt3447777$QQtz]]$$(qq((011T-..00$QQtz]]$$#qqzz##
 7DIIdOO6666r   c                Z     t           j                  fd j        D             }|S )av  
        Codegen string to pass to tl.advance(name, ...).

        Advance is the difference between offsets in each loop iteration.
        To compute it, we replace rN_offset with multiples of RN_BLOCK.
        Since we expect rN_offset to vary in range(0, rN_numel, RN_BLOCK), the first
        iteration has rN_offset=0, while the second has rN_offset=RN_BLOCK.
        c                    g | ]@}                     |                               |t          j        j                  z
  AS r   )rE  r   r5  rR  )r   r   rblockr   r   s     r   r	  z3BlockPtrOptions.advance_roffset.<locals>.<listcomp>I  s\     
 
 

  ##FFD99%%feglDAAB
 
 
r   )r   r   r   )r   r   advancer  s   `` @r   advance_roffsetzBlockPtrOptions.advance_roffset?  sQ     *40
 
 
 
 
 

 ,
 
 
 r   Nrv  rw  r  r  )r   r   rw   r   )r{   r   r   rE  rL  r  r  r   r   r   r  r    si        8 8 8 8   
7 7 7 7 7:     r   r  rc  	old_shaper   	new_shapec                   t          |t                    rt          |t                    sJ d |D             }d |D             }||k    r| S d |D             |k    rd|  dd                    |           dS d}g }|D ]Y}|t          |          k     r'|||         k    r|                    d	           |d
z  }<|dk    sJ |                    d           Z|t          |          k    sJ |  dd                    |           dS )z<Workaround https://github.com/triton-lang/triton/issues/2836c                L    g | ]!}t           j                            |          "S r   rE   r+  rq  r   r   s     r   r	  z"triton_reshape.<locals>.<listcomp>Y  (    IIIeQX**511IIIr   c                L    g | ]!}t           j                            |          "S r   r  r  s     r   r	  z"triton_reshape.<locals>.<listcomp>Z  r  r   c                    g | ]
}|d k    |S )1r   )r   ss     r   r	  z"triton_reshape.<locals>.<listcomp>^  s    ---aAHHHHHr   ztl.reshape(z, [rn  z])r   :rG   r  rx   [])
isinstancer7  r   r1  append)rc  r  r  old_shape_strnew_shape_strrU  expandsizes           r   rp  rp  S  sS    i&&F:i+F+FFFFIIyIIIMIIyIIIM%%--=--->>CUCCtyy'?'?CCCC
CF " "]####c0B(B(BMM#1HCC3;;;;MM&!!!!#m$$$$$$**dii''****r   c                      e Zd Zd%dZd%dZd%dZd%dZd%d	Zd%d
Zd%dZ	d%dZ
d%dZd%dZd%dZd%dZd%dZd%dZd&dZd%dZd%dZd%dZd%dZd%dZd%dZd%dZd%dZd%dZd%dZd%dZd%d Zd%d!Zd%d"Zd%d#Z d$S )'TritonPrinterr  r   rw   r   c                    t          |j                  dk    sJ d|                     |j        d                    dt          j        j         dS )NrG   libdevice.trunc(r   ).to(ro  r1  r  _printrE   r+  index_dtyper   r  s     r   _print_TruncToIntzTritonPrinter._print_TruncToInts  M    49~~""""Vt{{49Q<88VVqx?SVVV	
r   c                b    t          j                    rt          j        j        r| }nd| d}|S )Nztl.full([], z, tl.float64))r   	is_fbcoder|   versionhip)r   r  rets      r   _print_FloatzTritonPrinter._print_Floaty  s=     	5%-"3 	5)CC4444C
r   c                    t          |j                  dk    sJ |                     |j        d         t          d         dz
            }| dS )NrG   r   Atom      ?z.to(tl.float64))r1  r  parenthesizer   )r   r  r  s      r   _print_ToFloatzTritonPrinter._print_ToFloat  sO    49~~""""dilJv,>,DEE$$$$r   c                    |j         \  }}|j        r1|j        r*|                     |j         dt          d         dz
            S |                     |          }|                     |          }d| d| dS )N % r  r  z!triton_helpers.remainder_integer(rn  ro  )r  is_nonnegative	stringifyr   r  r   r  quotdivquot_sdiv_ss         r   _print_PythonModzTritonPrinter._print_PythonMod  s    I	c 	N3#5 	N>>$)UJv4F4LMMMT""C  E6EEUEEEEr   c                    |j         sJ |j        \  }}|j        r1|j        r*|                     |j        dt          d         dz
            S |                     |          }|                     |          }d| d| dS )N // r  r  z!triton_helpers.div_floor_integer(z,  ro  )
is_integerr  r  r  r   r  r  s         r   _print_FloorDivzTritonPrinter._print_FloorDiv  s    I	c 	O3#5 	O>>$)VZ5G#5MNNNT""C  F6FFeFFFFr   c                V    |                      |j        dt          d         dz
            S )N / r  r  )r  r  r   r  s     r   _print_IntTrueDivzTritonPrinter._print_IntTrueDiv  s$    ~~di
60BS0HIIIr   c                    t          |j                  dk    sJ d|                     |j        d                    dt          j        j         dS NrG   libdevice.floor(r   r  ro  r  r  s     r   _print_floorzTritonPrinter._print_floor  r  r   c                    t          |j                  dk    sJ d|                     |j        d                    dt          j        j         dS r  r  r  s     r   _print_FloorToIntzTritonPrinter._print_FloorToInt  r  r   c                    t          |j                  dk    sJ d|                     |j        d                    dt          j        j         dS NrG   libdevice.ceil(r   r  ro  r  r  s     r   _print_ceilingzTritonPrinter._print_ceiling  K    49~~""""XTYq\!:!:XXAUXXXXr   c                    t          |j                  dk    sJ d|                     |j        d                    dt          j        j         dS r  r  r  s     r   _print_CeilToIntzTritonPrinter._print_CeilToInt  r  r   c                4    d|                      |           dS )Nzlibdevice.sqrt(().to(tl.float32)))r  r  s     r   _helper_sqrtzTritonPrinter._helper_sqrt  s    F$++d"3"3FFFFr   c                    d|                      |j        d                    d|                      |j        d                    dS )Nlibdevice.pow(r   rn  rG   ro  )r  r  r  s     r   _print_FloatPowzTritonPrinter._print_FloatPow  sB    VT[[166VV$++diPQl:S:SVVV	
r   c                .   |j         d         j        r?dt          |j         d                    d|                     |j         d                    dS d|                     |j         d                    d|                     |j         d                    dS )Nr   r  rn  rG   ro  )r  
is_Integerfloatr  r  s     r   _print_PowByNaturalz!TritonPrinter._print_PowByNatural  s    9Q<" 	XWE$)A,$7$7WW4;;tyQR|;T;TWWWWVT[[166VV$++diPQl:S:SVVV	
r   c                    |                      |j        d                   }|                      |j        d                   }|                      |j        d                   }d| d| d| dS )Nr   rG   r   	tl.where(rn  ro  )doprintr  )r   r  cpqs        r   _print_WherezTritonPrinter._print_Where  sh    LL1&&LL1&&LL1&&)1))))Q))))r   cmpc                   t          |j                  dk    r |                     |j        d                   S t          |j                  dz  }t          |          }|                      ||j        d|                    }|                      ||j        |d                    }t	          d ||fD                       \  }}|dv sJ d| d            d	| d
| d| d| d| d
| d| d| dS )zI
        Helper for max/min code generation.
        cmp: > or <
        rG   r   r   Nc              3  "   K   | ]
}d | dV  dS )(ro  Nr   r   r%  s     r   r   z6TritonPrinter._print_min_max_helper.<locals>.<genexpr>  s*      ..!XXXX......r   )><zUnexpected comparator: ''r  z * ( z= z) + )))r1  r  r  typetuple)r   r  r  midr   abs          r   _print_min_max_helperz#TritonPrinter._print_min_max_helper  s$   
 ty>>Q;;ty|,,,$)nn!4jjKKTYtt_-..KKTYstt_-.. ..1v.....1j   "CS"C"C"C   B1BB!BBcBBQBBABB1BBsBBQBBBBr   c                .    |                      |d          S )Nr  r  r  s     r   
_print_MinzTritonPrinter._print_Min      ))$444r   c                .    |                      |d          S )Nr  r  r  s     r   
_print_MaxzTritonPrinter._print_Max  r  r   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )NrG   tl_math.abs(r   ro  r1  r  r  r  s     r   
_print_AbszTritonPrinter._print_Abs  s>    49~~"""":dkk$)A,77::::r   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )NrG   zlibdevice.cos((r   r  r  r  s     r   _print_OpaqueUnaryFn_cosz&TritonPrinter._print_OpaqueUnaryFn_cos  >    49~~""""MTYq\!:!:MMMMr   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )NrG   zlibdevice.cosh((r   r  r  r  s     r   _print_OpaqueUnaryFn_coshz'TritonPrinter._print_OpaqueUnaryFn_cosh  >    49~~""""N$++dil";";NNNNr   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )NrG   zlibdevice.acos((r   r  r  r  s     r   _print_OpaqueUnaryFn_acosz'TritonPrinter._print_OpaqueUnaryFn_acos  r  r   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )NrG   zlibdevice.sin((r   r  r  r  s     r   _print_OpaqueUnaryFn_sinz&TritonPrinter._print_OpaqueUnaryFn_sin  r  r   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )NrG   zlibdevice.sinh((r   r  r  r  s     r   _print_OpaqueUnaryFn_sinhz'TritonPrinter._print_OpaqueUnaryFn_sinh  r  r   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )NrG   zlibdevice.asin((r   r  r  r  s     r   _print_OpaqueUnaryFn_asinz'TritonPrinter._print_OpaqueUnaryFn_asin  r  r   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )NrG   zlibdevice.tan((r   r  r  r  s     r   _print_OpaqueUnaryFn_tanz&TritonPrinter._print_OpaqueUnaryFn_tan  r  r   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )NrG   zlibdevice.tanh((r   r  r  r  s     r   _print_OpaqueUnaryFn_tanhz'TritonPrinter._print_OpaqueUnaryFn_tanh  r  r   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )NrG   zlibdevice.atan((r   r  r  r  s     r   _print_OpaqueUnaryFn_atanz'TritonPrinter._print_OpaqueUnaryFn_atan  r  r   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )NrG   zlibdevice.log2((r   r  r  r  s     r   _print_OpaqueUnaryFn_log2z'TritonPrinter._print_OpaqueUnaryFn_log2  r  r   c                    t          |j                  dk    sJ d|                     |j        d                    dt          j        j         dS )NrG   zlibdevice.llrint(r   r  ro  r  r  s     r   _print_RoundToIntzTritonPrinter._print_RoundToInt  sM    49~~""""WDIaL 9 9WW@TWWW	
r   c                    t          |j                  dk    sJ |j        \  }}|j        r|dk     sJ t          d| d          |                     |t
          d                   }d| d| d|  S )	Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulzlibdevice.nearbyint(1e * z) * 1e)r1  r  r  
ValueErrorr  r   )r   r  numberndigits
number_strs        r   _print_RoundDecimalz!TritonPrinter._print_RoundDecimal  s    49~~"""") 	Q;;;;lbilll   &&vz%/@AA
PPPJPPwhPPPr   N)r  r   rw   r   )r  r   r  r   rw   r   )!r{   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r  r  r  r  r  r  r#  r   r   r   r  r  r  s       
 
 
 
   % % % %
F F F FG G G GJ J J J

 
 
 

 
 
 
Y Y Y YY Y Y YG G G G
 
 
 


 
 
 
* * * *C C C C&5 5 5 55 5 5 5; ; ; ;N N N NO O O OO O O ON N N NO O O OO O O ON N N NO O O OO O O OO O O O
 
 
 
Q Q Q Q Q Qr   r  dtypetorch.dtypec                :    t          t          |                     S )zCConvert torch.dtype to triton type and upcast [b]float16 to float32)r?   rA   r$  s    r   triton_compute_typer(  "  s    *511222r   c                X    | t           j        k    rt           j        } t          |           S )z@Convert torch.dtype to triton type, with fix for storing tl.bool)r|   rv   int8r?   r'  s    r   triton_store_typer+  '  s$    

ur   c                z    t          |           r| j        r| j        dk    rt          j        S t          |           S )z0Implicit upcasts used for Triton reduction types   )r   	is_signeditemsizer|   int32rA   r'  s    r   upcast_acc_dtyper1  .  s=     5? u~7J7J{u%%%r   c                :    t          t          |                     S )z:Convert torch.dtype to triton type, with reduction upcasts)r(  r1  r'  s    r   triton_acc_typer3  5  s    /66777r   rv   c                &    | j         dk    o| j        S )Nr   )r/  is_floating_pointr'  s    r   low_precision_fpr6  :  s    >Q:5#::r   varUnion[CSEVariable, Any]c                    t          | t                    sdS | j        }t          |t          j                  rt	          |          ndS r\  )r  rM   r$  r|   r6  )r7  r$  s     r   low_precision_fp_varr:  >  sE    c;'' uIE&0&D&DOE"""%Or   c                  *     e Zd Z	 dd fdZd Z xZS )TritonCSEVariableNrz  r   boundsValueRanges[Any]r$  r%  r   ri   rw   rx   c                    t                                          ||||           t                      | _        |
J d            d S )Nr   z!TritonCSEVariable must have dtype)super__init__r   r   )r   rz  r=  r$  r   	__class__s        r   rB  zTritonCSEVariable.__init__G  sL     	vuE:::*4,,  "E     r   c                B   |D ]}t          |t                    r | j                            |j                   7t          |t          j                  rJt          j        D ]=}t          ||          r+| j                            t          |          dg            n>d S )Nr   )
r  r<  r   updater   r   r   r   r   r   )r   rz  r  kwargsargr   s         r   update_on_argsz TritonCSEVariable.update_on_argsU  s     
	 
	C#011 	%%cm4444C..  *5  D%c400 --*T2B/H/H/H.IJJJ
	 
	r   rz   )
rz  r   r=  r>  r$  r%  r   ri   rw   rx   )r{   r   r   rB  rH  __classcell__rC  s   @r   r<  r<  F  s]         !%
F 
F 
F 
F 
F 
F 
F      r   r<  rg   c                 "    ddl m}   |             S )Nr   rf   )!torch._inductor.dtype_propagationrg   rf   s    r   get_dtype_handlerrM  c  s#    LLLLLL%%'''r   Tru   Callable[[_T], _T]c                0     d	dd
fdd fd}|S )z
    Codegen helper to upcast arguments to float32, depending on the config and dtype.
    This decorates tl.math/libdevice codegen functions.
    rw   rv   c                    t           j        j         o3t          | t                    o| j        t          j        t          j        fv S rz   )	r   r   codegen_upcast_to_fp32r  rM   r$  r|   float16bfloat16)r7  s    r   needs_upcastz*maybe_upcast_float32.<locals>.needs_upcasto  s<    44 =3,,=	emU^<<	
r   r   c                .     |           rdnd}|  | S )N.to(tl.float32)r   r   )r7  upcast_stringrT  s     r   maybe_upcast_argz.maybe_upcast_float32.<locals>.maybe_upcast_argv  s.    -9\#->->F))B&}&&&r   rs   Callable[..., Any]c                R     t                                           d fd}|S )Nrw   r   c                    fd| D             }fd|                                 D             } 
|i |}	o?t          fdt          j        | |                                          D                       }|sd n& t          t                      
j                  | i |}|t          j	        d fv}|r|dt          |           dnd}| | S )Nc                &    g | ]} |          S r   r   )r   rG  rX  s     r   r	  zLmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<listcomp>  s%    AAAS++C00AAAr   c                .    i | ]\  }}| |          S r   r   )r   r!  r"  rX  s      r   r   zLmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<dictcomp>  s+    WWWHCS"2"23"7"7WWWr   c              3  .   K   | ]} |          V  d S rz   r   )r   r7  rT  s     r   r   zKmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<genexpr>  s>       6 6&)S!!6 6 6 6 6 6r   .to(ro  r   )r*  r   	itertoolschainvaluesgetattrrM  r{   r|   r}   r?   )r  rF  upcast_argsupcast_kwargsr=  any_needs_upcastresult_dtypeneeds_downcastdowncast_stringru   rs   rX  rT  s            r   wrappedz8maybe_upcast_float32.<locals>.decorator.<locals>.wrapped~  s6   AAAADAAAKWWWWWWWM T;8-88F-  # 6 6 6 6-6_T6==??-S-S6 6 6 3 3
 (R@W.00$-@@$Q&QQ 
 *%-1FFN "&2&> 4{<003333 
 /o///r   r   )rp   r   )rs   rj  ru   rX  rT  s   ` r   	decoratorz'maybe_upcast_float32.<locals>.decoratorz  sN    &&t^<<<	0 	0 	0 	0 	0 	0 	0 	0 	0. r   r   r   )rs   rY  rw   rY  r   )ru   rk  rX  rT  s   ` @@r   maybe_upcast_float32rl  i  sg    
 
 
 
' ' ' ' ' '       : r   c                  H   e Zd ZdZ ej        ej                  Ze	 	 dRdSd            Z	edTd	            Z
ed
             Zed             Ze e            d                         Zed             Zed             Ze e            d                         Ze e            d                         Ze e            d                         Ze e            d                         Zed             Zed             Zed             Zed             Zedej        dddd            Ze e            d                         Ze e            d                         Zed             Zed             Z e e            d                         Z!e e            d                         Z"e e            d                          Z#e e            d!                         Z$e e            d"                         Z%e e            d#                         Z&e e            d$                         Z'e e            d%                         Z(e e            d&                         Z)e e            d'                         Z*e e            d(                         Z+e e            d)                         Z,e e            d*                         Z-e e            d+                         Z.e e            d,                         Z/e e            d-                         Z0e e            d.                         Ze e            d/                         Z1ed0             Z2ed1             Z3ed2             Z4ed3             Z5ed4             Z6ed5             Z7ed6             Z8ed7             Z9ed8             Z:ed9             Z;ed:             Z<ed;             Z=ed<             Z>ed=             Z?e e            d>                         Z@e e            d?                         ZAe e            d@                         ZBe e            dA                         ZCe e            dB                         ZDedC             ZEe e            dD                         ZFe e            dE                         ZGe e            dF                         ZHe edGH          dI                         ZIe edGH          dJ                         ZJe e            dK                         ZKe e            dL                         ZLedM             ZMedN             ZNe e            dO                         ZOedP             ZPe e            dQ                         ZQdS )UTritonOverrideszMap element-wise ops to TritonNTr$  r%  	src_dtypeOptional[torch.dtype]c                <   dd}|8t           |||          t          j        j                  t          j        _        |t          j        k    rd|  dS |t          j        k    r||j        s||  d	S |rt          |          }nt          |          }|  d
| dS )Nro  r%  	dst_dtyperw   intc                    | |k    rdS t           j        t           j        f}| |v r||v r| |k    r
J d            | t           j        k    s|t           j        k    rdS | t           j        k    s|t           j        k    rdS dS )Nr   zCConversions between float8_e5m2 and float8_e4m3fn is not supported!r-  r   )r|   float8_e4m3fnfloat8_e5m2)ro  rr  
fp8_dtypess      r   _get_min_elements_per_threadz>TritonOverrides.to_dtype.<locals>._get_min_elements_per_thread  s     I%%q #!J Z''++***T +*
 E---e>O1O1OqE///9@S3S3Sq1r   r  z != 0)z.to(tl.int16).to(tl.uint8)r_  ro  )ro  r%  rr  r%  rw   rs  )
maxrE   r+  min_elem_per_threadr|   rv   uint8r5  r(  r+  )r%  r$  ro  use_compute_typesrx  	out_dtypes         r   to_dtypezTritonOverrides.to_dtype  s    	 	 	 	6   ,/,,Y>>,, ,AH(
 EJ q=== ek!!!i&A!YEV 3333 	1+E22II)%00I%%%%%%r   c                    |j         |j         k    sJ | j        |k    r|  dt          |           d} |  dt          |           d}t          |          |k    r"| dt          t          |                     d}|S )Nr_  ro  z, bitcast=True))r/  r$  r?   rA   )r%  r$  ro  outs       r   to_dtype_bitcastz TritonOverrides.to_dtype_bitcast  s    !U^3333 7i33+i00333A;;E**;;;u%%..HHk*=e*D*DEEHHHC
r   c           	        t           j                            |          }t           ||                     }t	          |          }|dk    r|S | dk     r#|j        sd|dd           }d| d| d| d| d	S d| d| d| dS )	Nz
tl.float32r   ztl.r-  tl.full(rn  r  ro  )r|   _prims_commondtype_to_typerX   r(  r.  )rc  r$  r   type_
triton_valr?   triton_signed_types          r   _shaped_constantz TritonOverrides._shaped_constant  s    #11%88"55<<00
)%00,&& 199U_9!8{122!8!8\e\\z\\5G\\k\\\\CeCCzCC[CCCCr   c                2    |                      ||g           S )Nr@  )r  )r   rc  r$  s      r   constantzTritonOverrides.constant  s    ##E5#;;;r   c                    d|  dS )Nr   ro  r   r%  s    r   abszTritonOverrides.abs       #a""""r   c                    d|  d| d}t          |           st          |          rQt                                          | |          }|t          j        t          j        fv r| dt          |           d}|S )Nr  r  ro  r_  )r:  rM  truedivr|   rR  r}   r?   r%  yr  r}  s       r   r  zTritonOverrides.truediv  s    !nnnnn"" 	<&:1&=&= 	<)++33Aq99IU]EM:::;;+i"8"8;;;
r   c                    d|  d| d}t          |           st          |          rQt                                          | |          }|t          j        t          j        fv r| dt          |           d}|S )Nr  r  ro  r_  )r:  rM  modr|   rR  r}   r?   r  s       r   r  zTritonOverrides.mod  s    !nnnnn"" 	<&:1&=&= 	<)++//155IU]EM:::;;+i"8"8;;;
r   c                2    t           j        rd|  dS d|  dS )z
        When use_fast_math, use the ftz (flushing to zero) variant
        of exponent computation.

        Check https://github.com/triton-lang/triton/issues/5735 for
        more details.
        ztl_math.exp(ro  zlibdevice.exp()r   use_fast_mathr  s    r   expzTritonOverrides.exp  s1      	)&!&&&&(A((((r   c                    d|  dS )Nzlibdevice.exp2(ro  r   r  s    r   exp2zTritonOverrides.exp2-       &%%%%r   c                    d|  dS )Nzlibdevice.expm1(ro  r   r  s    r   expm1zTritonOverrides.expm12       '!&&&&r   c                    d|  dS )Nzlibdevice.sqrt(ro  r   r  s    r   sqrtzTritonOverrides.sqrt7  r  r   c                    t           j        j        }|dk    rdS |dk    r	d|  d|  dS |dk    r|  dS |2t          j        t          j        d	t          j                  |           S t          d
|          )Ncompile_errorzcompile error!runtime_errorz"triton_helpers.device_assert_then(z == 0, "injected assert fail", ro  accuracyz + 1r   z:unrecognized config triton.inject_relu_bug_TESTING_ONLY = )	r   r   inject_relu_bug_TESTING_ONLYopsmaximumr  r|   r0  AssertionError)r%  bugs     r   reluzTritonOverrides.relu<  s    m8/!!##O## _^^Z[^^^^J:::[;s|Au{;;Q??? TSTT  r   c                    d|  d| dS )Nztriton_helpers.minimum(rn  ro  r   r  r  s     r   minimumzTritonOverrides.minimumN      222a2222r   c                    d|  d| dS )Nztriton_helpers.maximum(rn  ro  r   r  s     r   r  zTritonOverrides.maximumR  r  r   c                    d|  d| d| dS )Nr  rn  ro  r   )r  r  r  s      r   wherezTritonOverrides.whereV  s#    )1))))Q))))r   rG   )constraintsr$  is_purepackc                    t          |          }d                    d |D                       }|#d                    dgd |D             z             }d|  d| d| d| d	| d
| dS )Nrn  c                ,    g | ]}t          |          S r   r   r   is     r   r	  z:TritonOverrides.inline_asm_elementwise.<locals>.<listcomp>_  s    7771A777r   z=rc                    g | ]}d S )r   r   )r   _s     r   r	  z:TritonOverrides.inline_asm_elementwise.<locals>.<listcomp>a  s    -B-B-Bac-B-B-Br   ztl.inline_asm_elementwise('z', 'z', [z	], dtype=z
, is_pure=z, pack=ro  )r(  r   )asmr  r$  r  r  inputsr?   
input_refss           r   inline_asm_elementwisez&TritonOverrides.inline_asm_elementwiseZ  s     *%00YY7777788
))TF-B-B6-B-B-B$BCCK KS  K  Kk  K  Kz  K  K\g  K  Ksz  K  K  DH  K  K  K  	Kr   c                    d|  dS )Nztl_math.cos(ro  r   r  s    r   coszTritonOverrides.cosd  r  r   c                    d|  dS )Nztl_math.sin(ro  r   r  s    r   sinzTritonOverrides.sini  r  r   c                     t          d          )Nz/ops.index_expr not implemented outside a kernelNotImplementedError)r   r  r$  s      r   
index_exprzTritonOverrides.index_exprn  s    !"STTTr   c                     t          d          )Nz+ops.masked not implemented outside a kernelr  )r   bodyothers      r   maskedzTritonOverrides.maskedr  s    !"OPPPr   c                    d|  dS )Nzlibdevice.lgamma(ro  r   r  s    r   lgammazTritonOverrides.lgammav       (1''''r   c                    d|  dS )Nzlibdevice.erf(ro  r   r  s    r   erfzTritonOverrides.erf{       %$$$$r   c                    d|  dS )Nzlibdevice.cosh(ro  r   r  s    r   coshzTritonOverrides.cosh  r  r   c                    d|  dS )Nzlibdevice.sinh(ro  r   r  s    r   sinhzTritonOverrides.sinh  r  r   c                    d|  dS )Nzlibdevice.acos(ro  r   r  s    r   acoszTritonOverrides.acos  r  r   c                    d|  dS )Nzlibdevice.acosh(ro  r   r  s    r   acoshzTritonOverrides.acosh  r  r   c                    d|  dS )Nzlibdevice.asin(ro  r   r  s    r   asinzTritonOverrides.asin  r  r   c                    d|  dS )Nzlibdevice.asinh(ro  r   r  s    r   asinhzTritonOverrides.asinh  r  r   c                    d|  d| dS )Nzlibdevice.atan2(rn  ro  r   r%  r  s     r   atan2zTritonOverrides.atan2       ,!++q++++r   c                    d|  dS )Nzlibdevice.atan(ro  r   r  s    r   atanzTritonOverrides.atan  r  r   c                    d|  dS )Nzlibdevice.atanh(ro  r   r  s    r   atanhzTritonOverrides.atanh  r  r   c                    d|  d| dS )Nzlibdevice.copysign(rn  ro  r   r  s     r   copysignzTritonOverrides.copysign  s     /Q..!....r   c                    d|  dS )Nzlibdevice.erfc(ro  r   r  s    r   erfczTritonOverrides.erfc  r  r   c                    d|  dS )Nzlibdevice.erfinv(ro  r   r  s    r   erfinvzTritonOverrides.erfinv  r  r   c                    d|  d| dS )Nzlibdevice.hypot(rn  ro  r   r  s     r   hypotzTritonOverrides.hypot  r  r   c                    d|  dS )Nzlibdevice.log10(ro  r   r  s    r   log10zTritonOverrides.log10  r  r   c                    d|  dS )Nzlibdevice.log2(ro  r   r  s    r   log2zTritonOverrides.log2  r  r   c                    d|  d| dS )Nzlibdevice.nextafter(rn  ro  r   r  s     r   	nextafterzTritonOverrides.nextafter  s     0a//1////r   c                    |  d| S Nr   r   r  s     r   logical_andzTritonOverrides.logical_and      ||||r   c                    |  dS )Nz == 0r   r  s    r   logical_notzTritonOverrides.logical_not  s    {{{r   c                    |  d| S Nz | r   r  s     r   
logical_orzTritonOverrides.logical_or  r  r   c                    d|  d| dS )Nr   ^ ro  r   r  s     r   logical_xorzTritonOverrides.logical_xor  s    1~~~~~r   c                    |  d| S r  r   r  s     r   bitwise_andzTritonOverrides.bitwise_and  r  r   c                    d|  S )N~r   r  s    r   bitwise_notzTritonOverrides.bitwise_not  s    1wwr   c                    |  d| S r  r   r  s     r   
bitwise_orzTritonOverrides.bitwise_or  r  r   c                    |  d| S )Nr  r   r  s     r   bitwise_xorzTritonOverrides.bitwise_xor  r  r   c                    |  d| S )Nz << r   r  s     r   bitwise_left_shiftz"TritonOverrides.bitwise_left_shift      }}}}r   c                    |  d| S )Nz >> r   r  s     r   bitwise_right_shiftz#TritonOverrides.bitwise_right_shift  r  r   c                     d| d}d|  d| dS )Nr  ).to(tl.uint32)ztl.rand(rn  ro  r   seedr   s     r   randzTritonOverrides.rand  s+    ,V,,,+$++&++++r   c                     d| d}d|  d| dS )Nr  r  z	tl.randn(rn  ro  r   r  s     r   randnzTritonOverrides.randn  s+    ,V,,,,4,,6,,,,r   c           	     ,    d| d}d|  d| d| d| d	S )Nr  r  ztriton_helpers.randint64(rn  ro  r   )r  r   lowhighs       r   	randint64zTritonOverrides.randint64  s;    ,V,,,K4KK6KKSKKDKKKKr   c                     t          d          )Nz.ops.load_seed not implemented outside a kernelr  )rz  r   s     r   	load_seedzTritonOverrides.load_seed  s    !"RSSSr   c                    d|  dS )Nzlibdevice.rsqrt(ro  r   r  s    r   rsqrtzTritonOverrides.rsqrt  r  r   c                    d|  dS )Nzlibdevice.log1p(ro  r   r  s    r   log1pzTritonOverrides.log1p  r  r   c                    d|  dS )Nzlibdevice.tan(ro  r   r  s    r   tanzTritonOverrides.tan  r  r   c                    d|  dS )Nzlibdevice.tanh(ro  r   r  s    r   tanhzTritonOverrides.tanh  r  r   c                    d|  dS )Nztl.sigmoid(ro  r   r  s    r   sigmoidzTritonOverrides.sigmoid  s     "Q!!!!r   c                    d|  d|  d|  dS )Nz(libdevice.signbit(z) != 0) if (z).dtype is tl.float32 else z < 0r   r  s    r   signbitzTritonOverrides.signbit$  s(     W!VVVVqVVV	
r   c                    d|  d| dS )Nzlibdevice.fmod(rn  ro  r   r  s     r   fmodzTritonOverrides.fmod+  s     +**a****r   c                    d|  d| dS )Nr  rn  ro  r   r  s     r   powzTritonOverrides.pow0  s     *))Q))))r   c                    d|  dS )Nztl_math.log(ro  r   r  s    r   logzTritonOverrides.log5  r  r   F)ru   c                    d|  dS )Nzlibdevice.isinf().to(tl.int1)r   r  s    r   isinfzTritonOverrides.isinf:       3!2222r   c                    d|  dS )Nzlibdevice.isnan(r/  r   r  s    r   isnanzTritonOverrides.isnan?  r1  r   c                    d|  dS )Nzlibdevice.nearbyint(ro  r   r  s    r   roundzTritonOverrides.roundD  s     +a****r   c                    d|  dS )Nr  ro  r   r  s    r   floorzTritonOverrides.floorI  r  r   c                H    |  d| }|  d| }d|  d| d| d| d| d| d	S )
Nr  r  z
tl.where((z
 < 0) != (z < 0), tl.where(z != 0, z - 1, ), ro  r   )r  r  r  rems       r   floordivzTritonOverrides.floordivN  s]    
 }}}}llqlldAddddCddddTXdd]addddr   c                B   t          j        dt          j                  }t          j        t          j        ||           t          j                  }t          j        t          j        | |          t          j                  }t          j        ||          }| d|  dS )Nr   r_  .dtype))r  r  r|   r0  r~  ltr*  sub)r%  zleftrightr?  s        r   signzTritonOverrides.signW  sw    LEK((|SVAq\\EJ77cfQllUZ88gdE""%%1%%%%r   c                    d|  dS )Nr  ro  r   r  s    r   trunczTritonOverrides.trunc_  r  r   c                    |  d| S )Nr  r   r  s     r   truncdivzTritonOverrides.truncdivd  s     }}}}r   c                    d|  dS )Nr  ro  r   r  s    r   ceilzTritonOverrides.ceilj  r  r   )NT)r$  r%  ro  rp  )r$  r%  ro  r%  )Rr{   r   r   r   mathr  e_LOG_2_Estaticmethodr~  r  r  r   r  rl  r  r  r  r  r  r  r  r  r  r  r  r|   r}   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r	  r  r  r  r  r  r  r  r  r!  r#  r%  r'  r)  r+  r-  r0  r3  r5  r7  r;  rC  rE  rG  rI  r   r   r   rn  rn    s
       ((ty  H ,0	:& :& :& :& \:&x    \  D D \D" < < [< # #  \#   \   \ ) )  \) & &  \& ' '  \' & &  \&   \" 3 3 \3 3 3 \3 * * \* "&emTPQK K K K \K # #  \# # #  \# U U [U Q Q \Q ( (  \( % %  \% & &  \& & &  \& & &  \& ' '  \' & &  \& ' '  \' , ,  \, & &  \& ' '  \' / /  \/ & &  \& ( (  \( , ,  \, ' '  \' & &  \& 0 0  \0   \   \   \   \   \   \   \   \   \   \ , , \, - - \- L L \L T T \T ' '  \' ' '  \' % %  \% & &  \& " "  \" 
 
 \
 + +  \+ * *  \* # #  \# ///3 3 0/ \3 ///3 3 0/ \3 + +  \+ ' '  \' e e \e & & \& ' '  \'   \
 & &  \& & &r   rn  r   c                       e Zd ZdZ fdZeej        d                         Zed             Z	ed             Z
ed             Zed             Zed             Zed	             Z xZS )
TritonKernelOverridesa   Map element-wise ops to Triton within a TritonKernel

    Unlike TritonOverrides, these assume the code is going to be inserted into
    the body of the main triton kernel and so it may use indexing and mask
    variables which are assumed to already be defined in the current scope.
    c                b     t                      j        |i | |                                  d S rz   )rA  rB  _setup_libdevice_routing)r   r  rF  rC  s      r   rB  zTritonKernelOverrides.__init__{  s9    $)&))) 	%%'''''r   c                   ddl m t          j        j        j        D ]}t          | |          sJ t          | |          }fd}|dk    rOt          d          sJ t          j	        |||          }||_
        t          | |t          |                     ~d }t          j	        |||          }||_
        t          | |t          |                     dS )z<Set up routing to libdevice implementations for fp64 inputs.r   )OpDecompositionsc                    | j         t          j        k    r ||           S  t          |          |           j        S rz   )r$  r|   r~   rc  rc  )r%  _original_impl_fn_namerS  s      r   decomposition_routerzLTritonKernelOverrides._setup_libdevice_routing.<locals>.decomposition_router  s@    7em++)>!,,,>7#3X>>qAAGGr   r%  )rU  rV  c                T    | j         t          j        k    r	d| d|  dS  ||           S )Nz
libdevice.r  ro  )r$  r|   r~   )r%  rU  rV  s      r   dtype_routerzDTritonKernelOverrides._setup_libdevice_routing.<locals>.dtype_router  s:    7em++77717777)>!,,,r   N)torch._inductor.codegen.commonrS  r|   	_inductorutilsop_requires_libdevice_fp64r   rc  	functoolspartialr{   setattrrM  )r   fn_nameoriginal_implrW  fnrY  rS  s         @r   rQ  z.TritonKernelOverrides._setup_libdevice_routing  s:   
 	DCCCCC,G 	4 	4G3(((((#C11MH H H H H )##/;;;;;&(QX   &Wl2&6&6777- - - "]W  B "BKC,r"2"23333;	4 	4r   c                z    t           j                                        }dg|z  }|                     |||          S )NrG   r@  )rE   r+  triton_tensor_ndimr  )r   rc  r$  ndimr   s        r   r  zTritonKernelOverrides.constant  s=    
 x**,,d
##E5#>>>r   c                   t           j                            |dd           }t          |t                    sJ t           j                                        }|t          j        t          j        fvr|n|}t          j
        j        }	 dt          j
        _        t           j        j                            t           j        j        |j        t!          |          ||j                  }|t          j
        _        n# |t          j
        _        w xY w|t          j        t          j        fvr^t           j        j                            t           j        j        |                     ||          t'          |          |j                  }n|}|j        D ]U}t-          |t.          j                  r9t          j        |t           j        j        j        |j                 j                  }V||k    rPt           j        j                            t           j        j        |                     ||          ||j                  }|j        |_        |S )NF	block_ptrtma_compatibility_checkerr=  r$  r   r$  r   )rE   r+  indexingr  r   get_index_dtype_as_torch_dtyper|   r0  int64r   test_configsruntime_triton_dtype_assertcsegeneratecomputer   r6   r   r~  rA   r   rS  r   r   r   promote_typesvarname_maprz  r$  r   )r   r  r$  rm  r  origr7  	index_vars           r   r  z TritonKernelOverrides.index_expr  s   8$$ET % 
 
 (O44444 h==??u{EK&@@@k ">
	C>CF;(,'' ",T22+ (  C ?CF;;dF;BBBBek222(,'' S%(()%00i	 (  CC  E!.  	!)TX66 !/qx|7	GM E ##hl++H$LLk22%)	 ,   !*
s   A C: :Dc           
        | Zt           j        j        It          j        j                            t          j        j        |  dt           j        | j	                  } |j
                            d          }|s
J d            d}|D ]5}|j        D ]+}|j        dk    st          |j        d                   rd	} n,6|rd n|}t          j                            | |
          5 } |            }	d d d            n# 1 swxY w Y   |r|	j        j        rt          |          }t          j        j                            t          j        j        d|	 dt%          |           d|	 dt'          j        |          |	j        |	j	                  }t-          j        ||	|          }
n|	}
|
j                            |           |
S )N.to(tl.int1)rl  output)opz)graph for body does not contain an outputFloadrG   Trc  r  z.shape, rn  r=  rk  )r|   r  r  rE   r+  rr  rs  rt  rv   r   r&  
find_nodesr  targetra   
mask_loadsr=  is_boolrX   r   wrapr$  r  r  r   discard)r   r  r  nodes
need_wherenoderG  rc  new_maskr=  r  s              r   r  zTritonKernelOverrides.masked  s    1 =8<(( %%%jj	 )  D 
%%%22AAAAAu
  	 	Dy  :''+CCHQK+P+P'!%JE ( #-X  U 33 	xTVVF	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  	}$ $UHL)) R6RR=+?+?RR6RRR"'..ll *  E )Hfe44CCCh'''
s   %C<<D D c                    t           j        j                            |           }d| dt           j        j                            d|           dS )Ntl.load( + load_seed_offsetro  )rE   r+  r  inputseed_offset)rz  r   r7  s      r   r  zTritonKernelOverrides.load_seed  sI    hm!!$''WsWWqx}889KVTTWWW	
r   c                   d|  d}t           j        j                            |          x}r|S t           j        j                            | j        | j                  }t           j        j                            t          j        | j                  }t           j        j	        
                    | d| d|  d           t           j        j                            |||f           ||fS )Nzfrexp(ro  rl  rn  z = triton_helpers.frexp()rE   r+  rr  try_getnewvarr$  r   r|   r0  rt  r   put)r%  	cache_keycse_valmantissaexponents        r   frexpzTritonKernelOverrides.frexp  s    !QMMM	hl**95557 	N8<&&QWAG&DD8<&&U[&HH	""AA8AAQAAA	
 	
 	
 	
Xx$8999(##r   c                .    d|  dt          |           dS )Nztl.device_assert(rn  ro  )repr)condmsgs     r   device_assert_asyncz)TritonKernelOverrides.device_assert_async-  s!    74774997777r   )r{   r   r   r   rB  r   r^  cacherQ  r  r  rM  r  r  r  r  rI  rJ  s   @r   rO  rO  s  s        ( ( ( ( ( _"4 "4 _ ["4H ? ? [? 5 5 [5n , , \,\ 
 
 \
 $ $ \$ 8 8 \8 8 8 8 8r   rO  c                  J    e Zd ZU dZded<   ded<   ddZd	d
ddZd Zd ZdS )HelperFunctionsz#An ordered set of helper functions.zdict[str, str]_templates_seen	list[str]finalized_helpersrw   rx   c                "    i | _         g | _        d S rz   )r  r  r   s    r   rB  zHelperFunctions.__init__8  s    !!#r   _triton_helper_fn	base_nametemplate_coder   c                   | j                             |          }||S | t          | j                   }|| j         |<   | j                            |                    |                     |S )a9  This accepts a function definition with the function name
        left as a format specifier e.g.

            @triton.jit
            def {name}(arg0, arg1):
                return arg0 + arg1

        We add the templated code to the function set and return the name
        assigned to that function.

        N)rz  )r  getr1  r  r  r  )r   r  r  existing_namerz  s        r   addzHelperFunctions.add<  s{     ,00??$  :S!788::.2]+%%m&:&:&:&E&EFFFr   c                *    t          | j                  S rz   )iterr  r   s    r   __iter__zHelperFunctions.__iter__R  s    D*+++r   c                    | j         |         S rz   )r  )r   rU  s     r   __getitem__zHelperFunctions.__getitem__U  s    %c**r   Nrw   rx   )r  r   rw   r   )	r{   r   r   r   r   rB  r  r  r  r   r   r   r  r  2  s         --####    $ $ $ $ 4G      ,, , ,+ + + + +r   r  c                      e Zd ZU dZ ej        e          Zded<    ej        e          Z	ded<    ej        e          Z
ded<    ej        e          Zded<   dd
ZdS )r   zM
    Class representing ND block dimensions, for block pointer analysis.
    )default_factoryr   r   r   r   r   r  rw   c                    t          |           }t          d | |fD                       \   |di fdD             S )z0
        Concatenates block parameters.
        c              3  >   K   | ]}t          j        |          V  d S rz   )r(  r)  r  s     r   r   z*BlockParameters.__add__.<locals>.<genexpr>i  s-      BBq['**BBBBBBr   c                4    i | ]}||         |         z   S r   r   )r   r!  r  r  s     r   r   z+BlockParameters.__add__.<locals>.<dictcomp>j  s'    888sc1S6AcF?888r   r   )r  r  )r   r  r   r  r  s      @@r   __add__zBlockParameters.__add__d  sc     4jjBBT5MBBBBB1s9988888a888999r   N)r  r   rw   r   )r{   r   r   r   r(  fieldr7  r   r   r   r   r   r  r   r   r   r   r   Y  s           0k/EEEEEEEE$5K$5d$K$K$KKKKKK 1 1$ G G GGGGGG 1 1$ G G GGGGGG: : : : : :r   r   c                  ,    e Zd ZdZd Zd	dZd Zd ZdS )
"CooperativeReductionWorkspaceCachez
    The scratch space used for cooperative reductions can be reused
    after two reduction loops.  This keeps track of what can be reused.
    c                    || _         g | _        g | _        t          j        t          j                  | _        d| _        d| _        d S rH  )	r  current_loop
prior_loopcollectionsdefaultdictdequeready_for_reuse
loop_countstore_count)r   r  s     r   rB  z+CooperativeReductionWorkspaceCache.__init__s  sC    	*6{7HIIr   nbytesr   c                    | j                             |          }|r|                                S | j                            |d          \  }}| j                            |||f           ||fS r\  )r  r  popleftr  	workspacer  r  )r   r  cachedws_name	ws_offsets        r   allocatez+CooperativeReductionWorkspaceCache.allocate{  su    %))&11 	$>>###!Y00??  &'9!=>>>##r   c                    | j         D ](\  }}}| j        |                             ||f           )| j        | _         g | _        | xj        dz  c_        d S NrG   )r  r  r  r  r  )r   r  r  r  s       r   on_loop_endz.CooperativeReductionWorkspaceCache.on_loop_end  sf    *./ 	F 	F&FGY (//)0DEEEE+1r   c                4    | j         }| xj         dz  c_         |S r  )r  )r   priors     r   increment_store_countz8CooperativeReductionWorkspaceCache.increment_store_count  s#     Ar   N)r  r   )r{   r   r   r   rB  r  r  r  r   r   r   r  r  m  s_         
  $ $ $ $      r   r  c                  &    e Zd ZU ded<   d Zd ZdS )FixedTritonConfigzdict[str, int]r   c                    | j         |         S rz   r   r   r  s     r   r  zFixedTritonConfig.__getitem__  s    {4  r   c                    || j         v S rz   r  r  s     r   __contains__zFixedTritonConfig.__contains__  s    t{""r   N)r{   r   r   r   r  r  r   r   r   r  r    s@         ! ! !# # # # #r   r  c                      e Zd ZdZddZdS )		TritonCSEz
    Subclasses CSE to apply the current load mask to the cache key to avoid CSEing
    variables across separate masked blocks.
    r  r   rw   Union[str, tuple[str, str]]c                >    t           j        j        x}r	||j        fS |S rz   )rE   r+  
_load_maskrz  )r   r  r   s      r   augment_keyzTritonCSE.augment_key  s'    8&&4 	ty))r   N)r  r   rw   r  )r{   r   r   r   r  r   r   r   r  r    s2         
     r   r  c                  H    e Zd ZU dZded<   ded<   ded<   d Zdd
ZddZdS )TMACompatibilityCheckerzO
    Checks if the TMA API can be used for load / store triton operations.
    TritonKernelr+  r%  r$  rv   	for_storec                    d| _         d S )Nz2Cannot use TMA descriptor for load / store since: )failed_debug_prefixr   s    r   __post_init__z%TMACompatibilityChecker.__post_init__  s    #W   r   rw   c                   t           j                                        j        dk    rSt          j                                        d         dk    r+t          j        j	        rt          j
        rt                      s"t                              d| j                   dS | j        r.| j        j        r"t                              d| j                   dS dS )Ncudar   	   z}%s Requires triton>=3.4.0, a CUDA device with cc>=9.0 and `use_tensor_descriptor` and `assume_aligned_inputs` options enabledFz/%s stores with `no_x_dim` cannot load 16 bytes.T)rE   r&  get_current_device_or_throwr  r|   r  get_device_capabilityr   r   use_tensor_descriptorassume_aligned_inputsr   r-  debugr  r  r+  r,  r   s    r   can_use_tmaz#TMACompatibilityChecker.can_use_tma  s     G//116&@@
002215::3 ;, ; *++ ; II[ (   5 > 	dk2 	IIA(   5tr   block_paramsr   c           
        t           j        j                            |j        d         t          j        d                    s"t                              d| j	                   dS | j
        j        }|j        dd         D ]}t           j        j                            t          ||z  dt          j        d                    t          j        d                    s#t                              d| j	                    dS |j        d         }d}d}|j        D ]'}t          j        D ]}t#          ||          r|}|} n(|r|sJ | d	t          j                     | j        j        r| j        st*          |         }	d}
| j        j        D ]}|j        r|j        |	k    r	|j        }
 n|
J | j                            |
          }|                    ||i          |z  }t           j        j                            |t          j        d                    s"t                              d
| j	                   dS n	 t;          t=          t          j        ||z  dz
  |d                              }t           j                             |          }| j        j!        rL|| j        j!        |         k    r5t                              d| j	        || j        j!        |         |           dS n;tE          || j        j#        $                    |d                    | j        j#        |<   n1# tJ          $ r$ t                              d| j	                   Y dS w xY wdS )zB
        Check if the block parameters are valid for TMA.
        r  rG   z-%s TMA API requires innermost stride to be 1.FN   r   z8%s TMA API requires outer strides to be 16 byte aligned.z, expr must contain a single block type from zC%s persistent reduction innermost block shape cannot load 16 bytes.zT%s For block %s, fixed config block size %d is smaller than the minimum required: %dz.%s innermost block shape cannot load 16 bytes.T)&rE   r&  r  r  r   r   rJ  r-  r  r  r$  r/  r   r   rS  r   r   r   r+  persistent_reductionr  r   r   r4  r-  numel_get_persistent_RBLOCKsubsstatically_known_geqr/   rs  nsolverq  fixed_configry  tma_min_block_sizesr  r  )r   r  element_sizer  innermost_block_shapeinnermost_block_typeinnermost_block_symtblock_type_str
block_symtinnermost_tree_prefix
tree_numelrP  persistent_rblockinnermost_block_bytesmin_block_sizes                  r   are_block_parameters_compatiblez7TMACompatibilityChecker.are_block_parameters_compatible  s    w77 $emA&6&6
 
 	 II?(   5z*"*3B3/ 		 		F7#;; 5q%-:K:KLLa     		N,   uu !- 8 <##3@ 	 	N+7  
!.*== +9(+5(E $ 	
(< 	
 	
$mmR_Rkmm	
 	
<
 ;+ E	DN E	 %//C$D!J[,  > x#888%&W
))) $ B B: N N%**,@BS+TUU " 7#88%u}R'8'8   		Y,   u%!01L@2E0  " " "#!6!67K!L!L;+ %(@(PPP		< 4* K4^D*    %u Q GJ&7;;NANNG GDK3NC
    		D,   uu ts   >B*L' *<L' '*MMNr   )r  r   rw   rv   )r{   r   r   r   r   r  r  r  r   r   r   r  r    s           OOOX X X   Bw w w w w wr   r  c                      e Zd ZU dZeZded<   eZded<   dZ	e
Z	 	 	 	 dd fdZddZddZd Zd Zd Zd ZddZd Zedd            Zdddddd dd%Z	 ddd,Zdd-Zdd2Zd3 Zdd4Z	 ddd9Zd: Zdd=Z	 	 dddGZ ddHZ!ddJZ"ddKZ#ddPZ$ddQZ%ddRZ&dS Z'dT Z(dU Z)dV Z*ddWZ+ddXZ,dd\Z-dd_Z.ddbZ/dc Z0ddeZ1df Z2dg Z3dh Z4e5di             Z6dddjZ7e5dk             Z8e5dl             Z9dm Z:ddoZ;dp Z<dddsZ=ddtZ>ddvZ?ddyZ@dd{ZAdd}ZBdd~ZCddZDddZEddZFddZGddZHeIdd            ZJddZKddZLeIdd            ZMddZNddZOddZP xZQS )r  zdA class to represent a triton kernel and helpers to generate
    triton kernel programmatically
    r  helper_functionszCallable[[sympy.Expr], str]kexprTr   Ntilingdict[str, sympy.Expr]r  Optional[FixedTritonConfig]hint_overrideOptional[int]rw   rx   c                   || _         || _         t                      j        |fi | t	          | j        | j                  | _        t                      | _	        t                      | _
        t          t                               | _        || _        t          j                    | _        t%          t&          t&          f                     | _        t+                      | _        t/          j        t$                    | _        t%          t&          t4          f                     | _        || _        t/          j                    | _        t          t>                               | _         d | _!        | j"        r| #                    | j$                   | j%        r| &                                 | '                                 | j%        r| (                                 d S d S rz   ))optimize_maskr  rA  rB  r  newvar_prefixsuffixrr  rO   post_loop_combinepost_loop_storer   r   outside_loop_varsrz  r`  countblock_ptr_iddictr   block_ptr_to_bufferr  r  r  r  pointer_advancementsrs  r  r  Counter_load_countsr*   autotune_hintstriton_metar0  codegen_reduction_numelsr  cooperative_reductioninit_cooperative_reductioncodegen_range_treeinit_cooperative_reduction_mask)r   r	  rz  r  r  r  rF  rC  s          r   rB  zTritonKernel.__init__]  s    $1(**6***T/==1?1A1A/=/?/?!+C!2!2#6 %O--#'S>#3#3  / 1 1#D)) 	! $(S>#3#3 *6A6I6K6K )68859  	5))$)444% 	.++---!!!% 	30022222	3 	3r   r$  r%  r   c                     t          |          S rz   )r?   )r   r$  s     r   dtype_to_strzTritonKernel.dtype_to_str  s    5!!!r   rv   c                X    | j         o#t          j                            | j                  S rz   )r0  rE   choices should_use_cooperative_reductionr3  r   s    r   r'  z-TritonKernel.should_use_cooperative_reduction  s+    $ 
)S)SM*
 *
 	
r   c                     j         sJ  j        D ]}|j        |xj        dz  c_         j        d         } j        rt          | j        d                   } j                            |           _        t           j                   _
         j                            d           t           fd j        D                       r j                            d           dS dS )z/One time setup code for cooperative reductions.NrG   r%  r   a              RSPLIT_NEXT_POWER_OF_2: tl.constexpr = triton_helpers.constexpr_next_power_of_2(RSPLIT)
            RSPLIT_IS_POWER_OF_2: tl.constexpr = RSPLIT == RSPLIT_NEXT_POWER_OF_2
            HAS_RSPLIT: tl.constexpr = RSPLIT > 1
            rsplit_id = tl.program_id(0)
            num_rblocks = (rnumel + RBLOCK - 1) // RBLOCK
            rsplit_chunk = (num_rblocks + RSPLIT - 1) // RSPLIT * RBLOCK
            rsplit_start = rsplit_chunk * rsplit_id
            rsplit_end = rsplit_chunk * (rsplit_id + 1)
            c              3  R   K   | ]!}|j         	                    |           V  "d S rz   )r4  _has_constant_mask)r   r   r   s     r   r   z:TritonKernel.init_cooperative_reduction.<locals>.<genexpr>  sQ       
 
 
''---
 
 
 
 
 
r   z>rsplit_end = tl.where(rsplit_end < rnumel, rsplit_end, rnumel))r  r   grid_dimr2  r  r   r  
semaphoressemaphores_namer  %cooperative_reduction_workspace_cacher  r   r   r   )r   r   	sem_counts   `  r   r   z'TritonKernel.init_cooperative_reduction  s.   )))) $ 	# 	#D}("K$	 	H	4+<X+FGGI#y33I>>5WI6
 6
2 				
 	
 	
  
 
 
 
(
 
 
 
 
 	
 IP    	 	r   c                   d}| j         s| d}| j                            d|            |                                 r| j                            d           d S | j         rJ | j                            d           d S )Nz$tl.arange(0, RSPLIT_NEXT_POWER_OF_2)z	[None, :]zrsplit_arange = z                if RSPLIT_IS_POWER_OF_2:
                    rsplit_mask: tl.constexpr = None
                else:
                    rsplit_mask = rsplit_arange < RSPLIT
                zSrsplit_mask = xmask if RSPLIT_IS_POWER_OF_2 else ((rsplit_arange < RSPLIT) & xmask))r,  r  r   _has_constant_xmaskr   )r   rsplit_aranges     r   r"  z,TritonKernel.init_cooperative_reduction_mask  s    >} 	8,777M	>}>>???##%% 	I     }$$$Ie    r   c                   | j         D ]c}|j        s|                     || j                   %| j        r7| j                            |j         d|                     |                      d| j        rt          d | j         D                       r_| 	                    ddd          }| 
                    |          }| j                            d|                     |                      d S |                     | j                   d S d S )Nzbase = c              3  $   K   | ]}|j         V  d S rz   is_loopr$  s     r   r   z2TritonKernel.codegen_range_tree.<locals>.<genexpr>  s$      ==D4<======r   baseTr   zrbase = )r   r6  iteration_ranges_codegen_headerr  r0  r   r-  iteration_ranges_ranges_coder   _get_reduction_symbols_flatten_reduction_indicesr   rq  codegen_reduction_indices)r   r   rn_basesrbases       r   r!  zTritonKernel.codegen_range_tree  sB   $ 		 		D< 44T49EEEE&  	##{TT4+L+LT+R+RTT     
	:==D,<===== 	:66Dd 7   77AA	  !FD,=,=e,D,D!F!FGGGGG ..ty99999
	: 
	:r   c                    dS )z
        Indicate whether we need provide numel as arguments for the generated
        kernel calls in the benchmark.

        Should be true for pointwise/reduction kernels but false for triton
        matmul kernels.
        Tr   r   s    r   need_numel_argszTritonKernel.need_numel_args  s	     tr   c                d    | j         o)t          j                            | j        | j                  S rz   )r0  rE   r&  should_use_persistent_reductionr3  r  r   s    r   rB  z,TritonKernel.should_use_persistent_reduction  s0    $ 
)R)RM45*
 *
 	
r   c                    | j         o7t          | j                  | j        dz   k    o| j        o| j        d         dk    S )NrG   r   )r  r1  r2  r/  r  r   s    r   want_no_x_dimzTritonKernel.want_no_x_dim  sN    % 1DK  D$;a$??1!1 !(+q0		
r   c                    dS )Nztl.device_assertr   r   s    r   assert_functionzTritonKernel.assert_function  s    !!r   F)
copy_shapedense_indexingoverride_maskri  rj  r   r   rj  !Optional[TMACompatibilityChecker]c          
     	                                    j        }d}t                      t          |t	          j        d                    D ]8t          t          j                  sJ |pt          t          j                  }|r>t          t          j                  r2 j        j        j                 }	                    |	j                   t          t          j        t          j        t          j        t          j        t          j        t          j        f          r݈fdt          j        D             }
t5          |
          dk    sJ dj                                         |
d          d           :t8          j        j        p
|p j        d	uodk    }d
}d}t                      }                                  D ]>}|!                    |j"                  rd
}nd}|                    |j#         d           ?|r j$        rt8          j        j%        sr~&                                rj|sh j        sat5          |z
            dk    rK '                              s6|r4 j(        dk    r)d"dd" fdd#fdd$ fd} |            }||S d	}d	} )                              }t          t          j*                  r|r| dn +                                }|rd	n tY           -                                          }d| d| d} j.        r% /                                st          dg          nt                       j        r                     j                   ta          ||||          S |rN|sL|r| dn +                                }|rd	n tY           -                                          }d| d| d}|n|s|rd| d| d }||,|s|r&|rd	n tY           -                                          }nd!}|rt          |g           j        r                     j                    1                               ta          ||||          S )%zO
        Compute the index and mask to pass to tl.load() or tl.store()
        Frz  r!  c                J    g | ]}t          |          t          |          S r   )r   r   )r   r   r7  s     r   r	  z)TritonKernel.indexing.<locals>.<listcomp>'  s>     " " "%c400"t$" " "r   rG   zAmbiguous type: r   r   NTtl.int32r   r   
range_treer[   rw   Optional[BlockParameters]c                    t          j        | |                                          }|dS t          |j        gt
                              |          g|gt
                              |          g          S )z
                Matches expressions of the form:
                    idx = s * xindex

                This implies stride (s,), and shape (XBLOCK,).
                Nr   r   r   r   )rH   match_affine_block_exprsymbolr   r  r   r   r   )r   rO  r  s      r   match_affine_blockz1TritonKernel.indexing.<locals>.match_affine_blockQ  s     -D:,,..  >4&%+,!.!=!=j!I!I J#H*;;JGGH	   r   c                ~                                    t          j        dt          j        t          j        g                    \  }}t          dt          j                  | 	                    t          |                    | 	                    t          ||                    z             }t          j        | j        |          }|dS |\  }}}t          j        |          }	t           j        j                            j                  t+          fd|	D                       rdS t,                                        t1          |	d                   gfdt3          |	d	d         |d	d                   D             z   }
fd
|D             }t5          ||
||          S )a  
                Matches higher-dimensional blocks coming from FloorDiv and ModularIndexing.

                Example expression to match:
                   sN * ((rindex//(d1 * ... * d(N-1))))
                       + s1 * ModularIndexing(rindex, 1, d1)
                       + ...
                       + s(N-1) * ModularIndexing(rindex, d1 * ... * d(N-2), d(N-1))

                This iterates over a block of shape (dN, ..., d1) and stride
                (sN, ..., s1). (d1,...,d(N-1)) and (s1,...,sN) are
                wildcards that we match.

                Note that dN does not appear in the expression, but we solve for it
                using range tree numels and the other dims.
                zdenom modulo)exclude)r   r   Nc              3  r   K   | ]1}                     |           o                    |           V  2d S rz   )rT  statically_known_power_of_2)r   r  	max_blockr  s     r   r   zETritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<genexpr>  sf          !==eYOOO H$@@GGG     r   r   c                \    g | ](\  }}t          j        t          |          |          )S r   )r   Minr   )r   r  r  linear_block_sizes      r   r	  zFTritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<listcomp>  sD       "s Ig&7??EE  r   rG   c           	     d    g | ],}t          |t                                        i          -S r   )r>   r   r   )r   r  rx  rO  s     r   r	  zFTritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<listcomp>  sM     3 3 3  y-*H*H*T*TU 3 3 3r   rR  )rT  r   symbolsr^  r_  Wildry  r1  range_tree_nodesr  r   r   rH   match_mod_div_block_exprr  get_slice_numelsrE   r&  r  rZ  r-  r   r   r   r   r  r   )r   rO  denommodulonum_dimsmatch_resultr  r   block_index_exprsslice_numelsr   r   rx  r]  rZ  r  r   s    `          @@@@r   match_mod_div_blockz2TritonKernel.indexing.<locals>.match_mod_div_blockg  s,   ( '--//	 !&"!)%*ykJJJ! ! !v -..HY$>$>??++oi&O&OPPQ	   3K9j&6     '4 !	%2CDII 7+ NN:+<==	      ".      
  4 %2$@$@$L$L!-|A??1   &),qrr*:DH&E&E  13 3 3 3 3 !2	3 3 3 ' +#)	   r   r  c                :    fD ]} || |          }||c S dS )ze
                Match a block indexing subexpression involving a single range tree.
                Nr   )r  rO  
match_funcmatchrU  rj  s       r   match_block_subexprz2TritonKernel.indexing.<locals>.match_block_subexpr  sJ     ''# % %J 'JtZ88E($ ) tr    Optional[BlockDescriptorOptions]c                   
 t          d j                                        D                       
                                } 
fd| D             }t	          d | D                       }t                      }t          | |          D ]I\  }}t          |                    |j	                            dk    r d S  ||          }| d S ||z  }J
t          |          z
  }                               t          j        j        rt          nt           }|                    ||| j                  }	|t           k    r1t'          t(                                        |	j                  sd S |	S )Nc                $    i | ]\  }}||j         S r   r  )r   vrP  s      r   r   zCTritonKernel.indexing.<locals>.match_block_expr.<locals>.<dictcomp>  s     PPP$!QAqvPPPr   c                ^    g | ])}t          j        |                                          *S r   )rH   get_subexpr_involving_symbolrT  )r   r   index_relative_to_xyr_indexs     r   r	  zCTritonKernel.indexing.<locals>.match_block_expr.<locals>.<listcomp>  sE     " " "  (D3T[[]] " " "r   c              3  >   K   | ]}|                                 V  d S rz   )rT  r$  s     r   r   zBTritonKernel.indexing.<locals>.match_block_expr.<locals>.<genexpr>  s*      *Q*QT4;;==*Q*Q*Q*Q*Q*Qr   rG   )r   r   r   r   r   )r>   ra  r*  active_range_treesr   r   r  r1  intersectionrS  sumfilter_masksr   r   use_block_ptrr  ry  r>  rZ  r	   r  r  r   )r   index_subexprsrange_symbolsr  r   subexprr   r   options_classoptionsrv  r   r   rn  r   rj  s             @r   match_block_exprz/TritonKernel.indexing.<locals>.match_block_expr  s   .8PP$2G2M2M2O2OPPP/ /+ #5577
" " " " !,	" " " !+*Q*Q[*Q*Q*Q Q Q.00%(n%E%E 
+ 
+MD' =55g6JKKLLqPP#tt 10$??F~#tt F*LL 5s>7J7JJ !!),,, }21OO0 
 (..'$* +'"&. /   !$;;;04/1J1 1- 5TT  $  $tr   z.shaper  rn  z, tl.int32)xmask)r   rm  ro  .shape)r   )r   r   rO  r[   rw   rP  )r  r   rO  r[   rw   rP  )rw   ro  )2prepare_indexingrS  r   r   operator
attrgetterr  r   r   r   r   r   r   r   rr  rv  rz  rE  r   UNBACKED_INTSIZEPRECOMPUTED_SIZEINDEXFLOATUNBACKED_FLOATr   r1  r  r   r   rH  r  rx  ry  var_listr-  allow_block_ptrr|  r  is_indirect_indexingr  rq  rJ  dense_size_strr  dense_size_listr  r1  r   r{  )r   r   rG  rH  rI  ri  rj  
index_varsr   cse_varprefix_matches
need_dense
have_densehave_loop_varsdense_mask_varsr   r  r  r   r   r   r   rU  rn  rj  r7  s   ``    `              @@@@@r   rm  zTritonKernel.indexing  s    %%e,,'

%/\\	*(*=f*E*EFFF 	: 	:Cc5<00000# ~]2( (J  :TX.. :(.sx8  !23333%I)JJ'
 
 : " " " " - 9" " "
 >**a///1NCH1N1N///!28889999 M( ++d* qj	 	 
+5<<++-- 	6 	6D&&t}55 #!%"
4; 4 4 45555 Z	#3Z	8>8SZ	 .	Z	
 2==??Z	 "Z	 OZ	 I/00A55--e44 6 6  J..   ,` ` ` ` ` `D       < < < < < < < < < <~ '&((G"
'+%%e,,	eU]++ 	2<WJ....$BUBUBWBWJ#-P4459M9M9O9O3P3PLG:GGGGGI  ))A)A)C)C )&y11		&LL	 /do...")     	(j 	(2<WJ....$BUBUBWBWJ#-P4459M9M9O9O3P3PLE9EE
EEEI'II 	(J 	(K9KK
KKKI'I "Z "'1TttuT=Q=Q=S=S7T7T! 	4"M?33I? 	+MM$/***)$$$%
 
 
 	
r   r   rz  r7  rm  /Union[BlockPtrOptions, TensorDescriptorOptions]tuple[str, str]c                .   |                                 }t          |t                    r|r|r
|dk    sJ d}n|sd}n|r|dk    sJ d|d}nd|}| j        r"| j        d         j        r|                                rt          | j                  }t          |t                    rd| }nd| }| j
                            t          || d|                    |d	
                                t          |t                    ro|| j        |<   t          j        D ]X}|                    |          }	t%          d |	D                       r1| j        |         }
||
vsJ d| d| d            |	|
|<   Yn|                    |          }||fS )N, other=0.0r   , boundary_check=z, padding_option='zero'r  ri  tma_descriptor = F)rD  c              3     K   | ];}t           j        j                            |t	          j        d                     V  <dS r   N)rE   r&  r  r  r   rJ  )r   r   s     r   r   z1TritonKernel.codegen_block_ptr.<locals>.<genexpr>	  s\         # (@@"EM!$4$4      r   z#duplicate advancement for pointer 'z' at type 'r  )rZ  r  ry  r0  r   r6  r   nextr  r  r  r   rN   r  r  r   r   r  r'  r  )r   rz  r7  rm  r  checkblock_descriptor_idblock_descriptorr   advance_offsetsadvancementss              r   codegen_block_ptrzTritonKernel.codegen_block_ptrR	  sS    ''))h 788 	6   ---- 6 6----LELLL5E55 !(	4 $,(	4 ##%%(	4
 #'t'8"9"9(O44 J#D/B#D#D  #I4G#I#I I-WW(//#u/2U2UWW    (O44 E >B()9: *9 E ED&.&>&>t&D&DO    '6	     ! !#'#<T#BL+<???b>Nbb[_bbb @?? 6EL!122's33&&r   c                   d| d|j          d}t          t          |j         |j                            D ]7\  }\  }}t          j        j                            ||          r
d|j        |<   8|	                    ||j         |j
        d          }| dt          t          j                            |                     d}t          |t                    rd| d| | dS | dt          j                            |j                   d| dS )Nrm  rn  ro  Fr_  	tl.store(z.store()r   	enumerater  r   rE   r&  r  r  r   rt  r   r+  	get_dtyper  r  r+  rq  r   )	r   rz  rm  ri  rc  r  rU  r  broadcast_dims	            r   codegen_block_ptr_store_linez)TritonKernel.codegen_block_ptr_store_line	  s<    D5CCH,@CCC *3$h&>??*
 *
 	8 	8%C%#} w77]KK 827*3/668')=u
 

 KK/0A0A$0G0GHHKKKh00 	<;y;;E;5;;;;WWAH$9$9(:J$K$KWWuWWWWr   r  r  lowerr   c                   |s|sd S t          |t          j                  sJ |                     |dd           }t          |t                    sJ |j        }|                                r|j        nd }|r"t          | 	                    |                    nd }| 
                    ||rdnd ||          }	|                     |          }
| j                            |
|	dt          j                   d S )NFrh  0)
assignmentr$  )r  r   Exprrm  r   r   r   r   texprrename_indexingindirect_assertget_load_bufferrr  rs  r|   r0  )r   r  r  r  r   rm  r   r   size_strlinebuffers              r   check_boundszTritonKernel.check_bounds	  s     	 	F$
+++++==RV=WW(O44444&	(0(9(9(;(;E8$$8=G5--d334444 ##e-ssx
 
 %%h//&$5LLLLLr   c                    |                                 s|                                r| j        S | j        r-| j        d         j        r|                                s| j        S | j        S )Nr  )	r   r   rt  r0  r   r6  r   r  loads)r   rm  s     r   r  zTritonKernel.get_load_buffer	  sw      "" 	h&:&:&<&< 	<!		 $,		 ''))		 9:r   c           
      
   | j                                       }| j        xx         dz  cc<   t          }|                     |          |}t
          j                                      }|                     |d| 	                    | |d                    }|
                                |                                }t          d |                     |                                          D                       }	|                     |          rd}
nO|	sd}
nJ| j        rA| j        d         j        r/fd	}         d
}
t'          j        t*          d|          }nd}
|sr6|                                r"| j        rdt1          | j                   }nd}nd}	 d}t2          j        j        r%| j                                        }|         dk    }	 |                     |           o| j         o| o|	}d}|rd}d}d}t=                    r,|}|t>          j         t>          j!        fv rt>          j"        }d}nktG          |tH          tJ          f          r| &                    |||          \  }}tG          |tH                    rd| | |
 | d}n*| dt
          j'        (                    |j)                   d}|*                    ||j+        |j,        d          }|j,        }nPtG          |tZ          j.                  rd| d| d}|j/        }d}n#d| d|j0         d|j1         |
 | | d
}|j2        }|t>          j         t>          j!        fv r"t2          j        j3        r|dz  }t>          j"        }|t>          j4        k    r"t>          j5        j6        |dz  }t>          j4        }| 7                    |          }| j8        9                    | ||          ||          }|j:        dk    rxx         dz  cc<   tG          |tv                    sJ |j<        |_<        |rd| d| d}| j8        9                    ||||j2                  }|j<        rp|j=        rd}n|t>          j4        k    rd}nd}| j        rt1          | j                  n|}d |j1         d| d| d}| j8        9                    ||||j>                  }| j        r|?                                ss| j@        A                    |           |S )!zc
        Load from the memory location 'name', offset by some indexing expression 'index'.
        rG   TFr  rh  c              3  "   K   | ]
}|d k    V  dS rj  r   r  s     r   r   z$TritonKernel.load.<locals>.<genexpr>	  s7       
 
AF
 
 
 
 
 
r   z, eviction_policy='evict_last'r  c                 ,              k    rsrdS dS )N
evict_lastevict_firstr   )expected_countr   indirect_indexingload_countsrz  s   r   decide_laterz'TritonKernel.load.<locals>.decide_later	  s/    t$~55 6"3 6 (<$}r   z, eviction_policy='<EP>'z<EP>r   z, other=r  z, cache_modifier='.cg'Nr   r  ro  z.load(r|  r  r9  rV  rz  rl  rm  rn  z0.0Truer  r  )Br  r  r  r   r  rE   r&  r  rm  tma_compatibility_checker_clsr   r   r   get_strides_of_loadrb  is_broadcastedr0  r   r6  r^  r_  r5   r   _load_otherrX   r   r   skip_l1_cacher3  buffer_read_countsra   r|   rR  rS  r}   r  r  ry  r  r+  rq  r   rt  r   r   r   rJ  r   r   r   r   rQ  rv   r  r  r  rr  rs  	use_countr<  r   r5  r   r   r  r  )r   rz  r   r7  	make_lineoriginal_indexr$  rm  r   is_coalescedepr  r  has_read_depsr  r  cachemodappend_broadcastr   r  r  load_buffer
result_varzero	other_valr  r   r  r  s    `                       @@@@r   r}  zTritonKernel.load	  sW    iood##'DQCK	 55e<<!!$''==&*&H&Heu 'I ' ' ! 
 
 ((**
**,,  
 
 44^DDKKMM
 
 
 
 
 ~.. 	1BB 	1BB" 	t'7';'C 	% % % % % % % % % ).N+B!)*:FLQQIIB 	: 	8+<+<+>+> 	 &D=1A#B#BDD%E	 =& 	9!%!A!A!C!C.t4q8M	 ##N333 ))!! 	 	  	0/H $#D)) (	#D 777EE (_6M$NOO .*.*@*@#x+ +' % h88 aN&6NNrN8NNNDD.``ah6K6KHL\6]6]```D==(.0Dd  !,NEM:: .=#==>===#+#6 k#kk8+=kk(BSkUWkY^k`hkkk - %-888M8 9 ))
""u}'8'@ &
**844X&&4U ' 
 

 !##"*&788888'1
 	GjGG4DGGGD**Th6K +  J ! *  DDej((!DDD7;7GQM$"2333T  S8#4RR
RRiRRR!X..U*:J /  
 $ 	3X-?-?-A-A 	3* 	3"&&z222r   rc  rM   moderD   c           	        | j                             |          }|}t          j                            |          }d }||                     | |d          }|                     |d|d u |          }	|| j         j        v }
|                     |          }|
r*|r(| j	        
                    t          |d                     t          |	t          t          f          r4|                     |||	          \  }}|                     ||	|||          }nN|d| d|	j         d| d|	j         d		}n2|d
k    rd| d|	j         d| d|	j         d	}nt'          d|           t)          j                    }| j        s5| j        r.|                    |                     || j	                             | j	        
                    t          ||                     | j        s| j                            |           |                                 d S )NTr  )rH  ri  rj  ztl.debug_barrier()r  r|  r9  rn  ro  
atomic_addztl.atomic_add(z, sem='relaxed')zstore mode=)r  r{  rE   r&  r  r  rm  inplace_buffersr  storesr   rN   r  r  ry  r  r  r   r   r  
contextlib	ExitStackr0  r  enter_contextguard_cooperative_storer  r  close)r   rz  r   rc  r  r7  r  r$  rj  rm  
is_inplacer  r  r  r  
exit_stacks                   r   storezTritonKernel.storew
  sg    it$$!!$''$(!<(,(J(Jet )K ) )% ==dl&?	 ! 
 
 TY66
,,^<< 	L. 	LK!!,t5I"J"JKKKh2I JKK 	<&*&<&<T3&Q&Q#e44h 0% DD \\s\\(:\\u\\HY\\\DD\!!pCppX-?ppEppXM^pppDD%&:D&:&:;;;)++
$ 	V)C 	V$$T%A%A$%T%TUUUl466777$ 	."&&u---r   c                    | j                                         }|                    t          |d| d                     |                                S )z
        For cooperative reductions only one thread block should write out the result.
        We rotate which thread block does each write for better parallelism
        zif rsplit_id == (z % RSPLIT):)r.  r  r   rN   indent)r   rz  r  rU  s       r   r  z$TritonKernel.guard_cooperative_store
  sP    
 8NNPPd,P,P,P,PQQRRR}}r   	variablesOptional[CSEVariable]c                b    d }|D ])}|t          |d          r||j        }||j        z  }*|S )Nr   )r   r   )r   r  maskselems       r   _combine_maskszTritonKernel._combine_masks
  sP     	3 	3D|t[)) 3= NEE!DN2Er   rb  
boundaries.tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]boundary_indicesindexing_dtyperB  sorter Optional[tuple[str, sympy.Expr]]sorter_indicesc                   | j                             t          j                   | j                            |d                   }|                     |d                   }	|                     |d                   }
|                     |d                   }|r | j                            |d                   nd}|r|                     |d                   nd}|t          j        k    rd}n"|t          j	        k    rd}nt          d          | j                            | j        d	| d
| d
|	 d
|
 d
| d
| d
| d
| d
| d
| d
| d||j                  }|                     |||          }||_        |S )z3
        See [Note: Inductor bucketize op]
        r   rG   r   r   rx   rN  ztl.int64z5Bucketize only supports indexing with int32 and int64z'triton_helpers.bucketize_binary_search(rn  z, )rl  )r  r  r*   ONE_ELEMENT_PER_THREADr  r  rq  r|   r0  ro  r  rr  rs  rt  r   r  r   )r   rb  r  r  r  rB  r  r  boundaries_ptrboundary_sizeboundaries_underlying_numelboundary_stride
sorter_ptrsorter_stridetriton_dtyper=  r  s                    r   	bucketizezTritonKernel.bucketize
  s   $ 	 CDDDA77))*Q-88&*&7&7
1&F&F#++JqM::39ETY__VAY///v
8>J))&)444FU[((%LLu{**%LL%G   ""Lf   !. 2M Q`     	 
  
 +     !, # 
 
 ##F,<nMM r   c                    |                                  }|dk    rd| dS | j        }dg||z
  z  dg|z  z   }| dd                    |           dS 	NrG   z!triton_helpers.promote_to_tensor(ro  r  rx   r  rn  r  re  r/  r   )r   rc  ndimsnreducesizess        r   reduction_resizezTritonKernel.reduction_resize
  st    ''))A::?u????))VHw,>>--$))E**----r   tuple[str, BlockShapeType]c                    |                                  }|dk    rd| d|fS | j        }dg||z
  z  dg|z  z   }|g |d ||z
           dg|z  R nd }| dd                    |           d|fS r  r  )r   rc  r   r  r  r  r  s          r   reduction_resize_and_shapez'TritonKernel.reduction_resize_and_shape  s    ''))A::?u???FF))VHw,>>=B=N9e'uw'(9A3=999TX 	 --$))E**---y88r   c                &   | j         dk    r|S |                                 | j         z
  }|                                 }|d|         dgz   }| j                            |t          t          |          ||          |t          |                    S )zC
        Reshape to RBLOCK, collapsing all reduction dims.
        rG   NRBLOCKrl  )r/  re  r  rr  rs  rp  r   r  )r   r  rc  r$  target_ndimrd  target_shapes          r   reduction_collapse_dimsz$TritonKernel.reduction_collapse_dims  s     "a''L--//$2II,,..$\k\2hZ?x  3u::}lCC%%	 ! 
 
 	
r   ro  reduction_typerC   +Union[CSEVariable, tuple[CSEVariable, ...]]c                L   1234567 dBd}d t          j        |          D             }t          j        ||          }t          d |D                       r>t	          j        |t          j                  }t	          j        t          j                   j        sJ t          d  j	        D                       } 
                    |           t          |          } j        r|                     j                    j	        d         j        d	         }                                 3                     3 fd
|          }                                  j        z
  4dC4 fd5dD5fd}	46 fd}
||f}| j        j        v r j        j        |         S t+          |          }t-          |          }t/                                                     }d|4<    j                            |t5          |                    }t          d |D                       |_        d                    |          22fd7 j        r7t<          j                             |          }                     tB          |          }dE 7fd1dk    rn>tE          |t4                    r1fdtG          ||          D             }n 1||          }dv rtE          |tH                    sJ tJ          j&        '                                }tQ           j        )                     j*        d| d| d||j+                            }dd d         6 |
 j*        |||           ||_,        n܉d!k    r; j-        r .                    ||7|          }n /                    |          }nd"k    rXtE          |t`                    sJ |\  }}}t5           fd# 1                     j*        |||4          D                       }n=dk    r 2                    |          }ntE          |tH                    sJ  5 j*        ||j,                  \  }}} j        )                     j*        |||          }nȉ j        3                    d$| |t5                                                               }t<          j        4                    |          }                     tB          |          }tE          |t4                    s8 j5        6                    | d%                                  d&| d&| d'           dv rd$| d(} j7        8                                } j5        6                    | d%                                  d&t	          j9        |          j:         d& ;                    |           d'           dd d         6 j*        <                    d)| d*| d+6 d,| d&| d&| d&| d-| d. 7| d/|           d0| d. 7| d/|           d0            |
 j=        |||           nt}                    r .                    ||7|          }nۉdk    r2d$| d1}d$| d2} j5        6                    | d%                                  d3| d'            j5        6                    | d4                                  d&| d'            j*        <                    d5| d*| d6| d&| d&| d&t~          j@         d7            j*        <                    d5| d. 7| d/|           d5| d. 7| d/|           d5	           |} j                            |j+                  } A                     j=        ||||4          }nt=          jB        |          }  | ||          }! j*        6                    | d. 7|!|                      |t          jC        k    r5 j        )                     j=        | d8t          jD        |j+                  } |	 j=        ||d             j-        rt<          j        4                    |          }t          jF                    }" j=         jG        fD ]>}#|#6                    d9           |"H                    |#I                                           ?dv r j=        6                    | d: J                    | d;                       K                    | d<||          }$ j7        8                                } K                    ||t	          j9        |          j:                  }% |
 jG        ||$|%           nt}                    rd!k    sJ |\  }&}'}( K                    |&t-          |          |d	                   }) K                    |'t-          |          |d=                   }* K                    |(t-          |          |d>                   }+ L                     jG        |&|'|(|)|*|+4	  	         nΉdk    r|\  }}tE          |t`                    sJ  K                    |t-          |          |d	                   }, K                    |t-          |          |d=                   }- A                     jG        |||,|-4           n7 K                    |t-          |          |          }. |	 jG        ||.d            |"M                                 | j        j        |<   tE          |t4                    rt          d? |D                       sJ  jO        P                    |           d@v r't          |          d=k    sJ t          |          |z  }t          |          t          |          k    sJ tG          ||          D ]D\  }/}0|0J |/j,        |0k    r0 j=        6                    |/ d.|/ dAt          |0           d'           EntE          |t                    sJ  jO        T                    |           |j,        |d	         k    r@|d	         J  j=        6                    | d.| dAt          |d	                    d'           |S )FNrc  rM   rw   c                    | j         t          j        t          j        fv rt	          j        | t          j                  n| S rz   )r$  r|   rR  rS  r  r~  r}   r~  s    r   maybe_upcastz,TritonKernel.reduction.<locals>.maybe_upcast)  sB     ;MN  UEM222 r   c                    g | ]	}|j         
S r   r'  )r   r"  s     r   r	  z*TritonKernel.reduction.<locals>.<listcomp>7  s    JJJ39JJJr   c              3  J   K   | ]}|t           j        t           j        fv V  d S rz   )r|   rR  rS  r  s     r   r   z)TritonKernel.reduction.<locals>.<genexpr>9  s0      MMqU]EN33MMMMMMr   c              3  *   K   | ]}|j          d V  dS r   Nr-  r$  s     r   r   z)TritonKernel.reduction.<locals>.<genexpr>?  -      MMDdk///MMMMMMr   r  r   c           	         j                             j        d|  d d| j        t	                                                              S )Nrm  rn  ro  rl  )rr  rs  rt  r$  r  r  )rs  r  r   s    r   <lambda>z(TritonKernel.reduction.<locals>.<lambda>N  sU    dh''9199999gD002233	 (   r   result_typerp  1tuple[str, Optional[torch.dtype], BlockShapeType]c           
     L   	dv }|rdnd}
                     | |          }	dv r+
                    | d	 d| d d|j                  \  }}n*
                    | d	 d	| d d|j                  \  }}|| d
                    |           d}n|j        }|||fS )zK
            Helper to generate a reduction call, e.g. tl.sum.
            )r   ry  minprodtriton_helperstl)ry  r!  r  z2(rn  ro  r  Nr_  )r  r  r   r$  r$  )r  rc  r  
use_helpermoduler=  r   r  r$  r  r   s          r   final_reductionz/TritonKernel.reduction.<locals>.final_reductionZ  s
    (+HHJ)3=%%F00FFE// $ ? ?AAAA%AA3AAA5;! ! !% ? ?@@@@@@#@@@%+! ! &"II(9(9+(F(FIII#k;--r   r  rx   c                ^     | ||          \  }}}|                      | d|            dS )zU
            Generate a reduction and assign it to an existing variable.
            r  N)r   )r  r  rc  r  r  r'  s        r   final_reduction_definez6TritonKernel.reduction.<locals>.final_reduction_definev  sD     */&%EEKE1aMMZ33E3344444r   c                                         | |          }                     | |          }|                     d| d| d d| d| d d| d                    | d           d	           d S )
N                z_val, z_idx = triton_helpers.z_with_index(rn  )
                r  _idx
                )r  r   r  )r  r  rc  r   r  r$  root_opr   s       r   final_argreducez/TritonKernel.reduction.<locals>.final_argreduce  s    00FFE00FFEMM #- EL Z_ ch lo   $ 5 56I6I6I J J      r   r  rl  c              3  D   K   | ]}t          |d                    |V  dS r  )r;   )r   r7  s     r   r   z)TritonKernel.reduction.<locals>.<genexpr>  sH       *
 *
(;CF(C(C*
*
 *
 *
 *
 *
 *
r   r   c                D    s| S t                               | |          S rz   )rO  r  )tvalfvalr  s     r   
where_condz*TritonKernel.reduction.<locals>.where_cond  s(     (..tT4@@@r   c                    j                             j         | |          | j        | j        | j        n|j                  S )Nrl  )rr  rs  rt  r$  r   )rc  defaultr   r5  s     r   _mask_valuez+TritonKernel.reduction.<locals>._mask_value  sM    x((LJug..+).)@%++gm	 )   r   online_softmax_reducec                .    g | ]\  }} ||          S r   r   )r   rs  dr8  s      r   r	  z*TritonKernel.reduction.<locals>.<listcomp>  s)    RRRdaAq 1 1RRRr   )argmaxargminrm  zindex, r  ry  r!  welford_reducewelford_combinec              3  d   K   | ]*\  }}j                             j        ||           V  +dS )rl  N)rr  rs  rt  )r   rc  r   r$  r   s      r   r   z)TritonKernel.reduction.<locals>.<genexpr>  sV       # #$u H%%dlEe%TT# # # # # #r   r   = tl.full(rn  ro  _indexr+  _next, z_next = triton_helpers.z%imum_with_index(
                    z(index
                )
                r  _nextr.  _max_sumz, float('-inf'),  = tl.zeros(z
                    zG_next = triton_helpers.online_softmax_combine(
                        z+
                    )
                    z.to(tl.int8)zif HAS_RSPLIT:z_bval = _val_bvalrG   r   c              3  @   K   | ]}t          |t                    V  d S rz   )r  r<  r  s     r   r   z)TritonKernel.reduction.<locals>.<genexpr>  s-      LLAz!%677LLLLLLr   )r>  r9  r_  )rc  rM   rw   rM   )rc  rM   r  rp  rw   r  )r  rM   rc  rM   r  rp  rw   rx   rw   rM   )Upytreetree_leavestree_mapr   r|   ru  r}   r0  r   r   r{  r   r  r  r-  r  _map_tuple_or_scalarre  r/  rr  reduction_cacher3  r1  r7  r  r  r  r   r   r  r    	Reductiondefault_valuerX   r  r  rM   rE   r+  rn  r   rs  rt  r   r$  r  r>  welford_reduce_fallbackr   _welford prepare_softmax_twopass_fallbacknamedvardefault_accumulatorr  r   r3  select_index_dtypeiinfory  r$  r   r  r9   r   r  %online_softmax_reduce_final_reductionget_reduction_combine_fnrv   r*  r  r  r  r  r  r  *codegen_cooperative_reduction_peer_combinewelford_reduce_final_reductionr  r'  r  rE  r1  r(  r<  r  )8r   r$  ro  r  rc  r  original_dtypesr  reduction_range_prefixr)  r0  r  acc_typetorch_acc_typeresult_shaper  r7  masked_valueaccumulator_dtypeaccumulator_indexmeanm2weight_result_dtype_shapeaccumulatorr  accumulator_maxaccumulator_sum
result_max
result_sum
combine_fnupdatedr  bufpeer_valpeer_idxresult_mean	result_m2result_weight	peer_meanpeer_m2peer_weightpeer_maxpeer_sumpeersr7  
orig_dtyper8  r  r  r  r'  r/  r5  s8   `` `                                             @@@@@@@r   	reductionzTritonKernel.reduction"  s   	 	 	 	 KJ0B50I0IJJJe44MM_MMMMM 	>+Iu}EEI'u}==E$$$$MMD<LMMMMM%   u? 	*LL)))!%!1"!5!<Q!? ,,..))     
 
 %%''$*AA	. 	. 	. 	. 	. 	. 	. 	. 	.8
	5 
	5 
	5 
	5 
	5 
	5	 	 	 	 	 	 	 	 6	0008+I66"9--))44D002233S(// l(;(; * 
 

  * *
 *
 *
 *
 *
  
  

 zz%  	A 	A 	A 	A 	A
 $ p	l00KKG//wGGG       !888 E5)) ;RRRRc%>Q>QRRR*{5'::!555!,<<<<<$%H$K$K$M$M!$'H%%_+A__,___/*0	 &  % %! &+e<<^LL*l<M   $5
  #333- 	L!%!4!4"NE:xQV" "JJ "&!=!=eU!K!KJJ#444!,99999%1"r6" # # # # #(,dBU) )# # #  

  #::: "BB5%PP

!,<<<<<*9/L,0B+ +' "X..L'v /  

 (++ J  $D002233 ,  K
 l66~yQQG//wGGGgu-- 	##"^^t/B/B/D/D^^^^S[^^^   !555$:
$:$:$:!"m>>@@	##( Y YT5H5H5J5J Y Y{;//3Y Y7;7H7H7U7UY Y Y   &+e<<^L## %6 OV   $5 9> BX  	  ",{,A,A,A;!O!O	 
 # 
 (2z5F2M2M2MO`'a'a      *JEV    &n55 H!00z8U 

  #:::"6j"6"6"6"6j"6"6"6 	##&ff43F3F3H3Hff[cfff   	##&XXD4G4G4I4IXXXXXX   ##$ -< ( ,; ?D HNH\     ##$ )34M4M4M)_)_ $ )34M4M4M)_)_     (
!X__5
@P_QQ
!GG*## 

  8SS
$*[%88&&"IIzz';'G'GII   
** #'("3"3.&444#j)/	 #4 # #K '&*JT   % H	l66~yQQG#-//J.0DE 7 7.///((6666!555&00!WW4+@+@JATATAT+U+UWW    JJ!((()W  #m>>@@JJU[-E-E-I    4j(HUUUU%n55 3V%)999998B5Y KK$Y//AJ 	
 II$Y//AJ 
 #MM!$Y//AJ 
 33(!
 
 
 
  #:::)3&
J!'844444JJ 0 ; ;WQZ   JJ 0 ; ;WQZ  ::(    GG 0 ; ;W  '&t';ZPTUUU.8 +j%(( 	LLLLLLLLLL"))*555 !LLL?++q0000"%j//O"Cz??c/&:&:::::#&z?#C#C  Z!---9
***44NN3NN,?
,K,KNNN   j*;<<<<<"&&z222 ?1#555&q)555&00!``j``6I/Z[J\6]6]```   r   c                t                          ||          }                      ||          } fdt          d          D             \  }}|                    d| d| d| d| d| dt          j         d| d                     |            d| d                     |            d           ||fS )Nc                `    g | ]*}t          j                                                 +S )r'  )r   rr  r  )r   r  r$  r   s     r   r	  z7TritonKernel._online_softmax_reduce.<locals>.<listcomp>  s1    !V!V!V#dhooEo&B&B"C"C!V!V!Vr   r   
            rn  9 = triton_helpers.online_softmax_reduce(
                )
            r  )r  r9  r   r   r  r  )r   r  rm  rn  r  r$  ro  rp  s   `    `  r   _online_softmax_reducez#TritonKernel._online_softmax_reduce  s8    66vPUVV66vPUVV!V!V!V!V!VUSTXX!V!V!V
J %   $3 7: >D>R    !11Z/BB  	  !11Z/BB	  	
 	
 	
 :%%r   c           	     ,   	  fd|||fD             \  }}}d| d| d| d d	}fd		 fd|||fD             }                     d                    d |D                        d|            t           fd	|D                       S )
z;
        Helper to codegen triton_helpers.welford.
        c              3  F   K   | ]}                     |          V  d S rz   )r  )r   rc  r  r$  r   s     r   r   z(TritonKernel._welford.<locals>.<genexpr>  sI       
 
 ((>>
 
 
 
 
 
r   ztriton_helpers.welford(rn  ro  c                N    t          | d         | dz   d          z             S )Nr   rG   )r  )r   r  s    r   reduced_shapez,TritonKernel._welford.<locals>.reduced_shape  s+    qucAgii(88999r   c                d    g | ],}j                              |j                             -S rl  rr  r  r   )r   rc  r$  r  r   s     r   r	  z)TritonKernel._welford.<locals>.<listcomp>  sG     
 
 
 HOO%}}U[/I/IOJJ
 
 
r   c                ,    g | ]}t          |          S r   r  )r   r   s     r   r	  z)TritonKernel._welford.<locals>.<listcomp>  s    &G&G&G!s1vv&G&G&Gr   r  c              3  N   K   | ]}                     ||j                  V   d S rz   )r  r   )r   rc  r   s     r   r   z(TritonKernel._welford.<locals>.<genexpr>  sI       
 
 ++E5;??
 
 
 
 
 
r   )r   r   r  )
r   r  rf  rg  rh  r  r$  welfordwelford_resultsr  s
   ``   ``  @r   rT  zTritonKernel._welford  s;   
 
 
 
 
 
F+
 
 
b& KDJJBJJ&JJCJJJ	: 	: 	: 	: 	:
 
 
 
 
 
F+
 
 
 	DII&G&G&G&G&GHHVVWVVWWW 
 
 
 
(
 
 
 
 
 	
r   c                   |                                  | j        z
  }t          | dt          |                                           |t          j                              }t          | dt          |                                           |t          j                              }	t          | dt          |                                           |t          j                              }
| j                            | d| 	                                 d| d           | j                            |	 d| 	                                 d| d           | j                            |
 d| 	                                 d| d           |dk    r=|\  }}}| j
                            d	| d
|	 d
|
 d| d|	 d|
 d| d| d| d           n8|dk    sJ | j
                            d	| d
|	 d
|
 d| d| d|	 d|
 d           | j
                            d| d || d|           d|	 d ||	 d|	           d|
 d ||
 d|
           d           |}|                     | j        |dd||	|
||	  	        S )z%Helper to codegen a welford reduction_meanr   r$  r=  _m2_weightrG  rn  ro  r?  r+  rC  z<_next = triton_helpers.welford_combine(
                    z,
                    z#
                )
                r>  z;_next = triton_helpers.welford_reduce(
                    z1, roffset == 0
                )
                z            r  rD  r  N)re  r/  r<  r  r  r   unknownr  r   r  rt  r   r]  r  )r   r  r  rc  r5  r`  r$  r  rl  accumulator_m2accumulator_weightrf  rg  rh  rv  s                  r   r>  zTritonKernel.welford_reduce  s    %%''$*AA'   ,,..//&((	
 
 
 +,,..//&((	
 
 
 /""",,..//&((	
 
 
 		LL(;(;(=(=LLLLL	
 	
 	
 		OO4+>+>+@+@OOHOOO	
 	
 	
 		!SSt/B/B/D/DSSSSS	
 	
 	
 ...$D"fL %3 <N   $2 6H       $*      "%55555L %3 <N  ) -; ?Q     	 'Z;(=(=(={KK   *
n+C+C+C^ T T     %/J2D/K/K/KM_$`$`  	
 	
 	
 !22"

 

 
	
r   c
           
     D   t          |                     ||||||	                    }
|||g}t          t          ||
                    D ]H\  }\  }\  }}|!| j                            |	|          }|||<   |                    | d|            It          |          S )z0Helper to codegen call to triton_helpers.welfordNrl  r  )r7  rT  r  r  rr  r  r   r  )r   r  rv  rw  rx  rf  rg  rh  r  r$  rb  result_exprsr  result_exprrc  r   s                   r   r]  z+TritonKernel.welford_reduce_final_reduction-  s     dmmFD"fc5IIJJ#Y>09#lF:S:S0T0T 	6 	6,A,^eU""hooEoGG"-QMM[44U445555\"""r   c                2   |                      |||          }|                      |||          }	|                    d| d| d| d|	 d| dt          j         d| d|                     |            d| d|                     |            d           ||fS )Nr  rn  r  r  r  )r  r   r   r  r  )
r   r  ro  rp  r|  r}  r  r$  rm  rn  s
             r   rZ  z2TritonKernel.online_softmax_reduce_final_reductionE  s    66vxOO66vxOO %   $3 7: >D>R    !11Z/BB  	  !11Z/BB	  	
 	
 	
 :%%r   c                8    | j         r| j         d         S t          S )NRSPLIT)r  r-   r   s    r   
max_rsplitzTritonKernel.max_rsplitT  s!     	/$X..  r   c                *   | j         d         }|                                 sdnd}||j        z  |                                 z  }| j                            |          \  }}| j                            d| d| d|                     |           dt          |           d| d	| d
| dd           | 
                    | dddg|t          j                              }	| j                            |	 d| dt          |           d           |	S )a	  
        Generate code to save a [XBLOCK, RSPLIT] temporary workspace, where each thread block writes a different
        column.  After the barrier, every thread block loads the completed value so that it can compute the final
        value independently.
        r%  zxindex < xnumelNr.  z_ws = (r  z).to(tl.pointer_type(z))
                tl.store(z%_ws + (xindex * RSPLIT + rsplit_id), rn  r  Tstrip_peersr   r  r  z = tl.load(z_ws + (xindex * RSPLIT + rsplit_arange), rsplit_mask, eviction_policy='evict_first', other=triton_helpers.if_mask(rsplit_mask, r  )r2  r1  r/  r  r.  r  r  r   rq  r?   create_cse_varr   r  r  r   rX   )
r   r  r$  default_valxnumelr   r  r  r  r~  s
             r   r\  z7TritonKernel.codegen_cooperative_reduction_peer_combineY  s    S!(,(@(@(B(BL  %.(4??+<+<<!GPPQWXX%% $+ 040A0A)0L0L cnotcucu $ KU Y]    	& 	
 	
 	
 ##!!!X&&((	 $ 
 
 	&& D D D Ders~eeD D D	
 	
 	
 r   c                n   | j         sJ d| _         t          j                            |          }|                     |d|                     | |d                    }d| _         | j                            |          }t          j	                    }| j
        r.|                    |                     || j                             t          |t          t           f          rh| j                            t%          ||                     |||                    |          |d|                                                               nVt          |t,                    sJ | j                            t%          |d| d|j         d| d	|j         d
	                     |                                 d S )NFT)r+  r$  r  rh  r  r  r|  r9  rn  ro  )r0  rE   r&  r  rm  r  r  r{  r  r  r  r  r  r  r  r  ry  r   rN   r  r  rZ  r   r   r   r  )r   rz  r   rc  r$  rm  r7  r  s           r   store_reductionzTritonKernel.store_reductionz  s    $$$$ %!!$''==&*&H&H5D 'I ' ' ! 
 
 !%it$$)++
% 	$$,,T43GHH   h2I JKK 	 **55  ,,IH,C,C,E,EII 	 	    h88888 **]]]);]]]]IZ]]]    	r   tuple[CSEVariable, ...]dtypestuple[torch.dtype, ...]c                  
 t                                          d           t                      

fdt          d          D             }d                    d t
          j                            |          D                       }                    d| d           t                      dd	d
l	m
} d	dlm}  |             |             G 
fddt                    }                                5  t          j         |                      5   || }	d                    d |	D                       }	                    d|	            d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   | j                                                                      S )Nz@triton.jitc                    g | ]9t          fd t          t                              D                       :S )c              3  j   K   | ]-\  }\  }}                     d  d| ||j                  V  .dS )rG  r  rl  N)rV  r   )r   nrc  r$  rr  r  s       r   r   z7TritonKernel._lift_helper.<locals>.<listcomp>.<genexpr>  s_        %A~u ]1]]q]]%u{KK     r   )r  r  r  )r   r  rr  r  rb  s    @r   r	  z-TritonKernel._lift_helper.<locals>.<listcomp>  ss     
 
 

 	      )23vv3F3F)G)G    
 
 
r   r   rn  c              3  4   K   | ]}t          |          V  d S rz   r  r  s     r   r   z,TritonKernel._lift_helper.<locals>.<genexpr>  s(      RRc!ffRRRRRRr   zdef {name}():r  r   rf   )ShapePropagationOpsHandlerc                  &    e Zd Zd fd	Zd
S )+TritonKernel._lift_helper.<locals>.CSEProxyrz  r   r  tuple[Any, ...]rF  dict[str, Any]rw   r   c                    	d| z  	 t          |          |i |} t          |          |i |}                     t          
|          |i |||          S )Nr  rl  )rc  rs  )r   rz  r  rF  output_dtypeoutput_shaperr  dtype_handlerhelperhelper_name	overridesshape_handlers         r   _defaultz4TritonKernel._lift_helper.<locals>.CSEProxy._default  s     z4zz) w!     # " #  #
 w!     # " #  #
 ||,GIt,,d=f==&&	 $   r   N)rz  r   r  r  rF  r  rw   r   )r{   r   r   r  )rr  r  r  r  r  r  s   r   CSEProxyr    sL                    r   r  c              3  4   K   | ]}t          |          V  d S rz   r  )r   r{  s     r   r   z,TritonKernel._lift_helper.<locals>.<genexpr>  s(      BBFBBBBBBr   return r  )rO   r   rL   r9  r   r`  ra  from_iterablern  rL  rg   !torch._inductor.shape_propagationr  r'   r  rE   set_ops_handlerr  r  r   )r   rc  rb  r  r  	signaturerg   r  r  outputsrr  r  r  r  r  r  s     ``      @@@@@@r   _lift_helperzTritonKernel._lift_helper  s   
  !!'''ee
 
 
 
 
 

 1XX
 
 
 IIRRio.K.KD.Q.QRRRRR	6666777#%%	 *PPPPPPPPPPPP22442244	 	 	 	 	 	 	 	 	 	 	 	~ 	 	 	0 ]]__ 	2 	2a/

;; 	2 	2b$iGiiBB'BBBBBG0w00111	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2
 $(():):k(RRRs6   
F'=E0$F0E4	4F7E4	8FFFrq  UCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]c                     j         sJ  j        r
J d            t          d  j        D                       }                     |           t          |          } j        r
J d            g }g }t          d |D                       }t          j	         j
        j         j                                       |||          }                                  j        z
  }t!          ||          D ][\  }	}
 j
                             j        |	 dt#          |
           d|
|	j                  } j
                             j        d| d	                                  d|
t                                                               }	|                    |	           t-          |
          } j        s                                 }d
|d<    j
                            |
|          }dd	                    |           d}|
j        rdnd} j                            | d| d	| d	| d           |                    |           ]d  fd} |d |           d| d	| d|||          } j        sd fd|D             } |t          |          t          |                    } |t          |          |          }fdt!          ||          D             }t!          |||          D ])\  }}} j                            | d| d	| d           *n|}|D ]-}t;          |t<                    sJ t          |          |_        .t          |          S )z:
        Perform an associative scan on 'values'.
        TODOc              3  *   K   | ]}|j          d V  dS r  r  r$  s     r   r   z$TritonKernel.scan.<locals>.<genexpr>  r  r   z(ops.scan not supported inside ops.maskedc              3  4   K   | ]}t          |          V  d S rz   rA   r   r$  s     r   r   z$TritonKernel.scan.<locals>.<genexpr>   +      FFe*511FFFFFFr   r_  ro  rl  rm  rn  r  r  r  r  zfloat('nan')z-1rA  c                @    d                     d | D                       S )Nr  c              3      K   | ]	}| d V  
dS ,Nr   r   rc  s     r   r   z1TritonKernel.scan.<locals>.csv.<locals>.<genexpr>$  (      <<EuKKK<<<<<<r   r   rb  s    r   csvzTritonKernel.scan.<locals>.csv#  #    88<<V<<<<<<r   c                    t          |          } fdt          |          D             }t          
fd|D                       r
fd|D             S 
fdt          ||          D             }
j                             	|           d             t          ||          D ])\  }}r|_        
j                            ||           *t          |          S )Nc                $    g | ]} d | d  S rn  r   r   r  r  r  s     r   r	  z;TritonKernel.scan.<locals>.cse_multiple.<locals>.<listcomp>(  /    DDDaT11Q11%11DDDr   c              3  L   K   | ]}j                             |          V  d S rz   rr  containsr   r  r   s     r   r   z:TritonKernel.scan.<locals>.cse_multiple.<locals>.<genexpr>)  3      LLI48$$Y//LLLLLLr   c                D    g | ]}j                             |          S r   rr  r  r  s     r   r	  z;TritonKernel.scan.<locals>.cse_multiple.<locals>.<listcomp>*  '    LLLIY//LLLr   c                X    g | ]&\  }}j                             ||j                   'S r  r  r   r$  rc  r   s      r   r	  z;TritonKernel.scan.<locals>.cse_multiple.<locals>.<listcomp>+  sA       "UE e5;??  r   r  
r1  r9  r'  r  rt  r   r   rr  r  r  )r  rb  r  r  r  
cache_keysresult_varsr  r  r  r   s   ` `      r   cse_multiplez'TritonKernel.scan.<locals>.cse_multiple&  s2   FADDDDD588DDDJLLLLLLLLL MLLLLLLLL   &)&&&9&9  K L""3{##....   *-[*)E)E 4 4%
I 1+0J(Y
3333%%%r   ztl.associative_scan((r9  c                J    | j         d S t          | j                   }d|d<   |S )Nr  r  )r   r7  )r7  r   s     r   _partial_scan_shapez.TritonKernel.scan.<locals>._partial_scan_shapeC  s*    9$4 OOE #E"I Lr   c           	     j    g | ]/} d | dt          |j                   |                    0S )ztriton_helpers.select_one((z1), rbase == (RBLOCK - 1), dim=-1, keep_dims=True)rl  )rA   r$  )r   partial_scan_varr  cse_computes     r   r	  z%TritonKernel.scan.<locals>.<listcomp>K  si     # # # % u2Buuu-.>.DEE--.>??  # # #r   c           	     T    g | ]$\  }} d | d| d|j         |j                  %S )ztl.where(roffset > 0, rn  ro  rl  rl  )r   	full_scanpartial_scanr  s      r   r	  z%TritonKernel.scan.<locals>.<listcomp>U  sb        ,I| IYII,III&,&,    r   z = tl.where(roffset > 0, ) r0  r  r   r   r{  r   r  r  r^  r_  rr  rs  rt  r  re  r/  r  r(  r   r  r  r  r3  r  r  r   r5  r  r   r  r<  r   )r   r  rq  rb  r  broadcasted_valuesaccumulatorscombine_helper_fnr  rc  r$  value_dtyper`  reduced_sizerl  reduced_size_strr7  r  partial_scan_varspartial_reduce_vars	accs_nextfull_scan_varsr  acc_nextpartial_reducer  r  r  r  s   `                         @@@r   scanzTritonKernel.scan  s    $$$$-55v55-MMD<LMMMMM%   u?NN$NNN"FFvFFFFF'(94<HH --j&&II%%''$*AA// 	1 	1LE5(++;;1%88;;;k	 ,  K H%%J;JJ$2E2E2G2GJJJD002233	 &  E %%e,,,&u--H, 1#3355#&R "hooEoNN#Atyy'>'>#A#A#A ,1,CM..	##"YY/?YY7YYhYYY   ##K000	= 	= 	=	& 	& 	& 	& 	& 	&$ )L[CC(:$;$;[[[[GX[[[	
 
 ( %	,! ! !# # # # # ):# # # #
5#6#6>Q8R8RSSI'Zl(;(;=NOON    03>CT/U/U  K :=<)<: :  5+~ &&"ZZXZZZZZ    ,K% 	5 	5Jj*;<<<<<#-e#4#4J  [!!!r   stable
descendingc                     j         sJ  j        r
J d            t          d  j        D                       }                     |           t          |          } j        r
J d             j        s
J d            t          j	         j
        j         j                                                    j        z
  }t          d D                       t!                    t!          |          k    sJ  fdt#          |          D             }d  fd} j        d	         j        sJ                       j        d	                   rd
nd}	t!          |          dk    r0d|d          d|d          d|	 d| d| d| d}
 ||
||          }nt)          d          t+          ||          D ]\  }}||_        |j        |_        t          |          S )Nr  c              3  *   K   | ]}|j          d V  dS r  r  r$  s     r   r   z$TritonKernel.sort.<locals>.<genexpr>u  r  r   z(ops.sort not supported inside ops.maskedz3ops.sort is only supported in persistent reductionsc              3  4   K   | ]}t          |          V  d S rz   r  r  s     r   r   z$TritonKernel.sort.<locals>.<genexpr>  r  r   c           
         g | ]Q\  }} d | d                                  d|         t                                                              RS )rm  rn  ro  rl  )r  r  r  )r   r  rc  r  r  r   s      r   r	  z%TritonKernel.sort.<locals>.<listcomp>  s     
 
 
 5 KD5DDD,?,?,A,ADDDQiD002233  
 
 
r   c                @    d                     d | D                       S )Nr  c              3      K   | ]	}| d V  
dS r  r   r  s     r   r   z1TritonKernel.sort.<locals>.csv.<locals>.<genexpr>  r  r   r  r  s    r   r  zTritonKernel.sort.<locals>.csv  r  r   c                    t          |          } fdt          |          D             }t          
fd|D                       r
fd|D             S 
fdt          ||          D             }
j                             	|           d             t          ||          D ])\  }}r|_        
j                            ||           *t          |          S )Nc                $    g | ]} d | d  S r  r   r  s     r   r	  z;TritonKernel.sort.<locals>.cse_multiple.<locals>.<listcomp>  r  r   c              3  L   K   | ]}j                             |          V  d S rz   r  r  s     r   r   z:TritonKernel.sort.<locals>.cse_multiple.<locals>.<genexpr>  r  r   c                D    g | ]}j                             |          S r   r  r  s     r   r	  z;TritonKernel.sort.<locals>.cse_multiple.<locals>.<listcomp>  r  r   c                X    g | ]&\  }}j                             ||j                   'S r  r  r  s      r   r	  z;TritonKernel.sort.<locals>.cse_multiple.<locals>.<listcomp>  sA        E5 e5;??  r   r  r  )r  r  r  r  r  r  r  r  r  r  r   s   ` `      r   r  z'TritonKernel.sort.<locals>.cse_multiple  s6   &''ADDDDD588DDDJLLLLLLLLL MLLLLLLLL   $'0B$C$C  K L""3{##....   *-[*)E)E 4 4%
I 1+0J(Y
3333%%%r   r  rx   rnumelr   ztriton_helpers.sort_with_index(r   rn  rG   z	, stable=z, descending=ro  zUnhandled sort)r0  r  r   r   r{  r   r  r  r^  r_  rr  rs  rt  re  r/  r  r1  r  r4  r*  r  r  r   r=  )r   r  rb  r  r  r  r  r  r  r  r  r  r  	input_varr  r  s   ``            @@r   sortzTritonKernel.sortl  s    $$$$-55v55-MMD<LMMMMM%   u?NN$NNN"( 	
 	
A	
 	
(  '(94<HH%%''$*AAFFvFFFFF6{{c&kk))))
 
 
 
 
 
 &f--
 
 
	= 	= 	=	& 	& 	& 	& 	& 	&$ #00002243CB3GHHVhv;;!O2DQ2G O OK]^_K` O OO O!O O,2O OAKO O O  ',t-?OOKK !1222%(f%=%= 	1 	1!J	#(J  ) 0J[!!!r   c                
   | j         s%| j        s| j        s| j        s| j        s	| j        sdS d | j        D             }| j        rtt          |          dk    r`t          |          D ]\  }}| j
                            |          5  |j        }| j        rdnd}| j        rdn| d}| j
                            d	| d
| d| d|                                 d	           ddd           n# 1 swxY w Y   | j
                            |dz             5  |                     || j
                   ddd           n# 1 swxY w Y   | j
                            t          |                    5  |                     | j
                   | j
                            | j                    | j
                            | j                   | j
                            | j                   | j
                            | j                   ddd           n# 1 swxY w Y   t'          g t          |                    D ]w\  }}| j
                            |dz             5  | j        |j                                                 D ]\  }}|t          |          dz
  k     rn||dz            }	| j        |	j                 |         }
t.                              |	          }t3          |	j        |          fdt7          ||
          D             }| j
                            t9          | j        |         | d| dt<          j                             |           d                     	 ddd           n# 1 swxY w Y   | j!        "                    | j#                   |$                                 yn|| j
                            | j                    | j
                            | j                   | j
                            | j                   | j
                            | j                   | j
                            | j                   | j        rQ| j        s| j        rC| j%         d}| j
                            d| dd           | j&        '                                 | j
                            | j                   | j         (                                 | j        (                                 | j        (                                 | j        (                                 | j        (                                 | j        (                                 dS )a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        Nc                     g | ]}|j         	|S r   r5  r$  s     r   r	  z-TritonKernel.codegen_body.<locals>.<listcomp>  s    HHHt4<HdHHHr   r   )r   rsplit_startr  
rsplit_endr  zfor zoffset in range(rn  zBLOCK):rG   c                &    g | ]\  }}||z  z
  S r   r   )r   curprevprev_num_iters      r   r	  z-TritonKernel.codegen_body.<locals>.<listcomp>  s7     + + +$-C !$d]&: :+ + +r   z = tl.advance(ro  z + tl.program_id(1)zR
                if HAS_RSPLIT:
                    triton_helpers.x_grid_barrier(r,  Tr  ))indexing_coder  r  rt  r  r  r   r0  r1  r  r  r  r-  r  r   r   r8  r<  r   r8  r  r   r*  r   r   r   r  r  rN   r  rE   r+  rq  rr  
invalidater  cache_clearr-  r.  r  clear)r   
loop_treeslevelr   r-  
loop_startloop_endri  advancement	prev_treeprev_advancement
prev_blocksem_ptrr  s                @r   codegen_bodyzTritonKernel.codegen_body  s    	z	 {	 |		
 %	 #	 FHHt'7HHH
  9	*S__q%8%8(44 J JtY%%U%33  ![F373M!VSVJ(,(BX6HXHXHX  I''hvhhzhhXhhQWQ]Q]Q_Q_hhh                 Y%%UQY%77 J J88tyIIIJ J J J J J J J J J J J J J J !!Z!99 . ...ty999	  !3444	  ,,,	  ...	  ---. . . . . . . . . . . . . . .  ((@)J*?*?(@AA # #tY%%UQY%77  262K	3egg .	; !3z??Q#666(2519(=I/3/H )0'0), *7)E)Ei)P)PJ,3IOZ,P,PM+ + + +14[BR1S1S+ + +K
 	++( $ 8 C#, n nI n nI^I^_jIkIk n n n    !              4 ##D$:;;;  """"9#< IT/000ITZ(((IT\***IT[)))	/000% 	E"	E&*&:	E -BBBGI3:        6BBDDD	-...  """
$$&&&""$$$$$sK   
AC00C4	7C4	EE	E	5BHHH%DM::M>	M>	r  c                   g }|                                  r3g }|                     d|g            |D ]}t          |t                    r#|                    t          |                     ;t          |t                    rJ|                    t          t          j        j	        
                    |j                                       t          |t          j                  rE|                    t          t          j        j	        
                    |                               t          dt          |                     |S )Nr   z!Unsupported numel argument type: )r@  add_numel_to_call_argsr  rs  r  r   rc   rE   r&  r  	size_hint
inner_exprr   r  r  r  )r   r  
numel_argsrG  s       r   kernel_benchmark_extra_argsz(TritonKernel.kernel_benchmark_extra_args  s$   !! 	V+-J''J;;;! V Vc3'' VKKC))))_55 VKKAG$4$>$>s~$N$N O OPPPPUZ00 VKKAG$4$>$>s$C$C D DEEEE$%Tc%T%TUUUr   c                   t                      }| j                                        \  }}}}|                    g d           |                                5  t          j                    }g }t          ||          D ]\  }	}
dt          |           }t          j
                            |	          }|r|                    | dt          j
        j                            |                                | j                   dt          j
        j                            |                                | j                   d|                                 d|                                 d
           n|	t          j
        j        v rt          j
        j        |	         }|                    | dt          j
        j                            |                                | j                   dt          j
        j                            |                                | j                   d|j         d|j         d
           nt3          |
t4                    rOt          j
        j                            |
j                  }d	|
j        v rd
}|                    | d|            nt3          |
t<                    rnt          j
                                        }t          j
        j                            |
j                  }|                    | d| d| d|
j         d           ntA          d|	           |!                    |           |"                    | #                                           |                    dd$                    |           d           d d d            n# 1 swxY w Y   |                    g d           t          j
                                        }|j%        }|                                5  |                    dt          j
        j&        '                    |           d           |                                5  |                    t          j
        j&        (                    |                     d| }|                    | d| d           |                    tS          tT          j+                   d| d           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |                    g d           |                                5  |                    dt          j
        j&        '                    |           d           |                                5  |                    t          j
        j&        (                    |                     |                    dtS          tT          j+                   d           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |                    g d           |                                5  |                    d           |                    d           |                    d           |                    d           |                    d|            |                    d           |                    d           d d d            n# 1 swxY w Y   |S ) N)r   r   zdef get_args():arg_z = rand_strided()r  rn  z
, device='z	', dtype=ro  r  r   r  z = torch.zeros(z*Don't find the buffer or const tensor for r  r  )
r2  zdef call(args):zwith r  streamz = get_raw_stream(z.run(*args, stream=)r2  r2  z def benchmark_all_configs(args):z.benchmark_all_configs(*args))r2  r2  zif __name__ == '__main__':z<from torch._inductor.runtime.benchmarking import benchmarkerr   zargs = get_args()z:ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40)z	num_gb = zgb_per_s = num_gb / (ms / 1e3)z<print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s")),rO   r  python_argdefs
writelinesr  r`  r  r  r  rE   r&  try_get_bufferr   r  
size_hintsget_sizer  
get_stride
get_devicer  	constantsr  r  devicer$  r  rT   r,  r  rz  rV   r  KeyErrorr  extendr/  r   r   
device_opsdevice_guard
set_devicer   r:   KERNEL_NAME)r   num_gbr=  _argdefs	call_argsr  r  name_cnt	var_namesarg_namearg_sigvar_namers  const_tensorsymval_hintr<  r  current_devicer   stream_names                       r   codegen_kernel_benchmarkz%TritonKernel.codegen_kernel_benchmark&  s	   !!,0I,D,D,F,F))Y555666]]__ %	@ %	@ ((HI%(I%>%>  +  +!'2$x..22g,,X66 $$#  O  OQW5E5P5PQTQ]Q]Q_Q_os  pB5P  6C  6C  O  O  GH  GN  GW  Gb  Gb  cf  cq  cq  cs  cs  CG  CU  Gb  GV  GV  O  O  be  bp  bp  br  br  O  O  }@  }J  }J  }L  }L  O  O  O    !222#$7#4X#>L$$#  _  _QW5E5P5PQ]QbQbQdQdtx  uG5P  6H  6H  _  _  LM  LS  L\  Lg  Lg  ht  h{  h{  h}  h}  MQ  M_  Lg  L`  L`  _  _  lx  l  _  _  JV  J\  _  _  _     11 "#'"2"<"<W\"J"JK
 %44&'$$%B%B[%B%BCCCC66 	W@@BBFG,66w}EEE$$#ffEffVffV]Vcfff    #OXOO     ****T==??@@@>tyy';';>>>???K%	@ %	@ %	@ %	@ %	@ %	@ %	@ %	@ %	@ %	@ %	@ %	@ %	@ %	@ %	@N 	999:::<<>>$]]__ 
	 
	NQW%7%D%DU%K%KNNNOOO    G&11%88   /u..  K!K!K5!K!K!KLLL  ;233VVVVV                
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 	JJJKKK]]__ 	 	NQW%7%D%DU%K%KNNNOOO    G&11%88     Yc+"9::YYY  	              	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	DDDEEE]]__ 	 	N   R   0111L   111222=>>>N  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  s   MN55N9<N9AT B
S6*T6S:	:T=S:	>TTTAXA(X;XX	XX	XX"%X"B[77[;>[;c                    t          j        d                    t          j        j                            d                              S )Nzl
            from torch._dynamo.testing import rand_strided
            {}
            import torch
        get_raw_stream)textwrapdedentr  rE   r&  r?  import_get_raw_stream_asr   s    r   imports_for_benchmark_kernelz)TritonKernel.imports_for_benchmark_kernel  s>     F17%>>?OPPQQ
 
 	
r   c                `    | j         rdS | j        rdS | j        r| j        sJ dS | j        rdS dS )Nr  r  r  r  	pointwise)r  r  r  r0  r   s    r   _get_heuristiczTritonKernel._get_heuristic  sX     	!>' 	**& 	(((())" 	;{r   c                    t           j        j                                        t          j                    t
          j        t
          j        t
          j        j	        t
          j
        t
          j        t
          j        t
          j        t
          j        t
          j        j        t
          j        j        t
          j        j        d} t           j        j        d| d<   t          j                    rd| d<   t
          j        r<t
          j        | d<   t
          j        | d<   t
          j        | d<   t
          j        | d<   t
          j        r-t
          j        | d	<   t
          j        | d
<   t
          j        | d<   | S )N)backend_hash$are_deterministic_algorithms_enabledassert_indirect_indexingautotune_local_cacheautotune_pointwiseautotune_remote_cacheforce_disable_cachesdynamic_scale_rblockmax_autotunemax_autotune_pointwisemin_split_scan_rblockspill_thresholdstore_cubinTis_hipr  profile_bandwidthprofile_bandwidth_regexprofile_bandwidth_output/profile_bandwidth_with_do_bench_using_profilingcoordinate_descent_tuning coordinate_descent_search_radius'coordinate_descent_check_all_directions)r|   r\  _tritontriton_hash_with_backendr[  r   r\  r]  r   r^  r_  r`  ra  rb  rc  rd  re  rf  r  r  r  rh  ri  rj  rk  rl  rm  rn  )inductor_metas    r   inductor_meta_commonz!TritonKernel.inductor_meta_common  s>    "K/HHJJ494^4`4`(.(G$*$?"(-"B%+%A$*$?$*$?"/&,&C%+]%H%}<!=4
 
 =(&*M(# 	.)-M+&# 	171IM-.7=7UM348>8WM45F KL + 		0 56 7 <= > CD r   c                0  +, t                      }i }| j                                        D ]\  }}t          |          r| j        st
          j        j                            |          }t          |t          t          j        f          sd}nt          t          |                    }|||<   ||                    t                                 t
          j                                        j        }|dk    r|                    d           n|                    d           t$          j        r'|                    |                                            | j                                        \  +}	,}	t/          ,          D ]\  }
}t          |t0                    rit3          t          j        |j                  }|t
          j        j        j        v r2t1          |j        t
          j        j        j        |                   ,|
<   t=                      }| j        D ]}|| j        j         v r%|!                    | j        j         |                    || j        j"        v rY|t
          j        j#        vrF|| j#        vr=|!                    t3          tH          | j        j"        |                   j%                   || j        j&        v r>| j        j&        |         }t          |tN                    rJ |!                    |           tQ          +,          D ]I\  }}t          |tR                    r/|j*        tV          j,        k    r|!                    |j                   Jt[          |          }| .                                D ][}t1          |j/         d|j0                  },1                    |           +1                    te          |j                             \+,fd}| j3        D ]>}|j4        r| j5        r|j6         ||j/        7                                 d           ?| j8        r |d	           ts          ,| j:        +
          }|tw          j<        t
          j                                                  i d}t
          j        j=        pt
          j        j>        }| ?                                j@        t          | jB                  t          t          jE                  ||| jF        | jG        | jH        d| I                                }t
          jJ        jK        4                                o| j5         }| jL        }t          | jN                  dk    o|duod|v }|ri|rf|J | jK        O                    | jN                  }|jP        jQ        jR        d         }|jS        }|d         t          |d         d          z  }|jU        jQ        jV        }|jP        jQ        jV        } t
          j        j        W                    |t$          jX                  t          t
          j        j        W                    | t$          jX                  d          z  }!|!dk    ro|dk    rit
          j        j        Y                    | jK        jZ        d          r:t
          j        j        [                    | jK        jZ        d          r|dk    rd|d<   | jL        r
| jL        |d<   | j\        r
| j\        |d<   | j8        r
| j5        |d<   d}"t$          j        st$          j]        r| ^                                dz  }"|"|"|d<   t$          j        r| _                                }#|#|#|d<   t          ,          g|d <   t          ,          D ]}$d|d!         ,|$         j        <   || _b        | c                                 | jd        D ],}%|e                    d"           |                    |%           -| jf        r,d#| g                                 d$| jf        j        d%|d&|d'	}&n| j        r>| jK        h                                }'d#| g                                 d(|d)|' d%|d&|d'}&nfd"}(t          |          dk    r%t          t          ,                    d*k    rd+}(nd,}(d#| g                                 d(|d-|( d.|d&|d/| jj         d'}&|                    |&           |e                    d0|pt          t          jk                   d1d-l                    d2 +D                        d3           |m                                5  | n                    |           | j        o                                D ]\  })}*|e                    |) d4|*             |                    | jp                   ddd           n# 1 swxY w Y   t$          j        r(|                    | q                    |"                     |r                                S )5z
        Convert the TritonKernel from Inductor SIMD IR to triton code, including inductor triton heuristics, imports,
        metadata, and benchmarking infra.
        i    Ncpuz"triton_helpers.set_driver_to_cpu()z"triton_helpers.set_driver_to_gpu()r  c                    t                      r"                    t          |                                          t          | d                     d S )NT)is_constexpr)r@   r  rK   rI   )rH  argdefsr  s    r   add_constexpr_argz6TritonKernel.codegen_kernel.<locals>.add_constexpr_arg#  sR    -// 9  h!7!7888NN78$???@@@@@r   r   r  )
size_dtyperw  )r  r<  r;  )	grid_typer  kernel_namemutated_arg_namesoptimize_memr,  num_loadnum_reductionr   r%  r   r0_rG   )fallbackg?g       @i   i   
   Tadd_persistent_rblocktiling_scoresr  r  g    eAkernel_num_gbkernel_flopconfigsr;  r   z$
                @triton_heuristics.z(
                    config=zI,
                    filename=__file__,
                    triton_meta=z$,
                    inductor_meta=z;
                )
                @triton.jit
            z!(
                    size_hints=z%,
                    reduction_hint=r-  ztile_hint=TileHint.SQUARE,ztile_hint=TileHint.DEFAULT,rn  zH
                    filename=__file__,
                    triton_meta=z*,
                    min_elem_per_thread=zdef r  c              3  >   K   | ]}|                                 V  d S rz   )	full_namer  s     r   r   z.TritonKernel.codegen_kernel.<locals>.<genexpr>  s+      CcCcVWAKKMMCcCcCcCcCcCcr   r  r  )srO   r2  r*  r;   r0  rE   r&  r  symbolic_hintr  rs  r   rJ  r/   r   r   r  r  r   benchmark_kernelrU  r  r4  r  rT   r	   r   r  inv_precomputed_replacementsrz  r   	mutationsinput_buffersr  r  removed_buffersrP   
inner_nameoutput_buffersrS   r  rV   	zero_moderW   ZERO_ON_CALLr   rx  r-  r  r  rI   r   r4  r  
tensor_dimr   r  rb   r  r+   r>  is_inferenceis_backward_get_grid_typer{   setr  r   r:   DESCRIPTIVE_NAMEr,  r~  r  rr  r+  r3  r  r1  r	  memory_stats
persistentmemoryr  count_per_threadry  loopedbytesr,  unbacked_symint_fallbackstatically_known_leqreduction_numelstatically_known_gtr  rh  estimate_kernel_num_bytesestimate_flopsr^   r_   r  r)  r  r   r  rX  get_reduction_hintr`   rz  rB  r   r  codegen_static_numelsaliasesr  rO  r   )-r   rz  coder7  r-  r  
numel_hintr,  device_typer  r  rG  rT  mutated_argsmutationmutation_argargnamer   sizeargrx  triton_meta_signaturer  r}  rq  
looped_redr  	two_d_redr  	dim_statsmem_ops_per_threadr_coalesce_ratio
looped_mempersistent_memsaved_bytes_ratiorC  flopsarg_numr  heuristics_linereduction_hint	tile_hintoldnewrw  r  s-                                              @@r   codegen_kernelzTritonKernel.codegen_kernel  s9    
![..00 	+ 	+MFE"6** 43H )77>>Jj3*>?? = !		+C
OO<<	!*Jv<KK133444'==??DKe##@AAAA@AAA& AD==??@@@#'9#;#;#=#= Iq	** 	 	FAs#w''  elCH55QW-JJJ#*!'"2"OPV"W$ $IaL )3 	/ 	/H49222  !8!BCCCDI555AG$;;;D$888  )B8)LMMX   49333#y7A%lJ?????  ...  33 	/ 	/LGS3--/M%6%CCC  ...l++++-- 	2 	2D333TZ@@GW%%%NN77<001111	A 	A 	A 	A 	A 	A $ 	= 	=D  T%> &!2!2!4!4;;;<<<<% 	(h''' 1$"2G!
 !
 !
 /&-ag.Q.Q.S.STT'
 '
 w+Bqw/B ,,..7!$"566{;<<!-(!/
 
 ''))
 X&3355Wd>W:W
*!Xm4&?XC=DX 	  &	>) &	> ,,,=55dkBBL$/6:1=I!*!;  -U3c-:La6P6PP%,39J)4;AN ! 0 : :V%D !; ! ! **"V-L +   	 ! "S(( %++G$99M15  , G$88M14  , '",,9=56 	@-1-?M/*# 	L373KM/0% 	N484MM01" 	8f&> 	83355;F!17o." 	5''))E /4m,"+I"6"6!7I +955 	B 	BG@AK$Yw%7%<==&+ 	  	 FNN2KK (	$($7$7$9$9  -4  "-	 
 $1  OO " 	!]==??N	$($7$7$9$9	 	 *	 	 %3	 	
 "-	 	 $1	 	 	OO I:!##/	::;;q@@ <II =I	$($7$7$9$9	 	 *	 	09	 	 "-		 	
 $1	 	 *.)A	 	 	O 	O$$$g473{677gg$))CcCc[bCcCcCc:c:cggg	
 	
 	
 [[]] 	# 	#&&t,,, I--// 1 1S#//#//0000KK	"""		# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# " 	?KK55f==>>>}}s   A)gg	g	c                   t           j        j                            |           } t	          | t
          j        t          f          rt          |           }t          |          }nid}t           j        j        	                    | |          sB|dk    rt          d|            |dz  }t           j        j        	                    | |          B|S )N   i @  z!Failed to find static RBLOCK for r   )rE   r&  r  simplifyr  r   rJ  rs  r/   r  r  )r  r"  s     r   r  z#TritonKernel._get_persistent_RBLOCK  s    !**622fu}c233 	f++C!#&&CCCg&;;FCHH ??$%Q%Q%QRRRq g&;;FCHH  
r   c                ^    	 t                               |            dS # t          $ r Y dS w xY w)NTF)r  r  r  )r  s    r   has_persistent_RBLOCKz"TritonKernel.has_persistent_RBLOCK  sB    	//7774 	 	 	55	s    
,,c                x   dd}| j         D ],}|j        r| j        r`t          j        j                            |j                  } ||          r,|                    |j	         dt          |                      |j        r| j        r| j        r4|                     |                     |j                            }d| d}n|                     |j                  }|                    |j	                                         d	|            |j	        d
k    r| j        r|                    d           .dS )a  
        We get a small speedup from hard coding numels if they are static.

        This code stomps on the passed-in values by writing an constant to the top of the kernel.

        In a kernel like:
        def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):

        We would add
        xnumel = 4096
        r0_numel = 768

        After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes
        a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream
        knows that its a static numel, as that you just plop a constant into the kernel.
        r  r   rw   rv   c                D    t          | t          j        t          f          S rz   )r  r   rJ  rs  rr  s    r   is_static_integerz=TritonKernel.codegen_static_numels.<locals>.is_static_integer  s    dU]C$8999r   znumel = z*triton_helpers.constexpr_next_power_of_2((z + RSPLIT - 1) // RSPLIT)zBLOCK: tl.constexpr = r%  zXBLOCK: tl.constexpr = 1N)r  r   rw   rv   )r   r4  r0  rE   r&  r  r  r  r   r-  rs  r  r  r  r  r  r   r,  )r   r  r  r   simplified_tree_numelr  r"  s          r   r  z"TritonKernel.codegen_static_numels  sg   $	: 	: 	: 	: $ 	; 	;D$ Y(= Y()(8(A(A$*(M(M%$$%:;; YNNdk#W#W3?T;U;U#W#WXXX  TT%> T- B JJt';';DJ'G'GHHEgugggCC55djAAC$+"3"3"5"5RRSRRSSS{c!!dm!9:::	; 	;r    type[triton_heuristics.GridExpr]c                j   t          d | j        D                       }| j        r|dk    sJ t          j        S |dk    rt          j        S |dk    r?t          t          | j        | j                            rt          j	        S t          j
        S |dk    rt          j        S t          d|           )Nc                8    g | ]}t          |j                   S r   )rs  r4  r$  s     r   r	  z/TritonKernel._get_grid_type.<locals>.<listcomp>  s&    III**++IIIr   rG   r   r   z"Unsupported number of dimensions: )rz  r   r  r(   CooperativeReductionGridGrid1Dr   r   rX  Grid2DWithYZOverflowGrid2DGrid3Dr  )r   r  s     r   r  zTritonKernel._get_grid_type  s    II8HIIIJJ% 
	,6666$==!VV$++!VV3t2D4DEEFF >(==$++!VV$++AaAABBBr   c                T   | j         D ]}t          |j        t          j        t          j        f          r|j        }n%t          j        j        	                    ||          }|j
        r| j        r7|                    |           |                    t          |                     d S rz   )r   r  r  r   rJ  r   rE   r&  wrapper_codegenerate_numel_exprr4  r0  r  r  )r   rz  rE  	arg_typesr   r  s         r   r+  z#TritonKernel.add_numel_to_call_args&  s    $ 	- 	-D$*u}el&CDD Lzw+??dKK$ -(= -  &&&  d,,,	- 	-r   r  Optional[IRNode]c                   t           j        j        }|                                 | j                                        \  }}}}|                     |||           | j        j        D ]}|                    |           |	                    ||d|| j
                   t          | j        j                  D ]}|                    |           d S )NT)r   r  r  )rE   r&  r  write_triton_header_oncer  r4  r+  workspace_argsgenerate_workspace_allocationgenerate_kernel_callr  r8  generate_workspace_deallocation)r   rz  r  wrapperr  rE  r  wss           r   call_kernelzTritonKernel.call_kernel2  s    '&((***%)Y%=%=%?%?"9a##D)Y???)* 	6 	6B11"5555$$( 	% 	
 	
 	
 49344 	8 	8B33B7777	8 	8r   c                   t           j        j        }| j                                        \  }}}}t          ||          D ]~\  }}t          |t                    rdt           j        j        r|	                    d| d| d           Hd| d}|	                    |           d| d}|	                    |           d S )Nz:AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_check_inf_and_nan("z", z));zassert not z.isnan().any().item()z.isinf().any().item())
rE   r&  r  r  r4  r  r  rU   cpp_wrapperr   )r   r  r  rE  arg_signaturesrG  arg_signaturer  s           r   codegen_nan_checkzTritonKernel.codegen_nan_checkF  s    '&*.)*B*B*D*D'9na"%i"@"@ 
	, 
	,C-33 	,7& ,%%eUXee]`eee    DCCCD%%d+++CCCCD%%d+++
	, 
	,r   r<  c                    t          |i |S rz   )r<  )r   r  rF  s      r   r  zTritonKernel.create_cse_varU  s     $1&111r   entryrZ   c                    |j          d|                     |                     |j                             }|j        j        r| j                            |           d S | j                            |           d S )Nr  )	rz  r  r  r  rootr6  r  r   r  )r   r  r  s      r   codegen_iteration_ranges_entryz+TritonKernel.codegen_iteration_ranges_entryX  s{    *OOD,@,@,L,L!M!MOO: 	&((..... I%%%%%r   r[   c                    |j         J |                     |j                   }| j        }|dk    rd| dnd}| j        r| j        r|j        r| d}d|j                                         d| | S )NrN  r_  ro  r   z + rsplit_startztl.arange(0, zBLOCK))r  indexing_size_strr  r  r  r4  r-  r   )r   r  r  r  r  s        r   r9  z)TritonKernel.iteration_ranges_ranges_code`  s    +++%%e&677&*5*C*C&&&&&&	0)	0 "	0
 ///FIu|1133II4IIIIr   r   c                \    | j         }|                                 }dg|z  }d| d| d| dS )NrG   r  rn  ro  )r  re  )r   r  rc  r  rf  r  s         r   iteration_ranges_scalar_codez)TritonKernel.iteration_ranges_scalar_codem  sI     &&&((sTz9$99%99;9999r   c                    |j         J d|j          d}|                     |          rd| d|j         dz    d|j          d}|j                            ||          }| j        dk    r| d	| j         dS |S )
Nztl.program_id(ro  r  z + tl.program_id(rG   z) * tl.num_programs(r  rN  r_  )r+  rX  	pid_cacher  r  )r   r  r!  pids       r   iteration_ranges_get_pidz%TritonKernel.iteration_ranges_get_pidu  s    ~)))0u~000 &&u-- 	g gcffENQ,>ffTYTbfffCo!!#s++z))22t/2222
r   c                    |j         dk    oF|j         o>| j         o6t          j        j                            |j        t                                 S r  )	r+  has_zdimr  rE   r&  r  r  r  r.   )r   r  s     r   rX  z#TritonKernel.needs_yz_grid_overflow  sY    Na YN"Y..Y G$99%+~GWGWXXX		
r   r-  rs  c                    | j         r"| j         |                                 d         S t          |                                         S )Nr   )r  r   r,   )r   r-  s     r   rZ  zTritonKernel.max_block  s?     	?$%=%=%=>>//r   r   c                   | j         sdS | j        rQ|j                                         d| j        v r.| j        |j                                         d         dk    rdS n,t          j        j                            |j        d          rdS |j	        r"| j
        r|                     |j                  }n/|j        dk    r
| j        rd}n|                     |j                  }|j	        r| j        r||                                 z  }t          j        j                            |j        |          rH|j        dk    p<|j        p5t          j        j                            |j        t)                                S dS )NFr   rG   Tr%  )r  r  r-  r   rE   r&  r  r  r  r4  r  r  r,  rZ  r  r  rT  r+  r  r  r.   )r   r   rZ  s      r   r*  zTritonKernel._has_constant_mask  s   ! 	5 	DK$5$5$7$7!>!>!>$BS!S!S DK$5$5$7$7!>!>!>?1DDt E w77
AFF t  	4!: 	433DJ??II[CDMIIt{33I 	6!; 	6!DOO$5$55I 788YOO 	" W=W7#88^EUEUVV ur   c                `    | j         d         }|j        dk    sJ |                     |          S )Nr   r%  )r   r-  r*  )r   xtrees     r   r1  z TritonKernel._has_constant_xmask  s6     #|s""""&&u---r   r   r   c                    | j         D ]4}|                     |          r|                    |j         d           5|                    d           d S )Nr   rx   )r   r*  r  r-  )r   r   r   s      r   r{  zTritonKernel.filter_masks  sf    $ 	8 	8D&&t,, 8!!T["6"6"6777 	&!!!!!r   c                b    d t          t          j                  d | j                 D             S )Nc                (    g | ]}t           |         S r   )r   r   s     r   r	  z7TritonKernel.get_reduction_prefixes.<locals>.<listcomp>  s-     
 
 
 t
 
 
r   )r7  r   r   r/  r   s    r   get_reduction_prefixesz#TritonKernel.get_reduction_prefixes  s<    
 
]:;;<Ud>U<UV
 
 
 	
r   r  rO   c                l   d | j         D             }d                    t          d |D                                 }|                    d|                     |                      d | j         D             }t          |          }|                    d|                     |                      dS )z^
        Generates code that flattens ND reduction numels, block sizes, etc. into 1D.
        c                     g | ]}|j         	|S r   )r4  r$  s     r   r	  z9TritonKernel.codegen_reduction_numels.<locals>.<listcomp>  s     RRRD@QR4RRRr   r  c              3  *   K   | ]}|j          d V  dS )r  Nr  r$  s     r   r   z8TritonKernel.codegen_reduction_numels.<locals>.<genexpr>  s-      "U"UTdk#8#8#8"U"U"U"U"U"Ur   z	rnumel = c                J    g | ] }|j         	t          j        |j                 !S r   )r4  r   r   r   r$  s     r   r	  z9TritonKernel.codegen_reduction_numels.<locals>.<listcomp>  s;     
 
 
 
%di0
 
 
r   zRBLOCK: tl.constexpr = N)r   r   r   r   r  r=   )r   r  reduction_treesr  	rn_blocksr  s         r   r  z%TritonKernel.codegen_reduction_numels  s    
 SRD,<RRRF"U"U_"U"U"UUUVV6$**V"4"466777
 
(
 
 
	
 y))D

60B0BDDEEEEEr   r  list[sympy.Symbol]c                L    |                                  }fd|D             S )zK
        Helper to initialize symbols like rn_numel, rn_base, etc.
        c                <    g | ]}t          j        |  fi S r   )r   r   )r   r-  rF  r  s     r   r	  z7TritonKernel._get_reduction_symbols.<locals>.<listcomp>  s6    VVV000;;F;;VVVr   )r  )r   r  rF  rn_prefixess    `` r   r:  z#TritonKernel._get_reduction_symbols  s4     1133VVVVV+VVVVr   r   c                    |                                  }|                     ddd          fdt          t          |          dz
            D             t	          j        d          gz   S )z
        Compute coefficients to convert ND reduction indices to linear indices.
        For example:
          rindex = r0_index * r1_numel * ... * rn_numel + ... + rn_index.
        r  Tr   c                D    g | ]}t          |d z   d                   S rj  )r=   )r   rU  	rn_numelss     r   r	  z<TritonKernel._get_reduction_index_coeffs.<locals>.<listcomp>  s;     
 
 
47M)C!GII.//
 
 
r   rG   )r  r:  r9  r1  r   rJ  )r   r  r	  s     @r   _get_reduction_index_coeffsz(TritonKernel._get_reduction_index_coeffs  s     1133//PT/UU	
 
 
 
;@[AQAQTUAU;V;V
 
 
]1 	r   
multi_indsc                J    |                                  }t          ||          S )zK
        Compute linear reduction indices from N dimensional ones.
        )r
  r<   )r   r  coeffss      r   r;  z'TritonKernel._flatten_reduction_indices  s%     1133,,,r   c                f   |                      ddd          }|                      ddd          }|                     |          }|                    d|                     |                      |                     |          }|                    d|                     |                      dS )zX
        Generates code that converts ND reduction indices into linear indices.
        r   Tr   r   z
roffset = z	rindex = N)r:  r;  r   rq  )r   r  
rn_offsetsrn_indsrD  rindexs         r   r<  z&TritonKernel.codegen_reduction_indices  s    
 00d 1 
 

 --gtQU-VV 11*==?4#4#4W#=#=??@@@0099=$"3"3F";";==>>>>>r   r  c                   |j         }|j        r$|                    |j         d| d| d           n|j        K|                    |j         d|                     |                      |                    | d           n|j        | d|                     |           }n|                     || d          }|                    | d| 	                    |           d|
                                 d|j         d| g           |                     |          r1|                                 }|                    | d	| d
           d S |                    | d|j         d| d           d S )Nr  z	offset + r7  z
offset = 0r   z	offset = r  r   zmask = tl.full(z, True, tl.int1)zmask = z < r  )r-  r6  r   rz  r+  r9  r  r  r5  r  r   r*  r  )r   r  r  r%  r  r  s         r   r8  z,TritonKernel.iteration_ranges_codegen_header  s    L= 	NNej@@Q@@@@@AAAA^#NNejWWT-N-Nu-U-UWWXXXNNa+++,,,,+PPd&G&G&N&NPP881MMOO\\4#@#@#G#G\\AGGII\\\z,,d,,   ""5)) 	A''))ENNaGGGGGHHHHHNNa??
??q???@@@@@r   )r   TNN)r	  r
  r  r  r  r  rw   rx   r$  r%  rw   r   r   r   )r   r   rj  rJ  )r   )rz  r   r7  r   rm  r  rw   r  )r  r   r  r   r  rv   r   rv   )rz  r   r   r   rz   )
rz  r   r   r   rc  rM   r  rD   rw   rx   )r  r  NN)rb  rM   r  r  r  rM   r  r%  rB  rv   r  r  r  r  rw   rM   )rw   r	  )rc  rM   r$  r%  rw   rM   )
r$  r%  ro  r%  r  rC   rc  r  rw   r  )r$  r%  rK  )rz  r   r   r   rc  r  )rb  r  r  r  rw   r   )r  r  rq  r  rb  r  rw   r  )
r  r  rb  r  r  rv   r  rv   rw   r  )rw   r  )rw   r  )rz  r   r  r  r  )rw   r<  )r  rZ   )r  r[   rw   r   )r  r[   rc  r   rw   r   )r  r[   rw   rv   )r-  r   rw   rs  )r   r[   rw   rv   )r   r   rw   rx   )r  rO   rw   rx   )r  r   rw   r  ru  )r  r   rw   r   )r  r[   r  rO   rw   rx   )Rr{   r   r   r   rO  r  r   r  r  r  r  r  rB  r$  r'  r   r"  r!  r@  rB  rD  r   rF  rm  r  r  r  r  r}  r  r  r  r  r  r  r  r  r  rT  r>  r]  rZ  r  r\  r  r  r  r  r)  r/  rO  rU  rX  rM  rr  r  r  r  r  r  r+  r  r  r  r  r9  r  r  rX  rZ  r*  r1  r{  r4   r  r  r:  r
  r;  r<  r8  rI  rJ  s   @r   r  r  R  s,          &I%%%%).E....O$;!
 48'+(3 (3 (3 (3 (3 (3 (3T" " " "
 
 
 

# # #J  *: : :0  
 
 
 


 
 
 " " " X" GKT
 T
 T
 T
 T
 T
v
 A' A' A' A' A'FX X X X2M M M M4  ^ ^ ^ ^B SW4 4 4 4 4l  
 
 
 
& 48045 5 5 5 5n. . . .
9 
9 
9 
9
 
 
 
(X X X Xt& & & &"
 
 
 
0F
 F
 F
P# # #0& & &! ! !
   B/ / / /b>S >S >S >S@" " " "BD" D" D" D"Lb% b% b%H    X X Xt
 
 

 
 
 % % \%N\ \ \ \ \|   \   \$; $; $;LC C C C
- 
- 
-8 8 8 8 8(, , , ,2 2 2 2& & & &J J J J: : : :   
 
 
 
0 0 0 0
) ) ) )V. . . .
" " " " 
 
 
 ]
F F F F$W W W W 
 
 
 ]
- - - -? ? ? ? A A A A A A A Ar   r  c            
           e Zd ZU eZded<    eej        ej	        ej
        ej        ej        ej        ej        ej        g          Zd  fdZed!d
            Zd Zd Zd"d#dZ	 d$d%dZd&dZd'dZd Z xZS )(TritonSchedulingz	type[Any]kernel_type	schedulerOptional[Scheduler]rw   rx   c                    t                                          |           |t          |d          sd S |j        D ]*}t	          |t
          t          f          rt          |_        +d S )Nr  )	rA  rB  r   r  r  r3   r1   debug_triton_codedebug_device_str)r   r  r  rC  s      r   rB  zTritonScheduling.__init__9  sq    ###GIw$?$?FO 	: 	:D$0B CDD :(9%	: 	:r   r<  torch.devicec                    t           j        j        st           j        j        r"t	          g | j        t          j                  S | j        S rz   )r   r   cooperative_reductionsforce_cooperative_reductionsr   backend_featuresrJ   REDUCE_TO_SINGLE_ELEMENT)r   r<  s     r   get_backend_featuresz%TritonScheduling.get_backend_featuresA  sP     M0	}9	 P#&P(OP   ##r   c                f   t           j        j        }t          ||          \  }}|r|                    |           t
          j        reddlmm	 t          fd|D                       sDfd|D             }|                    |j         dd                    |                      d S d S d S )Nr   )r0   ForeachKernelSchedulerNodec              3  8   K   | ]}t          |          V  d S rz   )r  )r   r  r%  s     r   r   z3TritonScheduling.codegen_comment.<locals>.<genexpr>X  s?        >?
1899     r   c                X    g | ]&}t          |          |                                'S r   )r  get_name)r   r  r0   s     r   r	  z4TritonScheduling.codegen_comment.<locals>.<listcomp>]  sB       !!%677JJLL  r   z Fused node name list: rn  )rE   r&  r  r8   make_commentr   debug_fusiontorch._inductor.schedulerr0   r%  r   commentr   )r   node_scheduler  origins_detailed_origins
node_namesr0   r%  s         @@r   codegen_commentz TritonScheduling.codegen_commentL  s+   '&%8%P%P"" 	*  ))) 	       
     CP     
   *  

 $$VVtyy?T?TVV    !	 	 r   c                `   t           j        j        }||j        v r|j        |         }nt          j        j        rt          |t          j        j                  nd}t          |          d d         }d	                    d|||
                                g          }t          j        j        rt          j        j         d| }||j        |<   t          j        j        r|nd}|                    t          t           j                  |          }|                    t          t           j                  |          }|                    dd          }t'          t)          |                                          d          \  }	}
}t-                      }t.                                          rt.                              ||           |                    d	|d
           |                    |d           t           j                                        }|                    d|j         d           d| }t;          ||          \  }}|d|z   dz   |z   z  }|                    ||                                |           tA          j!        d          rtA          j"        |||           |S )Nr   r   r  r   triton_z#pragma CMT#pyzasync_compile.triton(z, '''Tr  z''', device_str='z')z# kernel path: r2  kernel_metadata)#rE   r&  r  src_to_kernelr   r   descriptive_namesr7   rF   r   next_kernel_suffixaot_inductormodel_name_for_generated_filesunique_kernel_namesreplacer   r:   r  rB  r$   r#   r  rO   async_compileuse_process_poolr   r   r  r  r8   define_kernelr   r!   is_metric_table_enabledlog_kernel_metadata)r   src_coder-  r+  r  r{  
fused_namekernel_category	subs_name	_basenamer  kernel_pathcompile_wrapperrM  metadata_commentr.  detailed_originss                    r   r@  zTritonScheduling.define_kernelf  s   '&w,,,!/9KK =2%mV]5TUUU 
 AJJ2A2NO((?J8R8R8T8TU K "A d "(!4!SccVacc /:G!(+'-}'HWiI
  ''K,H(I(I;WWH''K,C(D(DiPPH  ''s;;H(08>>;K;K1L1Ld(S(S%Iq+,..O--// : $$Y999%%&Pi&P&P&PQQQ""84"888W@@BBN%%&Q.:M&Q&Q&QRRR>>>(;M7(S(S%G%w 58H HH!!_55779I   ./@AA P+KhOOOr      tuple[float, str]c                    |                      |d          }t          j        |          }|                     ||t	          d |D                                 S )z
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        T)r  c              3  >   K   | ]}|                                 V  d S rz   r(  r   r  s     r   r   z9TritonScheduling.benchmark_fused_nodes.<locals>.<genexpr>  s*      :W:WA1::<<:W:W:W:W:W:Wr   )r0  )generate_kernel_code_from_nodesr%   r}  benchmark_codegened_moduler   )r   r  n_spills_thresholdrC  r  s        r   benchmark_fused_nodesz&TritonScheduling.benchmark_fused_nodes  sj    
 77PT7UUx((..#
:W:WQV:W:W:W0W0W / 
 
 	
r   Nr0  Optional[OrderedSet[str]]c                  	
 t          t          j        j                  }t	                      5  |                    t          j                                                  5  dfd

fd}
fd}||nt          dg          }t          	                    d|j
                    |            !j
        fcddd           cddd           S                                 	j        j        	   j        	 d                    n# t          $ rr}t           j        j        r t          	                    d||           t'          d	           |             j
        fcY d}~cddd           cddd           S d}~ww xY wj        }t+          |          d
k    sJ |d         j        |k    rt'          d	          nLt/          j        	fd          t+          j                  dk    rt/          j        	fd          z
  t          	                    d|            |             j
        fcddd           cddd           S # 1 swxY w Y   ddd           dS # 1 swxY w Y   dS )z$Benchmark an already compiled moduleNc                 p     j         J t          j                             j                   d         dz   S Nr   z.kernel_perf__file__ospathsplitextr  s   r   cache_file_pathzDTritonScheduling.benchmark_codegened_module.<locals>.cache_file_path  s3    |///w''55a8>IIr   c                 V                 } t          | t                               d S rz   r&   r   )r]  r`  mss    r   store_cachez@TritonScheduling.benchmark_codegened_module.<locals>.store_cache  s,    &((T3r77+++++r   c                                  } t           j                            |           rHt          |           5 }t	          |                                          cd d d            S # 1 swxY w Y   d S rz   )r\  r]  existsopenr  readr]  fdr`  s     r   
load_cachez?TritonScheduling.benchmark_codegened_module.<locals>.load_cache  s    &((7>>$'' 0d 0r$RWWYY//0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0ts   !A((A,/A,r  %kernel src code for %s written to: %sr   z*Exception (%s) in compiling fused nodes %sinfrG   c                 6      j           d                   S rH  
clone_argsr  callwrapped_jit_functions   r   r  z=TritonScheduling.benchmark_codegened_module.<locals>.<lambda>  "    DD!@!5!@$!G!JKK r   c                      j           S rz   ro  r  rs  s   r   r  z=TritonScheduling.benchmark_codegened_module.<locals>.<lambda>  s     ? 4 ? F r   z+The fused kernel for %s took %.3f ms to run)r   rE   r&  r  r   r<  r  r   r-  r  r[  get_argsrr  r3  rp  	Exceptionr   r   .disallow_failing_autotune_kernels_TESTING_ONLYr  	launchersr1  n_spillsr)   benchmark_gpur|  )r   r  rT  r0  device_interfacerd  rk  rK  rz  r  r`  rr  rc  rs  s    `       @@@@@r   rS  z+TritonScheduling.benchmark_codegened_module  sq    4AG4GHH  O	$ O	$##AG$G$G$I$IJJO	$ O	$ BJ J J J J, , , , , ,     )4

*i[:Q:Q  II7  
 B~3<'?O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$B <<>>D8D#&; (4)4d;A>???? 
( 
( 
(=O 		@  
 5\\3<''''''cO	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$N
( -6Iy>>Q&&&& |$'9995\\ !.KKKKKK  +=>>BBk7FFFFF  B II=  
 KMMMs|#_O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$ O	$sz   2I>%AI&I>"I&=DI&
F!AF0F1I&5I>FB:I&I>&I*	*I>-I*	.I>>JJkernel_featuresrj   kernel_args	list[Any]kernel_kwargsr  list[TritonKernel]c                   |                     d          }|o*t          d |                                D                       }| j        }|rddlm} |}|rd|d<   |                     d          r
d|d	<   d|d<   t                              |j                  s|	                    d	          rJ d|d	<   t          j                            ||||          } ||i |}|                     |||          S )
Nr  c              3  >   K   | ]}|                                 V  d S rz   )is_split_scanr   r  s     r   r   z9TritonScheduling.create_kernel_choices.<locals>.<genexpr>  s?       (
 (
%)D  (
 (
 (
 (
 (
 (
r   rG   )TritonSplitScanKernelFoverride_cooperative_reductionr  Toverride_persistent_reduction)contains_opr   scheduler_nodesr  triton_split_scanr  r  r  r  r  rE   r&  triton_kernel_kwargsadd_multi_kernel_choices)	r   r~  r  r  is_scanr  r  r  r+  s	            r   create_kernel_choicesz&TritonScheduling.create_kernel_choices  sO    "--f55 
C (
 (
-<-L-L-N-N(
 (
 (
 %
 %
 +/*: 	0@@@@@@/K 	D>CM:; &&v.. 	D=AM9:>CM:;11/2QRR 	C$(()HIIIII=BM9:	66+}
 
 k;];;,,V[-PPPr   r+  r  c           	        |g}t           j        j        s|S |j        o|                    d           }|j        o|                    d           }|r$|                     | j        |i |ddi           |r|j        j	        }t          j        j                            |d          rT|                     | j        |i |ddix}           |r,|j        r%|                     | j        |i |ddd           t          |          dk    r0|dd          D ]}	|j        |	_        |                    d            |S )	Nr  r  Fi   )r  r  rG   c                    | j         S rz   )r  )ks    r   r  z;TritonScheduling.add_multi_kernel_choices.<locals>.<lambda>]  s	    q'= r   rL  )r   r   multi_kernelr  r  r  r  r  r3  r  rE   r&  r  r  r1  must_keep_buffersr  )
r   r+  r  r  kernelsoptional_persistentoptional_cooperativer  r  kernel2s
             r   r  z)TritonScheduling.add_multi_kernel_choices+  s    (.h}) 	N$9 
-BSBS+C
 C
 ?
  &;  
MDUDU,E
 E
 A
  	NN   #  38       	_4Fw44VUCC -T-$'  8=   E   ' 5+E NN(((+ <A:?	      w<<!"122; E E,2,D))LL==L>>>r   c                H   fdfd}fd}dg }}d}t           j        j        }t          |          t           j        _        t           j        j        }t          |          t           j        _        t
          j        dk    }	t
          j        dk    }
|                     |d|	|
d          }|D ]\  }}}d |D             }d	 |D             }|	                    t          t          j                  d
          }t          j        |          t                              d|j                    |            \  %|z  }|z  }|                    j                                                   j        j          j         d                    j        }t1          |          dk    sJ |d         j        dk    rt5          d          xn1t7          j        fd          t7          j        fd          t                              dt          d |D                                   |             |z  }|z  }|                    j                   |t           j        _        |t           j        _        |||fS )Nc                 p     j         J t          j                             j                   d         dz   S rY  rZ  r_  s   r   r`  z@TritonScheduling.benchmark_combo_kernel.<locals>.cache_file_pathe  s3    <+++7##CL11!4~EEr   c                 "                } t           j                            |           rdt          |           5 }t	          d |                                                                D                       cd d d            S # 1 swxY w Y   dS )Nc              3  4   K   | ]}t          |          V  d S rz   )r  )r   rK  s     r   r   zNTritonScheduling.benchmark_combo_kernel.<locals>.load_cache.<locals>.<genexpr>m  s(       E Eaq E E E E E Er   r  )r\  r]  rf  rg  r  rh  splitri  s     r   rk  z;TritonScheduling.benchmark_combo_kernel.<locals>.load_cachei  s    "?$$Dw~~d## F$ZZ F2  E E27799??3D3D E E EEEF F F F F F F F F F F F F F F F<s   =BBBc                 |                 } t          | t                    dz   t                    z              d S )Nr  rb  )r]  r`  rc  ms_clones    r   rd  z<TritonScheduling.benchmark_combo_kernel.<locals>.store_cachep  s;    "?$$Ds2ww}s8}}<=====r   r   g        T)subkernel_nodescustom_part_algorithmenable_autotunemixed_sizesonly_gen_src_codec                6    g | ]}|                                 S r   )	get_nodesr  s     r   r	  z;TritonScheduling.benchmark_combo_kernel.<locals>.<listcomp>  s"    HHHT 0 0HHHr   c                @    g | ]}|D ]}|                                 S r   rP  )r   r  r  s      r   r	  z;TritonScheduling.benchmark_combo_kernel.<locals>.<listcomp>  s-    OOOeOOAQZZ\\OOOOr   r3  rl  rG   rm  c                 6      j           d                   S rH  ro  rq  s   r   r  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>  rt  r   c                 $     j           d         S rH  ro  rv  s   r   r  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>  s    ;0;TB1E r   zDThe fused kernel for %s took %.3f ms to run, %.3f ms to clone inputsc              3  >   K   | ]}|                                 V  d S rz   rP  rQ  s     r   r   z:TritonScheduling.benchmark_combo_kernel.<locals>.<genexpr>  s*      <<A1::<<<<<<<<r   )rE   r&  r  r   inplaced_to_remover   combo_kernels_autotunecombo_kernel_allow_mixed_sizesgenerate_combo_kernel_coder=  r   r:   rB  r%   r}  r-  r  r[  r  rw  rr  r3  rp  rz  r1  r{  r  r)   r|  )r   	node_listrk  rd  total_ms	file_listtotal_clone_msremoved_buffers_originplaced_to_remove_origr  r  kernel_code_listrC  r  
node_groupfused_node_listsnamesrz  r  r`  rr  r  rc  r  rs  s                     @@@@@@@r   benchmark_combo_kernelz'TritonScheduling.benchmark_combo_kernel`  s&   
	F 	F 	F 	F 	F	  	  	  	  	 	> 	> 	> 	> 	> 	> 	>  ) # w6",-A"B"B"#'"<%/0G%H%H" 7!;;a?::%"&+#" ; 
 
 (8 2	+ 2	+#HaHHZHHHOO/?OOOE''K,C(D(DiPPH"8,,CII7  
 &:<<LB~B(*  ...<<>>D8D#&;  D0%0$7:;;;,6Iy>>Q&&&&|$q(( %e,XX !.KKKKKK  '4EEEEE  IIV<<<<<<<	   KMMMNHh&NS\****"6%<"22r   )r  r  rw   rx   )r<  r  )rL  )rw   rM  )rL  N)r0  rV  rw   rM  )r~  rj   r  r  r  r  rw   r  )r+  r  r  r  r  r  rw   r  )r{   r   r   r  r  r   r   rJ   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTRITON_TEMPLATESTUPLE_REDUCTIONr!  rB  r   r#  r1  r@  rU  rS  r  r  r  rI  rJ  s   @r   r  r  *  sY        )K))))!z"$*4+*		
 : : : : : : $ $ $ [$  4= = =~	
 	
 	
 	
 	
 RVT$ T$ T$ T$ T$l#Q #Q #Q #QJ3 3 3 3jY3 Y3 Y3 Y3 Y3 Y3 Y3r   r  r  r0   r  c                ^   g }|                                  }|t          |t          j                  sJ |r3|j        ,|                    |                                  d           nCddlm} | 	                                }|J | j
                            |          }t          |t          |f          sJ dt          |                       t          j                            |          5  t"          j        }|                    |                                                                           }|t"          _        d d d            n# 1 swxY w Y   |                    |                                  d           |                    t-          j        |d                     |S )Nz" Unfinalized multi template bufferr   )CUDACombinedSchedulingz]Scheduling backend should be SIMD or CUDACombined when generating debug Triton strings, got: z Triton code:z    )get_template_noder  r    MultiTemplateBuffermake_kernel_renderr  r(  0torch._inductor.codegen.cuda_combined_schedulingr  r:  r  get_backendr]   r  rE   r&  set_current_devicer!   generated_kernel_countrR  r  r  rR  r  )r  linesmulti_templater  r<  backendold_generated_kernel_counttriton_codes           r   r  r    s   E++--N!Z@V%W%W!!W ;.;CKKKLLLL	
 	
 	
 	
 	
 	
 ""!!!.,,V44'N4J#KLL 	
 	
{lpqxlyly{{	
 	
L W''// 	H 	H *1)G&!AA   egg  .HG*	H 	H 	H 	H 	H 	H 	H 	H 	H 	H 	H 	H 	H 	H 	H 	666777X_[&99:::Ls   2AEEEr   )rc  r   r  r   r  r   rw   r   r  )r$  r%  rw   r%  )r$  r%  rw   rv   )r7  r8  rw   rv   )rw   rg   r  )ru   rv   rw   rN  )r  r0   rw   r  )
__future__r   r  r  r(  r^  r`  loggingrJ  r  r\  rR  collections.abcr   r   r   typingr   r   r	   r
   r   r   r   sympy.printing.precedencer   r|   torch._loggingtorch.utils._pytreer\  _pytreerL  torch._dynamo.device_interfacer   torch._dynamo.utilsr   r   torch._prims_commonr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._tritonr   r   utils._sympy.symbolr   r   r   r   utils._sympy.value_rangesr   r   r   r    r!   r>  r"   	codecacher#   r$   r%   r&   ops_handlerr'   runtimer(   runtime.benchmarkingr)   runtime.hintsr*   r+   r,   r-   runtime.runtime_utilsr.   r/   r  r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   virtualizedrB   r  rC   rD   rE   wrapper_benchmarkrF   block_analysisrH   commonrI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   simdrX   rY   rZ   r[   r\   r]   triton_utilsr^   r_   r`   ra   rb   r  rc   typesrd   re   rL  rg   rh   ri   simd_kernel_featuresrj   rk   	getLoggerr{   r-  _logginggetArtifactLoggerperf_hint_logschedule_log
fusion_logrp   r   r   r   	dataclassr   r   ry  r  rp  r  r  r  r(  r+  r1  r3  r6  r:  r<  rM  rl  rn  _initialize_pointwise_overridesrO  r  r   r  r  r   r  r  r  r  r  r  r   r   r   <module>r     s%   " " " " " "                        				  . . . . . . . .       F F F F F F F F F F F F F F F F  0 0 0 0 0 0      $ $ $ $ $ $ $ $ $ C C C C C C < < < < < < < < 0 0 0 0 0 0 / / / / / / K K K K K K K K K K M M M M M M M M X X X X X X X X X X X X 4 4 4 4 4 4 " " " " " " " " " " ( ( ( ( ( ( F F F F F F F F F F F F ( ( ( ( ( ( ' ' ' ' ' ' . . . . . .            D C C C C C C C W W W W W W W W W W W W                                 C B B B B B B B B B B B B B B B B B / / / / / /                                 "                             % $ $ $ $ $  
      LLLLLL&&&&&&888888	Bg!!00<HH~//*EE^--hAA
6 6 6 6 6 6 6 6  4   $ 4   *, , , , , , , ,: 
 
 
 
 
 
 
 
B A A A A A A A AH ? ? ? ? ?4 ? ? ?8 = = = = =, = = =@+ + + +>jQ jQ jQ jQ jQM jQ jQ jQZ 	3 3 3 3
   & & & &8 8 8 8
; ; ; ;P P P P       :( ( ( (. . . . .bS& S& S& S& S&k S& S& S&l  / / 9 9 9|8 |8 |8 |8 |8O |8 |8 |8~$+ $+ $+ $+ $+ $+ $+ $+N : : : : : : : :&! ! ! ! ! ! ! !H # # # # # # # #
 
 
 
 
%uS%S/-A'BBC 
 
 
 e e e e e e e ePU+A U+A U+A U+A U+A:/0 U+A U+A U+ApVO3 O3 O3 O3 O3~ O3 O3 O3d     r   