
    `i                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZ d dlmZmZmZmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d d	lm Z  d d
l!m"Z" d dl#m$Z$m%Z%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8m9Z9 erddl0m:Z: ddl;m<Z< ddl=m>Z> ddl?m@Z@mAZA ddl1mBZBmCZCmDZD ddlEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZO ddlPmQZQmRZRmSZS ddlTmUZU ddlVmWZWmXZXmYZYmZZZ ddl[m\Z\ ddl]m^Z^m_Z_m`Z`maZambZb erd d lcmdZdmeZemfZf d d!lmgZg  ejh        ei          Zjejk        l                    eid"          Zmejk        l                    eid#          Znejk        l                    eid$          Zo eZ            jp        Zq e"g d%          Zrd@dAd)Zsejt         G d* d+                      Zu G d, d-eu          Zv G d. d/eu          ZwdBd3Zx ed4eWeW5          Zy G d6 d7eYey         eey                   Zz G d8 d9eC          Z{ ejt        d:;           G d< d=                      Z| G d> d?e}          Z~dS )C    )annotationsN)Counter)AnyCallableGenericno_type_checkOptionalTYPE_CHECKINGUnion)TypeVar)MultiTemplateBuffer)analyze_memory_coalescing)free_unbacked_symbols)immutable_dict)
OrderedSet)FloorDivIdentityModularIndexing)free_symbol_is_type
prefix_strsymbol_is_typeSymT   )counters   )configir	scheduler)prologue_preserves_zero_mask)	code_hash)	MemoryDepStarDepWeakDep)IRNode)'set_kernel_post_grad_provenance_tracing)!indexing_dtype_strength_reduction)
green_textyellow_text)BaseSchedulerNodeBaseScheduling	WhyNoFuse)
cache_on_selfexpr_fits_within_32bitget_dtype_sizeIndentedBufferPlaceholderprefix_is_reductionsympy_index_symbolsympy_product
sympy_subsunique)ops
OpsWrapperV   )BlockPatternMatcher)CSEVariableindex_prevent_reorderingKernelPythonPrinter)MultiKernel)DisableReductionEnableReductionNodeScheduleEntryNodeScheduleMarkerSIMDKernelFeatures)IterableIteratorSequence)CoalesceVarAnalysis
perf_hintsschedulefusion)zyxr0_r1_defaultintreturnc                D    t           j        j        j        j        }||n| S N)torch	_inductorr   triton	max_tiles)rQ   rY   s     p/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_inductor/codegen/simd.pyget_max_tilesr[   Z   s!    &-7I!-997:    c                       e Zd ZdZej        j        ej        j        dd fdZee	e
dd                                    ZddZee	e
dd                                    Z xZS )IterationRangesa  
    Each range tree represents multiple sets of iteration indexing
    in a single tiled dimension in the output kernel.

    If you have two loops ranges one (4, 3, 2) and another (4, 6),
    then the range tree will be:
            4 (i0)
        3 (i1)  6 (i3)
        2 (i2)
    Where i0 is shared between both loops, but then the split into
    different indexing vars.  All loop ranges must iterate over
    the same number of elements.
    )divisorlengthnamestrvar_listlist[sympy.Symbol]
var_rangesdict[sympy.Symbol, sympy.Expr]numel
sympy.Exprprefixkernel
SIMDKernelrootIterationRangesRootrS   Nonec                   t                                                       || _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        d S rU   )super__init__ra   rc   re   rg   ri   r_   r`   rj   rl   )selfra   rc   re   rg   ri   rj   r_   r`   rl   	__class__s             rZ   rq   zIterationRanges.__init__o   s^     		 $
			r\   boolc                *    t          | j                  S rU   )r1   ri   rr   s    rZ   is_reductionzIterationRanges.is_reduction   s     #4;///r\   sympy.Symbolc                *    t          | j                  S rU   )r2   ra   rv   s    rZ   symbolzIterationRanges.symbol   s    !$),,,r\   r   c                V    d t          j                    D             }|| j                 S )Nc                    i | ]\  }}||	S  r}   ).0symtri   s      rZ   
<dictcomp>z(IterationRanges.symt.<locals>.<dictcomp>   s    NNN<4&$NNNr\   )r   itemsri   )rr   prefix_to_symts     rZ   r   zIterationRanges.symt   s.     ON:;K;M;MNNNdk**r\   )ra   rb   rc   rd   re   rf   rg   rh   ri   rb   rj   rk   rl   rm   rS   rn   rS   rt   rS   rx   )rS   r   )__name__
__module____qualname____doc__sympySOnerq   propertyr,   r   rw   rz   r   __classcell__rs   s   @rZ   r^   r^   _   s         . w{       0 0 0 0 ] ] X0- - - - + + + ] ] X+ + + + +r\   r^   c                  `     e Zd ZdZ	 d'd( fdZd)dZd*dZd+dZd,dZd-d"Z	d.d$Z
d/d&Z xZS )0rm   z
    Root of a iteration range tree that represents a single
    tiled dimension in the output kernel. It contains multiple
    sets of iteration represented with IterationRangesEntry.
    Nra   rb   rg   rh   ri   indexrR   rj   rk   	pid_cacheOptional[dict[str, str]]is_looprt   
tensor_dimOptional[int]grid_dimhas_zdimrS   rn   c          	         |i }t                                          |g i ||||            || _        i | _        || _        |r| j        r|	J || _        || _        |	| _        |
| _	        d S )N)ra   rc   re   rg   ri   rj   rl   )
rp   rq   r   nodesr   rw   r   r   r   r   )rr   ra   rg   ri   r   rj   r   r   r   r   r   rs   s              rZ   rq   zIterationRangesRoot.__init__   s     I 	 	
 	
 	
 
=?
 *3 Ft0FX5E5EF$  r\   c                (    d| j         d| j         dS )NzIterationRangesRoot(, z, ...))ra   rg   rv   s    rZ   __repr__zIterationRangesRoot.__repr__   s    GdiGGTZGGGGr\   c                f    | j                                         D ]}|                                 d S rU   )r   valuescache_clear)rr   nodes     rZ   r   zIterationRangesRoot.cache_clear   s>    J%%'' 	 	D	 	r\   rx   c                0    t          | j         d          S )Nr   )r2   ri   rv   s    rZ   	index_symzIterationRangesRoot.index_sym   s    !T["7"7"7888r\   r_   r`   IterationRangesEntryc                t   t           j        j                            ||z  | j                  r#t          |                                 |          }n#t          |                                 ||          }|| j        vrt          | j
         t          t           j        j                   ||||           }|t           j        j        |                                <   | j                            |                                           || j        |                                <   || j        |<   | j        |         S )zF
        Lookup a given RangeTreeEntry, creating it if needed
        )r8   graphsizevarsstatically_known_equalsrg   r   r   r   r   r   ri   nextrj   iter_vars_countrange_tree_nodesrz   rc   appendre   )rr   r_   r`   exprr   s        rZ   lookupzIterationRangesRoot.lookup   s    733Gf4DdjQQ 	FDNN,,g66DD"4>>#3#3WfEEDtz!!';@QX%= > >@@ D 8<AH%dkkmm4M  ///-3DODKKMM*#DJtz$r\   lengthslist[sympy.Expr]list[IterationRangesEntry]c                    t           j        j        }g }t          |          D ]0}|                    |                     ||                     ||z  }1g t          |          S rU   )r   r   r   reversedr   r   )rr   r   r_   itervarsr`   s        rZ   construct_entriesz%IterationRangesRoot.construct_entries   sh     '+w'' 	' 	'FOODKK88999&GG$(##$$r\   rd   c                @    d |                      |          D             S )Nc                6    g | ]}|                                 S r}   )rz   )r~   es     rZ   
<listcomp>z1IterationRangesRoot.construct.<locals>.<listcomp>   s     DDDq

DDDr\   )r   rr   r   s     rZ   	constructzIterationRangesRoot.construct   s$    DDD$:$:7$C$CDDDDr\   +tuple[list[sympy.Symbol], list[sympy.Expr]]c           
         ddd |j         D             } fd|D             }|                    fd	           t          j        j        g g fd
}|D ]p}t
          j        j                            |j	                  s9 | 
                    t          |j	                                       |j	         ||           qt
          j        j                             j                  s2 | 
                    t           j                                       g t                    g t                    fS )z,Figure out vars from this tree used in indexrN   r   rS   tuple[int, bool]c                    t           j        j                            | j        t
          j                  }t           j        j                            | j        t
          j                  dk    }|| fS )a:  
            Gets the key for sorting nodes. When two nodes have the
            same divisor, the node with length as 1 should be handled
            first so the current divisor is not changed after multiplied
            node.length. Returns `not length_is_one_hint` for ascending
            sort.
            fallbackr9   )r8   r   r   	size_hintr_   r   unbacked_symint_fallbackr`   )rN   divisor_hintlength_is_one_hints      rZ   get_sort_keyz8IterationRangesRoot.vars_and_sizes.<locals>.get_sort_key   sv     7+55	F$C 6  L  **Hv'F +     !&8"899r\   c                V    g | ]&}t           j        j                            |          'S r}   )r8   rj   r   getr~   ss     rZ   r   z6IterationRangesRoot.vars_and_sizes.<locals>.<listcomp>  s+    NNNa*..q11NNNr\   c                8    g | ]}||j         j         k    |S r}   )ri   )r~   nrr   s     rZ   r   z6IterationRangesRoot.vars_and_sizes.<locals>.<listcomp>  s,    CCCqQC18t{+B+B+B+B+Br\   c                     |           S rU   r}   )rN   r   s    rZ   <lambda>z4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>  s    a r\   keyc                                         |                                                                 | j                   | j        z  d S rU   )r   rz   r`   )r   r_   
index_varssizess    rZ   addz/IterationRangesRoot.vars_and_sizes.<locals>.add  sC    dkkmm,,,LL%%%+GGGr\   )rN   r   rS   r   )free_symbolssortr   r   r   r8   r   r   r   r_   r   r   rg   r   )	rr   r   r   r   r   r_   r   r   r   s	   `    @@@@rZ   vars_and_sizesz"IterationRangesRoot.vars_and_sizes   s   
	: 	: 	: 	:& ON5;MNNNCCCCECCC

0000
111'+
	, 	, 	, 	, 	, 	, 	,  	 	D7#;;DL'RR 'DKK$,)H)HIIJJJ,CIIIIw77
GLL 	ECGXdj'%B%BCCDDD&*%%&(:(5//(:::r\   rU   )ra   rb   rg   rh   ri   rb   r   rR   rj   rk   r   r   r   rt   r   r   r   r   r   rt   rS   rn   rS   rb   rS   rn   r   )r_   rh   r`   rh   rS   r   )r   r   rS   r   )r   r   rS   rd   )r   rh   rS   r   )r   r   r   r   rq   r   r   r   r   r   r   r   r   r   s   @rZ   rm   rm      s          /3(! (! (! (! (! (! (!TH H H H   9 9 9 9       .% % % %E E E E/; /; /; /; /; /; /; /;r\   rm   c                  X     e Zd Zd fdZddZddZddZddZddZddZ	ddZ
 xZS )r   ra   rb   r_   rh   r`   r   parentr^   rS   rn   c                   t                                          ||j        |z  |j        |j        |j        |||j        |j        	  	         || _         t          j
        d           | j                  | _        || _        d S )N)	ra   rg   rc   re   ri   r_   r`   rj   rl   )rp   rq   rg   rc   re   ri   rj   rl   r   	functools	lru_cache_codegencodegenr   )rr   ra   r_   r`   r   r   rs   s         rZ   rq   zIterationRangesEntry.__init__*  s     	,'_(== 	 
	
 
	
 
	
 0y*400??			r\   c                X    d| j          d| j         d| j         d| j         d| j         dS )NzIterationRangesEntry(r   ))ra   r_   r`   r   re   rv   s    rZ   r   zIterationRangesEntry.__repr__A  sA    rtyrrDLrrDKrrSWS\rr`d`orrrrr\   c                D    fd| _         d | j         _        | _        d S )Nc                      S rU   r}   ra   s   rZ   r   z/IterationRangesEntry.set_name.<locals>.<lambda>E  s    t r\   c                     d S rU   r}   r}   r\   rZ   r   z/IterationRangesEntry.set_name.<locals>.<lambda>F  s    4 r\   )r   r   ra   )rr   ra   s    `rZ   set_namezIterationRangesEntry.set_nameD  s'    #|||#/< 			r\   c                8    | j                                          d S rU   )r   r   rv   s    rZ   r   z IterationRangesEntry.cache_clearI  s      """""r\   c                N    t           j                            |            | j        S rU   )r8   rj   codegen_iteration_ranges_entryra   rv   s    rZ   r   zIterationRangesEntry._codegenL  s     	//555yr\   r   c                   g }t          | j        t          j                  r|S t          | j        t          t
          f          sJ t          | j                              | j        j        dd          D ]p}t          |t          j        t          j        f          sH|j	        }t          |          dk    r.t          d |D                       r|                    |           q|S )Nr9   r   c              3  J   K   | ]}t          |t          j                  V  d S rU   )r   r   SIZEr   s     rZ   	<genexpr>z8IterationRangesEntry.precomputed_args.<locals>.<genexpr>Y  s?       , ,56N1di00, , , , , ,r\   )
isinstancer   r   Symbolr   r   typeargsIntegerr   lenallr   )rr   precomputed_argsargsymbolss       rZ   r   z%IterationRangesEntry.precomputed_argsP  s    -/di.. 	$##$)h%@AARR4	??RRA9>!""% 	1 	1CcEM5<#@AA 1*w<<!## , ,:A, , , ) )# %++C000r\   rR   c                *    t          | j                  S rU   )hashra   rv   s    rZ   __hash__zIterationRangesEntry.__hash___  s    DIr\   otherobjectrt   c                P    t          |t                    sJ | j        |j        k    S rU   )r   r   ra   )rr   r   s     rZ   __eq__zIterationRangesEntry.__eq__b  s(    %!566666yEJ&&r\   )ra   rb   r_   rh   r`   rh   r   rh   r   r^   rS   rn   r   )ra   rb   rS   rn   r   )rS   r   rS   rR   )r   r   rS   rt   )r   r   r   rq   r   r   r   r   r   r   r   r   r   s   @rZ   r   r   )  s             .s s s s   
# # # #             ' ' ' ' ' ' ' 'r\   r   valueUnion[int, float]rb   c                    | t          d          k    rdS | t          d          k    rdS t          j        |           rdS t          |           S )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)r   s    rZ   constant_reprr  g  sQ    e~	%--			E		 ~;;r\   CSEVariableType)boundrQ   c                      e Zd ZU dZeZded<   ded<   dZded<   ded	<   	 	 	 	 dndo fdZe	e
edpd                                    ZdqdZdrdZe	dsd            Zdtd Zdud&Zdvd(Zdwd+Zdxd1Zdtd2Zdtd3Zdyd5Zdpd6Zdzd8Zd{d:Zdsd;Zd|d<Zd}d?Zd}d@Zd~dBZddEZ e!ddJ            Z"e#e$j%        j&        fddL            Z'e#e$j%        j&        fddM            Z(ddOZ)e#ddP            Z*ddQZ+ddRZ,ddSZ-d|dTZ.ddUZ/ddWZ0ddXZ1ddd[Z2e3j4        dd`            Z5ddaZ6e!db             Z7dddZ8de Z9df Z:dg Z;dh Z<di Z=dj Z>ddmZ? xZ@S )rk   zo
    Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests.
    zCallable[[sympy.Expr], str]sexprkexprFrt   allow_block_ptrrb   kernel_nameNtilingdict[str, sympy.Expr]featuresrD   r   r   override_persistent_reductionOptional[bool]override_cooperative_reductiontiling_scoresOptional[dict[str, sympy.Expr]]rS   rn   c                    |i }t                                                       | _        |                                 _        t                       _        t                       _        d |                                D              _	        g  _
        i  _        t          j                     _        |                                 _        ||n                                  _        | _        | _        ||n                                  _                                          _        d  _        t4          j        d fd            }| _                             |           d S )Nc                ^    i | ]*\  }}|t           j        j                            |          +S r}   )r8   r   r   simplify)r~   ri   vals      rZ   r   z'SIMDKernel.__init__.<locals>.<dictcomp>  s>     
 
 
7BvsFAG$--c22
 
 
r\   r   rh   c                    t           j        j                            |                                           } j        D ]}                    | |          }                     |           S rU   )r8   r   r   simplify_with_rangesre   range_treescombine_contiguous_dimscombine_modular_indexing_pairs)r   treerr   s     rZ   simplify_indexingz.SIMDKernel.__init__.<locals>.simplify_indexing  sf    G$99%ARARSSE( B B44UDAA66u===r\   )r   rh   )rp   rq   r  get_mutations	mutationsr/   bodyindexing_coder   numelsr  r   	itertoolscountr   rw   inside_reduction should_use_cooperative_reductioncooperative_reductionr  r  should_use_persistent_reductionpersistent_reductionwant_no_x_dimno_x_dimr    r   cacher"  initialize_range_tree)	rr   r  r  r   r  r  r  r"  rs   s	   `       rZ   rq   zSIMDKernel.__init__~  s    I !//11"$$	+--
 
FLllnn
 
 
 79JL(00 ( 5 5 7 7 .9 +*6688 	"
 ?L-3 -8 *)5577 	!
 **,,(, 
	> 	> 	> 	> 	> 
	> "3""9-----r\   rR   c                >    t          d | j        D                       S )Nc              3  4   K   | ]}t          |          V  d S rU   )r1   )r~   ri   s     rZ   r   z0SIMDKernel.num_reduction_dims.<locals>.<genexpr>  s+      II6&v..IIIIIIr\   )sumr'  rv   s    rZ   num_reduction_dimszSIMDKernel.num_reduction_dims  s#     IIT[IIIIIIr\   dtypetorch.dtypec                    t           rU   NotImplementedError)rr   r7  s     rZ   dtype_to_strzSIMDKernel.dtype_to_str      !!r\   c                4    | j                                         S rU   )r  select_index_dtyperv   s    rZ   get_index_dtype_as_torch_dtypez)SIMDKernel.get_index_dtype_as_torch_dtype  s    }//111r\   c                P    |                      |                                           S rU   )r<  r@  rv   s    rZ   index_dtypezSIMDKernel.index_dtype  s"      !D!D!F!FGGGr\   c                    dS NFr}   rv   s    rZ   r/  zSIMDKernel.want_no_x_dim      ur\   r*  rw   r'  r0  list[IterationRangesRoot]c                &   t          fdt          D                       }| p| }dd}g d}	t          t          |	                    }
ddg}|r|}n
|r|
}n|
|z   } |||          } ||	t                    }g }t	          |          D ]\  }}t          |          }|                    |          }|                    |          }||n|}|                    t          | d|         ||| ||o| j	         ||d	v 

  
                   |S )Nc              3  $   K   | ]
}|v |V  d S rU   r}   )r~   ri   r'  s     rZ   r   z3SIMDKernel.construct_range_trees.<locals>.<genexpr>  s6       %
 %
61A1AF1A1A1A1A%
 %
r\   rS   dict[Any, int]c                N    d t          fd| D                       D             S )Nc                    i | ]\  }}||	S r}   r}   )r~   idxr  s      rZ   r   zPSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<dictcomp>  s+       %S#S  r\   c              3  $   K   | ]
}|v |V  d S rU   r}   )r~   r  masks     rZ   r   zOSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<genexpr>  s(      2U2U3PT32U2Ur\   )	enumerate)seqrN  s    `rZ   filtered_index_mapz<SIMDKernel.construct_range_trees.<locals>.filtered_index_map  sA     )22U2U2U2U#2U2U2U)U)U   r\   )rN   rM   rL   rO   rP   r   rL   )r   r   r   r   r   )rS   rI  )
r   all_prefixeslistr   rO  r1   r   r   rm   r.  )rr   r   r*  rw   r'  r0  active_prefixesno_r_dimrQ  	grid_dimspointwise_tensor_dimsreduction_dimstensor_dimstensor_dim_mapgrid_dim_mapr  iri   r   r   r   s       `                rZ   construct_range_treesz SIMDKernel.construct_range_trees  s    % %
 %
 %
 %
!-%
 %
 %
 
 
 (';|+;	 	 	 	
 $OO	 $Xi%8%8 9 9 	A(KK 	A/KK/.@K ,+KII)))\BB"?33 	 	IAv.v66L'++F33J#''//H!)AAxE#$$$6N'(J1J-J)% F]      r\   dict[str, str]c                    |                      || j        | j                                        | j        | j                  }| j                            |           d S rU   )r]  r*  r  rw   r'  r0  r  extend)rr   r   r  s      rZ   r2  z SIMDKernel.initialize_range_tree  sZ    00!M&&((KM
 
 	,,,,,r\   indicesSequence[sympy.Expr]c                    dS )zr
        Hook called right before codegen with every index that will be
        used in the fused kernel.
        Nr}   )rr   ra  s     rZ   finalize_indexingzSIMDKernel.finalize_indexing  s      r\   ra   r   rh   r   r;   c                t    | j         }d| _         	 |                     |||          || _         S # || _         w xY wrD  )r*  store)rr   ra   r   r   priors        rZ   store_reductionzSIMDKernel.store_reduction
  sI    % %	*::dE511$)D!!ED!))))s   . 	7c                    dS rD  r}   rv   s    rZ   r+  z+SIMDKernel.should_use_cooperative_reduction  rE  r\   c                    dS rD  r}   rv   s    rZ   r-  z*SIMDKernel.should_use_persistent_reduction  rE  r\   rf   c                x    t          t          j                            d | j        D                                 S )Nc              3  H   K   | ]}|j                                         V  d S rU   )re   r   r~   r!  s     rZ   r   z(SIMDKernel.var_ranges.<locals>.<genexpr>  sA       * *,0%%''* * * * * *r\   )dictr(  chainfrom_iterabler  rv   s    rZ   re   zSIMDKernel.var_ranges  sH    O)) * *484D* * *  
 
 	
r\   c                >    t          d | j        D                       S )Nc              3  B   K   | ]}t          |j        d u          V  d S rU   )rR   r   rm  s     rZ   r   z0SIMDKernel.triton_tensor_ndim.<locals>.<genexpr>   s1      QQ3td233QQQQQQr\   )r5  r  rv   s    rZ   triton_tensor_ndimzSIMDKernel.triton_tensor_ndim  s"    QQ@PQQQQQQr\   r\  c                n    dg|                                  z  }d||<   dd                    |           dS )Nrn   :[r   ])rs  join)rr   r\  r   s      rZ   indexing_size_strzSIMDKernel.indexing_size_str"  s@    422444a&499U##&&&&r\   	list[str]c                    dg|                                  z  }| j        D ]<}|j        
|j        r| j        r$|j                                         d||j        <   =|S )N1BLOCK)rs  r  r   rw   r*  ri   upper)rr   r   r!  s      rZ   dense_size_listzSIMDKernel.dense_size_list'  sy    //111$ 	G 	GD&$ G(= G,0K,=,=,?,?)F)F)Fdo&r\   c                \    |                                  }dd                    |           dS )Nrv  r   rw  )r  rx  rr   r   s     rZ   dense_size_strzSIMDKernel.dense_size_str1  s0    $$&&&499U##&&&&r\   c                   t          |t                    s|S |j        d         }| j                            |          x}|S t          |||j        i          }t          j        j	        
                    |          }t          ||j                                        |j                            t          j        j        |j        j                                                  i          S Nr   )r   r   r   r   r   r4   r   r8   r   r   r   rl   r   r   r   r   r   rg   rz   )rr   r   rN   	tree_node	new_indexs        rZ   r   z)SIMDKernel.combine_modular_indexing_pairs5  s    %11 	LJqM.221555I>Luq).&9::	G$CCINN	((**IN,A,AGK!5- -&((
 
 	
r\   r!  rm   c                    t           j        j                            |          x}r)|\  }}t	          |                     ||          |          S |                     ||          S rU   )r8   r   r   expand_floor_divr   _combine_contiguous_dims)rr   r   r!  
expand_resr  denominators         rZ   r  z"SIMDKernel.combine_contiguous_dimsG  sd     )::5AAA: 	>%/"I{D99)TJJKXXX00===r\   c                   t          |t          j        t          j        f          r|S |                    |          \  }}t          |          dk    r|S t          j        j        	                    ||t          |g||                    \  }}}||k    r|S |                    |          }t          |t          t          | ||                                        }	|	S )zI
        More aggressive simplification to merge contiguous dims
        r9   )r   r   r   r   r   r   r8   r   r   _simplify_loopsr<   r   r4   rn  zip)
rr   r   r!  r   r   	new_sizesreindex_prunenew_index_varsr  s
             rZ   r  z#SIMDKernel._combine_contiguous_dimsP  s     eemU\:;; 	L //66
Eu::??L%&W%5%E%E7USS&
 &
"	7F L	22ud3z77>;R;R+S+S&T&TUU	r\   'contextlib.AbstractContextManager[None]c                |      j         d         j        p j        t          j         fd            } |            S )Nc               3     K    j                                         s j        rJ d V  d S r                                  d _        	 d V  r                                  d _        d S # d _        w xY w)NFT)r  rw   r*  codegen_body)rr   should_flushs   rZ   ctxz)SIMDKernel.disable_reduction.<locals>.ctxg  s      =--// 0000 $ !!###$)D!- (%%'''(,%%%%,,,,s   
A- -	A6)r  r   r,  
contextlibcontextmanager)rr   r  r  s   ` @rZ   disable_reductionzSIMDKernel.disable_reductiond  sU    '+3Qt7Q		"	- 	- 	- 	- 	- 
#	"	-$ suur\   r   rd   c                    t          |          t          | j                  k    sJ d t          || j                  D             S )Nc                >    g | ]\  }}|                     |          S r}   )r   )r~   r`   rangess      rZ   r   z)SIMDKernel.set_ranges.<locals>.<listcomp>~  s:     
 
 
 V$$
 
 
r\   )r   r  r  r   s     rZ   
set_rangeszSIMDKernel.set_ranges|  sR    7||s4#3444444
 
"%gt/?"@"@
 
 
 	
r\   groupsIterable[sympy.Expr]Sequence[Sequence[sympy.Expr]]Stuple[list[list[sympy.Expr]], list[list[Callable[[list[sympy.Expr]], sympy.Expr]]]]c                   t          d |D                       rd | D             g fS t          j        j        d | D             fd| D             t	          j                    dfd
}dd}g }d}|D ]}g }|D ]}                    |d          r|                    d            0|t                    k     rP                    |         d          r4|dz  }|t                    k     r                    |         d          4|dz   t                    k     r	                    ||                   rz
                    ||                   st          |         }	t          ||                   }
|                     ||
 |||	           ||dz   |
                               ?|t                    k     r1|                    t          j         |||                               |                    |           t          d D                       sJ d d|             |fS )Nc              3  <   K   | ]}t          |          d k    V  dS r   Nr   )r~   r`   s     rZ   r   z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>  s-      66Fs6{{a666666r\   c                    g | ]}g S r}   r}   )r~   groups     rZ   r   z6SIMDKernel._split_iteration_ranges.<locals>.<listcomp>  s    +++5B+++r\   c                    g | ]}g S r}   r}   )r~   _s     rZ   r   z6SIMDKernel._split_iteration_ranges.<locals>.<listcomp>  s    -A-A-AQb-A-A-Ar\   c                :    g | ]}                     |          S r}   )r  )r~   gsvs     rZ   r   z6SIMDKernel._split_iteration_ranges.<locals>.<listcomp>  s#    444R[[^^444r\   r\  rR   r   rh   rS   c                                         |          }                    |          |          st          t          |          |          | <   |                              |           t                    S rU   )r  statically_known_multiple_of	CantSplitr   r   r   )r\  r   
new_ranges	remainingr  	var_counts     rZ   	add_rangez5SIMDKernel._split_iteration_ranges.<locals>.add_range  sp    ;;t$$D229Q<FF  #IaL$77IaLqM  &&&	??"r\   sizeidx1idx2(Callable[[list[sympy.Expr]], sympy.Expr]c                     d fd}|S )N	flat_varsr   rS   rh   c                ,    |          z  |          z   S rU   r}   )r  r  r  r  s    rZ   getterzISIMDKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getter  s    io-	$??r\   )r  r   rS   rh   r}   )r  r  r  r  s   ``` rZ   make_combinedz9SIMDKernel._split_iteration_ranges.<locals>.make_combined  s<    @ @ @ @ @ @ @ @ Mr\   r   r9   c                $    t           j        j        S rU   )r   r   Zero)r  s    rZ   r   z4SIMDKernel._split_iteration_ranges.<locals>.<lambda>  s
    EGL r\   c              3  f   K   | ],}t           j        j                            |          d k    V  -dS )r9   Nr8   r   r   r   r   s     rZ   r   z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>  s:      II!17#--a00A5IIIIIIr\   zfailed to set ranges  )r\  rR   r   rh   rS   rR   )r  rh   r  rR   r  rR   rS   r  )r   r8   r   r   r(  r)  r   r   r   statically_known_gtr  r  r   operator
itemgetter)r  r   r  r  return_getters_groupscurrent_grouplength_groupreturn_gettersr  size1size2r  r  r  r  s              @@@@rZ   _split_iteration_rangesz"SIMDKernel._split_iteration_ranges  s    66g66666 	0++F+++R//W-A-A&-A-A-A
4444V444	O%%		# 	# 	# 	# 	# 	# 	# 	# 	#	 	 	 	 !## %	9 %	9LN$ " "--dA66 "))*@*@AAA#c)nn449S9Sm,: :4
 "Q&M $c)nn449S9Sm,: :4 !1$s9~~55":P:P)M2; ;5 ::i6  ( (%m4E$T9]+CDDE"))%!%ImU;;%Ima&7??     %s9~~55&--$/		-0N0NOO   "((8888IIyIIIII 	
 	
9I9999	
 	
I 000r\   reduction_numelc                ,   t           j        j        }t          |d                   dk    ri|                    |t
          j        j                  sD|                    t          |          t          |d                   |z            r|d         |gfS |S )z1Fill in the reduction numel of lengths if missingr9   r   )	r8   r   r   r   r   r   r   r   r3   )clsr  r   r  r   s        rZ   prepare_split_iteration_lengthsz*SIMDKernel.prepare_split_iteration_lengths  s     7#wqz??a00%'+NN  00f%%gaj))O;    AJ 122r\   c                    |                      |||          }	 |                     ||           dS # t          $ r Y dS w xY wNTF)r  r  r  )r  r  r   r  s       rZ   is_compatiblezSIMDKernel.is_compatible  s]     55fgWW	''8884 	 	 	55	s   1 
??list[list[sympy.Expr]]c                    d | j         D             }| j        s(|D ]%}t          |          rt          j        j        ||<   &g |                                }|                     ||| j                  S )a5  
        Split and set iteration ranges for the kernel based on the provided lengths.

        This method maps the kernel's tiling structure to the node's iteration space,
        handling both pointwise and reduction dimensions appropriately.

        Args:
            lengths: A sequence of sequences of symbolic expressions representing
                    the sizes of different dimensions for each node.

        Returns:
            A list of lists of symbolic expressions representing the mapped
            iteration variables for each dimension.
        c                (    i | ]}|j         |j        S r}   )ri   rg   )r~   rts     rZ   r   z3SIMDKernel.split_and_set_ranges.<locals>.<dictcomp>  s    AAA"")RXAAAr\   )	r  r*  r1   r   r   r   r   map_kernel_groups_to_node_sizesr  )rr   r   r  ri   r  s        rZ   split_and_set_rangeszSIMDKernel.split_and_set_ranges  s    $ BA0@AAA $ 	1  1 1&v.. 1%*W[F6N $6==??# 33FGT_UUUr\   c                2   t          |          t          |          k    r,t          d t          ||          D                       r || S |                     ||          \  }}g t          j                             ||           fd|D             S )a  
        We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

        To do this we need to split up the iteration space of i0 into something like:
            for i1 in s0:
              for i2 in s1:
                i0 = i1*s1 + i2
                ....

        This function matches and resplits lengths to the groups of
        this kernel to enable tiled + non-tiled fusions.
        c              3     K   | ]?\  }}t           j        j                            t	          |          |z
            d k    V  @dS r  r8   r   r   r  r3   )r~   rN   r  s      rZ   r   z=SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<genexpr>*  s_       /
 /
1 G%%mA&6&6&:;;q@/
 /
 /
 /
 /
 /
r\   c                ,    g | ]}fd |D             S )c                &    g | ]} |          S r}   r}   )r~   fnr   s     rZ   r   zISIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<listcomp>.<listcomp>2  s!    ,,,"H,,,r\   r}   )r~   fnsr   s     rZ   r   z>SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<listcomp>2  s.    NNN,,,,,,,NNNr\   )r   r   r  r  r(  ro  rp  )r  r  r   r  r  r  r   s         @rZ   r  z*SIMDKernel.map_kernel_groups_to_node_sizes  s    & w<<3v;;&&3 /
 /
GV,,/
 /
 /
 ,
 ,
& :w'',/,G,GPW,X,X)
)LY_22::z3JKKLNNNN8MNNNNr\   c                6    t          |t          j                  S rU   )r   r   TMPrr   r   s     rZ   is_indirect_indexingzSIMDKernel.is_indirect_indexing4  s    "5$(333r\   c                   |                      |          rdS dgt          | j                  z  }|j        D ]T}|| j        vr| j        |         }t          |j        t                    sJ ||j        j        xx         |j	        z  cc<   Ut          j        j        j        t          fdt          || j                                                  D                       S )NFr9   c              3  N   K   | ]\  }} |           |          k    V   d S rU   r}   )r~   	idx_range
iter_ranger  s      rZ   r   z,SIMDKernel.is_broadcasted.<locals>.<genexpr>I  sT       
 
%	: HY88J#7#77
 
 
 
 
 
r\   )r  r   r'  r   r   r   r   rm   r   r`   r8   r   r   r  anyr  r   )rr   r   index_numelsrz   entryr  s        @rZ   is_broadcastedzSIMDKernel.is_broadcasted8  s   $$U++ 	5sS---( 	= 	=FT222)&1Eel,?@@@@@+,,,<,,,, 7#, 
 
 
 
),\4;;M;M;O;O)P)P
 
 
 
 
 	
r\   c                    t          |t                    r,dd                    t          | j        |                     dS |                     |                     |                    S )a  
        Convert an index expr to a string that can be used in output code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the generated kernel.

        Index expressions often need to be passed in as arguments to the triton kernel.
        Rename_indexing and codegen_indexing keep track of the needed indices and add
        new parameters to the function signature.
        rv  r   rw  )r   rS  rx  mapindex_to_strr  rename_indexingr  s     rZ   r  zSIMDKernel.index_to_strN  sa     eT"" 	CBtyyT%6!>!>??BBBBzz$..u55666r\   c                   |                      |          }t          |t          j        j        j                  }t          |                    t          j	                            s,t          |                    t          j
                            r)|                    t          j        j        j                  }t          |                    t          j
                            r|                    t          j
                  D ]k}|j        }t          |          dk    rOt          d |D                       r6|t          j        j                            |          i}t          ||          }l|                      |          }t          |t                     s|n|j        d         }|                     |          S )Nr   c              3  b   K   | ]*}t          |t          j        t          j        f          V  +d S rU   )r   r   r   PRECOMPUTED_SIZEr   s     rZ   r   z.SIMDKernel.prepare_indexing.<locals>.<genexpr>o  sJ       , , #1ty$2G&HII, , , , , ,r\   )r"  r4   r8   r   r   precomputed_replacementsr   atomsr   floorceilingsubsr   r   lookup_precomputed_sizer   r   r   codegen_indexing)rr   r   ar   replacements
simp_indexs         rZ   prepare_indexingzSIMDKernel.prepare_indexing[  s    &&u--5!'"2"KLLu{{5;''(( 	JCEM0J0J,K,K 	JJJqw/HIIE u{{5=))** 
	<[[// 	< 	< .w<<!## , ,$, , , ) )# %&qw'7'O'OPQ'R'R#SL&ul;;E++E22
 )X>>VJJJOTUDV 	 $$Z000r\   c                *      fd j         D             S )Nc                0    g | ]}|j         rj        |S r}   )rw   r*  )r~   trr   s     rZ   r   z1SIMDKernel.active_range_trees.<locals>.<listcomp>  s;     
 
 
q~
AEAV

 
 
r\   )r  rv   s   `rZ   active_range_treeszSIMDKernel.active_range_trees  s1    
 
 
 
'
 
 
 	
r\   r   c                   t           j        j                            ||                                           }t          |j        t                    D ]}|| j        v ri }| j        |         	                                D ])}t           j        j        
                    |          ||<   *t          |          dk    r0t          | j        |         j        |          | j        |         _        | j        |                                          |S )Nr   r   )r8   r   r   r  re   sortedr   rb   r   r   r  r   r4   r   r   )rr   r   symr  pss        rZ   r  zSIMDKernel.codegen_indexing  s    w44T4??;L;LMM$+555 	5 	5Cd+++  "/4EEGG T TB'(w'7'O'OPR'S'SL$$|$$q((6@-c27$7 7D)#.3 %c*22444r\   c                     t          d          )NzNYI: codegen_nan_checkr:  rv   s    rZ   codegen_nan_checkzSIMDKernel.codegen_nan_check  s    !":;;;r\   r   Optional[IRNode]c                     t          d          )NzNYI: call_kernelr:  )rr   ra   r   s      rZ   call_kernelzSIMDKernel.call_kernel  s    !"4555r\   rN  Union[str, OpsWrapper]r  Iterator[str]c              #     K   | j         }| j        }|rt          j        ||          }t	          j        |          }|| _         || _        	 |V  || _         || _        dS # || _         || _        w xY w)z:Context manager to add an additional mask to tl.load/storeN)
_load_mask_load_otherr6   logical_andr7   _unwrap)rr   rN  r   rg  	prior_vals        rZ   
mask_loadszSIMDKernel.mask_loads  s      
 $	 	0?4//D!$'' 	)JJJ#DO(D $DO(D((((s   A A/c                    d | j                                         D             }t          ||          }i }| j        D ]>}t	          |j                  }t          ||di          t          ||di          z
  ||<   ?|S )a\  
        This gets the stride of the index for each of the tiling variables
        (technically, it does it at index 0)

        For example, if
        xindex = x0 + 512*x1 + 1024*r0
        x0 = (xindex//512)
        x1 = (xindex % 512)
        r0 = rindex // 1024

        this function would return
        {xindex: 512, rindex: 1024}
        c                $    i | ]\  }}||j         S r}   )r   )r~   kvs      rZ   r   z2SIMDKernel.get_strides_of_load.<locals>.<dictcomp>  s      U U Utq!AF U U Ur\   r9   r   )r   r   r4   r  r2   ra   )rr   r   index_to_tile_indexesindex_in_tile_varsstrides
range_treer   s          rZ   get_strides_of_loadzSIMDKernel.get_strides_of_load  s     !V Ut7L7R7R7T7T U U U'/DEE* 	 	J":?33A#$6A??*"QFC C GAJJ r\   c                |    t          |t                    rt          t          | |                    S  | |          S rU   )r   tupler  )r  r   s     rZ   _map_tuple_or_scalarzSIMDKernel._map_tuple_or_scalar  s8    eU## 	)R(((r%yyr\   r   c                    d t          j        | j        j                  D             }t	          t          d |                    S )Nc                6    g | ]}|                                 S r}   )estimate_flopsr~   r   s     rZ   r   z-SIMDKernel.estimate_flops.<locals>.<listcomp>  s4     
 
 
 !!
 
 
r\   )rC   
only_nodesr  node_scheduler5  filter)rr   flopss     rZ   r%  zSIMDKernel.estimate_flops  sJ    
 
*5dm6QRR
 
 
 6$&&'''r\   c           	     ^   g }t          t          | j        j                                                            }| j                                        \  }}}}| j                                        }t          j	        j
                            t          | j                                                            }t          |          D ]U\  }}||vr|                    d            t          j	                            |          }	t          j	        j
                            |	          }
|
|k    rt#          t$                               }d}||         D ]V}t'          |t(          t*          f          r|                    d|            |dz  }<|                    |j                   Wt          |          |z  }n|
}t          j	                            |          }t3          |          }|                    ||z  dt5          ||k               z   z             Wt7          |          S )a+  
        Try the best to estimate the total size (in bytes) of the
        kernel's inputs and outputs, which is used for estimating the memory
        throughput of this kernel. This information is used for checking how
        far we are from the peak memory bandwidth. It's important that
        we want to avoid overestimating the sizes of the inputs and outputs,
        because it can wrongfully give us a very large memory traffic value,
        which may be even larger than the theoretical bandwidth and thus
        become very misleading. This is particularly problematic for cases
        where we slice some inputs. In those cases, we should only count
        the size of the "slices" instead of the original inputs, because
        only the slices contribute to the real memory traffic.
        r   no_index_dep_r9   )r   r5   r   inplace_buffersr   python_argdefsr  buf_accessesr8   r   r   r   r3   r'  rO  r   	get_numelr   r   r   r"   r#   r   r   	get_dtyper.   rR   r5  )rr   nbytesninplace_argsr  	call_argsr/  	out_numelr\  r   	arg_numelbuf_sizera  no_index_dep_countdeprg   r7  
dtype_sizes                    rZ   estimate_kernel_num_bytesz$SIMDKernel.estimate_kernel_num_bytes  s    F49#<#C#C#E#EFFGG!Y55779a}1133 G$..}T[=O=O=Q=Q/R/RSS		** 	M 	MFAs ,&&a   ))#..Iw'11)<<H)## %S/++%&"', / /C!#'9:: /$H4F$H$HIII*a/**CI....Gy0 G%%c**E'..JMM%*,CM8I4J4J0JKLLLL6{{r\   c           	        t          | j        j                  dk    r<t          | j        j                  dk    rt          | j        j                  dk    rdS | j                                        \  }}}}d}|D ]=}t          j                            |          }|s%|	                                }	t          |	j
                  dk    rt          d |	j
        D                       dk    rtt          j        |	j                  }
||
}||
k    rt          d| dd	|
 d
| z             }t                              |           d |D             }d |D             }d |D             }d |D             }t          d| d| d| d| d| dz             }t                              |            dS ?t#          d| d          }t                              |           dS )zr
        Print message if the kernel have mixed layout inputs.
        Only care about 4D tensor for now.
        r9   r   N   c                    g | ]
}|d k    |S )r9   r}   r~   rN   s     rZ   r   z.SIMDKernel.warn_mix_layout.<locals>.<listcomp>-  s    999a!q&&&&&r\   r   zExpected stride order z, but found stride orderr  z for kernel c                    g | ]k}t           j                            |          rHt          j        t           j                            |                                          j                  nd lS rU   )r8   r   try_get_bufferr   get_stride_order
get_buffer
get_layoutstrider~   ra   s     rZ   r   z.SIMDKernel.warn_mix_layout.<locals>.<listcomp>9  sz     ) ) ) ! 711$77"+G..t44??AAH   ") ) )r\   c                    g | ]Y}t           j                            |          r6t           j                            |                                          j        nd ZS rU   )r8   r   rA  rC  rD  r  rF  s     rZ   r   z.SIMDKernel.warn_mix_layout.<locals>.<listcomp>A  sf     ! ! ! ! 711$77"**400;;==BB!! ! !r\   c                f    g | ].}|t           j        j        v rd n|t           j        j        v rdnd/S )
GraphInputIntermediateBufferN)r8   r   graph_inputsname_to_bufferrF  s     rZ   r   z.SIMDKernel.warn_mix_layout.<locals>.<listcomp>G  s[     # # # !	  17#777 %  17#999 21!# # #r\   c                    g | ]	}|j         
S r}   r   r?  s     rZ   r   z.SIMDKernel.warn_mix_layout.<locals>.<listcomp>P  s    #<#<#<qAF#<#<#<r\   z  param names z
  buf names z
  strides z	
  sizes z
  sources 
z%All the inputs for the triton kernel z have uniform layout)r   r   input_buffersoutput_buffersr-  r.  r8   r   rA  rD  r  r   rB  rE  r(   logwarningr'   )rr   r  argdefsr4  
_signaturer  uniform_stride_orderarg_namebuflayoutstride_ordermsgstride_order_list	size_listsource_listargdef_namess                   rZ   warn_mix_layoutzSIMDKernel.warn_mix_layout  sv    	'((A--DI,--22DI-..!33
 F,0I,D,D,F,F)J#! 0	 0	H'((22C ^^%%F6;1$$996;999::a??!26=AA'/+7(()\99%_1E___ElEEEEF C KK$$$) ) %.) ) )%! ! %.	! ! !I# # %.# # #K $=#<G#<#<#<L%oooYoo\mooMyMMkMMMN C KK$$$FFUKUUU
 
 	Cr\   c                d   t          j        ||d|          }d| _        t          j        | j        j        |          }t          j        ||          }d| _        t          j        ||          }t          j        ||          }t          j        ||d|          }t          j
        |||f          S )Nr5  FT)r6   	reductionr*  
index_exprr  r  truedivsubmulr7   r  )	rr   r7  r   sum_rnumelmeandxdx2m2s	            rZ   welford_reduce_fallbackz"SIMDKernel.welford_reduce_fallback\  s    }UE5%88 % =uEE{4(( $WUD!!gb"oo]5%44!4V"4555r\   c                    t          j        ||d|          }t          j        ||          }t          j        |          }t          j        ||d|          }t	          j        ||f          S )Nmaxr5  )r6   ra  rd  expr7   r  )rr   r7  r   vmaxrd  ro  vsums          rZ    prepare_softmax_twopass_fallbackz+SIMDKernel.prepare_softmax_twopass_fallbackh  s_    }UE5%88geT""gcll}UE5#66!4,///r\   c                    t           rU   r:  rv   s    rZ   codegen_kernelzSIMDKernel.codegen_kernelo  r=  r\   c                    d S rU   r}   rv   s    rZ   r  zSIMDKernel.codegen_bodyr      r\   r  r   c                    d S rU   r}   )rr   r  s     rZ   r   z)SIMDKernel.codegen_iteration_ranges_entryu  rv  r\   )NNNN)r  r  r  rD   r   r   r  r  r  r  r  r  rS   rn   r   )r7  r8  rS   rb   )rS   r8  r   r   )r   r   r*  rt   rw   rt   r'  r  r0  rt   rS   rF  )r   r^  rS   rn   )ra  rb  rS   rn   )ra   rb   r   rh   r   r;   rS   rn   )rS   rf   )r\  rR   rS   rb   )rS   rz  )r   rh   rS   rh   )r   rh   r!  rm   rS   rh   )rS   r  )r   rh   rS   rd   )r  r  r   r  rS   r  )r  r  r   r  r  rh   rS   r  )r  r  r   r  r  rh   rS   rt   )r   r  rS   r  )r  rb  r   r  rS   r  )r   rh   rS   rt   )r   rh   rS   rb   )rS   rF  )r   rh   rS   rh   r   rU   )ra   rb   r   r  rS   rn   )rN  r  r   r  rS   r  )r   rh   rS   rf   )rS   r   )r  r   )Ar   r   r   r   pexprr  __annotations__r  rq   r   r,   r   r6  r<  r@  rB  r/  r]  r2  rd  rh  r+  r-  re   rs  ry  r  r  r   r  r  r  r  staticmethodr  classmethodr   r   r   r  r  r  r  r  r  r  r   r  r  r
  r  r  r  r  r  r"  r%  r;  r_  rl  rr  rt  r  r   r   r   s   @rZ   rk   rk   t  s          */E....&&&&!O!!!! /38<9=9=0. 0. 0. 0. 0. 0. 0.d J J J ] ] XJ" " " "2 2 2 2 H H H XH   5 5 5 5n- - - -   * * * *      
 
 
 
R R R R' ' ' '
   ' ' ' '
 
 
 
$> > > >   (   0
 
 
 
 L1 L1 L1 \L1\ 
 ',gk	    [& 
 ',gk	    [ V  V  V  VD O O O [O84 4 4 4
 
 
 
,7 7 7 7$1 $1 $1 $1L
 
 
 

   "< < < <6 6 6 6 6 ) ) ) )&   0   \
( ( ( (= = =~E E EN
6 
6 
60 0 0" " "         r\   rk   c                     e Zd ZU dZeZded<   d Zd ZeZ	eZ
d ZdMd	ZedNd            ZdOdZdPdZd ZdddZddddQdZd  Z	 dRdSd(Zd) Ze ej        d*          dTd,                        ZedUd1            ZedVd4            ZedWd7            ZedXd9            ZedYd@            ZedZdA            Z ed[dD            Z!ee"j#        j$        dfd\dF            Z%ee"j#        j$        dfd]dG            Z&dH Z'd^dIZ(	 d_d`dJZ)dK Z*dL Z+dS )aSIMDSchedulingzo
    Single Instruction Multiple Data parent class used for fusion across
    multiple different backends.
    z	type[Any]kernel_typec                4    t          d |D                       S )Nc              3  x   K   | ]5}t           j        j                            t	          |                    V  6d S rU   r  r   s     rZ   r   z*SIMDScheduling.group_fn.<locals>.<genexpr>  s=      PPQQW%..}Q/?/?@@PPPPPPr\   )r!  r  s     rZ   group_fnzSIMDScheduling.group_fn  s    PP%PPPPPPr\   c                p	   t          |t          j                  st          |t          j                  r t          j                            ||          S |j        \  }\  }}|j        \  }\  t          ||          }|                                r4|                                s |                                r |d           nG|                                r3|                                s|                                r |d           |                                r3|                                r|k    o|k    }|s |d||           |S |                                s|                                s|k    r|k    s|                                s |d||           dS |	                                D ]m}|                                r nV|
                                |                                z  sB|j        \  }\  }	}
||	k    r||
k    s |d||	||
            dS n||fD ]}|                                r dS |                     |	                                ||          }|                     |	                                ||          }|                     |	                                |	                                z   ||          }t          j        j        rkd}t!          |          dk    r+t!          |          dk    r||cxk    o|k    nc }n ||k    }nt!          |          dk    r||k    }|s |d|||           dS dS |                                s|                                r|d	k    rd	k    sJ |z  k    rt#          fd
|	                                D                       s |d           dS t          j        j        rs|                                s_t'          |                     |	                                |                                                    |d	fd	ffv }|s |d           |S dS |k    r |d           |k    S |                                r|                                rJ |                     ||          S )z
        Hook called by Scheduler to determine if the Triton backend
        can fuse node1 and node2.  These nodes might already be
        FusedSchedulerNodes.
        z&Split scan cannot fuse with reductionsz1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)z5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)Fz:numel/rnumel mismatch prologue mismatch (%s, %s), (%s, %s)Tr   ztiling mismatch (%s, %s, %s)r9   c              3  v   K   | ]3}t                               f|                                          V  4d S rU   )rk   r  
get_ranges)r~   r   numel2rnumel2s     rZ   r   z*SIMDScheduling.can_fuse.<locals>.<genexpr>  sT         ,,fg->OO     r\   z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r   r   ForeachKernelSchedulerNodecan_fuser  r+   is_split_scanrw   is_template	get_nodesused_buffer_namesget_buffer_namesselect_tilingr   rX    tiling_prevents_pointwise_fusionr   r    tiling_prevents_reduction_fusionr!  r   can_fuse_horizontal)rr   node1node2r  numel1rnumel1whyreduction_can_fuser   	pro_numel
pro_rnumelr   tiling1tiling2tiling3condis_reduction_tiling_validr  r  s                    @@rZ   r  zSIMDScheduling.can_fuse  s    eYABB 	Oj97G
 G
 	O 7@@NNN${FG${FGu%%   	>)<)<)>)> 	>!!## ><===  "" 	>5+>+>+@+@ 	>!!## ><=== 
	&E$6$6$8$8 
	&!'6!1!Hg6H% G   &%!!## =	E,>,>,@,@ =	f$$G););((** )CO   !5 !& 1 1 ) )++-- "!E  $5577%:P:P:R:RR %$59Z22Iz &) 3 3:8M8MC \ & ) ' *   $)55 9N U^    ==??  44  (():):FGLLG(():):FGLLG((!!EOO$5$55vw G }= !w<<!##7||a''&'<<<<W<<<<&'1\\A%%"g-D !C6	   !54!!## 	$(:(:(<(< 	$a<<GqLLL0')))     "__..     ! C<=== 5MB5!--//5 16**5??+<+<fEELLNN1 1  !,1- 5 <:;;;44t1222V##!!##@E,>,>,@,@@@@''u555r\   c           
        g t          t          j                             t                      t                      d fd}fd}fd}fd}t          j        fd            }fd}	|D ]}
|
v r                    |
            ||
          r\ |	|
          r# |            5  	 d d d            n# 1 swxY w Y   r ||
          spt                    nd  ||
            ||
          r8 |            5                      |
           d d d            n# 1 swxY w Y   t          d d d	|
j	        d
                    S )Nc                T    | j         \  }\  }}|k    r|k    p|z  k    o|dk    S Nr9   r  r   r  
node_numelnode_rnumelrg   rg  s       rZ   fits_in_main_bodyz@SIMDScheduling.generate_node_schedule.<locals>.fits_in_main_body  sF    +,7(A(
K%'AK6,A efn,A1Ar\   c                B    | j         \  }\  }}|k    o|dk    odk    S r  r  r  s       rZ   fits_outside_reductionzESIMDScheduling.generate_node_schedule.<locals>.fits_outside_reduction  s2    +,7(A(
K&K;!+;K!Kr\   c                >    | j         j        D ]}|j        v r dS dS r  )read_writesreadsra   )r   readcurrent_loop_buffer_usages     rZ   expect_improved_memory_usagezKSIMDScheduling.generate_node_schedule.<locals>.expect_improved_memory_usage  s6    +    9 99944 :5r\   c                4                        |                                |                                d | j        j        D                        |                                 rt          | t          j                  rlt          | j	        t          j                  rMt          | j	        j        t          j                  s)                     |                                            d S                     d | j        j        D                        d S )Nc                    g | ]	}|j         
S r}   r   r?  s     rZ   r   zXSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop.<locals>.<listcomp>$  s    -R-R-Raf-R-R-Rr\   c                    g | ]	}|j         
S r}   r   r?  s     rZ   r   zXSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop.<locals>.<listcomp>0  s    1W1W1WQ!&1W1W1Wr\   )r   r   updater  r  rw   r   r   SchedulerNoder   r   ComputedBufferdataScanget_namewrites)r   r  doner(  not_ready_yet_nodess    rZ   schedule_node_in_loopzDSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop!  s    HHQKKK  ###%,,-R-Ram>Q-R-R-RSSS
   Yq)"9::Y qvr'899Y #16;88	Y $''

55555)001W1W!-BV1W1W1WXXXXXr\   c               3    K   r$d         t           u r                                 n                    t                     r;                    t                                         dz   t                      d d V                      t                                                                                         d S )Nr  r9   )rA   popr   r@   insertclear)r  maybe_split_indexr(  r  s   rZ   end_current_reduction_loopzISIMDScheduling.generate_node_schedule.<locals>.end_current_reduction_loop2  s        7r!2o!E!E!!####$$%5666  )$$%68HIII$$%6%:OLLL$(!EEE  111%%'''%++-----r\   c                    dk    rdS | j         z  sdS |r"t          |d         t          t          f          rJ t	                    S )Nr9   Fr  )	ancestorsr   rA   r@   rt   )r   r(  r  rg  s     rZ   #requires_closing_previous_reductionzRSIMDScheduling.generate_node_schedule.<locals>.requires_closing_previous_reductionB  si    {{u&7 u  b!O5E#F* *    +,,,r\   zunexpected group: (r   z) != r9   )
r   r   r)   r  r  r   r   r   r;  r  )rr   r   rg   rg  r  r  r  r  r  r  r   r  r  r  r(  r  s     ``       @@@@@rZ   generate_node_schedulez%SIMDScheduling.generate_node_schedule  s   #%)5688 0:||5?\\!+/	 	 	 	 	 		L 	L 	L 	L 	L 	L	 	 	 	 		Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y" 
	"	. 	. 	. 	. 	. 	. 	. 
#	"	.	- 	- 	- 	- 	- 	-  	 	Dt||HHTNNN  && 66t]KK 3355                 - -5Q5QRV5W5W -(9(OS=O=O%% )-%%%d++++''-- //11 / /!((.../ / / / / / / / / / / / / / / *O%OO6OO
1OO   s$   >CC	C	D;;D?	D?	r   <Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode]c                l   |                                 }t          j        j        j        j        rt          |          }nd}t          |d           j        \  }\  }}| 	                    |||          }t                              d|           |                     t          ||||                    S )zK
        Given a set of pre-fused nodes, generate a Triton kernel.
        Nc                D    t          |                                           S rU   rR   rw   rN   s    rZ   r   z-SIMDScheduling.codegen_node.<locals>.<lambda>u  s    c!..:J:J6K6K r\   r   zSchedule:
 %s)r  rV   rW   r   rX   coalesce_tiling_analysisr   rn  r  r  schedule_logdebugcodegen_node_schedulerD   )rr   r   r   coalesce_analysisr  rg   rg  r(  s           rZ   codegen_nodezSIMDScheduling.codegen_nodeh  s     04~~/?/??!(A 	% 9$ ? ? $ ,K,KLLLR?E633E5&II+];;;))}eV=NOO
 
 	
r\   rg   rh   buffersGIterable[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject, ir.IRNode]]rS   rt   c                   t          j        t           j                  j        }t	          |           sdS d |D             }|D ]S}|                                s=t          |t          j                  r#|	                                }|d |D             z  }Tt          d |D                       sdS t          j        j                            | |           |D ]'}t          j        j                            ||           (dS )NFc                    g | ]<}|                                 |                                                                =S r}   has_tensor_outputrD  storage_sizer~   rW  s     rZ   r   z9SIMDScheduling.can_use_32bit_indexing.<locals>.<listcomp>  sP     
 
 
$$&&
NN))++
 
 
r\   c                    g | ]<}|                                 |                                                                =S r}   r  r  s     rZ   r   z9SIMDScheduling.can_use_32bit_indexing.<locals>.<listcomp>  sP       ,,..NN$$1133  r\   c              3  4   K   | ]}t          |          V  d S rU   )r-   )r~   r  s     rZ   r   z8SIMDScheduling.can_use_32bit_indexing.<locals>.<genexpr>  s+      FFD)$//FFFFFFr\   T)rV   iinfoint32rn  r-   r  r   r   MutationOutputget_mutation_buffersr   r8   r   r   	check_leq)rg   r  int_max	buf_sizesrW  mutated_bufsr  s          rZ   can_use_32bit_indexingz%SIMDScheduling.can_use_32bit_indexing~  s1    +ek**.%e,, 	5
 

 
 
	  	 	C((** z#r?P/Q/Q "7799  +   	 FFIFFFFF 	5 	
""5'222 	6 	6DG&&tW5555tr\   kernel_featuresrD   c                P   |j         }|                     ||j        |j        |j                  \  }}|                     ||g||d          }|D ]}|                     ||           t          j        |           g }|D ]}t          j
        |          5  |                                }d d d            n# 1 swxY w Y   |                     |||          }	t          j        j        dk    r't!          ||	          }
|                    |	|
f           t$                              d|	           |	|_        t+          |          |_        ~t-          |          dk    rt          |          }n|\  }t          j
        |          5  |                                D ]}|                                 	 d d d            n# 1 swxY w Y   |                     |           |D ]*\  }	}
t          j        j                            |	|
           +|                    |j                   t          j        r|                                 t          j         r |                     |d         j                   t          j        xj!        |j!        z  c_!        t          j        xj"        |j"        z  c_"        t          j        j        j#        rt          j$        r|d         j%        &                                }|                                D ]}|'                                }||vr|j(        J |j(        )                                }|KtT          d         dxx         dz  cc<   t          j        j        +                    d|j,        d| d	           | -                                 d S )
N)r  r  r   z+Generating kernel code with kernel_name: %sr9   inductorintermediate_hookszrun_intermediate_hooks(r   r   ).r(  get_tiling_and_scoresrg   r  r  create_kernel_choices!codegen_node_schedule_with_kernelr?   merge_workspaces_inplacer8   set_kernel_handlerrt  define_kernelr   traceprovenance_tracking_levelr%   r   rQ  r  r  r    r   scheduler_nodesmark_runcodegen_commentr   wrapper_codewrite_provenance_debug_handler  nan_assertsr
  r_  removed_buffersinplaced_to_removesupports_intermediate_hooksgenerate_intermediate_hooksr   live_output_buffersr  r   get_origin_noder   	writelinera   free_buffers_in_scheduler)rr   r  r(  r  tiling_scorekernelsrj   debug_handlessrc_coder  debug_handlefinal_kernelr   	live_outsra   origin_nodes                   rZ   r  z$SIMDScheduling.codegen_node_schedule  sY   '5#99!+-	 
  
 ,,H(<HH
 

  	J 	JF22=&IIII,W5559; 	3 	3F%f-- 3 3!00223 3 3 3 3 3 3 3 3 3 3 3 3 3 3,,X}fMMK|5::F!    $$k<%@AAAIIC[QQQ!,F(22F w<<!&w//LL%O\!,// 	  	 '7799     	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	]+++)6 	 	%KG >>\    	  !9::: 	-**,,,! 	A(()?@@@	<#??	""l&EE"" G <	2	  
;;==I'7799 
 
}}y((y,,,"i7799*Z()=>>>!C>>>G(22O+2BOOOOO   	&&(((((s$   B77B;	>B;	,GGGlist[SIMDKernel]c                     | j         |i |gS rU   )r~  )rr   r  kernel_argskernel_kwargss       rZ   r  z$SIMDScheduling.create_kernel_choices  s+     D 
 	
r\   c           	        |5  t          j                    }i }|D ]}|t          u r(|                    |                                           3|t
          u r|                                 Q|                                 |                    |	                                          }|
                    t                              |j                            |                                                               |                    |                                           |D ]}|t          u r(|                    |                                           3|t
          u r|                                 Qt%          |j                   |                    |	                                          }|                    |           	 d d d            d S # 1 swxY w Y   d S rU   )r  	ExitStackr@   enter_contextr  rA   closedecide_inplace_updater  r  r  rn  fromkeys_bodyindexing_from_argsr   rd  keysr&   r   )rr   r(  rj   stackall_indexingr   r   s          rZ   r  z0SIMDScheduling.codegen_node_schedule_with_kernel  s    	- 	-(**EL &  +++''(@(@(B(BCCCC_,,KKMMMM..000!'!<!<T__=N=N!O!OJ '' J99*EELLNN     $$\%6%6%8%8999 & 	- 	-+++''(@(@(B(BCCCC_,,KKMMMM 6djAAA!'!<!<T__=N=N!O!OJLL,,,,	--	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	-s   GGGGFonly_gen_src_codec               
   i }|                                 }g }	|D ]}
|
                                }|	                    |
           ||z  rjt          |          dk    sJ |	|t	          t          |                    <   |j                            t	          t          |                               g }	t          |	          dk    sJ |5  |s|g|D ]}|                                  |            }|	                    d          5  |D ]<}|
                    |                    |                                                     =|j                            t                                 ddd           n# 1 swxY w Y   |j                                        D ]y\  }}d| d}|                    |                                g           x}	rBt)          d |	D                       }t+          j        d|           5  |	                    |          5  |	D ]}t          |                                          dk    rDt          |	          dk    r1t/          |          r"|xj        |                                z  c_        |
                    |                    |                                                     |j                            t                                 ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   {	 ddd           n# 1 swxY w Y   t3          |t4                    smt6          j                            |j        j                  5  |                     d	           ddd           n# 1 swxY w Y   |                     d
d           tC          j"        |          5  |j        #                                D ]}d| d}|                     |d            |	                    d          5  t3          |t4                    s|                     d           ddd           n# 1 swxY w Y   t3          |t4                    r|}n|$                                }g |||}t*          j%        rX|&                                dz  }|'                                 d| d|(                    |          )                                 }|r|cddd           S | *                    |||          |_+        t*          j,        j-        dk    rt]          ||j+                   |cddd           S # 1 swxY w Y   dS )zK
        Helper method to codegen a single template kernel variant
        r9   r   z<STORE_OUTPUT>Nz<LOAD_INPUT_>c              3  >   K   | ]}|                                 V  d S rU   )can_codegen_without_upcasts)r~   p_ns     rZ   r   z:SIMDScheduling._codegen_single_template.<locals>.<genexpr>P  s?       5 5>A77995 5 5 5 5 5r\   ztriton.codegen_upcast_to_fp32z<DEF_KERNEL>z	<ARGDEFS>F)strictg    eArN  )/r  r  r   r   r   iterprologue_fused_inputsr   r  set_subgraph_bodyr   r  r  cse
invalidater   named_input_nodesr   r   r  r   r   patchr   #prologue_fused_inputs_preserve_zeror   rb   r   r$   current_originsr   originsfinalize_hookr8   r  r  finalize_remainingbenchmark_kernelr;  imports_for_benchmark_kernelcodegen_kernel_benchmarkgetvaluer  r  r  r  r%   )rr   rj   rendertemplate_nodeepilogue_nodesprologue_nodesr  buf_name_to_prologue_grouptemplate_readsprologue_groupprologuenamesr   partial_code
input_namebuffersubgraph_namecan_codegen_without_upcastprologue_noder  r(  num_gbs                         rZ   _codegen_single_templatez'SIMDScheduling._codegen_single_template   s    &("&88::& 	$ 	$H--//E!!(+++~% $5zzQ@N*4U+<+<=,00d5kk1B1BCCC!# >""a'''' ,	@ ,	@$ $ +<^< $ $DMMOOOO!688L))*:;; 4 4* Q QDLL!<!<T__=N=N!O!OPPPP
%%jll3334 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 '-&>&D&D&F&F @ @"
F <z < < <%?%C%COO%%r& & > @ 25 5 5ES5 5 5 2 2.  7=W9W  @ @ $55mDD @ @1? " "$'(F(F(H(H$I$IQ$N$N(+N(;(;q(@(@'CM'R'R %*(.(R(R,9,J,J,L,L)*(R(R !. 5 5$*$?$?(5(@(@(B(B%& %&!" !" !" !"
 #J11*,,???!@ @ @ @ @ @ @ @ @ @ @ @ @ @ @@ @ @ @ @ @ @ @ @ @ @ @ @ @ @@,	@ ,	@ ,	@ ,	@ ,	@ ,	@ ,	@ ,	@ ,	@ ,	@ ,	@ ,	@ ,	@ ,	@ ,	@\ ,,, 	B**=+=+EFF ; ;**>:::; ; ; ; ; ; ; ; ; ; ; ; ; ; ;&&{5&AAA !&)) &	 &	 %6;;== H H
 <z < < <**=*GGGG))*:;; A A!,44 A ../?@@@A A A A A A A A A A A A A A A ,,, =' (::<<MnMmMnMM& 99;;cA::<< L LL L66v>>GGIIL L  !  ;&	 &	 &	 &	 &	 &	 &	 &	> "&!3!3HmV!T!TF|5::7!6#5   M&	 &	 &	 &	 &	 &	 &	 &	 &	 &	 &	 &	 &	 &	 &	 &	 &	 &	s   >L)A&E=1L)=F	L)F	BL)LCK8	,L8K<<L?K< LL)LL)LL))L-0L-2NNN
AU+QUQ	UQ	BUAUUUNr  hint_overrider?  r   Optional[str]c          	     H   |j         \  }\  }}|dk    sJ t          |j        t                    r|j        j        ryg }	g }
|j        j                                        D ]} ||j        |          \  }}|rH|                     |||||d          }t          |t                    sJ |
                    |           a|                     |||||d          }|	                    |           |rd	                    |
          S t          j        |	           t          |	          }g |||}|                     |           |                    |j                   t          j        xj        |j        z  c_        t          j        xj        |j        z  c_        |                                  dS |j                            |j        |          \  }}|r|                     |||||d          S |                     |||||d          }g |||}|                     |           |                    |j        |j                   t          j        xj        |j        z  c_        t          j        xj        |j        z  c_        |                                  dS )z
        Codegen a triton template with multi-kernel dispatch support

        If `only_gen_src_code=True` the src code will be returned instead of being
        codegenned into the wrapper
        r9   )r?  Tr  Fz

N)r  r   r   r   _make_kernel_rendersr   r=  rb   r   rx  r?   r  r  r  r  r8   r   r  r  r  make_kernel_render)rr   r.  r/  r0  r  r?  r  _numelrg  r  	src_codesrC  rj   r-  r  multi_kernelr(  s                    rZ   codegen_templatezSIMDScheduling.codegen_template  s%     ,1FF{{{{ })+>??N	"7N	 GI&3&8&M&T&T&V&V + +"!3!3!&m" " " % +#<<%&&*.  =    H &h44444$$X....!::%&&*/ ;  F NN6****  .{{9---0999&w//LMnMmMnMM  ///$$\%=>>>G##|'CC##G&&,*II&&**,,,4*/BB"- C  NFF ! 44!""&* 5    66!""&+ 7   !R. Q- Q. Q$$]333""6#5}7IJJJ''6+AA''**f.GG**..000tr\   c                    t           j        j                            t           j        j                                                   d S rU   )r8   r   r  r  
device_opssynchronizerv   s    rZ   codegen_synczSIMDScheduling.codegen_sync  s2    	&&qw'9'E'E'G'GHHHHHr\   subkernel_nodeslist[BaseSchedulerNode]custom_part_algorithmenable_autotunemixed_sizesr  list[tuple[str, Any, Any]]c           	        ddl m} d |D             }i i }	}t          ||          D ]\  }
}t          |d           j        \  }\  }}|                     |||          }|                     |||          }||||f|	|
<   |                    |t          |||          |           ||
<   |	                    || |||	          }t                              dt          |          d	 |D                        g }|D ]G}d
 |D             } |||          }t          ||          D ]\  }
}|                     |	|
         d         |                    ||
                              ||
         }|	|
         d         }|sXt          j        |          5  t#          j        |          D ]}|                                 	 d d d            n# 1 swxY w Y   t          j        xj        |j        z  c_        t          j        xj        |j        z  c_        |                                }|                    |||f           I|S )Nr9   )ComboKernelc                6    g | ]}|                                 S r}   r  r&  s     rZ   r   z=SIMDScheduling.generate_combo_kernel_code.<locals>.<listcomp>
  s"    IIIDNN,,IIIr\   c                D    t          |                                           S rU   r  r  s    rZ   r   z;SIMDScheduling.generate_combo_kernel_code.<locals>.<lambda>      #ann>N>N:O:O r\   r   )r  optimize_mask)r   triton_schedulingcustom_algorithm
kernel_mapnode_info_mapz1ComboKernels: %d nodes partitioned into %s groupsc                ,    g | ]}t          |          S r}   r  )r~   ps     rZ   r   z=SIMDScheduling.generate_combo_kernel_code.<locals>.<listcomp>!  s    (((SVV(((r\   c                6    g | ]}|                                 S r}   rU  r&  s     rZ   r   z=SIMDScheduling.generate_combo_kernel_code.<locals>.<listcomp>%  s"    HHHT 0 0HHHr\   )rO  rP  r   )triton_combo_kernelrS  r  rn  r  r  r  create_triton_kernelrD   horizontal_partitionrQ  r  r   r  create_sub_kernelr8   r  rC   r'  r  r   r  r  rt  r   )rr   rL  rN  rO  rP  r  rS  fused_node_listssubkernel_mapnode_schedule_mappnr   r  rg   rg  r(  r  
partitionskernel_code_list
node_grouprj   	subkernelr   r  s                           rZ   generate_combo_kernel_codez)SIMDScheduling.generate_combo_kernel_code   s    	544444IIIII+-r(_.>?? 		 		IB!$U0O0O!P!P!P!VAv 77ufMMM''ufEEF$165&$Hb! + @ @+M5&II"-o !A ! !M" !55!"2$+ 6 
 

 			?  ((Z(((	
 	
 	

 $ 	D 	DJHHZHHH [ /'  F
 !-=>> K K	E66%b)!,,,]2->??   *"-	 1" 5a 8( ,-i88 , ,$6$A-$P$P , ,D MMOOOO,, , , , , , , , , , , , , , , ''9+DD''**i.JJ***,,..H##Xvz$BCCCCs   ,GGGc                (   |                                 }|j        }|j        }t          j        dk    pt          j        dk    o|}|                     ||||          }|D ]\  }}}	|                     ||g|          }
t          j        j        dk    rt          |j
        |
           |                     |g           t                              d|
           |                    t          j        j        |
           |                                  d S )Nr9   r   z"ComboKernels: generated kernel %s.)get_subkernel_nodesuse_custom_partition_algorO  r   combo_kernel_allow_mixed_sizesrl  r  r  r  r%   snodesr  rQ  r  r  r8   r   r  r  )rr   combo_kernel_noderL  rN  rO  rP  ri  r  rj   r  r  s              rZ   codegen_combo_kernelz#SIMDScheduling.codegen_combo_kernel=  s0   +??AA 1 K+;;a? 
1Q6P;P 	  ::2O[
 
 $4 		B 		BHfa,,X8I7JFSSK|5::7%,k     "3!4555II:KHHHqw3[AAAA&&(((((r\       list[CandidateTiling]c                b   	 dk    }d 	fd}|                                 \  }	t          |          dk    rt          	          dk    st          |	z             rg S |                                 \  }	 |||r|n	|                    |                    } fd|D             }|S )	Nr9   is_pointwisert   rS   ru  c                X   t          |j                  t          |          k    sJ d|j        d|            |j        |j        g}t	          d t
          j                            |          D                       sJ d t
          j                            |          D             }t          d |j        D                       }dd
}t          
                     ||          g|           dd          g}|D ](}t          j        j                            |j        |j                  }	t          |	          t          |          k    sJ 	 |	                    d          dz   }
|
t          |          k    rt	          d |	|
d         D                       rn# t           $ r Y w xY w ||d|
                    |||
d                   f}t          j        j                            t%          d t'          ||	          D                                 }|j        |v r|dz  }t                              |d                   r|dz  }t                              |d                   r|dz  }t          j        j                            |t%          t          j        |                    z
            dk    rb|                    t          
                     ||d|
                    |||
d                   g          ||j                             *|S )zX
            Compute tiling candidates by dividing up the iteration ranges.
            zrw.range_vars=z ranges=c              3  N   K   | ] }t          |t          t          f          V  !d S rU   )r   r!   r"   r~   r9  s     rZ   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>f  sE         3G 455     r\   c                l    g | ]1}|j         t          j        j        vt	          |t
                    /|2S r}   )ra   r8   r   r  r   r!   rz  s     rZ   r   zISIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<listcomp>j  sF       817#:::sI.. ; :::r\   c                    g | ]	}|j         
S r}   r   rz  s     rZ   r   zISIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<listcomp>p  s    %D%D%D3ch%D%D%Dr\   r  rb  rS   rh   c                d    t           j        j                            t	          |                     S rU   r  )r  s    rZ   collapse_rangeszNSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.collapse_rangesr  s#    w'00v1F1FGGGr\   noner   )r  ra   scorer9   c              3  "   K   | ]
}|d k    V  dS r  r}   r   s     rZ   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>  s&      ;;a16;;;;;;r\   Nc              3  ,   K   | ]\  }}|d k    |V  dS r  r}   )r~   r  rE  s      rZ   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>  s4       " "!-vST" "r\   r   r  r  ra   )r  rb  rS   rh   )r   
range_varsr  r  r   r(  ro  rp  r   CandidateTilingcreate_partial_tilingr8   r   r   stride_hintsr   
ValueErrorr   r3   r  ra   is_good_sizer   )rw  r  rwdep_sourcesdepswrite_namesr~  tilingsr9  r  splittiled_groupsr  r  r  reduction_rangess                rZ   tile_rangesz5SIMDScheduling.candidate_tilings.<locals>.tile_ranges[  s    r}%%V4446S6S6S&6S6S444 8RY/K  $?88EE        $?88EE  D %%D%D")%D%D%DEEKH H H H
  44(001<     G  4 4'*77	2=QQ7||s6{{2222
#MM!,,q0EF++ ;;7566?;;;;; ! !!
 "   H $OF6E6N33#OF566N33  (22! " "14VW1E1E" " "   
 8{**QJE"//Q@@ QJE"//Q@@ QJE G$..iofFV.W.W X XX   
 NN'#&#<#<$3OF6E6N$C$C$3OF566N$C$C!" !0$ $ #(!$
 
 
   Ns   +F$!F$$
F10F1c           	     |    g | ]8}t                              |j                  |j        |j                   9S )r  )r  complete_partial_tilingr  r  ra   )r~   r  r  rg   r  s     rZ   r   z4SIMDScheduling.candidate_tilings.<locals>.<listcomp>  sb     	
 	
 	
  22M5/  l[  	
 	
 	
r\   )rw  rt   rS   ru  )r  r   r   "pointwise_or_reduction_read_writes)
r  r   rg   r  rw  r  pointwise_rangespartial_tilingsfull_tilingsr  s
   ` ``     @rZ   candidate_tilingsz SIMDScheduling.candidate_tilingsV  s)    '!+\	 \	 \	 \	 \	 \	 \	 \	| .2__->->** !!Q&&$%%**$%58H%HII + I .2__->->**%+ ,B2B33LAA
 
	
 	
 	
 	
 	
 	
 *	
 	
 	
 r\   	pw_tilingrb  reduction_tilingimmutable_dict[str, sympy.Expr]c                    g dt          |           d         }ddgdt          |                   }t          g t          ||          t          ||                    S )zK
        Create a tiling dict from pointwise and reduction splits.
        )rL   rM   rN   NrO   rP   )r   r   r  )r  r  r  pw_prefixesreduction_prefixess        rZ   create_tilingzSIMDScheduling.create_tiling  ss     &oos9~~o&7&78#U^,Cc2B.C.C,CDVc+y))VC0BDT,U,UV
 
 	
r\   r  rw  c                >    |                      |r|ng |s|ng           S rU   )r  )r  r  rw  s      rZ   r  z$SIMDScheduling.create_partial_tiling  s4       "*FF&.FFB
 
 	
r\   r  r  c                    t          |                                          }d|v }||z  }|t          |          z  g}|r||fn||f} | j        | S )zb
        Given a tiling for only pointwise or reduction dimensions, adds the missing one.
        rN   )rS  r   r3   r  )	r  r  rg   r  splitsrw  total_numelmissing_tilingtiling_argss	            rZ   r  z&SIMDScheduling.complete_partial_tiling  sq     fmmoo&&f}o-%f(=(==> )5RV^$$>6:R 	 !s +..r\   %list[immutable_dict[str, sympy.Expr]]c           
     4   |dk    }t          t          t          t          j        f                              }t          j        |          D ]}t          |t          j	                  s|
                                }|st          |d                   dk    rN||rdnd         }|g}	d |j                                        D             }
|
D ]}g |j                                        }t          j        j        }t$          j        j        }t+          |          D ]%\  }\  }}||z  }|                    ||          r n&|                    ||          s|dz   }|r
|d|         n	||d         }g }|D ]\  }}t1          j        |j        |          }t7          d|                    t:                    |                    t<                    z   t          |                    }t1          j        ||||          }||d         n|g}|                     |           d |D             }t          |          dk    r|	!                    |           |	D ]}t7          dt          |          tE          d          z
            }|dz   }tG          |d|                   }|ftI          ||d                   z   }|%                    | &                    | '                    ||          ||                     tQ          |t          d          }|S )	z
        Creates N-dimensional tiling candidates, attempting to simplify loads/stores
        by tiling the kernel into higher dimensions.

        Returns a list of tilings ranked by dimensionality.
        r9   r   c                l    g | ]1}t          |t                    rt          |j                  d k    /|2S )r   )r   r!   r   r  rz  s     rZ   r   z1SIMDScheduling.get_nd_tilings.<locals>.<listcomp>$  sI       c9-- 36cj//A2E2E 2E2E2Er\   Nr   c                z    g | ]8}t           j        j                            |t          j        j                  6|9S r}   )r8   r   r   r   r   r   r   )r~   dims     rZ   r   z1SIMDScheduling.get_nd_tilings.<locals>.<listcomp>[  sG          7+CCCUU      r\   T)r   reverse))r   r   rb   r   ExprrA   r)  r   r   r  r  r   r  reads_and_writesr  r   r   r   r8   r   r   rO  statically_known_geqr   r:   get_subexpr_involving_symbolr   rn  r)  r   r   match_mod_div_block_exprr`  r   r[   r3   r!  r   r  r  r  )r  r(  pointwise_numelr  rw  r  r   node_rangesranges_to_tilenode_tilingsmemory_depsr9  all_var_rangespointwise_vars_numelr   pointwise_end_idxvarrg   reduction_start_idxre   index_tilingr   num_dimsmatch_resultdimsnode_tilingnum_leading_dimsfirst_trailing_dimcollapsed_leading_dimcollapsed_splitsranked_tilingss                                  rZ   get_nd_tilingszSIMDScheduling.get_nd_tilings  s    '!+^CO<=??#*=99 _	 _	DdI$;<<  //++K CA$7$71$<$< )l)ABN*+L
 +<<>>  K
 # 96 96 "73:#3#3#5#5!6',w{$7+7@7P7P  3%|U(E1(44,o     77(/    '8!&;# $>N#7$7#788'(;(<(<=   "", . .JC/L	3 E
  #H--O0L0LLN++   H $7#OsE8$ $L /;.F<??UGD ''----   +      |$$q(( ''555  ,  #&q#k*:*:]1=M=M*M#N#N %5%9"(5kBUCUBU6V(W(W%$9#;e 2 3 34? ? $  //112BLQQ''    "  
 
 
 r\   r(  list[NodeScheduleEntry]r  r  rH   =tuple[dict[str, sympy.Expr], Optional[dict[str, sympy.Expr]]]c                r   j         sdnj         j        j        j        j        j        j        j        fdD             fdD             t          j        t                    k    fd           t          j        t                    k    fd           i g }	 	 	 ddf	d}|	                     |d           |d          f           r.|	                     |fdd           |d          f           j
                                        z  }|D ]/}|	                     ||fd           |d          f           0t          d          dk    rJdk    rDt          j        |d          D ].}	|	                     ||	d           |d          f           /g }
|D ]z\  \  }}\  }}t          |                     ||          t#          |          t#          |          z             }|                     ||          }|
	                    ||f           {|                     gg          }ddfd}t%          |
|          D ]\  }}|                     |j                  rtt+          |j                  dk    rdndz
  }|t          d          k    r6t,                              d|t          j        j        j        j                   |j        |fc S |j        |k    r|j        |fc S |dfS ) zr
        Generates a tiling, and a score of each tile according to each tile's coalesced memory accesses.
        Nc                     g | ]
}|         S r}   r}   r~   r  r  s     rZ   r   z:SIMDScheduling.compute_tiling_strategy.<locals>.<listcomp>  s    6661VAY666r\   c                     g | ]
}|         S r}   r}   r  s     rZ   r   z:SIMDScheduling.compute_tiling_strategy.<locals>.<listcomp>  s    666AfQi666r\   c                      d d  S Nr   r}   )r(  r  	pw_rangess   rZ   r   z8SIMDScheduling.compute_tiling_strategy.<locals>.<lambda>  s    yFFOFF}FF r\   c                      d d  S r  r}   )r(  
red_rangesr  s   rZ   r   z8SIMDScheduling.compute_tiling_strategy.<locals>.<lambda>  s    zGG_GGGG r\   r}   Fvars_to_usetuple[sympy.Expr, ...]use_split_varrt   rw  rS   tuple[list[int], list[int]]c                  	 |rn}|rn}|s|r|gg fS g g fS t          |           ||f}                    |d          x}r|S |rn}g }g }	d}
d}t          ||          D ]\  }}|| vr!|
|z  }
j                            |d          }+|r|k    rj        }|J |j        }t          ||j                  }|                    |
|z             |	                    |j                   |                    |           |	                    j                            |d                     d}
d}|
|z  }
|                    |
           |	                    j                            |d                     d}
 |
dk    s|r=t          |          dk    r*|                    |
           |	                    |           t          t          |                    D ]\}t          j        j                            ||         d          }t          |d          }t!          |	|         |z  dz            |	|<   ]||	f|<   ||	fS )z]
            Generate a tiling, and a tiling score, given vars to use as splits.
            Nr9   r   rt  r      )r  r   r  coalesced_by_varsuggested_splittiling_factorr   r   r  r   ranger8   r   r   r   minrR   )r  r  rw  r  target_numelr   outsplitting_varsr  split_scoresprodprev_var_coalesced_scorer  v_range
var_tilingtile	remainderr\  r   all_iter_varsall_red_varsr  r  r  r  r  scored_sub_split
tiling_vars                      rZ   process_node_varszASIMDScheduling.compute_tiling_strategy.<locals>.process_node_vars  s    #/>YYJF.:O??L $ $)NB//8O$$m\BC&**3555s 
.:L]]NFLD'($ ".&99  
7K''GOD/@/Q/U/U10 0,   Q*__!2!BJ%111%3D (*2J K KIMM$"2333 ''
(8999MM$''' ''(9(J(N(NqRS(T(TUUUD/0,d#####$5$F$J$J1a$P$PQQQqyy\yc&kkQ.>.>d#####$<=== 3v;;'' ? ?G$..vay2.FF1II"%l1o&9A&=">">Q%+\$:S!L))r\   T)rw  )r  rw  r   rQ   r9   r   )r  gffffff?gGz?c                    d}| d         j                                         D ]'}t                              |          s|z  }"|z  }(| d         j         |z  S )Ng      ?r   )r  r   r  r  r  )r  score_factor	tile_size"bad_size_additional_tiling_penaltygood_size_tiling_penaltys      rZ   	score_modz9SIMDScheduling.compute_tiling_strategy.<locals>.score_mod+	  so    LqT[//11 K K	&33I>> K#/2T#TLL#/2J#JLLaDJ;--r\   r   r   zmFound optimal tiling with %s tiles but torch._inductor.config.triton.max_tiles set to %s. Consider increasing)r}   FF)r  r  r  rt   rw  rt   rS   r  )r  r  norm_read_writesr   reduce_varsre   rV   _checkr3   r   r  r  r[   r(  combinationsr  r  r5  r  tiling_is_compatibler  r   perf_hint_loginforW   r   rX   rY   )r  r(  r  r  r  score_splitr  overlapping_iter_varsr  r  r  pw_splitpw_score	red_split	red_score	candidater  default_tilingr  cand
tiling_lenr  r  r  r  r  r  r  r  r  s    ````                @@@@@@@@@rZ   compute_tiling_strategyz&SIMDScheduling.compute_tiling_strategy~  s    %47DD"26 	 *:E(9E"3>6666666	6666666
)$$7FFFFFF	
 	
 	
 	*%%8GGGGGG	
 	
 	
 DF  	
 35"'!&K	* K	* K	* K	* K	* K	* K	* K	* K	* K	* K	* K	* K	* K	* K	*\ 	!!t444!!u555	
 	
 	
  	%%#T   &%5999	   ->CCEEE 	 ' 	 	A%%qd>>>%%5999    ###q((_-A-A(56KQOO  ""))+DIII))u===    RT<G 	6 	68 Xx"89i'!!(I66(mmc)nn4  I ,,XyAALNNI|45555**O+<>OPP .3*#( 	. 	. 	. 	. 	. 	. #)i"@"@"@ 	1 	1D,''  1 !--o6J6JPQR
a 8 8 888!&&9".5?	   {L0000 {n,,{L0000 - t##r\   c                l    t          t                    sJ t          fd|D                       S )Nc              3     K   | ]_}t          |t          j                  t                                                              |                                           V  `dS ))r  N)r   r   r  rk   r  r   r  )r~   r   r  r  s     rZ   r   z6SIMDScheduling.tiling_is_compatible.<locals>.<genexpr>W	  sz       
 
 $	 788
$$!2!2O %  
 
 
 
 
 
r\   )r   rn  r   )r  r(  rg   r  r  s      ``rZ   r  z#SIMDScheduling.tiling_is_compatibleN	  s[     &$''''' 
 
 
 
 
 &	
 
 
 
 
 	
r\   r  list[dict[str, sympy.Expr]]c                H    |D ]}|                      ||||          r|c S d S rU   )r  )r  r(  rg   r  r  r  s         rZ   get_first_compatible_tilingz*SIMDScheduling.get_first_compatible_tiling_	  sD     % 	 	F''uovVV  tr\   Optional[CoalesceVarAnalysis]c                >    |                      ||||          d         S r  )r  )r  r(  rg   r  r  s        rZ   r  zSIMDScheduling.select_tilingm	  s.     ((5/3D
 

 	r\   c                ^   |dk    }|                      |g|g          }t          j        j        j        j        r+|r)t          j        j        s|                     ||||          S |st          j        j        rt          d          dk    rt          j        t          j        k    r~t          j        |          D ]i}t          j        j        sVt!          |                     |||                    dk    r.t                              t'          j        d                      nj|dfS t+                      }t-          j                    }	t          j        |          D ]\}|                     |||          D ]B}
|
j        |v r|
j        |                    |
j                   |	|
xx         |
j        z  cc<   C]d |	                                D             }t          d          dk    rH|rFdd}t9          dt!          |                    D ]$} ||d         ||                   }||g|z   } n%t!          |          dk    rt                              d|           t          j        j        r|                     |||          |z   }|                     ||||          x}r|dfS |dfS )z
        Heuristics to decide how to tile kernels.
        Currently, we tile based on stride-1 dimensions.

        Returns:
            `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

        r9   r   r  r   z
                                Reduction over non-contiguous dims.
                                Consider setting config.triton.tile_reductions to True.
                                Nc                "    g | ]\  }}|j         S r}   )r  )r~   candidate_tilingr  s      rZ   r   z8SIMDScheduling.get_tiling_and_scores.<locals>.<listcomp>	  s0     7
 7
 7
' % #7
 7
 7
r\   r   tiling0r  r  rS   r  c                L   | d         |                      dd          }}|d         |                     dd          }}t          ||g          s+t          j        j                            ||z
            dk    rd S t          j        j                            ||z
            dk     r||f||fc\  }}\  }}t          j        j                            ||z
            dk    sJ t          j        j                            ||          sd S |t          ||          || d         d}|S )NrN   rM   r9   r   rO   )rL   rM   rN   rO   )r   r   r8   r   r   r   r  r   )r  r  a0a1b0b1
new_tilings          rZ   convert_tiling_to_3dzBSIMDScheduling.get_tiling_and_scores.<locals>.convert_tiling_to_3d	  s/    !w{{3':':B w{{3':':B *2r(33 w'11"r'::a??47#--b2g66::*,bB8&HRhr2w'11"r'::Q>>>>w'DDRLL  4 !"b))"5>	 
 "!r\   zpossibly bad tiling: %s)r  r  r  r  rS   r  )r  rV   rW   r   rX   r  prefer_nd_tilingr  tile_reductionsr[   r  levelloggingWARNINGrA   r)  r   r  r  textwrapdedentr   collectionsr   ra   r   r  most_commonr  r  r	  )r  r(  rg   r  r  rw  r  r   
seen_namescandidate_tilesr  r  r  r\  new_3d_tilingr  s                   rZ   r  z$SIMDScheduling.get_tiling_and_scoresy	  s`   " '!+ **E7_4EFF O")B	!	 M2	
 ..uo7H    	(V]%B 	(}H
 H
 H
H H "go55+2=AA  D"M9 5 5dE? S STTWXXX%**$O!$    !4''&0ll
4?4G4I4I#*=99 	L 	LD$'$9$9$$W$W L L #(J66%*6NN#3#8999 01115E5KK1111L7
 7
+:+F+F+H+H7
 7
 7

 ###q((\(" " " "8 1c.1122   4 4"1%~a'8! ! !,&3_~%ENE - ~""8.III =) 	""=%II ! 
 445/>
 
 
6 	  4<t##r\   c                    d S rU   r}   rv   s    rZ   flushzSIMDScheduling.flush	  rv  r\   c                    dS rD  r}   rv   s    rZ   ready_to_flushzSIMDScheduling.ready_to_flush	  rE  r\   c                H   t          d |D                       st          |d           j        \  }\  }}|                     |||          }|                     |||          }|                     |t          |||                    }	|                     ||	           t          j	        d|          5  t          j        |	          5  |	                                }
d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   nf|d                             |          \  }}}t          j	        d|          5  |                     |||d|          }
d d d            n# 1 swxY w Y   |
                    t!          t"          j                  d	          }
|
S )
Nc              3  >   K   | ]}|                                 V  d S rU   )r  )r~   r   s     rZ   r   zASIMDScheduling.generate_kernel_code_from_nodes.<locals>.<genexpr>
  s*      22q1==??222222r\   c                D    t          |                                           S rU   r  r  s    rZ   r   z@SIMDScheduling.generate_kernel_code_from_nodes.<locals>.<lambda>
  rW  r\   r   )r  r)  r   Tr>  triton_)r  rn  r  r  r  r~  rD   r  r   r#  r8   r  rt  get_prologue_template_epiloguerG  replacerb   r0   KERNEL_NAME)rr   r   r)  r?  r  rg   rg  r(  r  rj   r  r4  templateepilogues                 rZ   generate_kernel_code_from_nodesz.SIMDScheduling.generate_kernel_code_from_nodes 
  sz    22E22222 	!$U0O0O!P!P!P!VAv 77ufMMM''ufEEF%%+M5&II &  F 22=&III/1ABB3 3$V,,3 3 "0022	3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ,18+R+R, ,(Hh 02BCC  00&*"/ 1                 ##C(?$@$@)LLsH   6DC, D,C0	0D3C0	4DD
DE**E.1E.c                    d S rU   r}   )rr   r(  s     rZ   r  zSIMDScheduling.codegen_comment!
  rv  r\   c                    t           rU   r:  )rr   r  r(  rj   s       rZ   r  zSIMDScheduling.define_kernel$
  r=  r\   )r   r  )rg   rh   r  r  rS   rt   )r  rD   )r  rD   rS   r  )r?  r   rS   r@  )F)rL  rM  rN  rt   rO  rt   rP  rt   r  rt   rS   rQ  )rS   ru  )r  rb  r  rb  rS   r  )r  rb  rw  rt   rS   r  )r  r  rg   rh   r  rh   rS   r  )rS   r  )
r(  r  r  rh   r  rh   r  rH   rS   r  )r(  r  rg   rh   r  rh   r  r  )r(  r  rg   rh   r  rh   r  r  )r  r
  rS   r  )r  r
  rS   r  r   )FN)r?  r   ),r   r   r   r   rk   r~  ry  r  r  can_fuse_verticalr  r  r  rz  r  r  r  r  r=  rG  rK  rl  rs  r{  r   r   r  r  r  r  r  r  r  r	  r   r   r   r  r  r$  r&  r0  r  r  r}   r\   rZ   r}  r}  y  sm         
 (K''''Q Q Q6 6 6B !"^ ^ ^@
 
 
 
, $ $ $ \$LM) M) M) M)^
 
 
 
 -  -  -T  x x x x x@  '+a a a a a aFI I I #(;  ;  ;  ;  ; z) ) )2 Y} } }  [}~ 

 

 

 [

 
 
 
 [
 / / / [/( w w w [wr M$ M$ M$ [M$^ 
 
 
 [
     [ 
 ;?	 	 	 	 [	 
 ;?~$ ~$ ~$ ~$ [~$@      MQ    B  " " " " "r\   r}  T)frozenc                  H    e Zd ZU ded<   ded<   dZded<   ed             ZdS )	r  r  r  rR   r  Nr@  ra   c                h    t           j        j                            |           } | dk    o| dz  dk    S )z@Somewhat arbitrary heuristic used to boost scores for some sizesrt  r   r  )r   s    rZ   r  zCandidateTiling.is_good_size.
  s3     G&&q))Bw(AFaK(r\   )r   r   r   ry  ra   rz  r  r}   r\   rZ   r  r  (
  sU         !!!!JJJD) ) \) ) )r\   r  c                      e Zd ZdS )r  N)r   r   r   r}   r\   rZ   r  r  5
  s        Dr\   r  )r   )rQ   rR   rS   rR   )r   r  rS   rb   )
__future__r   r  r  dataclassesr   r(  r  r  r  r  r   typingr   r   r   r   r	   r
   r   typing_extensionsr   r   rV   torch._loggingtorch._inductor.irr   torch._inductor.tiling_utilsr   %torch.fx.experimental.symbolic_shapesr   torch.fx.immutable_collectionsr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   r   _dynamo.utilsr    r   r   r   analyze_preserves_zero_maskr   	codecacher    dependenciesr!   r"   r#   r$   r  r%   optimize_indexingr&   runtime.runtime_utilsr'   r(   r)   r*   r+   utilsr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   virtualizedr6   r7   r8   block_analysisr:   commonr;   r<   r=   r>   rF  r?   simd_kernel_featuresr@   rA   rB   rC   rD   collections.abcrE   rF   rG   rH   	getLoggerr   rQ  _logginggetArtifactLoggerr  r  
fusion_logdoprintrx  rR  r[   	dataclassr^   rm   r   r  r	  rk   r}  r  	Exceptionr  r}   r\   rZ   <module>rX     s5   " " " " " "                               X X X X X X X X X X X X X X X X X X % % % % % %       2 2 2 2 2 2 B B B B B B G G G G G G 9 9 9 9 9 9 / / / / / / L L L L L L L L L L            & % % % % % $ $ $ $ $ $ $ $ $ $ F F F F F F ! ! ! ! ! ! 6 6 6 6 6 6 6 6 6 6   ; ; ; ; ; ; A A A A A A ; ; ; ; ; ; ; ; D D D D D D D D D D                        - , , , , , , , , , / / / / / / P P P P P P P P P P P P % % % % % %               A<<<<<<<<<<@@@@@@ g!!00<HH~//*EE^--hAA
 	z77788; ; ; ; ;
 5+ 5+ 5+ 5+ 5+ 5+ 5+ 5+pN; N; N; N; N;/ N; N; N;b;' ;' ;' ;' ;'? ;' ;' ;'|    '+;TTTB B B B B('/*B B B BJl" l" l" l" l"^ l" l" l"^- d###	) 	) 	) 	) 	) 	) 	) $#	)	 	 	 	 		 	 	 	 	 	r\   