
    `ipk                      U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dlmZmZmZmZmZmZmZ d dlmZmZ erd dlmZmZ d dlm Z  d dl!Z!d dl"Z"d dl#Z"d dl$m%c m&Z' d dl(m)Z)m*Z* d d	l+m,Z,m-Z- d d
l.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9m:Z: d dl;m<Z< ddl=m>Z>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZE ddlFmGZGmHZHmIZI ddlJmKZKmLZL ddlAmMZMmNZNmOZOmPZP ddlQmRZRmSZS ddlTmUZU ddlBmVZVmWZWmXZXmYZYmZZZ ddl[m\Z\ ddl]m^Z^m_Z_ ddl`maZambZb ddlcmdZd ddl%meZemfZfmgZgmhZhmiZimjZjmkZkmlZlmmZmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZu ddlvmwZw  ejx        ey          Zze"j{        |                    eyd          Z}e"j{        |                    eyd           Z~e"j{        |                    eyd!          Zed"         Zd#ed$<    ed%          Z ed&          Zej         G d' d(                      Zej         G d) d*e                      Z G d+ d"          Zej        dhd.            Zdid1Zdjd3Zdkd5Z G d6 d7          Zdld:Z G d; d<          ZdmdCZ G dD dEe          Z G dF dGe          Z G dH dIe          ZdndLZdodQZ G dR dSe          Z G dT dUe          Z G dV dWe          Z	 dpdqd`Zej         G da db                      Z ej                    ZdrddZ G de dN          Z G df dg          ZdS )s    )annotationsN)Counterdefaultdict)AnyCallableGenericOptionalTYPE_CHECKINGTypeVarUnion)	ParamSpec	TypeAlias)IteratorSequence)
ModuleType)countersdynamo_timed)LambdaFuturePyCodeCache)TritonTemplateCallerBase)get_metric_tableis_metric_table_enabled)free_symbols
OrderedSet)free_symbol_is_typesymbol_is_typeSymT)
has_triton   )commsconfigconfig_commsdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime/estimate_nccl_collective_runtime_nccl_estimator)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fx)get_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)
green_textred_text)SimplifyIndexing)&_unstable_customized_partition_wrappercache_on_selfcmpdevice_need_guardget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitmaybe_log_cudagraph_partitionsympy_product)Vfusionloop_orderingcompute_dependenciesBaseSchedulerNoder   PartitionType_T_Pc                      e Zd ZU ded<   ded<   ded<    ej        e          Zded	<    ej        e          Z	d
ed<   ddZ
ddZddZddZd dZd!dZd"dZd#dZd#dZd$dZdS )%SchedulerBuffer	Scheduler	schedulerz	ir.BuffernodeOptional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr:   
mpi_bufferreturnstrc                @    | j         }|J |                                S N)r^   get_name)selfops     m/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_inductor/scheduler.pydefining_op_namez SchedulerBuffer.defining_op_namei   s!    ~~~{{}}    intc                4    t          | j        j                  S rf   )hashr\   namerh   s    rj   __hash__zSchedulerBuffer.__hash__n   s    DIN###rl   c                   t                      }|                                 }|                    | dt          | j                  j                    |                    | d| j        j                    |                                 r9|                    | dt          |                                                       | 	                                r9|                    | dt          | 	                                                      t          | j                  dk    r |                    | d| j                    n}|                    | d           |                    d          5  | j        D ]}|                    | d           	 d d d            n# 1 swxY w Y   |                    d	           |                                S )
N: z
.layout = z.aliases = z.mutations = r    z	.users = z
.users = [,])rG   rg   	writelinetyper\   __name__layoutget_aliasespformatget_mutationslenra   indentgetrawvalue)rh   resultrp   users       rj   	debug_strzSchedulerBuffer.debug_strq   s   !!}}D>>DOO$<>>???D>>DI,<>>??? 	PNN9I9I9K9K1L1LNNOOO 	TRR74;M;M;O;O3P3PRRSSStz??a;;tz;;<<<<000111q!! 1 1 J 1 1D$$ZZZ000011 1 1 1 1 1 1 1 1 1 1 1 1 1 1 S!!!!!###s   7#F''F+.F+c                4    | j                                         S rf   r\   rg   rq   s    rj   rg   zSchedulerBuffer.get_name       y!!###rl   Nonec                ^   | j         J | j                                         sd S | j                                         sJ| j                                         s1t	          | j                                         t          j                  r+t          j	        j
                            | j                    d S t          t          j        d          r|                                 t          j        j        v rt          j        j        |                                          }|| j        j        v r| j        j        |         j         }n| j        j        |         j         }t          j	        j
                            || j                    d S t          j	        j
                            | j                    d S )Nargs)r\   should_allocateget_inputs_that_alias_outputget_mutation_names
isinstanceget_output_specr%   CommBufferLayoutrP   graphwrapper_codecodegen_allocationhasattrkernelrg   inplace_update_buffersr[   name_to_donated_buffername_to_bufcodegen_inplace_reuse)rh   input_buffer_nameinput_buffers      rj   allocatezSchedulerBuffer.allocate   sy   y$$$y((** 	F I2244	y++--	 $)3355r7JKK	
 G 33DI>>>F AHf%%	?18#BBB !" ? P DN$III#~D%    $~9:KLQG 66	    
 G 33DI>>>>>rl   boolc                    | j         J t          | j         j        t          j                  st          | j                   rdS | j        D ]}t          |j         t                    r dS  dS NFT)r\   r   rz   r%   r8   rK   ra   
OutputNode)rh   uses     rj   can_freezSchedulerBuffer.can_free   s}    y$$$di&66 	:SI;
 ;
 	 5: 	 	C#(J// uutrl   c                @   i }|D ]r}t          |j                  |v rC|                    |t          |j                                     |t          |j                  <   [||t          |j                  <   st          |                                          | _        d S rf   )idr\   mergelistvaluesra   )rh   ra   r   r   s       rj   	set_userszSchedulerBuffer.set_users   s    &( 	+ 	+C#(||v%%'*yy381E'F'Fr#(||$$'*r#(||$$&--//**


rl   Sequence[str]c                F    | j         J | j                                         S rf   )r\   r   rq   s    rj   r{   zSchedulerBuffer.get_aliases   s$    y$$$y55777rl   c                F    | j         J | j                                         S rf   )r\   r   rq   s    rj   r}   zSchedulerBuffer.get_mutations   $    y$$$y++---rl   Optional[torch.device]c                X    | j                                                                         S rf   )r\   r   
get_devicerq   s    rj   r   zSchedulerBuffer.get_device   s"    y((**55777rl   Nrc   rd   rc   rm   rc   r   rc   r   )ra   r`   rc   r   rc   r   rc   r   )ry   
__module____qualname____annotations__dataclassesfieldr   ra   r:   rb   rk   rr   r   rg   r   r   r   r{   r}   r    rl   rj   rY   rY   _   sQ        OOO,,,,-K-dCCCECCCC.?k.?3/ / /J       
$ $ $ $$ $ $ $($ $ $ $? ? ? ?B
 
 
 
+ + + +8 8 8 8. . . .8 8 8 8 8 8rl   rY   c                      e Zd ZU dZded<   dS )SchedulerDonatedBufferNr]   r^   )ry   r   r   r^   r   r   rl   rj   r   r      s#         /3K333333rl   r   c                     e Zd ZU ded<   ded<   ded<   ded<   ded	<   d
ed<   dZded<   dndZdodZdpdZdpdZdpdZ	dqdZ
dpdZdrdZdsd#Zdtd&Zdud)Zdvd*Zdwd,Zdxd0Zdrd1Zdyd2Zdyd3Zdrd4Zdrd5Zdzd8Zdpd9Zdpd:Zedyd;            Zedyd<            Zedvd=            Zedvd>            Zd{d@Z d|dBZ!d}dEZ"d~dGZ#dvdHZ$dvdIZ%dvdJZ&dvdKZ'dvdLZ(dvdMZ)dvdNZ*ddQZ+dvdRZ,drdSZ-	 dddXZ.eddY            Z/eddZ            Z0edd[            Z1dd^Z2dd`Z3eddb            Z4dddZ5edde            Z6ddgZ7ddiZ8e9ddm            Z:dS )rT   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]groupdependencies.ReadWritesread_writesOrderedSet[Dep]unmet_dependenciesrm   	min_order	max_orderr;   mpi_nodeNOptional[float]override_estimated_runtimer[   rZ   rc   r   c                $    || _         d | _        d S )Nc                     g S rf   r   )r   kwargss     rj   <lambda>z,BaseSchedulerNode.__init__.<locals>.<lambda>   s    B rl   )r[   debug_device_str)rh   r[   s     rj   __init__zBaseSchedulerNode.__init__   s    $-&& 	rl   r\   ir.Operationc                
    | _         t                       _        t          t                                _        d _         fd|                                D              _        d  j        D              _        i  _	        d S )NFc                >    g | ]}t          j        |           S ))r[   r\   r^   )rY   r[   ).0outputrh   s     rj   
<listcomp>z5BaseSchedulerNode._init_from_node.<locals>.<listcomp>   sE     /
 /
 /
  .   /
 /
 /
rl   c                8    i | ]}|                                 |S r   rg   r   bufs     rj   
<dictcomp>z5BaseSchedulerNode._init_from_node.<locals>.<dictcomp>   s/     <
 <
 <
$'CLLNNC<
 <
 <
rl   )
r\   r   	ancestorsrd   
last_usagewrittenget_outputsoutputsoutputs_by_namemutation_renamesrh   r\   s   ` rj   _init_from_nodez!BaseSchedulerNode._init_from_node   s    ,0	*4,,$
   /
 /
 /
 /
 **,,/
 /
 /
<
 <
+/<<
 <
 <
 13rl   rd   c                Z    t          |           j         d|                                 dS )Nz(name=)rx   ry   rg   rq   s    rj   __repr__zBaseSchedulerNode.__repr__   s*    t**%AAT]]__AAAArl   c                   |                                  }t                      }|                    | dt          |           j         dt          t          | dd                    j         d| dt          | j        j                   d| dt          | j	                   d| d	t          | j        j
        | j	        z
             d| d
           |                                5  |                                 D ])}|                    |                                           *	 ddd           n# 1 swxY w Y   |                    d           	 |                    |                                            n,# t           $ r t"                              dd           Y nw xY w|                                                                S )#Longer form printout for trace logsrt   (r\   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        rv   Ignoring error in debug_str()Texc_info)rg   rG   splicerx   ry   getattrr|   r   writesr   readsr   r   r   rw   debug_str_extra	Exceptionlogwarningr   rstrip)rh   rp   r   outs       rj   r   zBaseSchedulerNode.debug_str  sG   }}

 	d	 #GD&$$?$?@@I  )011    %T%<==  	  #4#3#9D<S#STT	 
   	
 	
 	
 ZZ\\ 	, 	,'')) , ,

3==??++++,	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	c	HJJt++--.... 	H 	H 	HKK7$KGGGGG	H   '')))s$   %?D11D58D5'E: :&F#"F#c                    dS )N r   rq   s    rj   r   z!BaseSchedulerNode.debug_str_extra      rrl   	list[str]c                ,    |                      |           S rf   )r   rq   s    rj   _debug_str_for_devicez'BaseSchedulerNode._debug_str_for_device  s    $$T***rl   c                   t          | j        dd           }d}t          |t          j        j        j                  r/d|                    |                                gdd          z   }net          |t          j        j        j	                  rAd|                    |
                                |                                gdd          z   }|  | S )Ndatar   z, F)shorten	multiline)r   r\   r   torch	_inductorr%   	Pointwise
str_helperget_size	Reductionget_reduction_sizeget_reduction_type)rh   
maybe_datadata_strs      rj   debug_str_shortz!BaseSchedulerNode.debug_str_short   s    TY55
j%/"4">?? 		j33$$&&'% 4   HH 
EO$6$@AA 	j33..00*2O2O2Q2QR 4   H
 """"rl   c                ^    t                               d| | j        | j        j                   d S )Nz(%s: unmet_dependencies = %s, writes = %s)r   infor   r   r   rq   s    rj   log_detailszBaseSchedulerNode.log_details/  s7    6##		
 	
 	
 	
 	
rl   self_depr.   	other_depr   c                    dS NFr   )rh   r  r  s      rj   reorder_loops_by_dep_pairz+BaseSchedulerNode.reorder_loops_by_dep_pair7  	     url   renamesdict[str, str]c                    fdd | j                                         D             D             | _        |                     | j                             | j                             d S )Nc                *    i | ]}|v ||         S r   r   )r   rp   r  s     rj   r   z:BaseSchedulerNode.update_mutated_names.<locals>.<dictcomp>=  s0     !
 !
 !
w '$-rl   c              3  $   K   | ]}|j         V  d S rf   rp   r   deps     rj   	<genexpr>z9BaseSchedulerNode.update_mutated_names.<locals>.<genexpr>?  s$      QQcQQQQQQrl   )r   reads_and_writesr   set_read_writesrenamerh   r  s    `rj   update_mutated_namesz&BaseSchedulerNode.update_mutated_names<  s{    !
 !
 !
 !
QQT-=-N-N-P-PQQQ!
 !
 !

 	T-44T5JKKLLLLLrl   r   r-   c                `    |                      | j                            |                     d S rf   )r#  r   	with_readrh   r   s     rj   add_fake_depzBaseSchedulerNode.add_fake_depD  s-    T-77<<=====rl   c                X    t          d |                                 D                       S )Nc              3  f   K   | ],}|                                 p|                                V  -d S rf   )r{   r}   r   s     rj   r!  z=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>H  sN       
 
9<COO4!2!2!4!4
 
 
 
 
 
rl   )anyr   rq   s    rj   has_aliasing_or_mutationz*BaseSchedulerNode.has_aliasing_or_mutationG  s<     
 
@D@P@P@R@R
 
 
 
 
 	
rl   rwc                ^    || _         | j         j        | _        |                                  d S rf   )r   r   r   
prune_deps)rh   r/  s     rj   r#  z!BaseSchedulerNode.set_read_writesL  s.    "&"2"8rl   future_used_buffersOrderedSet[str]mutation_real_namec                z    |                                  }t          fd|D                       }||z
  | _        d S )Nc              3  D   K   | ]}                     ||          V  d S rf   )get)r   kr4  s     rj   r!  z3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>U  s4      !U!U1"4"8"8A">">!U!U!U!U!U!Url   )used_or_aliased_buffer_namesr   r   )rh   r2  r4  used_bufferss     ` rj   set_last_usagez BaseSchedulerNode.set_last_usageQ  sI     88::!!U!U!U!U!U!U!UUU&)<<rl   c                B    | j         D ]}|                                 d S rf   )r   r   )rh   r   s     rj   mark_runzBaseSchedulerNode.mark_runX  s,    < 	 	CLLNNNN	 	rl   c                    t          d t          j        | j        j        | j        j                  D                       S )Nc              3  $   K   | ]}|j         V  d S rf   r  r  s     rj   r!  z6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>]  s8       
 
 H
 
 
 
 
 
rl   )r   	itertoolschainr   r   r   rq   s    rj   used_buffer_namesz#BaseSchedulerNode.used_buffer_names\  sH     
 
 t'7'=t?O?VWW
 
 
 
 
 	
rl   c                    t                      d t          j        | j        j        | j        j                  D             }t          |          dk    r|                                }                    |           t          j
        j                            |          rH|                    fdt          j
        j        |                                         D                        t          |          dk    S )Nc                    g | ]	}|j         
S r   r  r  s     rj   r   zBBaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>e  s*     
 
 
 H
 
 
rl   r   c              3  $   K   | ]
}|v|V  d S rf   r   )r   alias
used_namess     rj   r!  zABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>m  s>         J..	  /... rl   )r   r@  rA  r   r   r   r~   popaddrP   r   name_to_bufferr7  extendr   )rh   depsr   rG  s      @rj   r9  z.BaseSchedulerNode.used_or_aliased_buffer_namesb  s    &0ll

 
 t'7'=t?O?VWW
 
 
 $ii!mm((**CNN3w%))#..     !"!7"2244	     	 $ii!mm rl   c                R     t           fd j        D                        _        d S )Nc              3  B   K   | ]}|j         j        j        v|V  d S rf   )rp   r[   available_buffer_namesr   r   rh   s     rj   r!  z/BaseSchedulerNode.prune_deps.<locals>.<genexpr>w  sA       -
 -
xt~DDD DDDD-
 -
rl   r   r   rq   s   `rj   r1  zBaseSchedulerNode.prune_depsv  sD    ", -
 -
 -
 -
.-
 -
 -
 #
 #
rl   c                     d fdt          fd j        j        D                       }                      j                            |                     d S )Nr   r-   rc   r   c                    t          | t                    sdS j        j        | j                                                 }|t          j        j        v S r  )	r   r0   r[   r   rp   rk   rP   r   removed_operations)r   op_namerh   s     rj   should_prunez7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune  sG    c7++ un0:KKMMGag888rl   c              3  2   K   | ]} |          |V  d S rf   r   r   r   rV  s     rj   r!  z4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>  sF       
 
\\#5F5F

 
 
 
 
 
rl   r   r-   rc   r   )r   r   r   r#  remove_reads)rh   	to_removerV  s   ` @rj   prune_weak_depsz!BaseSchedulerNode.prune_weak_deps}  s    	9 	9 	9 	9 	9 	9  
 
 
 
+1
 
 
 
 
	 	T-::9EEFFFFFrl   name_to_fused_nodedict[str, BaseSchedulerNode]c                <    t          | || j        j                   d S rf   )_prune_redundant_depsr[   r   )rh   r]  s     rj   prune_redundant_depsz&BaseSchedulerNode.prune_redundant_deps  s"     	d$68RSSSSSrl   c                F    | j         J | j                                         S rf   )r\   get_operation_namerq   s    rj   rg   zBaseSchedulerNode.get_name  r   rl   c                *    |                                  S rf   r   rq   s    rj   get_first_namez BaseSchedulerNode.get_first_name  s    }}rl   c                X    t          d |                                 D                       S )Nc              3  >   K   | ]}|                                 V  d S rf   r   r   r\   s     rj   r!  z8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>  s*      GGd$--//GGGGGGrl   )r   	get_nodesrq   s    rj   get_operation_namesz%BaseSchedulerNode.get_operation_names  s)    GGdnn6F6FGGGGGGrl   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rf   r   r   r   s     rj   r!  z5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>  s*      AAS#,,..AAAAAArl   )r   r   rq   s    rj   get_buffer_namesz"BaseSchedulerNode.get_buffer_names  s!    AADLAAAAAArl   c                X    t          d |                                 D                       S )Nc              3  b   K   | ]*}t          |t                    ot          |d           V  +dS )T)disallow_fp32_opsNr   SchedulerNoder'   r   ns     rj   r!  zABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>  sX       
 
  q-(( G+AFFF
 
 
 
 
 
rl   allri  rq   s    rj   can_codegen_in_low_precisionz.BaseSchedulerNode.can_codegen_in_low_precision  s<     
 
 ^^%%
 
 
 
 
 	
rl   c                X    t          d |                                 D                       S )Nc              3  ^   K   | ](}t          |t                    ot          |          V  )d S rf   rr  rt  s     rj   r!  z@BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>  sN       
 
 q-((K-H-K-K
 
 
 
 
 
rl   rv  rq   s    rj   r'   z-BaseSchedulerNode.can_codegen_without_upcasts  s:     
 
^^%%
 
 
 
 
 	
rl   Sequence[BaseSchedulerNode]c                    | gS rf   r   rq   s    rj   ri  zBaseSchedulerNode.get_nodes  s	    vrl   Sequence[SchedulerBuffer]c                    | j         S rf   )r   rq   s    rj   r   zBaseSchedulerNode.get_outputs  s
    |rl   buf_namerY   c                    | j         |         S rf   )r   )rh   r  s     rj   
get_outputzBaseSchedulerNode.get_output  s    #H--rl   r   c                F    | j         J | j                                         S rf   )r\   r   rq   s    rj   r   zBaseSchedulerNode.get_device  s$    y$$$y##%%%rl   c                H    |                                  }|d uo
|j        dk    S Ncpu)r   rx   rh   devices     rj   is_cpuzBaseSchedulerNode.is_cpu  s(    ""T!:fkU&::rl   c                Z    |                                  }|d uot          |j                  S rf   )r   rJ   rx   r  s     rj   rJ   zBaseSchedulerNode.is_gpu  s+    ""T!9fV[&9&99rl   c                    dS r  r   rq   s    rj   is_reductionzBaseSchedulerNode.is_reduction      url   c                    dS r  r   rq   s    rj   is_split_scanzBaseSchedulerNode.is_split_scan  r  rl   c                    dS r  r   rq   s    rj   is_templatezBaseSchedulerNode.is_template  r  rl   c                    dS r  r   rq   s    rj   	is_externzBaseSchedulerNode.is_extern  r  rl   c                    dS r  r   rq   s    rj   
is_foreachzBaseSchedulerNode.is_foreach  r  rl   read_depdependencies.Depc                    dS r  r   rh   r  s     rj   can_inplacezBaseSchedulerNode.can_inplace  r  rl   c                    dS r  r   rq   s    rj   has_side_effectsz"BaseSchedulerNode.has_side_effects  r  rl   c                    ddl m} t           t                    rt          j        rt          j                             	                                t          j                  rht          t          j        t          j        j        j        j                  rt%          t          j        dd          t'          t          j        d          sdS  j        t          j        j        z   j        j        z  d fd
}                                 D ]Z}|j        }|J |                                rM|                                s9|                                s%|                                t          j        j        v rp j        j         D ]}|j!         j        j"        v r j        j"        |j!                 }n$ j        j#        $                    |j!                  }|rt          j        j%        &                    |           rat          |j'        tP                    sF|j)        J fd|j)        D             }tU          |          dk    r|d         j+        r|d         j         u r|j        t          |j        ,                                tZ          j.        tZ          j/        tZ          j0        f          s|j'        r[t          |j'        j        tZ          j1        tZ          j2        f          r+tU          |j                                                  dk    sE ||j        |j                  r. ||          r"t          j        j3        4                    |                                |                                           t          t          j        t          j        j        j        j                  rlt          j        j5        6                    |                                           t          j        j5        6                    |                                           |                                t          j        j7        |                                <    nސ\dS )z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        r    )can_match_buffer_size	mutationsNr   buf_to_be_inplacedrY   rc   r   c                   | j                                       }|                                 t                      }| j        D ]}|j        }t          |t                    s|                                | j         j	        vs| j                             |          |ur\|fd|j
                                        D             z  }t          |          dk    r dS dS )Nc              3  2   K   | ]}|j         k    |V  d S rf   r  )r   or  s     rj   r!  z^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>
  s<        v)) )))) rl   r    FT)r[   get_fused_noderg   r   ra   r\   r   rT   re  r]  r   r"  r~   )r  
fused_noderL  r   	user_noder  rh   s        @rj   single_index_in_fused_nodezKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node  s   
 ,5DDTJJJ)2244H %/LLD*0 ! ! I	!)->??  ,,..-7JK K)3BB9MM%& &     &2CCEE   
 t99q== 55 ! 4rl   c                J    g | ]}|j                                         v| S r   r   )r   xinconsequential_nodess     rj   r   z;BaseSchedulerNode.decide_inplace_update.<locals>.<listcomp>,  s;     & & &6??,,4III IIIrl   r   )r  rY   rc   r   )8codegen.wrapperr  r   rs  r"   inplace_buffersrP   r   has_featurer   r(   INPLACE_BUFFERSr   r  r  codegensimd
SIMDKernelr   r   r   rT  r[   completed_operationsr   r\   r   r   r   rg   removed_buffersr   r   rp   r   r   r7  r   	can_reuser^   NopKernelSchedulerNodera   r~   r  r   r%   r8   r7   MutationLayoutSHOULDREMOVEFallbackKernelr6   r   make_inplacer  rI  r   )	rh   r  r  r   buf_noderead	input_bufremaining_usesr  s	   `       @rj   decide_inplace_updatez'BaseSchedulerNode.decide_inplace_update  s3   
 	;::::: t]++	&	 ##DOO$5$5~7UVV	
 qx)@)E)PQQ	 18[$77C &)) D
 F Ng()n12 	 	  	  	  	  	  	D ##%% C	 C	CxH''',,..88:: ..00 <<>>QW%<<<(. 8 89 EEE $ Edi PII $ : > >ty I II 1,66y$GG1 'y'<>TUU1
 %?666& & & &!*& & &N N++q00*1-9 1*1-2d::%N6 *%N::<< " " 4 " =! ! 7 &1 7 !+ ) 5 :!#!2BN C! ! 7 !$IN$O$O$Q$Q R RUV V V11).#(KK !W 76yAA !W 2293E3E3G3GXXX%Heo&=&B&M  C H.2293E3E3G3GHHHH.223<<>>BBB &..00 7G GC	 C	rl   TbufferrG   	only_oncec                n   t           j        sd S |r	| j        rd S | j        J | j                                        }g }|D ]B}|j        dk    r|                    d           |                    d           d|j         d|j         }d|j        v r|d|j        d          z   }|                    |           d|j        v r|j        d          }|	                    d	d
          d         }|                    d|
                    dd          
                    dd          
                    dd          
                    dd          z              |                    d           |                    d           Dt          |          dk    rd S |                    |           d| _        d S )Nr   r   z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|r    )maxsplit{z{{}z}}r   \z\\z#pragma CMT END ORIGINr   T)r"   comment_originr   r\   get_originsri   appendtargetmetarsplitreplacer~   
writelines)	rh   r  r  origins	out_linesr  op_info_strr  stack_trace_last_lines	            rj   codegen_originating_infoz*BaseSchedulerNode.codegen_originating_infoY  s    $ 	F 	 	Fy$$$)''))	 	% 	%AtxR   2333:::::K16!!),Iqvh7G,I,II[)))&&!"!68(3(:(:3(:(K(KB(O%  "+33C>>WS$''WT4((Wf 	     !9:::  $$$y>>QF 	)$$$rl   c                0    |                      dd          S )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implrq   s    rj   get_read_write_buffers_sizesz.BaseSchedulerNode.get_read_write_buffers_sizes  s$    55t 6 
 
 	
rl   c                0    |                      dd          S )NTFr  r  rq   s    rj   get_read_buffer_sizesz'BaseSchedulerNode.get_read_buffer_sizes  s$    55u 6 
 
 	
rl   c                0    |                      dd          S )NFTr  r  rq   s    rj   get_write_buffer_sizesz(BaseSchedulerNode.get_write_buffer_sizes  s$    55 6 
 
 	
rl   r  r  c                r    t          |                     ||                                          d          S )Nr  r   )start)sumget_read_write_buffer_accessesr   )rh   r  r  s      rj   r  z3BaseSchedulerNode.get_read_write_buffers_sizes_impl  sD     //+N 0  fhh	
 
 
 	
rl   dict[str, int]c                    t           t                    ri S t           t                    rt           j        t                    ri S t           t                    rCt           j        t
          j                  r$ j        j        t          j	        j
        j        u ri S ddt           t                    rY t                                           d                   t                                           d                   z            nt          d          t!          j        t$                    }|r/ j        j        D ]"}||j                                     |           #|r/ j        j        D ]"}||j                                     |           #|r#t1          d	  j        j        D                       nt1                      }|r#t1          d
  j        j        D                       nt1                      }d fdt           t2                    r&t1           fd|D                       }||z
  }||z
  }i }||z  D ]}	t5          fd||	         D                       |	t6          j        j        v rt6          j        j        |	         }
n,|	t6          j        j        v rt6          j        j        |	         }
nzd fd |
          }|	|vr|||	<   ||	xx         |z  cc<   |S )az  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size

        Returns memory accesses per buffer.
        s
sympy.Exprrc   rm   c                N    t           j        j                            | d          S )Nr   fallback)rP   r   sizevars	size_hint)r  s    rj   try_size_hintzGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hint  s     7#--a!-<<<rl   r   r        eAc              3  $   K   | ]}|j         V  d S rf   r  r  s     rj   r!  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s$      BBCsxBBBBBBrl   c              3  $   K   | ]}|j         V  d S rf   r  r  s     rj   r!  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s$      CCCsxCCCCCCrl   r   rd   snodesr{  r   c                    j         j        |          j        }t          d |D                       }t	          |t          |          z
            dk    S )Nc              3  $   K   | ]}|j         V  d S rf   r\   )r   r   s     rj   r!  z\BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>  s$      !>!>$)!>!>!>!>!>!>rl   r   )r[   r   ra   r   r~   )r   r  ra   buf_usesrh   s       rj   is_materializedzIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized  sQ    N.s39E!!>!>!>!>!>>>Hx*V"4"445599rl   c              3  >   K   | ]} |j                   |V  d S rf   r  )r   r   r  rh   s     rj   r!  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  sJ       ) )__S$+-N-N)) ) ) ) ) )rl   c              3     K   | ]}V  d S rf   r   )r   r   
node_numels     rj   r!  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s#      $R$RCZ$R$R$R$R$R$Rrl   <Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]c                *   | sdS t          | t          j                  r|                                 S t          | j        t
                    rj        j        |                                          j	        }d}|D ]o}t          |j
        t                    sJ t          |j
        j
        t                    r0|j
                                        D ]}| |j
                  z  }m dS |S t          | j        t          j                  r-t          fd|                                 D                       S  	t#          |                                                     }t'          |                                           t+          |          z  S )Nr   c              3  h   K   | ],} t           j                            |                    V  -d S rf   )rP   r   
get_buffer)r   mut_nameget_buf_bytess     rj   r!  zZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>  sQ        $ &ag&8&8&B&BCC     rl   )r   r%   TorchBindObjectr  rz   r7   r[   r   rg   ra   r\   rT   r6   r   r8   r  r   rO   r	  rD   	get_dtypemin)
r   ra   totr   	sched_buf	buf_elemsbuf_accessed_elemsr  rh   r  s
         rj   r  zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes  s     1c2#566 ,,...
,=>>  !N6s||~~FLEC % 	% 	%)$)5FGGGGG%dinkBB %-1Y-B-B-D-D E E	 #}}Y^'D'D DE $%11J
BM:: 	    (+(>(>(@(@     
 !.mCLLNN.K.K L LI)#--//::S*I> >  rl   )r  r  rc   rm   )r   rd   r  r{  rc   r   )r   r  rc   rm   )r   r  ExternKernelSchedulerNoder\   r6   r%   r  op_overloadr  _prims	rng_primsgraphsafe_run_with_rng_staters  rO   
get_rangesrm   collectionsr   r   r   r   rp   r  r   r   FusedSchedulerNoder  rP   r   rJ  graph_inputs)rh   r  r  buf_accessesr   r   r   r  buf_byte_accessesr  r   	buf_bytesr  r  r  r  r  s   `           @@@@@rj   r  z0BaseSchedulerNode.get_read_write_buffer_accesses  s   6 d233 	Id566 	:I{<
 <
 	 It677	49b&788	 	%|%BC C I	= 	= 	= 	= dM** 	"&doo//233 1 1! 4556 JJ
 SJ".t44 	3'- 3 3SX&--c2222 	3'. 3 3SX&--c2222 JBB4+;+ABBBBBB 	 JCC4+;+BCCCCCC 		: 	: 	: 	: 	: 	:
 d.// 	,( ) ) ) ) )%) ) )  O o-FO+E,. 1	9 1	9H!$$R$R$R$R<;Q$R$R$R!R!R17111g,X6QW111g*84! ! ! ! ! ! ! ! !F &c**I000.7!(++!(+++y8++++  rl   
int | Nonec                    | j         d S | j                                         }|d S t          |          }|d S t          j        j                            |d          }t          d         dxx         |z  cc<   |S )Nr   r  inductor
flop_count)r\   get_origin_noder3   rP   r   r  r  r   )rh   fx_nodeflopsresolved_flopss       rj   estimate_flopsz BaseSchedulerNode.estimate_flops0  s    94)++--?4w''=4)33EA3FF\***n<***rl   floatc                F    | j         | j         S |                                 S rf   )r   _get_estimated_runtimerq   s    rj   get_estimated_runtimez'BaseSchedulerNode.get_estimated_runtime@  s&    *622**,,,rl   c                   |                                  d                                         d         }|j                                        }t	          t          |                    sdS t          | j                  r,t          | j        t          j	                  sJ 	 t          j        rt          |           }t                      }|                    |          }|t          |t                    sJ |S t!          |           }|t#          | j                  }|                    ||           |S t#          | j                  S # t&          $ r%}t(                              |           Y d}~dS d}~wt,          $ r%}t(                              |           Y d}~dS d}~ww xY wt/          | j                  rdS t1          |           }||S |j                                        }		 t5                      }
t7          |	          dz  }|
dk    rt9          d|
           |dk    rt9          d|           n# t:          $ r Y dS w xY w|                                 }|dk    s||                                 |
z  }|dz  }|S d}|                                 }|dn|}||z  |z  d	z  }||
z  }tA          ||          }|dz  }|S )
zC
        Returns estimated op runtime in milliseconds (ms)
        r   Nvaluel    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g    .Ag      ?r  )!ri  r   r\   r   rJ   r4   rH   r   r%   IRNoder#   ,runtime_estimations_use_nccl_lib_estimations)get_estimate_runtime_cache_key_from_snodeget_estimate_runtime_cachelookupr  r,   r+   	set_value
ValueErrorr   r  	TypeErrorrM    maybe_estimate_runtime_benchmarkmaybe_get_dtyperE   rC   AssertionErrorr   r  r  max)rh   r   rz   	cache_keycache	cache_valmseretdtypegpu_memory_bandwidth	gpu_flops	flops_estnsfactorcounted_bytescompute_timetransfer_times                     rj   r   z(BaseSchedulerNode._get_estimated_runtimeF  s#   
 nnq!--//2))++of--.. 	1 ## "	di33333L  I$ O OI688E %Y 7 7I ,))U;;;;;((HNNBz=diHHOOIRO888I7	BBB    qqqqq   qqqqq
 TY 	
 1.t44?J((**	#4#6#6 )%0069I $q(($ZDXZZ   A~~$%U)%U%UVVV  	 	 	11	 ''))	>>Y.22447KKBcBI 99;;*2*Y6#=%(<< }--#X	sD   !AE :=E 8E 
F(E66F(F##F(.AH? ?
IIOptional[ir.TemplateBuffer]c                    d S rf   r   rq   s    rj   get_template_nodez#BaseSchedulerNode.get_template_node      trl   ir.TemplateBufferc                6    |                                  }|J |S rf   rB  )rh   templates     rj   get_template_node_or_throwz,BaseSchedulerNode.get_template_node_or_throw  s$    ))++###rl   nodeslist[BaseSchedulerNode]Jtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]]c                    t          d t          |           D                       }| d|         }| |         }| |dz   d         }|||fS )zQ
        For the list of nodes, get the prologue, template, and epilogue
        c              3  H   K   | ]\  }}|                                 |V  d S rf   r  r   iru  s      rj   r!  zCBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>  s3      PPDAqPaPPPPPPrl   Nr    )next	enumerate)rI  template_indexprologuetemplate_nodeepilogues        rj   get_prologue_template_epiloguez0BaseSchedulerNode.get_prologue_template_epilogue  sb     PPIe,<,<PPPPP.)n-!+--.00rl   )r[   rZ   rc   r   )r\   r   rc   r   r   )rc   r   r   r  r.   r  r.   rc   r   r  r  rc   r   )r   r-   rc   r   r   )r/  r   rc   r   r2  r3  r4  r  rc   r   rc   r3  r]  r^  rc   r   rc   r{  )rc   r}  )r  rd   rc   rY   r   r  r  rc   r   T)r  rG   r  r   rc   r   r   )r  r   r  r   rc   rm   )r  r   r  r   rc   r  rc   r  )rc   r  rc   r@  )rc   rD  )rI  rJ  rc   rK  );ry   r   r   r   r   r   r   r   r   r   r   r  r  r  r&  r*  r.  r#  r;  r=  rB  r9  r1  r\  ra  rg   re  r@   rj  rn  rx  r'   ri  r   r  r   r  rJ   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r!  r   rB  rH  staticmethodrW  r   rl   rj   rT   rT      s&        BBBB(((('''' NNNNNN''''266666
 
 
 
3 3 3 34B B B B* * * *2   + + + +# # # #
 
 
 
   
M M M M> > > >
 
 
 

   
= = = =   
 
 
 
   (
 
 
 
G G G GT T T T
. . . .    H H H ]H B B B ]B 
 
 
 ]
 
 
 
 ]
      . . . .& & & &; ; ; ;: : : :                     @ @ @ @F 9=- - - - -^ 
 
 
 ]

 
 
 
 ]

 
 
 
 ]


 
 
 
J! J! J! J!X    ]- - - - U U U ]Un      
 1 1 1 \1 1 1rl   rc   $torch._inductor.codecache.LocalCachec                 H    t           j        j                                        S rf   )r  r  	codecache
LocalCacher   rl   rj   r(  r(    s    ?$//111rl   snoderd   c                \   t          | j        dd          }| j        j        }| j                            g || j        j        | j        j                  }| j        j        }t          j        ||f          \  }}ddt          |ft          fd|D                       z             }|S )Npython_kernel_namer   rc   r   c                l    t          | t          j                  ot          | t          j                   S rf   )r   r%   r%  GeneratorStater  s    rj   _is_tensor_irz@get_estimate_runtime_cache_key_from_snode.<locals>._is_tensor_ir  s)    !RY''P
1b>O0P0P,PPrl   c              3  t   K   | ]2} |          r!t          |                                          nd V  3d S rf   )tupler	  )r   arm  s     rj   r!  z<get_estimate_runtime_cache_key_from_snode.<locals>.<genexpr>  sG      UUa}}Q'7'7Aajjll###TUUUUUUrl   r   )
r   r\   inputsfill_non_provided_argsconstant_argsr   pytreetree_flattenrd   ro  )rg  ri  r   r   	flat_argsflat_args_pytree_specr1  rm  s          @rj   r'  r'    s     -A2FF:D:,,*$*)*
 D ZF'-':D&>'J'J$I$Q Q Q Q 	
UUUU9UUU
U
U	V I rl   Optional[Callable[[Any], Any]]c                >   t          | t                    sd S t          j        j        j        t          j        j        j        t          j        j        j        d}t          | j	        dd          }||vrd S t          | j	        t          j                  sd S ||         S )N)zextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmri  r   )r   r	  r  opsatenmmbmmaddmmr   r\   r%   ExternKernel)rg  mms_fnsri  s      rj   _get_mm_like_fnr    s    e677 t"Y^.#in0 %	 4 G
 !-A2FF((tej"/22 t%&&rl   r   c                   	
 d 	d }t           j        rt                     }|d S |	 fd}nd S t                     }t	                      }|                    |          }|t          |t                    sJ |S ddlm	  |            \  
ddl
m}  |	
fd          }|                    ||           |S )Nc                                 S rf   r   )rg  snode_args_kwargss   rj   r   z2maybe_estimate_runtime_benchmark.<locals>.<lambda>  s    !2!25!9!9 rl   r    )r  r   )do_benchc                       i S rf   r   )r   bench_fnr   s   rj   r   z2maybe_estimate_runtime_benchmark.<locals>.<lambda>  s    ((D3F33 rl   r#  )r"   !runtime_estimations_mms_benchmarkr  r'  r(  r)  r   r  utilsr  triton.testingr  r*  )rg  args_kwargs_fnmm_fnr1  r2  r3  r  r4  r   r  r   r  s   `       @@@@rj   r-  r-    s
   HN/ &&=499999t9%@@I&((EY''I)U+++++((((((!>##LD&''''''	333333	4	4B	OOIRO(((Irl   c                  D    e Zd ZU g dZded<   ded<   ddZddZddZdS )	WhyNoFusename1name2reasonr   rd   r  ztuple[Any, ...]r   node1rT   node2rc   r   c                j    |                                 | _        |                                 | _        d S rf   )rg   r  r  rh   r  r  s      rj   r   zWhyNoFuse.__init__  s(    ^^%%
^^%%


rl   r   c                V    || _         || _        t                              |            d S rf   )r  r   
fusion_logdebug)rh   r  r   s      rj   __call__zWhyNoFuse.__call__  s*    	rl   c                H    d| j          d| j         d| j        | j        z  z   S )Nzcannot fuse z with rt   r  rq   s    rj   __str__zWhyNoFuse.__str__  s2    >dj>>
>>>K$)#
 	
rl   Nr  rT   r  rT   rc   r   )r  rd   r   r   rc   r   r   )ry   r   r   	__slots__r   r   r  r  r   rl   rj   r  r    st          544IKKK& & & &   

 
 
 
 
 
rl   r  objr   c                    t          | t          t          f          rt          | t                    } t          j        | d          }d|v rdt          j        |d           S |S )Nkey   )r   r       )	r   r   setsortedrd   pprintr|   textwrapr   )r  r   s     rj   r|   r|     sg    #
C()) #Sc"""^C***Fv~~6HOFG44666Mrl   c                  2    e Zd ZddZddZdd	ZddZeZdS )r   r   r/   rc   r   c                0    t          |g          | _        d S rf   rQ  r)  s     rj   r   zOutputNode.__init__  s    ",cU"3"3rl   r   c                    dS r  r   rq   s    rj   r  zOutputNode.is_reduction   r  rl   r   c                    dS )Nr   r   rq   s    rj   r   z'OutputNode.get_inputs_that_alias_output#  r   rl   rd   c                    dS )NOUTPUTr   rq   s    rj   rg   zOutputNode.get_name&  s    xrl   N)r   r/   rc   r   r   r   r   )ry   r   r   r   r  r   rg   r   r   rl   rj   r   r     se        4 4 4 4          HHHrl   r   r\   r]  r^  r   dict[str, SchedulerBuffer]r   c                    t          j                     j        D ]^}t          |t                    sG|j                                                 }|                                         xx         dz  cc<   _d	 fdt          fd j        D                       }|r> j        |z
   _         	                     j
                            |                     dS dS )
am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r    r   r-   rc   r   c                    t          | t                    rS| j                                                 }|                                                  dk    }|         k    }|p|S dS )Nr   F)r   r0   rp   rk   rg   )r   rU  is_redundantis_self_depr   name_to_dep_countr]  r\   s       rj   rV  z+_prune_redundant_deps.<locals>.should_prune@  sp    c7## 		!#(+<<>>G,-?-H-Q-Q-S-STWXXL -W5=K.;.5rl   c              3  2   K   | ]} |          |V  d S rf   r   rX  s     rj   r!  z(_prune_redundant_deps.<locals>.<genexpr>L  sF        ,,s2C2C     rl   NrY  )r  r   r   r   r0   rp   rk   rg   r   r#  r   rZ  )r\   r]  r   r   rU  deps_to_pruner  rV  s   ```   @@rj   r`  r`  ,  s?    '2&9&;&;& K K#w'' 	K!#(+<<>>G09BBDDEEEJEEE
 
 
 
 
 
 
 
 
     .    M  K"&"9M"IT-::=IIJJJJJK Krl   c                  8     e Zd Zd fdZdd	ZddZddZ xZS )r	  r[   rZ   r\   r   rc   r   c                    t                                          |           |                     |           |                     |                                           d S rf   superr   r   r#  get_read_writesrh   r[   r\   	__class__s      rj   r   z"ExternKernelSchedulerNode.__init__V  U    ###T"""T113344444rl   rd   c                \    |                                   dt          | j        dd            S )Nz.node.kernel = ri  )rg   r   r\   rq   s    rj   r   z)ExternKernelSchedulerNode.debug_str_extra[  s.    --//bb'$)EY[_2`2`bbbrl   r   c                    dS NTr   rq   s    rj   r  z#ExternKernelSchedulerNode.is_extern^  rC  rl   c                p    | j         J t          | j         d          o| j                                         S )Nr  )r\   r   r  rq   s    rj   r  z*ExternKernelSchedulerNode.has_side_effectsa  s6    y$$$ty"455V$):T:T:V:VVrl   r[   rZ   r\   r   rc   r   r   r   )ry   r   r   r   r   r  r  __classcell__r  s   @rj   r	  r	  U  s        5 5 5 5 5 5
c c c c   W W W W W W W Wrl   r	  c                        e Zd Zd fdZ xZS )	r  r[   rZ   r\   r   rc   r   c                    t                                          |           |                     |           |                     |                                           d S rf   r  r  s      rj   r   zNopKernelSchedulerNode.__init__g  r  rl   r  )ry   r   r   r   r  r  s   @rj   r  r  f  s=        5 5 5 5 5 5 5 5 5 5rl   r  c                  >    e Zd ZU dZded<   ded<   d@ fdZ	 	 dAdBdZ	 	 dAdCdZdDdZdEdZ	dFdZ
dGd ZdHd$ZdId&ZdJd(ZdKd)ZdKd*ZdKd+ZdLd-ZdMd0ZdNd2ZdOd3Z	 dPdQd7ZedRd8            ZedRd9            ZdSd<ZedTd>            ZedK fd?            Z xZS )Urs  zu
    A SchedulerNode is a node for scheduling that encapsulates either
    a ComputedBuffer or a TemplateBuffer.
    z tuple[Sequence[sympy.Expr], ...]_sizesr9   _bodyr[   rZ   r\   +Union[ir.ComputedBuffer, ir.TemplateBuffer]rc   r   c                    t                                          |           |                     |           |                                  d S rf   )r  r   r   _compute_attrsr  s      rj   r   zSchedulerNode.__init__v  sI    
 	###T"""rl   Nextra_indexing_constraints*Optional[tuple[dict[Any, Any], list[Any]]]recompute_sizes_body_funcOptional[Callable[_P, _T]]c                   t          | j        t          j        t          j        f          sJ | j                            ||          \  | _        }|| _        | j                                        }| j	        
                    |          j        }| || j                  f| _        t          j         pt          |j                   }t          | j        t          j                  r0|                     | j                            |                     d S |                     t'          j        | j        g| j        R d|i           d S )Nr  r  )	normalizer  )r   r\   r%   ComputedBufferTemplateBuffersimplify_and_reorderr  r  get_device_or_errorr[   get_backendgroup_fnr   r"   loop_ordering_after_fusionrJ   rx   r#  extract_read_writesr$   )rh   r  r  bodyr  r  should_normalizes          rj   r  zSchedulerNode._compute_attrs  so   
 $)b&79J%KLLLLL I::'A&? ; 
 
T 
..00>--f55>hht{334
  &@@ 
KI
 I
 E
 di!233 		  	--8H-II       0J!%  8H     rl   Optional[Callable[..., Any]]c                4    |                      ||           d S )Nr  )r  )rh   r  r  s      rj   recompute_size_and_bodyz%SchedulerNode.recompute_size_and_body  s1    
 	'A&? 	 	
 	
 	
 	
 	
rl   r  r   need_clear_tiling_cachec                   t          d | j        j        D                       }|                     t	          j        | j        g| j        R d|i                    |          	                    | j
                             | j                            |            |r!ddlm} |j                                         d S d S )Nc              3  R   K   | ]"}t          |t          t          f          |V  #d S rf   )r   r0   r/   r  s     rj   r!  z5SchedulerNode.refresh_dependencies.<locals>.<genexpr>  sJ       0
 0
ZgwEW5X5X0
0
 0
 0
 0
 0
 0
rl   r  r    SIMDScheduling)r   r   r   r#  r$   r  r  r  r(  r$  r   pointwise_read_writesclear_cachecodegen.simdr  candidate_tilingscache_clear)rh   r  r  	fake_depsr  s        rj   refresh_dependenciesz"SchedulerNode.refresh_dependencies  s    
 &0 0
 0
+10
 0
 0
 &
 &
	 	,
![  4=  Yy!!VD)**	
 	
 	
 	"..t444" 	;444444 ,88:::::	; 	;rl   	new_orderSequence[int]c                    | j                             |          | _         | j         j        | _        |                     dd           d S )NFTr  r  )r  reorder_iter_loopssizesr  r  )rh   r  s     rj   apply_new_loop_orderz"SchedulerNode.apply_new_loop_order  sK    Z22
 

 j&!!E4!PPPPPrl   	dimensionrm   	new_rangec                   t          | j        t          j        t          j        f          sJ | j                            ||          | _        | j        j        | _        | j        	                                }| j
                            |          j        }| || j                  f| _        |                     dd           d S )NTr  )r   r\   r%   r  r  r  #expand_dimension_for_pointwise_noder  r  r  r[   r  r  r   r  )rh   r  r  r  r  s        rj   r  z1SchedulerNode.expand_dimension_for_pointwise_node  s     $)b&79J%KLLLLLZCCy
 

 j&..00>--f55>hht{334
 	!!D$!OOOOOrl   c                    | j                                         | _         | j         j        | _        |                     dd           d S )NTFr  )r  merge_loopsr  r  r  rq   s    rj   r  zSchedulerNode.merge_loops  sD    Z++--
j& 	!!D%!PPPPPrl   r  r.   r  c                   d }| j         d         }t          |          |j        cxk    r|j        k    rn n|                    |          }|rZt          xj        dz  c_        t                              d|                                 |           | 	                    |           dS t                              d|                                            dS )Nr   r    z"Reorder loops for %s with order %sTzEDon't reordering %s because we can not decide the suitable loop orderF)
r  r~   num_varsdecide_loop_order_to_matchr&   num_loop_reorderingloop_ordering_logr  rg   r  )rh   r  r  r  
self_sizess        rj   r  z'SchedulerNode.reorder_loops_by_dep_pair  s     	[^
z??h/EEEE93EEEEEE ;;IFFI 	''1,''##4dmmooy   %%i0004##W   5rl   rd   c                0   |                                  }| d| j        d          | d| j        d          | d| j         g}| j                                        D ]}t          |t                    sl|j        }t          j	        
                    |          }t          |t          j                  s,|                    | dt          |j                              t          | j        t"                    rX|                    d| d           |                    t%          j        | j                                        d	                     | j        J |                    |                                            d
                    |          S )Nz.group.device = r   z.group.iteration = r    z	.sizes = z
_layout = zclass z_loop_body:r  r   )rg   r   r  r   r"  r   r0   rp   rP   r   r  r%   r  r  r|   rz   r  r9   r  r   r   r\   rK  r   join)rh   rp   linesr   r  r   s         rj   r   zSchedulerNode.debug_str_extra  s   }}44TZ]4477
177++dk++

 #4466 	O 	OCc7++ O8g((22!#r'9:: OLLH!M!M
8K8K!M!MNNNdj(++ 	JLL3$333444LL)=)=)?)?HHIIIy$$$T//11222yyrl   Sequence[Sequence[sympy.Expr]]c                    | j         S rf   )r  rq   s    rj   r  zSchedulerNode.get_ranges  
    {rl   c                    t          | j        t          j        t          j        f          sJ dt          | j                              t          | j                                                  S Nztype(self.node)=)r   r\   r%   r  r  rx   r   r  rq   s    rj   r  zSchedulerNode.is_reduction  sd    $)b&79J%KLL 	
 	
!tDI!!	
 	
L DI0022333rl   c                   t          | j        t          j        t          j        f          sJ dt          | j                              t          | j        t          j                  o#t          | j        j        t          j                  S r  )r   r\   r%   r  r  rx   r  	SplitScanrq   s    rj   r  zSchedulerNode.is_split_scan"  s{    $)b&79J%KLL 	
 	
!tDI!!	
 	
L $)R%677 
JINBL=
 =
 	
rl   c                @    t          | j        t          j                  S rf   r   r\   r%   r  rq   s    rj   r  zSchedulerNode.is_template*  s    $)R%6777rl   r@  c                R    t          | j        t          j                  r| j        nd S rf   r  rq   s    rj   rB  zSchedulerNode.get_template_node-  s"    &ty"2CDDNtyy$Nrl   
index_varsSequence[sympy.Expr]c                    |                                   |                                  |                     |           d S rf   )r  r=  r  )rh   r  s     rj   runzSchedulerNode.run0  s9    ""$$$Z     rl   dict[sympy.Expr, sympy.Expr]c                R   | j         }t          t          t          |                    t          t          t          |                    k    sJ t	          t          t          j                            |          t          j                            |                              }|S rf   )	r  r  mapr~   dictzipr@  rA  from_iterable)rh   r  r  
var_rangess       rj   ranges_from_index_varsz$SchedulerNode.ranges_from_index_vars5  s     3sE??##s3sJ+?+?'@'@@@@@--j99--e44 
 

 rl   c                   |                      |          }	 t          j        t          t          j                    |                    5  t          j                            |           5   | j        |  ddd           n# 1 swxY w Y   ddd           dS # 1 swxY w Y   dS # t          $ r" t          
                    d| j                    w xY w)a  
        Generate code for this node using the provided index variables.

        This method sets up the appropriate context for code generation, including
        simplifying indexing expressions based on the variable ranges, and then
        calls the node's body function with the index variables.

        Args:
            index_vars: A sequence of sequences of sympy expressions representing
                        the index variables for each dimension of the computation.
        NzError in codegen for %s)r  rP   set_ops_handlerr>   get_ops_handlerr   set_current_noder  r   r   fatalr\   )rh   r  r  s      rj   r  zSchedulerNode.codegenB  sX    00<<
	!"213D3F3F
"S"STT( ())$//( ( 
J''	( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( (
  	 	 	II/;;;	sS   3B& 
 B*B5BB	BB		BB& BB&  B!B& &,CT	pointwiser   c                    |r| j         nt          | j                   \  }}t          j        | j        |t
          j        j        gt          |          z  g          S )z\
        Get the memory dependencies in either the pointwise or the reduction axes.
        )hidden_args)	r  reversedr$   r  r  sympySZeror~   )rh   r  
keep_sizesignore_sizess       rj   "pointwise_or_reduction_read_writesz0SchedulerNode.pointwise_or_reduction_read_writesY  s\     3<#V4;;$+AVAV 
L/J
%',#lBSBS1S0T
 
 
 	
rl   c                .    |                      d          S )zH
        Get the memory dependencies in the non-reduction axes.
        Tr  r(  rq   s    rj   r  z#SchedulerNode.pointwise_read_writesd  s    
 666FFFrl   c                .    |                      d          S )zD
        Get the memory dependencies in the reduction axes.
        Fr*  r+  rq   s    rj   reduction_read_writesz#SchedulerNode.reduction_read_writesk  s    
 666GGGrl   r  r  c                   |                                  rdS t          d |                                 D                       rdS t          | j        j                  dk    rt          |t          j                  rzt          t          | j        j                            }t          |t          j                  sJ dt          |                      |j        |j        k    o|j        |j        k    S dS )NFc              3  >   K   | ]}|                                 V  d S rf   )r{   rm  s     rj   r!  z,SchedulerNode.can_inplace.<locals>.<genexpr>u  s,      ??Ss  ??????rl   r    ztype(write_dep)=)r  r-  r   r~   r   r   r   r$   r.   rQ  iterrx   indexsize)rh   r  	write_deps      rj   r  zSchedulerNode.can_inplacer  s     	5??D,<,<,>,>????? 	5t&''1,,l,2
 2
, T$"2"9::;;Ii)?@@WWBWT)__BWBWWW@>Y_4X).9XXurl   r3  c                   t                      }t          | j        t                    r| j                                        D ]}|j        dk    r|j        dk    rd|j        v r|j        d         dk    s)t          |j	                  dk    ra|j	        d         dk    rP|
                    d|j        v r|j        d         n&t          |j	                  dk    r|j	        d	         nd
           |S )Ncall_methodstoremode
atomic_add   r  rp      r    r   )r   r   r  r9   ri  ri   r  r   r~   r   rI  )rh   buffers_store_as_atomic_addr\   s      rj   _get_atomic_add_buffersz%SchedulerNode._get_atomic_add_buffers  s    7A||#dj(++ 	
,,..  G},,w..4;..4;v3F,3V3V	NNa//DIaLL4P4P 033!T[00 F++.1$)nn.A.Adillr  
 +*rl   c                    | j         | j                             d          rdS t                                                      S )Ndevice_assert_asyncT)r  has_opr  r  rh   r  s    rj   r  zSchedulerNode.has_side_effects  s>     :!dj&7&78M&N&N!4ww'')))rl   )r[   rZ   r\   r  rc   r   NN)r  r  r  r  rc   r   )r  r  r  r  rc   r   )r  r   r  r   rc   r   )r  r  rc   r   )r  rm   r  rm   rc   r   r   rX  r   )rc   r  r   ra  )r  r  rc   r   )r  r  rc   r  )r  r  rc   r   r_  )r  r   rc   r   )rc   r   r^  r[  )ry   r   r   __doc__r   r   r  r  r  r  r  r  r  r   r  r  r  r  rB  r  r  r  r(  r@   r  r-  r  r<  r  r  r  s   @rj   rs  rs  m  s         
 -,,,OOO      RV@D    F RVBF
 
 
 
 
; ; ; ;<Q Q Q QP P P P"
Q 
Q 
Q 
Q   ,       ,   4 4 4 4
 
 
 
8 8 8 8O O O O! ! ! !
      0 !%	
 	
 	
 	
 	
 G G G ]G H H H ]H    + + + ]+& * * * * * ]* * * * *rl   rs  group_snode/Union[FusedSchedulerNode, GroupedSchedulerNode]c                     j         }                     t          j                            d |D                                  t           fdt          j        d |D              D                        j        j        z
   _	        d S )Nc                    g | ]	}|j         
S r   r   r   r  s     rj   r   z3refresh_group_node_dependencies.<locals>.<listcomp>  s    +J+J+JaAM+J+J+Jrl   c              3  R   K   | ]!}|j                                         v|V  "d S rf   rp   rn  )r   r   rC  s     rj   r!  z2refresh_group_node_dependencies.<locals>.<genexpr>  sH       
 
x{;;==== ====
 
rl   c                    g | ]	}|j         
S r   )r   rH  s     rj   r   z3refresh_group_node_dependencies.<locals>.<listcomp>  s    )O)O)O1!*>)O)O)Orl   )
r  r#  r$   
ReadWrites
merge_listr   unionr   r   r   )rC  r  s   ` rj   refresh_group_node_dependenciesrO    s     F**+J+J6+J+J+JKK  
 	 
 
 
 
!')O)O)O)O)OP
 
 
 	
 	

 
!
(	) """rl   r[   rZ   r  rJ  c                   t          | t          t          f          sJ || _        || _        d | _        t          j        d |D              | _        t          |            t          d | j        D                       | _        t          d | j        D                       | _        d |                                 D             | _        d S )Nc                *    g | ]}|j         	|j         S rf   )r   rH  s     rj   r   z#init_group_node.<locals>.<listcomp>  s!    	A	A	A!)@!+)@)@)@rl   c              3  $   K   | ]}|j         V  d S rf   r   rH  s     rj   r!  z"init_group_node.<locals>.<genexpr>  $      HHHHHHHHrl   c              3  $   K   | ]}|j         V  d S rf   )r   rH  s     rj   r!  z"init_group_node.<locals>.<genexpr>  rT  rl   c                8    i | ]}|                                 |S r   r   r   s     rj   r   z#init_group_node.<locals>.<dictcomp>  s/     # # # ## # #rl   )r   r  GroupedSchedulerNoder  r[   r\   r   rN  r   rO  r  r   r0  r   r   r   )rC  r[   r  s      rj   init_group_noderX    s    
 k$68L#MNNNNNK%KK&,	A	Av	A	A	AK $K000HH[5GHHHHHKHH[5GHHHHHK# #'2'>'>'@'@# # #Krl   c                      e Zd ZU dZded<   ed8d            Zed9d
            Zd:dZ	d; fdZ
ed<d            Zd<dZed=d            Zd>dZd<dZd<dZd? fd Zed=d!            Zed=d"            Zd@d$Zd<d%ZedAd&            ZedAd'            ZedAd(            ZedBd*            ZdCd,ZedAd-            ZdDd/ZdEd2ZdFd5Zd<d6ZedA fd7            Z  xZ!S )Gr  z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    rJ  r  r  rT   r  rc   c           	        |j         |j         u sJ t          |t          t          f          sJ |                                rt          |t
                    rxt          |j        t                    sJ t          |j	        j
                  dk    sJ t          t          t          |j	        j
                            t                    sJ t          t          |j	        j
                            j        }d |                                D             }t          |          dk    sJ |d         }t          |j	        j
                  dk    sJ t          t          |j	        j
                            }t          |t                     sJ t#          t!          ||j        |j        |j        |j                  g          |j	        _
        nt          |t          t          f          sJ t-          t/          j        |                                |                                                    } | |j         |          S )Nr    c                :    g | ]}|                                 |S r   rN  rh  s     rj   r   z+FusedSchedulerNode.fuse.<locals>.<listcomp>  s)    WWWtDDTDTDVDVWdWWWrl   r   )r[   r   rs  r  r  r	  r\   r6   r~   r   r   rQ  r0  r/   rp   ri  r.   r   r1  	var_namesr2  r7  r   r@  rA  )clsr  r  rp   template_nodesrU  writerI  s           rj   fusezFusedSchedulerNode.fuse  s    %/1111%-1C!DEEEEE 	J:e5N#O#O 	J ej+66666u(/00A5555d4(9(@#A#ABBGLLLLLU.56677<DWWu/@/@WWWN~&&!++++*1-M}0788A====m7>??@@EeY/////'1ek5?EJ
 ( (E$$ em5G%HIIIIIY_U__%6%68I8IJJKKs5?E***rl   r  c                    t          t          d d |                                 D                                 }t          |          dk    rd S t	          |          }|S )Nc              3     K   | ]@}|                                 s|                                *|                                V  Ad S rf   r  r  r  rh  s     rj   r!  z4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>  h        '')) .2^^-=-=''))     rl   r   r   filterri  r~   r  rh   fpsr6  s      rj   r  z!FusedSchedulerNode.estimate_flops  o       $ 0 0   	
 	
 s88q==4#hh
rl   r  r.   r  r   c                   |                                  rdS d}| j        D ]p}t          |t                    sJ |Ht	          |          t	          |j        d                   k    rt                              d            dS |j        d         }qd}|J t          |          |j	        cxk    r|j	        k    rn n|
                    |          }|s/t                              d|                                            dS t          xj        dz  c_        t                              d|                                 |           | j        D ].}t          |t                    sJ |                    |           /t          |            dS )	z@
        Return true if a loop reordering is performed.
        FNr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr    z-Reorder loops for fused node %s with order %sT)r  r  r   rs  ro  r  r  r  r~   r  r  rg   r&   r  r  rO  )rh   r  r  r   rg  r  s         rj   r  z,FusedSchedulerNode.reorder_loops_by_dep_pair   s     	5
[ 	) 	)Ee]33333%%
*;*;uU\RS_?U?U*U*U!''G   uuaJJ	%%%z??h/EEEE93EEEEEE ;;IFFI 	##a   5##q(##;T]]__i	
 	
 	
 [ 	2 	2Ee]33333&&y1111'---trl   r[   rZ   r   c                    t                                          |           t          | ||           g | _        t	          |d           j        | _        d S )Nc                D    t          |                                           S rf   )rm   r  rl  s    rj   r   z-FusedSchedulerNode.__init__.<locals>.<lambda>-  s    s1>>3C3C/D/D rl   r  )r  r   rX  ra   r0  r   )rh   r[   r  r  s      rj   r   zFusedSchedulerNode.__init__)  sS    ###i000%'
%D%DEEEK


rl   rd   c                J    d                     d | j        D                       S )N_c                6    g | ]}|                                 S r   r   rH  s     rj   r   z/FusedSchedulerNode.get_name.<locals>.<listcomp>1       ;;;!;;;rl   r  r  rq   s    rj   rg   zFusedSchedulerNode.get_name/  %    xx;;t{;;;<<<rl   c                @    | j         d                                         S Nr   r  rg   rq   s    rj   re  z!FusedSchedulerNode.get_first_name3      {1~&&(((rl   r3  c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r   rn  rH  s     rj   r   z7FusedSchedulerNode.get_buffer_names.<locals>.<listcomp>8  $    !L!L!L1!"4"4"6"6!L!L!Lrl   r   rN  r  rq   s    rj   rn  z#FusedSchedulerNode.get_buffer_names6  !    !L!L!L!L!LMMrl   list[SchedulerBuffer]c                l    g }| j         D ])}|                    |                                           *|S rf   r  rK  r   rh   r   r\   s      rj   r   zFusedSchedulerNode.get_outputs:  >    (*K 	. 	.DMM$**,,----rl   c                .     fdt           j                  D             } j        d         j        }|'|                                                                t          j        d                    |                                          d          S )Nc                r    g | ]3\  }}                                  d | d|                                 4S )z.snodes[z] =
)rg   r   )r   rP  r\   rh   s      rj   r   z6FusedSchedulerNode.debug_str_extra.<locals>.<listcomp>A  sU     
 
 
4 }}BBBB0@0@BB
 
 
rl   r   r   r  )	rR  r  r\   rK  r   r  r   r  r   )rh   r  r\   s   `  rj   r   z"FusedSchedulerNode.debug_str_extra@  s    
 
 
 
$T[11
 
 
 {1~"LL3355666tyy//6688&AAArl   c                2    d | j         D             }|  d| S )Nc                6    g | ]}|                                 S r   )r  rh  s     rj   r   z6FusedSchedulerNode.debug_str_short.<locals>.<listcomp>L  s$    EEEd**,,EEErl   z
, snodes: r  )rh   
snodes_strs     rj   r  z"FusedSchedulerNode.debug_str_shortK  s+    EEEEE
..*...rl   r2  r4  r  c                    t                                          ||           t                      }t          | j                  D ]2}|                    ||           |                    |j                   3d S rf   )r  r;  r   r"  r  updater   )rh   r2  r4  r\   r  s       rj   r;  z!FusedSchedulerNode.set_last_usageO  s    
 	24FGGG 0:||T[)) 	8 	8D 35GHHH&&t7777	8 	8rl   c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r   )rB  rH  s     rj   r   z8FusedSchedulerNode.used_buffer_names.<locals>.<listcomp>^  s$    !M!M!MA!"5"5"7"7!M!M!Mrl   r{  rq   s    rj   rB  z$FusedSchedulerNode.used_buffer_names\  s!    !M!M!M!M!MNNrl   c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r   )r9  rH  s     rj   r   zCFusedSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>c  s$    DDD1a,,..DDDrl   r{  rq   s    rj   r9  z/FusedSchedulerNode.used_or_aliased_buffer_names`  s%    DDDDD
 	
rl   r{  c                    | j         S rf   r  rq   s    rj   ri  zFusedSchedulerNode.get_nodesf  r  rl   c                Z    t          |           j         d|                                  dS )Nz(nodes=r   r   rq   s    rj   r   zFusedSchedulerNode.__repr__i  s*    t**%@@dmmoo@@@@rl   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rf   )r  rH  s     rj   r!  z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>n  s,      991>>##999999rl   r-  r  rq   s    rj   r  zFusedSchedulerNode.is_reductionl  s!    99T[999999rl   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rf   )r  rH  s     rj   r!  z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>r  s,      ::1??$$::::::rl   r  rq   s    rj   r  z FusedSchedulerNode.is_split_scanp  s!    ::dk::::::rl   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rf   rN  rH  s     rj   r!  z1FusedSchedulerNode.is_template.<locals>.<genexpr>v  s*      88q1==??888888rl   r  rq   s    rj   r  zFusedSchedulerNode.is_templatet  s!    88DK888888rl   r@  c                n    | j         D ],}|                                r|                                c S -d S rf   )r  r  rB  r   s     rj   rB  z$FusedSchedulerNode.get_template_nodex  sI    K 	0 	0D!! 0--/////0trl   torch.devicec                    | j         d         S rt  )r   rq   s    rj   r   zFusedSchedulerNode.get_device  s    z!}rl   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rf   )r.  rH  s     rj   r!  z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s.      EEA1--//EEEEEErl   r  rq   s    rj   r.  z+FusedSchedulerNode.has_aliasing_or_mutation  s!    EEEEEEEErl   r  c                    t           rf   NotImplementedErrorr%  s     rj   r&  z'FusedSchedulerNode.update_mutated_names      !!rl   rp   r-   c                    t           rf   r  )rh   rp   s     rj   r*  zFusedSchedulerNode.add_fake_dep  r  rl   r  r  c                    t           rf   r  r  s     rj   r  zFusedSchedulerNode.can_inplace  r  rl   c                   |                                  }d                    d | j        D                       }t                      }|                    | dt          |           j         d| d| dt          | j        j	                   d| dt          | j
                   d| d	t          | j        j        | j
        z
             d| d
           |                                5  |                                 D ])}|                    |                                           *	 ddd           n# 1 swxY w Y   |                    d           	 |                    |                                            n,# t"          $ r t$                              dd           Y nw xY w|                                                                S )r   ru   c              3  >   K   | ]}t          |          j        V  d S rf   )rx   ry   rt  s     rj   r!  z/FusedSchedulerNode.debug_str.<locals>.<genexpr>  s+      FFQQ 0FFFFFFrl   rt   r   r   r   r   r   r   z.outputs = [
            Nrv   r   Tr   )rg   r  r  rG   r   rx   ry   r|   r   r   r   r   r   r   r   rw   r   r   r   r   r   r   )rh   rp   node_typestrr   r   s        rj   r   zFusedSchedulerNode.debug_str  sR   }}xxFF$+FFFFF

 	d	 +  )011    %T%<==  	  #4#3#9D<S#STT	 
   	
 	
 	
 ZZ\\ 	, 	,'')) , ,

3==??++++,	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	c	HJJt++--.... 	H 	H 	HKK7$KGGGGG	H   '')))s$   (?D44D8;D8'E= =&F&%F&c                    | j         t          d | j         D                       S t                                                      S )Nc              3  >   K   | ]}|                                 V  d S rf   )r  rh  s     rj   r!  z6FusedSchedulerNode.has_side_effects.<locals>.<genexpr>  s.      GG4t,,..GGGGGGrl   )r  r-  r  r  r@  s    rj   r  z#FusedSchedulerNode.has_side_effects  s@    ;"GG4;GGGGGGww'')))rl   r  rT   r  rT   rc   r  r`  rX  )r[   rZ   r  rJ  rc   r   r   r[  rc   r}  rZ  r]  r   ra  )rc   r  rY  )rp   r-   rc   r   r^  )"ry   r   r   rB  r   classmethodr`  r@   r  r  r   rg   re  rn  r   r   r  r;  rB  r9  ri  r   r  r  r  rB  r   r.  r&  r*  r  r   r  r  r  s   @rj   r  r    s	          $###+ + + [+B    ]"' ' ' 'RL L L L L L = = = ]=) ) ) ) N N N ]N   	B 	B 	B 	B/ / / /8 8 8 8 8 8 O O O ]O 
 
 
 ]

   A A A A : : : ]: ; ; ; ]; 9 9 9 ]9    ]    F F F ]F
" " " "" " " "" " " "* * * *4 * * * * * ]* * * * *rl   r  c                      e Zd ZU dZd,dZd-dZed.d
            Zed/d            Z	 	 	 d0d1 fdZ	ed2d            Z
ed3d            ZeZded<   ed4d            Zed3d             Zd5d!Zd5d"Zd6d#Zd7d$Zd8d&Zd9d(Zd:d+Z xZS );ForeachKernelSchedulerNodez
    This is a schedular node that consists of a set of scheduler nodes that
    has no data dependencies among them and can be executed in parallel.
    producerrT   rc   r]   c                    |                                 D ]>}|                                | j        v r!| j        |                                         c S ?d S rf   )r   rg   read_to_node)rh   r  r   s      rj   get_consumer_subnode_forz3ForeachKernelSchedulerNode.get_consumer_subnode_for  s]     '')) 	9 	9C||~~!222(8888 3 trl   consumerc                   t          t                               }|j        j        D ]h}|j        | j        j        vr| j        j        |j                                                 }|| j        v r |	                    | j        |                    it          |          dk    rt          t          |                    S d S Nr    )r   rT   r   r   rp   r[   r   rk   name_to_noderI  r~   rQ  r0  )rh   r  	producersrd	node_names        rj   get_producer_subnode_forz3ForeachKernelSchedulerNode.get_producer_subnode_for  s     0133	&, 	< 	<Bwdn888227;LLNNID---d/	:;;; y>>QY(((4rl   r   c                
   t          |          }                                r|                                rt          j        t                    t          j        t          |          }t          j                  t          |j                  k    }|s |d           |o2t          fdt          j        |j                  D                       S |                                rz	                                r |d           dS t          j        t          |          }|
                              }||j                            |          S  |d           dS                                 rz|	                                r |d           dS t          j        t                                        |          }|j                            ||          S  |d           dS t          d          )	Nzforeach do not have same lengthc              3  T   K   | ]"\  }}j                             ||          V  #d S rf   )r[   can_fuse)r   lrr  s      rj   r!  z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>  sN       ) )Aq "++Aq11) ) ) ) ) )rl   zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r  r  typingcastr  r~   r  rw  r  r  r  r[   r  r  r/  )r]  r  r  whyforeach_matchconsumer_subnodeproducer_subnodes    `     rj   r  z#ForeachKernelSchedulerNode.can_fuse  s<   (++   &	X%8%8%:%: &	{#=xHHH{#=xHHH00C4H4HHM  75666  S ) ) ) )AA) ) ) & &    "" 	$$&& n   u{#=xHHH'@@JJ+)228=MNNNCGHHH5  "" 	$$&& n   u{#=xHHH'@@JJ+)223CXNNNCGHHH5f
 
 	
rl   c                   |                                 s|                                 sJ |                                 r)t          j        t          |          }|j        }|j        }n(t          j        t          |          }|j        }|j        }d }d }|                                 rn|                                 rZt          j        t          |          }t          j        t          |          }d t          |j        |j                  D             }nO|                                 rt          j        t          |          }|                    |          }g }|}d }|j        D ]N}	|	|u r3t          
                    |	|          }
|
}|                    |
           9|                    |	           On|                                 rt          j        t          |          }|                    |          }g }|}d }|j        D ]N}	|	|u r3t          
                    ||	          }
|
}|                    |
           9|                    |	           Ont          d           | |j        |||||          S )Nc                J    g | ] \  }}t                               ||          !S r   )r  r`  )r   r  r  s      rj   r   z3ForeachKernelSchedulerNode.fuse.<locals>.<listcomp>  s<       Aq #''1--  rl   zTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)r  r  r  r  r  r  r  r  r  r  r`  r  r  r/  r[   )r]  r  r  r  r  r  r  fused_nodesr  r\   new_noder  s               rj   r`  zForeachKernelSchedulerNode.fuse  s    ""$$=(;(;(=(====   	7{#=xHHH(0(J%&6OO{#=xHHH(0(J%&6O   &	X%8%8%:%: &	{#=xHHH{#=xHHH AA  KK   "" 	{#=xHHH'@@JJK"KK  - -+++166tXFFH"*K&&x0000&&t,,,,-   "" 	{#=xHHH'@@JJK"KK  - -+++166xFFH"*K&&x0000&&t,,,,- !f   s&?##+
 
 
 	
rl   NFr[   rZ   r  rJ  r  r  r  r  r   c                    i  _         i  _        ||ht                                          ||           |D ]A}|j        j        D ]}| j         |j        <   |                                D ]}	| j        |	<   Bn| _        | _	        d  _
        g  _                             t          j                            |j        |j        g                     t!           fdt!          j        |j        |j                  D                        j        j        z
   _        t)          |j        |j        g           _        t-          |j        |j        g           _        |                                rt3          |t4                    sJ ||}}
nt3          |t4                    sJ ||}}
|
j         _         j                            |j                   |
j         _        |                                D ]}	| j        |	<   d  j	        D              _        | _        |d                                         }|sJ |tA          j!        d          fff _"        t!          tF          j$        j%                              _&        | _'        d S )Nc              3  R   K   | ]!}|j                                         v|V  "d S rf   rJ  rP  s     rj   r!  z6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>f  sL         xt'<'<'>'>>>	  ?>>> rl   c                R    i | ]$}|j                                         D ]\  }}||	%S r   )r   items)r   rg  r8  vs       rj   r   z7ForeachKernelSchedulerNode.__init__.<locals>.<dictcomp>  sW     @ @ @%:O:U:U:W:W@ @26!Q1@ @ @ @rl   r   combo_kernel)(r  r  r  r   r   r   rp   rj  r[   r  r\   ra   r#  r$   rL  rM  r   rN  r   r   r  r   r0  r   r  r   r  r   r  r   r  r   r#  Exprr   r  fxNoder  r  )rh   r[   r  r  r  r  r  r\   r  rp   foreach_node
other_noder  r  s   `            rj   r   z#ForeachKernelSchedulerNode.__init__D  s    +"5GGY/// 3 3 ,2 8 8D37D%di00 4466 3 3D.2D%d++3	3 'DN DKDI)+DJ  '22 ,k.EF        )/#68V        ")* # !+"79N!OPPDN +"79N!OPPDN%%'' D!+/IJJJJJ+6j!+/IJJJJJ+6j)3DNN!!*"6777 , 9D"6688 5 5*4!$''@ @"&+@ @ @D  *C&%%''v
> : :<>?
!%(-022.rl   rI  c                   d |D             }|r3t                               dt          |          d |D                        d |D             }d |D             }|r(t                               dt          |                     d |D             }d |D             r)t                               d	t                               fd
|D             }|S )Nc                <    g | ]}t          |t                    |S r   )r   r	  rH  s     rj   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>  s(    OOOj4M&N&NO!OOOrl   z/ComboKernels: %d external nodes are filtered %sc                N    g | ]"}|j         	|j                                         #S rf   r\   r  rh  s     rj   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>  s-    UUUTty?T&&((?T?T?Trl   c                J    g | ] }t          |t          t          f          |!S r   )r   r  r	  rH  s     rj   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>  s?     
 
 
a"8:S!TUU

 
 
rl   c                <    g | ]}t          |t                    |S r   r   r  rH  s     rj   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>  s8     
 
 
A7Q)R)R

 
 
rl   z+ComboKernels: %d foreach nodes are filteredc                <    g | ]}t          |t                    |S r   r  rH  s     rj   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>  s8     
 
 
Z;U-V-V

 
 
rl   c                :    g | ]}|                                 |S r   rN  rH  s     rj   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>  s%    GGGq}}G!GGGrl   z0ComboKernels: %d template nodes are filtered: %sc                    g | ]}|v|	S r   r   )r   r  r^  s     rj   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>  s#    OOOq7N7N!7N7N7Nrl   )r   r  r~   )r]  rI  externfiltered_nodesforeach_nodesr^  s        @rj   combinable_nodesz+ForeachKernelSchedulerNode.combinable_nodes  s=    POUOOO 	IIAFUUVUUU  

 

 
 


 
%
 
 
  	YIICSEWEWXXX
 
%
 
 
 HG^GGG 	IIBN##  
 POOO^OOOrl   list[list[BaseSchedulerNode]]c           
         |                                  }g }d|D ]@|                    fdt          dt                              D                        A|S )zS
        Returns a list of lists of nodes that are to be grouped together.
           c                *    g | ]}||z            S r   r   )r   rP  max_num_nodesrI  s     rj   r   zUForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels.<locals>.<listcomp>  s8        !a-//0  rl   r   )_topological_sort_nodesrK  ranger~   )r[   sorted_nodesgrouped_nodesr  rI  s      @@rj   &_default_group_nodes_for_combo_kernelszAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels  s     !88::! 	 	E      "1c%jj-@@      rl   4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelscustom_group_algorithmc                    | t           _        d S rf   r  r  )r  s    rj   %set_group_algorithm_for_combo_kernelsz@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernels  s    
 # 	#DDDrl   c                6    t                               |           S rf   r  r[   s    rj   group_nodes_for_combo_kernelsz8ForeachKernelSchedulerNode.group_nodes_for_combo_kernels  s     *KKIVVVrl   c                    t           rf   r  rq   s    rj   r=  z#ForeachKernelSchedulerNode.mark_run  r  rl   c                    t           rf   r  rq   s    rj   r  z"ForeachKernelSchedulerNode.codegen  r  rl   c                    dS r  r   rq   s    rj   r  z%ForeachKernelSchedulerNode.is_foreach  rC  rl   c                *    t          | j                  S )zeReturns a list of nodes which comprise the combo kernel.
        These nodes may be vertically fused.)r   r  rq   s    rj   get_subkernel_nodesz.ForeachKernelSchedulerNode.get_subkernel_nodes  s     DK   rl   r{  c                x    t          t          j                            d | j        D                                 S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c              3  >   K   | ]}|                                 V  d S rf   )ri  rH  s     rj   r!  z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>  s*      1U1UA!++--1U1U1U1U1U1Url   )r   r@  rA  r  r  rq   s    rj   ri  z$ForeachKernelSchedulerNode.get_nodes  s3     IO111U1U1U1U1UUUVVVrl   rd   c                @    | j         d                                         S rt  )r  re  rq   s    rj   re  z)ForeachKernelSchedulerNode.get_first_name  s    {1~,,...rl   r]  r^  c                z    t          | || j        j                   | j        D ]}|                    |           d S rf   )r`  r[   r   r  ra  )rh   r]  r\   s      rj   ra  z/ForeachKernelSchedulerNode.prune_redundant_deps  sO     	d$68RSSSK 	: 	:D%%&89999	: 	:rl   )r  rT   rc   r]   )r  rT   rc   r]   r  rT   r  rT   rc   r   )r  rT   r  rT   rc   r  )NNF)r[   rZ   r  rJ  r  r   r  r]   r  r]   r  r   rc   r   rI  rJ  rc   rJ  )r[   rZ   rc   r  )r  r  rc   r   r   r   rc   rJ  r]  r   r\  )ry   r   r   rB  r  r  r  r  r`  r   r  rb  r  r  r   r  r  r=  r  r  r  ri  re  ra  r  r  s   @rj   r  r    s         
      & ,
 ,
 ,
 [,
\ >
 >
 >
 [>
J 4837 %F/ F/ F/ F/ F/ F/ F/P    [B    \* 	/ & / / / / 
 
 
 \
 W W W \W
" " " "" " " "   ! ! ! !
W W W W
/ / / /: : : : : : : :rl   r  c                       e Zd ZU dZded<   ed d            Z	 d!d" fdZd#dZd$dZ	e
d%d            Zd%dZe
d&d            Zd'dZe
d(d            Zd)dZed*d            Z xZS )+rW  aC  
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be *grouped* together (it does not allow another node to be scheduled
    in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
    The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
    Fusion will still happen among the nodes within each GroupedSchedulerNode.
    At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
    rJ  r  rc   c                    |d         j         t          fd|D                       sJ  | |          }|D ]}|j        |                                <   |j        |                                <   |S )Nr   c              3  *   K   | ]}|j         u V  d S rf   r  )r   r\   r[   s     rj   r!  z.GroupedSchedulerNode.create.<locals>.<genexpr>  s*      BB44>Y.BBBBBBrl   )r[   rw  r]  rg   )r]  r  grouped_snoderg  r[   s       @rj   createzGroupedSchedulerNode.create  s    1I'	BBBB6BBBBBBBBIv.. 	K 	KE=JI()9)9::AN	$]%;%;%=%=>rl   Fr[   rZ   temp_groupingr   r   c                z    t                                          |           t          | ||           || _        d S rf   )r  r   rX  r  )rh   r[   r  r  r  s       rj   r   zGroupedSchedulerNode.__init__	  s?     	###i000 +rl   c                    | j         r| j        S | j        D ]#}|| j        j        |                                <   $| j        j        |                                 = | j                            | j                  S )z
        Do fusion among nodes within this GroupedSchedulerNode,
        and then unpack this GroupedSchedulerNode into regular nodes.
        )r  r  r[   r]  rg   
fuse_nodes)rh   rg  s     rj   unpackzGroupedSchedulerNode.unpack  ss    
  	;[ 	H 	HEBGDN-enn.>.>??N-dmmoo>~((555rl   fake_depr-   c                    |                      | j                            |                     | j                            |           d S rf   )r#  r   r(  r   rI  )rh   r  s     rj   r*  z!GroupedSchedulerNode.add_fake_dep%  sD    T-77AABBB##H-----rl   rd   c                J    d                     d | j        D                       S )Nrn  c                6    g | ]}|                                 S r   r   rH  s     rj   r   z1GroupedSchedulerNode.get_name.<locals>.<listcomp>+  rp  rl   rq  rq   s    rj   rg   zGroupedSchedulerNode.get_name)  rr  rl   c                @    | j         d                                         S rt  ru  rq   s    rj   re  z#GroupedSchedulerNode.get_first_name-  rv  rl   r3  c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r   ry  rH  s     rj   r   z9GroupedSchedulerNode.get_buffer_names.<locals>.<listcomp>2  rz  rl   r{  rq   s    rj   rn  z%GroupedSchedulerNode.get_buffer_names0  r|  rl   r}  c                l    g }| j         D ])}|                    |                                           *|S rf   r  r  s      rj   r   z GroupedSchedulerNode.get_outputs4  r  rl   r  c                    t          t          d d |                                 D                                 }t          |          dk    rd S t	          |          }|S )Nc              3     K   | ]@}|                                 s|                                *|                                V  Ad S rf   rc  rh  s     rj   r!  z6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>@  rd  rl   r   re  rg  s      rj   r  z#GroupedSchedulerNode.estimate_flops:  ri  rl   r{  c                    | j         S rf   r  rq   s    rj   ri  zGroupedSchedulerNode.get_nodesL  r  rl   r  rT   r  c                    dS r  r   )r]  r  r  s      rj   r  zGroupedSchedulerNode.can_fuseO  r  rl   )r  rJ  rc   rW  F)r[   rZ   r  rJ  r  r   rc   r   r  )r  r-   rc   r   r   r[  r  r`  r]  r  )ry   r   r   rB  r   r  r  r   r  r*  r@   rg   re  rn  r   r  ri  r  r  r  s   @rj   rW  rW    sg          $###   [ $	+ + + + + + +6 6 6 6. . . . = = = ]=) ) ) ) N N N ]N       ]"       [    rl   rW  r   stride_lengthslist[list[int]]r  r  priority_idxr  	list[int]c           
     :    t           j        d	 fd            }t          t          t	          t           d                                                 }t          |          dk    r fd|D              t          j        r|                    |           |S )
z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    rp  rm   brc   c                              dk    s         dk    r$t                    dk             dk              S  fdD             }fdD             }t          d t          ||          D                       }t          d t          ||          D                       }||k    rdS ||k    rdS t                     S )Nr    c                :    g | ]}t          |                   S r   abs)r   slrp  s     rj   r   z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>g  #    <<<rBqE

<<<rl   c                :    g | ]}t          |                   S r   r!  )r   r#  r  s     rj   r   z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>h  r$  rl   c              3  4   K   | ]\  }}|d k    p||k     V  dS r   Nr   r   sl_asl_bs      rj   r!  z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>l  D       
 
)3tDAI$
 
 
 
 
 
rl   c              3  4   K   | ]\  }}|d k    p||k     V  dS r'  r   r(  s      rj   r!  z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>o  r+  rl   r  )rA   r  r  )rp  r  stride_len_astride_len_ba_firstb_firstr  r  s   ``    rj   	index_cmpz"pick_loop_order.<locals>.index_cmp_  s   8q==E!HMMuQx1}eAh!m444 =<<<^<<<<<<<^<<<  
 
7:<7V7V
 
 
 
 
  
 
7:<7V7V
 
 
 
 
 W2W1 1ayyrl   r   c                     g | ]
}|         S r   r   )r   pir  s     rj   r   z#pick_loop_order.<locals>.<listcomp>}  s    DDD.,DDDrl   r  )rp  rm   r  rm   rc   rm   )		functools
cmp_to_keyr   r"  r  r~   r"   pick_loop_orderssort)r  r  r  r1  orders   ``   rj   pick_loop_orderr9  U  s           4 %N1$5 6 6778899E
<1DDDD|DDD "

y
!!!Lrl   c                  V    e Zd ZU ded<   dZded<   dZded<   dd	ZddZddZddZ	dS )NodeUser$Union[BaseSchedulerNode, OutputNode]r\   Fr   r  is_weakrc   rm   c                h    t          | j                                        | j        | j        f          S rf   )ro   r\   rg   r  r=  rq   s    rj   rr   zNodeUser.__hash__  s*    TY''))4+;T\JKKKrl   otherobjectc                    t          |t                    oI|                                 |                                k    o| j        |j        k    o| j        |j        k    S rf   )r   r;  rg   r  r=  rh   r?  s     rj   __eq__zNodeUser.__eq__  sY    uh'' .5>>#3#33. E$55. -		
rl   rd   c                4    | j                                         S rf   r   rq   s    rj   rg   zNodeUser.get_name  r   rl   c                ~    | j         |j         u sJ t          | j         | j        o|j        | j        o|j                  S rf   )r\   r;  r  r=  rB  s     rj   r   zNodeUser.merge  sH    yEJ&&&&I2!2L*U]
 
 	
rl   Nr   )r?  r@  rc   r   r   )r?  r;  rc   r;  )
ry   r   r   r   r  r=  rr   rC  rg   r   r   rl   rj   r;  r;    s         ....K GL L L L
 
 
 
$ $ $ $
 
 
 
 
 
rl   r;  r   c                     t           j        S rf   )r"   r  r   rl   rj   *used_non_deterministic_runtime_estimationsrG    s    33rl   c                      e Zd ZdZddZd fdZdd	Zedd            Zej	        dd            ZddZ
ddZddZddZddZddZddZddZddZddZdd Zdd!Zdd"Zdd#Zdd&Z	 ddd,Zdd0Zdd1Zdd3Zdd7Zdd8Zdd9Zddd;Z dd<Z!dd>Z"dd?Z#dd@Z$ddCZ%ddDZ&ddGZ'ddHZ(ddIZ)ddNZ*ddPZ+ddQZ,ddRZ-ddUZ.ddZZ/dd\Z0dd]Z1dd_Z2ddbZ3ddcZ4dddZ5ddeZ6ddhZ7ddjZ8ddkZ9ddlZ:ddpZ;	 dddsZ<dduZ=ddxZ>dd}Z?ddZ@ddZAddZBddZCddZDddZEddZFddZGddZHddZIddZJddZKddZLddZMddZNddZO xZPS )rZ   z
    A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
    optimizations such as fusion, reorder, and graph partition.
    rI  list[ir.Operation]rc   r   c                    t          d          5  |                     |           d d d            d S # 1 swxY w Y   d S )NzScheduler.__init__)r   _initrh   rI  s     rj   r   zScheduler.__init__  s    .// 	 	JJu	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   377c           
     Z    t                                                        t          j        _        i  _        t          t                     _        t          j
                     _        t                       _        t          g t          j        j                                        t          j        j                                        t          j        j                                                   _         fd|D              _        d  _                                           j                            t          j        j                                                    j        D ]}|                                 d  _                                          _        d  j        D              _        d  j        D              _         j                                         _        i  _        i  _         tC          j"         j         j         j                   _         #                                  $                     j                   _         %                                 d  j        D              _         &                                 tN          xj(        tS           j                  z  c_(        ddl*m+}m,}  | j                   tS           j                   _-         .                                  $                     j                   _        t          t^          t`          t`          f                               _1        td          j3        te          j3         j                   _         4                     j                   _        td          j5        te          j5         j                   _         6                                  7                                 td          j8        r@ts          ddd	          5   :                    d 
           d d d            n# 1 swxY w Y   td          j;        rddl<m;}  | j         j         j        t          t          j        j                                                  t          t          j        =                                                     _        td          j>        rtd          j;        sddl<m?}  | j         j                   t                      r"t          jB        rddl!mC}  | j                   ddlDmE}  |dd  fd           tC          jF         j                   _         G                                 t          jI        j2        jJ        rYt          jI        j2        jK        jL        r> M                     j                   _         N                     j                   _         O                                 t          jI        j2        jP        jQ        r R                                  | j                   t          jS        T                     j                    U                                 t                       _V        i  _W        t          d          Y                     fd           d S )Nc                :    g | ]}                     |          S r   )create_scheduler_noder   ru  rh   s     rj   r   z#Scheduler._init.<locals>.<listcomp>  s'    CCCd0033CCCrl   c                8    i | ]}|                                 |S r   r   rt  s     rj   r   z#Scheduler._init.<locals>.<dictcomp>  s/     ;
 ;
 ;
 !AJJLL!;
 ;
 ;
rl   c                f    i | ].}|                                 D ]}|                                |/S r   )r   rg   )r   r\   r   s      rj   r   z#Scheduler._init.<locals>.<dictcomp>  sO     8
 8
 8
$($BRBRBTBT8
 8
;>CLLNNC8
 8
 8
 8
rl   c                8    i | ]}|                                 |S r   r   rt  s     rj   r   z#Scheduler._init.<locals>.<dictcomp>  s"    "G"G"Gq1::<<"G"G"Grl   r   )log_ir_post_fusionlog_ir_pre_fusionz#Scheduler.create_combo_kernel_nodesTlog_pt2_compile_eventlog_waitcounter)num_ck_nodesr    )reorder_for_peak_memory)1assign_memory_planning_info_for_scheduler_buffers)6align_runtime_estimations_across_all_distributed_ranks)trace_structuredartifactc                     dddS )N#scheduler_nodes_before_comm_overlapstring)rp   encodingr   r   rl   rj   r   z!Scheduler._init.<locals>.<lambda>4	  s    A (% % rl   c                 f    d                     d t           j                  D                       S )Nz

c                z    g | ]8\  }}d | d|                                 z   d|                                 z   9S )zsnode[rv   z buffer_names:)r   rn  rO  s      rj   r   z5Scheduler._init.<locals>.<lambda>.<locals>.<listcomp>9	  sc        !Aq &++--(A1+=+=+?+?AAB  rl   )r  rR  rI  rq   s   rj   r   z!Scheduler._init.<locals>.<lambda>8	  s=    6;;  %.dj$9$9	  $ $ rl   )metadata_fn
payload_fngraph_statsc                 H     j          j        t           j                  dS )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr~   rI  rq   s   rj   r   z!Scheduler._init.<locals>.<lambda>\	  s&     3+/+>*-dj//  rl   )Zr  r   rP   r   r[   backendsrQ  _post_grad_graph_counterrl  r@  count_graph_partition_counterr   r  r  keys	constantstorchbind_constantsrO  rI  current_nodeupdate_zero_dim_cpu_tensorr  r1  default_device_contextget_donated_buffersr   r  r   copyr]  r4  r   r!   decide_global_ordering_of_commsrS   topological_sort_scheduledead_node_eliminationcompute_ancestorsr&   ir_nodes_pre_fusionr~   torch._inductor.debugrT  rU  rm  create_foreach_nodesro  rd   logged_slow_fusionr"   _pre_fusion_custom_passr
  _post_fusion_custom_passr  finalize_multi_template_bufferscombo_kernelsr   create_combo_kernel_nodesrZ  memoryget_output_names reorder_for_compute_comm_overlapr[  rG  r#   6runtime_estimations_align_across_all_distributed_ranksr\  torch._loggingr]  $reorder_compute_and_comm_for_overlapprocess_grouped_nodesr  r  graph_partitiontriton
cudagraphs&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usagetest_configstrack_memory_lifecycleinsert_memory_check_nodesr  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_row)
rh   rI  r\   rT  rU  rZ  r[  r\  r]  r  s
   `        rj   rK  zScheduler._init  s    <>"&'?"@"@(1(9(9%5?\\!&0%**,,"'')) ,1133'
 '
# DCCCUCCC
9='')))#**17+<+A+A+C+CDDDJ 	 	DOO ?C# $$&& 	#;
 ;
%)Z;
 ;
 ;
8
 8
,0J8
 8
 8
 AE@Q@V@V@X@X 35 13 :J#
 

 	!!###33DJ??
""$$$"G"GDJ"G"G"G   ##s4:6##OOOOOOOO$*%%%!$*oo!!###33DJ??
",U38_"="?"?)57
CCDJ__TZ00
*68DDDJ,,... 	B5&* $   B B
 ..D.AAAB B B B B B B B B B B B B B B ) 		77777700
 '17/44667717335566 DJ 2 #	P1 UUUUUUAAJ 0  
 ;<<S WS      GFtzRRR777777        CDJOODJ""$$$ O"2	W&-8	W DDTZPPDJJJ4:VVDJ!!!?!.E 	-**,,,4:&&&	dj))) 6@\\! :<''//   	
 	
 	
 	
 	
s   
P--P14P1!dict[str, SchedulerDonatedBuffer]c                    i }t           j        j        D ][}t          t           j        j        |         t          j                  r*t          | t           j        j        |         d           ||<   \|S )N)r^   )rP   r   graph_inputs_originalr   r%   DonatedBufferr   )rh   name_to_donated_bufrp   s      rj   rx  zScheduler.get_donated_buffersc	  sq     G1 	 	D!'7=r?OPP ,BG1$7 $- - -#D)
 #"rl   r   c                $    t           j        j        S rf   rP   r   current_devicerq   s    rj   r  zScheduler.current_devicen	  s    w%%rl   r  c                (    |t           j        _        d S rf   r  r  s     rj   r  zScheduler.current_devicer	  s    !'rl   c                    t           j                            dd          dk    rddlm}  || j        d           dS dS )z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r    )draw_buffersT)print_graph)osenvironr7  r  r  rI  )rh   r  s     rj   r  zScheduler.debug_draw_graphv	  sV    :>>:DAASHH++++++L666666 IHrl   labelrd   c                    t                               t          j                  r9t                               d|           | j        D ]}|                                 d S d S )Nz%s:)r   isEnabledForloggingINFOr  rI  r  )rh   r  r\   s      rj   debug_print_nodeszScheduler.debug_print_nodes}	  sh    GL)) 	#HHUE"""
 # #  """"	# 	## #rl   r\   r   rT   c                d   |                                 
J d            |                                rt          | |          S t          |t          j        t          j        f          rt          | |          S t          |t          j                  rt          | |          S t          |          )Nz2All nodes passed to scheduling must have an origin)r  is_no_opr  r   r%   r  r  rs  r  r	  r  r   s     rj   rO  zScheduler.create_scheduler_node	  s    !!--@ .-- ==?? 	,)$555r0"2CDEE 	, t,,,bo.. 	,,T4888%d+++rl   c                    t                      g } j                                        t          j        j                                        D ]~} fd|D             }|s                    |            fd|D             }t          j	        dk    }t           |d|          }|                    |           |D ]}| j        |<   fd j        D             t          |          z    _        d S )Nc                \    g | ](}|v t          j        |         t                    &|)S r   )r   r  r  )r   rp   kept_node_namesrh   s     rj   r   z2Scheduler.create_foreach_nodes.<locals>.<listcomp>	  sI       ?**"4#4T#:<RSS + ***rl   c                *    g | ]}j         |         S r   )r  r   rp   rh   s     rj   r   z2Scheduler.create_foreach_nodes.<locals>.<listcomp>	  s!    @@@$d'-@@@rl   r    Fr  r  c                @    g | ]}|                                 v|S r   r   )r   r\   removed_node_namess     rj   r   z2Scheduler.create_foreach_nodes.<locals>.<listcomp>	  s3     
 
 
4==??BT+T+TD+T+T+Trl   )r   r]  rr  rP   r   listsr   r  r"   combo_kernels_autotuner  r  rI  r   )	rh   fe_nodesnamesr  r  fe_noderp   r  r  s	   `      @@rj   r  zScheduler.create_foreach_nodes	  sQ   .8ll16688W]))++ 	8 	8E    !  E  %%e,,,@@@@%@@@F$;a?O0*/ /	  G OOG$$$ 8 807'--8
 
 
 
!Z
 
 
NN


rl   c                z   !"#$  G !fddt           t                             !t          j        !          # j        D ]}|                                D ]}|                                }t          |j        j	        t          j                  r&t          |                                          dk    r`|                                D ]k}|#v rJ|#v rF#|         }#|         }||z   }#                                D ]}#|         |u s
#|         |u r|#|<   P|#v r#|         #|<   `#|         #|<   ld)$ fd$	 	 d*d+#$fd}	i }
t          j        j                                        D ]{\  }}t          |t&          j                  r|j        D ]}d|
|<   /t          |t          j                  r2d |                                D             }|D ]}|j        D ]}d|
|<   |d} j        D ]r}|j        J t1          |j                                        d           }|D ];}t          |t&          j                  sJ d}||
vr|                                |
|<   <s j        D ]}t6                              d|j                   |r|j        J t1          |j                            d          d           }|D ]u}||
v sJ | d|
             |
|         x}V j        |                                         D ]6}|                    tA          |                                                     7vt          |j!        j"                  dk    rEtG          tI          |j!        j"                            x}rt          |tJ                    r|j&        }nd}|                                D ]@}t          |'                                          dk    sJ |'                                D ] } $|          } |	||           |                    tA          ||                     #|         j        D ]}|                                |                                k    r-t          |j        tP                    sJ |j        )                                D ]Q} $|          }|                    tU          ||                                                      |	||d           RB|j!        j+        D ]<}t          |tT                    s% |	|j,        ||-                    |                     =|.                     j/                   |                                D ]}|'                                D ]x}|                                 j/         $|          <   |                                 j/        |<    j0        1                    ||           j0        |                                <   yt          j        2                                D ]C}t6                              d|            |	|tg          tA          |                               D|rt          j        j4        D ]}|                    d          D ]}||
v s!J | d|
                                             |
|         x}rd j        |         )                                D ]D}t6                              d||            |	|tg          tA          |                               E j/        D ]}|t          j        j        v rK |	|tg          tA          |                               t          j        j5        6                    |           `|t          j        j7        v r& |	|tg          tA          |                               d tq          t          j        j                                                  D             ""fd t          j        j5        D             t          j        _9         j        D ]K}|                                D ]4}|:                    #|                                         j                   5L j;        D ]-} j;        |         :                    #|         j                   .ty                      }|=                    d!           #                                D ]^\  }}|>                                5  d" |j        D             }|=                    d#| d$| d%           ddd           n# 1 swxY w Y   _|=                    d&           |?                                @                                } t                              d'           t                              d(|            dS ),zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        c                  6    e Zd ZdZ	 	 ddd	ZddZd fdZdS )1Scheduler.compute_dependencies.<locals>.DedupListan  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a OrderedSet/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            Nr  Optional[list[_T]]
membershipOptional[OrderedSet[_T]]rc   r   c                B    |pg | _         |pt                      | _        d S rf   )r  r   r  )rh   r  r  s      rj   r   z:Scheduler.compute_dependencies.<locals>.DedupList.__init__	  s#    
 #[b
","<
rl   	node_userrV   c                    || j         v rd S | j                            |           | j                             |           d S rf   )r  r  r  rI  )rh   r  s     rj   r  z8Scheduler.compute_dependencies.<locals>.DedupList.append	  sF    //F
!!),,,##I.....rl   r?  DedupList[_T]c                     t          j         j        |j                  } j         fd|j        D             z   } ||          S )Nc                &    g | ]}|j         v|S r   )r  )r   r  rh   s     rj   r   zMScheduler.compute_dependencies.<locals>.DedupList.__add__.<locals>.<listcomp>	  s,     * * *at.F.FA.F.F.Frl   )r   rN  r  r  )rh   r?  new_membership	new_items	DedupLists   `   rj   __add__z9Scheduler.compute_dependencies.<locals>.DedupList.__add__	  sc    !+!1$/5CS!T!T J * * * *${* * * 	 !yN;;;rl   rA  )r  r  r  r  rc   r   )r  rV   rc   r   )r?  r  rc   r  )ry   r   r   rB  r   r  r  )r  s   rj   r  r  	  sr          -17;= = = = =/ / / /< < < < < < < <rl   r  r    ru  rd   rc   c                F    | j         v r j         |                    S | S rf   )r   )ru  r$  rh   s    rj   r$  z.Scheduler.compute_dependencies.<locals>.rename	  s.    D)))vd3A6777Hrl   Fused_by_namer  r<  r  r   r=  r   c                n     |                                         t          |||                     d S rf   )r  r;  )r  r  r  r=  name_to_usersr$  s       rj   add_userz0Scheduler.compute_dependencies.<locals>.add_user
  sE     &&../66K99    rl   Nc                F    g | ]}t          |t          j                  |S r   )r   r#  r  r   r  s     rj   r   z2Scheduler.compute_dependencies.<locals>.<listcomp>
  s)    SSS!Auz9R9RSASSSrl   c                    | j         S rf   r  rl  s    rj   r   z0Scheduler.compute_dependencies.<locals>.<lambda>#
  s    AF rl   r  Tzscheduling %s)unbacked_onlyc                    | j         S rf   r  rl  s    rj   r   z0Scheduler.compute_dependencies.<locals>.<lambda>6
  s    !& rl   z not in )r7  )mutating_buf)r=  zscheduling output %sz+scheduling output %s for unbacked symint %sc                    i | ]\  }}||	S r   r   )r   r1  rp   s      rj   r   z2Scheduler.compute_dependencies.<locals>.<dictcomp>
  s+     
 
 
'E4D%
 
 
rl   c                     g | ]
}|         S r   r   )r   rp   	inp_namess     rj   r   z2Scheduler.compute_dependencies.<locals>.<listcomp>
  s*     &
 &
 &
 $IdO&
 &
 &
rl   r  c                6    g | ]}|                                 S r   r   )r   r  s     rj   r   z2Scheduler.compute_dependencies.<locals>.<listcomp>
  rp  rl   'z': ru   r  zBUFFER USER LIST
z===== AFTER SCHEDULING =====
%s)ru  rd   rc   rd   )FF)
r  rd   r  r<  r  r   r=  r   rc   r   )Br   rV   r  r   rI  r   rg   r   r\   rz   r%   r8   r~   r{   rr  rP   r   r  r  r#  r  r   	TensorBoxr	  r  get_unbacked_symbol_defsSymbolr   r  get_free_symbol_usesr  r*  r/   r   r   rQ  r0  r.   r7  r}   rT   rn  r0   r   rp   r  r&  r   r4  r7  r  r   graph_outputsmutated_inputsrI  rs  rR  mutated_input_idxsr   r   rG   r   r   r   r   compute_dependencies_log)%rh   r\   buf1	buf1_name	buf2_namelist1list2combinedr  r  unbacked_symbol_to_origin_noderp   valfssym_sizer  has_non_input_unbacked_defsunbacked_symbol_defsunbacked_symbol_usesr  r   r   	node_modealt_namer   
other_namer  r  r   logbufr$  ra   rd   r  r  r  r$  s%   `                                @@@@rj   rS   zScheduler.compute_dependencies	  s   	< 	< 	< 	< 	< 	< 	< 	< 	< 	<> @K?V@
 @
 J 	L 	LD((** L L MMOO	 ty/??D,,..//!33!%!1!1!3!3 L LI M11i=6P6P -i 8 -i 8#(5=#0#5#5#7#7 > >C -c 2e ; ;#0#5#>#>5=c 2> #m333@3Ki003@3Ki00LL8	 	 	 	 	 	 	 !&!		 	 	 	 	 	 	 	 MO&
 -3355 
	B 
	BID##uz** 	B* > >B9=2266>C.. B TSs||~~SSS! B BAn B B=A6r::B ',#J 	H 	HD9((( $*	2244:J:J$ $ $  * H H!!U\22222 /3+:::8<215H J @	 @	DIIoty111* Gy,,,'-I222FF((( ( ($
 . G GA >>>>FF&DFF ?>> <A>>K#'#4Q#7#C#C#E#E G GC --gcllnn.E.EFFFF D$+,,11 d&6&=!>!>???S 2sI.. 2  H		 	 '')) E E3,,..//14444 # 1 1 3 3 E EH%vh//HHXt,,,%%ghY&G&G&GHHH -h 7 = E E==??dmmoo==$)$)5FGGGGG*.)*D*D*F*F E EJ)/
););J -- '
 P P P   %HZtDDDDDEEE& (. F F!$00 FHTYd.>.>t.D.DEEE%%d&;<<< ''))   # 1 1 3 3  H>AllnnD)&&*:*:;69llnnD)(3/33HhGG +CLLNN;; 0022 	> 	>HII,h777HXz'(*;*;<<==== ' 	Nw, N N111EE N NA >>>>MM&D&I&I&K&KMM ?>> ;1==q N(,(9!(<(M(M(O(O N NHII M ( !  
 %HXz'(:K:K/L/LMMMMN ) 	: 	:Dqw+++z'$--88999&**40000***z'$--88999
 
+4QW5I5N5N5P5P+Q+Q
 
 
	&
 &
 &
 &
()(>&
 &
 &
"
 J 	C 	CD'')) C CmCLLNN;ABBBBC / 	S 	SD'-77d8K8QRRRR  !!c'--// 	4 	4JC 4 4;;u{;;;2#22%2223334 4 4 4 4 4 4 4 4 4 4 4 4 4 4 	c  ""))++ &&';<<< &&'I3OOOOOs   .d??e	e	c           
         ddl m}m}m}m} t          t          j        j        	                                          } | j
        |          }t          j        j        j        s | j
         j                   t          t          j                                                  } | j
        ||          \  }}	}	d t#          t%           j
                            D             |D ]~}
|
j        dk    r|
j        dk    r|
j                                        }|
j                 d                             |           |
j                 d                             |           ddlm}  |             d fd}g }t9           j
                  D ]S\  }}|                    |           |                     |||t%           j
                  dz
  k                         T| _
        d S )Nr    )r[  compute_memory_timelineFreeableInputBufferget_freeable_input_bufc                    g | ]}g g fS r   r   )r   rn  s     rj   r   z7Scheduler.insert_memory_check_nodes.<locals>.<listcomp>
  s/     C
 C
 C
RHC
 C
 C
rl   r   )register_check_mem_opstep_idxrm   is_final_stepr   rc   r	  c                Z   |          d         }|          d         }|||g}t          j        t          t          j        d                    t          j        j        j        j        g |d           }dj	        |          
                                 |_        t          |          S )Nr   r    r  )r  c                6    | |d         |d         |d         dfS )Nr   r    r:  )alivedeadr  r   )tensor_argsrs  s     rj   r   zWScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node.<locals>.<lambda>
  s/    !.q!1 -a 0)6q)9 C rl   )rz   r   r  nontensor_argsunflatten_args
mem_check_)r%   MemoryCheckKernelr8   r  r  rz  _inductor_debugcheck_memory_stepdefaultrI  rg   operation_namer	  )r   r  expected_newly_aliveexpected_newly_deadr  r\   rh   step_allocs_deallocss         rj   construct_mem_check_nodezEScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node
  s     $8#A!#D "6x"@"C24GWN'!e)<)<===y0BJ-     D #Qtz(/C/L/L/N/N"P"PD,T4888rl   )r  )r   rm   r  r   rc   r	  )r  r[  r  r  r  r   rP   r   r  rr  rI  r  r  r"   rZ  r   r  r  r~   
size_alloc	size_freer  rg   
start_stepr  end_step#torch._inductor.runtime.debug_utilsr  rR  )rh   r[  r  r  r  r  name_to_freeable_input_bufr  buf_info_listrn  buf_infor  r  r  	new_nodesrP  r\   r  s   `                @rj   r  z#Scheduler.insert_memory_check_nodes
  sr   	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 )3173G3L3L3N3N(O(O""4:|<< 	# %= 	==
D,   *4AG4L4L4N4N)O)O55J&
 
q!C
 C
#C
OO44C
 C
 C
 & 	H 	HH"a''H,>!,C,C//11H !45a8??III !23A6==hGGGGMMMMMM	9 	9 	9 	9 	9 	9 	92 	 ,, 	 	GAtT"""((1DJRS@S;SUUU    


rl   c                   g }t          | j                  D ]ddd}                                D ]}t          fd|j        D                       }|rdt
                              d|                                           t          j	        j
                            |                                           d	}                                 o| }|s|                               t
                              d
                                           t          j	        j                                                                       j        j        D ]J}|j        | j        v r:| j        |j                 j        }fd|D             | j        |j                 _        Kt'          t          |                    | _        | j        D ]                                 dS )z0
        Remove any nodes without users
        r   r;  rc   r   c                Z    | j         p$|                                 t          j        j        v S rf   )r=  rg   rP   r   rT  )r   s    rj   can_eliminate_userz;Scheduler.dead_node_elimination.<locals>.can_eliminate_user   s!    |Tt}}!':T'TTrl   Fc              3  .   K   | ]} |          V  d S rf   r   )r   ur  s     rj   r!  z2Scheduler.dead_node_elimination.<locals>.<genexpr>  s/      #M#Ma$6$6q$9$9#M#M#M#M#M#Mrl   zremoved dead buffer: %sTzremoved dead operation: %sc                r    g | ]3}|j                                                                         k    1|4S r   r   )r   r   r\   s     rj   r   z3Scheduler.dead_node_elimination.<locals>.<listcomp>  s>     = = ="#0A0AT]]__0T0TA0T0T0Trl   N)r   r;  rc   r   )r"  rI  r   rw  ra   r   r  rg   rP   r   r  rI  r  r  rT  r   r   rp   r   r   r\  )	rh   updated_nodesactive_buffersr   can_eliminater  ra   r  r\   s	          @@rj   r|  zScheduler.dead_node_elimination
  s
    TZ(( 	 	DU U U U #N'')) * * ##M#M#M#M39#M#M#M M M  *II7HHHG+//????%)NN $ 5 5 7 77N<NM  $$T**** 		6HHH*..t}}??? ,2  DyD$444 $ 0 ; A= = = =',= = =(39 (=1122
 J 	# 	#D  """"	# 	#rl   rJ  c                    t          t                               t                      g dfd|D ]}|                                D ]}||<   |D ]} |           S )z?
        Ensure nodes is in topologically sorted order
        ru  rT   rc   r   c                    | vrf                     |            t          | j        d           D ]"}|j        vr |j                            #                    |            d S d S )Nc                    | j         S rf   r  )ds    rj   r   zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>-  s    af rl   r  )rI  r  r   rp   r  )ru  r   r  r   seenvisits     rj   r*  z2Scheduler.topological_sort_schedule.<locals>.visit*  s    }}!!"6<L<LMMM 2 2Cx|33 E,sx01111a      }rl   )ru  rT   rc   r   )r   rT   r  rn  )rh   rI  r\   rp   r  r   r)  r*  s       @@@@rj   r{  z#Scheduler.topological_sort_schedule   s     +,..59VV*,	! 	! 	! 	! 	! 	! 	! 	! 	!  	* 	*D--// * *%)T""* 	 	DE$KKKKrl   rg  c                f    t                      }t          |t          t          t          t
          f          r%|j        D ]}|                    |j                   n t          dt          |           d           fd|D             }t          t           fd|D                                 S )Nz+get_unmet_dep_nodes is not implemented for .c              3  V   K   | ]#}j         |                                         V  $d S rf   )r   rk   rP  s     rj   r!  z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>L  s7      XXc)#.??AAXXXXXXrl   c              3  2   K   | ]}j         |         V  d S rf   r]  rP  s     rj   r!  z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>M  s+      QQat6q9QQQQQQrl   )r   r   rs  r	  r  r  r   rI  rp   RuntimeErrorrx   r   )rh   rg  
unmet_depsr   unmet_dep_opss   `    rj   _get_unmet_dep_nodeszScheduler._get_unmet_dep_nodes;  s    &0ll
)&"	
 
 	 / ) )sx(((() Ld5kkLLL   YXXXZXXXJQQQQ=QQQQQRRRrl   r  c                b   g }t                               | j        d          }i }| j        D ]^}|                     |          }t	          |          ||<   |D ]2}|                    |g           }|                    |           |||<   3_d |                                D             }|rx|                    |           |D ]@}	|                    |	g           D ]}
||
xx         dz  cc<   |                    |	           Ad |                                D             }|x|r
J d            |S )zU
        Sort nodes by their topological order, return a list of node lists.
        r   c                $    g | ]\  }}|d k    |S r   r   r   ru  r  s      rj   r   z5Scheduler._topological_sort_nodes.<locals>.<listcomp>^  s!    @@@1a!rl   r    c                $    g | ]\  }}|d k    |S r6  r   r7  s      rj   r   z5Scheduler._topological_sort_nodes.<locals>.<listcomp>e  s!    DDDDAqQ!VVaVVVrl   zTopological sort failed!)	r  fromkeysrI  r3  r~   r7  r  r  rH  )rh   r8  rI  childrenr\   rL  r   czero_deg_nodesru  r   s              rj   r  z!Scheduler._topological_sort_nodesO  sf    dj!,,#%J 	" 	"D,,T22Dd))E$K " "LLb)) !"
 A@@@@ 	ELL(((#  $LLB// % %D$KKK1$KKKK		!DDEKKMMDDDN  	E 44444yrl   c                b   i }| j         D ]|}t                      }|j        D ]F}| j        |j                                                 }|                    |           |||         z  }G|||                                <   ||_        }t          | j                   D ]\  }}||_
        ||_        dS )z.
        Populate each node.ancestors
        N)rI  r   r   r   rp   rk   rI  rg   r   rR  r   r   )rh   name_to_ancestorsr\   r   r   dep_node_namer8  s          rj   r}  zScheduler.compute_ancestorsi  s    
 9;J 	' 	'D)3I. > > $ 0 : K K M Mm,,,.}==		1:dmmoo.&DNN$TZ00 	# 	#KE4"DN"DNN	# 	#rl   c                b   t           j        sd S | j        D ]}t          |t          t
          f          r$|                                st           j        dk    rC|                                D ]@}t          |t                    r|	                                r,|
                                 Ad S )Nhalide)r"   r  rI  r   rs  r  rJ   cpu_backendri  r  r  )rh   r\   rg  s      rj   r  zScheduler.merge_loops|  s    0 	FJ 	$ 	$D d]4F$GHH KKMM&,&8H&D&D)) $ $!%77 5;L;L;N;N !!####$	$ 	$rl   c                   t          ddd          5  t          d          D ]}t          |          }t                              d|dz   |           |                     |          }t          |          }t                              d|dz   ||           ||k    s|dk    r t                              d|dz               n|cd	d	d	           S # 1 swxY w Y   d	S )
zB
        Combine eligible nodes into FusedSchedulerNodes.
        zScheduler.fused_nodesTrV  
   z/===== attempting fusion (%d/10): %d nodes =====r    z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)r   r  r~   r  r  fuse_nodes_once)rh   rI  rP  old_lennew_lens        rj   r
  zScheduler.fuse_nodes  s[    #4QU
 
 
 	 	 2YY  e**  EE  
 ,,U33e**  TE	   g%%A$$Eq1u   E	 *6
 /	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   B2CCCc                    g }| j         D ]A}|                    t          |t                    r|                                n|g           B|| _         dS )zA
        Unpack GroupedSchedulerNode into regular nodes.
        N)rI  rK  r   rW  r  )rh   r  r\   s      rj   r  zScheduler.process_grouped_nodes  se     .0	J 	 	D!+D2F!G!GSdV    


rl   r{  tuple[float, str]c                   t          |          dk    sJ |d                                         }|| _        |                     |          }t	          ddd          5  |                    |          cddd           S # 1 swxY w Y   dS )
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   benchmark_fused_nodesTcompile_time_autotune_time_us)rW  dynamo_compile_column_usN)r~   r   r  r  r   rL  )rh   rI  r  backends       rj   rL  zScheduler.benchmark_fused_nodes  s     5zzA~~~~q$$&&$""6**#"&%D
 
 
 	8 	8
 0077	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8s   B  BBNbenchmark_kernelr   hint_overrideOptional[int]c                   t          |          dk    sJ |d                                         }|| _        |                     |          }t	          d          5  |                    |||          cddd           S # 1 swxY w Y   dS )rK  r   rL  rQ  N)r~   r   r  r  r   generate_kernel_code_from_nodes)rh   rI  rP  rQ  r  rO  s         rj   rU  z)Scheduler.generate_kernel_code_from_nodes  s     5zzA~~~~q$$&&$""6**122 	 	::'} ;  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   B  BBmoduler   r  c                    || _         |                     |          }t          d          5  |                    |          cddd           S # 1 swxY w Y   dS )rK  rL  N)r  r  r   benchmark_codegened_module)rh   rV  r  rO  s       rj   rX  z$Scheduler.benchmark_codegened_module  s     %""6**122 	> 	>55f==	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>s   AAAc                   dd}t          | j                  D ]U\  }}t          |t                    r9t          |j        t
          j                  r|j        }t          j        j	        s|
                                \  }}n+t          d |                                D                       }t          |t          j        j        j                  rt          j        ri }||d	<   t          j        D ]e}|                    |
          }	d |	                                D             }
t%          |
                                d           d         }|||<   f|j                            |           n|j                            |           w|                                }|j        }t          |t
          j                  sJ |j        }t          |t
          j                  sJ |j        |_         |||           |                     |          }|| j        |<   || j        |                                <   || j        |                                <   i t=          j        |j         j!        |j"                  D ].}| j#        $                    |j%        d	          x}r
|j%        |<   /dfd} ||j"                  |_"         ||j         j!                  |j         _!        tM          |'                                |'                                          D ]-\  }}|| j(        |                                <   |j)        |_)        .|j*        |_*        |j+        |_+        |j,        |_,        Wd	S )a  
        Finalize a backing choice for MultiTemplateBuffers which did not already have a
        choice finalized through fusion. In the case of an extern choice, this will result
        in replacing the SchedulerNode.

        If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
        will force completion of compilation and benchmarking.
        	orig_nodeir.MultiTemplateBufferr  ir.OperationBufferrc   r   c                z   |                                 }|                                  }t          |t                    rt          |t                    sJ |                                }|                                 }t          |t                    rt          |t                    sJ t          j        j        |= ||_        t          j        j        |= ||_	        t          j        j
                            |           }t          j        j
                            |           |t          j        j
        |<   |t          j        j        |<   t          j        j                            |           }t          j        j                            |           |t          j        j        |<   |t          j        j        |<   d S rf   )rg   r   rd   rc  rP   r   rJ  rp   
name_to_opr  buffersr1  remove
operations)rZ  r  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          rj   replace_operation_bufferzKScheduler.finalize_multi_template_buffers.<locals>.replace_operation_buffer  sd    !) 1 1 3 3%..00MmS11XjARTW6X6XXXX'::<<$7799LlC00VZ@PRU5V5VVVV&'89)HM"#34&2H#7?((33DGO""8,,,$,AGOD!4<AG"=17%++I66DG%%h///'/AGt$/7AG|,,,rl   c              3  b   K   | ]*}t          |t          j        j        j                  &|V  +d S rf   )r   r  r  select_algorithmExternKernelCaller)r   timings     rj   r!  z<Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>  sT         &) & % @ S   "     rl   NrT  c                D    i | ]\  }}t          |t                    ||S r   )r   r   )r   r8  r  s      rj   r   z=Scheduler.finalize_multi_template_buffers.<locals>.<dictcomp>2  s?     . . .$(Aq#-a1I#J#J. !1. . .rl   c                    | d         S r  r   rl  s    rj   r   z;Scheduler.finalize_multi_template_buffers.<locals>.<lambda>7  s    qQRt rl   r  r   rL  r   c                :    t          fd| D                       S )Nc              3  B   K   | ]}|                               V  d S rf   )r$  )r   r   r   s     rj   r!  zQScheduler.finalize_multi_template_buffers.<locals>.rename_deps.<locals>.<genexpr>V  s0      %S%Sscjj1A&B&B%S%S%S%S%S%Srl   r   )rL  r   s    rj   rename_depsz>Scheduler.finalize_multi_template_buffers.<locals>.rename_depsU  s&    %%S%S%S%Sd%S%S%SSSSrl   )rZ  r[  r  r\  rc   r   )rL  r   rc   r   )-rR  rI  r   rs  r\   r%   MultiTemplateBufferr"   r  %force_extern_kernel_in_multi_templateget_min_choicerQ  choice_timingsr  r  r   multi_kernel_hintsr  r  finalize_as_triton_callersfinalize_as_triton_calleroutput_noder  
StorageBoxOperationBufferrz   rO  r  rg   r]  r@  rA  r   r   r   r4  r7  rp   r  r   r   ra   r   r   r   )rh   rg  rP  r\   
multi_nodemin_node_unfusedrn  callershinttimingstriton_timingschoiceout_tensorboxout_storage
out_buffernew_scheduler_noder   	real_namerp  new_outold_outr   s                        @rj   r  z)Scheduler.finalize_multi_template_buffers  s   	8 	8 	8 	86 !,, R	@ R	@GAt$.. Q@:	214 4 Q@ "Y
*P *4*C*C*E*E'$aa'+ *4*C*C*E*E  	( 	($ $O&?   0 NQS(8$*$= 3 3D&0&?&?d&?&S&SG. .,3MMOO. . .N
 &))=)=)?)?^^%T%T%TUV%WF,2GDMM	<<WEEEE	;;<LMMM 0 < < > >+0!+r}=====(-
!*b.@AAAAA$.$5
!((Z@@@%)%?%?
%K%K" 2
15G!$--//2;M'8 $& $?$*D,C  ? ?C %)$;$?$?$$O$OOy ?69h(3T T T T T T 9D&99 9"5 8C{&288 8".4 ),&2244d6F6F6H6H) ) 2 2$GW <CD$W%5%5%7%78$+MGMM/3~",/3~",04"-eR	@ R	@rl   	node_listc                4    t          d |D                       S )Nc              3     K   | ]Q}t          |j        d           o7|j        duo.t          |j        j        d          o|j        j        j        dk    V  RdS )r  Nscatter_moder8  )r   r\   r  r  rt  s     rj   r!  z,Scheduler._any_atomic_add.<locals>.<genexpr>j  s       
 

 	 AFF## 9d"9^449 (L8	
 
 
 
 
 
rl   )r-  rh   r  s     rj   _any_atomic_addzScheduler._any_atomic_addi  s2     
 

 
 
 
 
 
 	
rl   r  r  Union[bool, Callable[[], bool]]c           
     D    !"#$% t          d fD                       }t          j        s|sdS                                 r,t	                                          t          j                  r(                                s                                rdS 	                                }|d         
                                sJ j        dk    rdS 	                                }t          t          j        ||                    }                     |          rdS ddlm t%                    %|d         
                                J d"fd!t&          j        j                                        	 d#d$ fd}|rt          d fD                       r                                dur                                n                                $t	          $t          j                  sJ i  g t          j        D ]}$                    |          }	t5          |	                                d           D ]\  }
}t	          |
t&          j        j        j                  s*$                    |
          5                      |
g |||
j                   R            ddd           n# 1 swxY w Y   tC          d          }d}i }D ]\  }
}}	 ||"                                 nh# tF          $ r[}tH          %                    tL          j'                  r-tH          (                    dsdndtS          |                     Y d}~~d}~ww xY w$                    |
          5   *                    |          \  }}|||
<   ||k     r|}|
}ddd           n# 1 swxY w Y   |$j+        |<   t	          |tX                    sJ | |<   ։$                                }	$-                                \  }"r .                    |          n .                    |          \  #}g d}t5          |	                                t_          j0        d                    D ]\  }
}t	          |
t&          j        j        j,                  s*s!tc          |
d          r|
j2        $j2        k    rM|"#z   k    r ng|dz  }|t          j3        k    r nP$                    |
          5                      |
g ||          R            ddd           n# 1 swxY w Y   ti                    dk    rdS d% !"#$ f	d }|S  ||           ||           ||          d%! %fd!}|S )&
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        c              3     K   | ]D}|                                 o+t          |                                t          j                  V  Ed S rf   )r  r   rB  r%   rq  rt  s     rj   r!  z.Scheduler.speedup_by_fusion.<locals>.<genexpr>z  sb        
  
  MMOO J1..00"2HII 
  
  
  
  
  
rl   Tr   r  CompilationErrorNms_fusedr  ms1ms2rc   r   c           
        t                               t          j                  r| ||z   k     rXt                               d                                                                t          ||z   | z  d                     d S t                               d                                                                t          | ||z   z  d                     d S d S )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r  r  r  DEBUGr  rn  r<   r=   )r  r  r  r  r  s      rj   
log_fusionz/Scheduler.speedup_by_fusion.<locals>.log_fusion  s    &&w}55 cCi''$$S..00..00"sSyH&<#B#BCC	     $$W..00..00 Hc	$:!@!@AA	     rl   rI  r{  rQ  rR  )tuple[Optional[LambdaFuture], ModuleType]c                                         | d|          }t          j        |          }                                sd }n.                    d|          }t          |t                    sJ ||fS )NT)rP  rQ  triton_)kernel_namesource_code)rU  r   loaduse_process_poolr  r   r   )rI  rQ  src_codemodfutasync_compilerh   s        rj   compile_kernelz3Scheduler.speedup_by_fusion.<locals>.compile_kernel  s     ;;M <  H "8,,C 1133 5#**yh*WW!#|44444:rl   c              3  B   K   | ]}|                                 d uV  d S rf   rF  rt  s     rj   r!  z.Scheduler.speedup_by_fusion.<locals>.<genexpr>  sD       %
 %
23A!!-%
 %
 %
 %
 %
 %
rl   c                    | d         S r  r   rl  s    rj   r   z-Scheduler.speedup_by_fusion.<locals>.<lambda>  s
    !A$ rl   r  rT  infException in compiling %s: %srT  rV  r    allowed_prologue_inpsFr   c            	       	 t          d          } d }i }D ]\  }}}	 ||                                 nh# t          $ r[}t                              t
          j                  r-t                              d
sdndt          |                     Y d }~~d }~ww xY w	                    |          5  
                    |	          \  }}|||<   || k     r|} |}d d d            n# 1 swxY w Y    |            | z   k     rJ|Ht          j        r|d <                                  n                    |           |j        d <   dS dS )Nr  r  rT  rV  TF)r  r   r   r  r  r  r  r  rd   swap_as_triton_callerrX  r"   ru  rv  rw  _choice_timings)min_ms_fusedms_fused_choicenew_timingsr  future	mod_fusedr5  r  pathr  epilogue_fusionfuture_choices hint_override_best_fusion_choicer  r  r  r{  rh   s            rj   benchmark_when_readyz9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready-  s   $U||"& 1? 5 5-FFI!!-"MMOOO % ! ! !%227=AA &,, ?2A Q

z #A  
 !! $99&AA 5 5)-)H)H%v* *$ /7F+#l22+3L.4O5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 
<c22239--/2M0 NAP8>"==<    #<<_MMM7BJ.t44 5s'   5
BABB2)C''C+	.C+	c                 T   ddl m}  	 d         d         d         fD ]}||                                                     d         
          \  t	          j                  r d           dS                     d         
          \  t	          j                  r d           dS                     d         
          \  t	          j                  r d           dS             t          d          rZz   k    rQfj        vrFj                            f           t          d          
                    fd	           z   k     S # | $ r Y dS 	$ r}d
t          |          v rY d }~dS  d }~ww xY w)Nr   )NoTritonConfigsErrorr    z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc            	     $      z   z  dS )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior   )r  r  r  path1path2
path_fuseds   rj   r   zKScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>  s-    053605365?8@3;sSy3I% % rl   Loop-carried variableT))torch._inductor.runtime.triton_heuristicsr  r   rX  mathisinfr   r  rI  r   r  rd   )r  r  r5  r  r  r  r  r  r  r  r  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2r  rh   r  s      @@@@@@rj   r  z9Scheduler.speedup_by_fusion.<locals>.benchmark_when_readyc  si        ; *!,)!,/2  ) )
 ?JJLLL!%!@!@)!,f" "JC z# %CDDD$u!%!@!@)!,f" "JC z# %DEEE$u+/+J+J/2F, ,(Hj z(++ %CDDD$uJxc222 0>>$c	11"EN$2III/33UENCCC(77??        
 
 
 $cCi//+ ! ! ! 55'   .#a&&88#ttttts7   A.E> ?>E> ?>E> ?A>E> >F'F'
F"!F""F')r  r  r  r  r  r  rc   r   rf   )rI  r{  rQ  rR  rc   r  r   )5r-  r"   benchmark_fusionr  r   rB  r%   TritonTemplateBufferr  ri  r   rx   r   r@  rA  r  triton.compiler.errorsr  r  r  r  r  AsyncCompilerq  ru  rt  r  r  ri  TritonTemplateCallerr  r  rQ  r  r   r   r  r  r  r  r  rd   rX  r  r   rs  rL  operator
itemgetterr   r   max_epilogue_benchmarked_choicesr~   )&rh   r  r  is_multi_templatenode_list_1node_list_2node_list_fusedr  rQ  rt  r  unfused_timer  r  r  r  r  r5  r  r  rn  r  triton_choicesr  r  r  r  r  r  r  r  r  r  r  r  r  r{  r  s&   ```                     @@@@@@@@@@@@@@rj   speedup_by_fusionzScheduler.speedup_by_fusionr  sD       
  
 U^ 
  
  
 
 

 & 	/@ 	4 	u6688":QRR	 !!	 !!		 4oo''Q**,,v ;%4oo''y{KHHII
 00 	4;;;;;;u%% #..00!!!	 	 	 	 	 	 	" 5BBDD PT	 	 	 	 	 	 	 	  `	( %
 %
8=u~%
 %
 %
 "
 "
 `	( $5577tCO #/''))),,.. 
 j"*@AAAAA  - TVN!'!: ,R ,R!+!:!:=!I!I,2"((**- - -  (FL & @ U  ! !#99&AA  &-- &!/$36CW"" "" ""                   %U||FJ 1? 5 5-FFI
!!-"MMOOO$ ! ! !%227=AA &,, ?2A Q

z #A  
 !! $99&AA 5 5)-)H)H%v* *$ /7F+#l22+3L.4O5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 =H
*=9!/3KLLLLLBQ0?? (6688N..00FAs #=**;777//<< C TVNN(.$$&&H,?,B,B) ) ) V V$ "&%/*<*UVV  ((?@@ 4
8XXX39,,E!#!F$KKKE55f== V V"))6*TNN?4S4S*T*TUUUV V V V V V V V V V V V V V V >""a''u,! ,! ,! ,! ,! ,! ,! ,! ,! ,! ,! ,! ,! ,!\ (' !/{ ; ; .{ ; ;&4n_&E&E#@ @ @ @ @ @ @ @ @ @ @ @ @D ('sO    *KKK>L
M:AM55M:)OOO#UU	U	c                @    | j         |                                         S )z0Look up the node in Scheduler name_to_fused_node)r]  re  r   s     rj   r  zScheduler.get_fused_node  s    &t':':'<'<==rl   c                l    t          |          t                              t          j                  rLt                              d           D ]/}t                              d|                                           0i d fdd fd	}                     |          D ]\  }} |||                                |          }                     |          } 	                    ||          r[ 
                    ||          sE                     ||          }t          |          r|||f|<   |||f|<   |s ||           t                      }                                D ]~\  }}	}
||v r|                    |                                |	          |	u sJ                      |
          |
u sJ  |            r" 
                    |	|
          s |	|
           t          d
           }                     |          }                     |           |S )a  
        Combine eligible nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfuse_nodes_once, candidates:z  %sr  rT   r  rc   c                &   t                               d|                                 |                                           |                                 }|                                |k    sJ                     |                              | |                              |                                |                                          j        	                    fd
                                D                        S )Nzfusing %s with %sc                :    i | ]}|                                 S r   r   )r   ru  node3s     rj   r   zEScheduler.fuse_nodes_once.<locals>.fuse_two_nodes.<locals>.<dictcomp>  s#    @@@u@@@rl   )r  r  rg   r   r  r`  r`  rI  r]  r  ri  )r  r  r  r  r  rh   s      @rj   fuse_two_nodesz1Scheduler.fuse_nodes_once.<locals>.fuse_two_nodes  s    0%..2B2BENNDTDTUUU%%''F##%%////$$V,,11%??Eu%%%u%%%OOE"""#**@@@@eoo.?.?@@@   Lrl   r   c                                        |           v s                     |          v r                                         |                                                    |          d                     }|J |\  }}}                    |d                                |d                                 |          |u sJ                      |          |u sJ  |            r                    | |          r	 ||                                |           v                      |          v d S d S rf   )r  r7  rH  will_fusion_create_cycle)	r  r  pending_fusion
is_speedup	node_key1	node_key2r  pending_fusionsrh   s	         rj   resolve_pending_fusionsz:Scheduler.fuse_nodes_once.<locals>.resolve_pending_fusions  s|    ##E**o==&&u--@@!0!4!4''..#''(;(;E(B(BDII" " &1113A0
Iy##It444##It444**955BBBB**955BBBB!z|| t'D'DUE'R'R y)444' ##E**o==&&u--@@@@@@rl   c                    | j         S rf   rS  rl  s    rj   r   z+Scheduler.fuse_nodes_once.<locals>.<lambda>  s    !+ rl   r  )r  rT   r  rT   rc   rT   r  )r   r  r  r  r  r  r  get_possible_fusionsr  r  r  r  callabler   rI  r  r{  ra  )rh   rI  r\   r  r  r  speedupseen_pair_speedup_fnis_speedup_fnr  r  r  r  r  s   `          @@@rj   rE  zScheduler.fuse_nodes_once  s    !''""7=11 	A;<<<# A A  )=)=)?)?@@@@  	
	 	 	 	 	 	 	 	5 	5 	5 	5 	5 	5 	5 	52 !55e<< 	- 	-LE5 $#E5111''..E''..E}}UE** -43P3Pu4 4 - 00>>G$$ .5ue-DOE*.5ue-DOE* ue,,,?I||3B3I3I3K3K 	5 	5/M9i 444 $$]333&&y11Y>>>>&&y11Y>>>>} 5t'D'D9( ( 5 y)444{(=(=>>>..u55!!%(((rl   rY  c                F  	 t          | j                  }d}t          | j                  }t                              d|           t          t                              |                     D ]2\  }}t                              |          }t          |          dk     r4|||k    r n| 	                    |          st                              d|           o|dz  }t          j        dk    }t          |d         j        |d|          	t                              d	t          |          |           |D ]}|                    |           |                    	           | j                            	fd
	                                D                        4t'          |d           | _        |                     | j                  | _        t                              d||t          | j                             |                     | j                   dS )z'
        Groups parallel nodes
        r   z2ComboKernels: Generating with num_ck_nodes = %s...r:  Nz)ComboKernels: Not speeding up %d-th groupr    Tr  z0ComboKernels: Combining %d nodes for %d-th groupc                :    i | ]}|                                 S r   r   )r   ru  rC  s     rj   r   z7Scheduler.create_combo_kernel_nodes.<locals>.<dictcomp>9  s#    LLLq{LLLrl   c                    | j         S rf   rS  rl  s    rj   r   z5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>;  s    q{ rl   r  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   rI  r~   r   r  rR  r  r  r  speedup_by_combo_kernelr"   r  r[   r  r`  rI  r]  r  ri  r  r{  ra  )
rh   rY  r  rp  num_nodes_orignumr  r  r\   rC  s
            @rj   r  z#Scheduler.create_combo_kernel_nodes  s/    !,,TZ		FUUU'&DDTJJ
 
 	 	NC 3CCINNI9~~!!'EL,@,@//	:: 		EsKKKQJE$;a?O4!&*. /	  K HHBI  
 " ) )""4((((OOK(((#**LLLLK4I4I4K4KLLL    K-B-BCCC
33DJ??
R
OO		
 	
 	
 	!!$*-----rl   c                D    |D ]}|                     | j                   d S rf   )ra  r]  )rh   rI  r\   s      rj   ra  zScheduler.prune_redundant_depsE  s5     	? 	?D%%d&=>>>>	? 	?rl   1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]c                ,   	
 g 	t          t          t          t          f                              
d	
 fd}t          j        t
                    }|D ]J}                     |          r|                                D ]}||                             |           K|	                                D ]} ||           t          j        rnt          j        t
                    }|D ]0}t          |dd          }|r||                             |           1|	                                D ]} ||                                	          		                     j        d	           t                               d
t%          	                     	S )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        rI  rJ  rc   r   c                   t          |           D ]\  }}| |dz   |dz   t          j        z            D ]}||f}|v r                    |                               ||          r                    |           L|                                s|                                r-                    ||          r                    ||f           d S r  )rR  r"   )max_fusion_buffer_group_pairwise_attemptsrI  r  r  r  r  )rI  node1_indexr  r  r  possible_fusionsr)  rh   s        rj   check_all_pairsz7Scheduler.get_possible_fusions.<locals>.check_all_pairsR  s!   &/&6&6 @ @"U"!Ok'F'G G @ @E
 !%.Cd{{ HHSMMM}}UE22 @(//4444++-- @1A1A1C1C @uJ J @ )//???!@@ @rl   r   NT)r  reversezfound %d possible fusionsrI  rJ  rc   r   )r   ro  rT   r  r   r   unfusable_noderB  r  r   r"   aggressive_fusionr   *get_possible_fusions_with_highest_priorityr7  score_fusion_keyr  r  r~   )rh   rI  r  buffer_names_groupingr\   r   node_groupinggroup_groupingr   r  r)  s   `        @@rj   r  zScheduler.get_possible_fusionsI  s    % 13D DEFHH	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@( !, 7 = = 	8 	8D""4(( --// 8 8%c*11$77778299;; 	+ 	+MOM****# 	/(4T::N 7 7gt44 7"5)00666!/!6!6!8!8 / /....JJ
 
 	$"7FFF4c:J6K6KLLLrl   c                    t          t                               d fd|                                j                                        |                                j                                        z  |j        j                                        |j        j                                        z  z
  t           fdD                       }|r t          ||          d           |S )	z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        r\   rT   rc   r   c                ,   t          | t                    r}| vry                    |            |                                                               rdS t          | j        z            p#t          fd| j        z
  D                       S dS )NFc              3  D   K   | ]} j         |                   V  d S rf   r/  r   ru  
found_pathrh   s     rj   r!  zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>  sQ       H H #
4#:1#=>>H H H H H Hrl   )r   r  rI  rj  issubsetr   r   r-  )r\   combined_ancestorscombined_namesr  rh   visiteds    rj   r  z6Scheduler.will_fusion_create_cycle.<locals>.found_path  s    $ 233 G8K8KD!!!++--667IJJ  !5   ?@@ C H H H H H!%2D!DH H H E E  5rl   c              3  D   K   | ]} j         |                   V  d S rf   r/  r  s     rj   r!  z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>  s5      WWqJJt6q9::WWWWWWrl   zwill create cycler\   rT   rc   r   )r   r  rj  _dictrr  r   r-  r  )rh   r  r  cycler  r  r  r  s   `   @@@@rj   r  z"Scheduler.will_fusion_create_cycle  s    /022	 	 	 	 	 	 	 	 	 	2 %%''-2244''))/44667 	
 O!&&((5?+@+E+E+G+GG WWWWWDVWWWWW 	9#IeU##$7888rl   c                    ddl m d fd} ||          } ||          }t          fd|D                       }t          fd	|D                       }|                    |          }d
}	|D ]-}
	 |	t	          |
d                   z  }	# t
          $ r Y  dS w xY w                     ||          }t          j        j	        
                    |	d|z            rdS dS )a  
        Return true if fusing the two nodes can potentially increasing peak memory.

        The implementation is more like a heuristic since we don't really know if we are at peak
        or not when trying to fuse these two nodes. The order of nodes may change later which makes the
        peak memory estimation hard.

        Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
        1. find all buffers read by each node with a single user. These buffers are supposed to
           be reused if we don't fuses these 2 nodes
        2. find the intersection of these buffers for the two node and sum the total buffer size.
           If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
           Note that the extra memory allocation is not necessarily causing peak memory increase.
           This is just a heuristic.

        We return true only if the saving for fusion can not trade off the extra memory allocation.
        r    )buffer_reuse_keyr\   rT   rc   list[ir.Buffer]c                   g }| j         j        D ]n}j                            |j                  }|rKt          |j                  dk    r3|j                                        r|	                    |j                   o|S r  )
r   r   r   r7  rp   r~   ra   r\   has_tensor_outputr  )r\   r   r  r   rh   s       rj   _find_single_user_inputszKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputs  s~     F&, , ,&**2733 ,3sy>>Q..383M3M3O3O.MM#(+++Mrl   c              3  .   K   | ]} |          V  d S rf   r   r   r   r  s     rj   r!  z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>  /      #S#Sc$4$4S$9$9#S#S#S#S#S#Srl   c              3  .   K   | ]} |          V  d S rf   r   r  s     rj   r!  z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>  r  rl   r   r:  F    T)r\   rT   rc   r  )r  r  r   intersectionrm   r+  score_fusion_memoryrP   r   r  statically_known_gt)rh   r  r  r  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr  	bw_savingr  s   `           @rj   can_fusion_increase_peak_memoryz)Scheduler.can_fusion_increase_peak_memory  sG   * 	655555	 	 	 	 	 	 10770077##S#S#S#S]#S#S#SSS##S#S#S#S]#S#S#SSS*77GG$ 	 	C3s1v;;.   uuu ,,UE::	 7//iPP 	4us   7B
BB	thresholdrm   c                     |j         j        |j         j        z  |j         j        |j         j        z  z
  }t           fd|D                       |k    S )Nc              3  B   K   | ]}                     |          V  d S rf   dep_size_hintrP  s     rj   r!  z:Scheduler.fusion_accumulate_large_reads.<locals>.<genexpr>  s1      @@s4%%c**@@@@@@rl   )r   r   r   r  )rh   r  r  r,  	all_readss   `    rj   fusion_accumulate_large_readsz'Scheduler.fusion_accumulate_large_reads  s^     &,u/@/FF$u'8'??
	 @@@@i@@@@@9LLrl   c                    t          t          |j        |j        z
            t          |j        |j        z
                      }|dk    S )aB  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heurisitic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r0  r"  r   r   )rh   r  r  proximity_scores       rj   are_long_distant_nodesz Scheduler.are_long_distant_nodes  sH    * %/122%/122
 
 ##rl   common_buf_names"Union[tuple[str], OrderedSet[str]]c                   i }d |j                                         D             }d |j                                         D             }|D ]}t          j                            |          }||         }	||         }
t          |	t                    rt          |
t                    s&dt          |	           dt          |
           ||<   |	                                |
                                k    r0d|	                                 d|
                                 ||<   t          |	j
                  t          |
j
                  k    rd||<   |	                                }|
                                }||k    rd| d| ||<   H|	                                |
                                k    rd|	 d|
 ||<   d	}t          |t          j                  s
d
|j         }d|	 d|
 d| ||<   t!          |          S )z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        c                    i | ]
}|j         |S r   r  r  s     rj   r   z7Scheduler.decide_fusion_fail_reason.<locals>.<dictcomp>      XXXC#(CXXXrl   c                    i | ]
}|j         |S r   r  r  s     rj   r   z7Scheduler.decide_fusion_fail_reason.<locals>.<dictcomp>  r;  rl   znot MemoryDep: z v.s. zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r   zLayout: zUnknown reason: z. )r   r"  rP   r   r  r   r.   rx   	get_numelrO   r2  
get_offsetnormalize_with_stride_orderr%   r  rz   rd   )rh   r  r  r7  reasonsnode1_name2depnode2_name2depr  r   lhs_deprhs_deplhs_offrhs_off
layout_strs                 rj   decide_fusion_fail_reasonz#Scheduler.decide_fusion_fail_reason	  sY    XX53D3U3U3W3WXXXXX53D3U3U3W3WXXX( ,	 ,	H'$$X..C$X.G$X.Ggy11 GY9W9W Jd7mmJJ4==JJ !   ""g&7&7&9&999X(9(9(;(;XX7CTCTCVCVXX !  W\**mGL.I.III$/!((**G((**G'!! %R$Q$Q$Q$Q! 335566889 9 %VW$U$UG$U$U! Jc2#566 54
44
I7II'IIZII H 7||rl   c                   t           j        rt          d ||fD                       rdS |                                s|                                rdS |j                                        }|j                                        }||z  }|sdS d |j                                        D             }d |j                                        D             }g }|D ]}	||	         }
||	         }|
                                |                                k    rN|                    t          j
        j                            |
                                d          |
|f           t          |          dk    rdS t          |t!          j        d                    \  }}
}t%          |
t&                    rt%          |t&                    sdS |
j        |j        k    rA|
                                |                                k    r|                     |
          S dS d}|                                s|                    |
|          }nk|                                s|                    ||
          }n@t2                              d	|                                |                                           |r|                     ||          ndS )
a  
        Right now just greedily reorder the loop of node1 to be compatible with node2,
        but ideally we should have some heuristics to reorder the loop for node2
        to be compatible with node1 if that's more efficient.

        Return the amount of shared data re-computed in this method.
        If no such recomputation happens, return -1 (not return 0 since 0 is a valid
        amount of shared data).

        c              3  >   K   | ]}|                                 V  d S rf   )r  rt  s     rj   r!  z>Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>W  s;       8
 8
AHHJJ8
 8
 8
 8
 8
 8
rl   r  c                    i | ]
}|j         |S r   r  r  s     rj   r   z?Scheduler.shared_data_after_reordering_loop.<locals>.<dictcomp>i  r;  rl   c                    i | ]
}|j         |S r   r  r  s     rj   r   z?Scheduler.shared_data_after_reordering_loop.<locals>.<dictcomp>j  r;  rl   r   r  r  Fz?Don't reorder loops since both nodes are reductions: %s v.s. %s)r"   r  r-  r  r   buffer_namesr"  r@  r  rP   r   r  r  r>  r~   r0  r  r  r   r.   r  r  r0  r  r  r  r  rg   r"  )rh   r  r  node1_buffer_namesnode2_buffer_namescommon_buffer_namesrB  rC  
candidatesbuffer_namerD  rE  _numel	reordereds                 rj   !shared_data_after_reordering_loopz+Scheduler.shared_data_after_reordering_loopG  s     0 	C 8
 8
!&8
 8
 8
 5
 5
 	 2
  	%"3"3"5"5 	2".;;==".;;==03EE" 	2XX53D3U3U3W3WXXXXX53D3U3U3W3WXXX 
. 	 	K$[1G$[1G335566889 9 !!(2273D3D3F3FQR2SS   z??a2 $'zx7J17M7M#N#N#N '9-- 	Z5S5S 	2w///
   ""g&7&7&9&999))'2222	!!## 		77IIII##%% 	77IIII##Q       :CJt''u555Jrl   c                    t          |t          t          f          o)|                                 ot	          |j                   S )z>
        Is this node unfusable under any conditions.
        )r   r	  r  r  rL   r\   r   s     rj   r  zScheduler.unfusable_node  sK    
 t79OPQQ C$$&&&C7	BBB	
rl   prologue_noderU  r  r  c                   |                                 t          j        j        k    rdS |                                }|                                }d}|||z  k    r |d           dS t          d |                                D                       }|t          j	        j
        j        j        fk    r |d           dS dd} ||                                j                  r!|                                s |d           dS dS )zT
        Heuristics to avoid benchmarking predictably slow prologue fusions
        T皙?z@prologue fusion will not increase amount of bytes read in kernelFc              3     K   | ]9}|j         0|j                                         D ]}|j        dk    |j        V  :d S )Ncall_function)r\   r  ri   r  )r   ru  r5  s      rj   r!  zEScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>  sf       
 
v!V'')) "!t&&	 H '&&&&
 
rl   z\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsr7  torch.dtyperc   r   c                &    | j         dk    o| j        S )Nr:  )itemsizeis_floating_point)r7  s    rj   low_prec_fpzGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fp  s    >Q&B5+BBrl   zVprologue fusion that must be upcast to fp32 not profitable for low precision templates)r7  r]  rc   r   )rj  rP   r   invoke_quant_opsr  r  ro  ri  r  rz  r{  constant_pad_ndr  rH  r7  rx  )	rh   rX  rU  r  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERr  ra  s	            rj   (check_prologue_fusion_heuristics_fusablez2Scheduler.check_prologue_fusion_heuristics_fusable  s[    ,,..!'2JJJ4"88::
#::<< &)"'AABBCRSSS5  
 
",,..
 
 
 
 
 uy~5=???Cn   5	C 	C 	C 	C K@@BBHII	!>>@@	 Ch   5trl   /Optional[tuple[int, SchedulerNode, sympy.Expr]]c                    t          |t                    rt          |t                    sdS t          |j        t          j                  rt          |j        t          j                  sdS |                                s|                                rdS t          j        dk    rdS |j        |j        }}|\  }}|\  }}|	                                s:|	                                s&||k    s t          |          t          |          k    rdS t          |j        j                  dk    st          |j        j                  dk    rdS                      t          t          |j        j                                      }	                     t          t          |j        j                                      }
t!          |	|
          t          j        k    rdS d
 fd} ||          s ||          rdS g }t%          t'          ||                    D ]#\  }\  }}||k    r|                    |           $t          |          dk    rdS |d	         }||         ||         }}t*          j        j                            ||          r|||fS t*          j        j                            ||          r|||fS dS )ao  
        Fusing two small pointwise nodes significantly reduces kernel overhead
        and launch overhead. However, slightly different sizes would prevent fusion.
        Here, we decide if expanding sizes of one node is profitible by allowing
        fusion, and returns the dimension to expand, node with smaller sizes,
        and new size after expand.
        NrA  r    r\   rT   rc   r   c                .   | j         j        D ]}|j        j        v rj        |j                 }nj                            |j                  }|rBt          j        j        	                    ||           rt          |j        t                    s dS dS )NTF)r   r   rp   r   r   r7  rP   r   r   r  r   r^   r  )r\   r  r  rh   s      rj   has_reusable_bufferzIScheduler.get_expand_dim_for_pointwise_nodes.<locals>.has_reusable_buffer  s    (.    9 ;;; $ ;DI FII $ 0 4 4TY ? ?I  ,66y$GG  'y'<>TUU 
  445rl   r   r  )r   rs  r\   r%   r  r.  r"   rB  r  r  r~   r   r   r0  rQ  r0  r0  small_memory_access_thresholdrR  r  r  rP   r   r  statically_known_lt)rh   r  r  n1_sizesn2_sizesn1_iter_sizesn1_reduce_sizesn2_iter_sizesn2_reduce_sizesnode1_write_memorynode2_write_memoryrk  mismatch_dimensionsidxn1_sizen2_sizemismatch_dimmismatch_size1mismatch_size2s   `                  rj   "get_expand_dim_for_pointwise_nodesz,Scheduler.get_expand_dim_for_pointwise_nodes  s    %// 	z%7W7W 	4 uz2#455	5:r'899	 4 ))++ 	u/M/M/O/O 	4 ))4 #\5<()1&)1&  	!!##	 /11=!!S%7%7774 u '((1,,E4E4L0M0MPQ0Q0Q4 "//T%:K:R5S5S0T0TUU!//T%:K:R5S5S0T0TUU"$67723 3 4	 	 	 	 	 	  u%% 	)<)<U)C)C 	4 !'0]M1R1R'S'S 	0 	0#C#'7'!!#**3///"##q((4*1-,',' ' 7//OO 	66W11..QQ 	664rl   c                &   ||u rdS t          ||          }|                                r=|                     |                                                              ||          rdS t          |t                    st          |t                    r |d           dS t          |t          t          f          r!|                                s |d           dS t          |t          t          f          r!|                                s |d           dS |	                                |j
        z  r |d           dS |                                rXt          j        s |d           dS |                                s|                                r |d           dS |                                }t          |t          j                  s |d	           dS |                                }t%          d
 |j        D                       |z
  }|                                |z  r |d           dS |                                s|                                r |d           dS |                                dd         D ]J}|                                }|D ]1}	t1          fd|	j        D                       s |d             dS 2Kt          |t4                    s|gnd |j        D             }
t9          |
          dk    sJ |
d         }t9          d         j                  dk    rNt9          d         j        d         j                  dk    r%d         j        d         j        d         j        |u s |d           dS |                     |||          sdS |                                rA|                                s |                                st          j         s |d           dS |                                tB          j"        j#        z  s&|                                tB          j"        j#        z  r |d           dS |                                }|                                }||k    r |d||           dS ~| $                    ||          }|t          j%        k     r*t          j&        r| '                    ||          }|dk    r|}t          j(        rJ| )                    ||          x}r2|\  }}}|*                    ||           | $                    ||          }tV          ,                    tZ          j.                  rAtV          /                    d|0                                |0                                |           tB          j1        2                    | |||          sdS |	                                |j
        z  ra| 3                    ||          oJtB          j1        3                    | |||          o(|                     |          3                    ||          S tB          j1        4                    | |||          o(|                     |          4                    ||          S )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc              3  >   K   | ]}|                                 V  d S rf   r   )r   inps     rj   r!  z%Scheduler.can_fuse.<locals>.<genexpr>{  s*      EEc3<<>>EEEEEErl   z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNr  c              3  *   K   | ]}|j         v V  d S rf   r  )r   r   prologue_nodess     rj   r!  z%Scheduler.can_fuse.<locals>.<genexpr>  s*      QQttyN:QQQQQQrl   z7template prologue can only fuse nodes with a single usec                :    g | ]}|                                 |S r   rN  rt  s     rj   r   z&Scheduler.can_fuse.<locals>.<listcomp>  s%    AAAAAaAAArl   r    r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)z%s and %s has %s shared data)5r  r  r  r   can_fuse_multi_outputs_templater   rW  r	  r  rj  r   r"   prologue_fusionr  rH  r%   r  get_allowed_prologue_inpsr   rq  rn  r.  ri  r   rw  ra   r  r  r~   r   r\   rg  r  rP   r   no_fuse_buffer_namesr"  score_fusion_memory_thresholdr  rV  $expand_dimension_for_pointwise_nodesr}  r  r  r  r  r  r  rg   choicesr  can_fuse_verticalcan_fuse_horizontal)rh   r  r  r  rG  r  unsupported_prologue_argsr\   	node_outsr   template_snodestemplate_snoder  device2shared_data_scorenew_shared_data_scoreexpand_analysis
expand_dimsmaller_nodeexpand_sizer  s                       @rj   r  zScheduler.can_fuseE  sT   
 E>>5u%% 	4#3#3$
 $

)
)%
7
7	 4e122 	j'7
 7
 	 CABBB5u8:PQRR	%%''	 C()))5u8:PQRR	%%''	 C()))5$$&&8 	C,---5 8	) 0111u!!## u'8'8':': HIIIu7799Hh(?@@ HIIIu$,$F$F$H$H! EEX_EEEEE'( &
 %%''*CC QRRRu--// 53Q3Q3S3S PQQQu"__..N&ss+ % % ,,..	$ % %CQQQQsyQQQQQ %UVVV$uuu%% "%);<<BAAAAA 
 ''1,,,,,Q/N N2&.//144r*215;<<AA"2&.q17:?>QQ[   u@@sSS u 	**,,	!!##	 )	
 C12225""$$qw'CC 	""$$qw'CC	 C56665!!##""$$WC,fg>>>5 44UEBB DDD1 E %)$J$J5RW$X$X!$))$9!6 	G#FFueTTTO	G 7F3Z{<<ZUUU $ 8 8 F F))'-88 	##.    !	   y!!$u6GHH 	5$$&&8 
	M &&ue44 MI//eUDUVVM$$V,,>>ueLL 900eU$5  M""6**>>ueLLMrl   c                   |                                 }t          ||          }t          t                    }|j        D ]o}| j                            |j        |j                  }t          |t                    r| 
                    |||          rT||                             |           p|j        j        D ]}t          |t                    s|                    | j                            |j        |j                            }	|	r0|	D ]-}
|                     |
|          r|	                    |
           .t#          d t$          j                            |                                          D                       }||z  r |d           dS |                                }|D ]D}| j        |                                         }|| j        |         j        z  r |d            dS EdS )a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        c              3  $   K   | ]}|j         V  d S rf   r  r  s     rj   r!  z.Scheduler.can_fuse_vertical.<locals>.<genexpr>  s8       $
 $
 H$
 $
 $
 $
 $
 $
rl   zmemory deps did not matchFz(intermediate nodes between node1 & node2T)rn  r  r   r   r   r   r7  rp   r   r0   fusable_weak_depr  r   r   r.   fusable_read_and_writer`  r   r@  rA  r  r   rj  r   rk   r]  r   )rh   r  r  node1_buf_namesr  remaining_deps_by_namer   rp   cd	remainingr  remaining_depsnode1_op_namesrU  s                 rj   r  zScheduler.can_fuse_vertical  s.     0022u%%7B47H7H+ 	5 	5C(,,SXsx@@D#w'' D,A,A#ue,T,T "4(//4444#* 		- 		-Bb),, .22%))"'27;; I  -# - -B222r:: -!((,,,# $
 $
 445K5R5R5T5TUU$
 $
 $
 
 

 O+ 	
 C+,,,52244" 	 	D&t,==??G 7 @ JJ >???uu trl   weak_depr0   c                   j         |                                vrdS fd|j        j        D             }t	          |          dk    rdS |d         t          t                    sJ t          j        t          j
                  rdS | j        j                 fd|j        j        D             }t          fd|D                       S )NFc                4    g | ]}|j         j        k    |S r   )rp   r  )r   r_  r  s     rj   r   z.Scheduler.fusable_weak_dep.<locals>.<listcomp>  s3     
 
 
zX222 222rl   r    r   c                *    g | ]}|j         k    |S r   r  )r   r  r  s     rj   r   z.Scheduler.fusable_weak_dep.<locals>.<listcomp>*  s,     
 
 
	Y8N8ND8N8N8Nrl   c              3     K   | ]Y}t          |t                    o?t          |j        t          j                   o|j        j        k    o|j        j        k    V  Zd S rf   )r   r.   r   r1  r   TMPr2  )r   r  r_  s     rj   r!  z-Scheduler.fusable_weak_dep.<locals>.<genexpr>-  s       
 

 	 tY'' ('
DH===(
ek)( 	UZ'	
 
 
 
 
 
rl   )rp   rn  r   r   r~   r   r.   r   r1  r   r  r4  r  r   rw  )rh   r  r  r  mutating_writesrelevant_readsr  r_  s    `    @@rj   r  zScheduler.fusable_weak_dep  s"    = 6 6 8 8885
 
 
 
*1
 
 

 1$$5"%+++++u{DH55 	5+H,AB	
 
 
 
".4
 
 
  
 
 
 

 '
 
 
 
 
 	
rl   r  r-   r_  r.   c                ^   t          |t                    r| j                            |j        |j                  }||j        k    s>t          |j        t          j                  st          |j        t          j                  rdS t          j
        r8|j        |j        k    r(|                                }|                                }|j        |j        k    oSt          |j                  t          |j                  k    o)|j        d t          |j                           |j        k    S t          |t                    ri| j                            |j        |j                  }| j                            |j        |j                  }|j        |j        k    r|j        ||k    rdS dS r   )r   r.   r   r7  rp   r   r1  r   r  r"   r  r  r  r~   r2  r/   r7  )rh   r  r_  	read_name
write_names        rj   r  z Scheduler.fusable_read_and_write9  sw   dI&& 	-11$)TYGGI UZ''&tz48<< (&u{DH== ( u0 *T]en5T5T ~~'')) 
ek) ?	NNc%*oo5?I/EJ/0EJ>
 g&& 	-11$)TYGGI.225:uzJJJ	UZ''J*++turl   r   c                @    t           j                            |          S rf   )rP   r   get_dep_size_hintr)  s     rj   r0  zScheduler.dep_size_hint[  s    w((---rl   c                P    t          |j        j                  t          |j        j                  z   }t          |j        j                  t          j        j                  z   }t	          ||          dz  t          ||          k     rL||k    r|}}|fd|j        j        |j        j        z  D             }t           fd|D                       S |j        j        |j        j        z  j        j        j        j        z  z  }t           fd|D                       S )zn
        The first term in our fusion score that estimates number of saved
        memory operations.
        r  c                L    g | ] }|j         j        v s|j         j        v |!S r   )r   r   r   )r   r   r  s     rj   r   z1Scheduler.score_fusion_memory.<locals>.<listcomp>o  sD       %+111SE<M<T5T5T 5T5T5Trl   c              3  B   K   | ]}                     |          V  d S rf   r/  rP  s     rj   r!  z0Scheduler.score_fusion_memory.<locals>.<genexpr>u  s1      ??3t))#..??????rl   c              3  B   K   | ]}                     |          V  d S rf   r/  rP  s     rj   r!  z0Scheduler.score_fusion_memory.<locals>.<genexpr>z  s1      IIs4%%c**IIIIIIrl   )r~   r   r   r   r  r0  r  )rh   r  r  node1_dep_lennode2_dep_lentmprL  common_memory_depss   ` `     rj   r"  zScheduler.score_fusion_memory^  sF    E-344s5;L;S7T7TTE-344s5;L;S7T7TT }m,,q03}m3T3TTT},,    ,2U5F5MM  D ????$??????#/58I8PP#e&7&>>
 IIII6HIIIIIIrl   r  c                $   t          |          dk    r|S i }|D ]\  }}|                                |                                k    sJ |                                }t          |                     |                              ||                    }||vr	||fg||<   ||                             ||f           t          |                                t          j	        d                    d         }t          |          dk    sJ |S )Nr   r  r    )
r~   r   rm   r  get_fusion_pair_priorityr  r  r  r  r  )rh   r  "possible_fusions_group_by_priorityr  r  r  fusion_pair_priority&possible_fusions_with_highest_prioritys           rj   r  z4Scheduler.get_possible_fusions_with_highest_priority|  sY   
   A%%##  	+ - 	 	LE5##%%)9)9););;;;;%%''F#&  ((AA%OO$ $  $+MMMENL23GHH 33GHOOEN    25.4466H<OPQ<R<R2
 2
 2

2. 9::Q>>>>55rl   +tuple[BaseSchedulerNode, BaseSchedulerNode]r   c                0    t          j        j        | g|R  S )z-
        Shim for list.sort(key=...)
        )rP   r  score_fusionrL  s     rj   r  zScheduler.score_fusion_key  s     y%d3U3333rl   c                    t          t          j                                                  }t	          | j                  D ]7}|                    || j                   |                    |j	                   8dS )zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
r   rP   r   r  r"  rI  r;  r4  r  r   )rh   r2  r\   s      rj   r  zScheduler.compute_last_usage  sv    
 ))A)A)C)CDDTZ(( 	8 	8D 3T5LMMM&&t7777	8 	8rl   c                ,   t          | j        t          j        j        z
  t          j        j        j        z
            D ];}|| j        v rK| j        |         }|                                r)t          j        j        	                    |j
                   W|t          j        j        v rt          j        j        |         }t          |t          j                  r%t          j        j        	                    |           t          |t          j                  r|j        }t          |t          j                  r|                                sJ t          j        j        	                    |j                   =| j                                         dS )z*Free any buffers that are no longer neededN)r  r  rP   r   r  r   freedr   r   codegen_freer\   r  r   r%   r  rk  r  ry  is_input_bufferclear)rh   rp   r   r  storages        rj   free_bufferszScheduler.free_buffers  sb   %g%&g"()
 
 	D 	DD
 t'''&t,<<>> @G(55ch???---g*40c2#566 	DG(55c::::R%677 D!hG"7BM::?F?V?V?X?X XG(55glCCC!'')))))rl   c                    | j                                         D ]}|                                 |                                  d S rf   )rn  r   flushr  )rh   rO  s     rj   r  zScheduler.flush  sF    }++-- 	 	GMMOOOOrl   scheduler_noder	  c                   t          |t                    sJ t          d         dxx         dz  cc<   t          j        t          d                    5  |                                 |                                 d d d            n# 1 swxY w Y   |j        }t          |t          j
                  sJ dt          |                      |                    t          j        j                   |                                  d S )Nr  extern_callsr    F)increase_kernel_countztype(node)=)r   r	  r   rP   set_kernel_handlerr*   r  r=  r\   r%   r  rx   r  r   r   r  )rh   r  r\   s      rj   codegen_extern_callzScheduler.codegen_extern_call  s9   .*CDDDDD
 	^,,,1,,,!&u"E"E"EFF 	& 	&00222##%%%	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& "$00BB2BT$ZZ2B2BBB0QW)***s   )B

BBBaseSchedulingc                F   t          |j                  r|j        J | d            t          j                            |           t          |j                  }|t          d|j                   t                      s|j        dk    rKt          j
                            |          x}j        dk     r!t          |t          j                              t          |j                  r+|j        dk    s t!          t          j                               ||           S )Nz( should have been normalized in loweringzUnsupported device type: cuda   mps)rJ   rx   r1  rP   r   add_device_infor)   r0  r   r  r  get_device_propertiesmajorr1   inspectcurrentframer2   )rh   r  device_schedulingdevice_propss       rj   create_backendzScheduler.create_backend  s   &+&& 	
&,*B*B??? +C*BB 	
'''5fkBB$H6;HHIII|| 	<v%%%*Z%E%Ef%M%MM\TWXXX(w7K7M7MNNN$$ <V[E-A-A#G$8$:$:;;;  &&&rl   c                p    |J || j         vr|                     |          | j         |<   | j         |         S rf   )rn  r  r  s     rj   r  zScheduler.get_backend  sB    !!!&&$($7$7$?$?DM&!}V$$rl   c                4    d	 fdfd|                                 D             }t          |                                          }|rLt          |t	          j        d                    \  }}t          j        j        	                    |           d S d S )
Nru  torch.fx.Noderc   rm   c                    | j         vr;j                             d t          | j        j                  D                        j         |          S )Nc                    i | ]\  }}||	S r   r   rO  s      rj   r   z>Scheduler.enter_context.<locals>.get_order.<locals>.<dictcomp>  s    ,W,W,WdaQ,W,W,Wrl   )r  r  rR  r   rI  )ru  rh   s    rj   	get_orderz*Scheduler.enter_context.<locals>.get_order  sQ    ,,,$++,W,Wi>V>V,W,W,WXXX'**rl   c                r    i | ]3}|j         	|j                                         D ]} |          |fd 4S rf   r  )r   ru  r5  r  s      rj   r   z+Scheduler.enter_context.<locals>.<dictcomp>  sY     
 
 
v!V'')) "! Yq\\1t!!!!rl   r   r  )ru  r  rc   rm   )
ri  r   rr  r0  r  r  rP   r   r   enter_context)rh   r\   r  rn  lastr  s   `    @rj   r  zScheduler.enter_context  s    	+ 	+ 	+ 	+ 	+ 	+
 
 
 
^^%%
 
 
 w||~~&& 	5'x':1'='=>>>GAtG ..t44444	5 	5rl   rp   fused_node_namesr3  c                    	 | j         |         j        }n# t          $ r Y dS w xY wt          fd|D                       o|| j        vo|| j        vS )NFc              3  R   K   | ]!}|j         p|                                v V  "d S rf   )r=  rg   )r   r   r  s     rj   r!  zAScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>  s8      VVC3C CVVVVVVrl   )r   ra   KeyErrorrw  r   r4  )rh   rp   r  ra   s     ` rj   $can_buffer_be_removed_through_fusionz.Scheduler.can_buffer_be_removed_through_fusion
  s    	$T*0EE 	 	 	55	 VVVVPUVVVVV 4D114D33	
s    
$$F
should_logc                    |j         }t          |t          j        j        j                  r|j        x}r|                                }t          |t          j        j	                  r| d|j
         n|}|t          j        v s|t          j        v r#t          |t          j        j	                  sJ dS t          j        j        j        j        st          j        dS dd
}|rt"          n|}t          |t$                    r t'           fd|j        D                       S |j         J |                                s |d|           dS t          |j         t          j                  r |d|           dS t          |j         t          j                  r |d|           dS t1          |j         dd          r |d|           dS t3          |j                   r |d|           dS dS )zBReturn True if we should partition the inductor graph on this noder,  TNmsgrd   r\   r]   rc   r   c                    d S rf   r   )r  r\   s     rj   noop_logz,Scheduler.should_partition.<locals>.noop_log:  s    Frl   c              3  B   K   | ]}                     |          V  d S rf   )should_partition)r   rg  rh   s     rj   r!  z-Scheduler.should_partition.<locals>.<genexpr>@  s1      MMt,,U33MMMMMMrl   znon gpu opsr  zDeviceCopy opszConditional opsunbacked_bindingszunbacked binding opszCUDAGraph-unsafe custom opsF)r  rd   r\   r]   rc   r   )r\   r   r  r  r%   r  r
  rp   _ops
OpOverload_overloadnamer"   custom_should_partition_opsr  r  r?   wrapperrN   r  r-  r  rJ   
DeviceCopyConditionalr   rI   )	rh   r\   r  ir_noderi   op_overload_packet_nameop_overload_namer  log_partition_reasons	   `        rj   r  zScheduler.should_partition  sE    )gu1@AA 	%%B	 ')ggii# b%*"788-*??R-=???,  (6+MMM#v'III!"ej&;<<<<<t &-8	6>F4	 	 	 	 AKX<<PXd.// 	NMMMMMMMMMMy$$${{}} 	  T::::4di// 	  !1====4di00 	  !2>>>>4491488 	  !7dCCCC4!$),, 	  !>TJJJJ4url   ;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]]c                    i }|                     t          j        j                   | j        D ]+}|j                                        D ]\  }}|j        ||<   ,|S )z~
        Return a mapping from name strings to the corresponding graph inputs or
        base scheduler node outputs.
        )r  rP   r   r  rI  r   r  r\   )rh   r  r\   rp   scheduler_buffers        rj   get_name_to_nodeszScheduler.get_name_to_nodes[  sv     UWAG0111J 	; 	;D*.*>*D*D*F*F ; ;&&%5%:T""; rl   
signatureslist[GraphPartitionSignature]c           	        d t          t          j        j                  D             }d t          t          j                                                  D             }g t          j        _        t          |          D ]\  }}|j        rg }|j        D ]*}|                    |	                    |                     +g }|j
        D ]<}	|                    |	                    |	                                                     =t          j        j                            t          ||||j                             dS )z
        computes a mapping from partition input/output indices to graph input/output
        indices for each partition.
        c                    i | ]\  }}||	S r   r   r   rw  rp   s      rj   r   z:Scheduler.compute_graph_partition_maps.<locals>.<dictcomp>s  s+     %
 %
 %
##tD#%
 %
 %
rl   c                    i | ]\  }}||	S r   r   r  s      rj   r   z:Scheduler.compute_graph_partition_maps.<locals>.<dictcomp>v  s+     &
 &
 &
##tD#&
 &
 &
rl   N)rR  rP   r   r  r  partition_mapsskip_cudagraphinput_nodesr  r7  output_nodesrg   rF   constant_names)
rh   r  name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingrp   output_mappingr\   s
             rj   compute_graph_partition_mapsz&Scheduler.compute_graph_partition_mapsk  si   %
 %
'01E'F'F%
 %
 %
!&
 &
'01I1I1K1K'L'L&
 &
 &
" "$'0'<'< 	 	#L)' 
 M!- J J$$%>%B%B4%H%HIIIIN!. W W%%&@&D&DT]]__&U&UVVVVG"))! !",	    !	 	rl   	partitionrU   r	  OrderedSet[sympy.Symbol]c                  	
 d	fd	d	
fd
d	fd	dd} t                      j        
fd|D              } |j        fd|                                D                ||          }t                      }|D ]@}t          j        j                            |          }|                    |j                   At          t          |t          j        d                              S )ai  
        Returns all symbol inputs which are required to be in scope to successfully
        perform codegen for this graph partition, including:
        - free symbols used in partition nodes
        - free symbols in partition input/node shapes, strides, and offsets. This is needed
          for recording cudagraphs for tensors with dynamic shapes.
        r\   	ir.IRNoderc   r  c                   t                      }|                                 }t          |t          j                  r|                    t          |j                  t          |j                  z  t          |j	                  z             t          |t          j
                  r#|                     |j                             n|J d|             |S )Nz*Expect layout to be None but found layout=)r   maybe_get_layoutr   r%   Layoutr  r   r2  strideoffsetr  r  )r\   free_symbol_usesrz   get_layout_symintss      rj   r  zGScheduler.get_graph_partition_symbol_inputs.<locals>.get_layout_symints  s    9C**,,F&"),,  '' --"6=112"6=112  
 fb&CDD O$++,>,>v},M,MNNN~~III &~~ $#rl   rT   c                   t          | t                    r' t                      j        fd| j        D              S | j        J | j                                        } |j        fd| j                                        D               |S )z4
            Gets symbols used in node.
            c              3  .   K   | ]} |          V  d S rf   r   )r   rg  get_scheduler_node_symbol_usess     rj   r!  zfScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_uses.<locals>.<genexpr>  s/      UU44U;;UUUUUUrl   Nc              3  .   K   | ]} |          V  d S rf   r   )r   r  r  s     rj   r!  zfScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_uses.<locals>.<genexpr>  s/      UU'$$W--UUUUUUrl   )	r   r  r   rN  r  r\   r  r  r   )r\   r  r  r   s     rj   r   zSScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_uses  s     $ 233 )z||)UUUUUUU  9(((#y==??##UUUUTY=R=R=T=TUUU  $#rl   0Union[ir.IRNode, sympy.Expr, ir.TorchBindObject]c                    t          | t          j                  rt                      S t          | t          j                  r |           S t          dt          |                      )zW
            Gets symbols used in input node shapes, strides, and offsets.
            zUnsupported input node type: )r   r%   r  r   r%  r  rx   )r\   r  s    rj   get_input_node_symbolszKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbols  sj     $ 233 X!||#D"),, X))$/// **V$t***V*VWWWrl   symbolsc                4    t          d | D                       S )z
            Filters a set of symbols that are required for codegen. Skip symbols
            that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
            and SymT.R0_INDEX.
            c              3     K   | ]B}t          |t          j        t          j        t          j        t          j        f          >|V  Cd S rf   )r   r   SIZEFLOATUNBACKED_INTUNBACKED_FLOATr  s     rj   r!  zVScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>  sd        !	
)+	      rl   r   )r%  s    rj   filter_symbolszCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols  s2             rl   c              3  .   K   | ]} |          V  d S rf   r   )r   r\   r   s     rj   r!  z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>  s/      IIt,,T22IIIIIIrl   c              3  4   K   | ]\  }} |          V  d S rf   r   )r   rn  r\   r$  s      rj   r!  z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>  s3      NNwq$$$T**NNNNNNrl   rp   r  )r\   r  rc   r  )r\   rT   rc   r  )r\   r"  rc   r  )r%  r  rc   r  )r   rN  r  rP   r   r  simplifyr  r   r  r  
attrgetter)rh   r  r	  r,  candidate_symbolsresr  symplified_sr$  r  r   s           @@@rj   !get_graph_partition_symbol_inputsz+Scheduler.get_graph_partition_symbol_inputs  se   	$ 	$ 	$ 	$ 	$ 	$$	$ 	$ 	$ 	$ 	$ 	$ 	$"	X 	X 	X 	X 	X 	X 	 	 	 	, 7Ijll6HIIIIyIII7
 	 NNNN+:K:K:M:MNNN	
 	
 +N+<==(2" 	2 	2A7+44Q77LJJ|01111&(*=f*E*EFFFGGGrl   
partitionslist[PartitionType]skip_cudagraphs
list[bool]c           	         g }t          t          j                                                  }                                 d fdt          t          |          t          |                    D ]\  }}t                      }|D ].}|                    |j        	                                           /|
                    |          }	t          j                            d |D                       }
t          fd|
j        |
j        z  D                       |z
  }t           fd|D                       }t                      |D ]}                    |j                   fd	|z
  D             }|                    |           fd
|D             }fd|D             }fd|D             }|	                    |           t           fd|	D                       }	fd|	D             }d |D             }                     ||          }t%          ||||||          }|                    |           |                    ||	z
            }|ddd         S )z
        Gets signature for each graph partition, including input nodes, output nodes, and
        whether deallocating an input within graph partition.
        r  rd   rc   r   c                   j                             | d          }|dS t          |j        j        t
                    rIt          |j        t          j                  r(j                            | d          x}r |          S dS dS )z
            Checks if buf_name is NoneLayout. Buffers with NoneLayout is not allocated
            so graph partition should not take it as inputs or outputs.
            NFT)	r   r7  r   r\   rz   r8   r%   MutationOutputr4  )r  r   r  is_none_layoutrh   s      rj   r<  z?Scheduler.get_graph_partition_signature.<locals>.is_none_layout  s    
 "&&x66C{u#(/:66 ch(9:: 5!%!8!<!<Xt!L!LLI5 *>)444t5rl   c                    g | ]	}|j         
S r   rG  rh  s     rj   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>(  s    888d!888rl   c                >    g | ]} |j                   |j         S r   r  )r   r  r<  s     rj   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>/  s=       -~af55  rl   c              3  N   K   | ]}j                             ||          V   d S rf   r4  r7  r  s     rj   r!  z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>8  J       / / '++D$77/ / / / / /rl   c                    g | ]}|v |	S r   r   r   rp   r  s     rj   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>D  s.     ! ! !<'' '''rl   c                *    i | ]}|v ||         S r   r   rC  s     rj   r   z;Scheduler.get_graph_partition_signature.<locals>.<dictcomp>K  s5       <'' l4('''rl   c                *    i | ]}|v ||v rd ndS )TFr   r   rp   r  r  s     rj   r   z;Scheduler.get_graph_partition_signature.<locals>.<dictcomp>P  s?     " " "<'' d&:::dd'''rl   c                $    g | ]}|v |v
|S r   r   rF  s     rj   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>Z  s9     " " "<''D8L,L,L ,L,L,Lrl   c              3  N   K   | ]}j                             ||          V   d S rf   r@  r  s     rj   r!  z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>b  rA  rl   c                6    g | ]} |          |         S r   r   )r   rp   r<  r  s     rj   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>g  s>       %~d++T"  rl   c                8    g | ]}|t           j        j        v |S r   )rP   r   rs  r   rp   s     rj   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>m  s.       $!'BS:S:S:S:S:Srl   Nr  )r  rd   rc   r   )r   rP   r   r  r   r  r"  r  r   rr  r!  r$   rL  rM  r   r   r   r4  r5   r  rN  )rh   r5  r7  r  unmet_output_namesr  r  output_namesr\   returned_output_namesr   partition_input_namesextra_input_namesr	  input_deallocationextra_output_namesr
  r  symbol_inputspartition_signaturer  r<  r  s   `                   @@@rj   get_graph_partition_signaturez'Scheduler.get_graph_partition_signature  s    
'(@(@(B(BCC--//	 	 	 	 	 	 	( *-Z  (?";";*
 *
 g	 g	%I~ -7LLL! A A##D$8$=$=$?$?@@@@$0$=$=>P$Q$Q! '1<<88i888 K    !,!2[5G!G     " %/ / / / /1/ / / % %!
 5?LL ! = =$++DO<<<<
! ! ! !1L@! ! !
 "(():;;;   1  K
" " " " "1" " "" " " " "1" " " "(();<<<$. / / / /1/ / / % %!
    1  L !6  N !BB; M #:"# # 1222!6!<!<"%::" " $$B$rl   r  r5   c                   d |j                                         D             }d |j                                        D             }d |j        D             }d |j        D             }t          |j        ||||j        |          S )z
        Updates the partition signature by removing buffers specified in
        V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
        c                @    i | ]\  }}|t           j        j        v||S r   rP   r   r  )r   rp   r  s      rj   r   zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<dictcomp>  s8     
 
 
f17222 &222rl   c                @    i | ]\  }}|t           j        j        v||S r   rX  )r   rp   r  s      rj   r   zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<dictcomp>  s8     
 
 
c17222 #222rl   c                \    g | ])}|                                 t          j        j        v'|*S r   )maybe_get_namerP   r   r  rh  s     rj   r   zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<listcomp>  s>     
 
 
""$$AG,CCC CCCrl   c                8    g | ]}|t           j        j        v|S r   rX  rK  s     rj   r   zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<listcomp>  s2     
 
 
17222 222rl   )r	  r  rQ  r
  r  r5   rS  r  )rh   r  r	  rQ  r
  r  s         rj   .clean_removed_buffer_from_partition_signaturesz8Scheduler.clean_removed_buffer_from_partition_signatures  s    
 
 ) 5 ; ; = =
 
 


 
&9??AA
 
 


 
!.
 
 


 
!0
 
 

 '#$
 
 	
rl   c                   	
 ddl t                      
g g d t          |          D             d fd	d	
fd	}|D ]5}t          |j        j                  
|<   
|         dk    r 	|           6g }d}|t          |          k     rsrr:                              \  }}|                    |            ||           :r:                              \  }}|                    |            ||           :|d
z  }|t          |          k     r|t          |          k    rt          d          |S )a  
        Reorder nodes to minimize the number of partitions via a bfs
        topological sort. This is the optimal reordering such that the
        number of partitions cannot be reduced further. This may be
        sub-optimal for other metrics such as peak memory. This does not
        change relative orders of two cudagraphable nodes, nor the
        relative order of two non_cudagraphable nodes.
        r   Nc                    i | ]\  }}||	S r   r   )r   rw  r\   s      rj   r   z>Scheduler.reorder_for_minimizing_partition.<locals>.<dictcomp>  s    EEEysDsEEErl   r\   rT   rc   r   c                    |          | f}                     |           r                    |           d S                     |           d S rf   )r  heappush)r\   node_with_indexcudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesrh   s     rj   insert_pending_nodeszHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodes  s_    ,T2D9O$$T** E6HHHHH2ODDDDDrl   c                    | j         j        D ]7}|         dk    sJ |xx         dz  cc<   |         dk    r |           8d S )Nr   r    )r   
succ_nodes)r\   	succ_noderg  node_to_indegrees     rj   update_indegreezCScheduler.reorder_for_minimizing_partition.<locals>.update_indegree  su    !]5 4 4	'	2Q6666 +++q0+++#I.!33((333	4 4rl   r    z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                r\   rT   rc   r   )	rd  r  rR  r~   r   
pred_nodesheappopr  r0  )rh   rI  rl  r\   schedule	num_itersrn  rc  rd  rg  rk  re  rf  s   `      @@@@@@rj    reorder_for_minimizing_partitionz*Scheduler.reorder_for_minimizing_partition  s    	9=CEGIEEIe4D4DEEE	E 	E 	E 	E 	E 	E 	E 	E 	E 	E	4 	4 	4 	4 	4 	4 	4  	+ 	+D%()A%B%BT"%**$$T***,.	#e**$$# %': % * &--(?@@4%%%%%% * &
 & &--(;<<4%%%%%% & &
 NI #e**$$# %': % s5zz!!   rl   c           	     `   ddl m}m} t          t          j                                                  } ||| j        | j        t          t          j        j	        
                                          |          \  }}|                     |          } ||||          \  }}	||dz  k     r|S |S )zx
        Reorder nodes to minimize the number of partitions if this only slightly
        increase peak memory.
        r    )estimate_peak_memoryprepare_planning_inforZ  )r  rt  ru  r   rP   r   r  r   r]  r  rr  rr  )
rh   rI  rt  ru  r  default_peak_memoryr  reordered_nodesreorder_peak_memoryrn  s
             rj   r  z0Scheduler.maybe_reorder_for_minimizing_partition  s     	HGGGGGGG"17#;#;#=#=>>:O:O#qw+002233;
 ;
77 ??FF!5!57"
 "
Q
 !4s!:::""rl   c                *   g }g }g }dd}|D ]}|                      |          }|r.t          |j                  dk    r|                    |           G|r! ||          r|                    |           j|                    |           ||z   |z   S )a  
        Reorder a node if it should be partitioned and has simple dependency:
        1. move a partitioned node to the front if it has no dependency
        2. move a partitioned node to the back if it is only used by OutputNode
        3. otherwise do not reorder
        r\   rT   rc   r   c                    |                                  D ]*}|j        D ] }t          |j        t                    s  dS !+dS r   )r   ra   r   r\   r   )r\   r   r   s      rj   only_output_userzPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_user  s\    '')) % %9 % %C%ch
;; %$uuu%% 4rl   r   r  )r  r~   r   r  )rh   rI  frontmiddlebackr{  r\   r  s           rj   r  z6Scheduler.reorder_for_partition_with_simple_dependency
  s     *,*,(*	 	 	 	  	$ 	$D#44T:: $C(?$@$@A$E$ET""""! $&6&6t&<&< $D!!!!d####v~$$rl   9tuple[list[PartitionType], list[GraphPartitionSignature]]c                   g }d}g }g }| j         D ]d}|                     |d          }|r2||k    r,|                    |           |                    |           g }|}|                    |           e|r*|                    |           |                    |           |                     ||          }|                     |           ||fS )z
        Given a list of BaseSchedulerNodes, split into a list of
        graph partitions and compute partition input/output signatures.
        T)r  )r5  r7  )rI  r  r  rU  r  )rh   r5  r  cur_partitionr7  r\   r  r  s           rj   r  zScheduler.graph_partition*  s    +-
')J 	' 	'D#44Td4KK #3C!C!C!!-000&&~666 "-N  &&&& 	3m,,,"">22277!? 8 
 

 	))*555:%%rl   c                    t          d          5  t          j        j        j        r|                                 n|                     | j                  	 cd d d            S # 1 swxY w Y   d S )NzScheduler.codegen)r   r  r  r"   r  _codegen_partitions_codegenrI  rq   s    rj   r  zScheduler.codegenJ  s    -.. 	 	 ?)9/((***]]4:..	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   AA""A&)A&c                   ddl m} t          j        j        }t          | j                  }t          j                                        5  t          j                            dd| ||           | 	                    |           t          t          j        j        |          sJ |                     |          }|t          j        j        _        t          j        j                                         t          j        j                            t          j        j                  \  }}ddd           n# 1 swxY w Y   t          j        j                            |j                   t          j        j                            ||           t          j        j        j                            d |j        D                        dS )z,Codegen a partition given its inputs/outputsr    )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesNc                6    g | ]}|                                 S r   r   rh  s     rj   r   z8Scheduler._codegen_partition_wrapper.<locals>.<listcomp>y  s     @@@T]]__@@@rl   )r  r  rP   r   r   rQ  rq  set_current_wrapper_codeinit_wrapper_coder  r   r]  r  write_prefixgenerateis_inferencedefine_subgraph_launcher_fnr$  codegen_partition_call	allocatedr  r
  )rh   r  r  r  r  graph_partition_idpartition_codern  s           rj   _codegen_partition_wrapperz$Scheduler._codegen_partition_wrapperR  s    	BAAAAAg2!$"?@@W--// 	T 	TG%% ?+=??$7%.	 &    MM)$$$ ag24PQQQQQKKIVVI8AAG 5G --/// ! 4 = =ag>R S SNA-	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T0 	
889MNNN	334F	RRR	&--@@)?@@@	
 	
 	
 	
 	
s   
C!D77D;>D;'contextlib.AbstractContextManager[None]c                P     t           j        d fd            } |            S )Nrc   Iterator[None]c               3  <  K                                    j        r]t          j        j                  rDj        j        
J d            t
          j        j                            j        j                   	 d V  j        r<t          j        j                  r#t
          j        j        	                                 d _        d S # j        r<t          j        j                  r#t
          j        j        	                                 d _        w xY w)Ndevice should have an index)
%update_graph_partition_default_devicerw  rB   rx   r1  rP   r   r   codegen_device_guard_entercodegen_device_guard_exit)r5  rh   r  s   rj   ctxz1Scheduler.use_default_device_context.<locals>.ctx  s@     66z:NNN* /@+00 0  28DD1 EDD $??/5  3. E3D/44 4 E G(BBDDD.2+++	 . E3D/44 4 E G(BBDDD.2+2222s   ?C AD)rc   r  )
contextlibcontextmanager)rh   r5  r  r  s   ``` rj   use_default_device_contextz$Scheduler.use_default_device_context|  sJ     
	"	3 	3 	3 	3 	3 	3 	3 
#	"	3* suurl   c                    t          |          dk    r|d         j        sd S dd}dd
}d }t          ||          D ]\  }}|j        s ||          } n|d S t          ||          D ]\  }}|j        r |||          s d S || _        d S )Nr    r   r  rU   rc   r  c                B    | d                                          }|J |S rt  r   )r  partition_devices     rj   get_cudagraph_partition_devicezWScheduler.update_graph_partition_default_device.<locals>.get_cudagraph_partition_device  s*    (|6688#///##rl   target_devicer   c                J    | D ]}|                                 }||k    r dS  dS r   r  )r  r  r\   r  s       rj   all_on_target_devicezMScheduler.update_graph_partition_default_device.<locals>.all_on_target_device  s>     " ! !**]** 55 +4rl   )r  rU   rc   r  )r  rU   r  r  rc   r   )r~   r  r  rw  )rh   r5  r  r  r  cudagraph_partition_devicer  r  s           rj   r  z/Scheduler.update_graph_partition_default_device  s    z??a
1(D F	$ 	$ 	$ 	$
	 	 	 	 &*"$'
J$?$? 	 	 Iy+ -K-KI-V-V*
 &-F$'
J$?$? 	 	 Iy' 0D0D51 1  &@###rl   c                .   |                                  \  }}t          |          dk    r$dt          |           d}t          |d           |                     ||          5  t	          ||          D ]e\  }}t          |          dk    sJ dt          |                       |j        r|                     |           O|                     ||           f	 ddd           n# 1 swxY w Y   t          | j	                  }t          j        j                            |           |dk    rat          j        j        J |t          t          j        j                  k    s.J d	| d
t          t          j        j                               dS dS )z
        Split nodes into partitions and codegen each partition into separate functions.
        This allows further applying different optimizations (e.g., cudagraph) to
        each function.
        r    zcudagraph partition into z partitionsr   )r  prefixz5Each partition must have at least one node but found Nr   zExpect z partition maps but got )r  r~   rN   r  r  r  r  r  rQ  rq  rP   r   r   set_all_partition_namesr  )rh   r5  r  r  r  r  num_partitionss          rj   r  zScheduler._codegen_partitions  s    "&!5!5!7!7
Jz??QJc*ooJJJC)c"====,,ZDD 		J 		J(+J
(C(C J J$	99~~***\CPYNN\\ +** + JMM),,,,33IyIIIIJ		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J d;<<	44^DDD A7)555!S)?%@%@@@@_.__#agF\B]B]__ A@@ @@s   %A7C))C-0C-c                   t           j        rdd l}t          j                    }t                      }t          |          D ]k}|j        dk    r|j        |j	        j
        j        k    r nC|j        |j        f}||vsJ d|j         d|j         d            |                    |           l| j        | _        | j        r4t           j        j        r#t$          j        j                                         |D ]}t,                              t0          j                  r	 t,                              d|                                |                                           n=# t:          $ r0 t,                              d|                                           Y nw xY w|                     |           |                                x}r|| j        k    s(|                                 s|!                                r| "                                 || j        k    r| j        r<tG          | j        j$                  r#t$          j        j        %                                 || _        tG          |j$                  r:|j&        
J d            t$          j        j        '                    |j&                   || _(        | j)        *                    |j+                   |!                                rd|,                    t[          |.                                                    \  }	}
}| /                    |          0                    |
||	           nN|                                 r1tc          j2        tf          |          }| 4                    |           n	|5                                rtc          j2        tl          |          }| /                    |          }d	d
l7m8} d	dl9m:} tw          |||f          r|}nty          dtI          |                     |=                    |           nptw          |t|          t~          f          r)| /                    |          @                    |           n+tw          |t                    sJ |B                                 t           j        jC        r'| /                    |          D                                 | jE        *                    |F                                           | jG        *                    |H                                           tw          |t                    s\|                                }|F|j$        dk    r;| /                    |          I                                r| "                                 | j        | j        k    rE| j        J tG          | j        j$                  r#t$          j        j        %                                 | "                                 d S )Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0r  r    )CUDACombinedSchedulingr  ztype(self)=r  )Jr"   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r"  rp   filename_dynamoconvert_frame__file__linenorI  rw  r  r  autotune_at_compile_timerP   r   r   write_get_raw_stream_headerr   r  r  r  r  rg   r!  r   r  r   r  r  r  rB   rx   r  r1  r  ru  r  r  r   rW  r   ri  r  codegen_templater  r  r	  r  r  r   codegen.cuda_combined_schedulingr  r  r  r   r/  codegen_combo_kernelr  rs  codegen_noder  r=  debug_sync_kernelcodegen_syncrO  rn  r  rj  ready_to_flush)rh   rI  r  stackr)  framer  r\   r  rT  rU  rV  backend_r  r  rO  s                   rj   r  zScheduler._codegen  s   4 	....+--E7A||D!%   J"222%-*E*NNNE~u|4$JU^ J Jel J J J '
 "9& 	?6=+Q 	?G <<>>> L	! L	!D.. 
IIO2244   
 !   IIP     t$$$***v Vd111~~'' 2'')) 2 JJLLLT000* I/@+00 0 I ,FFHHH*0D'(55 V%|779V777,GGUUU $D%,,T_===!!  484W4W))**5 51-   ((99!8X    !!  {#<dCC((....""  {#=tDD++F33TTTTTT888888h9O(PQQ ;&GG()9DJJ)9)9:::,,T2222D#5}"EFF    ((55d;;;;!$(>?????}. 8  ((55777'..t/D/D/F/FGGG%,,T-E-E-G-GHHHd$:;; !**&v--((00??AA . JJLLL$"=== &222 !4!9:: A $>>@@@

s   A E7FF(tuple[float, float, list[Optional[str]]]c                    |d                                          }| t          j        _        || _        |J |                     |          }|                    |          S )rK  r   )r   rP   r   r[   r  r  benchmark_combo_kernel)rh   r  r  rO  s       rj   r  z Scheduler.benchmark_combo_kernel[  s^     1((** $!!!""6**--i888rl   c                z   t           j        sdS |}|d                                         }||j        dk    rdS ddlm} dg }}t          |          D ]\  }}|                                }	|                     |	          rt          
                    d           	 |                     |	          \  }
}t          j        |
          rt          
                    d|            d	S n@# |$ r8}d
t          |          v r!t          
                    d           Y d}~ dS  d}~ww xY w||
z  }|                    |           	 |                     |          \  }}}n?# |$ r7}d
t          |          v r t          
                    d           Y d}~dS  d}~ww xY w||z
  dk     p|dk     }t                              t"          j                  rc||k    s|r.t          
                    dt'          ||z  d                     n-t          
                    dt)          ||z  d                     ||z
  |k     p|S )r  Tr   Nr  r  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr  zCComboKernel benchmark: return True because of loop-carried variableg333333?z/can fuse (benchmark): fusing causes %sx speedupr  z3cannot fuse (benchmark): fusing causes %sx slowdown)r"   r  r   rx   r  r  rR  ri  r  r  r  rL  r  r  rd   r  r  r  r  r<   r=   )rh   rI  subkernel_nodesr  r  r  
path1_listrP  rg  r  r4  r  r5  r  	ms2_clone_path2_listsmall_kernels                    rj   r  z!Scheduler.speedup_by_combo_kerneli  s   
 , 	4 #..00 >V[E114;;;;;;rZ!/22 	$ 	$HAu))I ##I..   R  55i@@D:b>> !$$U   !55! $   *c!ff44$$]    444444 2ICd####
	*.*E*Eo*V*V'CKK 	 	 	&#a&&00  Y   ttttt	 Y,9c	""7=11 
	SyyLy  E#)1122   
   Ic	//00  
 Y$44s=   AC''D$,+DDD$E F"+FFFr  	ir.Layoutc                `    | j         |         }|j        J |j                                        S rf   )r   r\   
get_layout)rh   r  r   s      rj   get_buffer_layoutzScheduler.get_buffer_layout  s1    x(x###x""$$$rl   c                   | j         D ]}|                                r|j        j        D ]}t          j        j                            |j                  }|rut          |          dk    rbt          |j        t          t          f          sA|                                g k    r)t          j        j                            |j                   d S r  )rI  rJ   r   r   rP   r   rJ  r7  rp   r4   r   rz   r8   r7   r	  zero_dim_cpu_tensor_listrI  )rh   r\   r  r  s       rj   rv  z$Scheduler.update_zero_dim_cpu_tensor  s    J 	H 	HD{{}} H ,2 
H 
HDW377	BBFH+F33u<< *"MJ8I+J! ! = #OO--338<<TYGGG	H 	Hrl   )rI  rI  rc   r   )rc   r  r   )r  r   rc   r   r   )r  rd   rc   r   )r\   r   rc   rT   r   )rg  rT   rc   rJ  )rc   r  rI  r{  rc   rI  rf   rI  r{  rP  r   rQ  rR  rc   rd   )rV  r   r  r  rc   rI  )r  r{  rc   r   )r  rT   r  rT   rc   r  )r\   rT   rc   rT   )rY  rR  rc   r   r  )rI  rJ  rc   r  r  rT   r  rT   rc   r   )r  rT   r  rT   r,  rm   rc   r   )r  rT   r  rT   r7  r8  rc   rd   r  rT   r  rT   rc   rm   r  )rX  rT   rU  rT   r  r  rc   r   )r  rT   r  rT   rc   rh  )r  r0   r  rT   r  rT   rc   r   )r  r-   r_  r.   rc   r   )r   r-   rc   rm   )r  r  rc   r  )rI  r  rc   r   )r  r	  rc   r   )r  r  rc   r  )r  r   rc   r  rm  )rp   rd   r  r3  rc   r   r  )r\   rT   r  r   rc   r   )rc   r  )r  r  rc   r   )r  rU   r	  r  rc   r  )r5  r6  r7  r8  rc   r  )r  r5   rc   r5   )rc   r  )r  rU   r  r5   rc   r   )r5  r6  r  r  rc   r  )r5  r6  r  r  rc   r   r  r{  rc   r  )rI  rJ  rc   r   )r  rd   rc   r  )Qry   r   r   rB  r   rK  rx  propertyr  setterr  r  rO  r  rS   r  r|  r{  r3  r  r}  r  r
  r  rL  rU  rX  r  r  r  r  rE  r  ra  r  r  r+  r2  r6  rI  rV  r  rg  r}  r  r  r  r  r0  r"  r  r  r  r  r  r  r  r  r  r  r  r   r  r4  rU  r]  rr  r  r  r  r  r  r  r  r  r  r  r  r  rv  r  r  s   @rj   rZ   rZ     s        
   m
 m
 m
 m
 m
 m
^	# 	# 	# 	# & & & X& ( ( ( (7 7 7 7# # # #, , , ," " " "HsP sP sP sPjK K K KZ(# (# (# (#T   6S S S S(   4# # # #&$ $ $ $6   :	 	 	 	8 8 8 8, (,	    &
> 
> 
> 
>w@ w@ w@ w@r
 
 
 
s( s( s( s(j	> > > >h h h hT.. .. .. .. ..`? ? ? ?4  4  4  4 l, , , ,\7 7 7 7rM M M M$ $ $ $6< < < <|UK UK UK UKn
 
 
 
9 9 9 9v` ` ` `DXM XM XM XMt3 3 3 3j
 
 
 
J       D. . . .J J J J<6 6 6 6@4 4 4 4	8 	8 	8 	8* * * *4   
   ' ' ' '*% % % %5 5 5 5$
 
 
 
 ;@B B B B BH    ' ' ' 'ReH eH eH eHNI  I  I  I V"
 "
 "
 "
H? ? ? ?B   >% % % %@& & & &@   (
 (
 (
 (
T   6-A -A -A -A^   Br r r rh9 9 9 9I5 I5 I5 I5V% % % %
H H H H H H H Hrl   c                       e Zd Zd5 fdZd6dZd7d
Zd8dZd8dZd8dZd9dZ	d:dZ
d;dZ	 d<d=d$Zd>d'Zd6d(Zd?d)Zd6d*Zd@d,ZdAd/ZdBd1ZdCd4Z xZS )Dr  r[   Optional[Scheduler]c                V    t                                                       || _        d S rf   )r  r   r[   )rh   r[   r  s     rj   r   zBaseScheduling.__init__  s$    "rl   rc   r   c                J    | j         r| j                                          d S d S rf   )r[   r  rq   s    rj   free_buffers_in_schedulerz(BaseScheduling.free_buffers_in_scheduler  s0    > 	*N'')))))	* 	*rl   r  r  OrderedSet[BackendFeature]c                    t                      S )z0Return a set of .codegen.common.BackendFeature()r   r  s     rj   get_backend_featuresz#BaseScheduling.get_backend_features  s    ||rl   r  rT   r  r   c                    t           )zO
        Check whether node1 and node2 can be vertically fused or not.
        r  r  s      rj   r  z BaseScheduling.can_fuse_vertical  
     "!rl   c                    t           )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        r  r  s      rj   r  z"BaseScheduling.can_fuse_horizontal  r  rl   c                    dS )au  
        A Multi-Output Template (referenced in #144012) is a template node
        with MultiOutputLayout, and its output buffers are instances of MultiOutput.
        In this context, we verify whether node1 represents the Multi-Output Template
        and node2 corresponds to one of its outputs. If so, we further check if
        backend supports this fusion.
        Fr   r  s      rj   r  z.BaseScheduling.can_fuse_multi_outputs_template  s	     url   r  c                    |                                 s|                                 rt                              ||          S t                              ||          S )z 
        Fuse two nodes
        )r  r  r`  r  r  s      rj   r`  zBaseScheduling.fuse  sW      	9!1!1!3!3 	9-225%@@@%**5%888rl   r  r  "tuple[tuple[sympy.Expr, ...], ...]c                    t           )z[
        Process the iteration sizes in case a transformation needs to be applied.
        r  )rh   r  s     rj   r  zBaseScheduling.group_fn  r  rl   rU  epilogue_nodesr{  r  Optional[str]c                    t           )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        r  )rh   rU  r  r  s       rj   r  zBaseScheduling.codegen_template  s
     "!rl   NrI  rP  rQ  rR  rd   c                    t           zD
        Generate a kernel given a list of pre-fused nodes.
        r  )rh   rI  rP  rQ  s       rj   rU  z.BaseScheduling.generate_kernel_code_from_nodes  s
     "!rl   r\   (Union[FusedSchedulerNode, SchedulerNode]c                    t           r  r  r   s     rj   r  zBaseScheduling.codegen_node  
     "!rl   c                    t           )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        r  rq   s    rj   r  zBaseScheduling.codegen_sync$  r  rl   c                    dS )z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        Fr   rq   s    rj   r  zBaseScheduling.ready_to_flush*  s	    
 url   c                    t           )z]
        Flush the generated kernel and python wrapper code to the source code file.
        r  rq   s    rj   r  zBaseScheduling.flush1  r  rl   rI  c                    t           )rK  r  rL  s     rj   rL  z$BaseScheduling.benchmark_fused_nodes7  
     "!rl   rV  r   c                    t           )z
        Benchmark a compiled module and return the execution time
        in milliseconds on randomly generated inputs.
        r  )rh   rV  s     rj   rX  z)BaseScheduling.benchmark_codegened_module@  s
    
 "!rl   rm   c                    dS )z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   r   r  s      rj   r  z'BaseScheduling.get_fusion_pair_priorityG  s	     qrl   r  r  c                    t           )z
        Benchmark the list of nodes to combine and return the execution time
        and memory copy time in milliseconds on randomly generated inputs.
        r  r  s     rj   r  z%BaseScheduling.benchmark_combo_kernelP  r  rl   )r[   r  r   )r  r  rc   r  r  r  )r  r  rc   r  )rU  rT   r  r{  r  r{  rc   r  rf   r  )r\   r  rc   r   r   r  )rV  r   rc   rI  r  r  )ry   r   r   r   r  r  r  r  r  r`  r  r  rU  r  r  r  r  rL  rX  r  r  r  r  s   @rj   r  r    s       # # # # # #* * * *   " " " "" " " "
 
 
 
	9 	9 	9 	9" " " "" " " "$ (,		" 	" 	" 	" 	"" " " "" " " "   " " " "" " " "" " " "   " " " " " " " "rl   r  )rc   rc  )rg  rT   rc   rd   )rg  rT   rc   rx  )rg  rT   rc   r   )r  r   rc   rd   )r\   rT   r]  r^  r   r  rc   r   )rC  rD  rc   r   )rC  rD  r[   rZ   r  rJ  rc   r   )r   )r  r  r  r  r  r  rc   r  r   )
__future__r   r  r  r   r4  r  r@  r  r  r  r  r  r  r  r  r   r   r   r   r   r	   r
   r   r   typing_extensionsr   r   collections.abcr   r   typesr   r#  r  torch._inductor.async_compiletorch.utils._pytreer  _pytreert  torch._dynamo.utilsr   r   torch._inductor.codecacher   r   torch._inductor.irr   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._ordered_setr   torch.utils._sympy.symbolr   r   r   torch.utils._tritonr   r   r!   r"   r#   r$   r%   r&   analyze_preserves_zero_maskr'   codegen.commonr(   r)   r*   comm_analysisr+   r,   r-   r.   r/   r0   excr1   r2   fx_utilsr3   r4   r5   r6   r7   r8   	loop_bodyr9   r  r:   r;   runtime.runtime_utilsr<   r=   r  r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   virtualizedrP   	getLoggerry   r   _logginggetArtifactLoggerr  r  r  r   rU   r   rV   rW   	dataclassrY   r   rT   r2  r(  r'  r  r-  r  r|   r   r`  r	  r  rs  rO  rX  r  r  rW  r9  r;  rp  ro  rG  rZ   r  r   rl   rj   <module>r     s   " " " " " " "                         				        , , , , , , , , R R R R R R R R R R R R R R R R R R 2 2 2 2 2 2 2 2  !22222222         $ $ $ $ $ $ $ $ $ $ $ $ $ 6 6 6 6 6 6 6 6 ? ? ? ? ? ? ? ? 7 7 7 7 7 7 M M M M M M M M > > > > > > / / / / / / O O O O O O O O O O * * * * * * D D D D D D D D D D D D D D D D D D D D D D M M M M M M M M M M        ; : : : : : : : : : : : 2 2 2 2 2 2 2 2 $ $ $ $ $ $                     J J J J J J J J 7 7 7 7 7 7 7 7 & & & & & &                                     &       g!!^--hAA
N44XOO  >;;$     34 4 4 4 4WT]]Yt__ h8 h8 h8 h8 h8 h8 h8 h8V 4 4 4 4 4_ 4 4 4b1 b1 b1 b1 b1 b1 b1 b1J 2 2 2 2   (' ' ' '    <
 
 
 
 
 
 
 
,           &K &K &K &KRW W W W W 1 W W W"5 5 5 5 5. 5 5 5k* k* k* k* k*% k* k* k*\	   $   ,l* l* l* l* l** l* l* l*^~: ~: ~: ~: ~:!3 ~: ~: ~:B
_ _ _ _ _, _ _ _J #%+ + + + +\ 
 
 
 
 
 
 
 
> +9?,, 4 4 4 4\4H \4H \4H \4H \4H \4H \4H \4H~hN" N" N" N" N" N" N" N" N" N"rl   