
    `i5                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlmZmZ d dlmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlmc mZ d dlmZ d dl m!Z!m"Z" d dl#m$Z$ d d	l%m&Z& d d
l'm(Z( d dl)m*Z*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z; ddl<m=Z= ddl>m?Z? ddl;m@Z@mAZA ddlBmCZC ddlDmEZE ddlmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQ ddlRmSZS ddlTmUZUmVZVmWZWmXZXmYZYmZZZ ddl[m\Z\ ddl]m^Z^m_Z_m`Z` erd dlambZbmcZc d dldZdddlemfZf ddlgmhZh  eji        ej          Zk eX            jl        Zmenejo        ej        epeqf         Zree;js        eYf         Zted gdf         Zudld%Zvdmd(Zwexepeyf         Zzeeneeyej        f         d)f         eezgeneyd)f         f         f         Z{	 	 dndod5Z|dpd6Z}ej~         G d7 d8                      Z G d9 d:          Z G d; d           Zej~         G d< d=e                      Zej~         G d> d?e                      Zej~         G d@ dAe                      Zej~         G dB dCe                      Z G dD dEe          Zej~         G dF dGe                      Zej~         G dH dIe                      Zej~         G dJ dKe                      Zej~         G dL dMe                      Zej~         G dN dOe                      Zej~         G dP dQe                      Z G dR dS          Zej~         G dT dUe                      Zej~         G dV dWe                      Zej~         G dX dYe                      Zej~         G dZ d[e                      Z G d\ d]e          Zej~         G d^ d_e                      Zej~         G d` dae                      Zej~         G db dce                      Zej~         G dd dee                      Zej~         G df dge                      ZepZeeeMf         Z G dh dieV          Z G dj dke          ZdS )q    )annotationsN)chaincount)AnyCallableOptionalTYPE_CHECKINGUnion)Expr)dtype)countersdynamo_timed)DebugPrinterManager)MultiKernelState)	cache_dir)CallMethodKeyConvertIntKeyDivideByKeyresolve_unbacked_bindingsSymTypes)_get_qualified_name)
OrderedSet)SingletonInt)symbol_is_typeSymT   )async_compileconfigir)output_code_log)'set_kernel_post_grad_provenance_tracing)IRNodeReinterpretView)triton_heuristics)DeviceProperties)cache_on_selfDelayReplaceLineget_benchmark_nameget_dtype_sizeIndentedBuffer#is_codegen_graph_partition_subgraphis_using_cudagraph_partitionLineContextsympy_product	sympy_str
sympy_substriton_version_uses_attrs_dict)V   )ArgNameCodeGenDeferredLinePythonPrinterWorkspaceArgWorkspaceZeroMode)cexpr)	config_ofshould_unwrap_unspec_argsignature_to_meta)IteratorSequence)GraphLowering)FxConverterWrapperLinenode
BufferLikereturnReuseKeyc                <   t           j                            |           }|                                 t           j        j        v}|                                 |                                 t          t           j        j        	                    |                    |fS N)
r2   graphget_allocation_storage_sizeget_nameunaligned_buffersget_device_or_error	get_dtyper/   sizevarssimplify)rC   storage_size	alignments      s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_inductor/codegen/wrapper.pybuffer_reuse_keyrT   ]   sw    766t<<Lqw'@@I  "" 	!'"++L99::     	input_buf
output_bufc                   |                                  |                                 k    rdS |                                 |                                k    rdS t          j        j                            t          j                            |                     }t          j        j                            t          j                            |                    }t          |          t          |          k    sMt          j        j                            |d|z            r't          j        j        	                    ||          rdS dS )NFgffffff?T)
rM   rN   r2   rI   rO   rP   rJ   r/   statically_known_geqstatically_known_leq)rV   rW   
input_sizeoutput_sizes       rS   can_match_buffer_sizer]   k   s    $$&&**H*H*J*JJJu
 4 4 6 666u!**	++I66 J '"++	++J77 K 	*;!7!777 	
--k4*;LMM 	8 G11+zJJ	 	8 t5rU   .namestrconfigslist[triton.Config]gridslist[TritonGrid]wrapperOptional[PythonWrapperCodegen]original_fxnode_nameOptional[str]tuple[str, str]c                    t                      dd	 d d!fd
}d d" fd}d  } |d| d           r*t          j        j        rj                                        nt          j                    }                                5  |5  t          j        j        r>|r<t          j	        j
        r+|t          j	        j
        v rt          j	        j
        |         }	nd gt          |          z  }	t          |          dk    r/ ||d         |	d                   \  }
} |d|
 d|            nt          |          dk    sJ t          |          t          |          k    sJ t                      }t          t          |||	          d d          D ]\  }
}}g }|j        r4|j        D ],}|dvr&|                    d| d|j        |                     -|rd                    |          }nd} ||
|          \  }
}d| d|
 }||v ry|                    |            ||d| d|            d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |                                fS )#NitemUnion[int, sympy.Expr]rE   
sympy.Exprc                b    t          | t          j                  r| nt          j        |           S rH   )
isinstancesympyr   Integer)rj   s    rS   _convert_to_sympy_exprz@user_defined_kernel_grid_fn_code.<locals>._convert_to_sympy_expr   s'    !$
33Lttt9L9LLrU   grid
TritonGridexample_gridOptional[TritonGrid]c                   t          |           r| | fS t          fd| D                       }|s|}                    |          t          j        j        r.                    t          fd|D                                 ndfS )a'  
        This function return a tuple of two values: the first one is for the real grid
        which is used in the generated code; the second one is an example grid with
        concreate values which is used in the autotune block to run the generated
        kernels at compile time.
        Nc              3  .   K   | ]} |          V  d S rH    ).0grq   s     rS   	<genexpr>zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s/      CC11!44CCCCCCrU   c              3  ^   K   | ]'}                     |t          |                    V  (d S rH   generate_example_arg_valuetype)ry   rz   rd   s     rS   r{   zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   sM          ::1d1ggFF     rU   )callabletuplecodegen_python_shape_tupler   tritonautotune_at_compile_time)rr   rt   
sympy_gridrq   rd   s      rS   determine_gridz8user_defined_kernel_grid_fn_code.<locals>.determine_grid   s     ?htnn?:CCCCdCCCCC
 	&%L..z:: =922    !-       
 	
rU   liner_   rg   c                                         |            r8t          j        j        r)j        vr"j                             |p|            d S d S d S d S rH   )	writeliner   r   r   kernel_autotune_nameskernel_autotune_calls)r   rt   r^   outputrd   s     rS   r   z3user_defined_kernel_grid_fn_code.<locals>.writeline   s    	J6	J G999)33L4HDIIIII	J 	J 	J 	J :9rU   grid_wrapper_for_def z(meta):r3   r   zreturn c                6    t          | d         j                  S Nr3   lenkwargsxs    rS   <lambda>z2user_defined_kernel_grid_fn_code.<locals>.<lambda>   s    c!A$+.. rU   Tkeyreverse)matrix_instr_nonkdimwaves_per_eukpackzmeta['z'] == z and Trueif z	: return )rj   rk   rE   rl   rH   )rr   rs   rt   ru   )r   r_   rt   rg   )r*   r   r   r   r   indent
contextlibnullcontextr2   rI   autotuning_gridsr   r   sortedzipr   appendjoinaddgetvalue)r^   r`   rb   rd   rf   r   r   fn_namekernel_autotune_calls_indentexample_gridsrr   rt   seenc
guardslistkwargguards	statementrq   r   s   `  `              @@rS    user_defined_kernel_grid_fn_coder      s    FM M M M
 .2
 
 
 
 
 
 
 
>J J J J J J J J J )$((GI%W%%%&&& 	&}=	&%,,...#%% !
 
 .L .L6 .L .LM2	0$	0 (	0 %(@@@G45IJMM!FSZZ/Mu::??!/a-:J!K!KD,I&&&(@,(@(@AAAAu::>>>>u::W----$.LLD *0E7M22..* * * L L%a
  
8 W!" W W  )  
 '--.Uu.U.UAHUO.U.UVVV $$\\*55FF#F%3^D,%G%G"l9&99499	$$###	)%J6%J%JL%J%JKKKK].L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L .L` FOO%%%%s7   I#F-I I#I	I#I	I##I'*I'c                    t                                          | j        d           ddlddlm ddlm t          | j        g          fd |            	                                S )zg
    Given a triton kernel function pointer collect the transitive closure of
    its dependencies
    Tstripr   N)JITFunction)	constexprc           	        t          d t          j        | j                  D                       }| j        j                            di           }| j        j        j        D ]}|v r|| j        j        v rw| j        j        |         }t          |          rf		                                 	
                    d           	                    |j        d                               |            |           t          d          rt          |j        j        j                  rg		                                 	
                    d           	                    |j        d                               |            |           5t          |t$          t&          t(          
f          rŉ		                                 t          |
          rd|j        d	}n|}|                    |          x}rJt          |t,                    rd
|j         d|j         }nd
|}	
                    | | d|            n	
                    | d|                                |           ||v rm|dk    rgt          |d          rW|j                            d          r=	
                    d|j         d|j         d|                                |           d S )Nc              3  :   K   | ]}|j         d k    |j        V  dS )LOAD_GLOBALN)opnameargval)ry   insts     rS   r{   z^user_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse.<locals>.<genexpr>  s=       '
 '
{m++ K++++'
 '
rU   __annotations__z@triton.jitTr   constexpr_functionz@triton.constexpr_functionztl.constexpr(): . = tl
__module__r   zfrom z import z as )r   disBytecodefn__globals__get__code__co_namesrn   newliner   splicesrcr   hasattrruntimejitConstexprFunctionintr_   boolvaluer   r   __name__
startswith)
cur_kernelunqualified_loadsglobal_annotationssymbol_namesymbol
symbol_str
annotationannotation_coder   compile_wrapperr   symbols_includedtraverser   s           rS   r   zKuser_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse  s   
 ' '
 '
Z]33'
 '
 '
 
 

 (]6::;LbQQ%=1: 5	6 5	6K...jm777#2;?fk22 06#++---#--m<<<#**6:T*BBB$((555HV$$$$V%9:: *6zFN.@@ @ *6 $++---#--.JKKK#**6:T*BBB$((555HV$$$$c4(CDD "6#++---!&)44 3%FV\%F%F%F

(.]
%7%;%;K%H%HHz S%j$77 B RZ%: R RZ=P R R ,O /B:.A.AO'11*LOLL
LL    (11[2Q2QZ2Q2QRRR$((5555#444#t++55 ,
 )44X>> , $--] 1]]6?]]P[]]   %((555k5	6 5	6rU   )
r*   r   r   r   r   triton.languager   r   r   r   )kernelr   r   r   r   r   r   s    @@@@@@rS   9user_defined_triton_kernel_transitive_closure_source_coder      s    
 %&&O6:T222 MMM"""""")))))) "6?"344@6 @6 @6 @6 @6 @6 @6 @6 @6 @6D HV##%%%rU   c                  *    e Zd ZU ded<   ded<   d ZdS )SymbolicCallArgsympy.Symbolinnerrl   
inner_exprc                *    t          | j                  S rH   )r_   r   selfs    rS   __str__zSymbolicCallArg.__str__[  s    4:rU   N)r   r   __qualname__r   r   rx   rU   rS   r   r   U  s=             rU   r   c                  6     e Zd Z fdZddZddZddZ xZS )MemoryPlanningStatec                    t                                                       t          j        t                    | _        d| _        d S Nr   )super__init__collectionsdefaultdictlist
reuse_pooltotal_allocated_buffer_size)r   	__class__s    rS   r   zMemoryPlanningState.__init__`  s>    #D)) 	 12(((rU   r   rF   rE   r   c                R    t          | j                            |d                     S rH   )r   r   r   )r   r   s     rS   __contains__z MemoryPlanningState.__contains__g  s"    DO''T22333rU   FreeIfNotReusedLinec                V    | j         |                                         }|j        rJ |S rH   )r   pop	is_reusedr   r   rj   s      rS   r  zMemoryPlanningState.popj  s-    s#''))>!!!rU   rj   Nonec                X    |j         rJ | j        |                             |           d S rH   )r  r   r   r  s      rS   pushzMemoryPlanningState.pusho  s1    >!!!##D)))))rU   )r   rF   rE   r   )r   rF   rE   r   )r   rF   rj   r   rE   r  )r   r   r   r   r   r  r  __classcell__r   s   @rS   r   r   _  st        2 2 2 2 24 4 4 4   
* * * * * * * *rU   r   c                      e Zd ZddZdS )rB   	converterrA   rE   FxConversionFuncc                     t          d          )Nz2FX codegen not yet supported for type {type(self)})NotImplementedErrorr   r
  s     rS   
codegen_fxzWrapperLine.codegen_fxu  s    !"VWWWrU   Nr
  rA   rE   r  r   r   r   r  rx   rU   rS   rB   rB   t  s.        X X X X X XrU   c                  <    e Zd ZU ded<   ded<   ddZdd
ZddZdS )EnterSubgraphLinePythonWrapperCodegenrd   r@   rI   rE   r  c                N    | j                             | j         j                   d S rH   )rd   push_computed_sizescomputed_sizesr   s    rS   __post_init__zEnterSubgraphLine.__post_init__~  s#    (()DEEEEErU   coder*   c                l    | j                             | j                   |                                 d S rH   )rd   push_codegened_graphrI   	do_indentr   r  s     rS   codegenzEnterSubgraphLine.codegen  s0    ))$*555rU   r
  rA   r  c                    |j         S rH   )_generate_enter_subgraphr  s     rS   r  zEnterSubgraphLine.codegen_fx  s    11rU   NrE   r  r  r*   rE   r  r  r   r   r   r   r  r  r  rx   rU   rS   r  r  y  sn         !!!!F F F F   2 2 2 2 2 2rU   r  c                  :    e Zd ZU ded<   ddZedd            ZdS )CommentLiner-   r   r  r*   rE   r  c                :    |                     | j                   d S rH   )r   r   r  s     rS   r  zCommentLine.codegen  s    ty!!!!!rU   r
  rA   r  c                    | j         S rH   )_generate_comment)r
  s    rS   r  zCommentLine.codegen_fx  s    **rU   Nr"  r  )r   r   r   r   r  staticmethodr  rx   rU   rS   r%  r%    sV         " " " " + + + \+ + +rU   r%  c                  2    e Zd ZU ded<   ddZddZddZdS )ExitSubgraphLiner  rd   rE   r  c                L    | j                                         | j         _        d S rH   )rd   pop_computed_sizesr  r   s    rS   r  zExitSubgraphLine.__post_init__  s     &*l&E&E&G&G###rU   r  r*   c                `    | j                                          |                                 d S rH   )rd   pop_codegened_graphdo_unindentr  s     rS   r  zExitSubgraphLine.codegen  s.    ((***rU   r
  rA   r  c                    |j         S rH   )_generate_exit_subgraphr  s     rS   r  zExitSubgraphLine.codegen_fx  s    00rU   Nr!  r"  r  r#  rx   rU   rS   r+  r+    sb         !!!!H H H H   1 1 1 1 1 1rU   r+  c                  4    e Zd ZU ded<   ded<   dd	ZddZdS )EnterDeviceContextManagerLiner   
device_idxOptional[int]last_seen_device_guard_indexr  r*   rE   r  c                $   t           j        j        r|                    d           t           j        j        r^| j        ;|                    t           j        j                                         d           d S | j        | j        k    s
J d            d S | j        C|                    t           j        j        	                                 d| j         d           d S |                    d| j         d           d S |                    dt           j        j        
                    | j                   d           |                                 |                    t           j        j                            | j                             d S )	N
z) stream_guard(stream, this->device_idx_);z4AOTInductor only supports running on one CUDA devicez device_guard(z);zdevice_guard.set_index(with :)r2   rI   cpp_wrapperr   aot_moder7  
device_opscpp_aoti_stream_guardr5  cpp_aoti_device_guarddevice_guardr  
set_devicer  s     rS   r  z%EnterDeviceContextManagerLine.codegen  s   7 	KNN4   w R 4<NN7-CCEEppp      <OOON POOOO 4<NN7-CCEEhhUYUdhhh     NN#PT_#P#P#PQQQQQ NNV17#5#B#B4?#S#SVVVWWWNNNN17-88IIJJJJJrU   r
  rA   r  c                    |j         S rH   )&_generate_enter_device_context_managerr  s     rS   r  z(EnterDeviceContextManagerLine.codegen_fx  s    ??rU   Nr"  r  r   r   r   r   r  r  rx   rU   rS   r4  r4    s]         OOO////K K K K:@ @ @ @ @ @rU   r4  c                      e Zd ZddZdd	Zd
S )ExitDeviceContextManagerLiner  r*   rE   r  c                T    t           j        j        s|                                 d S d S rH   )r2   rI   r<  r0  r  s     rS   r  z$ExitDeviceContextManagerLine.codegen  s1    w" 		 	rU   r
  rA   r  c                    |j         S rH   )%_generate_exit_device_context_managerr  s     rS   r  z'ExitDeviceContextManagerLine.codegen_fx  s    >>rU   Nr"  r  r   r   r   r  r  rx   rU   rS   rG  rG    s<           ? ? ? ? ? ?rU   rG  c                  4    e Zd ZU ded<   ded<   dd	ZddZdS )ExternKernelAllocLiner  rd   ir.ExternKernelAllocrC   r  r*   rE   r  c                    | j         }g |                                |                                }| j                            | j         |           d S rH   )rC   codegen_argscodegen_kwargsrd   $_generate_extern_kernel_alloc_helper)r   r  rC   argss       rS   r  zExternKernelAllocLine.codegen  sP    y=""$$=t':':'<'<=99$)TJJJJJrU   r
  rA   r  c                    |j         S rH   )_generate_extern_kernel_allocr  s     rS   r  z ExternKernelAllocLine.codegen_fx  s    66rU   Nr"  r  rE  rx   rU   rS   rM  rM    sZ         !!!!K K K K
7 7 7 7 7 7rU   rM  c                  4    e Zd ZU ded<   ded<   dd	ZddZdS )ExternKernelOutLiner  rd   ir.ExternKernelOutrC   r  r*   rE   r  c                J   | j         }g |                                |                    d          }|                                }t          j        j        r|j        dk    rd}n|                                }|                                x}r|j	        nt          j        j
        }d }t          j        j        dk    rt          ||d          }| j                            ||                                |j        r|j                                        nd |||           d S )NT)skip_outztorch::inductor::_mm_plus_mmaoti_torch__mm_plus_mm_outr   )	is_extern)rC   rP  rQ  get_kernel_namer2   rI   r<  cpp_kernel_name
get_devicer   device_typer   traceprovenance_tracking_levelr!   rd   "_generate_extern_kernel_out_helpercodegen_referenceoutput_view)r   r  rC   rS  kernel_nameddeviceprovenance_debug_handles           rS   r  zExternKernelOutLine.codegen  s2   yJ""$$Jt':':D':'I'IJ**,,G	1$(FFF 7KK..00K!%!2!22AL9L15<1Q66&MkT' ' '# 	77""$$484DND..000$#	
 	
 	
 	
 	
rU   r
  rA   r  c                    |j         S rH   )_generate_extern_kernel_outr  s     rS   r  zExternKernelOutLine.codegen_fx      44rU   Nr"  r  rE  rx   rU   rS   rW  rW    sV         !!!!
 
 
 
85 5 5 5 5 5rU   rW  c                  4    e Zd ZU ded<   ded<   dd	ZddZdS )FreeLiner  rd   %Union[BufferLike, ir.TorchBindObject]rC   r  r*   rE   r  c                    | j                                         t          j        j        vsJ |                    | j                            | j                              d S rH   )rC   rK   r2   rI   removed_buffersr   rd   make_buffer_freer  s     rS   r  zFreeLine.codegen  sP    y!!##17+BBBBBt|44TY??@@@@@rU   r
  rA   r  c                    |j         S rH   )_generate_freer  s     rS   r  zFreeLine.codegen_fx      ''rU   Nr"  r  rE  rx   rU   rS   rn  rn    sZ         !!!!////A A A A( ( ( ( ( (rU   rn  c                      e Zd ZU ded<   ded<   ded<   ded<   ded<   d	ed
<   ded<   ded<   ded<   ded<   ded<   ddZddZdS )KernelCallLiner  rd   r_   rf  ztuple[Any, ...]	call_argsraw_keysraw_args	list[str]	arg_typesr   r   zdict[str, Any]triton_metaztorch.devicerh  
graph_namerf   r  r*   rE   r  c                    | j                             | j        | j        | j        | j        | j        | j        | j        | j	        | j
        | j        
  
         d S )N)r   r|  ry  rz  r}  rh  r~  rf   )rd   _generate_kernel_call_helperrf  rx  r   r|  ry  rz  r}  rh  r~  rf   r  s     rS   r  zKernelCallLine.codegen!  s`    11N;n]](;!%!: 	2 	
 	
 	
 	
 	
rU   r
  rA   r  c                    |j         S rH   )_generate_kernel_callr  s     rS   r  zKernelCallLine.codegen_fx/      ..rU   Nr"  r  rE  rx   rU   rS   rw  rw    s         !!!!LLLOOO
 
 
 
/ / / / / /rU   rw  c                  h    e Zd ZU ded<   ded<   ded<   dZded<   d	Zd
ed<   dZded<   ddZddZdS )KernelDefinitionLiner  rd   r_   rf  kernel_bodyNrg   metadataTr   gpucpp_definitionr  r*   rE   r  c                v    | j                             | j        | j        | j        | j        | j                   d S N)r  r  r  )rd   _define_kernel_helperrf  r  r  r  r  r  s     rS   r  zKernelDefinitionLine.codegen<  sG    **]. 	+ 	
 	
 	
 	
 	
rU   r
  rA   r  c                    |j         S rH   )_generate_kernel_definitionr  s     rS   r  zKernelDefinitionLine.codegen_fxE  rl  rU   r"  r  )	r   r   r   r   r  r  r  r  r  rx   rU   rS   r  r  3  s         !!!!"H""""C$(N((((
 
 
 
5 5 5 5 5 5rU   r  c                  2    e Zd ZU ded<   ddZdd
ZddZdS )MemoryPlanningLiner  rd   stater   rE   c                    | S )zFirst pass to find reuserx   r   r  s     rS   planzMemoryPlanningLine.planM  s    rU   r  r*   r  c                    dS )zSecond pass to output codeNrx   r  s     rS   r  zMemoryPlanningLine.codegenQ  s      rU   r_   c                \   g }t          j        |           D ]i}|j        dk    rt          | |j                  }|                    |j         d|j        t          j        u r|                                n|            jt          |           j	         dd
                    |           dS )zF
        Emits a string representation that fits on one line.
        rd   =(, r   )dataclassesfieldsr^   getattrr   r   r   BufferrK   r   r   )r   rS  fieldvals       rS   r   zMemoryPlanningLine.__str__T  s      '-- 	 	EzY&&$
++CKK:TT%*	2I2IsTT    t**%::		$::::rU   Nr  r   rE   r  r"  rE   r_   )r   r   r   r   r  r  r   rx   rU   rS   r  r  I  s^         !!!!   ) ) ) ); ; ; ; ; ;rU   r  c                  ,    e Zd Zd ZddZddZddZdS )EfficientPeakEstimatec                   ddl m}m} t          j        j        j        }t          t          j        j        	                                          }t          t          j        
                                          } |||          } ||||          \  | _        }ddlm}  ||t          j        t           d          | _        d S )Nr   )estimate_peak_memoryget_freeable_input_bufr3   )SegmentedTreer   )memoryr  r  r2   rI   	schedulernodesr   graph_inputskeysget_output_namesoverall_peak_memorysegmented_treer  operatorr   max)	r   r  r  scheduler_nodesr  graph_outputsnames_to_freeable_bufspeak_by_scheduler_noder  s	            rS   r   zEfficientPeakEstimate.__init__d  s    IIIIIIII'+1!!'"6";";"="=>>"17#;#;#=#=>>!7!7!V!V;O;O"<
 <
8 "8 	211111+m"HL#q
 
rU   rC   rD   rE   r   c                    t           j        j                            t           j                            |          d          t          |                                          z  S )Nr   fallback)r2   rI   rO   	size_hintrJ   r)   rN   r   rC   s     rS   	_get_sizezEfficientPeakEstimate._get_sizew  sS    w))G//55 * 
 
4>>++,,- 	-rU   line_ar   line_bAllocateLinec                X    | j                             |j        dz   |j        dz
            S r   )r  summarize_rangescheduler_node_indexr   r  r  s      rS   peak_betweenz"EfficientPeakEstimate.peak_between|  s2    "22'!+V-H1-L
 
 	
rU   c                    |j         dz   |j         k    rd S | j                            |j         dz   |j         dz
  |                     |j                             d S r   )r  r  update_ranger  rC   r  s      rS   update_peak_betweenz)EfficientPeakEstimate.update_peak_between  si    &*f.IIIF(('!+'!+NN6;''	
 	
 	
 	
 	
rU   N)rC   rD   rE   r   )r  r   r  r  )r   r   r   r   r  r  r  rx   rU   rS   r  r  c  s_        
 
 
&- - - -

 
 
 


 
 
 
 
 
rU   r  c                  @    e Zd ZU ded<   d Zdd
ZddZddZddZdS )r  rD   rC   c                    t           j        j        j        J t           j        j        j                            t           j        j        j                  | _        d S rH   r2   rI   r  current_noder  indexr  r   s    rS   r  zAllocateLine.__post_init__  G    w -999$%G$5$;$A$AG*%
 %
!!!rU   	free_liner   sizer   rE   r   c                    |j         dz   | j         k    rdS | j        j        j        }| j        j                            ||           }||z   }||k    S )Nr3   T)r  rd   estimate_peakr  r  )r   r  r  r  peak_memory_in_rangenew_peak_memorys         rS   should_reuse_bufferz AllocateLine.should_reuse_buffer  s^    )A-1JJJ4"l8L#|9FFyRVWW!55"555rU   r  r   r  c           	        | j                                         t          j        j        v rt          | j                  S t          | j                   }t          j	        r||v r|
                    |          }t          j        j                            t          j                            | j                   d          t          | j                                                   z  }|                     ||          rGd|_        | j        j                            ||            t)          | j        |j         | j                   S |                    ||           | S | j                                         j        dk    r\| j                            | j                   }|;|xj        t5          t7          j        t:          j        |d                    z  c_        | S )Nr   r  Tcpur3   )rC   rK   r2   rI   rq  NullLinerd   rT   r   allow_buffer_reuser  rO   r  rJ   r)   rN   r  r  r  r  	ReuseLiner  rM   r   static_shape_for_buffer_or_noner   r   	functoolsreducer  mul)r   r  r   r  r  static_shapes         rS   r  zAllocateLine.plan  s   917#:::DL))) ty))$ 			#I7#--33DI>> .  ty2244556D ''	488 &*	#*>>y$OOO y~tyIII

3	***9((**/588<GG	RRL'11S$X\<CC6 6 11 rU   r  r*   r  c                    | j                                         t          j        j        vsJ | j                            | j                   }|                    |           d S rH   )rC   rK   r2   rI   rq  rd   make_buffer_allocationr   r   r  r   s      rS   r  zAllocateLine.codegen  sU    y!!##17+BBBBB|2249==trU   r
  rA   r  c                    |j         S rH   )_generate_allocater  s     rS   r  zAllocateLine.codegen_fx  s    ++rU   N)r  r   r  r   rE   r   r  r"  r  )	r   r   r   r   r  r  r  r  r  rx   rU   rS   r  r    s         
 
 
6 6 6 6   8   
, , , , , ,rU   r  c                  F    e Zd ZU ded<   dZded<   d ZddZddZddZdS )r   rD   rC   Fr   r  c                    t           j        j        j        J t           j        j        j                            t           j        j        j                  | _        d S rH   r  r   s    rS   r  z!FreeIfNotReusedLine.__post_init__  r  rU   r  r   rE   r  c                   t          | j                                                  dk    r| S t          | j        j        t
          j                  r| S | j        rJ | j                                        t          j
        j        v rt          | j                  S t          j        r(|                    t#          | j                  |            | S r   )r   rC   get_inputs_that_alias_outputrn   layoutr   MultiOutputLayoutr  rK   r2   rI   rq  r  rd   r   r  r  rT   r  s     rS   r  zFreeIfNotReusedLine.plan  s    ty5577881<<Kdi&(<== 	K>!!!917#:::DL)))$ 	:JJ'	22D999rU   r  r*   r  c                    | j                                         t          j        j        vsJ | j        s4|                    | j                            | j                              d S d S rH   )	rC   rK   r2   rI   rq  r  r   rd   rr  r  s     rS   r  zFreeIfNotReusedLine.codegen  sg    y!!##17+BBBBB~ 	ENN4<88CCDDDDD	E 	ErU   r
  rA   r  c                    |j         S rH   )_generate_free_if_not_reusedr  s     rS   r  zFreeIfNotReusedLine.codegen_fx  s    55rU   Nr  r"  r  )	r   r   r   r   r  r  r  r  r  rx   rU   rS   r   r     s         I
 
 

 
 
 
E E E E
6 6 6 6 6 6rU   r   c                  F    e Zd ZU ded<   ded<   ded<   dd
ZddZddZdS )ReinterpretLinerD   rC   	reused_asz	ir.Layoutr  r  r   rE   r  c                    | S rH   rx   r  s     rS   r  zReinterpretLine.plan  s    rU   r  r*   r  c                   t          | j        t          j                  sJ t          | j        j        t          j                  sJ | j                            | j        	                                | j        j                   d S rH   )
rn   r  r   NonOwningLayoutviewr#   rd   codegen_deferred_allocationr  rK   r  s     rS   r  zReinterpretLine.codegen  sw    $+r'9:::::$+*B,>?????00N##%%t{'7	
 	
 	
 	
 	
rU   r
  rA   r  c                    |j         S rH   )_generate_reinterpretr  s     rS   r  zReinterpretLine.codegen_fx  r  rU   Nr  r"  r  )r   r   r   r   r  r  r  rx   rU   rS   r  r    sv            
 
 
 
/ / / / / /rU   r  c                  J    e Zd ZU ded<   ded<   dZded<   ddZddZddZdS )r  rD   rC   r  Tr   
delete_oldr  r   rE   r  c                2   | j                                         t          j        j        v r@| j                                        t          j        j        v sJ t          | j                  S | j                                        t          j        j        vsJ | S rH   )rC   rK   r2   rI   rq  r  r  rd   r  s     rS   r  zReuseLine.plan  s}    917#:::>**,,0GGGGGDL)))~&&((0GGGGGrU   r  r*   r  c                2   | j                                         t          j        j        vsJ | j                                        t          j        j        vsJ |                    | j                            | j         | j        | j	                             d S rH   )
rC   rK   r2   rI   rq  r  r   rd   make_buffer_reuser  r  s     rS   r  zReuseLine.codegen  s    y!!##17+BBBBB~&&((0GGGGGL**49dndoVV	
 	
 	
 	
 	
rU   r
  rA   r  c                    |j         S rH   )_generate_reuser  s     rS   r  zReuseLine.codegen_fx
  s    ((rU   Nr  r"  r  )r   r   r   r   r  r  r  r  rx   rU   rS   r  r    s{         J   
 
 
 
) ) ) ) ) )rU   r  c                      e Zd ZddZdS )r  r
  rA   rE   r  c                    |j         S rH   )_generate_nullr  s     rS   r  zNullLine.codegen_fx  ru  rU   Nr  r  rx   rU   rS   r  r    s(        ( ( ( ( ( (rU   r  c                  l    e Zd ZU ded<   ded<   edd            Zedd	            Zedd            ZdS )CommBufferLiner  rd   	ir.BufferrC   rE   r   c                    ddl m} | j                                        }| j                                        } ||          rt          d| j                   t          |          |j        z  S )Nr   )is_symbolicz-The size of a comm buffer can't be symbolic: )torch._inductor.utilsr  rC   	get_numelrN   AssertionErrorr   itemsize)r   r  numelr   s       rS   r  zCommBufferLine.size  s    555555	##%%	##%%;u 	 K	KK   5zzEN**rU   ir.CommBufferTypec                z    | j                                         }t          |t          j                  sJ |j        S rH   )rC   get_output_specrn   r   CommBufferLayoutcomm_buffer_typer   r  s     rS   r  zCommBufferLine.comm_buffer_type$  s7    **,,&""566666&&rU   r_   c                z    | j                                         }t          |t          j                  sJ |j        S rH   )rC   r  rn   r   r  
group_namer  s     rS   r  zCommBufferLine.group_name*  s7    **,,&""566666  rU   NrE   r   )rE   r
  r  )r   r   r   r   propertyr  r  r  rx   rU   rS   r  r    s         !!!!OOO	+ 	+ 	+ X	+ ' ' ' X'
 ! ! ! X! ! !rU   r  c                  4    e Zd ZddZed             Zdd
ZdS )CommBufferAllocateLiner  r*   rE   r  c                   | j                                         t          j        j        vsJ | j                                         }| j                                         }| j                                         }t          | j                                                   }t          | j         	                                          }|
                    |                     | j        | j        | j        |||||                     d S rH   )rC   rK   r2   rI   rq  r_  rN   r   get_size
get_strider   make_allocation_liner  r  rd   )r   r  r^   rh  r   shapestrides          rS   r  zCommBufferAllocateLine.codegen3  s    y!!##17+BBBBBy!!##%%''	##%%di((**++ty++--..%%%	 		
 	
 	
 	
 	
rU   c                    | t           j        j        k    rU| d|                    |           d|                    |           d| d|j         d| dt          j        dd           dS t          d	|            )
Nz = empty_strided_p2p(r  z, torch.device("cuda:z"), group_name="z", alloc_id=r   l    r   zUnsupported comm buffer type: )r   CommBufferTypeSYMM_MEMcodegen_shape_tupler  randomrandintr  )r  r  rd   r^   rh  r   r  r  s           rS   r  z+CommBufferAllocateLine.make_allocation_lineG  s     r0999 < <..u55< <..v66< < < < '-l	< <
  *< < #N1i88< < < &C1ACC  rU   r
  rA   r  c                    |j         S rH   )_generate_comm_buffer_allocater  s     rS   r  z!CommBufferAllocateLine.codegen_fxZ  s    77rU   Nr"  r  )r   r   r   r  r)  r  r  rx   rU   rS   r  r  1  sW        
 
 
 
(   \$8 8 8 8 8 8rU   r  c                      e Zd ZddZdd	Zd
S )CommBufferFreeLiner  r*   rE   r  c                    | j                             | j                  }|                    | d| j        j         d           d S )Nz # z buffer free)rd   rr  rC   r   r  r   r  s      rS   r  zCommBufferFreeLine.codegen`  sH    |,,TY77$LL4#8#>LLLMMMMMrU   r
  rA   r  c                    |j         S rH   )_generate_comm_buffer_freer  s     rS   r  zCommBufferFreeLine.codegen_fxd  s    33rU   Nr"  r  rK  rx   rU   rS   r%  r%  ^  s@        N N N N4 4 4 4 4 4rU   r%  c                  L    e Zd ZU dZded<   ded<   ded<   ded<   ddZddZdS )MultiOutputLinezU
    Given a MultiOutputLayout buffer, indexes actual buffer(s) from the result.
    r  rd   r_   result_namearg_nameSequence[Any]indicesr  r*   rE   r  c                      fd  j          j                  }|                     j        j          j         d|  j        j                    d S )Nc                   t          |          dk    r|d         \  }}t          |t                    r |  d| d|dd                    S t          |t                    rBj                            | j        t          |                    } ||dd                    S t          |t                    r |  d| d|dd                    S t          d|          | S )Nr   []r3   z['z']znon supported index type: )
r   
issubclassr   r   rd   codegen_tuple_accessr+  r_   dictr  )basenamer.  itypeituple_accesscodegen_list_tuple_accessr   s        rS   r:  z:MultiOutputLine.codegen.<locals>.codegen_list_tuple_accesst  s   7||a"1:qeT** N445G5G15G5G5GQRQSQSUUUu-- 	N#'<#D#D $"2CFF$ $L 54\7122;OOOt,, N445I5IA5I5I5I7STSUSU;WWW()EuMMMrU   r   )r,  r.  r   rd   declarer+  ending)r   r  r   r:  s   `  @rS   r  zMultiOutputLine.codegens  s    	  	  	  	  	  	 $ *)$-FF|#VT%5VV%VATVV	
 	
 	
 	
 	
rU   r
  rA   r  c                    |j         S rH   )_generate_multi_outputr  s     rS   r  zMultiOutputLine.codegen_fx  s    //rU   Nr"  r  )r   r   r   __doc__r   r  r  rx   rU   rS   r*  r*  h  sw           "!!!MMM
 
 
 
00 0 0 0 0 0rU   r*  c                  >    e Zd ZU ded<   ded<   ded<   ddZddZdS )SymbolicCallArgLiner  rd   r   argr@   rI   r  r*   rE   r  c                P    | j                             | j        | j                   d S rH   )rd   "_generate_symbolic_call_arg_helperrB  rI   r  s     rS   r  zSymbolicCallArgLine.codegen  s$    77$*MMMMMrU   r
  rA   r  c                    |j         S rH   )_generate_symbolic_call_argr  s     rS   r  zSymbolicCallArgLine.codegen_fx  rl  rU   Nr"  r  rE  rx   rU   rS   rA  rA    sf         !!!!N N N N5 5 5 5 5 5rU   rA  c            	          e Zd ZdZdZ fdZe	 ddd            ZddZddZ	ddZ
ddZddZedd            ZddZedd            ZddZedd             Zdd!Zdd#Zd d%Zdd&Zdd'Zdd(Zdd*Zdd,Zdd-Zdd.Zdd/Zdd2Zd3 Zd4 Z d5 Z!d6 Z"d7 Z#dd8Z$dd9Z%dd:Z&dd<Z'dd?Z(dd@Z)ddAZ*ddDZ+d	dFZ,dG Z-d
dIZ.	 dddQZ/ddSZ0ddTZ1ddUZ2dV Z3dW Z4dX Z5ddcZ6dd Z7ddeZ8e9j:        ddi            Z;ddjZ<dk Z=dl Z>dm Z?dn Z@ddpZAdduZBdv ZCddyZDdz ZEdd{ddZFdd{ddZGddZHddZIddZJddZKddZL	 dddZMddZNddZOd ZPd ZQd ZRd ZS	 	 	 dddZTe	 ddd            ZU	 	 	 dddZVddZWddZXdddZYd dZZd!dZ[d!dZ\d Z]d Z^d Z_d Z`d Zad Zbd Zcd ZdddZed Zfdddddddddd"dZgdddddddddd#dZhd Zid Zjd ZkddZld$dÄZmedĄ             Zn	 d%dńZodƄ Zpd&dǄZqd'dɄZrd(d˄Zsd)dτZt	 dd*dЄZud+dӄZvd,dքZwd-d؄Zxdل ZyddڄZzdۄ Z{d.dބZ|d߄ Z}d/dZ~d Zd0dZd1dZd Zd Zd Zd Zd Zd Zd Zd Zed             Zed             Zed             Zed             Zed             Z xZS (2  r  zB
    Generate outer wrapper in Python that calls the kernels.
    Tc                    t                                                       t                       _        i  _        t                       _        t                       _        t                       _        t                       _	        t                       _
        t                       _        t                       _        t                       _        t                       _        t                       _        i  _        d _        i  _        t                       _        g  _        d _        d _        d _        d _        d _        t6          j        j        rdnd _        t6          j        j        rdnd _        d  _         d _!        i  _"        t                       _#        t                       _$        d  _%         &                                 g  _'        g  _(         )                                 tU                     s +                                  ,                                 t6          j        j-        s?t6          j        j.        /                                D ]\  }} 0                    ||           t          tb                                _2        t          tb                                _3        i  _4         tk          j6        d            j7                   _7        tj          j8        d fd            }| _9        i  _:        t                       _;        ty                       _=        t                       _>        i  _?        t          t          jB        jC        t          jB        jD                   _E        g  _F        d S )Nr    #r  z
std::move(r   Tr   r_   rE   c                    j                             |            t          j        j        rj                            |            d S d S rH   )importsr   r   r   r   r   )r   r   s    rS   add_import_oncez6PythonWrapperCodegen.__init__.<locals>.add_import_once  sM    L""4(((}5 ;*44T:::::; ;rU   )debug_printer_leveluse_array_ref)r   r_   rE   r  )Gr   r   r   _names_iterargs_to_buffersr*   rL  headerprefixsuffixkernel_declarationswrapper_callkernel_autotune_defsr   subgraph_definitionsr   r   kernel_autotune_example_argskernel_autotune_tmp_arg_idxsrc_to_kernelkernel_numel_exprlinesr;  declare_maybe_referencer<  commentnone_strr2   rI   r<  
move_beginmove_endr7  supports_intermediate_hooksuser_defined_kernel_cacheunbacked_symbol_declsr  launcher_fn_nameset_launcher_fn_namecodegened_graph_stackcomputed_sizes_stackwrite_headerr+   write_prefix!write_kernel_autotune_defs_headerr=  constant_reprsitemswrite_constant
BufferName	allocatedfreedreusesr  	lru_cachewrite_get_raw_streamcacherM  _metas
_meta_varsr   multi_kernel_statealready_codegened_subgraphsallocated_workspacesr   r   aot_inductor debug_intermediate_value_printerallow_stack_allocationdebug_printeradditional_files)r   r^   hashedrM  r   s   `   rS   r   zPythonWrapperCodegen.__init__  s#   */''  	 &''$&&$&&$&&#1#3#3 *,,$2$4$4!%3%5%5"$2$4$4!6@ll" IK)01( .0HR!#
')$*+'*=E,,2 w2:;?)+/(QS&LL 	" 9C $!!### &("$&!2488 	 ..000w 	2 ! 6 < < > > 2 2f##D&1111#J/11
+--
 57$=I$7$=$=%%
 %
! 
	; 	; 	; 	; 	; 
	;
  /&(+5<<"2"4"4<FLL(46! 1 & 3 T -D
 
 
 !#rU   Nis_subgraphr   subgraph_namerg   parent_wrapperre   partition_signatures$Optional[ir.GraphPartitionSignature]c                T    | r|J |J t          |||          S t                      S rH   )SubgraphPythonWrapperCodegenr  )r  r  r  r  s       rS   createzPythonWrapperCodegen.create  sL      	 ,,,!---/~/C   $%%%rU   rE   r  c                    d| _         d S )Ncall)rf  r   s    rS   rg  z)PythonWrapperCodegen.set_launcher_fn_name  s     &rU   r^   r_   r  c                D    | j                             | d|            d S )Nz = None  # )rR  r   )r   r^   r  s      rS   ro  z#PythonWrapperCodegen.write_constant  s,    ::&::;;;;;rU   c           	     \   t           j        j                                        }d}||j        
d|j         }d}t          t          j        j                  dk    rd}nt           j	        j        j
        j        rd}| j                            d| dt          j         d| d	d
           | j                            dd
           	 ddlm} | j                            dd
           n# t&          t(          f$ r Y nw xY wt          j        r| j                            d           d S d S )NrI  z
# AOT ID: r   zRfrom torch._inductor.codegen.debug_utils import _print_debugging_tensor_value_infozFfrom torch._inductor.runtime.debug_utils import tracked_empty_strided
z
                aH  
                from ctypes import c_void_p, c_long, c_int
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from cmath import nanj
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align
                from torch import device, empty_strided
                from zq import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels
                z
            Tr   a  
                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                _quantized = torch.ops._quantized
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                assert_alignment = torch._C._dynamo.guards.assert_alignment
                empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
                empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
                empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
                reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                async_compile = AsyncCompile()
            )_SymmetricMemoryzs
                empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
                zfrom torch.cuda import nvtx)torch_guardsTracingContexttry_getaot_graph_namer   r   r|  r}  	_inductortest_configstrack_memory_lifecyclerL  r   r   r   rR  torch._C._distributed_c10dr  AttributeErrorImportErrorannotate_trainingr   )r   contextaot_config_commentinductor_debug_utilsr  s        rS   rj  z!PythonWrapperCodegen.write_header  s   -.66887#9#E!Fg.D!F!F!v"CDDqHH#w  _#0G 	m#l #  $,   &!  $ ' 	 	
 	
 	
* 	 ! 	 	
 	
 	
$	 DCCCCCK 	      , 	 	 	D	# 	AK!!"?@@@@@	A 	As   
"C- -D DrR  c                    d S rH   rx   )r   rR  s     rS   include_extra_headerz)PythonWrapperCodegen.include_extra_header]      rU   c                V    | j                             dt          j         d           d S )Na	  
                import torch
                from torch._dynamo.testing import rand_strided
                from torch._dynamo.utils import preserve_rng_state
                from torch._inductor.select_algorithm import AlgorithmSelectorCache
                from aH   import AsyncCompile

                async_compile = AsyncCompile()
                generate_example_value = AlgorithmSelectorCache.generate_example_value
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
            )rW  r   r   r   r   s    rS   rl  z6PythonWrapperCodegen.write_kernel_autotune_defs_header`  sC    !((
 $,  	
 	
 	
 	
 	
rU   c                   dt           j         d}t          j        j        rV| j                            |           | j                            t          j	        j
                            d                     t          j	        j        sZ| j                            |d           | j                            t          j	        j
                            d                     d S d S )NzU
            import triton
            import triton.language as tl
            from z+ import start_graph, end_graph
            get_raw_streamTr   )r$   r   r   r   r   r   r   r   r2   rI   r>  import_get_raw_stream_asr<  rL  r   
import_strs     rS   write_triton_header_oncez-PythonWrapperCodegen.write_triton_header_oncep  s     $,  

 =1 	&--j999&00";;<LMM   w" 	L
$777L""";;<LMM    	 	rU   c                j   t           j        j                            d          }t          j        j        r4| j                            |          s| j        	                    |           t           j        j
        s6| j                            |          s| j        	                    |           d S d S d S )Nr  )r2   rI   r>  r  r   r   r   r   containsr   r<  rL  )r   import_get_raw_stream_strs     rS   write_get_raw_stream_headerz0PythonWrapperCodegen.write_get_raw_stream_header  s    $%G$6$O$O%
 %
! =1 	P-667PQQ P*445NOOOw" 	B<(()BCC B&&'@AAAAA	B 	BB BrU   c                .    |                                   d S rH   )r  r   s    rS    write_get_raw_stream_header_oncez5PythonWrapperCodegen.write_get_raw_stream_header_once  s    ((*****rU   metaTritonMetaParamsc                `   t          |          }|| j        vrdt          | j                   }|| j        |<   | j                            | d|            t
          j        j        r9| j                            | d|            | j	        
                    |           | j        |         S )Nr  r   )reprrw  r   rR  r   r   r   r   r   rx  r   )r   r  vars      rS   add_meta_oncez"PythonWrapperCodegen.add_meta_once  s    Dzzt{""+T[))++C #DKK!!S"3"3T"3"3444}5 )*445F5F5F5FGGG##C((({4  rU   r{  c                D      fd                                  D             S )Nc                D    g | ]}|                     j                  S rx   )rd  rV  )ry   r   r   s     rS   
<listcomp>z8PythonWrapperCodegen.get_output_refs.<locals>.<listcomp>  s7     
 
 
78A 122
 
 
rU   )get_graph_outputsr   s   `rS   get_output_refsz$PythonWrapperCodegen.get_output_refs  s:    
 
 
 
<@<R<R<T<T
 
 
 	
rU   c                    d S rH   rx   r   s    rS   mark_output_typez%PythonWrapperCodegen.mark_output_type      rU   >dict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr]]c                $    t           j        j        S rH   )r2   rI   r  r   s    rS   get_graph_inputsz%PythonWrapperCodegen.get_graph_inputs  s     w##rU   list[IRNode]c                $    t           j        j        S rH   )r2   rI   r  r   s    rS   r  z&PythonWrapperCodegen.get_graph_outputs  s    w$$rU   c           
     8   |                                                                  D ]\  }}t          |t          j        t
          j        f          r,|t          j        j	        vst          |t
          j
                  rZt          |                                          dk    r|                     |                                          }|                     |                                          }| j                            d| d| d| d           d S )Nr   zassert_size_stride(r  r   )r  rn  rn   ro   r   r   TorchBindObjectr2   rI   graph_input_namesGeneratorStater.   r  r   r  rS  r   )r   r^   bufr  r  s        rS   codegen_input_size_assertsz/PythonWrapperCodegen.codegen_input_size_asserts  s   ..006688 	S 	SID##
B,>?@@  17444
R&9 94  S\\^^,,11223<<>>BBD44S^^5E5EFFFK!!"Q"Q"Q"Q"Q"Q"Q"QRRRR	S 	SrU   c                `   | j                             d           |                                                                 D ]l\  }}t	          |t
          j        t          j        f          r,d| d}| j                             |           d| d}| j                             |           md S )Nz(# make sure graph inputs are not nan/infzassert not z.isnan().any().item()z.isinf().any().item())	rS  r   r  rn  rn   ro   r   r   r  )r   r^   r  r   s       rS   codegen_input_nan_assertsz.PythonWrapperCodegen.codegen_input_nan_asserts  s    HIII..006688 	( 	(ID##
B,>?@@ <<<<DK!!$'''<<<<DK!!$''''	( 	(rU   c                :    | j                             d           d S )NzV

            async_compile.wait(globals())
            del async_compile
            )rS  r   r   s    rS   write_async_compile_waitz-PythonWrapperCodegen.write_async_compile_wait  s+    	
 	
 	
 	
 	
rU   input_namesc                    d                     |          }t          |          dk    r|dz  }| j                            | d           | j                            d           d S )Nr  r3   ,z = argszargs.clear())r   r   rS  r   )r   r  lhss      rS   
write_argszPythonWrapperCodegen.write_args  sg    ii$${q  3JCooo...n-----rU   r   c                    t           j        r| j                            d           d}n%| j                            d| j         d           d}|S )Na  
                class Runner:
                    def __init__(self, partitions):
                        self.partitions = partitions

                    def recursively_apply_fns(self, fns):
                        new_callables = []
                        for fn, c in zip(fns, self.partitions):
                            new_callables.append(fn(c))
                        self.partitions = new_callables

                    def call(self, args):
                r   z
                def z(args):
                r3   )r   graph_partitionrS  r   rf  r   prefix_indents     rS   !write_launcher_fn_call_get_indentz6PythonWrapperCodegen.write_launcher_fn_call_get_indent  sv    ! 	K   MMK*    
 MrU   c                $    t           j        j        S rH   )r2   rI   r  r   s    rS   get_graph_input_namesz*PythonWrapperCodegen.get_graph_input_names  s    w((rU   c                   | j         J |                                  |                                 }| j                            |          5  t
          j        j        r;| j                            t          j
        j                                                   t          j
                                        }t
          j        r| j                            d| d           |                                 x}r|                     |           |                                  t%                      rt'          |           r|                                  d d d            d S # 1 swxY w Y   d S )Nz0training_annotation = nvtx._device_range_start(''))rf  r  r  rS  r   r   r   debug_sync_graphr   r2   rI   r>  synchronizeget_training_phaser  r  r  codegen_inputsr,   r+   "codegen_input_size_and_nan_asserts)r   r  phaser  s       rS   rk  z!PythonWrapperCodegen.write_prefix  s   $000%%'''>>@@[.. 	: 	:}- H%%ag&8&D&D&F&FGGGG..00E' %%PuPPP   %)$>$>$@$@@  3 1222!!!
 -..:<TBB: 77999)	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	:s   DEE"%E"c                    t           j        r|                                  t           j        r|                                  d S d S rH   )r   size_assertsr  nan_assertsr  r   s    rS   r  z7PythonWrapperCodegen.codegen_input_size_and_nan_asserts  sL     	.++--- 	-**,,,,,	- 	-rU   r5  r~  c                    |                                   d| }t          j        j        r3| j                            | d| d           t          j        j        r|S |                     | d| d           |S )Nstream = get_raw_stream(r   )	r  r   r   r   r   r   r2   rI   r<  )r   r5  r~  r^   s       rS   ru  z)PythonWrapperCodegen.write_get_raw_stream  s    ((***$
$$=1 	&0088:888   w" $??*???@@@rU   c                    | j         d         S )N)rh  r   s    rS   get_codegened_graphz(PythonWrapperCodegen.get_codegened_graph*  s    )"--rU   c                :    | j                             |           d S rH   )rh  r   )r   rI   s     rS   r  z)PythonWrapperCodegen.push_codegened_graph-  s    "))%00000rU   c                4    | j                                         S rH   )rh  r  r   s    rS   r/  z(PythonWrapperCodegen.pop_codegened_graph0  s    )--///rU   c                T    ddl m} | j                             ||                    S )Nr   )deepcopy)copyr  ri  r   )r   r  r  s      rS   r  z(PythonWrapperCodegen.push_computed_sizes3  s5    !!!!!!(//0H0HIIIrU   c                4    | j                                         S rH   )ri  r  r   s    rS   r-  z'PythonWrapperCodegen.pop_computed_sizes8  s    (,,...rU   c                ,    t          | j                   S rH   )nextrP  r   s    rS   next_kernel_suffixz'PythonWrapperCodegen.next_kernel_suffix;  s    t'((**rU   c                   |                      t          || j                             t          j        j        r|                                  | j                             dt          j	        j
                            |           d           | j                                         t          |           r|                                  | j                             d| d| d           || _        d S )Nr:  r;  r  r  r   )r   r4  r7  r   r   r   r  r   r2   rI   r>  rA  r  r+   r  )r   r5  s     rS   codegen_device_guard_enterz/PythonWrapperCodegen.codegen_device_guard_enter>  s    )*d6WXX	
 	
 	
 =1 	))+++&00F*77
CCFFF   &002222488 300222&00DDDzDDD   -7)))rU   c                    |                      t                                 t          j        j        r| j                                         d S d S rH   )r   rG  r   r   r   r   r0  r   s    rS   codegen_device_guard_exitz.PythonWrapperCodegen.codegen_device_guard_exitQ  sM    355666=1 	5&2244444	5 	5rU   output_refsc                   |r(t           j        r| j                            dd                    |          z   dz              | j                            d           | j                                         | j                            d           | j                                         | j                            d           | j                            d           | j                            d           | j                            d	d                    |          z   dz              d S | j                            d
           d S )Nzreturn_vars = (r  , )zfor var in return_vars:z!if isinstance(var, torch.Tensor):z#assert not var.isnan().any().item()z#assert not var.isinf().any().item()r   zreturn (z	return ())r   r  rV  r   r   r  r0  )r   r  s     rS   generate_returnz$PythonWrapperCodegen.generate_returnV  s<    	5! 
1!++%		+(>(>>F   !++,EFFF!++---!++,OPPP!++---!++,QRRR!++,QRRR!--a000''
TYY{5K5K(Ke(STTTTT''44444rU   resultr*   c                    d S rH   rx   r   r  s     rS   generate_before_suffixz+PythonWrapperCodegen.generate_before_suffixh  r  rU   c                    t           j        rRd                    | j                  t	          | j                  dk    rdndz   }|                    d| d           d S d S )Nr  r3   r  rI  z-
                runner = Runner(partitions=[z{])
                call = runner.call
                recursively_apply_fns = runner.recursively_apply_fns
                )r   r  r   all_partition_namesr   r   )r   r  all_partition_name_lists      rS   generate_after_suffixz*PythonWrapperCodegen.generate_after_suffixk  s    ! 	&*ii0H&I&I434499r'# MM-D      	 	rU   c                    d S rH   rx   r  s     rS   generate_endz!PythonWrapperCodegen.generate_endy  r  rU   rC   ir.FallbackKernelc                L    |                      t          | |                     d S rH   )r   rM  r  s     rS   generate_fallback_kernelz-PythonWrapperCodegen.generate_fallback_kernel|  s%    ,T48899999rU   rN  c                    |                     |            |                     t          | |                     t          |j        t
          j                  r|                    |            d S d S rH   )codegen_commentr   rM  rn   r  r   Layoutcodegen_size_assertsr  s     rS   generate_extern_kernel_allocz1PythonWrapperCodegen.generate_extern_kernel_alloc  sm    T""",T488999dk29-- 	,%%d+++++	, 	,rU   c           
        t          |j        t          j                  }|                                }|                                }|                                }| j        }t          j	        r	d|v rd| }|r9| 
                    | j         | dd                    |           d|            d S | 
                    | j         | d| dd                    |           d|            | j        rLt          j        rB|Bt          d         dxx         d	z  cc<   | 
                    d
|j        d| d           d S d S d S d S )Nview_as_complex.clone()r  r  r   r   inductorintermediate_hooksr3   zrun_intermediate_hooks()rn   r  r   
NoneLayoutrK   get_origin_noder]  r<  r   memory_planningr   r;  r   rc  generate_intermediate_hooksr   r^   )r   extern_kernelrS  	no_returnoutput_nameorigin_noderf  r<  s           rS   rR  z9PythonWrapperCodegen._generate_extern_kernel_alloc_helper  s    }3R]CC	#,,..#3355#3355! 	)&7;&F&F )((F 	NNdlTKTT$))D//TTFTTUUUUUNN<YYYYYtyyYYQWYY   06  +$%9:::a?:::Rk.>RRKRRR        ,+rU   rX  c                v    |                     |            |                     t          | |                     d S rH   )r  r   rW  r  s     rS   generate_extern_kernel_outz/PythonWrapperCodegen.generate_extern_kernel_out  s;     	T"""*46677777rU   r   outout_viewrS  rh  debug_handler6  c                Z   t           j        j        j        }|                    ||d d d           |                    d|r|n|            |                     ||           |5  |                     | dd                    |           d           d d d            d S # 1 swxY w Y   d S )Nexternzout=r  r  r   )	r2   rI   wrapper_coder  set_printer_argsr   write_provenance_debug_handler   r   )r   r   r  r  rS  rh  r  debug_printer_managers           rS   rc  z7PythonWrapperCodegen._generate_extern_kernel_out_helper  s     !" 4 B..tVT4RRR:x888S::;;;**6<@@@" 	; 	;NNf99tyy999:::	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	;s   $/B  B$'B$Fc                    |j         }|j        }|r2t          d |D                       }t          d |D                       }|j                                         d}d                     fd|D                       }d                     fd|D                       }t                               |j                  }d}| d|j	         d	}| d| d| d| }	| d
|	 d}
|
S )Nc              3  ^   K   | ](}t           j        j                            |          V  )d S rH   r2   rI   rO   atomically_apply_size_hintry   rg  s     rS   r{   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>  s5      VVA)DDQGGVVVVVVrU   c              3  ^   K   | ](}t           j        j                            |          V  )d S rH   r%  r'  s     rS   r{   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>  sF        CD ;;A>>     rU   z.data_ptr()r  c              3  N   K   | ]}t                               |          V   d S rH   r  val_to_arg_strry   dimr   s     rS   r{   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>  s4      XXC-<<T3GGXXXXXXrU   c              3  N   K   | ]}t                               |          V   d S rH   r*  r,  s     rS   r{   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>  sE       
 
?B //c::
 
 
 
 
 
rU   z$triton.tools.experimental_descriptorz.create_d_tma_descriptorr  r   )
dims
block_dimsr   tensorrd  r   r  r+  element_sizerank)r   descapply_size_hintsr0  r1  ptrr3  rS  r   rS  r  s   `          rS   *_generate_tma_descriptor_call_experimentalz?PythonWrapperCodegen._generate_tma_descriptor_call_experimental  sO   y_
 	VVQUVVVVVD  HR    J ..00===yyXXXXSWXXXXXYY 
 
 
 
FP
 
 
 
 

 ,::4ARSS7;;	;;;======|==trU   c                    |j         }|rt          d |D                       }d}| d}|j                                         d| }| d| d}|S )Nc              3  ^   K   | ](}t           j        j                            |          V  )d S rH   r%  r'  s     rS   r{   zLPythonWrapperCodegen._generate_tma_descriptor_call_stable.<locals>.<genexpr>  sF          CD ;;A>>           rU   z/triton.tools.tensor_descriptor.TensorDescriptorz.from_tensorr  r  r   )block_shaper   r2  rd  )r   r5  r6  r;  rS  r   rS  r  s           rS   $_generate_tma_descriptor_call_stablez9PythonWrapperCodegen._generate_tma_descriptor_call_stable  s    & 	    HS       K C$$$+//11BB[BBtrU   c                    t          |t          j                  r|                     ||          S t          |t          j                  sJ |                     ||          S rH   )rn   r   TMADescriptorExperimentalr8  TMADescriptorStabler<  )r   r5  r6  s      rS   _generate_tma_descriptor_callz2PythonWrapperCodegen._generate_tma_descriptor_call  sh    dB899 	UBB&   dB$:;;;;;<<TCSTTTrU   c                    |                      |          }|j         d| | j         }|                     |           d S Nr   )r@  r^   r<  r   )r   r5  r  r   s       rS   generate_tma_descriptorz,PythonWrapperCodegen.generate_tma_descriptor  sI    11$77)333dk33trU   c                &   | dd                     t          t          |                     }|                    d          r|d                     dg|z             z  }n|r|dt	          |           z  }|dz  }|                     |           d S )Nr  r  zaten.scatter_reducer  rI  z	, reduce=r   )r   mapr_   r   r  r   )	r   r   inputsr^  python_kernel_namesrc_is_tensorr  r   r   s	            rS   generate_scatter_fallbackz.PythonWrapperCodegen.generate_scatter_fallback  s     %CCsxxC0@0@'A'ACC(()>?? 	3DIIrdVm,,,DD 32DLL222trU   c                    dd                     |           d}||||g}|                     |                     ||                     d S )Nr1  r  r2  )r   r   wrap_kernel_call)r   r   r   r.  values
accumulateindices_strrS  s           rS   generate_index_put_fallbackz0PythonWrapperCodegen.generate_index_put_fallback  sT    /$))G,,///;
3t,,VT::;;;;;rU   buf_namerG  get_argsCallable[[], Sequence[str]]op_overload<Union[torch._ops.OpOverload, torch._ops.HigherOrderOperator]rz  r-  outputsSequence[ir.Buffer]c           
     x    |                      | d| dd                     |                       d           d S )Nr   r  r  r   )r   r   )r   rP  rG  rQ  rS  rz  rU  s          rS   ,generate_fallback_kernel_with_runtime_lookupzAPythonWrapperCodegen.generate_fallback_kernel_with_runtime_lookup  sH     	(TT'9TTDIIhhjj<Q<QTTTUUUUUrU   c                ~    t          d          5  |                     |          cd d d            S # 1 swxY w Y   d S )NzPythonWrapperCodegen.generate)r   	_generater   is_inferences     rS   generatezPythonWrapperCodegen.generate  s    9:: 	0 	0>>,//	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0s   266c                "    t           j        rdS dS )Nr   r3   )r   r  r   s    rS   get_wrapper_call_indentz,PythonWrapperCodegen.get_wrapper_call_indent  s    ! 	11rU   newCallable[..., None]Iterator[Callable[..., None]]c              #  V   K   | j         }	 || _         |V  || _         d S # || _         w xY wrH   r   )r   r`  olds      rS   set_writelinez"PythonWrapperCodegen.set_writeline  s@      n	! DNIII DNNNSDN    s    	(c                    | j         j        }t          j        j        r| j                            |           d S | j                            |           d S rH   )ry  kernel_defsr   r   r   rW  r   rR  )r   rh  s     rS   _write_multi_kernel_defsz-PythonWrapperCodegen._write_multi_kernel_defs!  sS    -9=1 	,%,,[99999K{+++++rU   c                `	   t           j        r|                                  t          j                    5 }|                    | j                                                   t           j        r| 	                    |           t           j        r| 
                                 |                     |           t           j        j        r%t           j        j        s|                                  |                     | j        j                  5  | j        D ]L}t'          |t(                    r|                    | j                   2| j                            |           M	 d d d            n# 1 swxY w Y   |                                  |                                 }|                                  t           j        j        r;| j                            t4          j        j                                                   t           j        r|                                  t           j        j        r%t           j        j        s|                                  t           j        j        r|                                   t           j!        r&t           j"        s| j                            d           | #                    |           d d d            n# 1 swxY w Y   tI                      }|%                    | j&                   |                    d           |%                    | j'                   t4          j        j(        r0t4          j        j"        rt4          j        j)        rtI                      }|%                    | j*                   | +                                 |%                    | j,                   | -                                }|                    |          5  |%                    | j                   d d d            n# 1 swxY w Y   | .                    |           |%                    | j/                   | 0                    |           | 1                    |           | 2                    |           |3                                | j4        3                                fS )Nz+nvtx._device_range_end(training_annotation)rI  )5r   profile_bandwidthr  r   	ExitStackenter_contextrV  r   profiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphrun_wrapper_ir_passesr   store_cubinr   !generate_reset_kernel_saved_flagsrf  r   r]  rn   rB   r  ri  r  r  r  r2   rI   r>  r  generate_end_graph generate_save_uncompiled_kernelsgenerate_and_run_autotune_blockr  r<  r  r*   r   rL  rR  r=  is_const_graphrX  finalize_prefixrS  r_  r  rT  r   r  add_benchmark_harnessgetvaluewithlinemaprU  )r   r\  stackr   r  r  wrapper_call_indents          rS   rZ  zPythonWrapperCodegen._generate(  s   # 	,))+++!## *	.u 1 8 8 : :;;;0 @88???' ,))+++&&|444}( 91W 966888 ##D$5$?@@ : : J : :D!$44 :T%67777)33D9999	:: : : : : : : : : : : : : : : ))+++..00K!!###}- N!++AG,>,J,J,L,LMMM' *'')))}( 81W 855777}5 744666 ' 0B !++A     ---U*	. *	. *	. *	. *	. *	. *	. *	. *	. *	. *	. *	. *	. *	. *	.Z  !!dl###dk""" 7 	& 3 	&8N 	&#%%F 	d/000dk""""::<<]].// 	- 	-MM$+,,,	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	##F+++dk"""""6***&!!!""6*** &&(($88::
 	
sJ   CKAE."K.E2	2K5E2	6EKKK#P

PPc                     j                             d           i }t          j        j        r;t
          j        j        r* fdt          t
          j        j                  D             } j         	                                dz    j
        	                                z   }t          j        t          j        k    rt          j        t#                      dd          5 }|                    |                    d                     |j        }ddd           n# 1 swxY w Y   t          j        d	|           	 t-          ||           dS # t.          $ r}t1          d
|           |d}~ww xY w)z
        Compose self.kernel_autotune_defs and self.kernel_autotune_calls into a single block of
        code and execute it to trigger Triton kernel compilation and auto-tuning
        zQ
            async_compile.wait(globals())
            del async_compile
        c                B    i | ]\  }}                     |          |S rx   )get_autotuning_input_name)ry   idxvr   s      rS   
<dictcomp>zHPythonWrapperCodegen.generate_and_run_autotune_block.<locals>.<dictcomp>  s=       C ..s33Q  rU   r9  z.pyF)dirrT  deletezutf-8NzAuto-tuning code written to %sz%Failed to run autotuning code block: )rW  r   r   r   r   r2   rI   autotuning_inputs	enumerater   r   r    levelloggingDEBUGtempfileNamedTemporaryFiler   writeencoder^   debugexec	ExceptionRuntimeError)r   scopetuning_codef	file_pathes   `     rS   rv  z4PythonWrapperCodegen.generate_and_run_autotune_blocky  s   
 	!((	
 	
 	
 =1 	ag6O 	   '(ABB  E
 %..00(11334 	
  GM11 ,KKe   #**733444F		# # # # # # # # # # # # # # #
 !0  
	Se$$$$$ 	S 	S 	SJqJJKKQRR	Ss*   0DD D:E 
E.E))E.c                b    ddl m}  ||                               | j                  | _        d S )Nr3   )MemoryPlanner)r  r  r  r]  )r   r  s     rS   memory_planz PythonWrapperCodegen.memory_plan  s9    222222"]4((--dj99


rU   c                $   |                                  }t          j                            |          }| j        rt          | j        d         t                    rr| j        d         j        j        |vrY| j        	                                 | j        r9t          | j        d         t                    r| j        d         j        j        |vYt                      g}g }t          t          | j                            D ]}| j        |         }t          |t                    r$|                    |d                   | j        |<   Ht          |t                    r"|                    t                                 t          |t                     r'|                    |	                                           |                    |	                                           t          |          dk    sJ t#          d |D                       }d S )Nr  r   c              3  $   K   | ]}|j         V  d S rH   )r   ry   ss     rS   r{   z9PythonWrapperCodegen.memory_plan_reuse.<locals>.<genexpr>  s6       +
 +
./A)+
 +
 +
 +
 +
 +
rU   )r  r2   rI   _get_output_namesr]  rn   r  rC   r^   r  r   ranger   r  r  r   r+  sum)r   rU  	out_namesplanning_statespast_planning_statesr8  r   _total_allocated_buffer_sizes           rS   memory_plan_reusez&PythonWrapperCodegen.memory_plan_reuse  s    ((**G--g66	 J	4:b>+=>>	 
2#(	99 JNN J	4:b>+=>>	 
2#(	99 /001!s4:'' 	C 	CA:a=D$ 233 C $		/"*= > >
1D"344 C&&':'<'<====D"233 C$++O,?,?,A,ABBB##O$7$7$9$9:::?##q((((
 (+ +
 +
3G+
 +
 +
 (
 (
$$$rU   r\  c                    |r"t           j        r|                                  d S t           j        rt	                      | _        |                                  d S rH   )r   r  r  r  r  r  r  r[  s     rS   rq  z*PythonWrapperCodegen.run_wrapper_ir_passes  s]     	%F2 	%( =%:%<%<"""$$$$$rU   r   ir.TensorBox
bound_varsOrderedSet[sympy.Symbol]c           	     `  	 | j         	t          j        	fd            }t          j        	fd            }t          |t          j                  rQt          |t          j                  r||v rd S 	                    | d|            |                    |           d S t          |t          j
                  rt          |                                          D ]_\  }}t          |t          j                  r@||vr<	                    | d ||           d| d           |                    |           `t          |                                          D ]_\  }}t          |t          j                  r@||vr<	                    | d ||           d| d           |                    |           `d S t          |t          j                  rd S t          |t          j                  rd S t           j        j        j        rd S t)          dt+          |                     )Nc                D                         |  d|  d           |  dS )Nz_size = z.size()_sizerd  r^   r  s    rS   sizeofzDPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.sizeof  s2    NNd99D999:::>>>!rU   c                D                         |  d|  d           |  dS )Nz
_stride = z	.stride()_striderd  r  s    rS   strideofzFPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.strideof  s5    NNd==d===>>>####rU   r   r1  r2  zUnknown value type: )rS  r  rv  rn   ro   r   Symbolr   r   r   	TensorBoxr  r  r  r  r  r  r  r   r  r  r   )
r   r^   r   r  r  r  r-  r  r  r  s
            @rS   codegen_input_symbol_assignmentz4PythonWrapperCodegen.codegen_input_symbol_assignment  sh    {		" 	" 	" 	" 
	" 
	$ 	$ 	$ 	$ 
	$ eUZ(( 	KeU\22 ez6I6INNe....///NN5!!!!!r|,, 	K&u~~'7'788 ) )	TdEL11 )d*6L6LNNd#D#Dvvd||#D#Dc#D#D#DEEENN4(((()9)9););<< + +Vfel33 +j8P8PNNf#H#H$#H#H##H#H#HIIINN6***+ + r122 	KFr011 	KF%5 K$%IDKK%I%IJJJrU   c                   t          t          j                             }|                                 }d |                                D             d |                                D             z   }|D ]\  }}|                     |||           d	d}|D ],\  }}t          |t          j                  s  |||           -dS )
z$Assign all symbolic shapes to localsc                P    g | ]#\  }}t          |t          j                  ||f$S rx   rn   ro   r  ry   kr  s      rS   r  z7PythonWrapperCodegen.codegen_inputs.<locals>.<listcomp>  sA     
 
 
q!z!U\7R7R
F
 
 
rU   c                P    g | ]#\  }}t          |t          j                  ||f$S rx   r  r  s      rS   r  z7PythonWrapperCodegen.codegen_inputs.<locals>.<listcomp>  s1    XXX1Jq%,<W<WXaVXXXrU   r   r  r  r  c                Z   t          j        |                                 |                                 g          D ]n}t	          |t
                    rt	          |t          j                  r2fd|j        D             }t          |          dk    rt          d| d| d          od S )Nc                    g | ]}|v|	S rx   rx   )ry   symr  s     rS   r  z`PythonWrapperCodegen.codegen_inputs.<locals>._verify_input_symbol_assignment.<locals>.<listcomp>  s*     % % %:8M8MC8M8M8MrU   r   zFor z, expected z to have been codegen-ed.)r   from_iterabler  r  rn   r   ro   r  free_symbolsr   r  )r   r  exprundefined_symbolss    `  rS   _verify_input_symbol_assignmentzLPythonWrapperCodegen.codegen_inputs.<locals>._verify_input_symbol_assignment  s     +U^^-=-=u?O?O?Q?Q,RSS 
 
!$-- D%,1O1O % % % %#'#4% % %! ())A--(\t\\0A\\\   .
 
rU   N)r   r  r  r  )	r   ro   r  r  rn  r  rn   r   r  )r   r  r  rF  r^   r   r  _s           rS   r  z#PythonWrapperCodegen.codegen_inputs  s   -//
 ,,..
 
+1133
 
 
XX 2 2 4 4XXXY " 	J 	JKD%00ujIIII	 	 	 	&  	? 	?HAueR\22 ++E:>>>>	? 	?rU   r  r   c                t   t          |t          j                  rt          |t          j                  r|| j        v rd S | j                            |           t          j	        j
        j        |         }t          ||          }|                     t          | |t          j	                             d S d S d S rH   )rn   ro   r  r   r   PRECOMPUTED_SIZEr  r   r2   rI   rO   inv_precomputed_replacementsr   r   rA  )r   r  r  rB  s       rS   ensure_size_computedz)PythonWrapperCodegen.ensure_size_computed$  s    c5<(( 	D^CAV-W-W 	Dd)))##C(((7#@ED!#t,,CNN.tS!'BBCCCCC	D 	D 	D 	DrU   c                    d S rH   rx   r   s    rS   rx  z$PythonWrapperCodegen.finalize_prefix-  r  rU   rP   r   r   rP   c                    t          d          )Nz8codegen_cpp_sizevar is only implemented for cpp_wrapper!)r  r   r   rP   s      rS   codegen_cpp_sizevarz(PythonWrapperCodegen.codegen_cpp_sizevar0  s    UVVVrU   c               $    t          ||          S )Nr  )pexprr  s      rS   codegen_python_sizevarz+PythonWrapperCodegen.codegen_python_sizevar3  s    Q****rU   c                ,    |                      |          S rH   )r  )r   r   s     rS   codegen_sizevarz$PythonWrapperCodegen.codegen_sizevar6  s    **1---rU   r6  r  c                    | d| dS )Nr1  r2  rx   )r   r6  r^   r  s       rS   r4  z)PythonWrapperCodegen.codegen_tuple_access9  s    %%U%%%%rU   r  Sequence[Expr]c                    g t          | j        |          }t          |          dk    rdS t          |          dk    rd|d          dS dd                    |           dS )Nr   ()r3   r  r  r  r   )rE  r  r   r   )r   r  partss      rS   r   z/PythonWrapperCodegen.codegen_python_shape_tuple<  sn    :#d1599:u::??4u::??$uQx$$$$&499U##&&&&rU   c                ,    |                      |          S rH   )r   )r   r  s     rS   r  z(PythonWrapperCodegen.codegen_shape_tupleD  s    ..u555rU   tuple[str, list[str]]c                    d                     d                    |t          |          t          |          |                     |          |                     |          g                    g fS )Nzalloc_from_pool({})r  )formatr   r  r_   r   )r   r^   offsetr   r  r  s         rS   codegen_alloc_from_poolz,PythonWrapperCodegen.codegen_alloc_from_poolG  ss     %++II&MMJJ33E::33F;; 

 

 
 
	rU   r   c                   ||j         j        k    r]||j         j        k    rM||j         j        k    r=|&||j        k    rd|                                 d| dS |                                 S |                     |          }|                     |          }|                     |          }|/||j        k    r$d|                                 d| d| d| d| dS d|                                 d| d| d| d	S )Nzaten.view.dtype(r  r   z#aten.view.dtype(reinterpret_tensor(z), zreinterpret_tensor()r  r  r  r  r   rK   r   r  )r   datar  r  r  r   r   s          rS   codegen_reinterpret_viewz-PythonWrapperCodegen.codegen_reinterpret_viewV  s6    DK$$$$+,,,$+,,, Udj%8%8E$--//EEUEEEE--//++22488D44V<<F))&11F Udj%8%8uT]]__uuPTuuX^uubhuumruuuu Y$--//XXTXXVXXvXXXrU   non_blockingUnion[bool, str]c                B    |                      | d| d| d           d S )Nz.copy_(r  r   rd  )r   r   dstr  s       rS   codegen_device_copyz(PythonWrapperCodegen.codegen_device_copys  s3    #<<c<<\<<<=====rU   ir.MultiOutputc                    |                                 }|                    d          }|                     t          | |||j                             d S r   )rK   
input_namer   r*  r.  )r   rC   r+  r,  s       rS   codegen_multi_outputz)PythonWrapperCodegen.codegen_multi_outputv  sH    mmoo??1%%t[(DLQQRRRRRrU   c           
         |j          d|j         d|j          d|j          }|                     |j         d|j         d|j         d| d           | j                            t          |j                             d S )Nz + z if z
 < 0 else r   z * (r   )	r  r  r   unbacked_offset_symbolbase_offsetbase_dim_stridere  r   r_   )r   rC   	index_strs      rS   codegen_dynamic_select_indexz1PythonWrapperCodegen.codegen_dynamic_select_index{  s    zWWdiWWTZWW4:WW	*jjt/?jjDDXjj^gjjj	
 	
 	
 	"&&s4+F'G'GHHHHHrU   c                   d |j         D             \  }t          |j                  dk    r"|                     |j         d| d           nLt          |j                  dk    rAt          |j        d         t                    r!|                     |j         d| d           nt          |j                  dk    rt          |j        d         t                    r|                     |j         d| d           |                     d	|j         d
|j        d         j         d|j         d|j        d         j         d	           |                     |j         d|j         d|j        d         j                    nt          d|j                   |                     |
                                 d           d S )Nc              3  >   K   | ]}|                                 V  d S rH   rd  )ry   ts     rS   r{   z>PythonWrapperCodegen.codegen_dynamic_scalar.<locals>.<genexpr>  s.      >>Q1&&((>>>>>>rU   r   r   .item()r3   z = 1 if z.item() else 0z_undivided = zassert z_undivided % z
 == 0, f'{z_undivided} not divisible by 'z_undivided // unrecognized keypath z = None)rF  r   keypathr   r  rn   r   r   divisorr  rK   )r   rC   r  s      rS   codegen_dynamic_scalarz+PythonWrapperCodegen.codegen_dynamic_scalar  s   >>$+>>>t|!!NNdh8848889999!##
4<?M(R(R#NNdhDDDDDEEEE!##
4<?K(P(P#NNdhBBTBBBCCCNNZ$( Z Za1H Z ZxZ Z?C|A?VZ Z Z   NN8QQQQQ8OQQ    !!G!G!GHHH 	$--//22233333rU   c           
          fd}fd}fd}                     g d                                           5                      dd           t          j        j                                        D ]Z\  }}                    d|             |||                                |	                                |j
        |j                   [t          t          j        j                  d	k    rb                    d
           t          j        j                                        D ])\  }}                    d|             |||           *t          j        j                                        D ]\  }}t          |t           j                  r>t          t          j        j        j                            |d           t*                    r^t          |t,          j                  rnt          t          j        j                  d	k    r                    d
                               d|             |||                                           t          |t           j                  r2 ||t          j        j                            |d                     2t          |t,          j                  r ||d|j
        j         d           hd |                                D             }d |                                D             }	 ||||	|                                |                                            dd!                    t          j        j        "                                           d}
                    d|
                                d           d d d            d S # 1 swxY w Y   d S )Nc                                         |  d                    |           d                    |           d| d| d
           d S )Nz = rand_strided(r  
, device='	', dtype=r   )r   r   )r^   r  r  rh  r   r   r   s        rS   add_fake_inputzFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_fake_input  s     5 5225995 5226::5 5 "5 5 -25 5 5    rU   c                <                         |  d|            d S rB  rd  )r^   r  r   s     rS   add_expr_inputzFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_expr_input  s+    ..../////rU   c                    dd l }t          |t          j                  sJ                     |  d|                    |          d           d S )Nr   z = pickle.loads(r   )picklern   r  ScriptObjectr   dumps)r^   r   r  r   s      rS   add_torchbind_inputzKPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_torchbind_input  s\    MMMeU%788888NNfll56I6INNNOOOOOrU   )rI  rI  z3def benchmark_compiled_module(times=10, repeat=10):z
                from torch._dynamo.testing import rand_strided
                from torch._inductor.utils import print_performance
                Tr   zglobal r   zimport pickle*   r  ztorch.cuda.default_generators[z].graphsafe_get_state()c                Z    g | ](}t           j        j                            |d           )S r  r  r2   rI   rO   r  ry   r   s     rS   r  zBPythonWrapperCodegen.benchmark_compiled_module.<locals>.<listcomp>  s@        (221r2BB  rU   c                Z    g | ](}t           j        j                            |d           )S r  r  r	  s     rS   r  zBPythonWrapperCodegen.benchmark_compiled_module.<locals>.<listcomp>  s@        (221r2BB  rU   zcall([r  ])zfn = lambda: z8return print_performance(fn, times=times, repeat=repeat))#
writelinesr   r   r2   rI   	constantsrn  r   r  r  rh  r   r   torchbind_constantsr  rn   ro   r  rO   
var_to_valr   r   r   r  get_real_objr   r  r  r  r  r  r_  rN   r   r  )r   r   r  r  r  r^   r   torchbind_objr  r  call_strs   ``         rS   benchmark_compiled_modulez.PythonWrapperCodegen.benchmark_compiled_module  s   	 	 	 	 	 		0 	0 	0 	0 	0	P 	P 	P 	P 	P 	KKK	
 	
 	
 ]]__ E	Y E	YMM       !w06688  e   !14!1!1222%**,,ek    17.//!33  111+,7+F+L+L+N+N = ='D- $$%5t%5%5666''m<<<< w399;; ( (eeU\22 zG$/33E4@@,8 8  eR%788  176771<<((999$$%5t%5%5666''e.@.@.B.BCCCCuz22 
 #N4)9)C)CETV)C)W)WXXXXr'899 "Nd9Kddd   
 !&!1!1  E !&!1!1!3!3  F #N((**))    K		!'*>*C*C*E*E F FJJJH7X77888WXXXKE	Y E	Y E	Y E	Y E	Y E	Y E	Y E	Y E	Y E	Y E	Y E	Y E	Y E	Y E	Y E	Y E	Y E	Ys   M=OOOc                "   t           j        sdS |                     |           |                    g d           |                                5  |                    ddt                       dg           ddd           dS # 1 swxY w Y   dS )zL
        Append a benchmark harness to generated code for debugging
        N)rI  rI  zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzcompiled_module_main('z', benchmark_compiled_module))r   benchmark_harnessr  r  r   r(   r   r   s     rS   ry  z*PythonWrapperCodegen.add_benchmark_harness  s     ' 	F&&v...@@@AAA]]__ 	 	X`-?-A-A```  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   (BBBrf  r  r  r  r  c           
     V    |                      t          | |||||                     d S r  )r   r  )r   rf  r  r  r  r  s         rS   define_kernelz"PythonWrapperCodegen.define_kernel  sH     	 !-  		
 		
 		
 		
 		
rU   c                ,    |r| dnd}d| |  d| }|S )Nr9  rI  z

r   rx   )rf  r  r  metadata_commentbodys        rS   _format_kernel_definitionz.PythonWrapperCodegen._format_kernel_definition  s9     /7>h????BE&EEEEErU   c                   t           j        j        rE|                     ||d           }| j                            |           t          j        j        rd S |                     |||          }| j	                            |           d S )N)r  )
r   r   r   r  rW  r   r2   rI   r<  rR  )r   rf  r  r  r  r  r  s          rS   r  z*PythonWrapperCodegen._define_kernel_helper"  s     =1 	11[4 2  D %,,T222w" --x . 
 
 	4     rU   fn_codec                :    | j                             |           d S rH   )rX  r   )r   r  s     rS   define_subgraph_launcher_fnz0PythonWrapperCodegen.define_subgraph_launcher_fn9  s    !((11111rU   rb   "list[list[Union[int, sympy.Expr]]]c                b  *+,-./0 ddl m}m}m}	 ddlm*m}
m}m}m	} ddl
m}m} |j        }g 0i -g ,g },0fd+d4*+-fd	}t          |j                  D ]\  }}||j        v r || *|	          d
           (|vr-|         }|          || *|	          d
           Vt#          |t$          j                  r[t#          |t$          j                  r!d|j        |j                                        fnd\  }}} || |||||                     t#          |t$          j                  r> || |||                                |                                                     #t#          |t$          j                  rN || |||j                                        |                                |j        j                             t#          |t<          t>          j         f          o$tB          j"        j#        $                    |d          } || |||          |           tK          0d ,d |j        D                       }|tM          j'        tB          j"        (                                          i -tR          *                    |d          tW          0,          gd}|rtY          |          |d<   |rtY          |          |d<   t[          |          dk    r8|.                                }g t_          t>          j0        |d                   }nd5.fd/i ./fd|D             }|r t[          |          t[          |          k    sJ g }tc          te          ||          d  d
!          D ]g\  }} |3                     ||           g t_          th          |          g t_          tj          |          g t_          th          |          d"           h|	j        |g t_          tl          .7                                          d#}g .8                                }ts          |j:                  g}!t[          |          dk    rR7                                D ]=}t#          |t$          j        t$          j        f          s|!3                    |           >|!3                    tm          |                     |!;                    tm          |                     tY          |!          }!|!| j<        v rg | j<        |!         |R S | d$t[          | j<                   }"t{                      }#t|          j
        j?        r|#@                    d%|"d&           n|#@                    d%|d&           |"|d'<   |A                    |B                                           |#C                     |                       |#C                    d(g t_          ||          d)|d*|d+           t          |          }$t|          j
        j?        r|$E                    d,| d-d,|" d-          }$|$E                    d.d/          }$|#C                    |$           tB          j"        (                                }%|#@                    d0|%jF         d1           t          jH        |j:                  \  }&}'t          jI        |j:                  }(d2|( d3|' })| J                    |"|#K                                |)           |"|f| j<        |!<   |"||fS )6Nr   )config_to_dict	FixedGridPrecomputedGridr3   )ConstexprArgKernelArgTypeSizeArg	TensorArgTMADescriptorArg)gen_common_triton_importsTritonKernelc                \                         |                                |            d S rH   )r   )r  rB  arg_indices	signatures     rS   add_to_signaturezPPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_to_signatureY  s1    S!!!s#####rU   Fc                   |r<t                      r | |           |j        v r|j                 |j        <   d S d S |j        v sJ |rBt                      r |  |j                             n | |           d|j        <   d S |r5t                      r |  |j                             d |j        <   d S  | |           d S )Nr^   r3   )r1   r^   )	r  rB  is_constexprequals_1equals_noner&  r0  r  r   s	        rS   add_argzGPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_arg]  s;    !/133 / %$S#...8v%% +1*:Ich''' &% x6)))) /577 3
 )(ll.I.I.IJJJJ((c222*+Ich'''  /577 K )(ll.I.I.IJJJ*.Ich'''$$S#.....rU   r2  T)r3  )r5  stable)experimentalNN)r^   api_typer;  r   )r^   bufferr   )r^   r:  r   r  )r4  c                ,    g | ]}t          |          S rx   )r4   r	  s     rS   r  zJPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<listcomp>  s    :::AWQZZ:::rU   )
size_dtyper.  argdefs)r.  )r/  rh  r  r`   restore_valuereset_to_zeror   r  rk   rE   rl   c                `   t          | t          j                  rig | j        }|s| S |                    t
                     |D ].}|v rt          j        dt                               |<   /t          |           S t          | t                    sJ t          j
        |           S )N)r   _launcher_s)rn   ro   r   r  sortr_   r  r   r0   r   rp   )r  symbolsr  extra_launcher_argss      rS   rename_sizes_for_launcherzYPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.rename_sizes_for_launcher  s    dEJ// A2 12G" $#LLSL)))&  "555$38<D#.A*B*BDD4 4+C00 &d,?@@@!$,,,,,}T***rU   c                4    g | ]}g t          |          S rx   )rE  )ry   rr   rE  s     rS   r  zJPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<listcomp>  s*    OOO<s4d;;<OOOrU   c                6    t          | d         j                  S r   r   r   s    rS   r   zHPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<lambda>  s    3qt{3C3C rU   r   )r   pythoncpppython_slow)	grid_typeprecomputed_gridsrD  r  zasync_compile.triton(z, '''rf  zG
            @triton_heuristics.user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=z{,
                filename=__file__,
                custom_kernel=True,
            )
            @triton.jit
            r   r  z'''z\'\'\'z''', device_str='r  z# Original path: r;  )FFF)r  rk   rE   rl   )Lruntime.triton_heuristicsr#  r$  r%  commonr&  r'  r(  r)  r*  r   r+  r,  r   r  	arg_names
constexprsrn   r   TMADescriptorr?  r;  r2  rN   r  rK   r#   r  r  r  r   ro   rp   r2   rI   rO   statically_known_equalsr=   r%   r  get_current_device_or_throwr5  fromkeysr;   r   r   setup_grid_as_argsrE  sympifyr   r   r   r  r:   r_   rL  r  idr   extendrd  r*   r   unique_user_kernel_namesr   updateinductor_meta_commonr   r   replacer   inspectgetsourcelinesgetsourcefiler  r   )1r   r   r`   r   restore_value_argsreset_to_zero_argsrb   r#  r$  r%  r'  r(  r)  r*  r+  r,  original_nameequal_to_1_argsr6  r  r   rB  r9  r;  r   r4  triton_signaturer}  inductor_metaextra_launcher_call_argsrL  rr   cfg	cache_keyr^   r   
kernel_srccurrent_devicer  linenosrcfiler  r&  r0  r.  r  rD  rE  r/  s1      `                                      @@@@@@@rS   !define_user_defined_triton_kernelz6PythonWrapperCodegen.define_user_defined_triton_kernel<  s!	   	
 	
 	
 	
 	
 	
 	
 	
 	
 	

	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	DCCCCCCC)+	$&	!#%'	$ 	$ 	$ 	$ 	$ 	$"	/ "	/ "	/ "	/ "	/ "	/ "	/ "	/ "	/H "&"233 9	G 9	GHCf'''\\s333$GGGG&  +Cc{"\\s333FFFFFc2#344 ,G &c2+ABB:3?CJ4H4H4J4JKK9 1Hk5
 G((!$%-(3"'	       RY// GG!	!$#&<<>>"%--//       R%788 G G!	!$#&8#4#4#6#6"%--//#&:#4	       *c5=1     '*BB   GCc!2!2XFFFFF,::)9:::	
 
 
 *&-ag.Q.Q.S.STT--33
 '  '
 '
,  	E+01C+D+DK( 	E+01C+D+DK(u::??,5,H,H,J,JM'FU]E!H)E)E'F$$+ + + + + +  EGOOOOOOOE7SZZ3w<<7777 "#E7##)C)CT   
 
	c "(("0."5"5"5Ct$4$4"52UD!1!12':UD)9)9':	     -5%6'PS2E2L2L2N2N)O)O'P M
 (E)<)A)A)C)C'D$ VY--	w<<!}} * *!#	23E'FGG *$$S)))[))***]++,,,)$$	666/	:(  
  GG#d&D"E"EGG(**=1 	V%%&Kd&K&K&KLLLL%%&Tm&T&T&TUUU'+m$\>>@@AAA88::;;;	83~w778	 	  -	 	 )		 	 		
 	
 	
 OvVV
=1 	U#++,C=,C,C,C^D^^^TTJ''{;;
z***<<>>!!"Mn6I"M"M"MNNN*6955	6'	229w9999$$&&	
 	
 	
 6:;4G&y1[":::rU   rT  c                    | d|j          d}||d| z  }t          j        |dd          }t          ||j                  }|                     t          | |t          j                             |S )Nr  r	  T)
is_integeris_positive)	rS  ro   r  r   r	  r   rA  r2   rI   )r   rf  treerT  sym_namer  rB  s          rS   generate_numel_exprz(PythonWrapperCodegen.generate_numel_exprH	  s    !66DK666F$Hl8$GGG c4:..*4ag>>???
rU   rB  r   rI   r@   c                h    |                      |j         dt          |j                              d S rB  )r   r   r  r   )r   rB  rI   s      rS   rD  z7PythonWrapperCodegen._generate_symbolic_call_arg_helperZ	  s6     	#)??cn(=(=??@@@@@rU   wsr8   c                   |                                 }t          | |          }|j        t          j        k    r|                     |           n1|j        t          j        k    r>|                     |           |                     |                     |                     n|j        t          j        k    r| j	        
                    |          }|rQt          |t                    rt          |j        t                    sJ t          j        |j        |          |_        n\|                     |           |                     |                     |                     || j	        |<   nt          |j                  t           j        j        r| j                            t(                              | ||j        |j        t0          j        j                            |j                  fd                     |j        t          j        k    r7| j                            t(                              | |                     d S d S d S )N)r3   )r  r  )rK   r  	zero_moder9   UNINITIALIZEDr   ZERO_ON_CALLmake_zero_bufferZERO_PER_GRAPHr{  r   rn   rC   r8   maximumr  r   r   r   r   r  make_allocationrh  r   r2   rI   rO   r  r   )r   ru  r^   r   priors        rS   generate_workspace_allocationz2PythonWrapperCodegen.generate_workspace_allocation_	  s$   {{}}D"%%<,:::NN4    \.;;;NN4   NN400667777\.===-11$77E 	7!%66 :J< <    *1%*bAA

t$$$t44T::;;;26)$// ...=1 	&00$44IH7+55bh??A 5  	 	 	 |0>>>*44(99$EE    	 	 ?>rU   c                z    |j         t          j        k    r%|                     t	          | |                     d S d S rH   )rw  r9   r{  r   r   )r   ru  s     rS   generate_workspace_deallocationz4PythonWrapperCodegen.generate_workspace_deallocation	  s>    <,;;;NN.tR8899999 <;rU   c                    | d| j          S )Nz.zero_())r<  )r   r^   s     rS   rz  z%PythonWrapperCodegen.make_zero_buffer	  s    -----rU   c                F    | dd                     |           d| j         S )Nr  r  r   )r   r<  )r   r^   rx  s      rS   rK  z%PythonWrapperCodegen.wrap_kernel_call	  s,    ==9--=====rU   c                    | j                             d           | j                             dt          j        j         d           |                    | j                                                    d S )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)rV  r   r2   rI   graph_idrm  r   )r   r{  s     rS   ro  z8PythonWrapperCodegen.generate_profiler_mark_wrapper_call	  ss    ##$PQQQ##V17+;VVV	
 	
 	
 	D-446677777rU   c                :    | j                             d           d S )Nzstart_graph())rV  r   r   s    rS   rp  z)PythonWrapperCodegen.generate_start_graph	  s    ##O44444rU   c                V    | j                             dt          j        d           d S )Nz
end_graph(r   )rV  r   r   profile_bandwidth_outputr   s    rS   rt  z'PythonWrapperCodegen.generate_end_graph	  s.    ##$U1P$U$U$UVVVVVrU   c                V    | j                             dt          j         d           d S )NU
            for kernel in globals().values():
                if isinstance(kernel, zU.CachingAutotuner):
                    kernel.cuda_kernel_saved = False
            rV  r   r$   r   r   s    rS   rs  z6PythonWrapperCodegen.generate_reset_kernel_saved_flags	  sA      '8'A  	
 	
 	
 	
 	
rU   c                V    | j                             dt          j         d           dS )a[  
        Precompile and save the CUBINs of the Triton kernels that haven't
        been precompiled and saved as a side effect of running the generated
        JIT model (Python wrapper). This can happen when the model contains
        control flow: only one pass through the control flow operators covers
        the kernels that are saved, the remaining kernels are not launched,
        hence not saved. The main purpose of this codegen is to compile and
        save the Triton kernels outside the active control flow path for
        subsequent AOTInductor code generation and compilation.
        r  a  .CachingAutotuner):
                    if not kernel.cuda_kernel_saved:
                        if len(kernel.launchers) == 0:
                            kernel.precompile()
                        kernel.save_gpu_kernel(
                            grid=(0, 0, 0),   # use dummy grid
                            stream="stream",  # use dummy stream
                            launcher=kernel.launchers[0],
                        )
            Nr  r   s    rS   ru  z5PythonWrapperCodegen.generate_save_uncompiled_kernels	  sC     	  '8'A  	
 	
 	
 	
 	
rU   c                &    d fd|D             S )Nc                (   t          | t                    rt          |           r| dz   n| S t          | t          t          t
          t          f          rt          |           S t          t          j	        j
                            |                     S )Nr  )rn   r_   r<   r   floatr   r   r  r2   rI   rO   rP   )rB  s    rS   wrap_argzAPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.wrap_arg	  sv    #s## =*B3*G*GPsYSPC#udO!DEE =3xxQW-66s;;<<<rU   c                &    g | ]} |          S rx   rx   )ry   rB  r  s     rS   r  zCPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.<listcomp>	  s!    333#333rU   rx   )r   rx  r  s     @rS   prepare_triton_kernel_callz/PythonWrapperCodegen.prepare_triton_kernel_call	  s.    	= 	= 	= 43333333rU   c                x    t          |t                    r2t          |t          j                  r4|                                                                } j        |         }nR j                            |          r|} j        |         }n(|
J d            d j         }|} xj        dz  c_        |J d|             t          d |
                                D                       }t          d t          j                            |          D                       }t          d |                                D                       }|                                }	|                                }
t          j        j                            |                                j        t*          j                  }d	| d
| d|	 d|
 d
| d
| d} j                            | d|            t          |t          j                  r8                     |d          }|} j                            | d|            |S t5          |t6          j                  st          |t:                    rt          |t<                    r| j        v r|S |dS |}t          |t:                    r|j         }|t          j        j        j!        v rt          j        j        j!        |         }t=          t          j        j        "                    |t*          j                            S t          |t<          tF          tH          tJ          f          rt=          |          S t          |tL                    r%dd
'                     fd|D                        dS tQ          dtS          |                     )NzBV.graph.get_buffer(arg) and raw_arg can't be None at the same timetmp_arg_r3   z Failed to find a buffer for arg c              3  v   K   | ]4}t           j        j                            |t          j                   V  5dS r  Nr2   rI   rO   r&  r   unbacked_symint_fallbackry   r  s     rS   r{   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>	  s[        
 	  ;;#< <       rU   c              3  v   K   | ]4}t           j        j                            |t          j                   V  5dS r  r  r  s     rS   r{   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>	  s[       $ $
 	  ;;#< <  $ $ $ $ $ $rU   c              3  v   K   | ]4}t           j        j                            |t          j                   V  5dS r  r  r  s     rS   r{   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>	  s[        
 	  ;;#< <       rU   r  zgenerate_example_value(r  z, 'z', r   r   T)r5  r6  r  r1  c              3  ^   K   | ]'}                     |t          |                    V  (d S rH   r}   )ry   ar   s     rS   r{   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>
  s:       Z ZQR!@!@DGG!L!L Z Z Z Z Z ZrU   r2  zUnsupported type )*rn   torch_dtyper   rQ  
get_tensorrK   rQ  r   rZ  r   r  r2   rI   get_allocation_sizer  r_  rN   rO   r  
get_layoutr  r   r  r   r   r@  r3  ro   Basicr   r_   rx  r   r  r&  r   r  r   r   r   r  r   )r   rB  arg_typeraw_argrP  r  r  allocation_sizer  rh  r   r  r   s   `            rS   r~   z/PythonWrapperCodegen.generate_example_arg_value	  s   h,, R	G'2#344 6"--//88::*3/%))#.. 	6*3/**X +** Id&FHH00A500??$Ls$L$L???  
     D $ $ $
 44S99$ $ $  O   
 ))    F ^^%%FMMOOEW%//  '8 0  F rdqqfqqqqEqqU[qq_nqqqE&00H1H1H1H1HIII'2#344 N :: %) ;   *445L5LU5L5LMMMO%+.. 	G*S/2R2R 	G#s## $/))J?!6#// %nag&CCCg&CCH ;;&"A <     c3t455 	Gs88OT"" 	G]tyy Z Z Z ZVY Z Z ZZZ]]]]%&E$s))&E&EFFFrU   c                     t          |t                    r'dd                     fd|D                       z   dz   S t          |          S )Nr1  r  c              3  B   K   | ]}                     |          V  d S rH   )_grid_dim_str)ry   rj   r   s     rS   r{   z5PythonWrapperCodegen._grid_dim_str.<locals>.<genexpr>$
  s1      RRT 2 24 8 8RRRRRRrU   r2  )rn   r   r   r  )r   grid_per_dims   ` rS   r  z"PythonWrapperCodegen._grid_dim_str!
  sY    lD)) 	'diiRRRR\RRRRRRUXX &&&rU   )rh  r   r|  ry  rz  r}  rf   r  c               2   | j                             d |D                        |pt          j                                        }|                     ||
           |                     t          | ||||||||t          j        j        |	                     dS )z
        Generates kernel call code.

        triton: Defines whether the backend uses Triton for codegen. Otherwise it uses the CUDA language when gpu=True,
                and C++ when gpu=False.
        c                x    i | ]7}t          |t                    |t          j                            |          8S rx   )rn   r_   r2   rI   try_get_buffer)ry   rB  s     rS   r  z=PythonWrapperCodegen.generate_kernel_call.<locals>.<dictcomp>A
  sJ       c3''QW++C00  rU   )
rf  rx  ry  rz  r|  r   r}  rh  r~  rf   N)	rQ  rZ  r2   rI   rS  r!  r   rw  r^   )r   rf  rx  rh  r   r|  ry  rz  r}  rf   r  s              rS   generate_kernel_callz)PythonWrapperCodegen.generate_kernel_call)
  s    . 	## $  	
 	
 	
 @17>>@@**;EEE'#!!#'7<%9  	
 	
 	
 	
 	
rU   rI  )rh  r   r|  ry  rz  r}  r~  rf   c          
     
    |pt           j                                        }|s|j        dk    r|j        dk    r*                                          |                     nP|j        dk    r-                                           d|                     nt          d|j         d          d S                      |          }d                    |          }t          
                     |j        |	          }|s)d| d	}                      d
 d| d| d	           d S                                   t          j        j        r j        vrv| t#          |          t#          |          k    s
J d            d |
r6t           j        j        r%t           j        j                            |
d           d fd} fd}g }|3|
J d            d gt#          |          z  }d gt#          |          z  }n*t#          |          t#          |          k    s
J d            i }t)          t+          ||||                    D ]|\  }\  }}}}d }t-          |t.                    r)dt/          |          v r|                    d          \  }}d }r|v r                     |                   }|rL|}t-          |t4                    s4t7          |t8          j                  st-          |t<                    r|||<   n|dk    r |||||          r	||         }nt-          |t4                    rYt?          j         d|          r|}n4| j!        vr "                    |||          }n j!        |         d         }|f j!        |<   n "                    |||          }|#                    ||n| d|            ~ j$                            dt           j        j%        &                    |j                   d            j$        '                                  j$                             dd                    |           d| d	            j$        (                                  j$                            tS          d|d                      j        *                               t           j        j+        rd S t           j        j,        j-        }|.                    ||d            |5                        d| d| d	           d d d            n# 1 swxY w Y                                     d S )Ncudar  mpsz.generated_kernelzdevice z nyir  z	c_void_p(r   r   r  z$call_args and arg_types do not matchrE   r_   c                     fdj                                         D             } | rdd                    |            dS dS )a  After all the autotune kernel calls have been written (i.e.
                self.kernel_autotune_example_args is complete), returns a deletion call
                for all autotune example tensors that are unnecessary after kernel_name
                is called.c                &    g | ]\  }}|k    |S rx   rx   )ry   r2  knrf  s      rS   r  ziPythonWrapperCodegen._generate_kernel_call_helper.<locals>.get_autotune_deletion_call.<locals>.<listcomp>
  s2     % % %"[(( (((rU   del r  r9  rI  )rY  rL  r   )tensors_to_deleterf  r   s    rS   get_autotune_deletion_callzUPythonWrapperCodegen._generate_kernel_call_helper.<locals>.get_autotune_deletion_call
  si    
% % % %&*&G&N&N&P&P% % %!
 % CB$)),=">">BBBBrrU   c                   ||         }||v rdS t          t          | |                    D ]\  }\  }}||k    st          |t                    s$d}r|v r                    |                   }|dk    rN	 |                                }	t          |	j                  D ]\  }
}||k    r| d|
 d||<     dS # t          $ r Y w xY wdS )zWe try to infer raw_arg (i.e. raw_args[idx]) from remaining raw_args.
                This is particularly useful for jagged cases, where the dimension is often
                being passed in as an input.TrI  z.shape[r2  F)r  r   rn   r"   r  r  r  r  )ry  rz  r  reused_args
target_argr8  raw_keyr  triton_inputr  r-  r  autotune_argsr   s               rS   infer_arg_by_inputszNPythonWrapperCodegen._generate_kernel_call_helper.<locals>.infer_arg_by_inputs
  s=   
 &c]
,,4-6s8X7N7N-O-O ! !)A)Cxxz'6'B'Bx #%L$ M)A)A'+'E'E)'2( ( $r)) 	!!(!3!3!5!5&/&<&< , ,FC J=I:X:XRU:X:X:XJ 7'+ttt  /, / ! ! ! !! us   <?C >C  
CCzkeys are not None but args arez#call_args and raw_args do not matchr  rI  z^(workspace|semaphore)r   r:  r;  z.run(z	, stream=z
<del_call>r  )/r2   rI   rS  r   r   rK  r  r  r   r  ru  r  r  r   r   r   r   r   autotuning_mappingr   r  r   rn   r_   splitr  r  r3  ro   r  r   rematchrY  r~   r   r   r>  rA  r  r0  r'   r   r<  r  r  r   )r   rf  rx  rh  r   r|  ry  rz  r}  r~  rf   call_args_strstream_name
stream_ptrr  r  all_argsr  r8  rB  r  r  r  r   r  arg_strr"  r  s   ``                         @rS   r  z1PythonWrapperCodegen._generate_kernel_call_helperZ
  s    @17>>@@ 
	&+//{e##t44[)LLMMMM%%))[*K*K*KYWW    ##>V[#>#>#>???F77	BB		-00*??&,

 
  	3[333JNNMMMM}MM
MMM   F%%''' M2E	4#=== (S^^s9~~-M-M-M6 .N-MM !M# (B  ! : > >($! !           B H'')I''' 6C	NN2 6C	NN28}}I6669 766 K8AIy(H==9 9 )P )P44C7G c3'' .C3s88OO"yy~~HC.2  W%=%=#'#A#A%g.$ $L   V*G%h<< 7"8U[997%c?;;7 07G,]]':':h;( (]
 *'2GG+66 V x 93?? L"%D$EEE"&"A"A7# # #'"CC"H"K>E{=SD5c::"==c8WUUG3;s<N<NW<N<NOOOO &00H*77EEHHH   &00222&00QQTYYx%8%8QQ;QQQ   &22444&00 /I<XX   &**;777w"  !" 4 B..y+yRVWWW" 	X 	XNNkVVVVVVVWWW	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X%%'''''s   T//T36T3c                :    | j                             |           d S rH   )r]  r   r   r   s     rS   r   zPythonWrapperCodegen.writeline  s    
$rU   c                :    |D ]}|                      |           d S rH   rd  )r   r]  r   s      rS   r  zPythonWrapperCodegen.writelines  s0     	! 	!DNN4    	! 	!rU   c                T    | j                             t          |                     d S rH   )r]  r   r-   )r   ctxs     rS   rm  z"PythonWrapperCodegen.enter_context  s&    
+c**+++++rU   c                r    ddl m}  |            rdd l}t          |t                    rt          |j        j                  S t          |t          j	                  rt          |          S t          |t          t          f          rQt          j         G d d                      t           t          |           fd|D                                 S t          |t           j        j                  rt'          |          S t          |t(          j        t(          j        t.          f          r|                                S  |            r)t          ||j        j                  rt          |          S t          |t(          j                  r|                                S t          |          S )Nr   )has_triton_packagec                       e Zd ZU ded<   d ZdS )1PythonWrapperCodegen.val_to_arg_str.<locals>.Shimr   refc                    | j         S rH   )r  r   s    rS   __repr__z:PythonWrapperCodegen.val_to_arg_str.<locals>.Shim.__repr__+  s	    8OrU   N)r   r   r   r   r  rx   rU   rS   Shimr  '  s.         $ $ $ $ $rU   r  c              3  `   K   | ](} t                               |                    V  )d S rH   r*  )ry   r  r  r   s     rS   r{   z6PythonWrapperCodegen.val_to_arg_str.<locals>.<genexpr>0  s>      VVq1@@qIIJJVVVVVVrU   )torch.utils._tritonr  r   rn   r   r  rC   r  ro   r   r   r   r  	dataclassr  r   r  _ops
OpOverloadr   r   r  
MutableBoxr#   rd  languager   r  )r   r  type_r  r   r  s   `    @rS   r+  z#PythonWrapperCodegen.val_to_arg_str  s   :::::: 	MMMa"" 	%%%5:&& 	88OE4=)) 	"$ $ $ $ $ $ $ #"$ QVVVVVTUVVVVV   5:011 		&q)))BIr}oFGG 	&&(((!! 	jFO4I&J&J 	77N2,-- 	&&(((77NrU   r:  rD   c           	        |                                 }|                                }t          |                                          }t          t          j                            |                    }t          |                                          }|                                }| 	                    |
                                ||||||          S rH   )r_  rN   r   r  r2   rI   r  r  get_is_pinnedr}  rK   )r   r:  rh  r   r  allocation_shaper  	is_pinneds           rS   r  z+PythonWrapperCodegen.make_buffer_allocation>  s    ""$$  ""foo''(( !<!<V!D!DEEv((**++((**	##OOvueV=My
 
 	
rU   c                h    d}t           j        j        s| j                            |d           d S d S )Nzi
            from torch._inductor.runtime.debug_utils import check_memory_step, track_tensor
            Tr   )r2   rI   r<  rL  r   r  s     rS   "write_memory_track_allocation_oncez7PythonWrapperCodegen.write_memory_track_allocation_onceI  sC    
 w" 	8L
$77777	8 	8rU   c                   ||}|                      |          }|                      |          }	|                      |          }
t          j        j        j        j        r| d|	 d|
 d| d|j         d| d}nR|j        dk    r|r| d|	 d|
 d| d	}n6|j        d
v r| d|j         d|	 d|
 d| d	
}n| d|	 d|
 d|j         d| d	
}||	k    r|d| d|
 d	z   }|S )Nz = tracked_empty_strided(r  z, dtype=r  z	', name='r  r  z = empty_strided_cpu_pinned(r   )r  r  xpumtiaz = empty_strided_r  z = empty_strided(r  z.as_strided()r   r  r  r   r  r  r   )r   r^   rh  r   r  r  r  r  r  codegen_allocation_shape_tuplecodegen_stride_tupler  s               rS   r}  z$PythonWrapperCodegen.make_allocationQ  s    #$"==eDD)-)H)H*
 *
&  $>>vFF?!.E 	 " "1" "'" " " " ";	" "
 " " " C [E!!i!  1 '     C [:::   &+  1 '     C  : :1: :': : ";: : 27: : :  "@@@U':UU>RUUUUC
rU   c                J    |                      t          |                     d S rH   )r   r%  r  s     rS   make_commentz!PythonWrapperCodegen.make_comment  s"    {4(()))))rU   c           	     B    | j          | d| | j         d| j         d| 	S )Nr      )r;  r<  r_  )r   new_nameold_namer_  s       rS   make_tensor_aliasz&PythonWrapperCodegen.make_tensor_alias  s7    ,^^^X^t{^^dl^^U\^^^rU   ro  c                0    d|                                  S )Nr  rK   )r   r:  s     rS   rr  z%PythonWrapperCodegen.make_buffer_free  s    )foo'')))rU   names_to_delc                F    dd                     d |D                        S )Nr  r  c              3     K   | ]}|V  d S rH   rx   )ry   r^   s     rS   r{   z:PythonWrapperCodegen.make_free_by_names.<locals>.<genexpr>  s"      >>>>>>>>rU   )r   )r   r  s     rS   make_free_by_namesz'PythonWrapperCodegen.make_free_by_names  s+    @dii>>>>>>>@@@rU   r  r  del_linec           	     B    | j          | d| | | j         d| j         d	S )Nr   r   reuse)r^  r<  r_  )r   r  r  r  s       rS   codegen_exact_buffer_reusez/PythonWrapperCodegen.codegen_exact_buffer_reuse  s;    .tttXtxtQUQ\tt`d`lttttrU   c                R    |$|                      | j         d| d|            d S d S )Nz [Provenance debug handles] r;  )r   r_  )r   rf  r  s      rS   r!  z2PythonWrapperCodegen.write_provenance_debug_handle  sM    
 #NN<YY[YY<YY     $#rU   re  r  c                   |                                 |                                 k    sJ |                                }|                                }d}|t          j                                        vr|rd|                     |           }|                                |                                k    rA|                                |                                k    r|                     |||          S | 	                    ||                                |                                d| j
        j                  }| j         | d| | d| j         dS )N;z; r   r   r  r  )rN   rK   r2   rI   r  rr  r  r  r  r  rV  r   r;  r_  )r   re  r`  r  r  r  r  reinterpret_views           rS   r  z&PythonWrapperCodegen.make_buffer_reuse  s5   }}#--//1111<<>><<>>17335555*58D11#6688H<<>>S\\^^++0@0@CNNDTDT0T0T228XxPPP88!1!11d6G6Q
 
 ,___-=_x__4<____rU   r  ir.ReinterpretViewc                    |                      t          || j         | d|                                 | j         d| j         d                     d S )Nr   r  z alias)r   r6   r;  rd  r<  r_  )r   r^   r  s      rS   r  z0PythonWrapperCodegen.codegen_deferred_allocation  sl    <ggg$*@*@*B*BgDKggSWS_ggg 	
 	
 	
 	
 	
rU   r  c                   |                                 }|t          j        j        v s/|| j        v s&t          |t          j        t          j        f          rd S | j        	                    |           t          |
                                t          j        t          j        f          r|                                sd S |                                }t          |t          j                  rd S t          |t          j                  rd S t          |t          j                  rt          |j        t          j                  s'J dt)          |j                   d|j                     |j        j        }t          |t          j                  sJ t)          |                      |j        }t          |t          j                  sJ t)          |                      |                     |           |                     t5          | |||                     d S t          |t          j                  r%|                     t9          | |                     d S |                     t;          | |                     d S )Nzunexpected r   )rK   r2   rI   rq  rq  rn   r   DonatedBufferSubgraphBufferr   get_defining_opExternKernelAllocMultiOutputshould_allocater  MutationLayoutSHOULDREMOVEr  r  r  r#   r   r  
StorageBoxr  codegen_allocationr   r  r  r  r  )r   r:  r^   r  boxinput_buffers         rS   r  z'PythonWrapperCodegen.codegen_allocation  sG      AG+++t~%%&2#3R5F"GHH & F4   &&((%r~6 	
 **,,	 F''))fb;<< 	Ffbm,, 	Ffb011 
	fk2+=>>  @d6;//@@6;@@ > +"Cc2=11<<499<<18LlBI66AAS		AA6##L111NN?4vvNNOOOFfb122 	NN1$??@@@F|D&1122222rU   c                   |                                 }t          |t          j        t          j        f          r%|                     t          | |                     d S t          |                                t          j                  r%|                     t          | |                     d S | 
                    |          sd S | j                            |           |                     t          | |                     d S rH   )rK   rn   r   InputBufferr  r   rn  r  r  r%  	can_reuserr  r   r   )r   r:  r^   s      rS   codegen_freez!PythonWrapperCodegen.codegen_free  s       fr~r/ABCC 	NN8D&11222Ff,,..0CDD 	 NN-dF;;<<<F~~f%% 	F
t*48899999rU   c                \   |                                 }|t          j        j        v p|t          j        j        v o/t          t          j        j        |         t          j                   pA|t          j        j	        v p.|t          j        j
        v p|t          j        j        v p|| j        v  S rH   )rK   r2   rI   rq  r  rn   graph_inputs_originalr   r   r  r  never_reuse_buffersrr  )r   r
  output_bufferr^   s       rS   r  zPythonWrapperCodegen.can_reuse  s    $$&&AG++ 
",, "G1$79I  
" qw((
" qw22
" qw22
" tz!
 	
rU   c                    |                                 | j        v o4| j        |                                          |                                 k    S rH   )rK   rs  )r   r:  reused_buffers      rS   	did_reusezPythonWrapperCodegen.did_reuse   sH     OO, KFOO--.-2H2H2J2JJ	
rU   r
  r  c                   t          ||          sJ |                     |           | j                            |                                           | j                            |                                           |                                | j        |                                <   |                     t          | ||                     d S rH   )	r]   r  rr  r   rK   rq  rs  r   r  )r   r
  r  s      rS   codegen_inplace_reusez*PythonWrapperCodegen.codegen_inplace_reuse  s    $\=AAAAA---
|,,..///=11334440<0E0E0G0GM**,,-y|]CCDDDDDrU   c                ~    t          |          }|| j        v r|S | j                            |           | j        |z   S rH   )r_   re  r   r;  )r   r   r^   s      rS   codegen_unbacked_symbol_declz1PythonWrapperCodegen.codegen_unbacked_symbol_decl  sF    6{{4---K &**4000<$&&rU   r  r   unbacked_bindings,Optional[dict[sympy.Symbol, pytree.KeyPath]]c                &   t          t          j        j        j        |          }|sd S |                                D ]O\  }dfdfd}|                     |                     |           d |             | j                    Pd S )	Nr  r_   r  pytree.KeyPathc                2   |dk    r| S t          |          dk    rnt          |d         t                    rSt          |d         t          j                  r3 |  d|d         j         d|d         j         d|dd                    S t          |d         t                    r% |  d|d         j         d|dd                    S t          |d         t          j                  r\t          j        j	        r& d	|d         j         d
|  d|dd                    n$ |  d|d         j         d|dd                    S t          |d         t                    r% |  d|d         j         d|dd                    S t          d|           )Nrx   r   r   r3   r   r  r   r  z	std::get<z>(r1  r2  z.__floordiv__(r  )r   rn   r   pytreeSequenceKeyr^   r  r2   rI   r<  r   r  r  )r  r  gos     rS   r!  zIPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.go0  s   b==K LLA%%"71:}== &"71:v/ABB & 2EE'!*/EEGAJNEEEwqrr{    
M:: L2;;
;;;WQRR[III
F,>?? L 7.J@wqz~@@@@@'!""+NNNR4 ; ;'!*. ; ; ;WQRR[II
  
K88 L 2JJWQZ5GJJJGTUTVTVKXXX()J)J)JKKKrU   c                    t           j        j        rt                    dk    rhd         }  d                                         t          | t          j                  r"t          | j                  dk    r
dd          n          S t          d         t          j
                  sJ  d         j                                                 dd                    S            S )Nr3   r   )r2   rI   r<  r   rK   rn   r   r  r.  r  r   r  )r  r!  r  r  rU  s    rS   go_outerzOPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.go_outerN  s    7& 4
 7||q((%aj  "r#AJ//11)#r~>>)CFs{CSCSWXCXCX $ABBKK!(	      *'!*f6HIIIII!r''!*."9"B"B"D"DgabbkRRR2k7333rU   r   )r  r_   r  r  )	r   r2   rI   rO   	shape_envrn  r   r  r<  )r   r  rU  r  r  r#  r!  r  s    ``   @@rS   (codegen_unbacked_symbol_defs_for_outputsz=PythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs  s     6G&(9
 
 ! 	F ,1133 <	 <	JAw
L L L L L L<4 4 4 4 4 4 4 4. NN44Q77UUHHJJUUU   u<	 <	rU   c                     fd} fd}	                       j                                         j         dj                     |             t
          j        }t          j        j                  5  j                            |           d d d            n# 1 swxY w Y    |                                               d S #                                   w xY w)Nc                     t          j        j                  t                    k    sJ t          j        j                  D ]-\  } }                    j         |  d| j                    .d S rB  )r   rI   r  r   r   r;  r<  )inner_inputouter_inputouter_inputsr   subgraphs     rS   _codegen_subgraph_prefixzSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_prefixw  s    x~233s<7H7HHHHH,/+\- -  ([ |O[OO[O$+OO    rU   c                    t          j        j                  t                    k    sJ t          j        j                  D ]8\  } }                    | d|                                  j                    9d S rB  )r   rI   r  r   r   rd  r<  )inner_outputouter_outputouter_outputsr   r+  s     rS   _codegen_subgraph_suffixzSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_suffix  s    x~344M8J8JJJJJ.1,m/ /  *l #WW(F(F(H(HW$+WW    rU    subgraph: )parent_graph)	r  rI   r   r_  r^   r2   set_graph_handlercodegen_subgraphr/  )r   r+  r*  r0  r,  r1  r3  s   ````   rS   codegen_subgraph_by_inliningz1PythonWrapperCodegen.codegen_subgraph_by_inliningi  sl   	 	 	 	 	 	 		 	 	 	 	 	 		'%%hn555NNdlFFx}FFGGG$$&&&7L$X^44  //!- 0                  %$&&&$$&&&&&D$$&&&&s0   A-C B)C )B--C 0B-1C C*partition_idir.GraphPartitionSignaturec           	        |j         }|j        }t          |                                          d |j        D             z   }d                    |          t          |          dk    rdndz   }d |D             }d                    |          t          |          dk    rdndz   }|                     d| d| d	           d
 |                                D             }	|	r+|                     dd                    |	                      |                     d| d| d| d           |                     d| d           dS )z'Generate code to call a graph partitionc                    g | ]	}|j         
S rx   r2  ry   symbol_inputs     rS   r  z?PythonWrapperCodegen.codegen_partition_call.<locals>.<listcomp>  s(     9
 9
 9
".L9
 9
 9
rU   r  r3   r  rI  c                6    g | ]}|                                 S rx   r  )ry   rC   s     rS   r  z?PythonWrapperCodegen.codegen_partition_call.<locals>.<listcomp>  s     AAADAAArU   	partition	_args = [r2  c                    g | ]	\  }}||
S rx   rx   )ry   r^   
deallocates      rS   r  z?PythonWrapperCodegen.codegen_partition_call.<locals>.<listcomp>  s1     
 
 
%T:z

 
 
rU   r  r  z) = self.partitions[z](partition_args)zdel partition_argsN)	input_deallocationoutput_nodesr   r  symbol_inputsr   r   r   rn  )
r   r7  r  rD  rE  r  rF  output_namesrU  r  s
             rS   codegen_partition_callz+PythonWrapperCodegen.codegen_partition_call  s    2D+8-224455 9
 9
2F2T9
 9
 9
 
 ;''#k2B2Ba2G2G33RPAALAAA))L))C4E4E4J4JSSPRS 	C<CC&CCCDDD
 
);)A)A)C)C
 
 
  	=NN;$))L"9"9;;<<< 	ZZZ\ZZlZZZ	
 	
 	
 	:|:::;;;;;rU   num_partitionsc                B    d t          |          D             | _        d S )Nc                    g | ]}d | S )
partition_rx   )ry   r  s     rS   r  z@PythonWrapperCodegen.set_all_partition_names.<locals>.<listcomp>  s!    #X#X#X3$6$6$6#X#X#XrU   )r  r  )r   rI  s     rS   set_all_partition_namesz,PythonWrapperCodegen.set_all_partition_names  s&    #X#X%BWBW#X#X#X   rU   c           	     j   d                     |          t          |          dk    rdndz   }d                     |          t          |          dk    rdndz   }|                     |j        j         d| d           |                     d| d|j        j         d|j        j         d	           d S )
Nr  r3   r  rI  r?  r2  r  z) = rB  )r   r   r   rI   r^   )r   r+  r*  outer_flattened_outputsouter_output_namesouter_input_namess         rS   ,codegen_subgraph_call_with_flattened_outputszAPythonWrapperCodegen.codegen_subgraph_call_with_flattened_outputs  s     "YY'>??.//144CC"
 !IIl33|$$))CCr
 	(.-LL8ILLLMMM 	Y"YY(;YYhn>QYYY	
 	
 	
 	
 	
rU   c                T   d                     |          t          |          dk    rdndz   }|                     |j        j         d| d           t
          j        j                                         |                     | d|j        j         d|j        j         d	           d S )
Nr  r3   r  rI  r?  r2  r   r  rB  )r   r   r   rI   r^   r2   r  free_buffers)r   r+  r*  outer_buffer_namerQ  s        rS   codegen_subgraph_callz*PythonWrapperCodegen.codegen_subgraph_call  s     IIl33|$$))CCr
 	(.-LL8ILLLMMM 	
&&((( 	 VVX^%8VV8>;NVVV	
 	
 	
 	
 	
rU   c                   |                      |j                   |                     d           |                     | j         d|j                    t
          j        }|j        |j        _        |j        j        | j        vrt          j        |j                  5  t          j
        dd          5  |j                                        \  }}d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   | j                            |j        j                   |                     |j                   d S d S )NrI  r2  r  F)r  rI   r   r_  r^   r2   r<  rz  r4  r   patchr  r   r   r   )r   r+  r3  subgraph_coder  s        rS   codegen_subgraph_commonz,PythonWrapperCodegen.codegen_subgraph_common  s   !!(.111r$,BB8=BBCCCw%1%=">d&FFF $X^44 @ @\"3U;; @ @'/~'='='?'?$M1@ @ @ @ @ @ @ @ @ @ @ @ @ @ @@ @ @ @ @ @ @ @ @ @ @ @ @ @ @
 ,001DEEE,,]-@AAAAA GFs6   C33CC3C 	 C3#C 	$C33C7:C7c                ^    |                      |           |                     |||           d S rH   )rZ  rR  )r   r+  r*  rO  s       rS   'codegen_subgraph_with_flattened_outputsz<PythonWrapperCodegen.codegen_subgraph_with_flattened_outputs  sB     	$$X...99l$;	
 	
 	
 	
 	
rU   c                ^    |                      |           |                     |||           d S rH   )rZ  rV  )r   r+  r*  rU  s       rS   r5  z%PythonWrapperCodegen.codegen_subgraph  s7     	$$X...""8\;LMMMMMrU   c                   |                                 |                      dt          |j                              d |j        D             }t
          j        j        rKfdt          t          |j                            D             }| 	                    |j
        ||           d S |                     |j
        |           d S )N = [None] * c                6    g | ]}|                                 S rx   r  ry   r  s     rS   r  z@PythonWrapperCodegen.codegen_invoke_subgraph.<locals>.<listcomp>  s$    RRRC--//RRRrU   c                     g | ]
} d | dS r1  r2  rx   ry   r8  r^   s     rS   r  z@PythonWrapperCodegen.codegen_invoke_subgraph.<locals>.<listcomp>  s2       #$4!  rU   )rK   r   r   rU  rF  r2   rI   r=  r  r6  r+  r5  )r   invoke_subgraphr*  r0  r^   s       @rS   codegen_invoke_subgraphz,PythonWrapperCodegen.codegen_invoke_subgraph  s    ''))$JJC0G,H,HJJKKKRR?;QRRR7 	P   (-c/2I.J.J(K(K  M --(,     !!/":L$OOOOOrU   c                   |                                 d |j        D             }|j                                        }t	          |j        t
          j                  s| d}|                      dt          |j	                              |                     d| d           |                     t          | |j        j                             t          j        j        rJfdt          t          |j	                            D             }|                     |j        ||           n|                     |j        |           |                     t%          |                      |                     d           |                     t          | |j        j                             t          j        j        rJfdt          t          |j	                            D             }|                     |j        ||           n|                     |j        |           |                     t%          |                      d S )	Nc                6    g | ]}|                                 S rx   r  ra  s     rS   r  z<PythonWrapperCodegen.codegen_conditional.<locals>.<listcomp>  s$    PPPC--//PPPrU   r  r_  r   r;  c                     g | ]
} d | dS rc  rx   rd  s     rS   r  z<PythonWrapperCodegen.codegen_conditional.<locals>.<listcomp>  %    UUU^^q^^^UUUrU   zelse:c                     g | ]
} d | dS rc  rx   rd  s     rS   r  z<PythonWrapperCodegen.codegen_conditional.<locals>.<listcomp>'  rj  rU   )rK   operands	predicaterd  rn   r   ShapeAsConstantBufferr   r   rU  r  true_subgraphrI   r2   r=  r  r6  r5  r+  false_subgraph)r   conditionalr*  rm  r0  r^   s        @rS   codegen_conditionalz(PythonWrapperCodegen.codegen_conditional  s=   ##%%PP;;OPPP);;==	+/1IJJ 	.$---I$FFC0C,D,DFFGGG)Y)))***({/H/NOOPPP7 	QUUUUU3{?R;S;S5T5TUUUM--)<    !!+";\4PPP'--...w({/I/OPPQQQ7 	RUUUUU3{?R;S;S5T5TUUUM--*L-    !!+"<lDQQQ'--.....rU   c                	     fd}|                                 d |j        D             }d |j        D             }t          |          }                      dt          |                      |r(                      dt          |           d           t          |          D ]"\  }}                      d| d|            #g fd	t          t          |                    D             |}	 d
g}
t          |	          }|dt          |                   } ||j        |	|
                                d|
d                                          d           |rt          t          ||j                            D ]u\  }\  }}                     t           |j        j                                                   d| d| d                                t                                vnt          t          ||j                            D ]u\  }\  }}                     t           |j        j                                                   d| d| d                                t                                v                     d                                t           |j        j                              ||j        ||                                t                                |r                     t           |j        j                             t          t          |                    D ]&}                      d||z    d d| d           '                     t                                                     t           |j        j                              ||j        |	|
                                t                                                     d|
d                     |rƉ                     d           t          t          |                    D ]}                     d d||z    d                                t           |j        j                                                   d| d d||z    d                                t                                dS dS )z1while_loop is codegened as a host side while_loopc                    t           j        j        r                    | ||           dS                     | ||           dS )z3Helper method to deduplicate subgraph codegen logicN)r2   rI   r=  r6  r\  )r+  r*  r0  r   s      rS   r5  zAPythonWrapperCodegen.codegen_while_loop.<locals>.codegen_subgraph2  sW    w 11(L-XXXXX<<lM    rU   c                6    g | ]}|                                 S rx   r  ra  s     rS   r  z;PythonWrapperCodegen.codegen_while_loop.<locals>.<listcomp><  s1      
  
  
(+C!!## 
  
  
rU   c                6    g | ]}|                                 S rx   r  ra  s     rS   r  z;PythonWrapperCodegen.codegen_while_loop.<locals>.<listcomp>?  s1     #
 #
 #
(+C!!###
 #
 #
rU   r_  z.extend([[] for _ in range(z)])r1  z] = c                     g | ]
} d | dS rc  rx   rd  s     rS   r  z;PythonWrapperCodegen.codegen_while_loop.<locals>.<listcomp>O  s%    GGGnnnnnGGGrU   _cond_resultNzshould_loop = r   zif not should_loop:z.unsqueeze(0).clone()r  zwhile should_loop:z	].append(r  z    should_loop = z%# Stack outputs after loop completionzif len(z]) > 0:z] = torch.stack(z	], dim=0))rK   carried_inputsadditional_inputsr   r   r  r  r   cond_subgraphr   r  body_subgraphrI   r+  )r   
while_loopstack_outputr5  outer_carried_inputsouter_additional_inputs
ckp_offsetr8  inpcond_outer_inputscond_outer_outputsbody_outer_inputsbody_outer_outputscarried_inputcarried_bufr^   s   `              @rS   codegen_while_loopz'PythonWrapperCodegen.codegen_while_loop/  s   	 	 	 	 	 ""$$ 
  
/9/H 
  
  
#
 #
/9/K#
 #
 #
 -..
$GGC0D,E,EGGHHH 	NNRRC8L4M4MRRR     455 	3 	3FAsNNd11Q11C112222
GGGGeC0D,E,E&F&FGGG
$
 "&3334 
 
 //J5I1J1J/JK$&79K	
 	
 	
 	?(:1(=??@@@,--- 	73<(**CDD4 4 7 7//M; 0z7O7UVVWWW$TTTTTTTUUU/5566667 4=(**CDD4 4 7 7//M; 0z7O7UVVWWW$GGGGGGGHHH/556666+,,,(z/G/MNNOOO$&79K	
 	
 	
 	'--...  	3NN,T:3K3QRRSSS334455 P P$NNZNN$NNNNNOOOONN+D11222 	(z/G/MNNOOO$&79K	
 	
 	
 	'--...C,>q,ACCDDD  	7NNBCCC334455 7 7GGGJGGGHHH0z7O7UVVWWWQQaQQQQJQQQ   /556666	7 	77 7rU   c                    	 t          | dd           rd S t          | t                    r| S t          j        j                            |           }||S t          |          S # t          $ r Y d S w xY w)Nr  )r  rn   r   r2   rI   
_shape_env_maybe_evaluate_staticr  )r   r  s     rS   statically_known_int_or_nonez1PythonWrapperCodegen.statically_known_int_or_none  s    	q.$//  t!S!! '$;;A>>C{
s88O 	 	 	44	s!   A# A# 'A# A# #
A10A1c                |    g }| D ]6}t                               |          }| d S |                    |           7|S rH   )r  r  r   )lstr  r   nums       rS   %statically_known_list_of_ints_or_nonez:PythonWrapperCodegen.statically_known_list_of_ints_or_none  sO     	 	A&CCAFFC{ttMM#rU   c                :    t                               |           d uS rH   )r  r  )r  s    rS    is_statically_known_list_of_intsz5PythonWrapperCodegen.is_statically_known_list_of_ints  s      !FFsKKSWW	
rU   c                Z    t                               |                                           S rH   )r  r  r  r:  s    rS   r  z4PythonWrapperCodegen.static_shape_for_buffer_or_none  s'    #IIOO
 
 	
rU   c                :    t                               |           d uS rH   )r  r  r  s    rS   !can_prove_buffer_has_static_shapez6PythonWrapperCodegen.can_prove_buffer_has_static_shape  s    #CCFKKSWWWrU   rH   )r  r   r  rg   r  re   r  r  r!  )r^   r_   r  r_   rE   r  )rR  r_   )r  r  rE   r_   rE   r{  rE   r  rE   r  )r  r{  r  )r5  r   r~  r_   rE   r_   r  )r5  r   rE   r  )r  r{  rE   r  r  r*   rE   r  )rC   r  rE   r  )rC   rN  )rC   rX  rE   r  )r   r_   r  r_   r  rg   rS  r{  rh  r_   r  r6  rE   r  )F)rP  r_   rG  r_   rQ  rR  rS  rT  rz  r-  rU  rV  rE   r  )r`  ra  rE   rb  )r\  r   )r^   r_   r   r  r  r  )r  r   )r   r   rP   r   rE   r_   )r   r   rE   r_   )r6  r_   r^   r_   r  r_   rE   r_   )r  r  rE   r_   )rE   r  )r   ra  rE   r_   )r  r  )rC   r  )NTN)
rf  r_   r  r_   r  rg   r  r   r  rg   )rf  r_   r  r_   r  rg   )r  r_   )rb   r!  )rf  r_   rT  rg   )rB  r   rI   r@   rE   r  )ru  r8   )rf  r_   r  r6  )rf  r_   )r:  rD   )NF)rI  )r:  ro  )r  r{  )r  r_   r  r_   r  r_   )r  r6  )re  rD   r`  rD   r  r   )r^   r_   r  r  rE   r  r:  r  )r
  r  r  r  )r  r_   rU  r   r  r  rE   r  )r7  r   r  r8  )rI  r   )r   r   r   r?  supports_cachingr   r)  r  rg  ro  rj  r  rl  r&   r  r  r  r  r  r  r  r  r  r  r  r  r  r  rk  r  ru  r  r  r/  r  r-  r  r  r  r  r  r   r  r  r
  rR  r  rc  r8  r<  r@  rC  rI  rO  rX  r]  r_  r   contextmanagerrf  ri  rZ  rv  r  r  rq  r  r  r  rx  r  r  r  r4  r   r  r  r  r  r  r  r  r  ry  r  r  r  r   rm  rs  rD  r  r  rz  rK  ro  rp  rt  rs  ru  r  r~   r  r  r  r   r  rm  r+  r  r  r}  r  r  rr  r  r  r!  r  r  r  r  r  r  r  r  r%  r6  rH  rM  rR  rV  rZ  r\  r5  rf  rr  r  r  r  r  r  r  r  r  s   @rS   r  r    s         ]# ]# ]# ]# ]#~ 
 FJ	& & & & \&' ' ' '< < < <@A @A @A @AD   
 
 
 
     ]"	B 	B 	B 	B + + + ]+	! 	! 	! 	! 
 
 
 ]

   $ $ $ $
% % % %S S S S$	( 	( 	( 	(
 
 
 
. . . .   8) ) ) ): : : :6- - - -   . . .1 1 10 0 0J J J
/ / /+ + + +7 7 7 7&5 5 5 5
5 5 5 5$         : : : :, , , ,  :8 8 8 8 '+; ; ; ; ;"   ,   U U U U  
  &< < <
	V 	V 	V 	V0 0 0    ! ! ! !, , , ,O
 O
 O
b&S &S &SP: : :
 
  
  
D% % % %(K (K (K (KT'? '? '?RD D D D   @D W W W W W W CG + + + + + +. . . .& & & &' ' ' '6 6 6 6   ,     :> > > >S S S S
I I I4 4 4*[Y [Y [Yz  , #'(,
 
 
 
 
& FJ    \ #'(,! ! ! ! !.2 2 2 2J; J; J; J;X    $A A A A
% % % %N: : : :. . .> > >8 8 85 5 5W W W
 
 

 
 
4
4 
4 
4SG SG SG SGj' ' ' !&*/
 /
 /
 /
 /
 /
l !u( u( u( u( u( u(n     ! ! !, , ,       F	
 	
 	
 	
 8 8 ]8 TY. . . .`* * *_ _ _ _* * * *A A A Au u u u '+    ` ` ` ` 
 
 
 
(3 (3 (3 (3T: : :(
 
 
 
 
 
 
E E E E' ' 'N N N N`+' +' +'Z< < < <BY Y Y Y
 
 
$
 
 
"B B B&
 
 
N N NP P P / / /Ba7 a7 a7F   \   \ 
 
 \

 
 
 \

 X X \X X X X XrU   r  c                       e Zd ZdZ	 d#d$ fd	Zd%dZd%dZd Zd Zd Z	d&dZ
d'dZd(dZd(dZd)dZd*dZd+dZd, fd Zed%d!            Zed%d"            Z xZS )-r  a  
    A wrapper codegen that generates code for a subgraph. For most of the
    methods, we rely on the implementation in the PythonWrapperCodegen. But we
    override a few functions to produce cleaner code (like avoiding writing
    imports twice in the output code)
    Nr  r_   r  r  r  r  c                r    || _         || _        || _        t                                                       d S rH   )r  r  r  r   r   )r   r  r  r  r   s       rS   r   z%SubgraphPythonWrapperCodegen.__init__  s9     +,$8!rU   rE   r  c                    | j         | _        d S rH   )r  rf  r   s    rS   rg  z1SubgraphPythonWrapperCodegen.set_launcher_fn_name  s     !% 2rU   c                    d S rH   rx   r   s    rS   rj  z)SubgraphPythonWrapperCodegen.write_header  r  rU   c                    d S rH   rx   r  s     rS   ry  z2SubgraphPythonWrapperCodegen.add_benchmark_harness  r  rU   c                    d S rH   rx   r  s     rS   r  z6SubgraphPythonWrapperCodegen.benchmark_compiled_module  r  rU   c                    d S rH   rx   r   s    rS   r  z5SubgraphPythonWrapperCodegen.write_async_compile_wait  r  rU   c                4    | j                                         S rH   )r  r  r   s    rS   r  z/SubgraphPythonWrapperCodegen.next_kernel_suffix  s    "55777rU   r  r*   c                    d S rH   rx   r  s     rS   r   z2SubgraphPythonWrapperCodegen.generate_after_suffix  r  rU   r   c                P    | j                             d| j         d           d}|S )Nz
            def z(args):
            r3   )rS  r   rf  r  s     rS   r  z>SubgraphPythonWrapperCodegen.write_launcher_fn_call_get_indent  sC    &  	
 	
 	

 rU   c                    dS r   rx   r   s    rS   r_  z4SubgraphPythonWrapperCodegen.get_wrapper_call_indent  s    qrU   r  c                n    | j         x}r|j        d |j        D             z  }nt          j        j        }|S )Nc                .    i | ]}t          |          |S rx   )r_   r  s     rS   r  zASubgraphPythonWrapperCodegen.get_graph_inputs.<locals>.<dictcomp>  s-     . . .A. . .rU   )r  input_nodesrF  r2   rI   r  )r   r/  rF  s      rS   r  z-SubgraphPythonWrapperCodegen.get_graph_inputs  sS     119 	** . .#,#:. . . FF W)FrU   r{  c                    | j         x}r9t          |j                                                  d |j        D             z   }nt
          j        j        }|S )Nc                    g | ]	}|j         
S rx   r2  r;  s     rS   r  zFSubgraphPythonWrapperCodegen.get_graph_input_names.<locals>.<listcomp>  s(     : : :&2!: : :rU   )r  r   r  r  rF  r2   rI   r  )r   r/  namess      rS   r  z2SubgraphPythonWrapperCodegen.get_graph_input_names  se    119 	..335566 : :6?6M: : : EE G-ErU   r  c                J    | j         x}r|j        }nt          j        j        }|S rH   )r  rE  r2   rI   r  )r   r/  rU  s      rS   r  z.SubgraphPythonWrapperCodegen.get_graph_outputs  s+    119 	,,GGg+GrU   r:  r  c                    |                                 }| j        x}r||j        v rd S t                                          |           d S rH   )rK   r  r  r   r  )r   r:  r^   r/  r   s       rS   r  z/SubgraphPythonWrapperCodegen.codegen_allocation  sT      22I 		@U8U8U F""6*****rU   c                8    | j                                          d S rH   )r  r  r   s    rS   r  z5SubgraphPythonWrapperCodegen.write_triton_header_once  s     	4466666rU   c                8    | j                                          d S rH   )r  r  r   s    rS   r  z=SubgraphPythonWrapperCodegen.write_get_raw_stream_header_once%  s     	<<>>>>>rU   rH   )r  r_   r  r  r  r  r!  r  r  r  r  r  r  r  )r   r   r   r?  r   rg  rj  ry  r  r  r  r   r  r_  r  r  r  r  r&   r  r  r  r  s   @rS   r  r    s         FJ	      3 3 3 3
         8 8 8 8         	 	 	 	      + + + + + + 7 7 7 ]7 ? ? ? ]? ? ? ? ?rU   r  )rC   rD   rE   rF   )rV   rD   rW   rD   )NN)r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rE   rh   r  )
__future__r   r   r   r  r   r  r]  r  r  r   r  r  	itertoolsr   r   typingr   r   r   r	   r
   ro   r   r  
torch._opstorch.utils._pytreeutils_pytreer  r   r  torch._dynamo.utilsr   r   #torch._inductor.codegen.debug_utilsr   $torch._inductor.codegen.multi_kernelr   %torch._inductor.runtime.runtime_utilsr   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   torch.fx.noder   torch.utils._ordered_setr    torch.utils._sympy.singleton_intr   torch.utils._sympy.symbolr   r   rI  r   r   r   	codecacher    r  r!   r"   r#   r   r$   runtime.hintsr%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   virtualizedr2   rN  r4   r5   r6   r7   r8   r9   	cpp_utilsr:   triton_utilsr;   r<   r=   collections.abcr>   r?   r   rI   r@   wrapper_fxirrA   	getLoggerr   logdoprintr  r   rh  r_   r   rF   r  rD   r  rT   r]   r5  r   r  rs   r   r   r  r   r   rB   r  r%  r+  r4  rG  rM  rW  rn  rw  r  r  r  r  r   r  r  r  r  r  r%  r*  rA  rp  Liner  r  rx   rU   rS   <module>r     s   " " " " " "             



         				  " " " " " " " " @ @ @ @ @ @ @ @ @ @ @ @ @ @             $ $ $ $ $ $ $ $ $ & & & & & & 6 6 6 6 6 6 6 6 C C C C C C A A A A A A ; ; ; ; ; ;              . - - - - - / / / / / / 9 9 9 9 9 9 : : : : : : : : ( ( ( ( ( ( ( ( ( ( ' ' ' ' ' ' ; ; ; ; ; ; ( ( ( ( ( ( ( ( ' ' ' ' ' ' , , , , , ,                                                       P P P P P P P P P P  *22222222MMM%%%%%%)))))) g!! u{C5629l*+
]OT12       @ S> 	%UZ
 #
%&2B1CU3PS8_1T(UU
 /3*.k& k& k& k& k&\S& S& S& S&l        * * * * * * * **X X X X X X X X
 2 2 2 2 2 2 2 2 + + + + ++ + + + 1 1 1 1 1{ 1 1 1 "@ "@ "@ "@ "@K "@ "@ "@J? ? ? ? ?; ? ? ? 
7 
7 
7 
7 
7K 
7 
7 
7 !5 !5 !5 !5 !5+ !5 !5 !5H 	( 	( 	( 	( 	({ 	( 	( 	( / / / / /[ / / /> 5 5 5 5 5; 5 5 5* ; ; ; ; ; ; ; ;2%
 %
 %
 %
 %
 %
 %
 %
P 3, 3, 3, 3, 3,% 3, 3, 3,l 6 6 6 6 6, 6 6 6> / / / / /( / / /& ) ) ) ) )" ) ) ).( ( ( ( (! ( ( (
 ! ! ! ! ![ ! ! !: )8 )8 )8 )8 )8^ )8 )8 )8X 4 4 4 4 4 4 4 4 #0 #0 #0 #0 #0k #0 #0 #0L 	5 	5 	5 	5 	5+ 	5 	5 	5 
,-Z(X Z(X Z(X Z(X Z(X7 Z(X Z(X Z(XzPp? p? p? p? p?#7 p? p? p? p? p?rU   