
    `i                       U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZmZm Z  d d	l!m"Z"m#Z#m$Z$ d
dl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZE ddlFmGZGmHZHmIZImJZJ ddlKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZUmVZVmWZW ddlXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZg ejh        dk    Ziejj        d             Zkejl        m                    end          Zo eg d          Zpddddddd d!d"d"d#
Zq eg d$          Zrd%d&d'd(d)d*d+d,d-d.d/
Zsd0d1d2Zteju        ejv        gZwejx        ejy        eju        ejv        ejz        ej{        ej|        ej}        ej~        ej        ej        gZeej                 ed3<   ejy        eju        ejv        ej{        ej|        gZeej                 ed4<   d5 Zd6 Z	 	 	 dzd7eej                 fd8Zd9 Zd:eSd;ej        d<ed=ej        d>ej        d?eMfd@ZdAeeeOf         dBedCedDej        dEeeef         f
dFZdGeSdHedIefdJZdGeSfdKZej        d7ej        dLej        fdM            Zej        d7ej        dLej        dNefdO            Zej        	 d{d7ej        dLej        dNee         fdP            Zej         G dQ dR                      Z G dS dTe6          Z G dU dV          ZdW Z G dX dYeV          Ze                    dZ            G d[ d\e          Ze                    d]           e                                  G d^ d_e          Z G d` daeT          Z G db dce          Z G dd dee          Zdfe0d?eeej                 ezf         fdgZ G dh di          Z G dj dke          Z G dl dme          Z G dn doe          Z G dp dqe3          Z G dr ds          Z G dt du          Zej         G dv dw                      Zej         G dx dy                      ZdS )|    N)Sequence)Enum)AnyCallablecastOptionalUnion)dependencies)is_float_dtypeis_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)free_symbol_is_typesymbol_is_typeSymT   )counters   )configcpp_buildercpu_vec_isairmetrics)'set_kernel_post_grad_provenance_tracing)LoopBody)BaseSchedulerNodeBaseSchedulingExternKernelSchedulerNodeForeachKernelSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfget_bounds_index_exprget_fused_kernel_namehas_free_symbolsis_multi_outputs_templateis_welford_reductionparallel_num_threadsPlaceholdersympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_subs)NullKernelHandleropsOpsValueV   )BackendFeatureBracesBufferCSECSEVariableDataTypePropagationDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferKernel
KernelArgsOpOverridesOptimizationContext)_get_dtype_from_loopbodies_get_loop_bodycexprcexpr_indexcodegen_randCppCSEVariableDTYPE_TO_CPPget_promote_dtype
INDEX_TYPELocalBufferContextmay_unify_binary_op_mask_typepromote_args(template_fusion_with_epilogues_supportedunify_mask_base_typevalue_to_cppwin32c                      t           rdndS )Nz__declspec(dllexport) _IS_WINDOWS     o/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_inductor/codegen/cpp.pyget_export_declarationrY   Y   s    &19""r9rW   schedule)+*^||minmaxr[   r\   r]   r_   r`   argminargmaxr^   welford)
sumprodxor_sumr_   r`   ra   rb   anywelford_reducewelford_combine)
r`   r_   rd   re   rf   rh   ri   ra   rb   rg   z
at::Tensorlongdoubleboolzstd::stringzc10::ScalarTypezat::MemoryFormatz
at::Layoutz
at::Devicez
at::Scalar)
Tensorintfloatrl   str
ScalarTypeMemoryFormatLayoutDevicenumberzstd::vectorzstd::optional)Listr   VECTORIZABLE_DTYPESMASKED_VECTORIZABLE_DTYPESc                    |t           v rt          j        }| dv rdS | dk    rdS | dv rt          |         }|t          j        k    r| dv rt          t          j                 }t          |          rd| dnd	| d
}t          |          rd	| dnd	| d}| dv r|n|}| dv r|nd| d| dS t          |           rdt          |          dS t          |           )N)rf   rd   rg   r   re   r5   )r`   rb   r_   ra   ra   rb   -std::numeric_limits<>::infinity()std::numeric_limits<>::min()>::max())r`   rb   )r`   r_   IndexValue<z>{0, }Welford<>())	DTYPE_LOWP_FPtorchfloat32rH   rl   ro   r   r*   AssertionError)reduction_typedtypecdtypemin_varmax_varinit_vars         rX   reduction_initr      sX    222qq;;;e$EJ>5I#I#I!%+.F e$$99F99998888 	 e$$98688888888 	
 -0AAA77w // H9v99X999	

 N++ 32,u-2222

(
((rW   c                     t           t          |                  }t          |           rd| dS | dv r-|t          j        k    rt           t          j                 }d| dS |S )Nr   >rz   r   )rH   r<   r*   r   rl   ro   )r   r   scalar_types      rX   reduction_acc_typer      so    9%@AKN++ )(+((((---EJ&u{3K+[++++rW   indexc           	      b   |t           j        k    }| dk    r|r	d| d| dS |rdnd}| d| d| S | dk    r| d	| S | d
k    r| d| S | dk    r| d| S | dv r|  d| d| dS | dk    r	d| d| dS | dk    r?t          |t                    r|\  }}	}
nt	          | |          \  }}	}
d| d| d|	 d|
 d	S | dv rbt          |d          r7|j        t           j        k    r"|j        s||  d| d| d| dS |  d| d| dS ||  d| d| d| dS |  d| d| dS t          |           )Nrd   cascade_sum_combine(, &)|r[    re    * rf    ^ rg    || )r_   r`   z_propagate_nan(, rh   welford_combine(ri   , {})rz   r   z	_combine(z, static_cast<float>(), )))	r   rl   
isinstancetuplereduction_projecthasattrr   is_vecr   )r   var
next_value
helper_valr   	src_dtypeis_boolconjunctionmeanm2weights              rX   reduction_combiner      sf    5:%G 	7F*FFFFFF!(1##cK66K66*666&&*&&&""&&*&&&'':''''' EEEE
EEEE)))6#666666***j%(( 	M)D"ff0LLD"fD#DD4DD2DDDDDD---J((
	 EJ..% /  (dd3ddZdd\adddd &XXXX*XXX $LLsLLjLLELLLL$CCsCCjCCCC

(
((rW   c                 R    t          |           r| d| d| dfS | dv r| dS |S )Nz.meanz.m2z.weightrz   z.index)r*   )r   accs     rX   r   r   	  sN    N++ }}}kkkc???::	/	/	/~~~JrW   codeiter_varnew_iter_var
loop_startloop_endreturnc                    t                      }t          j                    5 }|                    dt           d| dt          |           d| dt          |           d| dz              |                    |                                           t          | j	                  D ]\  }}t          |t          t          f          sJ d}	t          |t                    r|j        }	|j        }t          j        d	| z   d	z   | |          }
|	rt          |	|
          }
|                    |
           	 ddd           n# 1 swxY w Y   |S )
a  
    f(iter_var) is transformed to f(new_iter_var) under the inner loop
      \/
    for (new_iter_var = loop_start; new_iter_var < loop_end; new_iter_var++) {
        f(new_iter_var)
    }
    Please be careful while using this function,
    as the variable defined in f(iter_var) will be invalid outside the for loop.
    For example:
    auto tmp0 = in_ptr[x0]; ->
    for (new_x0 = start; new_x0 < end; new_x0++){
        auto tmp0 = in_ptr[new_x0];
    }
    The tmp0 is invalid outside the loop.
    zfor (r    = ; < ; ++)N\b)r7   
contextlib	ExitStack	writelinerJ   rE   enter_contextindent	enumerate_linesr   rp   r;   namelineresub)r   r   r   r   r   transformed_codestack_r   deferred_namenew_lines              rX   move_code_under_inner_loopr     s   , $~~				 15""LJLLLL+j2I2ILLLLL+h"7"7LL<LLLM	
 	
 	
 	,3355666 -- 	1 	1GAt      !M$-- ! $	yvem3e;=NPTUUH A'x@@&&x0000	11 1 1 1 1 1 1 1 1 1 1 1 1 1 1, s   DEE
Eacc_varacc_typer   r   lenc                     t                      }t          j                    rd|  d| d| dn
| d|  d| d}|                    |            |                    d| d	d
d|  d |||           ddg           |S )a  
    MSVC don't support dynamic array(VLA). So we use std::unique_ptr here.
    Ref: https://stackoverflow.com/questions/56555406/creating-dynamic-sized-array-using-msvc-c-compiler
    MSVC is the only one compiler without VLA. support. Since MSVC can't get good performance here.
    We just use unique_ptr make it works on MSVC.
    For other compilers, we continue to use VLA to get best performance.
    auto z_arr = std::make_unique<z[]>();r   _arr[];for (int i = 0; i < ; i++){    z
_arr[i] = r   r   )r=   r   
is_msvc_clr   
writelines)r   r   r   r   r   init_fncode_bufferacc_decls           rX   reduction_prefix_arrayr   A  s     !""K !##	1FFFFFsFFFF00700000 
 X-(((.3...G7GGggne&D&DGGG		
   rW   bufferr   new_namec                 >   t          | j                  D ]\  }}t          |t          t          f          sJ t          |t                    r)t          j        d| z   dz   | |j                  |_        at          j        d| z   dz   | |          | j        |<   d S )Nr   )r   r   r   rp   r;   r   r   r   )r   r   r   ir   s        rX   replace_acc_namer   b  s    V]++ V V4
 
 	
 	
 
 dL)) 	Vu$y058X-SSDII!vei&7%&?HPTUUFM!V VrW   c           
         d}t          | j                  D ]\  }}t          |t          t          f          sJ t          |t                    r|j        n|}t          j        ||          }|r]|                                \  }}t          j	        || d| d| d|          }t          |t                    r||_        || j        |<   dS )zT
    Replaces `acc = cascade_sum_combine(value, ...)` with `acc = acc + value;`
    z/(.*?)\s*=\s*cascade_sum_combine\(([^,]+),.*?\);r    + r   N)
r   r   r   rp   r;   r   r   searchgroupsr   )	r   patternr   r   contentmatchr   valuenew_contents	            rX   replace_cascade_sum_with_addr   q  s    
 AGV]++ / /4
 
 	
 	
 
  *$==G$))4	'7++ 	/JC&S*E*ES*E*EU*E*E*EwOOK$-- /'		#.a !/ /rW   r   c                     |                      |          st          j        j        S ||dz   i}t	          | |          }t          j        || z
            S Nr5   )hassympySZeror0   simplify)r   r   replacement	new_indexs       rX   	stride_atr     sQ    99S>>  w|a.K5+..I>)e+,,,rW   
vec_lengthc                   	 dd	fd}	fd}| }t          j        dd          }|                     t                    r$|                     t          |          |          } t          j        dd          }|                     t
                    r%|                     t          ||          |          } t          j        |           } | |k    rt          |           S | S )a  
    Simplifies the index expression within the range of a vectorized loop.
    Given a vectorized loop variable `var` in the range of a loop with `vec_length`,
    this function transforms the `index` into an equivalent form. It handles
    simplifications for cases where `var` can be expressed as `vec_length * a + b`,
    where `b` ranges from 0 to `vec_length - 1`. The function reduces occurrences
    of `FloorDiv` and `ModularIndexing` in the `index` with best-effort optimizations.

    NOTE:
    The simplified index expression is intended for analysis purposes only, not
    for code generation. It replaces `FloorDiv` and `ModularIndexing` with free variables
    which are not dependent on the loop variable `var` in the vectorized range. Check
    https://github.com/pytorch/pytorch/pull/117221#discussion_r1449746217 for more details.

    Examples:
    1. If `var` is `x3` and `vec_length` is 16, and `x3 = 16*a + b`, then
       `FloorDiv(x3, div)` or `ModularIndexing(x3, div, mod)` becomes a free variable
       when `div` is divisible by 16.
    2. `ModularIndexing(x3, 1, mod)` can be simplified to `x3 + c` where `c` is a free
       variable when `mod` is divisible by 16.
    r   c                     t          |           }t          j        |           k    rt          j         d           }dz  |S )N_div_cr5   )r   r   gcdSymbol)divisorresultdiv_freevar_idr   r   s     rX   visit_indexing_divz7simplify_index_in_vec_range.<locals>.visit_indexing_div  sU    #w''9Wj))Z77\S"@"@"@"@AAFaNrW   c                    t          | |          }t          j        |           k    rt          j         d           }dz  n@| dk    r:t          j        |          k    r!t          j         d           z   }dz  |S )N_mod_cr5   )r   r   r  r  )r  modulusr  mod_freevar_idr   r   s      rX   visit_modular_indexingz;simplify_index_in_vec_range.<locals>.visit_modular_indexing  s     gw779Wj))Z77\S"@"@"@"@AAFaNN\\ei<<
JJ5<3(F(Fn(F(FGGGFaNrW   r  T)integerr	  )r   Wildr   r   replacer   r   simplify_index_in_vec_range)
r   r   r   r  r  original_indexdivmodr  r
  s
    ``     @@rX   r  r    s    0 NN      	 	 	 	 	 	 	 N
*Y
-
-
-Cyy FhsC002DEE
*Y
-
-
-Cyy!! Voc3<<>TUUN5!!E*5#zBBBLrW   c                 H    |rt          | ||          } t          | |          S N)r  r   )r   r   r   s      rX   stride_at_vec_ranger    s.      D+E3
CCUC   rW   c                   (    e Zd ZU dZeed<   eed<   dS )ParallelDepthz{
    A class representing parallel depth.
    Includes the starting depth of parallelism and the depth of parallelism.
    parallel_depthstart_depthN)__name__
__module____qualname____doc__rn   __annotations__rV   rW   rX   r  r    s6          
 rW   r  c                   v     e Zd Zededefd            Zdddeeee	f                  f fdZ
d Zd	 Zd
 Z xZS )OuterLoopFusedSchedulerNodenode1node2c                    |j         |j         u sJ t          d ||fD                       sJ t          d ||fD                       r | |j         t          |          t          u r!t          |                                          n|gt          |          t          u r!t          |                                          n|gz   |          S  | |j         ||g|          S )Nc              3   \   K   | ]'}t          |          t          t          t          fv V  (d S r  )typer   r$   r"   .0nodes     rX   	<genexpr>z3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>  sQ       
 
  JJ+"
 
 
 
 
 
rW   c              3   B   K   | ]}t          |          t          u V  d S r  r%  r   r&  s     rX   r)  z3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>  .      TTTtDzz88TTTTTTrW   )	schedulerallrg   r%  r   listget_outer_nodes)clsr!  r"  outer_loop_fusion_depths       rX   fusez OuterLoopFusedSchedulerNode.fuse  s1    %/1111 
 
 
 
 
 
 
 	
 	
 
 TTeU^TTTTT 	Q3 E{{&AAA ..00111  E{{&AAA ..00111  (!  & 3u8OPPPrW   r-  r#   outer_fused_nodesc                 $   || _         || _        g }| j         D ]T}t          |t          t          f          sJ |                    t          |                                                     Ut                      	                    ||           d S r  )
r4  r2  r   r$   r"   extendr/  	get_nodessuper__init__)selfr-  r4  r2  flatten_snodes_node	__class__s         rX   r9  z$OuterLoopFusedSchedulerNode.__init__  s      	 (?$+ 	; 	;Eem5G%HIIIII!!$u'8'8"9"9::::N33333rW   c                     | j         S r  )r4  r:  s    rX   r0  z+OuterLoopFusedSchedulerNode.get_outer_nodes  s    %%rW   c           
      8   dt           dt           dt          dt          dt          f
fdt          t	          |          dz
            D ]0}||         j        }||dz            j        } |||d          s d	S 1|D ]}t          j        t          j	        |j
        d |                   }t	          |j
                  |k    rVt          |t          j                  r<t          |j
        |         t          j                  r|d
z  |j
        |         k     r d	S dS )Nleft_loop_nestright_loop_nestloop_fusion_depthcurrent_checking_depthr   c                 `   | j         sJ |j         sJ | j         |         |j         |         g d}t          fd|D                       sdS |dk    sJ |dz
  x}dk    rI|dz   }|t          | j                   k     sJ |t          |j                   k     sJ  | |||          sdS dS )N)r   sizeoffsetstepsc              3   \   K   | ]&}t          |          t          |          k    V  'd S r  )getattr)r'  attr_compareleft_loop_levelright_loop_levels     rX   r)  zaOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner.<locals>.<genexpr><  sV         % O\::/>>?     rW   Fr5   r   T)loopsr.  r   )rA  rB  rC  rD  outer_loops_attr_compare_listrL  rM  _inners        @@rX   rP  zNOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner*  s3    "''''"((((,23IJO.45KL- - -)       )F     u$))))%6%::!a??)?!)C&-N4H0I0IIIII-O4I0J0JJJJJv"#%*	  ! !54rW   r5   r   F,  T)LoopNestrn   rl   ranger   	loop_nest	functoolsreduceoperatormulrangesr   r   Integer)	r:  cpp_kernel_proxy_listr2  idxrA  rB  cpp_kernel_proxyouter_rangesrP  s	           @rX   "check_outer_fusion_loop_level_attrz>OuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr!  s   (	$(	%(	  #(	 %(	(	
 (	 (	 (	 (	 (	 (	T 233a788 		 		C237AN3C!G<FO6'	   uu !6 	 	$+ '(@)@(@A L $+,,/FFF|U];; G$+,CDM  G !3&")*ABC C uutrW   c                      |d         j         }t          |          } fd|D             |_        |d         }||j        _        |j        j        d  j                 |j        _        |S )Nr   c                 N    g | ]!}|j                             j                  "S rV   )rT  from_loop_levelr2  )r'  proxyr:  s     rX   
<listcomp>zJOuterLoopFusedSchedulerNode.merge_outer_fusion_kernels.<locals>.<listcomp>}  s<     )
 )
 )
 O++D,HII)
 )
 )
rW   )kernel_groupOuterLoopFusedKernelinnerrT  kernelrN  r2  )r:  r[  re  outer_loop_fused_kernelouter_fused_proxys   `    rX   merge_outer_fusion_kernelsz6OuterLoopFusedSchedulerNode.merge_outer_fusion_kernelsw  s     -Q/<"6|"D"D)
 )
 )
 )
.)
 )
 )
% 2!4-D#*,=,G,M*d**-
#) ! rW   )r  r  r  classmethodr   r3  r/  r	   r"   r$   r9  r0  r_  rk  __classcell__r=  s   @rX   r   r     s        !Q%!Q.?!Q !Q !Q [!QF44  &8-&G HI4 4 4 4 4 4 & & &T T Tl! ! ! ! ! ! !rW   r   c                   4    e Zd Zd	defdZd Zd Zd Zd ZdS )
RecordOptimizationContextrS   	func_namec                 0    || _         d | _        d | _        d S r  )rq  current_nodeopt_ctx)r:  rq  s     rX   r9  z"RecordOptimizationContext.__init__  s    "596:rW   c                 T   t           j        sJ t           j        j        sJ t           j        j        | _        | j        J t          j        | j        j        v r"| j        j        t          j                 | _        nt                      | _        | j        J | j        | j        _        | S r  )	r4   interpreterrs  rA   keymetart  rq  ops_namer?  s    rX   	__enter__z#RecordOptimizationContext.__enter__  s    }}}))))M6 ,,,"d&7&<<<,12E2IJDLL.00DL|''' $rW   c                 f    | j         sJ | j        sJ | j        | j         j        t          j        <   d S r  )rs  rt  rx  rA   rw  r:  exc_typeexc_valexc_tbs       rX   __exit__z"RecordOptimizationContext.__exit__  s:        ||:>,26777rW   c                     | j         S r  )rt  r?  s    rX   get_opt_ctxz%RecordOptimizationContext.get_opt_ctx  s
    |rW   c                 "    | j         sJ | j         S r  )rs  r?  s    rX   get_fx_nodez%RecordOptimizationContext.get_fx_node  s          rW   N)rS   )	r  r  r  rp   r9  rz  r  r  r  rV   rW   rX   rp  rp    ss        ; ;# ; ; ; ;
  G G G
  ! ! ! ! !rW   rp  c                      t          d | D                       r
J d            t          |           x}t          |         S d| d          dS )Nc              3   N   K   | ] }t          |t                    o|j        V  !d S r  r   rG   r   r'  args     rX   r)  z$decltype_promoted.<locals>.<genexpr>  s3      RRc:c>22AszRRRRRRrW   z*Promotion of vector types is not supported	decltype(r   r   )rg   rI   rH   )argsdts     rX   decltype_promotedr    sk    RRTRRRRR  4 R  %%%2B%47%%%%rW   c                      e Zd ZdZed             Zed             Zed             ZedNd            Zed             Z	ed	             Z
ed
             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Z ed              Z!ed!             Z"ed"             Z#ed#             Z$ed$             Z%ed%             Z&ed&             Z'ed'             Z(ed(             Z)ed)             Z*ed*             Z+ed+             Z,ed,             Z-ed-             Z.ed.             Z/ed/             Z0ed0             Z1ed1             Z2ed2             Z3ed3             Z4ed4             Z5ed5             Z6ed6             Z7ed7             Z8ed8             Z9ed9             Z:ed:             Z;ed;             Z<ed<             Z=ed=             Z>ed>             Z?ed?             Z@ed@             ZAedA             ZBedB             ZCedC             ZDedD             ZEedE             ZFedFeGjH        dGeGjH        fdH            ZIedFeGjH        dGeGjH        fdI            ZJedFeGjH        dGeGjH        fdJ            ZKedK             ZLedL             ZMedM             ZNdS )OCppOverrideszMap element-wise ops to C++c                 4    t          | |           d|  d| dS )N(r   r   r  abs     rX   addzCppOverrides.add  +    #Aq))66A66!6666rW   c                 4    t          | |           d|  d| dS )Nr   - r   r  r  s     rX   r   zCppOverrides.sub  r  rW   c                 4    t          | |           d|  d| dS )Nr  r   r   r  r  s     rX   rX  zCppOverrides.mul  r  rW   NTc                    t          | t                    sJ || j        }t          j                            | ||          }t          j        j                            t          j        j        |          }|	                    d| |fd|i           |t          v r3|t          j        k    r#	 t          j                            | |||           |S )Nto_dtyper   )r   rG   r   r4   rh  get_to_dtype_exprcsegeneratecomputeupdate_on_argsr   r   ro   cache_dtype_convert)xr   r   use_compute_typesexprcsevars         rX   r  zCppOverrides.to_dtype  s    !^,,,,,Ix))!UI>>&&qx'7>>j1e*{I6NOOOM!!i5;&>&>> H((IvuEEErW   c                 f    |t           v sJ | dt           d            dt           |          d|  dS )Nz missing from z.DTYPE_TO_CPPzc10::bit_cast<>(r   )rH   r  )r  r   r   s      rX   to_dtype_bitcastzCppOverrides.to_dtype_bitcast  sJ    $$$&U&Uh&U&U&U$$$;U 3;;q;;;;rW   c                     d|  dS )Nz	std::abs(r   rV   r  s    rX   abszCppOverrides.abs      1rW   c                     d|  dS )Nz	std::sin(r   rV   r  s    rX   sinzCppOverrides.sin  r  rW   c                     d|  dS )Nz	std::cos(r   rV   r  s    rX   coszCppOverrides.cos  r  rW   c                     d|  d|  dS )Nr  z)(-r   rV   r  s    rX   negzCppOverrides.neg       %1%%%%%%rW   c                     d|  dS )Nz	std::exp(r   rV   r  s    rX   expzCppOverrides.exp  s      1rW   c                     d|  dS )Nz
std::exp2(r   rV   r  s    rX   exp2zCppOverrides.exp2	       A    rW   c                     d|  dS )Nzstd::expm1(r   rV   r  s    rX   expm1zCppOverrides.expm1      !Q!!!!rW   c                     d|  dS )Nz	std::erf(r   rV   r  s    rX   erfzCppOverrides.erf  r  rW   c                     d|  dS )Nz
std::erfc(r   rV   r  s    rX   erfczCppOverrides.erfc  r  rW   c                     d|  dS )Nzcalc_erfinv(r   rV   r  s    rX   erfinvzCppOverrides.erfinv      "a""""rW   c                     d|  dS )Nz
std::sqrt(r   rV   r  s    rX   sqrtzCppOverrides.sqrt  r  rW   c                     d|  dS )Nz1 / std::sqrt(r   rV   r  s    rX   rsqrtzCppOverrides.rsqrt!  s    $$$$$rW   c                 t    t           j        j        }|dk    r|  d|  dS |d|  dS t          d|          )Naccuracy + decltype()(1)zstd::log1p(r   8unrecognized config cpp.inject_log1p_bug_TESTING_ONLY = r   cppinject_log1p_bug_TESTING_ONLYr   r  bugs     rX   log1pzCppOverrides.log1p%  sa    j6*,,Q,,,,[%%%%% R3RR  rW   c                     d|  dS )Nz	std::tan(r   rV   r  s    rX   tanzCppOverrides.tan1  r  rW   c                     d|  dS )Nz
std::tanh(r   rV   r  s    rX   tanhzCppOverrides.tanh5  r  rW   c                 (    t           rd|  dnd|  dS )z
        On windows std::signbit only support float type.
        Ref: https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/signbit?view=msvc-170
        z std::signbit(static_cast<float>(r   zstd::signbit(r   rT   r  s    rX   signbitzCppOverrides.signbit9  s0     &4q4444%%%%	
rW   c                     d|  d| dS )Nz	std::pow(r   r   rV   r  s     rX   powzCppOverrides.powE  s    $1$$$$$$rW   c                     d|  dS )Nz	std::log(r   rV   r  s    rX   logzCppOverrides.logI  r  rW   c                     d|  dS )Nzstd::nearbyint(r   rV   r  s    rX   roundzCppOverrides.roundM  s    %%%%%rW   c                     d|  dS )Nzstd::floor(r   rV   r  s    rX   floorzCppOverrides.floorQ  r  rW   c                 H    |  d| }|  d| }d|  d| d| d| d| d| d	S )
N /  % ((z
 < 0) != (z	 < 0) ? (z != 0 ? z - 1 : z) : r   rV   )r  r  quotrems       rX   floordivzCppOverrides.floordivU  s\     ||||llqllXAXXXXSXX$XXtXXQUXXXXrW   c                     d|  dS )Nz
std::ceil(r   rV   r  s    rX   ceilzCppOverrides.ceil\  r  rW   c                     d|  dS )Nzstd::trunc(r   rV   r  s    rX   trunczCppOverrides.trunc`  r  rW   c                     |  d| S Nr  rV   r  s     rX   truncdivzCppOverrides.truncdivd  s     ||||rW   c                     d|  d| dS )Nz
std::fmod(r   r   rV   r  s     rX   fmodzCppOverrides.fmodi  s    %A%%%%%%rW   c                     d|  dS )Nzstd::isinf(r   rV   r  s    rX   isinfzCppOverrides.isinfm  r  rW   c                     d|  dS )Nzstd::isnan(r   rV   r  s    rX   isnanzCppOverrides.isnanq  r  rW   c                     d|  dS )Nzstd::lgamma(r   rV   r  s    rX   lgammazCppOverrides.lgammau  r  rW   c                     d|  dS )Nz
std::acos(r   rV   r  s    rX   acoszCppOverrides.acosy  r  rW   c                     d|  dS )Nzstd::acosh(r   rV   r  s    rX   acoshzCppOverrides.acosh}  r  rW   c                     d|  dS )Nz
std::cosh(r   rV   r  s    rX   coshzCppOverrides.cosh  r  rW   c                     d|  dS )Nz
std::sinh(r   rV   r  s    rX   sinhzCppOverrides.sinh  r  rW   c                     d|  dS )Nz
std::asin(r   rV   r  s    rX   asinzCppOverrides.asin  r  rW   c                     d|  dS )Nzstd::asinh(r   rV   r  s    rX   asinhzCppOverrides.asinh  r  rW   c                     d|  d| dS )Nzstd::atan2(r   r   rV   r  ys     rX   atan2zCppOverrides.atan2      &Q&&!&&&&rW   c                     d|  dS )Nz
std::atan(r   rV   r  s    rX   atanzCppOverrides.atan  r  rW   c                     d|  dS )Nzstd::atanh(r   rV   r  s    rX   atanhzCppOverrides.atanh  r  rW   c                     d|  d| dS )Nzstd::copysign(r   r   rV   r  s     rX   copysignzCppOverrides.copysign  s    )))Q))))rW   c           	         d|  dd|  df}t          d |D                       rt          d |D                       S t                      }t          j        j                            t          j        | j	                  }t          j        j                            | j
        | j	                  }|                    d| d           |                    d	| d
|  d| d           t          j        j                            |           ||f}t          ||          D ]*\  }}t          j        j                            ||           +||fS )Nfrexp()[0])[1]c              3   b   K   | ]*}t           j        j                            |          d uV  +d S r  r4   rh  r  try_getr'  	cache_keys     rX   r)  z%CppOverrides.frexp.<locals>.<genexpr>  9      WWyqx|##I..d:WWWWWWrW   c              3   ^   K   | ](}t           j        j                            |          V  )d S r  r  r  s     rX   r)  z%CppOverrides.frexp.<locals>.<genexpr>  4      UUY--i88UUUUUUrW   )r   shapezint32_t r   r   z = std::frexp(r   r   )r.  r   r7   r4   rh  r  newvarr   int32r  r   r   r  splicezipput)r  
cache_keysr   exponentmantissacse_varsr  cse_vars           rX   frexpzCppOverrides.frexp  s^   %a%%%'7'7'7'77
WWJWWWWW 	VUU*UUUUUU~~8<&&U[&HH8<&&QWAG&DD-(---...IxIIqIIXIIIJJJ	%%%h'"%j(";"; 	1 	1IwHLY0000!!rW   c                     d|  d| dS )Nzstd::hypot(r   r   rV   r  s     rX   hypotzCppOverrides.hypot  r  rW   c                     d|  dS )Nzstd::log10(r   rV   r  s    rX   log10zCppOverrides.log10  r  rW   c                     d|  dS )Nz
std::log2(r   rV   r  s    rX   log2zCppOverrides.log2  r  rW   c                     d|  d| dS )Nzstd::nextafter(r   r   rV   r  s     rX   	nextafterzCppOverrides.nextafter  s    ***a****rW   c                     t           j        j        }|dk    rdS |dk    r|  dS |dk    r|  d|  dS |	d|  d	|  d
S t          d|          )Ncompile_errorcompile error!runtime_error	; throw 1r  r  r  z	std::max(, decltype()(0))7unrecognized config cpp.inject_relu_bug_TESTING_ONLY = r   r  inject_relu_bug_TESTING_ONLYr   r  s     rX   reluzCppOverrides.relu  s    j5/!!##O##???"J,,Q,,,,[5q55Q5555 Q#QQ  rW   c                     d|  d| dS )Nzmin_propagate_nan(r   r   rV   r  s     rX   minimumzCppOverrides.minimum      -A------rW   c                     d|  d| dS )Nzmax_propagate_nan(r   r   rV   r  s     rX   maximumzCppOverrides.maximum  r;  rW   c                     |  d| d| S )N ?  : rV   )r  r  cs      rX   wherezCppOverrides.where  s    !!!!a!!!rW   c                     d|  d| dS )Nzmod(r   r   rV   r  s     rX   r  zCppOverrides.mod  s    a1rW   c                 8    t          | t          |                   S r  )rP   rH   )valr   s     rX   constantzCppOverrides.constant  s    Ce!4555rW   c                 
   t          t          j                            |                     }t          j        j                            t          j        j        |t          |                     }t          j	        ||          S )Nbounds)
rD   r4   rh  rename_indexingr  r  r  r&   r2   r  )r  r   idx_strr   s       rX   
index_exprzCppOverrides.index_expr  sf    006677hl##Hg.CD.I.I $ 
 
 |C'''rW   c                 P   t                      }t          j        j                                        }|                    d| d           t          j                            |          5  |                                5   |            }|                    d| d           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |                    d           t          j        j        	                    |           t          |d| d          }|  d| d| S )	Nr    = [&]return r   r  z())r?  z() : )r7   r4   rh  r  r  r   swap_buffersr   r  r  rP   )maskbodyotherr   body_varr  
other_codes          rX   maskedzCppOverrides.masked  s   ~~ 8<&&((/x///000X""4(( 	0 	0$++-- 	0 	0TVVFNN.V...///	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	s	%%% "%)BX)B)B)BCC
66866*666s6   *C?$B/#C/B3	3C6B3	7CC
C
c                     |  d| S )N && rV   r  s     rX   logical_andzCppOverrides.logical_and      }}}}rW   c                     d|  S )N!rV   r  s    rX   logical_notzCppOverrides.logical_not      1wwrW   c                     |  d| S )Nr   rV   r  s     rX   
logical_orzCppOverrides.logical_or  rZ  rW   c                     |  d| S )N != rV   r  s     rX   logical_xorzCppOverrides.logical_xor  rZ  rW   c                     d|  d|  d| dS )Nr  )( & r   rV   r  s     rX   bitwise_andzCppOverrides.bitwise_and  #    *1****a****rW   c                     d|  d|  dS )Nr  z)(~r   rV   r]  s    rX   bitwise_notzCppOverrides.bitwise_not  r  rW   c                     d|  d|  d| dS )Nr  rf   | r   rV   r  s     rX   
bitwise_orzCppOverrides.bitwise_or  ri  rW   c                     d|  d|  d| dS )Nr  rf  r   r   rV   r  s     rX   bitwise_xorzCppOverrides.bitwise_xor  ri  rW   c                 6   t                      }|                    d           |                                5  t          | j                 }|                    d| d| d           |                    d| d| d| d           |                                5  |                    d	|  d
           d d d            n# 1 swxY w Y   |                    d	|  d| d|  d| d	           d d d            n# 1 swxY w Y   |                    d           |S )N[&]()constexpr decltype() max_shift = sizeof(z) * CHAR_BIT;$if ((static_cast<std::make_signed_t<>>() < 0) || ( >= max_shift))return decltype(z)(0);z#)(static_cast<std::make_unsigned_t<z) << r   ()r7   r   r   rH   r   r  r  r   scalar_ts       rX   bitwise_left_shiftzCppOverrides.bitwise_left_shift  s   ~~w[[]] 	 	#AG,HNNUaUUhUUU   NNdxddAddRSddd    < <:!:::;;;< < < < < < < < < < < < < < <NNc1ccccVWcc^_ccc  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	ts6   A"C9C 4C9 C	C9C	%C99C= C=c           
      <   t                      }|                    d           |                                5  t          | j                 }|                    d| d| d| d           |                    d| d| d| d	           |                                5  |                    d
|  d|  d           d d d            n# 1 swxY w Y   |                    d
|  d|  d| d           d d d            n# 1 swxY w Y   |                    d           |S )Nrr  rs  rt  z ) * CHAR_BIT - std::is_signed_v<z>;ru  rv  rw  rx  ry  rf  z >> max_shift); >> r   rz  r{  r|  s       rX   bitwise_right_shiftz CppOverrides.bitwise_right_shift3  s   ~~w[[]] 
	A 
	A#AG,HNNtatthtthpttt   NNdxddAddRSddd    K KI!IIqIIIJJJK K K K K K K K K K K K K K KNN?a??1??!???@@@
	A 
	A 
	A 
	A 
	A 
	A 
	A 
	A 
	A 
	A 
	A 
	A 
	A 
	A 
	A 	ts6   A%C<C:C<C
	
C<C
	"C<<D D seedrG  c                     d|  d| dS )Nznormalized_rand_cpu(r   r   rV   r  rG  s     rX   randzCppOverrides.randE  s    7d77f7777rW   c                     d|  d| dS )Nz
randn_cpu(r   r   rV   r  s     rX   randnzCppOverrides.randnI  s    -D--F----rW   c           	           d|  d| d| d| d	S )Nzrandint64_cpu(r   r   rV   )r  rG  lowhighs       rX   	randint64zCppOverrides.randint64M  s+    @@@@@#@@@@@@rW   c                     d|  d|  d|  dS )Nr  z)(1) / (decltype(z)(1) + std::exp(-r   rV   r  s    rX   sigmoidzCppOverrides.sigmoidQ  s#    H1HHqHH1HHHHrW   c           
         t                      }d|  d}d|  d}|                    d           |                                5  |                    d|  d| d| d           |                    d	|  d
| d| d           |                    d           d d d            n# 1 swxY w Y   |                    d           |S )Nr  )(0)r  rr  auto left = z > 0 ? r@  r   auto right = z < 0 ? return left - right;rz  r7   r   r   )r  r   scalar_zero
scalar_ones       rX   signzCppOverrides.signU  s(   ~~)!)))((((
w[[]] 	3 	3NNQ!QQJQQ;QQQRRRNNR1RRZRRKRRRSSSNN1222	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	ts   AB$$B(+B(c                     d|  d| dS )Nr  z" ? 0 : (throw std::runtime_error("z"), 0))rV   )condmsgs     rX   device_assert_asyncz CppOverrides.device_assert_asyncb  s    G4GG3GGGGrW   NT)Or  r  r  r  staticmethodr  r   rX  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r	  r  r  r%  r'  r)  r+  r-  r8  r:  r=  rB  r  rF  rL  rV  rY  r^  ra  rd  rh  rk  rn  rp  r~  r  r   Exprr  r  r  r  r  r  rV   rW   rX   r  r    s       %%7 7 \7 7 7 \7 7 7 \7 ( ( ( \(T < < \<     \      \      \  & & \&     \  ! ! \! " " \"     \  ! ! \! # # \# ! ! \! % % \% 	 	 \	     \  ! ! \! 	
 	
 \	
 % % \%     \  & & \& " " \" Y Y \Y ! ! \! " " \"   \ & & \& " " \" " " \" # # \# ! ! \! " " \" ! ! \! ! ! \! ! ! \! " " \" ' ' \' ! ! \! " " \" * * \* " " \"  ' ' \' " " \" ! ! \! + + \+   \ . . \. . . \. " " \"     \  6 6 \6 ( ( \( 7 7 \7    \   \   \   \ + + \+ & & \& + + \+ + + \+   \&   \" 85: 8uz 8 8 8 \8 .EJ .
 . . . \. A
 AEJ A A A \A I I \I 
 
 \
 H H \H H HrW   r  r  c                       e Zd ZdZ fdZed             Zed             Zed             Zed             Z	ed             Z
ed             Zed	             Zed
             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Z ed             Z!ed             Z"ed              Z#ed!             Z$ed"             Z%ed#             Z&ed$             Z'ed%             Z(ed&             Z)ed'             Z*ed(             Z+ed)             Z,ed*             Z-ed+             Z.ed,             Z/ed-             Z0ed.             Z1ed/             Z2ed0             Z3ed1             Z4ed2             Z5ed3             Z6ed4             Z7ed5             Z8ed6             Z9ed7             Z:ed8             Z;ed9             Z<ed:             Z=ed;             Z>ed<             Z?ed=             Z@ed>             ZAed?             ZBed@             ZCedA             ZDedB             ZEedC             ZFedD             ZGedE             ZHedF             ZIedG             ZJedH             ZKedI             ZLedJ             ZMedTdM            ZNedN             ZOedO             ZPedP             ZQedQ             ZReSdR             ZTeSdS             ZU xZVS )UCppVecOverridesz.Map element-wise ops to aten vectorization C++c                 *   t                                          |           fd}t          t                                                    D ]B\  }}t          |dd           t          k    r#|dvrt          | ||j                             CS )Nc                       fd}|S )Nc                  X   d | D             }d | D             }t          |           }|r|rg }| D ]}t          |t          t          j        f          r~t          |t          j                  r'|j        s t          j        |t          j	                  }nt          j
        |t          j	                  }t          |t                    r|j        n|}|                    |           |rOt          |          dk    rt          |          }n,	t           j        k    rt          |dd                    |dd <   |r1|r/t          t$          j        t(                    sJ 	fd|D             }|r 	|i |S t+          t           
          }t-          |	j                  }|J  || i |S )Nc                     g | ]A}t          |t          t          j        f          st          |t                    8|j        ?|BS rV   )r   rn   r   r  rG   r   r  s     rX   rd  zJCppVecOverrides.__new__.<locals>.wrap.<locals>.wrapper.<locals>.<listcomp>~  sa       !#UZ'899 #377	 AD
	  rW   c                 J    g | ] }t          |t                    |j        |!S rV   r  r  s     rX   rd  zJCppVecOverrides.__new__.<locals>.wrap.<locals>.wrapper.<locals>.<listcomp>  sD       !#~66 <?:  rW   r   r5   c                     g | ]d}t          |t                    rK|j        sDt          j        t          j        t          j        fvrt          j        	                    |          n|eS rV   )
r   rG   r   r  r  r  r  r4   rh  	broadcast)r'  new_argfuncs     rX   rd  zJCppVecOverrides.__new__.<locals>.wrap.<locals>.wrapper.<locals>.<listcomp>  s           $ !+7N C C) )0) %)$3$8$3$9$3$=("%" %"	 H..w777 ")     rW   )r/  r   rn   r   r  	is_numberr2   rL  r   int64rF  r3   r   appendr   rM   r  rB  r4   rh  CppVecKernelr8  rJ  r  )r  kwargsscalarsvectorsnew_argsr  
scalar_opsscalar_funcr=  r  r:  s           rX   wrapperz6CppVecOverrides.__new__.<locals>.wrap.<locals>.wrapper}  s    #   #  
  :: 	-w 	-!H# - -%cC+<== R)#uz:: E3= E&)nS%+&F&F&)l3&D&D/9#x/H/H"Q#))cC ,,,,  	B 8}}))#/#9#9!666'3HQRRL'A'A  w %ah=====        (0     H$  844V444 "'!=!=J")*dm"D"DK&222&;7777rW   rV   )r  r  r=  r:  s   ` rX   wrapz%CppVecOverrides.__new__.<locals>.wrapp  s6    @8 @8 @8 @8 @8 @8 @8D NrW   r=  )rV  rL  )	r8  __new__varsr  itemsrJ  r  setattr__func__)r1  r  kargsr  r   methodr:  r=  s         @rX   r  zCppVecOverrides.__new__m  s    wws##O	 O	 O	 O	 O	 O	b !117799 	; 	;LD&v{D11\AAd S G G dDD$9$9:::rW   c                     |  d| S )Nr   rV   r  s     rX   r  zCppVecOverrides.add      ||||rW   c                     |  d| S )Nr  rV   r  s     rX   r   zCppVecOverrides.sub  r  rW   c                     |  d| S Nr   rV   r  s     rX   rX  zCppVecOverrides.mul  r  rW   c                     |  d| S r  rV   r  s     rX   truedivzCppVecOverrides.truediv  r  rW   c                     |  dS )Nz.abs()rV   r  s    rX   r  zCppVecOverrides.abs      |||rW   c                     |  dS )Nz.sin()rV   r  s    rX   r  zCppVecOverrides.sin  r  rW   c                     |  dS )Nz.cos()rV   r  s    rX   r  zCppVecOverrides.cos  r  rW   c                     |  dS )Nz.exp()rV   r  s    rX   r  zCppVecOverrides.exp  r  rW   c                     |  dS )Nz.exp2()rV   r  s    rX   r  zCppVecOverrides.exp2      }}}rW   c                     d|  d}|  d| S )Nr  r  z	.exp() - rV   )r  vec_ones     rX   r  zCppVecOverrides.expm1  s(     &a%%%''g'''rW   c                     |  dS )Nz.erf()rV   r  s    rX   r  zCppVecOverrides.erf  r  rW   c                     |  dS )Nz.erfc()rV   r  s    rX   r  zCppVecOverrides.erfc  r  rW   c                     |  dS )Nz	.erfinv()rV   r  s    rX   r  zCppVecOverrides.erfinv      rW   c                     |  dS )Nz.sqrt()rV   r  s    rX   r  zCppVecOverrides.sqrt   r  rW   c                     t          t          j        t                    sJ t          | t                    sJ | j        J t          j                            | j                   d|  d| dS )Nr   == r   r   r4   rh  r  rG   r   _get_mask_typer  s     rX   eqzCppVecOverrides.eq  o    !(L11111!^,,,,,w"""())!'22@@Q@@A@@@@rW   c                    t          t          j        t                    sJ t          | t                    sJ | j        t          j        k    rB|j        t          j        k    sJ t          t          j        j	        | |f          \  }}| d| S | j        J t          j        
                    | j                   d|  d| dS )Nrc  r  r   )r   r4   rh  r  rG   r   r   rl   rO   r  r  )r  r  x_casty_casts       rX   nezCppVecOverrides.ne  s    !(L11111!^,,,,,7ej  7ej((((1!(2BQFKKNFF**&***7&&&h--ag66DDDDDDDDrW   c                     t          t          j        t                    sJ t          | t                    sJ | j        J t          j                            | j                   d|  d| dS )Nr  r   r   r  r  s     rX   ltzCppVecOverrides.lt  o    !(L11111!^,,,,,w"""())!'22??Q??1????rW   c                     t          t          j        t                    sJ t          | t                    sJ | j        J t          j                            | j                   d|  d| dS )Nr  z > r   r  r  s     rX   gtzCppVecOverrides.gt  r  rW   c                     t          t          j        t                    sJ t          | t                    sJ | j        J t          j                            | j                   d|  d| dS )Nr   <= r   r  r  s     rX   lezCppVecOverrides.le%  r  rW   c                     t          t          j        t                    sJ t          | t                    sJ | j        J t          j                            | j                   d|  d| dS )Nr   >= r   r  r  s     rX   gezCppVecOverrides.ge,  r  rW   c                     |  d| S Nrg  rV   r  s     rX   and_zCppVecOverrides.and_3  r  rW   c                     |  dS )Nz.rsqrt()rV   r  s    rX   r  zCppVecOverrides.rsqrt7      ~~~rW   c                     |  d| dS )Nz.pow(r   rV   r  s     rX   r  zCppVecOverrides.pow;  s    !rW   c                     |  dS )Nz.log()rV   r  s    rX   r  zCppVecOverrides.log?  r  rW   c                     |  dS )Nz.round()rV   r  s    rX   r  zCppVecOverrides.roundC  r  rW   c                     |  dS )Nz.floor()rV   r  s    rX   r  zCppVecOverrides.floorG  r  rW   c                     |  dS )Nz.ceil()rV   r  s    rX   r  zCppVecOverrides.ceilK  r  rW   c                     |  dS )Nz.trunc()rV   r  s    rX   r  zCppVecOverrides.truncO  r  rW   c                     |  d| dS )Nz.fmod(r   rV   r  s     rX   r  zCppVecOverrides.fmodS  s    1rW   c                     |  dS )Nz	.lgamma()rV   r  s    rX   r  zCppVecOverrides.lgammaW  r  rW   c                 6    t          | |          \  } }|  d| S r  rL   r  s     rX   rY  zCppVecOverrides.logical_and[  %    ,Q221||||rW   c                     d|  S N~rV   r]  s    rX   r^  zCppVecOverrides.logical_not`  r_  rW   c                 6    t          | |          \  } }|  d| S Nrm  r  r  s     rX   ra  zCppVecOverrides.logical_ord  r  rW   c                 6    t          | |          \  } }|  d| S Nr   r  r  s     rX   rd  zCppVecOverrides.logical_xori  r  rW   c                 6    t          | |          \  } }|  d| S r  r  r  s     rX   rh  zCppVecOverrides.bitwise_andn  r  rW   c                     d|  S r  rV   r]  s    rX   rk  zCppVecOverrides.bitwise_nots  r_  rW   c                 6    t          | |          \  } }|  d| S r  r  r  s     rX   rn  zCppVecOverrides.bitwise_orw  r  rW   c                 6    t          | |          \  } }|  d| S r  r  r  s     rX   rp  zCppVecOverrides.bitwise_xor|  r  rW   c                     |  d| S )Nz << rV   r  s     rX   r~  z"CppVecOverrides.bitwise_left_shift  rZ  rW   c                     |  d| S )Nr  rV   r  s     rX   r  z#CppVecOverrides.bitwise_right_shift  rZ  rW   c                     t          t          j        t                    sJ t          j                            | |           S r  )r   r4   rh  r  load)r   rG  s     rX   	load_seedzCppVecOverrides.load_seed  s3    !(L11111(--f--//rW   c                     t          t          j        t                    sJ t	                      }d|  d}t          |||          S )Nz)result[offset_idx] = normalized_rand_cpu(, offset[offset_idx]);r   r4   rh  r  r7   rF   r  rG  r   rand_functions       rX   r  zCppVecOverrides.rand  sJ    !(L11111~~TTTT 	 FD-888rW   c                     t          t          j        t                    sJ t	                      }d|  d}t          |||          S )Nzresult[offset_idx] = randn_cpu(r  r  r	  s       rX   r  zCppVecOverrides.randn  sE    !(L11111~~V$VVVFD-888rW   c                     t          t          j        t                    sJ t	                      }d|  d| d| d}t          |||t          j                  S )Nz#result[offset_idx] = randint64_cpu(z, offset[offset_idx], r   r   )r   r4   rh  r  r7   rF   r   r  )r  rG  r  r  r   r
  s         rX   r  zCppVecOverrides.randint64  s[    !(L11111~~idiiZ]iiaeiiiFD-EEErW   c                 |    | j         |j         k    s
J d            |  dt                              | |           d| S )Nz;remainder vec implementation expect the same inputs' dtype.z - (z) * )r   r  r  r  s     rX   	remainderzCppVecOverrides.remainder  sP    w!'!!!I "!! @@11!Q77@@Q@@@rW   c                     |  dS )Nz.tan()rV   r]  s    rX   r  zCppVecOverrides.tan  r  rW   c           	      r    t           j        j        r"d|  d}d|  d}d|  d}| d| d| d|  d| 	S |  d	S )
Nr  r  z)(2)z)(-2)z / (z + (r   z).exp()) - z.tanh())r   r  use_decompose_tanh)r  r  vec_twovec_minus_twos       rX   r  zCppVecOverrides.tanh  s{    :( 	!)!)))G)!)))G0000MVVVV]VVqVVWVV === rW   c                     |  dS )Nz.reciprocal()rV   r]  s    rX   
reciprocalzCppVecOverrides.reciprocal  s    """"rW   c                     |  dS )Nz.atan()rV   r  s    rX   r	  zCppVecOverrides.atan  r  rW   c                     |  dS )Nz.acos()rV   r  s    rX   r  zCppVecOverrides.acos  r  rW   c                     |  dS )Nz.asin()rV   r  s    rX   r   zCppVecOverrides.asin  r  rW   c                     |  dS )Nz.cosh()rV   r  s    rX   r  zCppVecOverrides.cosh  r  rW   c                     |  dS )Nz.sinh()rV   r  s    rX   r  zCppVecOverrides.sinh  r  rW   c                     |  dS )Nz.log10()rV   r  s    rX   r)  zCppVecOverrides.log10  r  rW   c                     |  dS )Nz.log2()rV   r  s    rX   r+  zCppVecOverrides.log2  r  rW   c                     |  d| dS )Nz.nextafter(r   rV   r  s     rX   r-  zCppVecOverrides.nextafter  s    $$$$$$rW   c                     |  d| dS )Nz
.copysign(r   rV   r  s     rX   r  zCppVecOverrides.copysign  s    ##q####rW   c                     |  d| dS )Nz.atan2(r   rV   r  s     rX   r  zCppVecOverrides.atan2        A    rW   c                     |  d| dS )Nz.hypot(r   rV   r  s     rX   r'  zCppVecOverrides.hypot  r   rW   c           
      <    d|  d}d|  d}| d| d|  d| d|  d
S )	Nr  r  z)(0.5)z * ((r   z)/(r  z)).log()rV   )r  r  vec_one_halfs      rX   r  zCppVecOverrides.atanh  sS     &a%%%,1,,,NNWNNNNwNN1NNNNrW   c                     |  dS )Nz.asinh()rV   r  s    rX   r  zCppVecOverrides.asinh  r  rW   c                     |  dS )Nz.acosh()rV   r  s    rX   r  zCppVecOverrides.acosh  r  rW   c                     t           j        j        }|dk    rdS |dk    r|  dS |dk    r|  d|  dS |	d|  d	|  d
S t          d|          )Nr/  r0  r1  r2  r  r  r  zat::vec::clamp_min(r3  r4  r5  r6  r  s     rX   r8  zCppVecOverrides.relu  s    j5/!!##O##???"J,,Q,,,,[???q???? Q#QQ  rW   c                     d|  d|  d|  dS )Nr  z)(1)/(decltype(z)(1) + z.neg().exp())rV   r  s    rX   r  zCppVecOverrides.sigmoid  s#    G1GGQGGqGGGGrW   c                     |  dS )Nz.neg()rV   r  s    rX   r  zCppVecOverrides.neg  r  rW   c                    t          | j                  r#| j        |j        k    s
J d            d|  d| dS t          d | |fD                       sJ d|  d}t          j                            |j                  dk     r#| ddt          j        j        z  dz
   d	| d
| d}|  d| }d|  d| d| d}d|  d| d| d| d	}| d| d| d| d
| d| dS )NzDdiv_floor_floating_vec implementation expect the same inputs' dtype.zdiv_floor_floating_vec(r   r   c              3   >   K   | ]}t          |j                  V  d S r  )r   r   )r'  items     rX   r)  z+CppVecOverrides.floordiv.<locals>.<genexpr>  s-      GG'
33GGGGGGrW   r  r5   ::blend<r  (1), r  r  r  rc  z(0))r  r   z	(0)) != (z(0)))z	::blendv(r  rg  )r   r   r.  r4   rh  _get_raw_num_vectorstiling_factor)r  r  _tr  has_remis_negs         rX   r  zCppVecOverrides.floordiv  sh   !'"" 	S7ag%%%V &%% 7Q66!6666GGAGGGGGGGG!Q!!!Bx,,QW5599UUA)?$?1#DUUUUQRUUU<<A<<D/!////r///G<!<<<<Q<<2<<<FRR4RR4RRBRRWRRRRRRrW   c                     t           j                            |j                  dk     r)d| d}| ddt           j        j        z  dz
   d| d| d}|  d| S )Nr5   r  r   r,  r  r-  r  )r4   rh  r.  r   r/  )r  r  r0  s      rX   r  zCppVecOverrides.truncdiv'  sv     8((11A55!Q!!!BQQQX%; ;q@QQBQQQQQQA||||rW   c                     | j         t          j        k    rB|j         t          j        k    sJ t          t          j        j        | |f          \  }}| d| S d|  d| dS )Nrg  at::vec::minimum(r   r   r   r   rl   rO   r4   rh  r  r  r  a_castb_casts       rX   r:  zCppVecOverrides.minimum0  o    7ej  7ej((((1!(2BQFKKNFF)))))0q00A0000rW   c                     | j         t          j        k    rB|j         t          j        k    sJ t          t          j        j        | |f          \  }}| d| S d|  d| dS )Nrm  at::vec::maximum(r   r   r6  r7  s       rX   r=  zCppVecOverrides.maximum9  r:  rW   c                     |  d|  S r  rV   r]  s    rX   squarezCppVecOverrides.squareB  r  rW   c                 j   t          t          j        t                    sJ |j        t
          j        k    rL|j        t
          j        k    sJ t          t          j        j        | ||f          \  }}}d| d| d| d| d	S d| d| d| dt          j        	                    | |j                   d	S )Nr  
)::blendv(r   r   )
r   r4   rh  r  r   r   rl   rO   r  _get_mask_cast)r  r  rA  blendv_ablendv_bblendv_cs         rX   rB  zCppVecOverrides.whereF  s    !(L111117ej  7ej((((+? 1a), ,(Hh WxVV8VVxVV8VVVV\q\\A\\\\ah6M6MaQRQX6Y6Y\\\\rW   c                    t                      }d|  d}d|  d}d|  d| d| d| d|  d}d|  d| d| d|  d| d}|                    d           |                                5  |                    d	| d
           |                    d| d
           |                    d           d d d            n# 1 swxY w Y   |                    d           |S )Nr  r  r  r@  r   r   r   rr  r  r   r  r  rz  r  )r  r   vec_zeror  blendv_lblendv_rs         rX   r  zCppVecOverrides.signR  si   ~~&q&&&%a%%%TqTTHTTTT8TTPQTTTTqTTHTTTT1TTTTTw[[]] 	3 	3NN5(555666NN68666777NN1222	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	ts   (AB<<C C NTc                    |t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j	        t           j
        t           j        fv sJ t           d|             t          | t                    sJ | j        }t           j                            | ||          }t           j        j                            t           j        j        |          }|                    d| |fd|i           |t.          v r2|t           j        k    r"t           j                            | |||           |S )Nz does not support r  r   )r   rl   float64ro   bfloat16float16uint8int8r  r  float8_e4m3fnfloat8_e5m2r  r   rG   r   r4   rh  r  r  r  r  r  r   r  )r  r   r   use_compute_dtypesr  r  s         rX   r  zCppVecOverrides.to_dtypea  s#   JMKNMKJKK
 
 
 
 11%11
 
 
 !^,,,,,G	x))!UI>>&&qx'7>>j1e*{I6NOOOM!!i5;&>&>H((IvuEEErW   c                 r    t           j        j        }|dk    r|  d|  dS ||  dS t          d|          )Nr  r  r  z.log1p()r  r  r  s     rX   r  zCppVecOverrides.log1py  s\    j6*,,Q,,,,[>>>! R3RR  rW   c                 	   t          t          j        t                    sJ t	                      }t          j        j                                        }t          j                            |           5 }|                    d| d           t          j        	                    |          5  |
                                5   |            }|                    d| d           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |                    d           t          j        j                            |           |j        | d}fd}|j        r|}	n ||          }	t          |t                              }
 ||
          }t          |t"                    s
J |            |j        rbt	                      }|                    d           t          j        	                    |          5  |
                                5  |                    d| d	           |
                                5  |                    d| d           d d d            n# 1 swxY w Y   |                    d
           |
                                5  t          j        j                            t          j        j        |	          }t          j        j                            t          j        j        |          }t          |t"                    s
J |            t          |t"                    s
J |            |_        |_        t          j        j        }|                    d|                    |||           d           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |                    d           t          j        j                            t          j        j        |          }n|j        r=t          j        j                            t          j        j        |  d|	 d|           }n<t          j        j                            t          j        j        |  d| d|
           }|                    d| |||fi            |S )Nr   rN  rO  r   rz  c                     t           j        k    r$t          j                                         d|  dn$t          j                                       d|  dS )N::from(r   r  )r   rl   r4   rh  r  _get_vec_type)r   r   s    rX   maskify_or_vecifyz1CppVecOverrides.masked.<locals>.maskify_or_vecify  sb     EJ&& 8**,,<<T<<<<..u55?????rW   [&]if (z.all_zero())elser?  r@  rV  )r   r4   rh  r  r7   r  r  rV  r   rP  r   r  r  r   r   rP   rH   rG   r  	overridesrB  r  )rQ  rR  rS  r   r   new_maskr  	body_coderW  body_code_vecrU  other_code_vecbody_vec_varother_vec_varr[  r  r   s                   @rX   rV  zCppVecOverrides.masked  sF   !(L11111~~hl!!##X__T"" 	4hNN.3...///&&t,, 4 4dkkmm 4 422223334 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4
 	s	%%%JJJ		 	 	 	 	 = 	9%MM--i88M!%e)<==
**:66(N33==X==3? (	>>DNN5!!!&&t,,  dkkmm  <h<<<===[[]] @ @NN#>^#>#>#>???@ @ @ @ @ @ @ @ @ @ @ @ @ @ @v&&&[[]]  #$8<#8#8(%$ $L %&HL$9$9(&% %M &lNCCQQ\QQC%m^DDSSmSSD).L&*/M'*  NN[)//(L-"X"X[[[  #                                           2 NN4   X\**  FF ] 	X\** T"P"Pm"P"P"P"P FF X\** T"H"Hi"H"HJ"H"H F
 	htUF(CRHHHs   39D,D$C1%D1C55D8C59D<DD	DD	DD#&D#O#-.OJ5OJOJ	,O5C4N5)O5N99O<N9=O O#O	O#O	O##O'*O'c                 0   t          t          j        t                    sJ t          j                            |           }t          j        j        t          j        j                 }t          j                            ||          }|dk    rt          	                    | |          S |t          j        j
                            t          j        j        t          |          t          |                     }t          j        ||          }t          |t"                    r|j        }t          j                            ||          }n1t          j                            d ||t          j        j                  }|                    d| |fi            |S )Nr   rH  rL  )r   r4   rh  r  rJ  itervars
tiling_idx_try_get_const_strider  rL  r  r  r  rD   r&   r2   r  r3   r   arange_load_or_store_non_contiguousr  )r  r   r   
tiling_varstrider\  r   r  s           rX   rL  zCppVecOverrides.index_expr  sC   !(L11111((..X&qx':;
//zBBQ;;**4777(,'' %,,7LT7R7R (  C Le,,E%** $X__UF33FFX;;eUAH$4 F 	lT5M2>>>rW   c           
         d|  dd|  df}t          d |D                       rt          d |D                       S t          | j                 }t          j        j        rt          j        j        nt          j        j        }t                      }t          j        j	        
                    t          j                  }t          j        j	        
                    | j                  }|                    d| fi            |                    d| fi            t          j                            | j                  }|d	k    rd
| dnd| d| d}|                    |d	k    rd| dnd| d| d           |                    | d| d           |                    d           |                                5  |                    d| dt          j        j         d           |                    |  dt#          |           d           |                    dt          j        j         d           |                    d| dt          j        j         d           |                    dt#          |           d           |                                5  |                    d           d d d            n# 1 swxY w Y   |                    |d	k    r| dt#          |           dn| d| d t#          |           d           |                    | d!| d"t#          |           d           d d d            n# 1 swxY w Y   |                    d#           t          j        j                            |           ||f}	t)          ||	          D ]*\  }
}t          j        j	                            |
|           +||fS )$Nr  r  r  c              3   b   K   | ]*}t           j        j                            |          d uV  +d S r  r  r  s     rX   r)  z(CppVecOverrides.frexp.<locals>.<genexpr>  r  rW   c              3   ^   K   | ](}t           j        j                            |          V  )d S r  r  r  s     rX   r)  z(CppVecOverrides.frexp.<locals>.<genexpr>  r  rW   r   r%  )r  r5   at::vec::Vectorized<r   at::vec::VectorizedN<r   zat::vec::Vectorized<int32_t> r   zat::vec::VectorizedN<int32_t, > r   rr  __at_align__ std::array<	> tmpbuf;.store(tmpbuf.data(), r   z!__at_align__ std::array<int32_t, z> tmpbuf_exponent;z> tmpbuf_mantissa;r   r   z@tmpbuf_mantissa[i] = std::frexp(tmpbuf[i], &tmpbuf_exponent[i]);z? = at::vec::Vectorized<int32_t>::loadu(tmpbuf_exponent.data(), z! = at::vec::VectorizedN<int32_t, z!>::loadu(tmpbuf_exponent.data(), r   z ::loadu(tmpbuf_mantissa.data(), z();)r.  r   rH   r   r4   rh  	tail_sizer/  r7   r  r  r   r  r  _get_num_vectorsr   r   rE   r  r  r  r  )r  r   r   rF  r   r!  r"  n_vec
mantissa_tr#  r  r$  s               rX   r%  zCppVecOverrides.frexp  s   %a%%%'7'7'7'77
WWJWWWWW 	VUU*UUUUUUag&%&X%7Sqx!!QX=S~~8<&&U[&998<&&QW&55!b999!b999))!'22 zz -6,,,,;;;5;;; 	
 	zz 8H7777F%FF8FFF	
 	
 	

 	*22x222333w[[]] 	 	NNV6VVQX5KVVV   NNaLL{47H7HLLLMMMNN^AH4J^^^   NN_6__QX5K___   NNK+d2C2CKKKLLL  V                 NNA:: qq\ghl\m\mqqqq   A  A5  A  Akvw{k|k|  A  A  A  
 NNaa
aaKX\L]L]aaa  +	 	 	 	 	 	 	 	 	 	 	 	 	 	 	0 	u	%%%h'"%j(";"; 	1 	1IwHLY0000!!s8   C!M#K9MK			MK		A4MMMc                     fd}|S )Nc                  \   |rJ t           j        }t          |t                    sJ t	                      }|                    d           | d         j        }|                    |          }|j        r|j        n|j	        }g }t          |         }j        dv }	|	rdn|}
j        dk    rt          | d                  n|
}
|                                5  t          |           D ]\  }}t          |t                    r|j        sJ |j        |k    sJ |                    d| d|j	         d	| d
           |                    | d| dt!          |           d           |                    d| d           |                    |           |                    d|
 d|j	         d            | }|                    dt!          |           d           |                                5  |                    d| d
           d d d            n# 1 swxY w Y   |	r|j        rJ d}d| d| d}n(dt!          |           }|dk    rd|
 d}n	d|
 d| d}|                    d| d| d           d d d            n# 1 swxY w Y   |                    d           |S ) Nrr  r   )r  r  r  rl   r  rq  r   z> tmpbufr   z.store(tmpbufz	.data(), r   tmpbufz[i]z> tmpbuf_out;r   r   ztmpbuf_out[i] = ztmpbuf_out.data()at::vec::VecMask<,z>::fromztmpbuf_out.data(), r5   rn  z>::loaduz at::vec::VectorizedN<rO  r  rz  )r4   rh  r   r  r7   r   r   ru  rt  r/  rH   r  r   r   rG   r   rE   r  )r  r  rh  r   	vec_dtyperv  rF  scalar_argsr   output_maskoctypeargidxr  res	load_argsload_fnr  s                   rX   rg  z)CppVecOverrides._scalarize.<locals>.inner$  s   :XFfl33333>>DNN7###QI++I66E'-'7Q6##V=QDK!),F%. 3 K
  +6VVF  (,>>> T"X&& 
  B B#,T?? 0 0KFC!#~66 0"z))z"yI5555hvhhAUhh_ehhh   "WWWW+dBSBSWWW   $**+?F+?+?+?@@@@#**3////\v\\9M\\\   "k;/Ok$6G6GOOOPPP[[]] > >NN#<c#<#<#<===> > > > > > > > > > > > > > > 	U%//// 3II&II5IIIGG Ik$6G6G I IIzz"I"I"I"I"T6"T"TU"T"T"T@@@9@@@AAA?B B B B B B B B B B B B B B B@ NN4   Ks8   DJ4HJH	J!H	"AJJJrV   )r1  r  rg  s    ` rX   
_scalarizezCppVecOverrides._scalarize"  s$    7	 7	 7	 7	 7	r rW   c                 6   t          t                    }t          t                                                    D ]]\  }}t	          |t
                    rC||vr?|                     |j                  }||_        t          | |t          |                     ^d S r  )
r  r  r  r  r   r  r  r  r  r  )r1  vec_varsr   r  r  s        rX   _initialize_scalarizez%CppVecOverrides._initialize_scalarize_  s    (( ..4466 	7 	7LD&&,// 7D4H4H~~fo66 $T<#5#5666		7 	7rW   r  )Wr  r  r  r  r  r  r  r   rX  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rY  r^  ra  rd  rh  rk  rn  rp  r~  r  r  r  r  r  r  r  r  r  r	  r  r   r  r  r)  r+  r-  r  r  r'  r  r  r  r8  r  r  r  r  r:  r=  r>  rB  r  r  r  rV  rL  r%  rl  r  r  rm  rn  s   @rX   r  r  j  s       88[ [ [ [ [z   \   \   \   \   \   \   \   \   \ ( ( \(
   \   \   \   \ A A \A 	E 	E \	E @ @ \@ @ @ \@ A A \A A A \A   \   \   \   \   \   \   \   \     \    \   \   \   \   \   \   \   \   \   \   \ 0 0 \0 9 9 \9 9 9 \9 F F \F A A \A   \ 	! 	! \	! # # \#   \   \   \   \   \   \   \ % % \% $ $ \$ ! ! \! ! ! \! O O \O   \   \   \  H H \H   \ S S \S$   \ 1 1 \1 1 1 \1   \ 	] 	] \	]   \    \. 	 	 \	 J J \JX   \, 6" 6" \6"p : : [:x 7 7 [7 7 7 7 7rW   r  cppvecc                   $    e Zd Zed             ZdS )CppTile2DOverridesc                     t          t          j        t                    sJ t          j                            |           } t
                              | |          S r  )r   r4   rh  CppTile2DKerneltransform_indexingr  rL  )r  r   s     rX   rL  zCppTile2DOverrides.index_exprn  sE    !(O44444x**400))$666rW   N)r  r  r  r  rL  rV   rW   rX   r  r  m  s-        7 7 \7 7 7rW   r  c                   j    e Zd ZdZeZeZdZdZ	 fdZ
eefdZd Zd7dee         fd	Zej        d
             Z	 d8dej        fdZdej        defdZdej        dej        fdZdej        dej        fdZd Zdej        dej        dedefdZdedej        fdZ d7dZ!de"e#ef         ded ed!e$j%        fd"Z&d7dee'         fd#Z(d$ Z)	 	 d9d&Z*	 d:d'Z+d( Z,d) Z-d* Z.d+ Z/d, Z0d- Z1e2defd.            Z3d/ Z4ej        d0             Z5d1 Z6d2 Z7d3 Z8	 	 d;ded4ee         d5eej                 fd6Z9 xZ:S )<	CppKernela%  
    Base class for C++ kernel code generation in PyTorch Inductor.
    This class is responsible for generating C++ code from the intermediate representation.

    Args:
        args: Kernel arguments used for code generation
        num_threads: Number of threads for parallel execution
    r   r   c                 &   t                                          |           i | _        g | _        d | _        g | _        g | _        d | _        t                      | _	        g | _
        t                      | _        t                      | _        t                      | _        t                      | _        t                      | _        d| _        t                      | _        t                      | _        t'          | j        | j        d          | _        t'          | j        | j        d          | _        t'          | j        | j        d          | _        t                      | _        t                      | _        || _        i | _        g | _        d S )NFtmp_acc)name_prefixwelford_helpercascade_helper)r8  r9  active_rangesinner_itervarscall_rangesrY  rc  reduction_depthr=   reduction_prefixreduction_prefix_generatorsreduction_suffixparallel_reduction_prefixparallel_reduction_suffixlocal_reduction_initlocal_reduction_storesis_reductionnon_parallel_reduction_prefixnon_parallel_reduction_suffixr8   newvar_prefixsuffixreduction_csewelford_helper_csecascade_helper_csepreloads
poststoresnum_threadsreduction_omp_decreduction_var_names)r:  r  r  r=  s      rX   r9  zCppKernel.__init__  sq    HJ 35=A(*,.# . 0 0 <>( . 0 0)7)9)9&)7)9)9&$2$4$4!&4&6&6#!-;-=-=*-;-=-=* !3T[iXXX"%9I#
 #
 #
 #&9I#
 #
 #
 '(((**&=?.0   rW   c                    t           j        j        r!| j        s| j                            d           | d}t           j        j        rdnt                      }| d}	| j                            | d| d |||           d           | j                            t          ||||||                     | j	                            |	 d| d           | j
                            d| d	d
d| d ||||	|           ddg           d S )Nz(int max_threads = omp_get_max_threads();_localmax_threadsz	_arr[tid]r   r   r   zfor (int tid = 0; tid < z; tid++)r   r   r   r   )r   r  dynamic_threadsr  r   r+   r  r  r   r  r  r   )
r:  r   r   r   r   reduction_combine_fnreduction_init_fn	acc_localr  acc_local_in_arrays
             rX   _gen_parallel_reduction_buffersz)CppKernel._gen_parallel_reduction_buffers  s    :% 	d.L 	*44:   NNN	#Z7SMM=Q=S=S 	 !$...!++SS)SS(9(9.%(P(PSSS	
 	
 	
 	&--"! 		
 		
 		
 	#--1C.T.T	.T.T.TUUU&11@;@@@pspp33NCI[glmmmppp		
 	
 	
 	
 	
rW   c                 L    | j         D ]}t          | j        || d           d S )Nr  )r  r   stores)r:  var_names     rX   %update_stores_with_parallel_reductionz/CppKernel.update_stores_with_parallel_reduction  sB    0 	I 	IHT[(x4G4G4GHHHH	I 	IrW   Nr   c                    |J t                      }t          j                    5 }t          | d          rV|                    | j                   |                     |           |                    |                                           |                    | j	                   |                    | j
                   |                    | j                   d d d            n# 1 swxY w Y   t          | d          r|                    | j                   | j        r0| j        D ](}| j        |         \  }}t          ||| d||          })|S )Ncodegen_inner_loops_tail)r7   r   r   r   r  r  r  r   r   loadsr  r  r  r  r  r   )r:  r   r   r\  startends         rX   gen_bodyzCppKernel.gen_body  s   |||~~!## 	%ut233 3DM***((...##DKKMM222KK
###KK%%%KK$$$	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 4.// 	)KK((( 	X* X X!/4
s1$]]]ESVWWs   B5C''C+.C+c              #     K   | j         }|rYt          j        ||          }t          |t                    r/|j        }t          |t                    sJ t          j        |_	        || _         	 |V  || _         dS # || _         w xY w)z>Context manager to add an additional mask to loads and stores.N)

_load_maskr2   r  r   r3   r   rG   r   rl   r   )r:  rQ  priors      rX   rV  zCppKernel.masked  s        	(8D%((D$)) (z!$77777 #Z
	$JJJ#DOOOeDO####s   -A: :	Br5   r   r   c                 T    | j         |         }|||z  |z   i}t          ||          }|S r  )rc  r0   )r:  r   scaleitervar_idxrG  r   r   r   s           rX   scale_index_with_offsetz!CppKernel.scale_index_with_offset  s8     mK(C%K&01uk22	rW   r   c                 F    t          |                     |                    S )z
        Convert an index expr to a string that can be used in cpp code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the cpp kernel.
        )rD   rJ  r:  r   s     rX   index_to_strzCppKernel.index_to_str	  s     
 T))%00111rW   itervarc                 H     t           fd|j        D                       S )z]
        Check if an index has free symbol CppCSEVariable that depends on `itervar`.
        c              3      K   | ]k}|j         j        j        v t          j        j        |j                  t                    ?j        j        |j                                                V  ld S r  )r   r  varname_mapr   rG   
depends_on)r'  sr  r:  s     rX   r)  z6CppKernel.index_indirect_depends_on.<locals>.<genexpr>  sy       
 
v---48/7HH . H (33G<<----
 
rW   )rg   free_symbolsr:  r   r  s   ` `rX   index_indirect_depends_onz#CppKernel.index_indirect_depends_on  sF      
 
 
 
 
'
 
 
 
 
 	
rW   c                 @    ||j         v p|                     ||          S r  )r  r  r  s      rX   index_depends_onzCppKernel.index_depends_on  s.    %,, 
0N0N71
 1
 	
rW   c                 P    t          t          | j        | j                            S r  )dictr  rc  rY  r?  s    rX   
var_rangeszCppKernel.var_ranges   s    Ct{33444rW   r  rF  lowerupperc                    |s|sd S t          |t          j                  }|r6t          j        |t
          j                  j        }t          j	        j
        }n{t          j	        j
        }	 | j        t          j	        _
        t          j        |t
          j                  j        }|t          j	        _
        n# |t          j	        _
        w xY w| j        }|r2t          j	                            |                     |                    nd }	|                     ||rdnd |	| j                  }
| j                            ||
d           d S )N0F)
assignment)r   r   TMPr2   rL  r   r  r   r4   rh  r  r  sexprrJ  indirect_assertr  r  r  )r:  r  rF  r  r  indirectr  r   prior_computesize_strr   s              rX   check_boundszCppKernel.check_bounds#  s)     	 	F&tTX66 	 ^D%+66<FX%FF H,M1#': ek::@#0  = 0000ZFAFP18>>$"6"6t"<"<===D##5*CCdHdo
 
 	&$599999s   +:B7 7C
r   c                 @   | j                             |          }|                     |          }| dt          |           d}| j                            | j        |t          j        	                    |                    }|
                    d| ||fi            |S )N[]rm  r  )r  inputrJ  rE   r  r  r  r4   graph	get_dtyper  )r:  r   r   r   r   r  s         rX   r  zCppKernel.loadC  s    iood##$$U++--E**---""4:t17;L;LT;R;R"SSftT5&92>>>rW   c                 *   d|v sJ | j                             |          }|                     |          }|| dt          |           d| d}n|dk    rt          j        j        s$| j        dk    r| dt          |           d| d}n_t          j	        
                    |          }dt          |          d	| d
}d| dt          |           d| d}nt          d|           | j                            t          ||                     d S )Nbufr  ] = r   
atomic_addr5   z] += zstatic_cast<r  r   zatomic_add(&z], r   store mode=)r  outputrJ  rE   r   r  r  r  r4   r  r  rH   NotImplementedErrorr  r   r;   )r:  r   r   r   moder   r   r   s           rX   storezCppKernel.storeK  sK   }}}}it$$$$U++<<<K..<<E<<<DD\!!:- M$2Ba2G2GAAE 2 2AAAAA))$//F|E':FFeFFFLcLLK,>,>LL5LLL%&:D&:&:;;;l46677777rW   r   r   rtyper   c                 J    ddt           t                   ffd}|S )NrF  c                 Z    |  d d            dS t          |           S )Nr   r   r   )r   )rF  r   r   r   r   r  s    rX   rg  z.CppKernel._gen_reduction_prefix.<locals>.innerk  sW    |"EESEEWWUE-B-BEEEE-  rW   r  )r   rn   )r:  r   r   r  r   r   rg  s    ````` rX   _gen_reduction_prefixzCppKernel._gen_reduction_prefix]  sP    	 	 	 	 	 	 	 	 	 	 	 	 rW   c                 `    | j         D ]%}| j                             ||                     &d S r  )r  r  r  )r:  rF  gen_fns      rX   finalize_reduction_prefixz#CppKernel.finalize_reduction_prefixz  s@    6 	7 	7F!((6666	7 	7rW   c                 0   |dk    r| S |dk    r|t           j        k    r| j        J t          j        t
          j        | j        | j        d                    }t          j	        j
        r|}nt          |t                                }d}	 t          j        j                            |          }n# t"          $ r Y dS w xY w||k    r't          j        j                            ||           dS t          j        j                            ||           dS )Nrh   rd      TF)r   ro   r  rU  rV  rW  rX  r  r   r  r  r   r+   r4   r  sizevars	size_hint	Exceptioncheck_lt	check_leq)r:  r   r   
use_scalarreduction_sizert_size
chunk_sizert_size_hints           rX   need_use_acc_helperzCppKernel.need_use_acc_helper~  s.    ---!>! U""u';';#///&-d.t/C/E/EF N z) J )!.2F2H2HII J w/99'BB   tt j(( ))*g>>>t **7J???us   $B3 3
C CFc           
         |rt          ||          n|}t          |          }|dv sJ |dk    rdnd}	t          ||	          }
|dk    rdnd}|rt          |         }n2t          | d          r|                     |          nt          |         }| d| d	|	 d
| d| d
}|dk    r|S t          |
t          j                  r|
dk    rd| S |S )N)rh   rd   rh   i   r   WelfordHelperCascadeSumHelperrV  <r   rp  r  r   rd   r5   zstatic )r   rE   rH   r   rV  r   r   rZ  )r:  r   r   helper_ranger   r  r  num_range_threadnum_range_thread_exprr	  
num_chunkshelper_typeh_typehelper_init_lines                 rX   _acc_helper_initzCppKernel._acc_helper_init  s^    3>OGL+...< 	 !,,< = =!:::::+/???TTU
-z::
 !111 O# 	
  	!%(FF 411)""5)))!%(    V  z  Z  $   	 U""##j%-00 	$Z1__ 0-///##rW   c           
      >   t           j        j        rdnt                      }| j                            |                     ||||d |                     | j                            |                     ||||||                     |r|n| d}|dk    rH| j                            | d| d| d           | j	                            | d| d| d           d S | j                            | d	| d           | j	                            | d
| d           d S )Nr  _vecrh   z = welford_combine(r   r   z_local = welford_combine(z	_local, &z = cascade_sum_final(&z_local = cascade_sum_final(&)
r   r  r  r+   r  r   r  r  r  r  )	r:  r   r   r   r  r   r  r  r  s	            rX   _use_acc_helperzCppKernel._use_acc_helper  s    $Z7SMM=Q=S=S 	 	*44!!
L%z 	
 	
 	

 	!++!!
L%j 	
 	
 	

 #43---.88GGfGGGGG   '11SSFSSZSSS     .88?????   '11EEzEEE    rW   c           
         |dv }|||f}|| j         j        v r| j         j        |         S | j                             | j        d| d          }| j                            |            d| _        |r|n|}t          ||          }	| j                            | 	                    ||	||t                               |                     ||d          rt          j        t          j        | j        | j        d                    }
| j                            | j        d| d          }d| }|                     ||||
|d           | j                            | dt/          ||||           d	           n| j        J | j        | j                 }t3          | j        d
z   t5          | j                            D ] }|| j        |         z  | j        |         z   }!| j                            | dt/          ||||           d	           |                     ||	||           t9          ||          }|| j         j        |<   |S )Nrb   ra   
reduction FwriteTscalar_r  r   r   r5   r   )r  reduction_cacher  r  r  r  r  r   r  r  r   r  rU  rV  rW  rX  rY  r  r  r  r  r  r   r   rc  rS  r   r  r   )r:  r   r   r   r   argmax_or_argminreduction_keyr   
init_dtyper   r  r   scalar_helper_valr   r   r  s                   rX   	reductionzCppKernel.reduction  s   )-AA!>58D.>>>%5mDD ))J4]44E * 
 
 	 ''3111 "2=YY
%njAA(//&&X~z> 	
 	
 	
 ##NE4@@ 	&-dk$*>*@*@A N 099:=::% :  J !7* 6 6  ! !    K!!^^,^S%IZ[[^^^    '333M$"67E4/!3S5G5GHH B BA.q1AAK!!XX,^S%uUUUXXX   	,,S(NJWWW">377<B*=9rW   c                     |                      |          }| j                            |          }| j                            t          || dt          |           d| d                     d S )Nr  r  r   )rJ  r  r  r  r   r;   rE   )r:  r   r   r   r   s        rX   store_reductionzCppKernel.store_reduction)	  s}    $$U++it$$''#HHE(:(:HHHHHII	
 	
 	
 	
 	
rW   c                 J     j         rt j         t          |          t          |          z   k    s1J  j          dt          |           dt          |                        j        t          |          k    sJ nt          |          t          |          z    _          fd j         D              _        d t          t           j                            D              _        t          |           _         j        d  j                  j         j        d          fS )Nr  r   c                 :    g | ]}                     |          S rV   rJ  )r'  r  r:  s     rX   rd  z(CppKernel.set_ranges.<locals>.<listcomp>8	  s'    MMMq4//22MMMrW   c                 B    g | ]}t          t          j        |          S rV   r.   r   XBLOCKr'  ns     rX   rd  z(CppKernel.set_ranges.<locals>.<listcomp>9	  s4        /t{A>>  rW   )r  r   r  r   rY  rS  rc  )r:  lengthsreduction_lengthss   `  rX   
set_rangeszCppKernel.set_ranges0	  s5    	0#uW~~>O8P8P'PPPP#VVwVVEBS<T<TVV QPP '3w<<77777$W~~6G0H0HHDMMMMD<LMMMDK s4;//00  DM $'w<<D M0D001M$.001
 	
rW   c                     | j         J t          j        j                            t          | j                   d          S )N    fallback)r  r4   r  r  r  r/   r?  s    rX   r  zCppKernel.size_hintC	  sC    +++w))$*++d * 
 
 	
rW   c                     t           t                    sJ t                       j        J t          |j        t
                    r.|j                            |                                          n(                     |                                          |j        d uo|j        j	                 j
        t          j                    5 }j        rBr                                 n                               |                               nAdk    r;                                r'|                                                               dt(          ffddd	 ddt(          dt*          ffdddt(          dt*          ffd		 	 ddt(          dt*          d
t,          ffd|                                                               t          |j        t
                    rt          t.          j        t2                    rt.          j        j        rt.          j        j        }|                                D ]}t9           fd|                                j        D                       }t>          |                                j                  }d| dtC          |           d}	|"                                }
#                    d| d|
 d|	 d           #                    | d|
 d|
 d            |           d d d            d S # 1 swxY w Y   d S )Nr5   
_loop_nestc                      fd}                                  }t          |t                    r|j        D ]              d S t          |t                    sJ  j         |            r|                                 t          j                    5 }|	                    
                                           |                               d d d            d S # 1 swxY w Y   d S )Nc                  V    j         sJ j         j                 } | j        o| j        S r  )rN  r  r  parallel)rootr;  	par_depths    rX   is_parallel_reductionzOCppKernel.codegen_loops_impl.<locals>.gen_kernel.<locals>.is_parallel_reductiong	  s2    %++++%+I,ABD,>>rW   )
get_kernelr   rf  rg  CppKernelProxyrN  r  r   r   r   r   r  )r;  rA  rh  r   r   gen_loop_nestr@  s   `   rX   
gen_kernelz0CppKernel.codegen_loops_impl.<locals>.gen_kernelf	  sV   ? ? ? ? ? ?
 $..00f&:;; 	.&,l 2 2
%j11112 2 &fn=====!'38M8M8O8O3DDFFF#-// .5++DKKMM:::---. . . . . . . . . . . . . . . . . .s   =C""C&)C&Fc                     |r | j         }|r| j        |z   }n
| j        |z   }|S | j        }|r|| j        z   }n
|| j        z   }|S r  )r  r  r  r  r  r  )rh  r>  	is_suffixr  prefixs        rX   get_reduction_prefix_suffixzACppKernel.codegen_loops_impl.<locals>.get_reduction_prefix_suffixx	  sn     "#4F O!'!AF!J!'!E!N!M#4F O!'&*J!J!'&*N!N!MrW   r   depthc                    |                                  }| j        sJ | j        |         }t          j                    5 }|j        rS|sQ 	||j        d          }|r'|                                                                                   |           
rF|j        r?                               |j	        r#|j
        sJ                     |j	                    | |           
r<|j        r5|j
        r                    |j
                                                    |j        r(|s&                     	||j        d                     d d d            d S # 1 swxY w Y   d S )NF)rG  T)rB  rN  r   r   r  r>  r   r   r  r  r  close)r;  rJ  in_reductionrh  loopstack_outerr  r   gen_loop_atrI  is_reduction_loopthreadsworksharings          rX   gen_loop_with_reductionz=CppKernel.codegen_loops_impl.<locals>.gen_loop_with_reduction	  s    $..00!''''!'.)++ {( 6 6+F+F"DMU, , ,( , E'55dkkmmDDD$4555( ET] E#,,W555!6 E#)#@@@#@ KK(CDDDK
E222( ,T] ,!8 G KK(EFFF#))+++(  77 &    +                 s   DE((E,/E,c                 v   t          j                    5 }| j        sJ | j        |         }|                                }|	 d d d            d S                     |           |                                                                | |dz   |j                   d d d            d S # 1 swxY w Y   d S r   )r   r   rN  linesr   r   r   r  )r;  rJ  r   rN  
loop_linesr   rD  s        rX   rP  z1CppKernel.codegen_loops_impl.<locals>.gen_loop_at	  s3   )++ Lu%++++%+E2D!%J!)L L L L L L L L OOJ///''666!M*eai9JKKKL L L L L L L L L L L L L L L L L Ls   .B.AB..B25B2rM  c                 z    | j         |t          | j                   k    r |            d S  | ||           d S r  )rN  r   )r;  rJ  rM  rE  rT  s      rX   rD  z3CppKernel.codegen_loops_impl.<locals>.gen_loop_nest	  sT    
 #+uJ<L8M8M/M/MJz*****++J|LLLLLrW   c                 :    g | ]}                     |          S rV   r-  )r'  size_valr:  s     rX   rd  z0CppKernel.codegen_loops_impl.<locals>.<listcomp>	  s7        ( !00::  rW   zstd::make_unique<z []>(r   zstd::unique_ptr<z	 []> buf_r   r   z* z = buf_z.get();)FF)r   F)r   )$r   rC  r+   r  rh  rf  decide_parallel_depthmax_parallel_depthrN  r  r  r   r   r  rL  r>  mark_parallelsingler   r   rR  rn   rl   r4   local_buffer_contextrK   local_buffersvaluesr/   
get_layoutrF  rH   r   rD   get_namer  )r:  rT  r   rS  r   r`  local_bufferlocal_buf_sizelocal_buf_dtypeallocatelocal_buffer_namerE  rP  rD  rT  rI  rQ  r@  rR  s   ` ``       @@@@@@@@rX   codegen_loops_implzCppKernel.codegen_loops_implI	  s   $/////&((+++i&(<== 	!(>>,,.. II 22,,.. I
 O4' D	 56C 	 !## ~	%u' 	7$ 2%%''''((111''	22221%%'' 7''666.x . . . . . . . .$" " " "" DI $-0          B	L 	L 	L 	L 	L 	L 	L 	L 	L 	L %*M M$MM #M M M M M M M ... 9+-ABBq57IJJ *8 !" 6 D$1$8$8$:$:  L%2   ,8,C,C,E,E,J  & &N '3<3J3J3L3L3R&SOa?aa~I^I^aaaH(4(=(=(?(?%KKf?ffEVff[cfff   KK*bb.?bbHYbbb    M)$$$}~	% ~	% ~	% ~	% ~	% ~	% ~	% ~	% ~	% ~	% ~	% ~	% ~	% ~	% ~	% ~	% ~	% ~	%s   IL::L>L>c                 h    t                               |           }|                     |||           d S r  )rR  buildri  )r:  r   rS  rT  s       rX   codegen_loopszCppKernel.codegen_loops	  s1    NN4((		4=====rW   c                 ,    t           j        j        rdS dS )NAOTI_TORCH_CHECKTORCH_CHECK)r4   r  aot_moder?  s    rX   assert_functionzCppKernel.assert_function	  s    7 	!%% =rW   c                    | j         J | j         |j        |j        |j        z            }|                                 }d}d}|D ]b}t          j        j                            |d          }|d|z  k    s||k    r n*||z  t          j        j	        k     r n|dz  }||z  }||z  }ct          j        j
        r|dk    rt          |          dk    rd}t          ||j                  S )Nr5   r   r7  r8  r   r  r  )r  r  r  r  r4   r  r  r   r  min_chunk_sizer  r   r  )	r:  r\  rR  rY  seqparrJ  r  hints	            rX   r[  zCppKernel.decide_parallel_depth	  s   +++!*".1C1RR

 nn 		 		D7#--dT-BBDa'k!!SG^^g~
 999QJE4KC4KCC :% 	%1**VqE .@.L
 
 
 	
rW   c              #     K   | j         | j        | j        | j        f}t	                      | _         t	                      | _        t	                      | _        | j                                        | _        d V  | j                            | j                    | j                            | j                   | j                            | j                   |\  | _         | _        | _        | _        d S r  )r  r  r  r  r=   cloner  r  )r:  r  s     rX   write_to_suffixzCppKernel.write_to_suffix
  s      T\4;A#%%
%''$&&8>>##$$TZ000$$T\222$$T[111<A9T\4;rW   c                     t          |i |S r  )rG   )r:  r  r  s      rX   create_cse_varzCppKernel.create_cse_var
  s    t.v...rW   c                 *    dt           |          d| dS )Nzc10::convert<r  r   )rH   )r:  srcr   r   s       rX   r  zCppKernel.get_to_dtype_expr
  s     <|E2<<c<<<<rW   c                 j    |                      |||          }| j                            ||           d S r  )r  r  r  )r:  dst	dst_dtyper~  r   r  s         rX   r  zCppKernel.cache_dtype_convert
  s5    %%c9i@@T3rW   rH  r   c                 `   
 |d} j         sdS g 

 fd}|+| j         v sJ  j         |         \  }} ||||          sdS n4 j                                         D ]\  }}|\  }} ||||          s dS d                    
          }	|	r|                    d| d|	 d           dS dS )	NrS   Tc                 f   | |k    rdS d }t          j                  D ]\  }}||k    r|} nt                    t          k    r|r| dk    r|j        |         k    rd}                    | dt          |                                           | dt          |                      dS )NFr   r5   r  r   T)r   rc  r%  r  rY  r  rE   )r  r  r   var_idr   _var
conditionsr:  s         rX   genz)CppKernel.codegen_conditions.<locals>.gen%
  s    ||uF$T]33  4$;;FE  T

i'' (QJJ4;v...>>+e*<*<>>???;;S)9)9;;<<<4rW   FrX  zif(r  r   )r  r  joinr   )r:  r   rH  r   r  r  r  r  _rangejoined_conditionsr  s   `         @rX   codegen_conditionszCppKernel.codegen_conditions
  s)    >F! 	4
	 	 	 	 	 	& ?$,,,,,+C0JE33uc3'' u !% 2 8 8 : : ! !f#
ss5#t,, ! 55!"KK
33 	NN???*;???@@@45rW   r  )r5   r  r   NFFNN);r  r  r  r  r  r[  rD   r  r  r  r9  r   r   r  r  r   r7   r  r   contextmanagerrV  r   r  r  rp   r  r  r  r  r  rl   r  r  r  r	   r9   r   r   r  rn   r  r  r  r  r(  r*  r5  r  ri  rl  propertyrq  r[  rz  r|  r  r  r  rm  rn  s   @rX   r  r  u  s         IEMF'1 '1 '1 '1 '1^ /('
 '
 '
 '
RI I I Xl3    ( $ $ $& BC Z   2%* 2 2 2 2 2	
uz 	
EL 	
 	
 	
 	

ej 
5< 
 
 
 

5 5 5:j: j: 	:
 : : : :@ UZ    8 8 8 8$;#$  	
 {   :7 7hsm 7 7 7 7' ' '^ *$ *$ *$ *$Z PU   @4 4 4l
 
 

 
 
&
 
 
O% O% O%b> > > ! ! ! ! X!
 
 
: 
B 
B 
B/ / /= = =      !%&*	. .. . el#	. . . . . . . .rW   r  c                       e Zd ZeZ	 d( fd	Zdej        dej        fdZ	de
j        defdZde
j        defd	Zde
j        defd
Ze
j        fde
j        defdZdede
j        defdZ	 d(dedej        de
j        dee         fdZ	 	 	 d)dee         dej        de
j        dee         deeeef                  dedee         fdZdedej        f fdZ	 d*deeef         dedej        de
j        def
dZd(dZd Zd ZdedefdZ dedej        defd Z!d! Z"d" Z#ddde
j$        fdeej                 d#ee         d$ee
j                 fd%Z%d( fd&	Z& fd'Z' xZ(S )+r  Nc                     t                                          ||           t          j                    | _        | j        sJ |dk    s
J d            || _        || _        || _        |r|n|| _        d S )Nr   z0Expect pass in Non-Zero tiling_factor explicitly)	r8  r9  r   pick_vec_isavec_isar/  rd  rt  	num_elems)r:  r  r  r/  rd  rt  r=  s         rX   r9  zCppVecKernel.__init__M
  s     	{+++"/11||q   "T   *$"&/B]rW   r   r  c                                            ||          rd S  fd|j        D             D ]#}t          |t                    sJ |j        r d S $t          || j                  }|j        r|nd S )Nc              3   z   K   | ]5}t          |t          j                  j        j        |j                 V  6d S r  r   r   r  r  r  r   r'  r  r:  s     rX   r)  z5CppVecKernel._try_get_const_stride.<locals>.<genexpr>a
  sU       
 
a**
H (
 
 
 
 
 
rW   )r  r  r   rG   r   r  r/  r  )r:  r   r  indirect_varri  s   `    rX   re  z"CppVecKernel._try_get_const_stride^
  s    ))%99 	4
 
 
 
'
 
 
 	 	L
 lN;;;;;" tt$UGT5GHH)3vvt3rW   r   r   c                     t          j        | j        |j        z  dz  | j                                        z            }|dk    sJ |S )N   r5   )mathr  r/  itemsizer  	bit_widthr:  r   num_vectorss      rX   ru  zCppVecKernel._get_num_vectorsl
  sQ    i/!3dl6L6L6N6NN
 
 arW   c                 Z    | j         |j        z  dz  | j                                        z  S )Nr  )r/  r  r  r  )r:  r   s     rX   r.  z!CppVecKernel._get_raw_num_vectorss
  s,     !EN2Q69O9O9Q9QQQrW   c                     |                      |          }|dk    rdt          |          dS dt          |          d| dS )Nr5   rn  r   ro  r}  )ru  rH   r  s      rX   rV  zCppVecKernel._get_vec_typex
  sT    ++E22!@,u*=@@@@O<+>OOOOOOrW   c                 x    |t           j        k    rdS |                     |          }dt          |          d| dS )NrS   r|  r}  r   )r   rl   ru  rH   r  s      rX   r  zCppVecKernel._get_mask_type
  sE    EJ2++E22G<#6GGGGGGrW   rQ  c                     |j         t          j        k    sJ t          |                      |                     |          }| dt
          |          d| dS )Nz.template cast<r}  r   )r   r   rl   reprru  rH   )r:  rQ  r   r  s       rX   rA  zCppVecKernel._get_mask_cast
  s\    zUZ'''d'''++E22MM|E':MM[MMMMrW   r   	load_maskc                    t           |         }|                     |          }d}|rN|j        s&|                     t          j                   d| d}n!|                     |t          j                   }|dk    r| dt          |           n|}|t          j        k    r|                                  d| d}	n@|r| d| d| d| dn/| 	                    |           d	| d
t          | j
                   d}	|	S )a  
        Get a load line str that loads a vector from `var` at `index` of type `dtype`.
        If `load_mask` is not None, we do a masked load accordingly.
        Notes on the `dtype`:
        1. We always load `self.tiling_factor` number of elements regardless of the `dtype`.
           It means we load half of the vector lanes for 16-bit data types and quarter of the
           vector lanes for 8-bit data types.
        2. `torch.bool` and `torch.uint8` could mean masks and we load them as float mask vectors.
        NrU  r   r   r   z.template loadu<r}  r  ::loadu(r   )rH   ru  r   r  r   ro   rA  rE   rl   rV  r  )
r:  r   r   r   r  cpp_typer  load_mask_strloadbufr   s
             rX   _get_vec_load_linezCppVecKernel._get_vec_load_line
  sI      &++E22 	Q# Q#'#6#6u{#C#C X XI X X X#'#6#6y%+#N#N P5:aZZS11[//111SEJ))++>>G>>>DD !e=VV(VV[VVGVVVV**511dd7ddkRVR`FaFaddd 
 rW   Fr   store_value
accu_storec                     |r|
J d            |r|sJ  j         dt          j        dt          f fddt          j        dt          f fddt          dt          f fd}t                      }|                    d	           |                                5   |          }	 |          }
d
t          |          d|
 d}|                    |           |r(|                    | dt          |	           d           t           j         j                  d          }i } fd|j        D             D ]6}t          |t                    sJ |j        r ||          }| d| d||<   7                     | j        |          }d} j        Y|r
J d            t           j        t                    sJ  j                     j        j        r j         d| d}n
 j         d}t%          j                    r|                    d j                    n|                    d j                    |                    d| d| dt           j                   dz   | dz              |                                5  t-          j                    5 }t          |          }|D ]%}t1          j        d| z   dz   ||         |          }&|| d| dn| }|r@|                    d | d           |                    |                                           |r%|rd!nd"}|                    | d#| d$| d%           n|                    d&| d'| d(           ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   |s0                     d)d*|          }|                    d+| d(           ddd           n# 1 swxY w Y   |                    d,           |r,|                    d(                               |           dS  j                            ||-          }t          |t                    sJ d.|_        |S )/a  
        Load or store a vector in a non-contiguous way. The vector is initialized from an array that is
        filled in an inner loop over the tiling factor.
        :param var: buffer to load from or store to, i.e. `var[transformed(index)]`. If None, we load the index
                    as index expression, i.e. `transformed(index)`.
        :param index: index into the `var` or the index expression by its own if `var` is None.
                      The `index` could contain indirect indexing or the tiling itervar. When used in
                      the inner loop, the index is transformed as follows:
                      1. the index is linearized along the tiling dim.
                      2. the indirect indexing vector variables are transformed into arrays over the tiling dim.
        :param dtype: data type of `var` or `index` if `var` is None.
        :param buffer: the code buffer to write the generated code to. If None, we write to `self.loads`.
        :param store_value: the value to store. If None, we load the vector.
        :param accu_store: whether accumulate the store_value to store_ptr. If True, a store_value should be provided
        :return: a CppCSEVariable that represents the loaded vector or None if it is a store.
        Nzstore var must be providedr   r   c                 L    | j         dk     rj        d| j         z  z  S j        S N   )r  r  r   r:  s    rX   get_result_sizezCCppVecKernel._load_or_store_non_contiguous.<locals>.get_result_size
  s,    ~!!~en)<==~%rW   c                 L    | j         dk     rj        d| j         z  z  S j        S r  )r  r/  r  s    rX   get_tiling_sizezCCppVecKernel._load_or_store_non_contiguous.<locals>.get_tiling_size
  s.    ~!!)Q%.-@AA))rW   vec_varc                    | j         sJ t                      }|                    d           |                                5  | j        }|J |t
          j        k    rt
          j        } |          } 	|          }|                    dt          |          d| d           |  dt          |           d}|                    |           |                    d           d d d            n# 1 swxY w Y   |                    d           
j
                            |          }t          |t                    sJ |S )	NrX  rq  r   rr  rs  r   zreturn tmpbuf;rz  )r   r7   r   r   r   r   rl   ro   rH   rE   r  r  r   rG   )r  r   r~  result_sizetiling_sizer   r  r   r  r  r:  s          rX   vec_to_arrayz@CppVecKernel._load_or_store_non_contiguous.<locals>.vec_to_array
  s   >!!>>>DNN5!!! 1 1#M	 ,,,
** %I-oi88-oi88`|I/F``+```   "UU[9Q9QUUUt$$$/0001 1 1 1 1 1 1 1 1 1 1 1 1 1 1 NN4   X&&vt44Ffn55555Ms   B$C22C69C6rX  rq  r   rr  rs  r   rP  c              3   z   K   | ]5}t          |t          j                  j        j        |j                 V  6d S r  r  r  s     rX   r)  z=CppVecKernel._load_or_store_non_contiguous.<locals>.<genexpr>  sU       ! !!!TX..!$QV,! ! ! ! ! !rW   r  r  r  rG  zunexpected store with load maskz.is_masked(r   z != 0z#pragma GCC unroll z#pragma unroll 
for (long  = 0; r   r   r   r   rY  +==r   z tmpbuf[r   ztmpbuf[r  r   ztmpbuf.data()r   rO  rz  rm  T)r  r   r   rn   rG   r7   r   r   rH   rE   r-   rc  rd  r  r   r   r  r  r   is_gccr/  r  r   r   r   r   r   r  r  r  r  )r:  r   r   r   r   r  r  r  r   r  r  result_declareitervar_innerreplacementsr  	array_varr  r   index_crhsr   	load_liner  r  r  s   `   `                  @@rX   rg  z*CppVecKernel._load_or_store_non_contiguous
  s   2 O#//3O//1 	;>ZF	&5; 	&3 	& 	& 	& 	& 	& 	&	*5; 	*3 	* 	* 	* 	* 	* 	*	. 	^ 	 	 	 	 	 	 	 	 	, ~~u[[]] ?	7 ?	7)/%00K)/%00KX<+>XX+XXX  NN>*** "VV+k:R:RVVV   /=1999 M L! ! ! !+! ! ! Q Q
 ",?????& Q ,\ : :I4=1P1P1P1P1PL.004?= 1  E I*&II(III!$/>BBSSDOSSB?) :#'? O O} O O OII#'? 9 9 9I!## GIT5GIIJJJJE1CEEFFFNN2]222"FF{4>'B'BFFFG"'''(  
  H H
 4 6 6 H%%e,,$0  L f< 11E9$\2 GG
 .1_))w))))W, 7NN#6)#6#6#6777''666 H*4"=$$#KNNc#R#RK#R#R#R#R#RSSSSNN#F]#F#F#F#F#FGGG!H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H"  7 33OQNN	5555666?	7 ?	7 ?	7 ?	7 ?	7 ?	7 ?	7 ?	7 ?	7 ?	7 ?	7 ?	7 ?	7 ?	7 ?	7@ 	t 	NN3MM$4X&&vt5&AAFfn55555 FMMs\   "G3ON)C
M?3N?NNNN
ON	ON	5OO#&O#r   c                 n   | j                             |          }|                     |          }t          j                            |          }| j        | j                 }|                     ||          }|dk    r"t                      
                    ||          S |dk    r@|                     |||| j                  }| j                            | j        ||          }n|                     |||          }t#          |t$                    sJ |                    d| ||fi            d|_        |S )Nr   r5   rm  r  T)r  r  rJ  r4   r  r  rc  rd  re  r8  r  r  r  r  r  r  rg  r   rG   r  r   )
r:  r   r   r   r   rh  ri  r   r  r=  s
            rX   r  zCppVecKernel.load=  s   iood##$$U++!!$'']4?3
++E:>>Q;;77<<e,,,q[[**3udoNNDX&&tz4u&EEFF77UEJJF&.11111ftT5&92>>>rW   r   c           	         t          |t                    s&t          |t                    r|j        s
J |            | j        | j                 }| dt          |           }|                     ||          }t                      }	|dk    r|rk|t          j
        k    r"| j        |                     |           d| dn/|                     |           d| dt          | j                   d}
d| d|
 d}|t          j
        k    r#| j        |	                    | d| d	           nL|	                    | d| dt          | j                   d	           n|                     ||||	||
           |	S )a2  
        Get a store line buffer that stores `value` into `var` at `index` of `dtype`. It handles
        both contiguous and non-contiguous store cases.
        :param value: Vectorized type templaterized on `dtype`.
        :param var: buffer to store into.
        :index: index into the `var`.
        r   r5   Nr  r   r   r  .store(r   )r   r  r  )r   rp   rG   r   rc  rd  rE   re  r=   r   ro   rt  rV  r  r   rg  )r:  r   r   r   r   r  rh  var_exprri  r   r  s              rX   _get_store_linezCppVecKernel._get_store_lineQ  s   " %%% 	un--	27,	 		 	 
 ]4?3
22k%0022++E:>>Q;; . ++0F ))%00EE(EEEE ..u55iixii;W[WeKfKfiii 
 .E--d---##(>%<<<<<====PPXPPT^1L1LPPP    ..UE$Ej /    rW   c                 B   dv sJ t          |t                    s
J |            |j        s|                     |          }| j                                      }|                     |          }t          j        	                              }|J| 
                    ||||          }| j                            |                    fd                     d S |dk    r+t          j        j        sX| j        dk    rM| 
                    | |||d          }| j                            |                    fd                     d S |                     |          }|                     t&          j                  }	t*          |         }
t-          j        |t&          j                  j        }t          |t                    r|j        sJ d|
 d	|	 d	| d
| d	| d	| d}| j                            t5          |                     d S t7          d|           )Nr  c                 $    t          |           S r  r;   r  r   s    rX   <lambda>z$CppVecKernel.store.<locals>.<lambda>  s    ,tQ2G2G rW   r  r5   T)r  c                 $    t          |           S r  r  r  s    rX   r  z$CppVecKernel.store.<locals>.<lambda>  s    l46K6K rW   zatomic_add_vec<r   r  r   r  )r   rG   r   r  r  r  rJ  r4   r  r  r  r  r  mapr   r  r  r  ru  r   r  rH   r2   rL  r   r   r;   r  )r:  r   r   r   r  r   r   r   n_srcn_idxr   r   s    `          rX   r  zCppVecKernel.store}  s6   }}}}%0077%770| 	*NN5))Eit$$$$U++!!$''<''sE5AADKtxx(G(G(G(GHHIIIII\!!:- @$2Ba2G2G++J# ,   ""488,K,K,K,K#L#LMMMMM--e44--ek::%e,uek::@!%88IU\III^^^5^^E^^S^^E^^UZ^^^%%l4&>&>?????%&:D&:&:;;;rW   c           
         |t           v sJ |dv }| j        | j        k    }|r|n|}t          |t                    s
J |            |j        s|                     |          }|||f}|| j        j        v r| j        j        |         S d}	|	 dt          |          d}
t          ||          }|                     ||          }| j                            | j        d| d          }t          |t                    sJ | d}d	| }d	| }| xj        | ||gz  c_        d
| _        | j                            |                     ||||t&                               | j                            |                     ||||| j                             |                     ||d          }|r| j                            |                     ||||| j                             | j        J t-          j        t0          j        | j        | j        d                   }|dk    r&| j                            | j        d| d          }n%| j                            | j        d| d          }d	| }t=          | j        | j                 | j                  rX| j        | j        k    rFt=          || j        | j                           t=          | j        | j                 | j                  z  n|ntA          j!        d          }| j        | j                 | j        z  r2| j        | j        k    r t=          || j        | j                           n|ntA          j!        d          }|dk    r d| }| "                    |||||d
           | "                    |||||           | "                    |||||           | j#        r|n|}| j#        r|n|}|dk    r7| j$        %                    | d| &                    ||||           d           n| j$        %                    | d| &                    ||||           d           n| j        J | j'        | j                 }tQ          | j        dz   tS          | j'                            D ] }|| j        |         z  | j'        |         z   }!||||d}| j$        %                    | d | j&        ||fi | d           | *                    ||||| j&        | j                   | *                    ||||tV          t&                     |r%| *                    ||||| j&        | j                   |tX          j-        k    }|r`t]          |          r]| /                    |          dv s
J d            d| d}d| d}| j0        %                    | dtW          |||           d           n|r	| d| d}n|r|dv rd| d}n|dk    sJ | d}nd | &                    |d!d"          z   d#z   } |tX          j-        k    }|rtX          j1        n|}!d$t          |!          d}
d%t          |!          d&| /                    |!           d}"| }#|r|dk    sJ | d'| }#|" d(|
 d)|
 d*|  d&|# d
}| j0        %                    | dtW          ||||+           d           |}$nx|}$t]          |          r5d	|$ }%| j0        %                    |$ dtW          ||$|%           d           n2|r0|dk    sJ d	|$ }%| j0        %                    |$ d|$ d'|% d           te          ||$          }&|&| j        j        |<   |&S ),aw  
        Perform vectorized reduction operation.

        This method handles vectorized reduction for different reduction types.
        It manages special cases for low-precision floating point types and
        employs precision improvement techniques for certain reduction operations.

        Args:
            dtype: The output data type for the reduction result
            src_dtype: The source data type of the input value
            reduction_type: Type of reduction operation (sum, min, max, etc.)
            value: The input value to reduce

        Returns:
            The result of the reduction operation
        r  zat::vecz::Vectorized<r   r  Fr  r  masked_TNrh   r   rd   r   r!  r   r   r5   )r   r   horizontal_reductionr   )r  r  )r5   r   z4Welford reduction does not support VectorizedN (N>2)zwelford_vec_reduce_all(r   z_vec_reduce_all()rg   rd   r`   r\  z.all_zero()r_   z.all_masked()z	{ return r  r  z; }rn  zat::vec::vec_reduce_all<r   r   z([](z& x, z& y) r  )3VECTORIZABLE_RTYPESrd  r  r   rG   r   r  r  r#  rH   r   reduction_acc_type_vecr  r  r  r  r  r  r  r   reduction_init_vecr  rU  rV  rW  rX  rY  r  r  r  r   r/  r   rZ  r  rt  r  r   reduction_combine_vecrc  rS  r   r  r   r   rl   r*   ru  r  ro   r   )'r:  r   r   r   r   r$  r  r&  r%  vec_nsvecr   acc_type_vecr   acc_vec
masked_accmasked_acc_vecuse_acc_helperr  r   masked_helper_valhelper_vec_rangemasked_helper_vec_ranger'  acc_vec_helper_val_r   r   r  r   r   masked_next_valuereduce_all_bodyr~  vec_reduce_all_func
result_vectmpvarmasked_tmpvarr  s'                                          rX   r(  zCppVecKernel.reduction  s'
   & !44444)-AA#$2FF"2=YY
%0077%770| 	*NN5))E!>58D.>>>%5mDD<<l5&9<<<%njAA22>:NN ))J4]44E * 
 
 #~.....,,,$s__
,7,,  Xw$GG   (//&&X~z> 	
 	
 	

 	(//&&' 	
 	
 	
 11.%OO ]	,33**" "+    '333&-dk$*>*@*@A N !111!4==L">}">">e >  

 "4==L">}">">e >  
 !7* 6 6 DK8$:LMM& $*>>> ^T[-IJJt{4?;T=OPPQ Q ( ]1%%   ;t/$2DD&$*>>> ^T[-IJJJ' ]1%% $ &&$:j$:$:!$$"%"# %      Z1A5     !'   *.D~~WH/3~M++:K&&%%ooD$>$>~xY^`k$l$looo    %%ooD$>$>~xY^`k$l$looo    '333M$"67E4/!3S5G5GHH B BA.q1AA#(<&	 F K!!__9t9.'\\U[\\___   	,,!%!;"5 	- 	
 	
 	
 	,,!2, 	- 	
 	
 	
  		00%)%?"&"9 1    5:% 9	#N33 %n,,U33 8    J   BwAAA
$On$O$O$O!%//[[0FWXX[[[    " n .JJJJJ

 n! &  
 ":W!9!9!9JJ)U2222$+!:!:!:JJ  00cJJK  
  5:-+2=EKK	G\)-DGGG&iAX&&\`\q\qr{\|\|&&&# '\
! A)U2222$+!@!@!@!@J 3mmmm3mm_mm`jmmm
!++ee,^S*Xabbbeee   FFF#N33 
 2& 2 2%//]]"3NFM"Z"Z]]]      %.... 2& 2 2%//==&==]===   #>6::<B*=9rW   c                 l   |                      |          }| j                                      }t          j                                      }|j        r|t          j        k    rt          j	        }n|}t          j
                            |          }t          j
                            |          }t                      }	| j        | j        k    r:|	                    | dt!          |           dt"          |          d| d           n||k    rt"          |                             dd           d| }
|t          j        k    r&| d|                     t          j                   d}nM||cxk    rd	k    rn nd
t"          |          d| d}n(d
t"          |          d| dt"          |          d| d| d}|	                    d|
 d| d           |
}|	                    |                     ||||                     | j                            |	                    fd                     d S )Nr  z] = static_cast<r  r   z::r   z.template cast<bool,r   r5   at::vec::convert<r   r}  r   r   r   c                 $    t          |           S r  r  r  s    rX   r  z.CppVecKernel.store_reduction.<locals>.<lambda>  s    T18M8M rW   )rJ  r  r  r4   r  r  is_floating_pointr   rk   ro   rh  ru  r=   rd  r  r   rE   rH   r  rl   r  r  r  r  )r:  r   r   r   r   	out_dtyper   out_num_vectorssrc_num_vectorsr   converted_valueconverts    `          rX   r*  zCppVecKernel.store_reduction  s   $$U++it$$G%%d++	& 	9+D+DKEEE(33I>>(33E::?d222NNaaU++aa\)=TaaX]aaa   
 E!!#I.66tSAAKKEKK   
**!&bbD<Q<QRWR\<]<]bbbGG&/>>>>Q>>>>>SY0GSS5SSS  
cY0G c c.c c1=e1Dc cGVc cZ_c c c   EEE7EEEFFF'KK,,UC	JJKKK$$TXX.M.M.M.M%N%NOOOOOrW   
scalar_varc                    |j         rJ |j        t          j        k    r>| j                            | j        |                                  d|j         d          }nL|j        J | j                            | j        | 	                    |j                   d|j         d          }t          |t                    sJ |j        |_        |j        |_        d|_         |S )NrU  r   r  T)r   r   r   rl   r  r  r  r  r   rV  r   rG   dependent_itervars)r:  r  r  s      rX   r  zCppVecKernel.broadcast  s    $$$$uz))h''!4!4!6!6QQzQQQ GG #///h''%%j&677LL*/LLL G '>22222"(%/%B"rW   ri  c           	          |j         rJ |j        J | j                            | j        |                     |j                   d| d| d          }t          |t                    sJ |j        |_        d|_         |S )Nz	::arange(r   r   T)r   r   r  r  r  rV  r   rG   )r:  r   ri  r  s       rX   rf  zCppVecKernel.arange  s    <{&&&""L!!%+..KKKK&KKK
 
 &.11111{rW   c                    t           |         }|                     |          }t          |          rd| dS |dv rht          |         }|                     ||          }|dk    rt          |          rd| dnd| d}nt          |          rd| dnd| d	}| d
| dS |dk    r|                                  dS t          ||          }| d
| d}	|t          j	        k    r |dv sJ |                                  d| dS |	S )Nr   r   rz   ra   r}   r|   r   r{   r~   r  r   rg   z	::from(0))r_   r`   rd   rU  )
r<   rV  r*   rH   r  r   r  r   r   rl   )
r:  r   r   r   vec_typer   r   rE  scalar_initvec_inits
             rX   r  zCppVecKernel.reduction_init_vec  s   07%%k22// 	,+h++++111!+.F22>5IIH)) &e,,A@6@@@@@@@@  &e,,AAFAAAA@@@@ 
 ''''''U""))++6666$^U;;/////EJ!%:::::))++BBKBBBBrW   c                    t           |         }|                     |          }t          |          rd| dS |dv r||                     |          }|                     t          j                  }|t          j        k    r!dt          t          j                  d| d| dS dt          |          d| d| dS |t          j        k    r|dv sJ | 	                                 S |S )Nr   r   rz   zIndexValueVec<r   )r_   r`   rg   rd   )
r<   rV  r*   ru  r   r  rl   rH   ro   r  )r:  r   r   r   r   r  r  s          rX   r  z#CppVecKernel.reduction_acc_type_vec  s   07%%k22// 	*)h))))111))+66E))%+66E
""VU[(AVVUVVeVVVVRL$=RRRR%RRRREJ!%AAAAA))++--rW   r  r   c                 
   |t           j        k    }|dk    r7| j        rd| d| dt          | j                   dS |r| d| nd| d| dS |dk    r7| j        rd| d| dt          | j                   dS |r| d	| nd
| d| dS |dk    re|r.| j        rd| dt          | j                   d| dS d| d| dS | j        rd| d| dt          | j                   dS |rdnd}	| d|	 d| S |dk    r,| j        rd| d| dt          | j                   dS | d| S |dk    r,| j        rd| d| dt          | j                   dS | d| S |dk    rd|r4| j        r!d| d| dt          | j                   d| d	S d| d| d| dS | j        rd| d| dt          | j                   dS d| d| dS |dk    rjt	          |t
                    r|\  }
}}nt          ||          \  }
}}| j        r$d| d|
 d| d| dt          | j                   dS d| d|
 d| d| d	S |dv r|J t          |         }|t           j        k    rt          t           j                 }| 	                    |          }| 	                    t           j
                  }d}d}|-|J dt          |                                           }d| }| j        r-| d | d| d| | d!| d| | dt          | j                   dS | d | d| d| | d!| d| | dS |d"k    rUt	          |t                    r9|j        t           j        k    sJ t          t           j        j        |f          \  }| d| S t&          )#Nr`   zmax_masked_reduce(r   r   rm  r<  r_   zmin_masked_reduce(rg  r5  rd   r   r   zsum_masked_reduce(r   r[   r   re   zprod_masked_reduce(r   rf   zxor_sum_masked_reduce(r   rh   r   ri   r   z}, r   rz   rS   z_combine_vec<r  rg   )r   rl   rt  rE   r   r   r   rH   ro   ru  r  rp   r  rG   r   rO   r4   rh  r  r  )r:  r   r   r   r   r   r  r   r   r   r   r   r   r   r  r  t_extra	arg_extras                     rX   r  z"CppVecKernel.reduction_combine_vec  s    uz)U""~ _C__:__T^A\A\____ Bs++z+++ASAAJAAA
 u$$~ _C__:__T^A\A\____ Bs++z+++ASAAJAAA
 u$$ 
?> Om*mmDN@[@[mm`jmmmmN*NNNNNN> ?ccczcc[QUQ_E`E`cccc)0"9##cK!>>K>>*>>>v%%~ /`S``J``+dnB]B]````..*...y((~ /ccczcc[QUQ_E`E`cccc..*.../// 	C> RpcppZpp;t~C^C^ppcmppppQcQQZQQJQQQQ> CacaaZaa;t~C^C^aaaaBcBBZBBBB000*e,, Q#- b&& $5^Z#P#P b&~ Mk#kk4kk2kkkk[Y]YgMhMhkkkkL#LL4LL2LLLLLL333(((!),FEJ&&%ek2)))44E))%+66EGI +777Bs#788>>@@BB(LL	~ |% W WF W We W Wu Wg W WW W)W+4W W8CDN8S8SW W W
 ){{v{{{{%{QX{{\_{{cm{ox{{{{u$$*n55 V!'5:5555 4QX5E
} U U**j***%%rW   c           	      n   t          |t                    sJ |j        J |j        sFt          |t                    r|j        rd| d}t	                                          ||||          S |}|}|r |                     |j                   d| d}|r |                     |j                   d| d}|r|rd| d| d| d| d	}| d| d| }n#|r| d| }| d| }n|sJ | d| }| d| }|                     |j                   d| d}|r0|j        s |                     |j                   d| d}d| d| d}| j        rP|                     |j                   d|                     |j                   d	| d
t          | j                   d}d| d}| j
         d| d| dS )Nr  z).all_masked()r   r  z) & (r   z) | ~(z::set(z::from(1), (r   z, "index out of bounds: z"))r   rG   r   r   r8  r  rV  r  rt  rE   rq  )
r:  r   r  r  rQ  lower_scalarupper_scalarr  
cond_printr=  s
            rX   r  zCppVecKernel.indirect_assertq  s   #~.....y$$$z 	D$// 0DK 0/4///77**3udCCC 	@))#)44??u???E 	@))#)44??u???E 		3U 		3<u<<#<<C<<E<<<D(DDcDDlDDJJ 	3&&&&D(33c33JJLL5%%e%%D22L22J%%ci00::4::: 	+; C--ci88BB4BBB*t**4***D> 	&&sy11 > >9L9LSY9W9W > >> >*4>::> > >  (4'''&UUUUzUUUUrW   c                 ^   t          |t                    sJ |j        s#t                                          |||          S t
          |         }|                     |          }t
          |         }|                     |          }d| d}|t          j        k    r2|t          j        k    r"| 	                    |           d| d| d| d}n^|t          j        k    r|t          j        k    r| d| d| d}n2||k    r,||cxk    rdk    rn n
d	| d| d}nd	| d| d| d| d| d}|S )
Nr  r   z::from<r}  r  z.to<r   r5   r  )
r   rG   r   r8  r  rH   ru  r   rl   r  )
r:  r~  r   r   src_cpp_typer  dst_cpp_typedst_num_vectorsr  r=  s
            rX   r  zCppVecKernel.get_to_dtype_expr  s   #~.....z 	D77,,S%CCC#I.//	::#E*//663zzz
""u
':':)))44ee\eeOee_beeeDD%*$$%*)<)<BB|BBoBBBDD%/6666Q66666A<AA3AAAt<tt/ttLtt[jttnqtttrW   r  )NNFr  ))r  r  r  r  r[  r9  r   r  r  re  r   r   rn   ru  ro   r.  rp   rV  r  rG   rA  r   r  r=   r	   rl   rg  r  r  r  r(  r*  r  rf  r  r  r   r  r  r  rm  rn  s   @rX   r  r  J
  s
       I C C C C C C"45: 4 4 4 4 4ek c    R%+ R% R R R R
P5; P3 P P P P 38+ H HEK H# H H H HN> N%+ N# N N N N /3# ## z# {	#
 N+# # # #T ,0<@ L Lc]L zL {	L
 (L eC$789L L 
.	!L L L L\ UZ      4 !* *S.()* * z	*
 {* * * * *X< < < <Bu u un%P %P %PNN ~    $
N 
EL 
^ 
 
 
 
     D  * (,/3+0=c& c& %c& 'tnc& EK(c& c& c& c&J#V #V #V #V #V #VJ        rW   r  c                        e Zd ZdZeZ	 	 d fd	Zd Zd Z	 ddZ	de
dej        f fd	Zd fd
	Zd Z fdZdej        dej        fdZ xZS )r  an  
    A vector kernel that handles the 2d tiles with the tile size defined in `tiling_factor` on
    the inner-most loop level and one of the outer loop level (`outer_tiling_idx`). When the data
    tile is accessed in a contiguous way from the outer loop axis, a transposition is applied on the
    tile to make the access contiguous from the inner-most loop axis. Then, the same vectorization
    logic from its parent `CppVecKernel` is leveraged for load/store/compute. The transposed tile load
    and store are generated into kernel.preloads and kernel.poststores buffers.

    The loop structure looks like below:
    for ...
      for i_outer ...
        for ...
          for inner_most ...
            // generated by CppTile2DKernel
            float tmp0[16*16]; at::vec::transpose_mxn<...>(tmp0, in_ptr0 + ..., ...); // into kernel.preloads
            float tmp1[16*16]; // into kernel.preloads
            for i_inner ... { // the kernel inner loop
              vectorized loads/compute/stores (e.g., load tmp0, store tmp1) // into kernel.loads/compute/stores
            }
            at::vec::transpose_mxn(out_ptr0 + ..., tmp1, ...) // into kernel.poststores
          for inner_most ... (tail)
            // generated by CppVecKernel
            ...
      for i_outer ... (tail)
        for ...
          for ...
            // generated by CppKernel
            ...
    Nc                     t                                          ||||d         |           || _        || _        || _        |r|n|| _        |r|n|| _        d| _        d S )Nr5   T)r8  r9  tiling_indicesinner_tail_sizeouter_tail_sizeinner_num_elemsouter_num_elemsinner_is_tiling_idx)r:  r  r  r/  r  r  r  r=  s          rX   r9  zCppTile2DKernel.__init__  s     	1	
 	
 	
 -..2AT}2AT}#'   rW   c                 F    t          | j        | j                  d          S )NrP  )r-   rc  	outer_idxr?  s    rX   inner_itervarzCppTile2DKernel.inner_itervar  s"    !T]4>%B"J"J"JKKKrW   c                 B   | j         | j                 }| j         | j                 }t          ||| j                  }t          ||| j                  }| j        d u oF|dk    o@|                    |          o+|                    |           o|                    |           S r   )rc  r  rd  r  r/  r  r   )r:  r   	outer_var	inner_varouter_strideinner_strides         rX   need_vec_transposez"CppTile2DKernel.need_vec_transpose  s    M$.1	M$/2	*5)T=OPP*5)T=OPPOt# 0!0		)$$0 !$$Y///0 !$$Y///	
rW   c                 T   t           j                            |          }| j        }| dt	          |           }d}	t	          t          || j        | j                 | j                             }
t	          | j                   }|r|	|}	}||
}}
d}| j	        |z  r| j
        | j        }}n| j        | j
        }}|r|dk    rdnd}t          |t          j                  r|j        r!t          |t          j                  rH|j        sAdt           |          d| d	| d
|
 d
|	 d
| d
t	          |           d
t	          |           d}n@dt           |          dt	          |           dt	          |           d| d	| d
|
 d
|	 d
| d}|r| j                                        }nY| j                            |          s#| j                            | j        |d          }nd}| j                            |          }|rAt           |         }d| d| d}| d| d| d| d| d
}| j                            |           |                    dt3          |                    }|r)| j                            t7          ||                     n| j                            |           |S )Nr   __place_holder__Tr  truefalseztranspose_mxn<r}  r  r   r   Fr  zalignas(std::max(std::size_t(z), alignof(z)))r   r  r\   r   )r4   r  r  r/  rE   r  rc  rd  r  r  r  r  r   r   r  r  rH   r  r  containsr  r  getr   r  rp   r  r;   )r:  r   r   r   is_store
store_moder   factorr~  r  ld_srcld_dstneed_defineMNr  load_or_storetile_var	cpp_dtypealignasdefine_lines                        rX   gen_transposed_tile_load_storez.CppTile2DKernel.gen_transposed_tile_load_store  s    !!$''#--U++--  3E4=;Y[_[m n nooq//1 	,CC#VFF#h. 	')=qAA $$ A !)WjL.H.HVVPW
q%*%% 	ak 	q%*%%	./k	[e!4 [ [z [ [[ [![ [%([ [,2[ [6A!nn[ [HSTU[ [ [ M7e!4 7 7{1~~ 7 7TU 7 7Yc 7 77 7!7 7%(7 7,27 7 7   	3x((HH""=11 	3x((U(SSHHKx||M22H 	1$U+I XfWWWWWG$QQyQQ8QQfQQvQQQKM##K000%--.@#h--PP 	3O%%l4&G&GHHHHM##M222rW   r   r   c                    | j                             |          }|                     |          }|                                 }|                     |          r|                     |||d          }| dt          || j        z             }t          j	        
                    |          }|                     |d|          }| j                            | j        ||          }	|	                    d| ||fi            t!          |	t"                    sJ d|	_        |	S |                     |          }
t)                                          ||
          S )NF)r&  r   r   rm  r  T)r  r  rJ  r  r  r3  rE   r  r4   r  r  r  r  r  r  r  r   rG   r   r  r8  r  )r:  r   r   r   rg  r/  r  r   r   r  r   r=  s              rX   r  zCppTile2DKernel.load/  sM   iood##$$U++""$$""5)) 	1::c55 ;  H "KKk%$.2H&I&IKKGG%%d++E**7Au==DX&&tz4u&EEF!!&4u*=rBBBfn55555 FMM//66I77<<i000rW   c                 D   d|v sJ t          |t                    s
J |            |j        s|                     |          }| j                            |          }|                                 }|                     |          }|                     |          r| 	                    |||d|          }| dt          || j        z             }| j        s?t          j                            |          t           t"          j        t"          j        gz   v r| d| dt          | j                   d}	n| d| d}	| j                            t-          ||	                     d S |                     |          }
t1                                          ||
||           d S )Nr  T)r&  r'  r   r  r   r   )r   rG   r   r  r  r  r  rJ  r  r3  rE   r  rt  r4   r  r  r   r   rM  rN  r  r   r;   r  r8  r  )r:  r   r   r   r  r   rg  r/  storebufr   r   r=  s              rX   r  zCppTile2DKernel.storeE  s   }}}}%0077%770| 	*NN5))Eit$$""$$$$U++""5)) 	8::c54D ;  H #LL{54>3I'J'JLLH~ 5!2!24!8!8M
M = " "  SSSSK4O4OSSS44444K!!,tT":":;;;;;//66IGGMM$	5$77777rW   c                    |                                  }| j        r6|                    d| d| dt          | j                   d| d	           d S |                    d| d| dt          | j                   d| d	           d S )Nr  r  r   r   r   )r  r  r   rE   r  r  )r:  r   rg  s      rX   r  z#CppTile2DKernel.codegen_inner_loopsb  s    ""$$# 	NNcUcc%ccK@T4U4UccY^ccc     NNcUcc%ccK@T4U4UccY^ccc    rW   c                 r   t                                          ||          }| j        d         | j        k     r| j        nt	          | j                  \  | _        | _        | j        | j        d         k    r | j        | _        | j	        | _
        d| _        n| j        | _        | j        | _
        d| _        |S )Nr5   r   FT)r8  r5  r  r  reversedr  rd  r  rt  r  r  r  r  r  )r:  groupreduction_groupr  r=  s       rX   r5  zCppTile2DKernel.set_rangesm  s    ww!!%99 "1%(<<< $-.. 	(
 ?d1!444!1DN!1DN',D$$!1DN!1DN'+D$rW   r   c                 `    |                      || j        |                                           S )Nr  )r  r  r  r  s     rX   r  z"CppTile2DKernel.transform_indexing  s6    ++%%'' , 
 
 	
rW   r  r  )r  r  r  r  r  r[  r9  r  r  r3  rp   r   r  r  r  r  r5  r  rm  rn  s   @rX   r  r    s+        < #I ( ( ( ( ( (.L L L
 
 
 6:: : : :x1 1UZ 1 1 1 1 1 1,8 8 8 8 8 8:	 	 	    $

 
uz 
 
 
 
 
 
 
 
rW   r  _bodyc                    | j         gt          | j                                                  z   }d}d}|D ]}|j        j        D ]}|j        dk    s	|j        dv r|j        dvrd}t          |d          ru|j	        rnt          j        |j	        v sJ |j	        t          j                 }|j        r|j        t          vrd}}| ||j        k    rt          j        d           |j        }d}||fS )	z
    Returns the low precision data type (torch.float16/torch.bfloat16) contained in the nodes
    and if all the nodes can codegen with this data type without converting to float.
    Otherwise returns None and True.
    NFplaceholder)	get_indexrL  )r  r  r  r  r  Trx  z.bf16 and fp16 are mixed in the scheduler node.)
root_blockr/  	subblocksra  r  nodesoptargetr   rx  rA   rw  r   r   warningswarn)r=  
sub_blocks_lowp_fp_type	_use_fp32	sub_blockr<  rt  s          rX   get_loop_body_lowp_fprL    s>    "#d5?+A+A+C+C&D&DDJ+/MI ! !	_* 	! 	!Ex=((EL = - -  | $   !	uf%% !%* !*.%*<<<</4z:M:Q/R} 2](J(J $II".$55 &VWWW$+MMM 		9	!< )##rW   c                   \     e Zd ZdZ fdZdeee         ee         f         fdZd Z	 xZ
S )TilingSelectz
    Implement the heuristic to select the tiling factors and tiling indices.
    In the future, we can implement advanced heuristic in a subclass.
    c                 H    t                                                       d S r  )r8  r9  r:  r=  s    rX   r9  zTilingSelect.__init__  s    rW   r   c           	        " t          |          }t          |          }|sJ t          d |D                       rg g fS t          j        }t          |d                   d         ""r%t          "fd|dd          D                       r"}t          j                    	                    |          }| 
                    |||          }|rt          |d           \  }}	t          |          t          |	          z   }
t          j        j        rFd }d	 }d
 }d t!          t#          |
                    D             }t#          |          }|d |         ||d          }}i }i }|D ];}|j        gt'          |j                                                  z   }|D ]}|j        j        D ]}|j        dv r|j        dk    rdnd}|j                            ||f          |j        |         j        d                  } |||          r1 |||||          }|j        dk    r|n|dvr ||j        |           t9          |j        t:                    rL|j                            d          s2|j        dv s)|j        |vrd||j        <   ||j        xx         dz  cc<   =t?          |                                          }t?          |                                          }d}d}||k    s|dk    r||z  |k    rg g fS |	sP|rNt#          |          dk    r;tA          ||d                  g          s||d                  |dz  k     r
|dk     rg g fS |tB          v rt          j                    	                    |          }|D ]} | dk     r| t#          |
          z   } | dk     s| t#          |
          k    r4tA          |
          r_tD          j        j#        $                    |
|          d          }!|!|k     r,tD          j        j#        %                    |!|           |dz  } n|
|          |k     r|dz  } nt#          |          dk    r|g|fS t#          |          dk    r||g|fS g g fS )Nc              3   (   K   | ]}|t           vV  d S r  )rw   r'  r   s     rX   r)  z-TilingSelect.select_tiling.<locals>.<genexpr>  s(      HHEu//HHHHHHrW   r   c              3   J   K   | ]}t          |          d          k    V  dS )r   N)rL  )r'  	loop_body_lowp_fp_dtypes     rX   r)  z-TilingSelect.select_tiling.<locals>.<genexpr>  sI       "
 "
 #9--a0NB"
 "
 "
 "
 "
 "
rW   r5   rm  c                 ,    t          | d                   S r   r   sizess    rX   r  z,TilingSelect.select_tiling.<locals>.<lambda>  s    #eAh-- rW   rw  c                 V    ||d                  }t          | ||          }|j        r|nd S Nr   )r  r  )r   rc  r/  r  r  ri  s         rX   _try_get_stridez3TilingSelect.select_tiling.<locals>._try_get_stride  s6     '~a'89G0OOF%+%5?664?rW   c                 <    | |vrd|| <   d S || xx         dz  cc<   d S r   rV   )	node_namenon_contig_indexing_op_counters     rX   _update_negative_op_countz=TilingSelect.select_tiling.<locals>._update_negative_op_count  sC     !(FFFDE6yAAA6yAAAQFAAAAArW   c                     t          |          dk    oOt          |           dk    o<|d         dk    r|d         n|d         t          |           z   t          |           k     S Nr5   r   rX  )rc  r  s     rX   _is_valid_indicesz5TilingSelect.select_tiling.<locals>._is_valid_indices  sq    
 N++q0 (MMA-(  .a0A55 +1--!/!2S]]!Bh--(	rW   c                 B    g | ]}t          t          j        |          S rV   r/  r1  s     rX   rd  z.TilingSelect.select_tiling.<locals>.<listcomp>  s4        34;BB  rW   )rL  r  r  rL  r   r   r5   masked_subblock)r2   r  rF  r@  gQ?#   r  
   r8  )&rC   rB   rg   r   ro   rL  r.  r   r  	nelements_select_tiling_indicesr`   r   r   r  enable_tiling_heuristicsrS  r   rA  r/  rB  ra  r  rC  rE  rR  indexing_from_argsr  r   rp   
startswithrd   r(   r   r4   r  r  r  )#r:  fn_listvar_sizes_listloop_bodies
all_dtypesr   r/  r  r:  r;  r  r^  rb  re  rc  r  r  reduction_vars
op_counterra  r=  rH  rK  r<  arg_idxr   ri  op_numnon_contig_indexing_op_numratio_thresholdquantity_thresholdfactor_lowptiling_indice
call_rangerV  s#                                     @rX   select_tilingzTilingSelect.select_tiling  s    %W--/<<
zHHZHHHHH 	r6M.{1~>>qA 	#c "
 "
 "
 "
(_"
 "
 "
 
 
 	# #E#022<<5<II44^]
 
  P	F%($?$?& & &"E?  ,,)?)??Kz2 p"@ @ @G G G   "3{#3#344   #&e**-o-._--. % .0
 BD.( B BE"'"2!3d5?;Q;Q;S;S6T6T!TJ%/ B B	%._%: B BE$|/NNN/4||/K/K!!QR(1(I(I%)>$:)" )""'*W"5":1"=)? $5#4X~#N#N !*-<_(-x.& .&F
 ,1<<+G+G )/-36-A-A(A(A,1L:X)* )* )*  *%,<< B % 7 78I J JB#(<#M$N $N $)<z#A#A?@Ju|$<$<$.u|$<$<$<$A$<$<$<7BB< Z..0011-0299;;. .* #'%'"-1CCCQJJ2V;NN
 r6M ("" N++q00,!."34  1 nQ/0=13DDD r6M%% *688BBBOO%3  M$q(((5K8H8H(H$q((MS=M=M,M,M '44 
%&W%5%?%?'6 &@ & &
 &33G,55j+NNN,71,<M!E 4 %]3kAA(3q(8 B >""a''%66>""a''%}5~EE2vrW   c           	      :   g }t          ||          D ]C\  }}t          j        |g|R  }|d t          j        |j        |j                  D             z  }Dt          t                               }g }	t          t                               }
t          t                               }|D ]$}|j	        D ]}t          j        d|j                  st          |||          }|dk    r6|dk    r_|                    t          |j        dd                               |	                    t          |j        dd                               t!          d |j	        D                       r0|
                    t          |j        dd                               |                    t          |j        dd                               &||
z
  |z
  }t#          |d           \  }}t%          |          t%          |          z   }t%          |          dk    r|dz
  gS |rt'          |          dd          S ||
z  |z
  }t'          |          }t%          |          d	k    r|d         |v r|d         |dz
  k    r|S t'          ||	j                  dd          S )
Nc                     g | ]	}|j         
S rV   r"  )r'  deps     rX   rd  z7TilingSelect._select_tiling_indices.<locals>.<listcomp>p  s    TTT#)TTTrW   z^d\d+$r   r5   c              3   J   K   | ]}t          |t          j                  V  d S r  )r   r   SIZEr'  r  s     rX   r)  z6TilingSelect._select_tiling_indices.<locals>.<genexpr>  s.      SS!4955SSSSSSrW   c                 ,    t          | d                   S r   rX  rY  s    rX   r  z5TilingSelect._select_tiling_indices.<locals>.<lambda>      s5QR8}} rW   r[  r  r   )r  r
   extract_read_writes	itertoolschainreadswritesr   rn   r  r   r   r   r  r  r  r.  r`   r   sortedcount)r:  rp  rq  r/  	all_indexfn	var_sizesrwcontig_varscontig_vars_listnon_contig_stride_constnon_contig_stride_otherr   r   ri  contig_onlyr:  r;  num_itervarscontig_and_const_stridecontig_vars_sorteds                        rX   rl  z#TilingSelect._select_tiling_indicesg  s    	 .99 	U 	UMB	1"AyAAABTTyrx/S/STTTTII o''",S/"3"3",S/"3"3 	C 	CE) C CyCH55 ,UCGGQ;;q[[OOC$5$5666$++C,=,=>>>>SSv?RSSSSS C+//CHQRRL0A0ABBBB+//CHQRRL0A0ABBBBC "$;;>UU!$^9T9T!U!U!U5zzC$8$88{q   1$%% 	,+&&rss++11##$ $K00"##q(("2&*AAA"2&,*:::%%(.>.DEEEbccJJrW   )r  r  r  r  r9  r   r/  rn   r~  rl  rm  rn  s   @rX   rN  rN    s         
    i 
tCy$s)#	$	i i i iV.K .K .K .K .K .K .KrW   rN  c                        e Zd ZU eZee         ed<   eZee         ed<   e	Z
ee	         ed<    fdZd ZdefdZdefd	Zd
 Zd Zd Zdee         fdZd Zd Zddee         fdZdeded         fdZ xZS )rC  
kernel_clsvec_kernel_clstile2d_kernel_clsc                     t                                          |j        |j        j                   || _        d | _        d | _        t          j	                    | _
        g | _        d S r  )r8  r9  r  wsr  re  rT  r  r   r  picked_vec_isakernelsr:  re  r=  s     rX   r9  zCppKernelProxy.__init__  sX    *LO,GHHH(2=2J2L2L(*rW   c                 f    |D ]-}t          |t                    sJ t          j        |           .d S r  )r   r$   r:   propagate_scheduler_node)r:  rC  r<  s      rX   data_type_propagationz$CppKernelProxy.data_type_propagation  sI     	@ 	@Ee]333338????	@ 	@rW   scheduler_nodec                     t          |j        t                    sdS t          j        |           t          |j                  d         d uot          |j                  d          S )NTr   r5   )r   r=  r   r:   r  rL  )r:  r  s     rX   is_lowp_fp_schedulerz#CppKernelProxy.is_lowp_fp_scheduler  sf    ..99 	44^DDD!."677:$F C).*>??BB	
rW   rU  c                     dt           j        j        fd}|j        gt	          |j                                                  z   }|D ]} ||j                   d S )N	sub_graphc                 
   dt           j        j        dt          t           j                 fddt           j        j        dt          t           j                 fddt           j        j        dt           j        ffddt           j        j        dt           j        ffddt           j        j        dt           j        ffd}t          | j                  }g |D ]N}|j        d	v r |          xt          v rt          fd
|j
        D                       rB|j        d         }|                     |          5  |                     d||t           j        f          |                    fd           t           xj        dz  c_        d d d            n# 1 swxY w Y   |j        dk    r |          xt          v r|j        \  }}}}} ||          r	t$          j                            |          |                     |          5  |                     d||f          |                    |           t           xj        dz  c_        d d d            n# 1 swxY w Y   |j        dk    ru|j        \  }}}	}
|t          v r]t           j        t           j        t           j        t           j        fv sJ |t          v rt           j        nt           j        |	|
f|_        |j        dk    rX|j        d         t          v rD|j        \  }}
t          fd|j
        D                       ri||
t           j        f|_        |j        dk    rm|j        d         t          v rY|j        \  }}t          fd|j
        D                       r̉                    |           ||t           j        f|_        |j        dk    rK|j        \  }}}|t          v r~ |||          sr|                     |          5  |                     d|||f          |                    |           t           xj        dz  c_        d d d            n# 1 swxY w Y   t          v rt          fd|j
        D                       s|j        d         }|                     |          5  |                     d||t           j        f          |                    fd           t           xj        dz  c_        d d d            n# 1 swxY w Y   NPdt           j        j        ffd} ||            d S )Nr(  r   c                 6   | j         dk    r*t          j                            | j        d                   S | j         dk    r| j        d         S | j         dk    r@t          | j                  dk    r| j        d         S | j                            dd          S dS )	z6Get input dtype for nodes that may consumes lowp fp dtr  r5   r  r  r  r   r   N)rE  r4   r  r  r  r   r  r%  r(  s    rX   get_input_dtypez]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_input_dtype  s    ;'))7,,TYq\:::[$6669R=([J..49~~))#y|+#{{DAAA4rW   c                     | j         dk    rDt          | j                  dk    sJ t          j                            | j        d                   S | j         dv r| j        d         S | j         dk    r| j        d         S dS )	z6Get output dtype for nodes that may produce lowp fp dtr  r   r5   )r  rF  rL  r  r  r   N)rE  r   r  r4   r  r  r  s    rX   get_output_dtypez^CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_output_dtype  s}    ;&((ty>>Q....7,,TYq\:::[$JJJ9R=([$6669Q<'4rW   r  c                 8    |t           v sJ  |           |k    S )z]Check if the given node produces output with expected low precision floating point data type.)r   )r(  r  r  s     rX   is_lowp_fp_sourcez_CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source  s*    ]****''--33rW   c                 ^    |t           v sJ  |           x}r||k    S | j        dk    rdS dS )zZCheck if the given node accept input with expected low precision floating point data type.r  TF)r   rE  )r(  r  input_dtyper  s      rX   is_lowp_fp_sinkz]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_sink  sM    ]****"1/$"7"77; !&",,[J..4 5rW   c                 `     |           o t          fd| j        D                       S )zCheck if the node is a lowp fp sources which are all directly fed to ops that accepts lowp fp input
                thus no need to promote to float
                c              3   0   K   | ]} |          V  d S r  rV   r'  userr  r  s     rX   r)  z}CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote.<locals>.<genexpr>  s@       ; ;26OOD"--; ; ; ; ; ;rW   r.  users)r(  r  r  r  s    `rX   is_lowp_fp_source_no_promotezjCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote  sY     )(r22 s ; ; ; ; ;:>*; ; ; 8 8 rW   )r  rL  c              3   0   K   | ]} |          V  d S r  rV   r  s     rX   r)  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>  /      MM??444MMMMMMrW   r   r  r  c                     | uS r  rV   r2  to_type_nodes    rX   r  zVCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<lambda>  s    A\4I rW   r5   r  r(  rF  r  c              3   0   K   | ]} |          V  d S r  rV   r  s     rX   r)  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>.  r  rW   c              3   0   K   | ]} |          V  d S r  rV   r  s     rX   r)  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>4  r  rW   r  c              3   0   K   | ]} |          V  d S r  rV   )r'  r  r   r  s     rX   r)  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>\  s/      UUe < <UUUUUUrW   c                     | uS r  rV   r  s    rX   r  zVCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<lambda>d  s    A\<Q rW   r  c                 L    dt           j        j        ffd} ||            d S )Nr  c                    dt           j        j        fdd | j        D             }fd|D             }|D ]}|                                D ]}\  }| j        v rot          fd|D                       sv rPt          d |D                       r7j        d         }                    |           |                                ~| j	        | 
                                 d S d S )Nto_nodec                 >    t          d | j        D                       S )Nc              3   ,   K   | ]}|j         d k    V  dS )r  NrE  r'  usrs     rX   r)  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to.<locals>.<genexpr>s  s)      "U"U3:#;"U"U"U"U"U"UrW   r  )r  s    rX   _used_by_tozCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_tor  s!    ""U"Uw}"U"U"UUUUrW   c                 (    g | ]}|j         d k    |S )r  r  r&  s     rX   rd  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<listcomp>u  s+     $ $ $!%DK:<U<U<U<U<UrW   c                 8    g | ]} |          ||j         iS rV   )r  )r'  r(  r  s     rX   rd  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<listcomp>x  s@     . . ./3{{SWGXGX.tz*. . .rW   c              3   P   K   | ] }|j         d          j         d          k    V  !dS r  Nr  )r'  r  r(  s     rX   r)  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>~  s4      #S#ScCHRLDIbM$A#S#S#S#S#S#SrW   c              3   >   K   | ]}|j         d          t          v V  dS r  )r  r   r  s     rX   r)  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>  s>       ,& ,&JM(E,& ,& ,& ,& ,& ,&rW   r  )r   fxNoderC  r  r.  all_input_nodesreplace_all_uses_with
erase_nodeowning_modulelint)	r  all_to_nodesall_to_nodes_and_users
node_usersr  val_noder  r(  to_lowp_fp_legalized_nodess	         @@rX   _eliminate_duplicate_to_nodezCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_nodek  sw   VUX] V V V V$ $)2$ $ $L. . . .7C. . .* '= ; ;
+5+;+;+=+= ; ;KD%#y66 ##S#S#S#SU#S#S#S S S  7 %),F$F$F(+ ,& ,&QV,& ,& ,& )& )& %G ,0+?+C $ : :8 D D D ) 4 4T : : :;* !.6!((((( 76rW   )r   r  Graph)r  r  r  s     rX   eliminate_to_dtypez`CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtypej  sF    ')EHN ') ') ') ') ') ')R -,Y77777rW   )r   r  r  r   r   r/  rC  rE  r   r.  r  r  inserting_aftercall_methodro   r  r   cpp_to_dtype_countr4   r  r  inserting_beforereplace_input_withrK  rL  r  r  r  )r  r  sub_graph_nodesr<  r2   r   r   	value_varr   r   r   r  r  r  r   r  r  r  r  r  r  s                @@@@@@@@rX   add_to_dtypezDCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype  s    ehm  8M        
 ux} 
 %+9N 
  
  
  
 4 45; 4 4 4 4 4 4
	!ehm 	! 	! 	! 	! 	! 	! 	!58= ek        #9?33O)+&( w wL$::://666=HH MMMMMMMMMM ! *Q-C"22599 8 8'0'<'<&c5%+-F (= ( ( 33(*I*I*I*I    22a7228 8 8 8 8 8 8 8 8 8 8 8 8 8 8 LG++.u555-GG16.Cq)Q33IrBB ! G--d33E"33E:: 8 8'0'<'<&c9e-D (= ( ( 00LIII22a7228 8 8 8 8 8 8 8 8 8 8 8 8 8 8 \[00 
!& M11  %!K!N!M!K	)          +0M+A+AEKKu!K*!&
 \Z//EJrNm4S4S',z$S%MMMMMMMMMM ! "%uek!:EJJ\Z//EJrNm4S4S#(:LS!RMMMMMMMMMM !  /55e<<<"%q%+!6EJJ\%7779>6S)UI !M11  <;IyQQ @!*!;!;E!B!B @ @/8/D/D$.c9i5P 0E 0" 0" !& 8 8L Q Q Q ' : :a ? : :@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ --  UUUUUUUUUU@ #(*Q-C!*!:!:5!A!A @ @/8/D/D$.c5%+5N 0E 0" 0" !& ; ;$02Q2Q2Q2Q!" !" !" !( : :a ? : :@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ *8ehn *8 *8 *8 *8 *8 *8X y)))))sL   AF&&F*	-F*	 AI22I6	9I6	AQ..Q2	5Q2	AT$$T(	+T(	)r   r  r  rA  r/  rB  ra  r  )r:  rU  r  rH  rK  s        rX   legalize_lowp_fp_dtype_loopbodyz.CppKernelProxy.legalize_lowp_fp_dtype_loopbody  s~    Z	*EHN Z	* Z	* Z	* Z	*x  *+d93F3M3M3O3O.P.PP
# 	* 	*IL))))	* 	*rW   c                 R    t           fd|D                       r|D ]}|j        j        gt          |j        j                                                  z   }|D ]_}|j        j        D ]P}|j        dv rE|j	        sJ t          j        |j	        v sJ |j	        t          j                 }|j        t          v sJ Q`d S |D ]e}t          |t                    sJ t          |j        t                     sJ |j        }|                                s                     |           fd S )Nc              3   l   K   | ].}t          |t                    o                    |          V  /d S r  )r   r$   r  )r'  r<  r:  s     rX   r)  z8CppKernelProxy.legalize_lowp_fp_dtype.<locals>.<genexpr>  sT       
 
 um,,Q1J1J51Q1Q
 
 
 
 
 
rW   )r  r  )r.  r=  rA  r/  rB  ra  r  rC  rE  rx  rA   rw  r   r   r   r$   r   is_memory_copyr  )r:  rC  r<  rH  rK  fx_nodert  rR  s   `       rX   legalize_lowp_fp_dtypez%CppKernelProxy.legalize_lowp_fp_dtype  s    
 
 
 

 
 
 
 
 	
  B B#k45K)00229 9 
 ", B BI#,?#8 B B">->>>#*<//<#6#:gl#J#J#J#J;B< 3 7<G $+=M#A#A#A#ABB F 	; 	;Ee]33333ek844444"[D&&(( ;44T:::	; 	;rW   c           	         !"# t                    t                    k    sJ | j        !t          d           \   "|                      "           !#fd} "fd# || j                  }t
          j        xj        |j        z  c_        t
          j        xj        |j        z  c_        t          
                    |          | _        | j        r| j        s:|g| _        |                     dd            | j                            |            d S t"          j        j                            d          5  t+                      }|                              \  }}t          |          t          |          k    sJ d}t/          t1                              }	t3          d |	D                       rd}d}
d }|r_d}|d	         }|d
z   }t          | j        j                  |k    r3| j        j        |         j        }| j        j        |         j        }|o| }
t          |          d
k    rt8          xj        d
z  c_        | j                            |d	         |d	                   } || j        |d	         |d	                   }|j         |j!        z
  }|j"        d	|j!        fi|_#        t&          j$        j%        r"|r  || j        |d	         |d	         |          }n|}|j"        g|_&        |j"        |j!        |j         fi|_#        ||g| _        |}nxt          |          dk    r\|d
         t          | j                  d
z
  k    r|d	         |d
         k    sJ t8          xj        dz  c_        | j                            |d	         |d	                   }d	|j!        f|j!        |j         fd}|j         |j!        z
  }| j                            |d
         |d	                   }d	|j!        f|j!        |j         fd}|j         |j!        z
  } || j'        |d	         |          }|j"        |d         |j"        |d         i|_#        g }t&          j$        j%        ro|rmdD ]i\  }}|dk    r|nd }|dk    r|nd } || j'        |d	         |||          }|j"        ||         |j"        ||         i|_#        |(                    |           jn || j        |d	         |d	                   }|j"        |d         |j"        |d         i|_#        |j"        g|_&        |(                    |           |j"        |d         |j"        d	|j         fi|_#        |j"        |j"        g|_&        |(                    |           |g|z   | _        |}n|g| _        |                     |
|           | j                            |            d d d            d S # 1 swxY w Y   d S )Nc                 ,    t          | d                   S r   rX  rY  s    rX   r  z2CppKernelProxy.codegen_functions.<locals>.<lambda>  r  rW   r[  c                      j         | g|R  5 }t          xj        dz  c_         |           |cd d d            S # 1 swxY w Y   d S r   )
new_kernelr   generated_kernel_count)r1  r  rh  re  runs      rX   codegen_kernelz8CppKernelProxy.codegen_functions.<locals>.codegen_kernel  s    ((4t444  ..!3..F                 s   "?AAc           	         |                                \  }}d}t          	          D ]\  }}|ft          t          j                            dffv r|rJ  |||           @d}|dfk    sJ d| d d             |                                 5   ||d           d d d            n# 1 swxY w Y   d S )NFrV   Tzunexpected group: rc  r   )r5  r  r   r  r  rz  )
rh  r  rt  	in_suffixr  r  rp  r:  r;  rq  s
         rX   r  z-CppKernelProxy.codegen_functions.<locals>.run  si   #)#4#4UO#L#L D.I!$Wn!=!= % %IO,9?5/BBCCRH!    )((=Bt^,,,, $I$)    VIUU5UUOUU  
  //11 % %4% % % % % % % % % % % % % % %% %s   B77B;	>B;	Finplace_buffersTc              3   (   K   | ]}|t           vV  d S r  )rx   rS  s     rX   r)  z3CppKernelProxy.codegen_functions.<locals>.<genexpr>  s(      SSu5 ::SSSSSSrW   r   r5   )r(  r   maintailr  )r  )r   r  )r   r   r   ))r   re  r`   r5  r  r4   r  removed_buffersinplaced_to_removerR  rk  rT  r  rc  r  aggregate_reduction_buffers
set_kernelr   	_inductorr   patchrN  r~  rB   rC   rg   rN  r  r   generated_cpp_vec_kernel_counttiler  rF  
tiled_sizer   r  r  enable_loop_tail_vecr  r  r  )$r:  rp  rq  r  scalar_kerneltiling_selecttiling_factorsr  could_masked_vecrs  _inner_loop_reduction_outer_not_outer_loopinner_loop_reductionouter_loop_levelinner_loop_levelouter_loop_reductionrN  
vec_kernelrt  tail_kernel
outer_loopr^  r  
inner_loopinner_rangesr  tile2d_kernelouter_rinner_r_inner_tail_size_outer_tail_sizerh  r:  re  r;  r  s$    ``                             @@@@rX   codegen_functionsz CppKernelProxy.codegen_functions  sZ   7||s>222222(!$^9T9T!U!U!U///	 	 	 	 	 		% 	% 	% 	% 	% 	% 	% 	%( 't77	=#@@	""m&FF""!66" 	$- 	)?DL,,UD999N%%d+++F _#))%)@@ C	, C	,(NNM-:-H-H. .*NN ~&&#n*=*=====#3N74K4KLLJSS
SSSSS )#( .3+K ',$#1!#4 #3a#7 t~+,,/???+/>+?(," ) ,0>+?(," ) -I5I1I 4 >""a''66!;66~**>!+<^TUEV*WW+^'):N1<M 
 !I7	,0Hq$/6J+K
(:2 	>7G 	>"0.+&q)&q)!	# #KK #0K48H:M0-1X7S,T) *K8"^$$))"1%T]););a)???&q)^A->>>>? 66!;66!^00"1%nQ.? 1  
 
 56'2JOD    #-/J4I"I!^00"1%nQ.? 1  
 
 56'2JOD    #-/J4I"I .*"1%"! ! NL$8NL$8/+ !:2 '67G '6- 3 3( 07&/@/@OOd ) 07&/@/@OOd ) "0 2*1-*,," " 'NL,A&NL,A0, $**62222-30 "0+^A->q@Q" "J #V(<"V(<0J, 2<0@J-&&z222"V(<"JO(<3M/ 5?NJN3SM0&&}555 -<( -,,/   N%%d+++GC	, C	, C	, C	, C	, C	, C	, C	, C	, C	, C	, C	, C	, C	, C	, C	, C	, C	,s   RWW"Wc                     |D ]+}|                      |           t          j        |           ,|                     ||           d S r  )r  r:   propagate_loopbodyr  )r:  rr  rq  rR  s       rX   codegen_loop_bodiesz"CppKernelProxy.codegen_loop_bodiesr  sU     	9 	9D00666248888{N;;;;;rW   rC  c                 p   |                      |           |                     |           t          |          dk    sJ d fd|D             }t          t          j        t                    r"t          j        j        rd fd|D             }d |D             }|                     ||           d S )Nr5   c                     |                                   |                                  t          t          j        t
                    r
 | j        | S |                     |          S r  )decide_inplace_updatemark_runr   r4   rh  r1   r=  codegen)r(  
index_varss     rX   r  z(CppKernelProxy.codegen_nodes.<locals>.fn~  sW    &&(((MMOOO!($566 0!tz:..||J///rW   c                 :    g | ]}t          j        |          S rV   )rU  partial)r'  r(  r  s     rX   rd  z0CppKernelProxy.codegen_nodes.<locals>.<listcomp>  s&    AAA49$R..AAArW   c                 R    t           j                            |           }| |_        |S r  )r4   r_  localize_functionoriginal_fn)r  
wrapped_fns     rX   wrap_fnz-CppKernelProxy.codegen_nodes.<locals>.wrap_fn  s-    3EE 
 *,
&!!rW   c                 &    g | ]} |          S rV   rV   )r'  r  r/  s     rX   rd  z0CppKernelProxy.codegen_nodes.<locals>.<listcomp>  s!    555rwwr{{555rW   c                 (    g | ]}|j         d          S )r5   )r:  r&  s     rX   rd  z0CppKernelProxy.codegen_nodes.<locals>.<listcomp>  s    :::D$*Q-:::rW   )	r  r  r   r   r4   r_  rK   r`  r  )r:  rC  rp  rq  r  r/  s       @@rX   codegen_nodeszCppKernelProxy.codegen_nodesx  s    ##E***""5)))5zzQ	0 	0 	0 BAAA5AAA q-/ABB	6&4	6
" " " 6555W555G::E:::w77777rW   c                 >    |                      | j        ||           d S r  )ri  rT  )r:  r   rS  s      rX   rl  zCppKernelProxy.codegen_loops  s"    kBBBBBrW   c                 B    | j         D ]}|                                 d S r  )r  r  r:  rh  s     rX   r  z4CppKernelProxy.update_stores_with_parallel_reduction  s2    l 	; 	;F88::::	; 	;rW   Nr   c                 H   |J d}| j         D ]}t          j                    5 }|                    ||          rPd}|                    |                                           |                    |                                           d d d            n# 1 swxY w Y   d S )N
C10_LIKELYC10_UNLIKELY)r  r   r   r  r   r   r  r  )r:  r   	if_prefixrh  r   s        rX   r  zCppKernelProxy.gen_body  s     	l 	3 	3F%'' 35,,T9== 3 .I''666KK 1 1222	3 3 3 3 3 3 3 3 3 3 3 3 3 3 3	3 	3s   A'BB	B	inner_loop_reduction_outer_notr  	LoopLevelc                 j    d fd} j         d         }|r|sJ  ||           nR|                                  j                            |j                    j                            |j                    j                            |j                    j                            |j                    j                            |j                    j                            |j                    j	                            |j	                    j
                            |j
                   dS )z
        CppKernel/CppVecKernel/CppTile2dKernel have reduction buffers themselves.
        Here, we decide how to aggregate them together and place new reduction buffers
        under CppKernelProxy.
        r  r;  c           
         t          j                  dk    sJ j        d         }j        d         }t          |j                  sJ t	          |          j        k    rV|                    |j                   |                                 j        	                    |j        |j        z              n3|                                 j        	                    |j                   t                      }t          j                    5 }|                    |d| j                  rA|                    |                                           |	                    |j                   d d d            n# 1 swxY w Y   t          j                    5 }|                    |d| j                  r
|                    |                                           t	          |          j        k    r|j        }|D ]P}| d| j         dt'          | j                   d}t+          |j        ||           t+          |j        ||           Qt/          |j                   |	                    t1          |j        | j        | j         d	| j        | j                             n|	                    |j                   d d d            n# 1 swxY w Y   |_        d S )
Nr   r   r  r7  r8  r   z_tail - r  r  )r   r  r   r  r%  r  r  r/  r  r  r7   r   r   r  r   r   r   r  r  rE   r	  r   r  r   r   rF  )	r  main_loop_kerneltail_loop_kernel
suffix_bufr   rt  r   r   r:  s	           rX   !aggregate_reduction_prefix_suffixzUCppKernelProxy.aggregate_reduction_buffers.<locals>.aggregate_reduction_prefix_suffix  s   t|$$))))#|A#|B/.0CDDDDD $%%88 !::$2   !::<<<%,,$5&78   
 !::<<<%,,-=-NOOO &J%'' I5#66jn  I ''
(9(9(;(;<<<%%&6&GHHHI I I I I I I I I I I I I I I %'' M5#66
  M ''
(9(9(;(;<<<,--@@)9)M$2  D*.'r'rZ^'r'r[YcYnMoMo'r'r'rH,-=-DdHUUU, 0 A4   
 55E5LMMM"))6 0 A *#-> 8 8 8 * 5 *     #))*:*KLLL5M M M M M M M M M M M M M M M6 %/D!!!s&   AE99E= E=D(KKKr   N)r  r;  )r  r  r  r  r  r  r  r  r  r  r  )r:  r:  r  rA  main_kernels   `    rX   r  z*CppKernelProxy.aggregate_reduction_buffers  sS   9	/ 9	/ 9	/ 9	/ 9	/ 9	/v l1o) 	G:--j999911333!(()EFFF!(()EFFF&--k.STTT&--k.STTT!(()IJJJ#**;+MNNN*115	
 	
 	
 	*115	
 	
 	
 	
 	
rW   r  )r  r  r  r  r  r%  r  r  r  r  r  r9  r  r$   r  r   r  r  r  r"  r/  r2  rl  r  r   r7   r  rl   r  rm  rn  s   @rX   rC  rC    s         #,JY+++)5ND&555/>tO,>>>+ + + + +@ @ @
= 
 
 
 
_* _* _* _* _*B; ; ;<v, v, v,p< < <84#6 8 8 8 8BC C C; ; ;3 3Xl3 3 3 3 3U
.2U
@H@UU
 U
 U
 U
 U
 U
 U
 U
rW   rC  c                   $     e Zd Z fdZd Z xZS )rf  c                 x    t                                          |j        |j        j                   g | _        d S r  )r8  r9  r  r  r  rg  r  s     rX   r9  zOuterLoopFusedKernel.__init__  s0    *LO,GHHH%'


rW   c           
      b   g }d | j         D             }|D ]d}|j        }|J |                    |                    t	          t          |          |j        z
  |j                  |          j                   et	          t          |j        t          |                    |j                  S )Nc                 6    g | ]}|                                 S rV   )rB  )r'  rT  s     rX   rd  z>OuterLoopFusedKernel.decide_parallel_depth.<locals>.<listcomp>	  s1     +
 +
 +
'0I  ""+
 +
 +
rW   rs  )
rg  r  r  r[  r  r   r  r  r_   r`   )r:  r\  rR  kernels_parallel_depthnested_kernelsrh  r  s          rX   r[  z*OuterLoopFusedKernel.decide_parallel_depth  s    !#+
 +
48J+
 +
 +
 % 	 	F !,K***")),,!,,/A/MM$6$B	     !
 
 
 
 "137M3N3N  +6	
 
 
 	
rW   )r  r  r  r9  r[  rm  rn  s   @rX   rf  rf    sG        ( ( ( ( (
 
 
 
 
 
 
rW   rf  c                       e Zd ZdZdZdZdS )ReasonFusedNodessame_vars_reducecompatible_reductioncompatible_ranges_no_reductionN)r  r  r  SAME_VARS_REDUCECOMPATIBLE_REDUCTIONCOMPATIBLE_RANGES_NO_REDUCTIONrV   rW   rX   rJ  rJ  %  s"        )1%E"""rW   rJ  c                       e Zd ZU eZee         ed<   dZ ee	j
        e	j        g          Zedej        dee	         fd            Z fdZdefdZd	 Zd
 Zd Zdee         fdZd Zd Zd ZdededefdZd Zd Z d Z!d Z"de#e$         fdZ%de&fdZ'de(e&e)e$f         fdZ*dedefdZ+dede,e         de,e         fd Z-d! Z.d" Z/d# Z0d'd%Z1d& Z2 xZ3S )(CppSchedulingkernel_proxy_clsi  devicer   c                     | j         S r  )backend_features)r1  rT  s     rX   get_backend_featuresz"CppScheduling.get_backend_features;  s    ##rW   c                     t                                          |           |r|                                  d| _        d S r  )r8  r9  reset_kernel_group_ready_to_flush)r:  r-  r=  s     rX   r9  zCppScheduling.__init__?  sC    ### 	&##%%%$rW   statusc                     || _         d S r  rZ  )r:  r[  s     rX   _set_flush_statuszCppScheduling._set_flush_statusE  s    %rW   c                 4    t          d |D                       S )Nc              3   x   K   | ]5}t          t          t          j        j        j        |                    V  6d S r  )r   r  r4   r  r  r   r  s     rX   r)  z)CppScheduling.group_fn.<locals>.<genexpr>I  s<      MM!U3qw/8!<<==MMMMMMrW   )r   )r:  rZ  s     rX   group_fnzCppScheduling.group_fnH  s    MMuMMMMMMrW   c                 ,    t                      | _        d S r  )KernelGroupre  r?  s    rX   rY  z CppScheduling.reset_kernel_groupK  s    'MMrW   c                    |                                 s|                                 rt          j        ||          S |                                r+|                                rJ t	          j        ||          S |                     ||          t          j        k    rt          |t          t          f          sJ t          |t          t          f          sJ |j
        \  }\  }}|j
        \  }\  }}|dk    r|dk    sJ ||f            fdt          |          t          |          k     r|n|}t          |t                    sJ t          |          t          |          k     r|n|}	 |	          }
|                    |
           |j
        \  }\  }}|j
        \  }\  }}||k    rt	          j        ||          S  |          }t          |	t                    r|	                    |           nht          |	t                    sJ |	j        D ]/}t          |t                    sJ |                    |           0t	          |	j        |	j                  }	|j
        \  }\  }}|j
        \  }\  }}||k    sJ ||f            t	          j        ||          S |                     ||          r0t                               |||                     ||                    S t	          j        ||          S )NrV   c                 d   t          | t                    rt          | j                  dk    sJ | j                    d }t	          t
                               }| j        D ]A} 	|          \  }}||}||k    sJ ||| j        f            |                    |           B|t          |          fS t          | t                    sJ | j	        }t          |t          j                  sJ |                                \  }}}|j        t          |j                                                  fS r]  )r   r"   r   snodesr   r   updater/  r$   r(  r   ComputedBufferget_default_sizes_bodyr  indexing_exprsra  )
r(  r  rj  snodevexprscomp_bufferr   rR  get_indexing_ranges_exprss
            rX   ro  z5CppScheduling.fuse.<locals>.get_indexing_ranges_exprs`  s7   !$(:;; S"4;//!333T[333%)
)3C):):%)[ 9 9E'@'@'G'GHAu)1-.
#-???ZDK4P???*11%8888)4+?+???)$>>>>>&*i)+r7HIIIII%0%G%G%I%I
4#T5H5O5O5Q5Q0R0RRRrW   )extra_indexing_constraints)
is_foreachr!   r3  is_templater"   _why_fuse_nodesrJ  rP  r   r$   r:  r   recompute_size_and_bodyrf  r-  can_fuse_vertical_outer_loopr   _get_outer_loop_fusion_depth)r:  r!  r"  r   vars1reduce1vars2reduce2node_to_recompref_noderef_indexing_constraints#node_to_recomp_indexing_constraintsrk  ro  s                @rX   r3  zCppScheduling.fuseN  sv    O	=!1!1!3!3 O	=-25%@@@   M	=((*****%*5%888 $$UE22#BC C "%-9K)LMMMMM!%-9K)LMMMMM&+k##E7&+k##E7"}}B'8J6S S S S S& +.e**s5zz*A*Au!.-@@@@@$'JJU$;$;55+D+DX+N+N(66/G 7    !&:E1 %:E1E>>-25%@@@ 7P6O"7 73 h66 W443V 5     &h0BCCCCC!)  )%?????557Z 6      2(2DhoVVH %:E1 %:E1~~~u~~~~).ue<<<225%@@ =2775$"C"CE5"Q"Q   *.ue<<<rW   c                     |j         \  }\  }}|j         \  }\  }}||k    r||k    rt          j        S |dk    r|||z   k    rt          j        S |                     ||          rt          j        S d S )NrV   )r:  rJ  rN  rO  &_can_fuse_nodes_with_compatible_rangesrP  )r:  r!  r"  r   rw  rx  ry  rz  s           rX   rs  zCppScheduling._why_fuse_nodes  s    #kE7#kE7E>>g00#44b==Uego55#8866ueDD 	C#BBtrW   c                 v   |j         \  }\  }}|j         \  }\  }}|dk    o|dk    }t          j        |          t          j        |          k    }	t          |          dk    pt          |          dk    }
|r|	r|
sdS t          |          t          |          k     r|n|}t          |          t          |          k     r|n|}t	          |t
                    rdS t	          |t                    sJ t	          |j        t          j	                  rdS t	          |j        t          j
                  sJ |j        j                                        }d }t	          |t
                    rt          t          t          df                              }|j        D ]}t	          |j        t          j	                  r n`t	          |j        t          j
                  sJ |                    t          |j        j                                                             t          |          dk    rdS t%          t'          t)          |                              }nVt	          |t                    sJ t	          |j        t          j
                  sJ |j        j                                        }||k    rdS dS )NrV   r5   F.T)r:  r  re   r   r   r"   r$   r(  r   TemplateBufferrh  dataget_sizer   r   r   rf  r  r/  nextiter)r:  r!  r"  r   rw  rx  ry  rz  c1c2c3r{  r|  ranges2ranges1
ranges_setrk  s                    rX   r  z4CppScheduling._can_fuse_nodes_with_compatible_ranges  s    $kE7#kE7],w"}Yu5!1!11ZZ1_/E

a 	r 	b 	5"%e**s5zz"9"9uJJU3355 n&899 	5 .-88888n)2+<== 	5.-r/@AAAAA !%*3355h 233 	4#E#s(O466J! B Bej"*;<< E!%*b.?@@@@@uUZ_%=%=%?%?@@AAAA:!##u4Z 0 01122GGh66666hmR->?????m(1133Gg5trW   c                     t          |t          t          f          sJ t          |t          t          f          sJ t          d ||fD                       rdS |                     ||          d uS )Nc              3   @   K   | ]}t          |t                    V  d S r  )r   r   r&  s     rX   r)  z:CppScheduling._can_fuse_horizontal_impl.<locals>.<genexpr>  s>       
 
>BJt899
 
 
 
 
 
rW   F)r   r"   r$   rg   rs  r:  r!  r"  s      rX   _can_fuse_horizontal_implz'CppScheduling._can_fuse_horizontal_impl  s    %"4m!DEEEEE%"4m!DEEEEE 
 
GLen
 
 
 
 
 	 5##E511==rW   c                 2   |                                 s|                                 rdS t          |                                          t          |                                          z   t          j        j        k    rdS |                     ||          S r  )rr  r   r7  r   r  max_horizontal_fusion_sizer  r  s      rX   can_fuse_horizontalz!CppScheduling.can_fuse_horizontal  s     	%"3"3"5"5 	5!!""S):):%;%;;j34 4 5--eU;;;rW   r!  r"  c                 B   |                                 x}rt          |j        t          j                  oht          |j        t          j                  oIt          |j        j                  dk    o,|j        j        d         	                                |j
        k    S dS )Nr5   r   F)get_template_noder   layoutr   MultiOutputLayoutr(  MultiOutputr   inputsrc  r   )r:  r!  r"  template_bufs       rX   can_fuse_multi_outputs_templatez-CppScheduling.can_fuse_multi_outputs_template  s     !22444< 	<.0DEE Iuz2>::I
)**a/I J%a(1133|7HH	 urW   c                 h   d}t          d ||fD                       s|S t          |t                    r|                                d         n|}t          |t          t
          f          sJ t          |t                    r|                                d         n|}t          |t          t
          f          sJ |j        \  }\  }}|j        \  }\  }	}
|dk    r|	dk    r|dk    r|
dk    r|S t          d ||fD                       r|j        |j        k    r|j        n|S t          t          |          t          |	                    }|dk    r\|d |         |	d |         k    rFt          d ||fD                       r)t          |          t          u r|n|}|j        |k    r|S |S |S |S )Nr   c              3   \   K   | ]'}t          |          t          t          t          fv V  (d S r  )r%  r   r"   r$   r&  s     rX   r)  z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>  sO       
 
  JJ+-?OP
 
 
 
 
 
rW   r  rV   c              3   B   K   | ]}t          |          t          u V  d S r  r+  r&  s     rX   r)  z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>   r,  rW   r5   c              3   B   K   | ]}t          |          t          u V  d S r  r+  r&  s     rX   r)  z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>+  s?        >BT

99     rW   )r.  r   r   r0  r"   r$   r:  r2  r_   r   rg   r%  )r:  r!  r"  DISABLE_OUTER_LOOP_FUSION_node1_node2r   rw  rx  ry  rz  r2  _compare_nodes                rX   rv  z*CppScheduling._get_outer_loop_fusion_depth  s@   $%! 
 
 
 
 
 
 
 	-
 -, %!<==E!!##B'' 	
 &#5}"EFFFFF %!<==E!!##A&& 	
 &#5}"EFFFFF$lE7$lE7B;;5B;;7b==W]],,TTeU^TTTTT 	 0E4QQQ --.
 #&c%jj#e**"="=#q((.../59Q:Q9Q3RRR  GLen     / "%[[,GGGEEU  !8<SSS2244 /.((rW   c                    |                                  ov|                                  oa|                                |j        z  oE|                     ||          o|                                  o|                     ||          dk    S r   )rr  get_operation_names	ancestorsr  r  rv  r  s      rX   ru  z*CppScheduling.can_fuse_vertical_outer_loop;  s    !!### E%%'''E))++eo=E ..ue<< -**,,,E 11%??1D		
rW   c                 6    |                      ||          rdS dS rd  )ru  r  s      rX   get_fusion_pair_priorityz&CppScheduling.get_fusion_pair_priorityG  s$    ,,UE:: 	11rW   c                 .   |                                 rdS |                                 r+t          ||g          \  }}|                                 o|S |                     ||          o|                                 p|                     ||          S r  )rr  rN   r  r  ru  )r:  r!  r"  template_fusion_supportedr   s        rX   can_fuse_verticalzCppScheduling.can_fuse_verticalN  s     	5 	J+Sw, ,(%q ))+++I0II**5%88UASASAUAU=U=..ue<<	=rW   rC  c                    t          d |D                       r|S ddd}d}d}d}|D ]|}t          |j        t          j                  sJ |j                                        \  }}}|j                                        D ] \  }	t          |	t          j	                  s!|	
                    t                    D ]t          fd|j        D                       r|k    r}|dz  }|dk    r|c c c S t          j        d         t          j        j        j                  rxj        d         |j        v rdbt#          fd|j                                        D                       r/j        d         dk    rj        d         j        d         d	}|}"~|s|S dfd
}
|D ]}||k    r|                    |
           |D ]}||k    r|                    |
            |S )aI  
        Apply loop split optimization.
        When one of the indexing_exprs contains a division, we eliminate the division by splitting the loop
        to avoid non-contiguous loads, subject to the following conditions:
            1. No reduction and no mudular index for all nodes.
            2. The indexing_exprs of all nodes contain only one (or more, but all the same) division,
               where the divisor is an integer and not too small (the divisor > 8), the dividend is
               one of the iter_vars, and this var, i.e. the dimension that needs to be split, is
               contiguous in all other indexing_exprs.

        For example, if the node's var_ranges: {z0: 2, z1: 9216, z2: 960} and indexing_exprs:
        {'index0': 8847360*z0 + 960*z1 + z2, 'index1': 32*z0 + (z2//30), 'index2': z2},
        we will split z2 -> 30*z2 + z3, then the node's var_ranges will be changed to
        {z0: 2, z1: 9216, z2: 32, z3: 30} and indexing_exprs will be changed to
        {'index0': 8847360*z0 + 960*z1 + 30*z2 + z3, 'index1': 32*z0 + z2, 'index2': 30*z2 + z3}.
        c              3      K   | ]]}t          |j        d          d                    dk    p4t          d |j        j                                        D                       V  ^dS )r5   r   c              3   J   K   | ]}|                     t                    V  d S r  )r   r   )r'  r  s     rX   r)  z9CppScheduling.try_loop_split.<locals>.<genexpr>.<genexpr>p  s?        .2))     rW   N)r   r:  rg   r=  rj  ra  r&  s     rX   r)  z/CppScheduling.try_loop_split.<locals>.<genexpr>n  s       
 

 	 
1a !!Q&   6:j6O6V6V6X6X    
 
 
 
 
 
rW   Nr   Fc              3   B   K   | ]}                     |          V  d S r  )r   )r'  r   div_exprs     rX   r)  z/CppScheduling.try_loop_split.<locals>.<genexpr>  s/      QQ#HLL--QQQQQQrW   r5   c              3   d   K   | ]*\  }}|k    t          |j        d                    dv V  +dS )r   rg  N)r  r  )r'  name_expr_r  r   s      rX   r)  z/CppScheduling.try_loop_split.<locals>.<genexpr>  sO           ,u$}} 0x}Q7GHHFR,}}}   rW   r  Tc                    | \  }}|\  }}|                               }|                                }||         z  ||<   |                    |dz              t          j        ||d          \  \  }	}
}|	                                }|                    |dz             }||         z  |z   ||<   t          j        |||g||	|          }s-|j        t          |j
                                                  f||f||	|ffS )Nr5   r  )rH  )r   copyinsertr
   index_vars_no_squeezepopr   r   r  r/  rj  ra  )rZ  rR  r  
index_sizereduce_sizer(  reduce_vars	split_idxnew_index_sizenew_index_varsr   r  	iter_varsdivisor_varrp  split_number	split_vars                 rX   
loop_splitz0CppScheduling.try_loop_split.<locals>.loop_split  sA   &+#J&*#J"((33I'__..N(29(=(MN9%!!)a->>>.:.PC/ / /+^Q '++--I#--	A66K#/)I2F#F#TIi ;y+.
NK D . O,335566.*
  -- rW   )recompute_sizes_body_func)rp  r  )rg   r   r(  r   rh  ri  rj  r  r   r  findr   r  r  corenumbersrZ  r.  rt  )r:  rC  num_div	div_expr_	match_divmatched_noder(  r   original_bodyr  r  r  rp  r   r  r  s              @@@@@rX   try_loop_splitzCppScheduling.try_loop_split[  s   &  
 

 
 
 
 
 
 	 L			 	, 	,Ddi):;;;;;"&)"B"B"D"DA}a+:@@BB , ,
d!$
33  $		( 3 3 , ,HQQQQ9PQQQQQ%$	11$,	1{{$"8=#3UZ5G5OPP,$M!,0GGG ,          0=0L0R0R0T0T        - %M!,q00$,M!$4	'/}Q'7$(	'+/,,:  	L%)"	 	 	 	 	 	 	:  	S 	SD|##,,z,RRR 	 	D|##,,/I.8 -   
 rW   r(  c                    	  j         t          j        }g g 	t          |t                    sJ dt          f	 fd} ||          s|t          _                                         	                                 t          j        j        	                    d          5  |
                                D ]t}t          |t          t          f          sJ |                                }                               }|                    |                               ||           u	 ddd           dS # 1 swxY w Y   dS dS )a  
        Generate the code for the outer loop fused scheduler node.
        1. Codegen with fused outer loop: depends on the analysis of
            the outer loop fused scheduler node, with or without the local buffer.
        2. If failed, fallback to standard codegen.
        r(  c           	      	    t           t                    sJ                                                                   dt          fdg }i t	           fd                                 D                       rDt                                                       D ] t          t                    sJ 	                    
                                                                           s%t                                                    dk    r{                                d         t	           fdj        D                       rjj        }t          |t           j                  sJ |                                } j        t                               z
  }fd}|                                r
 |            s3g }|j        d         }           |d	         }t-          |          D ]}	|                    d|           ||	z  }t!          j        |j        |j        ||          }
fd
}d} ||
|          }|sGt!          j        | dt          |           |
          }|                    |           g |j        <   |j                                     |           "t=          j                  5 }t          |          dk    r/|D ],}|j        J |                     ||j                            -                                 D ]}t          |tB          t          f          sJ "                              }|#                    |                                                               |                               |                                            $                     j                  s<|j%        D ]&}tL          j'        j%        (                    |           '	 d	d	d	           dS tR          j*                            tS          j+        t                    t          |j,                                        -                              }.                    |g t^          j0        1                                         d	d	d	           n# 1 swxY w Y   dS )zN
            Codegen code with fused outer loop and local Buffer.
            r(  c                     t          | t          t          f          sJ |                                 }t	          |d           j        \  }\  }}t          |          t          |          z   }|S )Nc                 D    t          |                                           S r  )rn   r  r  s    rX   r  z~CppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges.<locals>.<lambda>  s    Q^^-=-=)>)> rW   r[  )r   r$   r"   r7  r`   r:  r   )r(  rC  r   r:  r;  r  s         rX   get_call_rangeszlCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges  sy    !$8J(KLLLLL-1^^-=-=.1>>/ / / ,+E? $EllU?-C-CC""rW   c              3   `   K   | ](}t           |                    j        d z   k    V  )dS )r5   N)r   r2  )r'  r<  r  r(  s     rX   r)  zfCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.<genexpr>  sV         OOE**++t/Ka/OO     rW   r5   r   c              3   N   K   | ]}|j                                         v V   d S r  )r(  r7  )r'  r  r(  s     rX   r)  zfCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.<genexpr>  sE        :>	T^^%5%55     rW   c                  F   dd} t          j        j                                                  D ]\  }}| |z  z  | |z  } j                                                                      }fd |          o t          fdj        D                       S )Nr   r5   c                     | k    S r  rV   )r  contiguous_index_exprs    rX   is_contiguous_indexzCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.is_contiguous_index  s    '(,A'A ArW   c              3      K   | ]X}t          |j        t                    o9 |j        j                                                                                V  Yd S r  )r   r(  r$   r=  get_read_exprrc  )r'  r  r  scheduler_buffers     rX   r)  zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.<genexpr>  s       Q Q %) !+49m D D !"$7$7$(IO$A$A(8(A(A(C(C%& %&%" %"Q Q Q Q Q QrW   )r9  r=  r  r  get_write_exprrc  r.  r  )ri  r   rS  write_index_exprr  r  r  r  s       @@rX   is_all_write_read_contiguouszyCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous  s   451%&F.6 . 4 ? E E G G/ / 0 0
U !6# E 5 &%/=/C/R/R 0 9 9 ; ;0 0,B B B B B $7#67G#H#H $S Q Q Q Q Q -=,BQ Q Q N N rW   r  Nc                 |    |D ]7}| |j         k    r*t          fd|j                 D                       r|c S 8d S )Nc              3      K   | ]J}|j         	t          fdt          j        j        j        |j                  j        D                       V  Kd S )Nc              3   N   K   | ]}|j                                         v V   d S r  )r(  rc  )r'  r  visited_scheduler_nodess     rX   r)  zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>.<genexpr>?  sK       (& (&,0 )-	(:(:(<(<@W(W(& (& (& (& (& (&rW   )r   r.  r4   r  r-  name_to_bufr  )r'  global_bufferr  s     rX   r)  zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>>  s       S" S" )6 (5'9'E %( (& (& (& (&45G4E4Q,9,>5**/	(& (& (& %& %& (F'E'E'ES" S"rW   )r  r.  r   )local_buffer_layoutr`  	local_buflocal_to_global_buffersr  s      rX   try_share_local_bufferzsCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer<  s    -: 5 5	#6):J#J#Js S" S" S" S" :Q(1:&S" S" S" P" P"#J ,5$4$4$4#'4rW   local_buffer_datar   )r   r  F)local_buffer_numberT)2r   r   clearr   r.  r0  r   r7  r$   r  rc  r  r   get_outputsr  r(  r   rh  rb  r2  is_contiguousri  r9  r  FixedLayoutrT  r   Bufferr  r   rK   r  add_local_bufferr"   rS  r2  r_  r  r4   r  remover   !cpp_outer_loop_fused_inner_countsCppOuterLoopFusedCountr`  rk  finalize_kernelr  r  from_iterable)r(  r`  r  global_buffer_layoutsize_offsetr  local_buffer_strideri  local_buffer_sizeszr  r  local_buf_prefixlocal_buffer_usedscoperd  r<  r]  removed_bufferouter_fusion_cpp_kernel_proxyr  r  r  r  r  r[  re  
nodes_listr:  s   `                   @@@@@rX   $try_outer_loop_fusion_with_local_bufzSCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf  s!    d$?@@@@@!'')))#&7 # # # # .0MBD#     !1133     l <F<<'&*nn&6&6 c cN%nmDDDDD+//0G0G0I0IJJJ&3355!~99;;<<AA '5'A'A'C'CA'F$    BRBX     X )9(=)-9JKKKKK/</G/G/I/I,&*&BS+ON;;F F '     4 1>>@@% < < > >% %9;+!5!<R!@,;ON,K,K'LL-) #++<"="= ) )B/66q&AAA"bLFF.0n0706-/	/ /+( ( ( ( ( (" ,?(,B,B/- -)  1 Q02	(8%O%O3};M;M%O%O':1 1 1- *001BCCCNP34E4JK/0A0FGNN)   $L$566 "%}%%))(5  +0<<<..(*A,BS*T    "1133 9 9E%e.@--PQQQQQ'+'<'<\'J'J$$225??3D3DEEE)001ABBB%%eoo&7&78888>>)4+G  ! +0*? G G /66~FFFF +" " " " " " " ", 9@@2122,/0C,D,D     150O0O)1 1- ,,1@io33J??@  ?" " " " " " " " " " " " " " "H 4s   D:S$BSSSFr  N)re  r   r  r   r   r  r   r  r   r  r0  r"   r$   r7  rS  r2  r  )
r:  r(  r  r  r<  _nodesr]  r[  re  r   s
   `      @@@rX   codegen_outer_loop_nodez%CppScheduling.codegen_outer_loop_node  s    ()0)O&=?02
$ ;<<<<<f	7R f	 f	 f	 f	 f	 f	 f	 f	 f	P 43D99 	K5SG2!''))) '--e-DD K K!1133 K KE%e.@--PQQQQQ27//2C2CF'+'<'<\'J'J$$226::: 001A6JJJJKK K K K K K K K K K K K K K K K K K	K 	Ks   'B
D??EEc                    | j         }t          |t                    r|                     |           ni|                                }|                     |          }|                     |          }|                    |           |                    ||           | 	                                }|t          j        k    r|                     d           dS dS )zC
        Turn an set of pre-fused nodes into a C++ kernel.
        TN)re  r   r   r  r7  r  rS  r2  r  _get_scheduled_num_argsrR  MAX_FUSED_KERNEL_ARGS_NUMr^  )r:  r(  re  rC  r]  args_nums         rX   codegen_nodezCppScheduling.codegen_node  s     (d788 	B((....)-)9)9E''..E#44\BB**5111(()95AAA//11m===""4((((( >=rW   c                 j    t          |t                    ot          |j        t          j                  S r  )r   r$   r(  r   CppTemplateBuffer)r:  r(  s     rX   is_cpp_templatezCppScheduling.is_cpp_template  s/    $.. 
:Ir+4
 4
 	
rW   template_nodeepilogue_nodesprologue_nodesc                    |rJ d |D             }t           d         dxx         dz  cc<   t           d         dxx         t          |          z  cc<   |                     |          s
J d            t          t          |          }|j        \  }\  }}|dk    sJ t          t          j        |j                  }d |D             }t          d	 |D                       s
J d
            d } |||j
        |          }	|                    ||	|          \  }
}|
5  t          |j                  s|                                 |D ]}|                                  |            }ddd           n# 1 swxY w Y   t          j        |
          5  |g|}|                     |||
j                  }ddd           n# 1 swxY w Y   t          |j                  rt          |j                  dk    s
J d            |j        d         j        D ]m}t)          |j        t*                    s
J d            t)          |j        j        t          j                  s
J d            |j                                         n|
                    ||           t          j        xj        |
j        z  c_        |                                  dS )zG
        Codegen a CPP template, possibly with fused epilogues
        c                 J    g | ] }t          |t          t          f          |!S rV   )r   r$   r"   )r'  epilogue_nodes     rX   rd  z2CppScheduling.codegen_template.<locals>.<listcomp>  s>     
 
 
--9K)LMM

 
 
rW   inductorcpp_templated_kernel_counterr5   cpp_epilogue_fusion_counterzlTemplate node passed to CppScheduler.codegen_template must be a SchedulerNode that wraps a CppTemplateBufferrV   c                     g | ]	}|j         
S rV   r  r1  s     rX   rd  z2CppScheduling.codegen_template.<locals>.<listcomp>  s'     ;
 ;
 ;
AF;
 ;
 ;
rW   c              3   J   K   | ]}t          |t          j                  V  d S r  )r   r   rh  r1  s     rX   r)  z1CppScheduling.codegen_template.<locals>.<genexpr>  s/      OO:a!233OOOOOOrW   z9Epilogue nodes must all be instances of ir.ComputedBufferc                     sdS |                                  |v sJ ||                                           j        }t          fd|D                        S )NFc              3   h   K   | ],}t          |j        t                    o|j        j        v V  -d S r  )r   r(  r   )r'  r  r  s     rX   r)  zZCppScheduling.codegen_template.<locals>.template_buffer_has_other_users.<locals>.<genexpr>  sW          49&788 5INn4     rW   )rc  r  r.  )template_bufferoutputs_by_namer  r  s     ` rX   template_buffer_has_other_userszGCppScheduling.codegen_template.<locals>.template_buffer_has_other_users  s     " u"++--@@@@#O$<$<$>$>?EE     "      rW   )$flag_template_buffer_has_other_usersr  NzSMulti outputs template should be with 1 output template buffer of MultiOutputLayoutr   z?Multi outputs template should be with ExternKernelSchedulerNodez7Multi outputs template has multi users with MultiOutput)r   r   r  r   r$   r:  r   r
  r(  r.  r  make_kernel_renderr)   r&  r4   set_kernel_handlerdefine_kernelr  outputsr  r   r    r  call_kernelr  r  free_buffers_in_scheduler)r:  r  r  r  r   rnumelctbepilogue_ir_nodesr  r  rh  renderr(  src_codenode_schedulekernel_namer  s                    rX   codegen_templatezCppScheduling.codegen_template  s    "!!!
 
!/
 
 
 	;<<<A<<<:;;;s>?R?RR;;;##M22 	
 	
z	
 	
2 ]M::&,;Av||||$()=}?Q$R$R;
 ;
*;
 ;
 ;
 OO=NOOOOO 	
 	
G	
 	
O	 	 	 0O/N.0A0
 0
, //1U, 0 
 

  	  	 ,]-?@@ )&&(((&    vxxH	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  !&)) 	S 	S*<^<M,,X}fkRRK	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S %]%788 	% },--222e 322 &-a06 % %!$)-FGG  U G "$).".AA  M A 	""$$$$;,,,	6#99&&(((((s%   AE55E9<E9"GG	Gc                 4    | j                                         S r  )re  get_num_argsr?  s    rX   r  z%CppScheduling._get_scheduled_num_args  s     --///rW   c                     | j         S r  r]  r?  s    rX   ready_to_flushzCppScheduling.ready_to_flush	  s    ##rW   c                     d S r  rV   r?  s    rX   codegen_synczCppScheduling.codegen_sync  s    rW   Nc                 0   t           j        j        }t          j        j        rt          |t          j        j                  nd}d                    d||                                g          }t           j        j	        r|nd}|
                    t          t          j                  |          }|
                    t          t          j                  |          }|
                    dd          }|                    d          }|                    d|          }	t"          r|                    d|	d	z             }	|||	d	z             d
}
t%                      }|| j        j        n|}|                                \  }}}t           j        j	        s|                    d|d           |                    |d           t           j        j	        s|                    d           |                    ||                                d|
           |S )NrS   r   r  rh  z#pragma CMTz//z
extern "C"r   r5   z;
zasync_compile.cpp_pybinding(z, '''T)stripz''')F)gpucpp_definition)r4   r  wrapper_coder   r  descriptive_namesr'   r  next_kernel_suffixcpp_wrapperr  rp   r,   KERNEL_NAMEDESCRIPTIVE_NAMErfindr  rU   r=   re  r  cpp_argdefsr   r  r  getvalue)r:  r'  rC  kernel_argsr  
fused_namer)  kernel_decl_name
first_char	last_charkernel_definitioncompile_wrapperr  r   	arg_typess                  rX   r  zCppScheduling.define_kernel  s   '& z+!%)EFFF 	
 hhz73M3M3O3OPQQ*+'*=K;;8##C(?$@$@BRSS##C(D$E$E{SS ##M488 ^^L11
MM#z22	 	: c9q=99I'
Y](BCHHH(**)4)<t %%+**,,1iw" 	Y%%&WY&W&W&WXXXxt444w" 	.%%f---$$&&,	 	 	
 	
 	
 rW   c                    | j                                         }|r}|                     || j         j                  }d }t          j        j        dk    rt          | j         j        |          }| j                             t          j
        j        ||           |                                  |                     d           d S )Nr   )debug_handleF)re  codegen_groupr  scheduled_nodesr   traceprovenance_tracking_levelr   r!  r4   r  r5  rY  r^  )r:  r'  r)  rG  s       rX   flushzCppScheduling.flush7  s    $2244 	,,$+; K +/L|5::F%5{    ))$k *    	!!!u%%%%%rW   r  )4r  r  r  rC  rS  r%  r  r  r   r6   INPLACE_BUFFERSREDUCE_TO_SINGLE_ELEMENTrV  rl  r   rT  rW  r9  rl   r^  ra  rY  r3  r   rJ  rs  r  r  r  r   r  rv  ru  r  r  r/  r$   r  r   r  r	   r"   r  r  r   r*  r  r.  r0  r  rL  rm  rn  s   @rX   rR  rR  +  s         .<d>*;;; !$!z*3	
  $%, $:n;U $ $ $ [$% % % % %& & & & &N N N* * *P= P= P=dx8H/I    6 6 6p> > >	< 	< 	<
&
/@
	
 
 
 
4) 4) 4)l

 

 

  = = =oD$7 o o o obFK)FK FK FK FKP)/1C]RS) ) ) ),
$5 
$ 
 
 
 

U)(U) !!23U) !!23	U) U) U) U)n0 0 0$ $ $  & & & &P& & & & & & &rW   rR  c                   X     e Zd Z fdZd Zd Zd Zd
defdZd
de	e
         fd	Z xZS )rc  c                 B   t                                                       t                      | _        t	                      | _        t          | j                  | _        t          j	                    | _
        | j
                            | j                   g | _        d S r  )r8  r9  r?   r  r7   
loops_codeWorkSharingr  r   r   r   r   rI  rP  s    rX   r9  zKernelGroup.__init__K  sv    LL	&..do..)++

  )))!rW   c                 8     || j         t                      g|R  S r  )r  r+   )r:  r1  r  s      rX   r  zKernelGroup.new_kernelT  s%    s49244<t<<<<rW   c                 n    | xj         |z  c_         | j        }| j        }|                    ||           d S r  )rI  rQ  r  rl  )r:  r  rC  r   r  s        rX   r  zKernelGroup.finalize_kernelW  s@    %W  r*****rW   c                 ^    | j                                         \  }}}t          |          }|S r  )r  r<  r   )r:  arg_defs
_call_args
_arg_typesr  s        rX   r,  zKernelGroup.get_num_args]  s,    +/9+@+@+B+B(*jx==rW   Nr   c                 J   | j                                          | j        sdS t                      }t          j        j        ot          j        dv }|r|	                    dg           |
                    d           |t          t          j                  n|}|t          t          j                  n|}| j                                        \  }}}d                    d                              |          }t'                      }t          j        j        rdnd}	|
                    d| d	|	 d
| d| d	           |                                5  |rMt,          j        j        }
|
dt          |
          z   dz   nd}|	                    d||z    d||z    dg           | j                                        D ]!\  }}|
                    d| d| d           "|                    | j                   d d d            n# 1 swxY w Y   |                                S )NrS   )linuxrQ   z3#include <torch/csrc/inductor/aoti_runtime/utils.h>z+#include <torch/csrc/inductor/cpp_prefix.h>z,
   C10_ALWAYS_INLINE_ATTRIBUTEzextern "C" z void r   r  r   graph_r   z9torch::aot_inductor::RAIIAtenRecordFunctionHandle record_z_("z", nullptr);r   r   r   )r   rL  rI  r7   r   r  enable_kernel_profilesysplatformr   r   rp   r,   r9  r:  r  r<  ljustr  rY   force_inline_kernelr   r4   r  graph_idaliasesr  rQ  r=  )r:  r   r   r^  r@  r)  rV  r   func_export_declinline_attrrc  rH  oldnews                 rX   rH  zKernelGroup.codegen_groupb  s   
# 	2~~ !'
 @ !
S\ V
 F
 ! 	UOORSTTTDEEE <@<3{6777T;?<c+6777T..00!Q;;r??''11133-3Z-KS))QS 	 	^*^^+^^@P^^S[^^^	
 	
 	

 [[]] 	) 	)$ 
7+;C;OCMM1C77UWb&,{&:b b?E?Sb b b   !I--// 7 7S5s55s5556666KK(((	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) }}s   B%HH
H
rG  c                 t    | j                                         \  }}}|                    ||d||           d S )NF)tritonrE  rG  )r  r<  generate_kernel_call)r:  r  r)  rG  r   	call_argsrE  s          rX   r!  zKernelGroup.call_kernel  sR    "&)"7"7"9"99i$$% 	% 	
 	
 	
 	
 	
rW   r  )r  r  r  r9  r  r  r,  rp   rH  r   rn   r!  rm  rn  s   @rX   rc  rc  J  s        " " " " "= = =+ + +  
, ,# , , , ,\
 
hsm 
 
 
 
 
 
 
 
rW   rc  c                   2    e Zd Zd Zd Zd Zd Zd Zd ZdS )rR  c                 `    || _         d| _        d | _        t          j                    | _        d S r  )r   in_parallelr  r   r   r   )r:  r   s     rX   r9  zWorkSharing.__init__  s-    	 )++


rW   c                    | j         r|| j        k    r|                                  | j         s|| _        d| _         t          j        j        r| j                            d           n| j                            d| d           | j        	                    | j        
                                           | j                            d           d S d S )NTz#pragma omp parallelz!#pragma omp parallel num_threads(r   zint tid = omp_get_thread_num();)ro  r  rL  r   r  r  r   r   r   r   r   )r:  rR  s     rX   r>  zWorkSharing.parallel  s     	4+; ; ;JJLLL 
	&D#Dz) T	##$:;;;;	##$R$R$R$RSSSJ$$TY%5%5%7%7888I1    
	 
	rW   c                 R    | j         r| j                            d           | j         S )Nz#pragma omp single)ro  r   r   r?  s    rX   r^  zWorkSharing.single  s-     	6I 4555rW   c                 F    | j                                          d| _        d S r  )r   rL  ro  r?  s    rX   rL  zWorkSharing.close  s$    
 rW   c                 8    | j                                          | S r  )r   rz  r?  s    rX   rz  zWorkSharing.__enter__  s    
rW   c                 >    | j                             |||           d S r  )r   r  r|  s       rX   r  zWorkSharing.__exit__  s"    
Hgv66666rW   N)	r  r  r  r9  r>  r^  rL  rz  r  rV   rW   rX   rR  rR    sn        , , ,        
! ! !  7 7 7 7 7rW   rR  c                   4   e Zd ZU dZeej                 ed<   dZeej                 ed<   ej	        j
        Zej        ed<   ej	        j
        Zej        ed<   ej	        j        Zej        ed<   dZeed<   d	Zeed
<   d	Zeed<   d	Zeed<   d	Zeed<   d Zd Zd ZdS )r;  Nr   rF  rG  r	  rH  r   r>  Fsimd_ompsimd_vec	collapsedr  c                 f    t          j                    }|r|                                nd| _        d S r]  )r   r  rk  simd_nelements)r:  r  s     rX   __post_init__zLoopLevel.__post_init__  s7     .9-E-G-GAO#V>#;#;#=#=#=UVrW   c                     t          j        |          }t          | j        | j                  }||_        d|_        t          |j        |          |z  |_        | j	        |_	        d|_
        | j        |_        |S )NTF)r   rZ  r;  r   rF  rH  rw  r   r	  r>  rx  r  )r:  r(  sympy_factorrN  s       rX   r  zLoopLevel.tile  sj    }V,,49--!
"49l;;lJ -rW   c                    t          | j                  }t          | j                  }t          j        j        r||k    rd S | j        r| j        dk    rd| j         dnd}| j        r<d}| j        dk    r|d| j         dz  }| j        r|	                    dd|           }n6| j
        rd}n,| j        rd	| }n| j        st          j                    rd
}nd}t           d| j         d| }| j         d| }| j        j        r| j         dt          | j                   }n4| j         dt          | j                   dt          | j                   d}d| d| d| d}| j        s|s|gS ||gS )Nr5   zsimd simdlen(z) rS   z#pragma omp forz
 collapse(r   z for z#pragma omp z#pragma GCC ivdepr   r  r  r  z+=(z == 0 ? 1 : zfor(r   )rE   rG  rF  r   r  no_redundant_loopsrv  rz  r>  r  rw  r  r   r  rJ   r   rH  r  rx  )	r:  offset_expr	size_exprsimdline1
offset_strr  	steps_strline2s	            rX   rV  zLoopLevel.lines  s   !$+..	**	:( 	[I-E-E4 }!%!4q!8!8 4D/3333 	
 = 	%E}q  6dm6666} ?g~t~~>>] 	EE] 	)4))EE" 	{'9';'; 	'EEE"==TX====
h,,,,: 	8@@{4:'>'>@@II
 8 2 2DJ 7 7 2 2"4:..2 2 2  >z==X=====> 	 	7Nu~rW   )r  r  r  r   r   r   r  r  rF  r   r   rG  r	  OnerH  r>  rn   rv  rl   rw  rx  r  r{  r  rV  rV   rW   rX   r;  r;    s         $C%*	$$$!%D(5:
%%%FEJ%%% #W\J
)))E5:###HcHdHdItL$
W 
W 
W	 	 	' ' ' ' 'rW   r;  c                       e Zd ZU dZdZeee                  ed<   dZ	ee
         ed<   ede
fd            Zd Zed             Zd Zd	 Zd
e
fdZd ZdefdZdS )rR  aV  
    A loop-nest-like structure. It is built with the `build` method
    as a loop nest and then will perform loop-tiling at some depth.

    A typical case is for vectorization, where we typically do loop-tiling
    at the innermost loop level. A more complicated case is when we do
    2D tiling at both the innermost and outer levels.
    NrN  rh  c                     | j         }| j        }| j        }|J d}t          t	          ||                    D ]E\  }\  }}t          ||          }|s|g}n|                    |           ||k    r| j        |_        Ft          |          }	|	S )z4Build a LoopNest with the given `kernel` as the leafN)	rc  rY  r  r   r  r;  r  r  rR  )
rh  rc  rY  r  rN  loop_idxr   rF  rN  rT  s
             rX   rk  zLoopNest.build'  s     ? 0***+/%.s8V/D/D%E%E 	8 	8!HksDS$''D #T"""?**$*$7!UOO	rW   c                 *    t          | j                  S r  )rl   rN  r?  s    rX   __bool__zLoopNest.__bool__<  s    DJrW   c                 v   | j         t          dd          S d}d}| j         d         j        }t          j        d          }| j         D ]1}|j        |k    r n#|t          |j        |j                  z  }|dz  }2d } || j                   }dt          fd}|t          | j                   k     rt          |t          j                  rt          | j         |         j        t          j                  r|dz  t          | j         |         j        | j         |         j                  k     r}|#||k    r| j         |         j        r ||           sX|}d}| j         |         j        }t          |t          | j                             D ]}	| j         |	         j        |k    r n|dz  } t          ||          S )	a  
        Maximal allowed depth for parallelism: All reduction or non-reduction levels.
        When the range of the first inner loop beyond the maximum parallel depth is much
        larger than the range of all outer loops within the maximum parallel depth,
        change the starting depth of parallelism to the first inner loop and recalculate
        the maximum parallel depth.
        Nr   rs  r5   c                 F    t          |           D ]\  }}|j        r|c S d S r  )r   rw  )rN  r   rN  s      rX   get_simd_vec_depthz7LoopNest.max_parallel_depth.<locals>.get_simd_vec_depthU  s:    $U++  4= HHH4rW   rT  c                     t          | j        t                    sJ t          d | j        j        D                       S )Nc              3   B   K   | ]}t          |t                     V  d S r  )r   r  )r'  rh  s     rX   r)  zILoopNest.max_parallel_depth.<locals>.has_scalar_kernel.<locals>.<genexpr>`  sC         v|444     rW   )r   rh  rC  rg   r  )rT  s    rX   has_scalar_kernelz6LoopNest.max_parallel_depth.<locals>.has_scalar_kernel^  sO    i.?????  '.6     rW   rQ  )rN  r  r  r   rZ  r   rF  rH  rR  r   r   rS  )
r:  r  	max_depthr  	num_stepsrN  r  simd_vec_depthr  r   s
             rX   r\  zLoopNest.max_parallel_depth?  s    : qAAAA	z!}1M!$$	J 	 	D L00!HTY
$C$CCINII	 	 	 ,+DJ77	 	 	 	 	 DJ''9em44 (4:i05u}EE ( Ctz),14:i3H3NOOP P *..Jy)6 /%%d++ /
 $KI:k2?L;DJ88  :a=-==EQ		I;OOOOrW   c                    |j         |                                 j         k    s
J d            | j        J t          | j                  |j         k    sJ | j        |j                 }|j         |_        |j        rt          xj        dz  c_        t          |j        dz   |j                   D ]}d| j        |         _
        d S )Nz?Parallel depth cannot exceed the maximal allowed parallel depthr5   T)r  r\  rN  r   r  r>  r  r   parallel_reduction_countrS  rx  )r:  r@  rN  r   s       rX   r]  zLoopNest.mark_parallel~  s    '4+B+B+D+D+SSSSM TSS z%%%4:)":::::z)/0!0 	2,,1,,y,q0)2JKK 	+ 	+A&*DJqM##	+ 	+rW   c                 ~    | j         sJ | j         |                             |          | j         |<   | j         |         S )z
        Do loop-tiling at the `depth` level with `factor`.
            for (x0 = 0; x0 < x0_end; x0++)
            ->
            for (x0 = 0; x0 < x0_end; x0 += factor)
        See details in Note [tiled_size].
        )rN  r  )r:  rJ  r(  s      rX   r  zLoopNest.tile  s?     zz Ju-226::
5z%  rW   r   c                 "    | j         sJ | j         S r  rh  r?  s    rX   rB  zLoopNest.get_kernel  s    {{{rW   c                     || _         d S r  r  r5  s     rX   r  zLoopNest.set_kernel  s    rW   levelc                     | j         sJ t          | j                   |k    sJ |t          | j                   k    rd n| j         |d          }t          || j                  S r  )rN  r   rR  rh  )r:  r  rN  s      rX   rb  zLoopNest.from_loop_level  s`    zz4:%''''TZ00dj6Ht{+++rW   )r  r  r  r  rN  r   r/  r;  r  rh  r  r  rk  r  r%   r\  r]  r  rB  r  rn   rb  rV   rW   rX   rR  rR    s          (,E8DO$+++"&FHY&&&i    \(      <P <P ]<P|+ + +
! 
! 
!I      ,S , , , , , ,rW   rR  )NNNr  )r   dataclassesrU  r  r  rW  r   r_  rF  collections.abcr   enumr   typingr   r   r   r   r	   r   r   torch.fxtorch._inductorr
   torch._prims_commonr   r   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   _dynamo.utilsr   rS   r   r   r   r   r   debugr   rU  r   r-  r   r   r    r!   r"   r#   r$   utilsr%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   virtualizedr1   r2   r3   r4   commonr6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   	cpp_utilsrB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   r`  rU   cacherY   _logginggetArtifactLoggerr  schedule_logNATIVE_OMP_RTYPESRTYPE_TO_CPPr  PYTHON_TO_CPPCONTAINER_PYTHON_TO_CPPrK  rL  r   rJ  ro   rl   rM  rN  r  r  rO  rP  rw   r/  r   r  rx   r   r   r  r   r   r  rp   r   rn   r   r   r   	lru_cacher   r  r  	dataclassr  r   rp  r  r  _initialize_pointwise_overridesr  r  r  r  r  r  r   rL  rN  rC  rf  rJ  rR  rc  rR  r;  rR  rV   rW   rX   <module>r     s                      				 



  $ $ $ $ $ $       7 7 7 7 7 7 7 7 7 7 7 7 7 7    ( ( ( ( ( ( @ @ @ @ @ @ @ @ / / / / / / K K K K K K K K K K O O O O O O O O O O % % % % % % < < < < < < < < < < < < < < ; ; ; ; ; ;                                                         > = = = = = = = = = = =                                                            & lg% : : : ~//*EEJBBBCC    !j      #&     
N	M 
M	K	N	M	J	K	J	K	K		* T%+&    
K	N	M	K	J1 D-   ) ) )D   $(/) /)
 EL!/) /) /) /)d  -
-j- - 
	-
 j- - - - -`3#$  ;	
 
sCx   BV^ V3 V# V V V V/ / / / /2 -UZ -el - - - - ;uz ;
 ;PS ; ; ; ;| FJ! !:!!L!6>sm! ! ! !        ]! ]! ]! ]! ]!"4 ]! ]! ]!@! ! ! ! ! ! ! !B& & &oH oH oH oH oH; oH oH oHd  , ,U 3 3 3|7 |7 |7 |7 |7l |7 |7 |7~  / / 9 9 9  % % ' ' '7 7 7 7 7 7 7 7R R R R R R R Rj^ ^ ^ ^ ^9 ^ ^ ^BY
 Y
 Y
 Y
 Y
l Y
 Y
 Y
x)$ )$eHU[4I44O.P )$ )$ )$ )$XbK bK bK bK bK bK bK bKJg	
 g	
 g	
 g	
 g	
Y g	
 g	
 g	
T 
  
  
  
  
9  
  
  
FF F F F Ft F F F\& \& \& \& \&N \& \& \&~N
 N
 N
 N
 N
 N
 N
 N
b%7 %7 %7 %7 %7 %7 %7 %7P R R R R R R R Rj H, H, H, H, H, H, H, H, H, H,rW   