
    `i!                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZ d dlZd dlZd dlZddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl#m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3m4Z4 ddl5m6Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZB ddlCmDZD ddlEmFZF ddlGmHZHmIZImJZJ erd dlKmLZL ddl*mMZMmNZN ddlOmPZP  ejQ        eR          ZSd ZT G d deU          ZV G d  d!e@          ZW eW            jX        ZY e@            jX        ZZej[        d"ej\        d#ej]        d$ej^        d%ej_        d&ej`        d'eja        d(ejb        d)ejc        d*ejd        d+eje        d,ejf        d-ejg        d.iZhd/ Zid0 Zj G d1 d2e?          Zkekl                    d3            G d4 d5e;          Zmejn         G d6 d7                      Zod8 Zpd9 Zq G d: d;eI          Zr G d< d=eJ          ZsdS )>    )annotationsN)defaultdict)inf)AnyCallablecastOptionalTYPE_CHECKINGUnion   )is_integer_dtype)
OrderedSet)FloorDivModularIndexing)symbol_is_typeSymT)ValueRanges   )configir)HalideCodeCache)get_reduction_combine_fn)is_metric_table_enabledlog_kernel_metadata)AddParenHandler)HalideInputSpec
HalideMeta)get_bounds_index_exprget_kernel_metadataparallel_num_threadssympy_index_symbol
sympy_subs)_opsV   )	BackendFeatureCSEVariableDeferredLineIndentedBufferKernelArgTypeOpOverridesPythonPrinterSizeArg	TensorArg)DTYPE_TO_CPP)cexpr)constant_repr
SIMDKernelSIMDScheduling)Sequence)ReductionType	StoreMode)BlockShapeTypec                4   t          | t                    rMd| cxk    rdk    s@n t          j        t          j                  }| |j        k    rdS | |j        k    rdS d| dS t          | t                    rdt          |            dS t          |           S )Ni   izhl.Int(64).min()zhl.Int(64).max()zhl.i64()zhl.f64()

isinstanceinttorchiinfoint64minmaxfloatr1   repr)valinfos     r/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_inductor/codegen/halide.pyhalide_constantrF   >   s    #s "[C%E%E%E%E:%E%E%E%E{5;''$(??%%$(??%%!!!!!#u /.s++....99    c                        e Zd Zd fdZ xZS )UnsupportedreturnNonec                P    t                                          d|            d S )Nz!halide backend does not support: )super__init__)selfthing	__class__s     rE   rN   zUnsupported.__init__L   s*    DUDDEEEEErG   rJ   rK   )__name__
__module____qualname__rN   __classcell__rQ   s   @rE   rI   rI   K   sG        F F F F F F F F F FrG   rI   c                       e Zd Zed             Zed             Zd Zd Zd ZeZ	d Z
e
Zd Zd Zd	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Z fdZd ZeZd Zd Z  xZ!S )HalidePrinterc                2    dt           j        j         d|  dS )Nhl.cast(, r9   )r$   kernelindex_dtypeexprs    rE   
cast_indexzHalidePrinter.cast_indexQ   s     9!(.99$9999rG   c                    d|  dS )Nhl.cast(hl.Float(32), r9    r_   s    rE   
cast_floatzHalidePrinter.cast_floatU   s    /////rG   c                    d| dS )Nhl.f32(r9   rd   rO   r`   s     rE   _print_FloatzHalidePrinter._print_FloatY   s         rG   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )Nr%   rg   r   r9   lenargs_printrh   s     rE   _print_ToFloatzHalidePrinter._print_ToFloat\   >    49~~""""5TYq\225555rG   c                    t          |j                  dk    sJ |                     d|                     |j        d                    d          S )Nr%   	hl.floor(r   r9   rl   rm   ra   rn   rh   s     rE   _print_floorzHalidePrinter._print_floor`   K    49~~""""G4;;ty|+D+DGGGHHHrG   c                    t          |j                  dk    sJ |                     d|                     |j        d                    d          S )Nr%   	hl.trunc(r   r9   rs   rh   s     rE   _print_TrunczHalidePrinter._print_Truncf   ru   rG   c                    t          |j                  dk    sJ |                     d|                     |j        d                    d          S )Nr%   hl.ceil(r   r9   rs   rh   s     rE   _print_ceilingzHalidePrinter._print_ceilingl   sK    49~~""""F$++dil*C*CFFFGGGrG   c                Z    d|                      |                     |                     dS Nzhl.sqrt(r9   )re   rn   rh   s     rE   _helper_sqrtzHalidePrinter._helper_sqrtp   s+    ?$//$++d*;*;<<????rG   c                    |                      |j        d                   }|                      |j        d                   }|                      |j        d                   }d| d| d| dS )Nr   r%   r   
hl.select(r\   r9   )doprintrm   )rO   r`   cpqs        rE   _print_WherezHalidePrinter._print_Wheres   sh    LL1&&LL1&&LL1&&*A****a****rG   c                j   t          |j                  dk    r |                     |j        d                   S t          |j                  dz  }|                     t          j        |j        d |                    }|                     t          j        |j        |d                     }d| d| dS )Nr%   r   r   hl.min(r\   r9   )rl   rm   rn   sympyMinrO   r`   midabs        rE   
_print_MinzHalidePrinter._print_Miny   s    ty>>Q;;ty|,,,$)nn!KK	49TcT?344KK	49STT?344"""a""""rG   c                j   t          |j                  dk    r |                     |j        d                   S t          |j                  dz  }|                     t          j        |j        d |                    }|                     t          j        |j        |d                     }d| d| dS )Nr%   r   r   hl.max(r\   r9   )rl   rm   rn   r   Maxr   s        rE   
_print_MaxzHalidePrinter._print_Max   s    ty>>Q;;ty|,,,$)nn!KK	49TcT?344KK	49STT?344"""a""""rG   c                    t          |j                  dk    sJ |                     d|                     |j        d                    d          S )Nr%   hl.abs(r   r9   rs   rh   s     rE   
_print_AbszHalidePrinter._print_Abs   sK    49~~""""ETYq\)B)BEEEFFFrG   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )Nr%   hl.cos(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_cosz&HalidePrinter._print_OpaqueUnaryFn_cos   rp   rG   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )Nr%   hl.cosh(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_coshz'HalidePrinter._print_OpaqueUnaryFn_cosh   >    49~~""""6$++dil336666rG   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )Nr%   hl.acos(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_acosz'HalidePrinter._print_OpaqueUnaryFn_acos   r   rG   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )Nr%   hl.sin(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_sinz&HalidePrinter._print_OpaqueUnaryFn_sin   rp   rG   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )Nr%   hl.sinh(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_sinhz'HalidePrinter._print_OpaqueUnaryFn_sinh   r   rG   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )Nr%   hl.asin(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_asinz'HalidePrinter._print_OpaqueUnaryFn_asin   r   rG   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )Nr%   hl.tan(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_tanz&HalidePrinter._print_OpaqueUnaryFn_tan   rp   rG   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )Nr%   hl.tanh(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_tanhz'HalidePrinter._print_OpaqueUnaryFn_tanh   r   rG   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )Nr%   hl.atan(r   r9   rk   rh   s     rE   _print_OpaqueUnaryFn_atanz'HalidePrinter._print_OpaqueUnaryFn_atan   r   rG   c                     t          d          Nlog2NotImplementedErrorrh   s     rE   _print_OpaqueUnaryFn_log2z'HalidePrinter._print_OpaqueUnaryFn_log2   s    !&)))rG   c                @   |j         r!t                                          |          S |j        \  }}|                     |                     |                    }|                     |                     |                    }|                     d| d| d          S )Nrr   z / r9   )
is_integerrM   _print_FloorDivrm   re   r   ra   )rO   r`   xdivrQ   s       rE   r   zHalidePrinter._print_FloorDiv   s    ? 	177**40003OODLLOO,,oodll3//007177777888rG   c                    t          |j                  dk    sJ |                     d|                     |j        d                    d          S )Nr%   	hl.round(r   r9   rs   rh   s     rE   _print_RoundzHalidePrinter._print_Round   ru   rG   c                (    |j         \  }}d| d| dS )N() / (z+hl.f32(0)))rm   )rO   r`   r   r   s       rE   _print_IntTrueDivzHalidePrinter._print_IntTrueDiv   s&    y1)1))1))))rG   c                    |j         \  }}|                     |          }t          |          }dd| z  d| dd|z  dS )Nrg   g      $@z)*hl.round((z	)*hl.f32()))rm   rn   r;   )rO   r`   rC   ns       rE   _print_RoundDecimalz!HalidePrinter._print_RoundDecimal   sT    Qkk#FFP1"PPSPP47PPPPrG   )"rS   rT   rU   staticmethodra   re   ri   ro   rt   _print_FloorToIntrx   _print_TruncToIntr{   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _print_RoundToIntr   r   rV   rW   s   @rE   rY   rY   P   s       : : \: 0 0 \0! ! !6 6 6I I I %I I I %H H H@ @ @+ + +# # ## # #G G G6 6 67 7 77 7 76 6 67 7 77 7 76 6 67 7 77 7 7* * *9 9 9 9 9I I I %* * *
Q Q Q Q Q Q QrG   rY   z	hl.Bool()zhl.BFloat(16)zhl.Float(16)zhl.Float(32)zhl.Float(64)z	hl.Int(8)z
hl.Int(16)z
hl.Int(32)z
hl.Int(64)z
hl.UInt(8)zhl.UInt(16)zhl.UInt(32)zhl.UInt(64)c                    t           |          S N)_halide_typedtypes    rE   halide_typer      s    rG   c                    t          |           r#| j        r| t          j        k    rt          j        } | t          j        t          j        fv rt          j        } t          |           S r   )	r   	is_signedr<   r>   int32float16bfloat16float32r   r   s    rE   halide_acc_typer      sW     5? u7K7K///urG   c                     e Zd Ze	 	 dHdId            ZedJd            Zed	             Zed
             Zed             Z	ed             Z
ed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zed              Zed!             Zed"             Z ed#             Z!ed$             Z"ed%             Z#ed&             Z$ed'             Z%ed(             Z&ed)             Z'ed*             Z(ed+             Z)ed,             Z*ed-             Z+ed.             Z,ed/             Z-ed0             Z.ed1             Z/ed2             Z0ed3             Z1ed4             Z2ed5             Z3ed6             Z4ed7             Z5ed8             Z6ed9             Z7ed:             Z8ed;             Z9ed<             Z:ed=             Z;ed>             Z<ed?             Z=ed@             Z>edA             Z?edB             Z@edKdC            ZAedD             ZBedE             ZCedF             ZDedG             ZEdS )LHalideOverridesNTr   torch.dtype	src_dtypeOptional[torch.dtype]c                Z    |t           j        k    rd|  dS dt          |           d|  dS )Nr   z != 0)r[   r\   r9   )r<   boolr   )r   r   r   use_compute_typess       rE   to_dtypezHalideOverrides.to_dtype   s@     EJ q=== 4+e,,444444rG   c                    |t           j        t           j        fv rdt          |           d|  d} dt          |           d|  d}|t           j        t           j        fv rd| d}|S )Nr[   r\   r9   zhl.reinterpret(rc   )r<   r   r   r   )r   r   r   lines       rE   to_dtype_bitcastz HalideOverrides.to_dtype_bitcast   s    7779;y1199Q999A;U!3!3;;q;;;U]EN3333D333DrG   c                H    |                      t          |          |          S r   )r   rF   )clsvaluer   s      rE   constantzHalideOverrides.constant  s    ||OE22E:::rG   c                    d|  dS )Nr   r9   rd   r   s    rE   abszHalideOverrides.abs      ~~~rG   c                P    t          | d          sd|  dS d|  d| j         d|  dS )Nnamezhl.exp(r9   z"hl.fast_exp(hl.cast(hl.Float(32), z)) if z!.type().bits() <= 32 else hl.exp(hasattrr   r   s    rE   expzHalideOverrides.exp  sF    q&!! 	"!Q>>>!jAjjQVjjfgjjjjrG   c                    d|  dS r}   rd   r   s    rE   sqrtzHalideOverrides.sqrt      !rG   c                    t          | d          s	d|  d| dS d| j         d| d}d|  d| d	|  d
|  d| d| j         d|  d| dS )Nr   r   r\   r9   r[   	.type(), hl.select((<)|hl.is_nan(), ) if z.type().is_float() else hl.min(r   r   r   s     rE   minimumzHalideOverrides.minimum       q&!! 	'&Q&&!&&&&,qv,,,,,sQssssssass1ss16ssjkssopssssrG   c                    t          | d          s	d|  d| dS d| j         d| d}d|  d| d	|  d
|  d| d| j         d|  d| dS )Nr   r   r\   r9   r[   r   r   >r   r   r   z.type().is_float() else hl.max(r   r   s     rE   maximumzHalideOverrides.maximum"  r   rG   c                V    t          |d          rd|j         d| d}d|  d| d| dS )Nr   r[   r   r9   r   r\   r   )r   r   r   s      rE   wherezHalideOverrides.where*  sO    1f 	101600A000A*A****a****rG   c                    d|  dS )Nr   r9   rd   r   s    rE   coszHalideOverrides.cos0  r   rG   c                    d|  dS )Nr   r9   rd   r   s    rE   sinzHalideOverrides.sin4  r   rG   c                     t          d          )NlgammarI   r   s    rE   r	  zHalideOverrides.lgamma8      (###rG   c                    d|  dS )Nzhl.erf(r9   rd   r   s    rE   erfzHalideOverrides.erf<  r   rG   c                    d|  dS )Nr   r9   rd   r   s    rE   coshzHalideOverrides.cosh@  r   rG   c                    d|  dS )Nr   r9   rd   r   s    rE   sinhzHalideOverrides.sinhD  r   rG   c                    d|  dS )Nr   r9   rd   r   s    rE   acoszHalideOverrides.acosH  r   rG   c                    d|  dS )Nz	hl.acosh(r9   rd   r   s    rE   acoshzHalideOverrides.acoshL      1rG   c                    d|  dS )Nr   r9   rd   r   s    rE   asinzHalideOverrides.asinP  r   rG   c                    d|  dS )Nz	hl.asinh(r9   rd   r   s    rE   asinhzHalideOverrides.asinhT  r  rG   c                    d|  d| dS )Nz	hl.atan2(r\   r9   rd   r   ys     rE   atan2zHalideOverrides.atan2X      $1$$$$$$rG   c                    d|  dS )Nr   r9   rd   r   s    rE   atanzHalideOverrides.atan\  r   rG   c                    d|  dS )Nz	hl.atanh(r9   rd   r   s    rE   atanhzHalideOverrides.atanh`  r  rG   c                     t          d          )Ncopysignr
  r  s     rE   r%  zHalideOverrides.copysignd  s    *%%%rG   c                     t          d          )Nerfinvr
  r   s    rE   r'  zHalideOverrides.erfinvh  r  rG   c                    d|  d| dS )Nz	hl.hypot(r\   r9   rd   r  s     rE   hypotzHalideOverrides.hypotl  r  rG   c                     t          d          )N	nextafterr
  r  s     rE   r+  zHalideOverrides.nextafterp  s    +&&&rG   c                    |  d| S Nz & rd   r   s     rE   logical_andzHalideOverrides.logical_andt      ||||rG   c                    |  dS )Nz == 0rd   r   s    rE   logical_notzHalideOverrides.logical_notx  s    {{{rG   c                    |  d| S Nz | rd   r   s     rE   
logical_orzHalideOverrides.logical_or|  r/  rG   c                    d|  d| dS )Nr    ^ r9   rd   r   s     rE   logical_xorzHalideOverrides.logical_xor  s    1~~~~~rG   c                    |  d| S r-  rd   r   s     rE   bitwise_andzHalideOverrides.bitwise_and  r/  rG   c                    d|  S )N~rd   r1  s    rE   bitwise_notzHalideOverrides.bitwise_not  s    1wwrG   c                    |  d| S r4  rd   r   s     rE   
bitwise_orzHalideOverrides.bitwise_or  r/  rG   c                    |  d| S )Nr7  rd   r   s     rE   bitwise_xorzHalideOverrides.bitwise_xor  r/  rG   c                    |  d| S )Nz << rd   r   s     rE   bitwise_left_shiftz"HalideOverrides.bitwise_left_shift      }}}}rG   c                    |  d| S )Nz >> rd   r   s     rE   bitwise_right_shiftz#HalideOverrides.bitwise_right_shift  rD  rG   c                    d|  d| dS )Nzhalide_helpers.rand(r\   r9   rd   seedoffsets     rE   randzHalideOverrides.rand  s    7d77f7777rG   c                    d|  d| dS )Nzhalide_helpers.randn(r\   r9   rd   rH  s     rE   randnzHalideOverrides.randn  s    8t88v8888rG   c           	          d|  d| d| d| d	S )Nzhalide_helpers.randint64(r\   r9   rd   )rI  rJ  lowhighs       rE   	randint64zHalideOverrides.randint64  s+    K4KK6KKSKKDKKKKrG   c                |    t          j        | d           dt          j        j                            d|           S )Nr    + load_seed_offset)opsloadr$   r]   rm   seed_offset)r   rJ  s     rE   	load_seedzHalideOverrides.load_seed  s8    (4##__(A(ABTV\(](]___rG   c                    d|  dS )Nz1./hl.sqrt(r9   rd   r   s    rE   rsqrtzHalideOverrides.rsqrt  s     "Q!!!!rG   c                    d|  dS )Nr   r9   rd   r   s    rE   tanzHalideOverrides.tan  r   rG   c                    d|  dS )Nr   r9   rd   r   s    rE   tanhzHalideOverrides.tanh  r   rG   c                    d|  dS )Nz3(hl.reinterpret(hl.UInt(32), hl.cast(hl.Float(32), z)) >> 31) != 0rd   r   s    rE   signbitzHalideOverrides.signbit  s    VQVVVVrG   c                    |  d|  d| d| S )Nz - hl.trunc(/z)*rd   r   s     rE   fmodzHalideOverrides.fmod  s(     ----Q--!---rG   c                    d|  d| dS )Nzhl.pow(r\   r9   rd   r   s     rE   powzHalideOverrides.pow  s    """a""""rG   c                    d|  dS )Nzhl.log(r9   rd   r   s    rE   logzHalideOverrides.log  r   rG   c                     t          d          r   r   r   s    rE   r   zHalideOverrides.log2  s    !&)))rG   c                    d|  dS )Nz hl.is_inf(hl.cast(hl.Float(32), r   rd   r   s    rE   isinfzHalideOverrides.isinf       8!7777rG   c                    d|  dS )Nz hl.is_nan(hl.cast(hl.Float(32), r   rd   r   s    rE   isnanzHalideOverrides.isnan  rk  rG   c                    d|  dS )Nr   r9   rd   r   s    rE   roundzHalideOverrides.round  r  rG   c                    d|  dS )Nrr   r9   rd   r   s    rE   floorzHalideOverrides.floor  r  rG   c                    d|  d| dS )Nr   r   z + hl.f32(0))rd   r   s     rE   int_truedivzHalideOverrides.int_truediv  s    +1++1++++rG   c                $    d| j          d|  d| dS )Nz"hl.floor(hl.cast(hl.Float(max(32, .type().bits())), ) / r9   r   r   s     rE   floordivzHalideOverrides.floordiv  s+     WVV1VVRSVVV	
rG   c                   t          j        t          j        d|          t          j                  }t          j        t          j        |d          t          j                  }t          j        ||          }d|j         d| dS )N0r[   r   r9   )rU  r   ltr<   int8subr   )r   r   leftrightr}  s        rE   signzHalideOverrides.sign  sh    |CF3NNEJ77SVAs^^UZ88gdE""1!&1131111rG   c                    d|  dS )Nrw   r9   rd   r   s    rE   trunczHalideOverrides.trunc  r  rG   c                $    d| j          d|  d| dS )Nz"hl.trunc(hl.cast(hl.Float(max(32, ru  rv  r9   rw  r   s     rE   truncdivzHalideOverrides.truncdiv  s+    
 WVV1VVRSVVV	
rG   c                    d|  dS )Nrz   r9   rd   r   s    rE   ceilzHalideOverrides.ceil  r   rG   c                    d|  dS )Nr   z, 0)rd   r   s    rE   reluzHalideOverrides.relu  s         rG   c                t   t           j                            |          }t           j                            t           j                            |          t           j                            |          t          |                    }|t          j        t          j	        fvrt          j        ||          S |S )N)bounds)r$   r]   prepare_indexinggenfuncindex_to_strused_dims_from_indexr   r<   r   r>   rU  r   )r   r`   r   indexvars        rE   
index_exprzHalideOverrides.index_expr  s    ))$//hH!!%((H))%00(..  
 

 ek222<U+++
rG   c                    t          j        |t          j                  }t          j        |||          }||_        t          t          |                    S r   )rU  r   r<   r   halide_clampindirect_indexing_sizer!   str)r   	index_varsizecheckwrap_negs        rE   indirect_indexingz!HalideOverrides.indirect_indexing  sG     LEK88	$Ye<<	+/	(!#i..111rG   c                    t           j                            t           j                            |          dz
            }t	          |t
          t          j        f          sd|j         d| d}d| d| dS )Nr%   r[   r   r9   z	hl.clamp(z, 0, )	r$   r]   kexprrename_indexingr:   r;   r   Integerr   )r   r   r  r  ends        rE   r  zHalideOverrides.halide_clamp  sx    hnnQX55d;;a?@@$em 455 	98UZ88#888C .5--s----rG   c                   t           j                            | |          5 } |            }d d d            n# 1 swxY w Y   |j        j        rt          |          }t           j                            d|j         dt          |           dg t          j
        |          |j                  }t          j        |||          S )Nr[   r   r9   r  shape)r$   r]   
mask_loadsr  is_boolr   r  r   rF   r   wrapr  rU  r  )maskbodyothernew_maskresults        rE   maskedzHalideOverrides.masked#  s    X  u-- 	TVVF	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 =  	 KKE   Fv{FF_U-C-CFFF#E**,	 ! 
 
 y65111s   8<<c                     t          d          )Nfrexpr   r   s    rE   r  zHalideOverrides.frexp5  s    !'***rG   c                     t          d          )Ndevice_assert_asyncr   )condmsgs     rE   r  z#HalideOverrides.device_assert_async9  s    !"7888rG   )NT)r   r   r   r   )r   r   r   r   )TT)FrS   rT   rU   r   r   r   classmethodr   r   r   r   r   r  r  r  r  r	  r  r  r  r  r  r  r  r  r!  r#  r%  r'  r)  r+  r.  r2  r5  r8  r:  r=  r?  rA  rC  rF  rK  rM  rQ  rX  rZ  r\  r^  r`  rc  re  rg  r   rj  rm  ro  rq  rs  rx  r  r  r  r  r  r  r  r  r  r  r  rd   rG   rE   r   r      s        ,0	5 5 5 5 \5    \ ; ; [;   \ k k \k
   \ t t \t t t \t + + \+
   \   \ $ $ \$   \   \   \   \     \    \     \  % % \%   \     \  & & \& $ $ \$ % % \% ' ' \'   \   \   \   \   \   \   \   \   \   \ 8 8 \8 9 9 \9 L L \L ` ` \` " " \"   \   \ W W \W . . \. # # \#   \ * * \* 8 8 \8 8 8 \8     \      \  , , \, 
 
 \
 2 2 [2     \  
 
 \
   \ ! ! \! 	 	 [	 2 2 2 [2 . . [. 2 2 \2" + + \+ 9 9 \9 9 9rG   r   halidec                  `     e Zd Z ej        d          Z	 	 dd fdZd Zd ZddZ	d Z
 xZS )HalideCSEVariablez\b(tmp\d+)\[\?\]Nr  ValueRanges[Any]r   r   r  r7   rJ   rK   c                `    t                                          ||||           d | _        d S Nr  )rM   rN   	used_dims)rO   r   r  r   r  rQ   s        rE   rN   zHalideCSEVariable.__init__D  s1     	vuE:::7;rG   c                T   t          | j        pd          }t          j        ||                                          D ]E}t          |t                    r.|j        J |||f            |                    |j                   Ft          j	        
                    |          | _        d S )Nrd   )r   r  	itertoolschainvaluesr:   r  updater$   r]   sort_used_dims)rO   r   rm   kwargsusedargs         rE   update_on_argsz HalideCSEVariable.update_on_argsN  s    $..B//?499 	+ 	+C#011 +}004d2C000CM***0066rG   c                    t          |          dk    r
| j         dS | j         dd                    t          t          |                     dS )Nr   z[()][r\   ])rl   r   joinmapr  )rO   dimss     rE   	index_strzHalideCSEVariable.index_strV  sO    t99>>i%%%%)::diiC77::::rG   r  c                X    | j         
| j         dS |                     | j                   S )Nz[?])r  r   r  )rO   s    rE   __str__zHalideCSEVariable.__str__\  s/    >!i$$$$~~dn---rG   c                    | j         t          d | j         D                       sJ |                     fd| j         D                       S )Nc              3  J   K   | ]}t          |t          j                  V  d S r   r:   r   Expr.0r   s     rE   	<genexpr>z-HalideCSEVariable.subs_str.<locals>.<genexpr>c  s?       2
 2
*+Jq%*%%2
 2
 2
 2
 2
 2
rG   c                <    g | ]}                     ||          S rd   )get)r  r   replacementss     rE   
<listcomp>z.HalideCSEVariable.subs_str.<locals>.<listcomp>f  s)    NNN!|//155NNNrG   )r  allr  )rO   r  s    `rE   subs_strzHalideCSEVariable.subs_strb  sk    ~)c 2
 2
/3~2
 2
 2
 /
 /
)) 
 ~~NNNNt~NNNOOOrG   )NN)r  r  r   r   r  r7   rJ   rK   )rJ   r  )rS   rT   rU   recompileundefined_rerN   r  r  r  r  rV   rW   s   @rE   r  r  A  s        2:122L (, $< < < < < < <7 7 7; ; ;. . . .P P P P P P PrG   r  c                  H     e Zd ZU ded<   ded<   ded<   d fdZddZ xZS )DimensionInfozOptional[sympy.Expr]r`   
sympy.Exprr  striderJ   rK   c                    t                                                       t          j        j                            |d          r| }| }|| _        || _        || _        d S Nr   )	rM   rN   r$   graphsizevarsstatically_known_ltr`   r  r  )rO   r`   r  r  rQ   s       rE   rN   zDimensionInfo.__init__o  s]    7//:: 	WF5D		rG   NFc                   | j         J | j         }|r|dk    rdS |ri |}|j        D ]}t          |t          j                  r|t          |t          j                  sJ t          j	        
                    |j                  }t          |t                    sJ t          |                    |                    ||<   t          ||          }t          j	                            |          S )Nr   hl.Var())r`   free_symbolsr   r   TMPr:   r   Symbolr$   r]   lookup_cse_varr   r  r!   r  r"   r  )rO   r  	zero_varsr`   symr  s         rE   r  zDimensionInfo.index_strx  s    y$$$y 	: 	2+l+L( W W!#tx00 W%c5<88888(11#(;;C%c+<=====(:3<<;U;U(V(VL%dL11Dx$$T***rG   rR   NF)rS   rT   rU   __annotations__rN   r  rV   rW   s   @rE   r  r  i  sw              + + + + + + + +rG   r  c                h   t           j        j                            | |          rdS 	 t           j        j                            |           }t           j        j                            |          }n# t
          $ r Y dS w xY w||k    r%t           j        j                            | |           ||k    S NTF)r$   r  r  statically_known_equalssize_hint_or_throw	TypeErrorcheck_equals)r~  r  r   r   s       rE   eqr    s    w//e<< tG//55G//66   uuAvv	%%dE2226Ms   AA2 2
B ?B c                   t           j        j                            | |          rdS 	 t           j        j                            |           }t           j        j                            |          }n4# t
          $ r' t          j        | |          }|| k    r| |k    cY S Y dS w xY w||k     r%t           j        j                            | |           ||k     S r  )	r$   r  r  r  r  r  r   gcdcheck_lt)r~  r  r   r   r  s        rE   r{  r{    s    w++D%88 tG//55G//66   ie$$$;;5=   uu	
 	1uu	!!$...q5Ls   AA2 2+B#"B#c                  N    e Zd ZU eZeZded<   dJ fdZdKdZ	dLdZ
dM fdZd Zd ZdN fdZd ZdOdZd Zd ZdNdZd ZdPd ZdQd"ZdRd#Z	 dSdTd(ZdUd-Zd. ZdVd5Z ej                    dd6dWd:Zdd;dWd<ZdRd=Z d> Z!dXd@Z"dSdAZ#e$dB             Z%dSdRdCZ&dD Z'dYdIZ( xZ)S )ZHalideKernelzCallable[[sympy.Expr], str]r  tilingdict[str, sympy.Expr]rJ   rK   c                n    t                      j        |fi | | j        | _        | j        | _        | j        | _        t                      | _        | j        | _	        | j        | _
        i | _        i | _        i | _        i | _        i | _        i | _        t#          t$                    | _        d| _        d S r  )rM   rN   r  computeloadsstoresr)   indexing_code_dominside_reductionneeds_dom_indexinghas_reductionbuffer_dimensionsbuffer_offsetshalide_varsindex_replacementsreduction_renamesdom_renamesr   listbuffer_aliaseshas_indirect_indexing)rO   r  r  rQ   s      rE   rN   zHalideKernel.__init__  s    
 	**6***yY
i!/!1!1"&"7!2AC57;=@BCEHJ4?4E4E%*"""rG   r   r   r  c                     t          |          S r   )r   )rO   r   s     rE   dtype_to_strzHalideKernel.dtype_to_str  s    5!!!rG   Nc                f    | j                             | d|d           t          ||||          S )Nz = hl.Func(r9   )r  	writeliner  )rO   r   r  r   r  s        rE   create_cse_varzHalideKernel.create_cse_var  s=    	t99999::: vue<<<rG   indicesSequence[sympy.Expr]c           
     |     j         s j        s j        rJ t          j        t
          j        j        j        t                    t                              t          t                      j        |                    }t          t                                d t"          j                            d  j        D                       D             d fd}fd}|D ]}|                    t,                    r[|                    t-          t1          j        d          t1          j        d          t1          j        d	                    |           |                    t4                    rH|                    t5          t1          j        d          t1          j        d                    |                               t                                          |          j                   t;          d
 D                        _        d}t?           j                  D ]}fd|j         !                                D             }|"                    fd           |s.|#                    |$                    d|j%                             d}t0          j&        j'        g }	|tQ          |          k     rtS          |j%                  sfd|D             }
|tQ          |
          z  }|
s
J |            t          j*        t
          j        j        j+        |
          z  |
,                    fd|D                        |
rTt          j*        t0          j-        |
          tS          d          r8 |j%        z            tS          d          rJ g }
tQ          |          }d}t]          dtQ           j                             }|j/        r,t]          dtQ           j                              j        |<    j        |<   |	#                    |f           z  fd|D             }|tQ          |          z  }tQ          |
          }fd|
D             }
tQ          |
          |k     s|dk    sJ |
,                    |           |
T|tQ          |          k     rtS          |j%                  |D ]e}	 d}dtS          |j0                  s*|	|         \  }}|dz  }|z  tS          |j0                  *d}t0          j&        j1        }tS          |j2        |          s2|	|         \  }}|dz  }|||z  z  }||z  }tS          |j2        |          2| j         |3                                <   # th          $ r |sJ t0          j&        j1        }t0          j&        j'        }|	D ]\  }}|||z  z  }||z  }t
          j        j        5                    t-          ||j0        |j2                   j                   j         |3                                <   Y cw xY w j        D ]'} j6        7                    | d|j8        d           ( j        r; 9                    d fd j        :                                D                        dS dS )a  
        Hook called right before codegen with every index that will be
        used in the fused kernel.

        This populates self.halide_vars/index_replacements/reduction_renames which is an alternate indexing
        scheme that avoids using divide and modulus.  Instead of xindex/yindex/rindex
        we base indexing on a larger number of vars whose product combines to those.

        This function populates self.halide_vars, self.index_replacements, and self.reduction_renames
        fallbackc                8    i | ]}|                                 |S rd   symbol)r  r   s     rE   
<dictcomp>z2HalideKernel.finalize_indexing.<locals>.<dictcomp>  s2     
 
 
 HHJJ
 
 
rG   c                @    g | ]}|j                                         S rd   )nodesr  )r  trees     rE   r  z2HalideKernel.finalize_indexing.<locals>.<listcomp>  s&    BBB""$$BBBrG   c                n    t          j        t          j        j                            |                     S r   )r   simplifyr$   r  r  remove_precomputed_replacementsr_   s    rE   r"  z0HalideKernel.finalize_indexing.<locals>.simplify  s,    > @@FF  rG   c                    | v r|          }                     |j                            |j        |z  t          j        j                            |t          |j	        |                              
                                           d S d S r   )addrootlookupdivisorr$   r  r  evaluate_minr   lengthr  )baser(  modulusnodeall_used_symbolssym_to_nodes       rE   visit_modular_indexingz>HalideKernel.finalize_indexing.<locals>.visit_modular_indexing  s    {"""4( $$I$$w.(55#Xdk7%C%C  
 fhh     #"rG   c           	         | v re|          }                     |j                            |j        |z  t	          |j        |                                                               d S d S r   )r%  r&  r'  r(  r   r*  r  )r+  r(  r-  r.  r/  s      rE   visit_floor_divz7HalideKernel.finalize_indexing.<locals>.visit_floor_div  s{    {"""4( $$I$$w. g66  fhh	     #"rG   r+  r(  r,  c              3  J   K   | ]}t          |t          j                  V  d S r   )r   r   INDIRECT)r  r  s     rE   r  z1HalideKernel.finalize_indexing.<locals>.<genexpr>  s?       )
 )
36N3..)
 )
 )
 )
 )
 )
rG   Fc                @    g | ]}|                                 v |S rd   r  )r  r   r.  s     rE   r  z2HalideKernel.finalize_indexing.<locals>.<listcomp>  s,    VVV1qxxzzEU7U7UQ7U7U7UrG   c                $     | j                   S r   )r(  )r   	size_hints    rE   <lambda>z0HalideKernel.finalize_indexing.<locals>.<lambda>  s    YYqy%9%9 rG   keyr%   r   c                Z    g | ]'}t          |j                   |j                  (S rd   r  r(  r*  )r  r   r(  r"  s     rE   r  z2HalideKernel.finalize_indexing.<locals>.<listcomp>#  sF          +,AIw9O9O HQX&&     rG   c                    g | ]?}t          |j                  t          |j                  , |j        z            @S rd   )r{  r(  )r  r   r(  r  r"  s     rE   r  z2HalideKernel.finalize_indexing.<locals>.<listcomp>,  sb       gqy11 79C6H6H W!455  rG   Thhrc                H    g | ]}t          |j                  |j        S rd   r<  )r  r   r(  s     rE   r  z2HalideKernel.finalize_indexing.<locals>.<listcomp>D  s,     S S SaBqy'<R<R S S S SrG   c                ^    g | ])}t          |          t          j        |z            *S rd   )r  r   r"  )r  s	next_sizes     rE   r  z2HalideKernel.finalize_indexing.<locals>.<listcomp>G  sG     $ $ $!!Y//$q9}55$ $ $rG   z
 = hl.Var(r9   rdomc                2    i | ]\  }}|j         |         S rd   r	  r  vrvrO   s      rE   r  z2HalideKernel.finalize_indexing.<locals>.<dictcomp>t  s'    UUUUQT%a(UUUrG   N);r
  r	  r  	functoolspartialr$   r  r  r7  r   dictfromkeysr  rM   r  r   r   r  r  from_iterablerange_treeshasr   replacer   Wildr   r  r  anyr  reversedr  r  sortappendr'  numelSOnerl   r  reduceevaluate_maxextendr  r!   is_reductionr(  Zeror*  r  
IndexErrorsimplify_with_rangesindexing_coder  r   codegen_rdomitems)rO   r  r0  r2  r  had_fallbackr   r  handled_countadded_sym_sizesizes_to_addr  	new_sizes	prior_lenr-  idxr  r*  r`   
full_indexr  r.  r(  r  rC  r"  r7  r/  rQ   s   `                    @@@@@@@rE   finalize_indexingzHalideKernel.finalize_indexing  s    #	
'+'7	
;?;Q	
 	
 
 %ag&6&@3OOO	--EGG$<g F FGG%c?,,
 
_22BB1ABBB 
 
 
	 	 	

	 
	 
	 
	 
	 
		 	 	 	 	 	  	R 	REyy)) #
6**
9--
9-- 
 +   yy"" 
6**
9--  $   ##EGG$<$<U$C$C$PQQQQ%( )
 )
:J)
 )
 )
 &
 &
" T-.. S	 S	DVVVV
 1 1 3 3VVVEJJ9999J::: 9T[[DJ77888MgkGN#e**,,R
G5L5L,         05      \!2!22#**U**|	 0G$1<! !  ##     !&     # 3 ) 0L I II)Q'' , %-HTZ'-A$B$B	#%i#3#3333')(+E

'+,-HT5E1F1F-H-HIIC( 6H8T%5!6!6887 7.s3 -6D$S)"))3	*:;;;y(G S S S S5 S S SI!S^^3M #L 1 1I$ $ $ $!-$ $ $L
 |,,y88INNNJ ''	2227 # 3!  #e**,,R
G5L5L,\   CG w77 ($23$7	Tq4 !w77 ( F 7<D f55 '$23$7	Tq,$	 !f55 '
 >BD+DKKMM::!   '''<!&J"W[F%3 ' '	T"fsl2
$(==+JdkRR ,  +DKKMM:::!> # 	J 	JC((C)H)H38)H)H)HIIII! 	UUUUd6L6R6R6T6TUUU    	 	s   &B9V  B"YYc                     j         rdnd}| j        v r j        |         S i } j                                        D ][} j         s
| j        v rt          j        d|j                  }|sJ t          d| |	                    d                     ||<   \ 
                    | d fd|                                D                        | j        |<   |S )zCRDom based indexing uses explicit iteration ranges for Func updatesioz^h(\d+)$r>  r%   domc                2    i | ]\  }}|j         |         S rd   rF  rG  s      rE   r  z3HalideKernel.setup_dom_indexing.<locals>.<dictcomp>  s'    RRRBR!1!!4RRRrG   )r  r  r	  keysr  r  matchr   r!   grouprb  rc  )rO   prefixrenamesr  ms   `    rE   setup_dom_indexingzHalideKernel.setup_dom_indexingw  s   -63T%%%#F++#((** 	H 	HC( SD4J-J-Jch//AHH1-.F&.F!''!**.F.FGGGCLLNNNRRRR'--//RRR	
 	
 	
 $+ rG   c           	     B     fd|                                 D             } j                            | dd                    |           d           t	          |                                          D ](\  }} j                            | d| d| d           )d S )Nc                h    g | ].}d                                          |                     d/S )hl.Range(0, r9   )r  r  )r  r  rO   s     rE   r  z-HalideKernel.codegen_rdom.<locals>.<listcomp>  sO     
 
 
 E4::d&:&:4&@&@AADDD
 
 
rG   z = hl.RDom([r\   ]) = r  r  )r  ra  r  r  	enumeraterr  )rO   r   varsrsizesrn  rsyms   `     rE   rb  zHalideKernel.codegen_rdom  s    
 
 
 

 
 
 	$$%O%O$))F:K:K%O%O%OPPP -- 	B 	BGAt((D)@)@T)@)@A)@)@)@AAAA	B 	BrG   r  r  c                    t                                          |          }t          || j                  }t          j        j                            || j                  S r   )	rM   r  r"   r
  r$   r  r  r`  r	  )rO   r  rQ   s     rE   r  zHalideKernel.prepare_indexing  sL     ((//5$"9::w44UD<LMMMrG   c                    t          |t          j                  r|                     |j                  j        S | j        |         S )zThe size of an index symbol)r   r   r  r  r   r  r	  )rO   r  s     rE   sym_sizezHalideKernel.sym_size  s>    #tx(( 	H&&sx00GG$$rG   r  is_storer   c                	    g t          |j        d           D ]y}t          |t          j        t          j        f          r                    |           >t          |t          j        t          j        t          j	        f          s
J |            zt          j        j        }t                              t          j        j                  g }t          j                             |                    }t#          |t          j                  r|j        n|gD ]}fd|j        D             t)                    dk    r||z  }.t)                    dk    rd         xx         |z  cc<   Xg }t+          t)          |                    D ]t}	||	         J ||	         \  }
}t-          |
          t-                    z  r'                    fd|
D                        ||z  }]|                    |
|f           ug ||f} fd}g }|D ]A\  }}|D ]}|                    |          z  }|                     |||                     B                                D ]%\  }}|                     |||g                     &|                    d	            |s; j        r3|                    t9          t          j        j        dd                     nst:          j        j                             |d         j!        d          sC|"                    dt9          t          j        j        rdn|d         j!        d                     |rs| j#        v rbt:          j        j        $                    | j#        |                   r2 %                    || j#        |         z
              j#        |         }n=t:          j        j        &                    |d          r %                    ||           d}|}tO          j(                    D ]Z}	 )                    |||          r||fc S rJ | d
|	 }| j*        |         vr  j*        |                             |           [dS )zEConvert address-based indexing into dimensions using self.halide_varsc                    | j         S r   rw  r   s    rE   r8  z5HalideKernel.indexing_to_dimensions.<locals>.<lambda>  s    AF rG   r9  c                    g | ]}|v |	S rd   rd   )r  rH  
split_exprs     rE   r  z7HalideKernel.indexing_to_dimensions.<locals>.<listcomp>  s    IIIqjrG   r   r%   Nc                    g | ]}|v|	S rd   rd   )r  rH  	part_varss     rE   r  z7HalideKernel.indexing_to_dimensions.<locals>.<listcomp>  s#    )V)V)V1ICUCU!CUCUCUrG   c                   t          j        |           } t          |          dk    rlt          j        d	          }|                     ||d         z            }|r6t          |d                             |d                   ||                   S r
J |             t          j        t          | fd|D                       dz             }t           j	        j
        }t          | t           j                  ri| j        D ]a}t          |t           j                  rE||z  }t          j        | |z            } t          j        t          j        ||z                      }bt          | ||          S )Nr%   wild)excluder   c                B    i | ]}|                     |          d z
  S )r%   )r  )r  r  rO   s     rE   r  zRHalideKernel.indexing_to_dimensions.<locals>.expr_to_dimension.<locals>.<dictcomp>  s,    !N!N!N##t}}S'9'9A'=!N!N!NrG   )r   factorrl   rR  rs  r  r  r"  r"   rX  rY  r:   Mulrm   r  ceiling)
r`   symsstride_wildrw  r*  r  termr  rO   symbolss
          rE   expr_to_dimensionz>HalideKernel.indexing_to_dimensions.<locals>.expr_to_dimension  sj   <%%D4yyA~~#jAAAJJ{T!W455 (QtAw!7!7;    %%%%<^4!N!N!N!N!N!N!NOORSS F W[F$	** N I N ND!$66 N$$~dTk::!&ftm0L0L!M!M vv666rG   c                b    t           j        j                            | j        t
                    S )Nr  )r$   r  r  r7  r  r   )ds    rE   r8  z5HalideKernel.indexing_to_dimensions.<locals>.<lambda>  s      0 : :18c : R R rG   _view)+sortedr  r   r   HALIDEr  rV  UNBACKED_INTSIZEPRECOMPUTED_SIZEr   rX  r^  rL  rM  expandr  r:   Addrm   rl   ranger   r\  poprc  rU  r  r  r$   r  r  r  r  insertr  statically_known_geqapply_offset_to_dimensionstatically_known_gtr  countinstall_dimsr  )rO   r  r  r  r  rJ  split_failedpartnew_split_failedrn  
other_vars
other_partr  r  r  r`   rH  orig_varr  r  r  s   `  `              @@@rE   indexing_to_dimensionsz#HalideKernel.indexing_to_dimensions  s   %,2B2BCCC 	 	CcDK#:;; 
s####%)	-        ]]7EGL99
DFT11%8899",UEI">">KEJJUG 	F 	FDIIIID$5IIII9~~""$Y1$$9Q<(((D0((((#% s<0011 J JA'?666-9!_*J
!*--
90E0EE J!(()V)V)V)VZ)V)V)VWWW
*(//Z0HIIIIE!1EIt3DE	7 	7 	7 	7 	7 	7 	7. & 	7 	7JD$ * *
q)))KK))$556666#))++ 	8 	8ICKK))$667777		RR	SSS 	) ?M%',1==>>>!99$q'.!LL 	KK=H/Pqq$q'.RSTT    	 	d)))ag.>.S.S+C0/ /) ..tVd>QRU>V5VWWW,S1!55   ..tV<<<"" 	: 	:A  dFH== !Dy   <''A''C$-h777#H-44S999	: 	:rG   c                   || j         vr|| j         |<   || j        |<   dS | j        |         |k    s+t          | j         |                   t          |          k    rdS |r| j         |         |k    S t          | j         |         |          D ]s\  }}|j        |j        k    r dS |j        |j        k    s|j        |j        k    r;t          j        j	        
                    |j        |j                  |_        d|_        tdS )z>Try to set self.buffer_dimensions[var], return True on successTFN)r  r  rl   zipr  r  r`   r$   r  r  r[  )rO   r  r  rJ  r  oldnews          rE   r  zHalideKernel.install_dims  s   d,,,*.D"3''-D$4s#v--"3'2
 2
YY2 2 5 	7)#.$66D237>> 	  	 HCzSZ''uux38##sx38';';7+8838LLtrG   c                   |dk    rd S t          t          t          |                              D ]}||         j        dk    s0t          j        j                            |||         j                  rDt          |||         j                  }||||         j        z  z  }||         xj	        |z  c_	        |dk    sJ d S )Nr   r%   )
rT  r  rl   r  r$   r  r  r  r   r`   )rO   r  rJ  rn  r  s        rE   r  z&HalideKernel.apply_offset_to_dimension  s    Q;;F%D		**++ 	% 	%AAw~""ag&6&K&KQ' '"  Q77$a//Q${{{{{{rG   c                   t          t          j                             }|j        D ]	}t	          |t          j                  sJ t          |t          j                  rS|                     |j	                  }t	          |t                    r|j        J |                    |j                   t          |t          j                  r|                    |           t          |t          j        t          j        t          j        t          j        f          rt'          d|           |                     |          S )zIDetect which range trees are used to populate HalideCSEVariable.used_dimsNzunhandled symbol )r   r   r  r  r:   r   r   r  r  r   r  r  r  r  r%  r  r  r  INDEXr   r  )rO   r  r  r  cse_vars        rE   r  z!HalideKernel.used_dims_from_index+  s/   u|,..	% 	E 	ECc5<00000c48,, E--ch77w(9::)556   !23333T[11 Ec""""d'D4I4:V  E )*Cc*C*CDDD""9---rG   c                    t          d D                       sJ fdt          j        | j        | j                                                  D             }t          |          t                    k    sJ |S )Nc              3  J   K   | ]}t          |t          j                  V  d S r   r  r  s     rE   r  z.HalideKernel.sort_used_dims.<locals>.<genexpr>C  s.      @@:a,,@@@@@@rG   c                    g | ]}|v |	S rd   rd   )r  r  r  s     rE   r  z/HalideKernel.sort_used_dims.<locals>.<listcomp>D  s2     
 
 
 i	   rG   )r  r  r  r	  r  r  rl   )rO   r  ordereds    ` rE   r  zHalideKernel.sort_used_dimsB  s    @@i@@@@@@@@
 
 
 
  $"8"?"?"A"A 
 
 
 7||s9~~----rG   Fc                    d                     fd|D                       }t          |          dk    rd}nt          |          dk    r| d}|S )Nr\   c              3  D   K   | ]}|                               V  d S r   )r  )r  r  r  r  s     rE   r  z.HalideKernel.make_index_str.<locals>.<genexpr>O  s1      QQqakk,	BBQQQQQQrG   r   ()r%   ,)r  rl   )rO   r  r  r  r  s     `` rE   make_index_strzHalideKernel.make_index_strN  sc    IIQQQQQDQQQQQ	t99>>IIYY!^^$IrG   r   c                   | j                             |          }|                     |          }|                     ||d          \  }}| d|                     |           d}t
          j                            |          }|t          j	        t          j
        fv rt          j        }d| d}| j        rt          | j        t                    r| j        j        J t!          g |                     |          | j        j        R           }|                     |                     |                    }|j        r| j                            |j         d           | j                            |j         d| j         d           |                     | j        pd	          }	| j                            | d
t3          |           d|	 d           | j                            | d| dt3          |           d|j         d           n8| j                            | d| j         d| dt3          |           d           |S |                     ||                     |                    S )z"Codegen a load from an InputBufferFr  r  rc   r9   Nz!_mask = hl.RDom([hl.Range(0, 1)])z_mask.where(r   z = hl.cast(r\   r}  z + hl.cast(z_mask)z = hl.select(z
, hl.cast(z, 0)))rm   inputr  r  r  r$   r  	get_dtyper<   r   r   r   
_load_maskr:   r  r  r   r  newfuncr  r  r  r   r  _load_otherr   r  )
rO   r   r  r  r  r   r   r  r  r  s
             rE   rV  zHalideKernel.loadW  s   iood##%%e,,//UEBB	T44++D11444!!$''U]EN333ME3D333D? 	H4?,=>>O-99: #O$++E22OT_5NOO I \\$"5"5i"@"@AAF 	##v{$U$U$UVVV	##v{$R$R$R$R$RSSS

4#3#8q99	##HH+e*<*<HHHHH   	##\\$\\;u3E3E\\\\\   
 	##hhDOhhthh{[`OaOahhh   M<<d&?&?&F&FGGGrG   c                N    | j         j        t          j        dd|                   S )Nz\[.* )csevarname_mapr  r}  rO   r   s     rE   r  zHalideKernel.lookup_cse_var~  s     x#BF7B$=$=>>rG   r   r'   moder6   c                   t          |t                    sJ | j                            |          }|                     |          }|                     ||d          \  }}|                     |          s||                                 }|                     ||          }|	                    |          }	d
                    dgt          |          z            pd}
| j                            t          || d|
 d| d                     n&|                     |d	          }t          |          }	t           j                            |          }|| d| d
t'          |           d|	 d}n4|dk    r| d| dt'          |           d|	 d}nt)          d|           | j                            t          ||                     dS )z"Codegen a store to an OutputBufferTNr\   r  r  r  z] = hl.undef(z.type()))r  z] = hl.cast(r9   
atomic_addz] += hl.cast(zstore mode=)r:   r  rm   outputr  r  is_indirect_indexingrx  r  r  r  rl   r  r  r(   r  r$   r  r  r   r   )rO   r   r  r   r  r  r  r  r  	value_str
undef_dimsr   r   s                rE   storezHalideKernel.store  s    %!233333it$$%%e,,//UDAA	T$$U++ 
	#t/?2244L++D,??I|44I))ZL3t99$<==F$JITc#R#RJ#R#RS#R#R#RSS    ++DD+AAIE

I!!$''<UUIUU;u3E3EUUUUUDD\!!VVIVVK4F4FVV)VVVDD%&:D&:&:;;;	Lt4455555rG   r   reduction_typer5   +Union[CSEVariable, tuple[CSEVariable, ...]]c           	     8   | j         sJ | j        rJ |||f}|| j        j        v r| j        j        |         S t	          |t
                    r#|dk    sJ  | j        | x| j        j        |<   }|S t	          |t                    r|j        J t          | j
                  |                     fd|j        D                       }t          |j                  z
  rH|                     | |                     t          g |j        R                     |j                  }|                    | j
                  }t           j                            ||          }	t'          |          }
|dv r|j         d| }| j                            | d| d| d	           g }d
}t/          | j
                  D ]I\  }}|                    | d| d           |d
k    r|dxx         d| z  cc<   || j        |         z  }J| j                            | dd                    |                      n|dk    r|                     ||          }nt9          ||
          }t;          j        t?          tA                                          5   |||          }ddd           n# 1 swxY w Y   d|
 dtC          |	           d	}| j                            | d|            | j                            | d|            || j        j        |<   |S )zCodegen a reduction operationwelford_combineNc                    g | ]}|v|	S rd   rd   )r  rH  reduction_varss     rE   r  z*HalideKernel.reduction.<locals>.<listcomp>  s#    CCC11N+B+BQ+B+B+BrG   r  )argmaxargmin_z = hl.z(rdom, r9   r%   r  r  *r}  rS  welford_reducer[   r\   )"r  r  r  reduction_cacher:   tuplewelford_combine_implr  r  r   r  r  r  r  r  r  r   	Reductiondefault_accumulatorr   r   r  r  r~  rV  r	  r  welford_reduce_fallbackr   r$   set_ops_handlerr   r   rF   )rO   r   r   r  r   	cache_keyresult_tuple
result_varr  defaultacc_typer  partsr  rn  r  
combine_fncombine_strdefault_strr  s                      @rE   	reductionzHalideKernel.reduction  s    $$$$?"""6	0008+I66eU## 	 !%66666))51DH$Y/,  %!233S8S8SS#D$:;;\\CCCCCCC
 

 Ju777 	LL
##J/R/R>/R/R$S$STTk !  E
 NN4#9::	,22>9MM"5))111!9999EI5 S S S Sy S S STTTEF#D$:;; 0 03_____---Q;;"IIIV-III$*3//I: E E%**U2C2C E EFFFF///55eUCCJJ1.(KKJ"??3D3D#E#EFF @ @(jY??@ @ @ @ @ @ @ @ @ @ @ @ @ @ @LXLL1I1ILLLKI: ? ?+ ? ?@@@I: ? ?+ ? ?@@@.8 +s   J,,J03J0c                   t          |t                    r|j        J t          |t                    r|j        J t          |t                    r|j        J t          g |j        |j        |j        R p| j                  }|t          | j                  z  }|                     |                     |                    }d |||fD             }|j        }| j	        
                    | dd                    |           d           | j	        
                    | d| d           | j	        
                    | d| d           | j	        
                    | d	| d
           | j	        
                    | d|                    | j                              | j	        
                    | d|                    | j                              | j	        
                    | d|                    | j                              | j	        
                    | d| d| d           | j	        
                    | d| d| d           | j	        
                    | d| d| d| d           | d| d| d| d| d| d| d| d| d| dg}| j	        
                    | dd                    |           d           g }	t          d          D ]X}
|	                    |                     |j                             | j	        
                    |	d           d!| d"|
 d#           Yt          |	          S )$Nc                $    g | ]}d |j          dS )r[   z.type(), 0)rw  r  s     rE   r  z5HalideKernel.welford_combine_impl.<locals>.<listcomp>  s&    NNNa1af111NNNrG   z = hl.Tuple([r\   r|  z
_mean_1 = z[0]z_m2_1 = z[1]z_weight_1 = z[2]z
_mean_2 = z_m2_2 = z_weight_2 = z	_delta = z
_mean_2 - _mean_1z_new_weight = z_weight_1 + 	_weight_2z_w2_over_w = hl.select(z_new_weight == 0.0, 0.0, z_weight_2 / z_new_weight)z
_mean_1 + z	_delta * 
_w2_over_wz_m2_1 + z_m2_2 + z_weight_1 * _new_weightr   r  r}  r  r  )r:   r  r  r   r	  r  r  r  r   r  r  r  r  r  rV  r  )rO   meanm2weightr  r  r  pfxr  unpackedrn  s              rE   r  z!HalideKernel.welford_combine_impl  s   $ 122Qt~7Q7QQ"/00MR\5M5MM&"344U9I9U9UU?dn?r|?f.>??S4CS
 
	 	Z 6777	\\$"5"5i"@"@AA
NND"f;MNNNo	zNN		'8J8JNNNOOO	s==j===>>>	s;;J;;;<<<	s??
???@@@	sUUdmmD<R.S.SUUVVV	sQQBKK8N,O,OQQRRR	II0F G GII	
 	
 	
 		sHHSHHCHHHIII	sQQ#QQ3QQQRRR	kk3kkkkZ]kkk	
 	
 	
 ;;c;;C;;;eeCeeeeseeSeeVYeee

 		zMM		&8I8IMMMNNNq 	G 	GAOODLL)=>>???I8B< E EJ E E E E EFFFFXrG   dtypestuple[torch.dtype, ...]r  UCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]values_origtuple[CSEVariable, ...]c           
          j         sJ t          |          t          |          k    sJ g }t          t          j                             |D ]}t          |t                    r|j        J t          |j                  t           j                  z  r|	                    |           nI|	                     
                    | g |j        g  j        d d         |j                                                 |j                   É                                                              j        r)t          j                  t           j                  z  sJ d t          ||          D             }                                           j        d         j                            }j         d}| d}	 j                            | d| d           t           j                  dk    s
J d	            g  j        \  }
|
t/          |	          i|
t/          |	          dz
  it          |          dk    r0d
 }                              g}                              g}nUd }fdt3          t          |                    D             }fdt3          t          |                    D             } j                             d ||                      t5          j        t9          t;                                          5   |||          }d d d            n# 1 swxY w Y    j                                                           d ||                      t          |          dk    rfS  fd|D             }t=          |          D ](\  }} j                            | d d| d           )t?          |          S )Nr%   r  c                @    g | ]\  }}d t          |           d| dS )r[   r\   r9   )r   )r  r   r   s      rE   r  z%HalideKernel.scan.<locals>.<listcomp>   sG     
 
 
u :u--99999
 
 
rG   r  _rdomz.xz = hl.RDom([hl.Range(1, z)])z&multi-dimensional scan not implementedc                    | d         S r  rd   r   s    rE   maybe_tuplez&HalideKernel.scan.<locals>.maybe_tuple3  s    trG   c                4    dd                     |            dS )Nz
hl.Tuple([r\   r|  )r  r   s    rE   r  z&HalideKernel.scan.<locals>.maybe_tuple:  s    4DIIaLL4444rG   c                H    g | ]}                               d | dz   S r  r  r  )r  rn  r  scan_renames_pris     rE   r  z%HalideKernel.scan.<locals>.<listcomp>=  sE        ##$455A@  rG   c                H    g | ]}                               d | dz   S r	  r
  )r  rn  r  scan_renames_curs     rE   r  z%HalideKernel.scan.<locals>.<listcomp>A  sE        ##$455A@  rG   r}  c                `    g | ]*}                                                             +S rd   )r  r  )r  r  all_used_dimsrO   s     rE   r  z%HalideKernel.scan.<locals>.<listcomp>R  s3    XXXAt||D$7$7$F$FGGXXXrG   r  r  ) r  rl   r   r   r  r:   r  r  r  rV  r  r  r  r  r  r  r  r  rO  rW  r   r  r  r!   r  r  r$   r  r   r   r~  r  )rO   r  r  r   r  r   initialr*  scan_domscanscan_varr  	read_left
read_rightr  unpack_varsrn  rH  r  r  r  r  s   `                 @@@@rE   r  zHalideKernel.scan  s    $$$$6{{c+......*,"5<022  	2 	2Ee%677WEO<W<WW%/**Z8N-O-OO 	e$$$$LL 
I%/I+DT-C+DRaR+HI#k !       1111\\$"5"5m"D"DEE
# 	

:3G(H(H:"L
 L
 )
 	
 	
 

 
 #FF 3 3
 
 

 D001A"1E1KLLMM o,,,	xLLLLLMMM4)**a///4 0// 0./$&8&>&>?$&8&>&>&BCv;;!   $,,-=>>?I$--.>??@JJ5 5 5    s6{{++  I    s6{{++  J
 		zDDkk'.B.BDDEEE /@/@AABB 	< 	<$*Y
;;K	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	<	""#344SS[9Q9QSS	
 	
 	
 v;;!= XXXXXQWXXXk** 	< 	<DAqI1 : : : :a : : :;;;;[!!!s   0M		MMr  r  r7   r  c                   | j                             | j        |||          }t          |t                    sJ ||_        |S )Nr  )r  generater  r:   r  r  )rO   r   r  r  r  r  s         rE   r  zHalideKernel.genfuncW  sG     h	4eLL#011111!
rG   r  c               x    | j                             |          }t          |t                    sJ ||_        |S r  )r  newvarr:   r  r  )rO   r  r  r  s       rE   r  zHalideKernel.newfuncd  s:    hooEo**#011111!
rG   c                    t           j                            |                                                                          S )a  
        We map all tensors to 1D buffers in Halide since Halide has trouble representing some strides that PyTorch
        supports.  If there are gaps in the underlying layout the numel we pass to Halide includes the gaps while
        PyTorch's numel excludes them.
        )r$   r  
get_buffer
get_layoutstorage_sizer  s     rE   halide_buffer_numelz HalideKernel.halide_buffer_numelj  s2     w!!$''2244AACCCrG   c                   d }g }| j                                         \  }}}}t          t          ||          |          D ]\  }|                    |f           t          t                    rSj        dk    rj        J |	                    fd| j
                            j        d          D                        |S )zX
        Halide requires scalar inputs before outputs, so need to reorder args.
        c                j    | \  }}t          |t                    rdS d|j        v rdS d|j        v sJ dS )Nr%   out_ptrr   in_ptrr   )r:   r-   r   )	arg_tuple	_call_strr  s      rE   	arg_orderz.HalideKernel.halide_argdefs.<locals>.arg_orderw  sL    &NIs#w'' qch&&q38++++qrG   r9  r   Nc           	   3  l   K   | ].}d t          |j        j        j        j                  fV  /d S )Nalias_of)r.   bufferr   rJ  r   )r  aliasr  s     rE   r  z.HalideKernel.halide_argdefs.<locals>.<genexpr>  sg          !!JIJ%(X  	     rG   rd   )rm   python_argdefsr  r  rV  r:   r.   rJ  r)  r\  r  r  r   )rO   r&  r  r  r   r   call_strr  s          @rE   halide_argdefszHalideKernel.halide_argdefsr  s    
	 	 	 =?Y--//
1a#C1II9=== 	 	MHcMM8S/***#y)) zQ3<+?+??     "&!4!8!82!F!F      rG   r   c                    g }                                  D ]\  }}t          |t                    r	d}d}d}d}n fd j        |j                 D             } fd j        |j                 D             }t          |          t          |          k    sJ t           j        |j                           }t          |j	                  d}|
                    t          ||j        ||||j                             t          j                                        }|j        dk    r6t"          j        j        g}	t"          j        j        }
dt+                      i}d}n|j        d	k    s
J d
            |j        dk    s
J d            t"          j        j        g}	t"          j        j        }
t2          j                            |          }d|	d         vr:dD ]7\  }}|j        |k    r'|j        |k    r|	
                    d| |             n8|	
                    d           d|j        i}t?          d|j                  }|	
                    d           |	
                    d           t"          j        j         s|	
                    d           t"          j        j!        r|	
                    d           d j"        v r|	
                    d           tG          |d$                    |	          |
||          S )z)Compute metadata required by codecache.pyNlongc                ^    g | ])}t                              |j                            *S rd   )r0   r  r  r  r   rO   s     rE   r  z3HalideKernel.halide_kernel_meta.<locals>.<listcomp>  sA        $..qv6677  rG   c                ^    g | ])}t                              |j                            *S rd   )r0   r  r  r2  s     rE   r  z3HalideKernel.halide_kernel_meta.<locals>.<listcomp>  sA        $..qx8899  rG   r  )r  r  rJ  r)  cpuparallelismcudazonly cpu/cuda supportedr   zonly default device supportedcuda_capability))      )r8  r   )      )r:  r   )r9  r%   cuda_capability_user_contextstrict_float
no_runtime
no_assertsdebug64large_buffers-)target	schedulerscheduler_flagscuda_device)%r.  r:   r-   r  r   rl   r0   r  r/   r   rV  r   r)  r$   r  get_current_device_or_throwtyper   r  
cpu_targetscheduler_cpur    r  
gpu_targetscheduler_cudar<   r6  get_device_propertiesmajorminormulti_processor_countr@   assertsrA  r^   r   r  )rO   argtypesr  r  r  r  rJ  r   current_devicerE  rF  rG  rH  
capabilityrP  rQ  s   `               rE   halide_kernel_metazHalideKernel.halide_kernel_meta  sM   ))++ 	 	FAs#w'' 6   !3CH=     !3CH=   5zzS[[0000t238<=='	2555OOH!! \  	 	 	 	 <<>>%''m./F3I355O KK!&&0002K000!'1,,,.M,,,m./F4I99.IIJ q	11$L  LE5!'500Z5E5N5N&G&G&G&GHHHMM.)))z?O
 a!566K 	n%%% 	l###}$ 	(MM,'''= 	#MM'"""4###MM/***88F##+#
 
 
 	
rG   c                L
     j         j        rt          d                                           }t	                      }|                    dd           |                                                                  D ]\  }}t          |t                    r&|
                    |j         d j         d           @|j        s
J |            d|j        v rdnd	}t          |j                  }t!           j        |j                           }|
                    |j         d
| d| d| d           |                    d           |                                                                  D ])\  }}|
                    |j         d|j                    * j                                         D ]\  }	}
|
                    |	 d
|
             |                     j                    fd} j        j        D ]L}t          |t,                    r t.          j                            ||          }|
                    |           M|
                    d           |
                    d                                            D ]\  }}t          |t                    rLt4          j        j                            |j        d          }|
                    |j         d| d           g j        |j                 }g }t?          |          D ]\  }}                      t4          j        j                            |j!        d          |          }|"                    d| d           d|j        vr|
                    |j         d| d           	 |
                    |j         d| dtG          |j$                   d           n# tJ          $ r Y nw xY w	 |
                    |j         d| dtG          |j!                   d           # tJ          $ r Y w xY w|
                    |j         dd&                    |           d           |'                    d           |                    d(                                           |j)        rK|                    dtU          j+        |j)                  d|j,        d |j)        d|j-        d!	d           n |                    d"|j,        d#d           |.                                S )$z3Called at the end to generate a final kernel stringinplace_buffersz
            import halide as hl
            from torch._inductor.runtime import halide_helpers
            from math import inf, nan

            @hl.generator(name="kernel")
            class Kernel:
        Tstripz = hl.InputScalar(r9   outzhl.OutputBufferzhl.InputBufferr}  r   r\   z&
            def generate(g):
        z = g.c                    t          t          j        j        |                     d                             }|j        
J |            t          |          S )Nr%   )r   r  r  r  rt  r  r  )rw  r  rO   s     rE   update_indexz1HalideKernel.codegen_kernel.<locals>.update_index  sH    ($(*>qwwqzz*JKKC=,,c,,,s88OrG   r  zassert g.using_autoscheduler()r%   r  z.set_estimate(r{  z.dim(z).set_min(0)z).set_stride(z).set_extent(z.set_estimates([r|  r   zN
            if __name__ == "__main__":
                hl.main()
            z:
                else:
                    hl.load_plugin(z))
                    target = hl.Target(z=)
                    autoscheduler = hl.AutoschedulerParams(a  )
                    with hl.GeneratorContext(target, autoscheduler):
                        gen = Kernel()
                        pipeline = gen._build_pipeline()
                        # gen.compile_to_callable() does not run the autoscheduler
                        pipeline.apply_autoscheduler(target, autoscheduler)
                        kernel = pipeline.compile_to_callable([
                                gen._get_input_parameter(a.name)._to_argument()
                                for a in gen._get_arginfos()
                                if a.dir == hl.ArgInfoDirection.Input
                            ], target)
                zR
                  else:
                      with hl.GeneratorContext(hl.Target(zX)):
                          kernel = Kernel().compile_to_callable()
                  )/rm   rY  rI   rW  r)   splice	do_indentr.  r:   r-   r  r   r^   r*  r   r   rl   r  aliasesra  r  _linesr  r  r  r}  r$   r  r  r7  r`   r~  _autoscheduler_workaroundsr  rV  r;   r  r  r  do_unindentrstriprF  r   find_libautoschedulerE  rG  getvalue)rO   r   metacoder  r  argclsargtypendimr  r  r^  r   hintr  range_hintsrn  dims   `                 rE   codegen_kernelzHalideKernel.codegen_kernel  s
   9$ 	1/000&&((  	 
	
 
	
 
	
 	))++ 	L 	LFAs#w'' L#(QQd>NQQQRRRRz&&3&&z.3sx.?.?**EU%ci0041#(;<<#(JJvJJJJ4JJJKKKK	
 	
 	

 	))++ 	9 	9FAsNNch77SX778888	))++ 	- 	-HCNNc++c++,,,,D&'''	 	 	 	 	
 I$ 	! 	!D$$$ N(599,MMNN4    r7888))++ 	X 	XFAs #w'' Xw'11#(Q1GG#(AA$AAABBBB-ch7 'oo ! !FAs::(2238a2HH$ D  &&'=d'='='=>>>CH,,#('H'H'H'H'HIII! NN#&8 T T! T T#cj// T T T     ) ! ! ! D!! NN#&8 R R! R R#ch-- R R R     ) ! ! ! D! - #(VVDIIk<R<RVVVWWWW 		
 	
 	
 > 	KK$3$H$X$X  )-  =AN	  QUPd	    #     ( KK:>+  
      }}s$   5O
O! O!%5P
P*)P*c                    t          |          dk    rLt          j        j        dk    r7t          j                                        j        dk    rt          d|           } | S )Nr%   Anderson2021r6  r   )	rl   r   r  rN  r$   r  rI  rJ  r@   )r   r  s     rE   rc  z'HalideKernel._autoscheduler_workaroundsb  sS     IINN,>>3355:fDD Aq		ArG   c                f   t           j        j        }d |                                 D             }t           j                                        }|j        dk    r?|                    |j        t           j        j                  }|	                    |           |
                    |||d           dS )zCodegen a call to this kernelc                (    g | ]\  }}|j         | S r   r(  )r  r   r  s      rE   r  z,HalideKernel.call_kernel.<locals>.<listcomp>p  s%    XXX33<CWVCWCWCWrG   r6  F)devicetritonN)r$   r  wrapper_coder.  rI  rJ  write_get_raw_streamr  r   rV  generate_kernel_call)rO   r   r-  wrapper	call_argsrU  stream_names          rE   call_kernelzHalideKernel.call_kernelm  s    '&XX$*=*=*?*?XXX	<<>>&((!66$agl K [)))$$!	 	% 	
 	
 	
 	
 	
rG   c                    dS r  rd   )rO   r  s     rE   generate_assertzHalideKernel.generate_assert~  s    urG   r`   r  lowerupperc                    d S r   rd   )rO   r`   r  r  r  s        rE   check_boundszHalideKernel.check_bounds  s	     	rG   )r  r  rJ   rK   )r   r   rJ   r  )NNN)r  r  )r  r  )r  r  r  r  r  r   r  )r   r  r  r  )r   r  r   )
r   r  r  r  r   r'   r  r6   rJ   rK   )
r   r   r   r   r  r5   r   r  rJ   r  )r  r  r  r  r   r  rJ   r  )r  r7   rJ   r  )rJ   r   )r`   r  r  r  r  r   r  r   )*rS   rT   rU   r   	overridestexprr  r  rN   r  r  rl  rx  rb  r  r  r  r  r  r  r  r  rV  r  r  r  r  r  r   unknownr  r  r  r.  rW  rp  r   rc  r}  r  r  rV   rW   s   @rE   r  r    s        I).E....+ + + + + +6" " " "= = = =i i i i i iV  *B B BN N N N N N% % %f: f: f: f:P  (
 
 
. . . ..
 
 
   %H %H %H %HN? ? ? ? SW6 6 6 6 6:< < < <|$ $ $LS" S" S" S"t #{"$$ $      =A      D D D D" " "HQ
 Q
 Q
 Q
fw w w wr   \
 
 
 
 
"         rG   r  c                  0    e Zd ZeZedd            Zd ZdS )	HalideSchedulingru  torch.devicerJ   OrderedSet[BackendFeature]c                    t          t          j        t          j        t          j        g          }t
          j        j        r|                    t          j	                   |S r   )
r   r&   TUPLE_REDUCTIONPREFER_STORE_LOOP_ORDERREDUCE_TO_SINGLE_ELEMENTr   r  scan_kernelsr%  SCAN)r   ru  r  s      rE   get_backend_featuresz%HalideScheduling.get_backend_features  sQ    .67
 
 =% 	,JJ~*+++rG   c                R   t           j        j        }||j        v r|j        |         }nd|                                 }||j        |<   |                    d           t                      }|                    d|                                d           |	                    |d           |                    d           t          ||          \  }}| d| }	|                    ||                                |	           t          d	          rt          |d
|           |S )z6Codegen kernel definition to go in output wrapper codehalide_kernel_zEfrom torch._inductor.runtime.hints import HalideMeta, HalideInputSpeczasync_compile.halide(z, '''TrZ  z''')
kernel_metadatar  )r$   r  rw  src_to_kernelnext_kernel_suffixadd_import_oncer)   r  rW  r_  r   define_kernelrg  r   r   )
rO   src_codenode_scheduler]   rz  kernel_namecompile_wrapperoriginsdetailed_originsmetadata_comments
             rE   r  zHalideScheduling.define_kernel  sY   '&w,,,!/9KKI7+E+E+G+GIIK.9G!(+##W   -..O%%L(A(A(C(CLLL   ""84"888%%f---(;M7(S(S%G%")??-=??!!_55779I   ''899 ?#KX>>>rG   N)ru  r  rJ   r  )rS   rT   rU   r  kernel_typer  r  r  rd   rG   rE   r  r    sF        K
 
 
 [
    rG   r  )t
__future__r   dataclassesrJ  r  loggingr  collectionsr   mathr   typingr   r   r   r	   r
   r   r   r<   torch._logging_prims_commonr   utils._ordered_setr   utils._sympy.functionsr   r   utils._sympy.symbolr   r   utils._sympy.value_rangesr   r  r   r   	codecacher   r   metricsr   r   ops_handlerr   runtime.hintsr   r   utilsr   r   r    r!   r"   virtualizedr#   rU  r$   commonr&   r'   r(   r)   r*   r+   r,   r-   r.   cppr/   	cpp_utilsr0   simdr1   r2   r3   collections.abcr4   r5   r6   shape_propagationr7   	getLoggerrS   rg  rF   RuntimeErrorrI   rY   r   r  pexprr   r   r   r   float64r|  int16r   r>   uint8uint16uint32uint64r   r   r   r   _initialize_pointwise_overridesr  	dataclassr  r  r{  r  r  rd   rG   rE   <module>r     sb   " " " " " "              				 # # # # # #       F F F F F F F F F F F F F F F F       - - - - - - , , , , , , ? ? ? ? ? ? ? ? 7 7 7 7 7 7 7 7 4 4 4 4 4 4         ' ' ' ' ' ' ) ) ) ) ) ) B B B B B B B B ) ) ) ) ) ) 7 7 7 7 7 7 7 7              ) ( ( ( ( ( ( (
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
             ; ; ; ; ; ; ; ; ; ;  3((((((66666666222222g!!
 
 
F F F F F, F F F
Q Q Q Q QM Q Q QD 	 
J	NO	M>	M>	M>	J	K	K	K	K	L-	L-	L-"    H9 H9 H9 H9 H9k H9 H9 H9V
  / / 9 9 9%P %P %P %P %P %P %P %PP + + + + + + + +>
 
 
   ^ ^ ^ ^ ^: ^ ^ ^B+ + + + +~ + + + + +rG   