
    `i                       d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
mZ d dlZd dlmZ d dlZd dlmZ d dlmZ d dlmZmZ d d	lmZ d
dlmZmZmZ d
dlmZm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z, erd dlm-Z- d
dl.m/Z/m0Z0 d
dl1m2Z2m3Z3 ddl"m4Z4  ej5        e6          Z7ej8        dej9        dej:        dej;        dej<        dej=        dej>        dej?        dej@        di	ZAd+d!ZB G d" d#e          ZC G d$ d%e'          ZDeDE                    d&           eDF                                  G d' d(e+          ZG G d) d*e,          ZHdS ),    )annotationsN)Path)AnyOptionalTYPE_CHECKING)
PRECEDENCE)_embed_headers)
OrderedSet)
CppPrinterExprPrinter)ValueRanges   )ceildivget_bounds_index_exprget_kernel_metadata)ops
OpsWrapperV   )CSEVariableDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferOpOverridesPythonPrinter)IterationRangesEntry
SIMDKernelSIMDScheduling)Union)ReductionType	StoreMode)	SchedulerSchedulerNode)OpVarTboolcharshortintlongucharfloathalfbfloatval)Union[float, int, bool, str, CSEVariable]returnstrc                    t          | t                    r<| t          j        k    rdS | t          j         k    rdS | | k    rdS t	          |           S t          | t
                    r| rdndS t	          |           S )N	HUGE_VALFz
-HUGE_VALFNANtruefalse)
isinstancer+   torchinfr1   r%   )r.   s    o/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_inductor/codegen/mps.pyvalue_to_metalr;   8   s    #u 	*%);UYJ<CZZ53xx	C		 *)vv')s88O    c                  ~    e Zd ZdZddZddZddZdd	Zdd
ZddZ	ddZ
ddZddZddZddZeZddZddZdS )MetalExprPrinterz/Converts sympy expression to Metal code snippetexpr
sympy.Exprr0   r1   c                    |j         \  }}|                     |          }|                     |          }|j        r	d| d| dS d| d| dS )Nc10::metal::floor_divide(, )metal::floor() / (argsdoprint
is_integer)selfr?   xdivs       r:   _print_FloorDivz MetalExprPrinter._print_FloorDivI   si    3LLOOll3? 	;:q::C::::-q--s----r<   c                    |j         \  }}}|                     |          }|dk    r/|                     |          }|j        r
d| d| d}n	d| d| d}|                     |          }d| d| dS )Nr   (rF   rD   rE   z) % (rG   )rK   r?   rL   rM   mods        r:   _print_ModularIndexingz'MetalExprPrinter._print_ModularIndexingQ   s    i3LLOO!88,,s##C 3&&&&&&2A22C222ll3!1!!3!!!!r<   c                    t          |j                  dk    rt          d          t          | j        |j                  \  }}d| d| d| d}d| d| d| d}d| d| dS )	Nr   z$metal::min only supported for 2 argsstatic_cast<decltype(+)>(rD   zmetal::min(rC   lenrH   RuntimeErrormap_printrK   r?   ab
typecast_a
typecast_bs         r:   
_print_MinzMetalExprPrinter._print_Min]       ty>>QEFFF4;	**1;Q;;;;q;;;
;Q;;;;q;;;
8Z88:8888r<   c                    t          |j                  dk    rt          d          t          | j        |j                  \  }}d| d| d| d}d| d| d| d}d| d| dS )	Nr   z$metal::max only supported for 2 argsrT   rU   rV   rD   zmetal::max(rC   rW   r\   s         r:   
_print_MaxzMetalExprPrinter._print_Maxe   rb   r<   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )Nr   metal::abs(r   rD   rX   rH   r[   rK   r?   s     r:   
_print_AbszMetalExprPrinter._print_Absm   s>    49~~""""9T[[1669999r<   c                ~    t          |j                  dk    sJ d|                     |j        d                    dS )Nr   zstatic_cast<long>(metal::rint(r   ))rg   rh   s     r:   _print_RoundToIntz"MetalExprPrinter._print_RoundToIntq   s>    49~~""""MDIaL0I0IMMMMr<   c                    t          |j                  dk    sJ |j        \  }}|j        r|dk     sJ t          d| d          |                     |t
          d                   }d| d| d|  d	S )
Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulz!static_cast<float>(metal::rint(1e * z) * 1erD   )rX   rH   rJ   
ValueErrorparenthesizer   )rK   r?   numberndigits
number_strs        r:   _print_RoundDecimalz$MetalExprPrinter._print_RoundDecimalu   s    49~~"""") 	Q;;;;lbilll   &&vz%/@AA
\7\\z\\RYQY\\\\r<   c                t    |j         \  }}d|                     |           d|                     |           dS )Nstatic_cast<float>(z) / static_cast<float>(rD   )rH   r[   )rK   r?   lhsrhss       r:   _print_IntTrueDivz"MetalExprPrinter._print_IntTrueDiv   s?    9SaT[[%5%5aadkkZ]N^N^aaaar<   c                    t          |j                  dk    sJ t          | j        |j                  \  }}d| d| dS )Nr   zmetal::pow(static_cast<float>(z), static_cast<float>(rk   )rX   rH   rZ   rI   )rK   r?   rL   ys       r:   _print_PowByNaturalz$MetalExprPrinter._print_PowByNatural   sK    49~~""""4<++1NNNNNNNr<   c                    t          |j                  dk    sJ |                     |j        d                   }d| dS )Nr   r   rx   rD   rX   rH   rI   rK   r?   rL   s      r:   _print_ToFloatzMetalExprPrinter._print_ToFloat   sC    49~~""""LL1&&)Q))))r<   c                    t          |j                  dk    sJ |                     |j        d                   }d| dS )Nr   r   z1static_cast<int>(metal::floor(static_cast<float>(z)))r   r   s      r:   _print_FloorToIntz"MetalExprPrinter._print_FloorToInt   sC    49~~""""LL1&&I1IIIIr<   c                    t          |j                  dk    sJ |                     |j        d                   }d| dS )Nr   r   zstatic_cast<int>(metal::trunc(rk   r   r   s      r:   _print_TruncToIntz"MetalExprPrinter._print_TruncToInt   sC    49~~""""LL1&&55555r<   c                    t          |j                  dk    sJ |                     |j        d                   }d| dS )Nr   r   zmetal::log2(rD   r   r   s      r:   _print_OpaqueUnaryFn_log2z*MetalExprPrinter._print_OpaqueUnaryFn_log2   sC    49~~""""LL1&&"a""""r<   N)r?   r@   r0   r1   )__name__
__module____qualname____doc__rN   rR   ra   rd   ri   rl   rv   r{   r~   r   r   _print_floorr   r    r<   r:   r>   r>   F   s9       99. . . .
" 
" 
" 
"9 9 9 99 9 9 9: : : :N N N N
] 
] 
] 
]b b b b
O O O O
* * * *
J J J J
 %L6 6 6 6
# # # # # #r<   r>   c                  0   e Zd ZdZe	 	 dMdNd            ZedOd            ZedPd            ZedQd            ZedRd            Z	edSd            Z
edTd            ZedUd             ZedUd!            ZedUd"            ZedUd#            ZedVd$            ZedVd%            ZedVd&            ZedVd'            ZedVd(            ZedVd)            ZedVd*            ZedVd+            ZedVd,            ZedVd-            ZedVd.            ZedVd/            ZedVd0            ZedWd2            ZedVd3            ZedVd4            ZedVd5            Z edVd6            Z!edVd7            Z"edUd8            Z#edVd9            Z$edVd:            Z%edUd;            Z&edVd<            Z'edUd=            Z(edVd>            Z)edXdA            Z*edXdB            Z+edYdE            Z,edVdF            Z-edUdG            Z.dZdIZ/d[dJZ0e1d\dL            Z2dS )]MetalOverrideszXImplements Metal-specific overrides for ops. Base class emits Python-friendly overrides.NTrL   r   dtypetorch.dtype	src_dtypeOptional[torch.dtype]use_compute_typesr%   r0   r1   c                    |t           j        k    r t                              d           d|  dS dt          |          d|  dS )Nz>float64 cast requested, probably from tensorify_python_scalarsrx   rD   static_cast<>()r8   doublelogwarningDTYPE_TO_METAL)rL   r   r   r   s       r:   to_dtypezMetalOverrides.to_dtype   s[     EL  KKP   .----;nU3;;q;;;;r<   c                F    dt           |          dt           |          d|  dS )Nzas_type<z>(static_cast<r   rk   r   )rL   r   r   s      r:   to_dtype_bitcastzMetalOverrides.to_dtype_bitcast   s1     b./aa~i?Xaa\]aaaar<   r.   Union[bool, float, int]c                     t          |           S Nr;   )r.   r   s     r:   constantzMetalOverrides.constant   s    c"""r<   r?   r@   c                *   t           j                            t           j                            |                     }t           j        j                            t           j        j        |t          |                     }t          j	        ||          S )N)bounds)
r   kernelindex_to_strprepare_indexingcsegeneratecomputer   r   r   )r?   r   idx_strvars       r:   
index_exprzMetalOverrides.index_expr   sn    (''(A(A$(G(GHHhl##Hg.CD.I.I $ 
 
 |C'''r<   maskbodyotherc                    t           j                            | |          5 } |            }d d d            n# 1 swxY w Y   |j        j        rt          |          }t          j        |||          S r   )r   r   
mask_loadsr   is_boolr%   r   where)r   r   r   new_maskresults        r:   maskedzMetalOverrides.masked   s     X  u-- 	TVVF	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 =  	 KKEy65111s   8<<r]   r$   r^   cc                0    |  d| dt          |           S )Nz ? z : r   )r]   r^   r   s      r:   r   zMetalOverrides.where   s&    1111nQ//111r<   c                    d|  d| dS )Nzc10::metal::remainder(rC   rD   r   r]   r^   s     r:   	remainderzMetalOverrides.remainder   s    111Q1111r<   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrT   rU   rV   rD   zc10::metal::max(rC   r   r]   r^   r_   r`   s       r:   maximumzMetalOverrides.maximum   [    ;Q;;;;q;;;
;Q;;;;q;;;
=*==
====r<   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrT   rU   rV   rD   zc10::metal::min(rC   r   r   s       r:   minimumzMetalOverrides.minimum   r   r<   c                    |  d| S )Nz || r   r   s     r:   
logical_orzMetalOverrides.logical_or       }}}}r<   c                    |  d| S )Nz && r   r   s     r:   logical_andzMetalOverrides.logical_and   r   r<   c                    d|  dS )Nzmetal::isnan(rD   r   rL   s    r:   isnanzMetalOverrides.isnan       #q####r<   c                    d|  dS )Nzmetal::isinf(rD   r   r   s    r:   isinfzMetalOverrides.isinf   r   r<   c                    d|  dS )Nzmetal::log(rD   r   r   s    r:   r   zMetalOverrides.log       !Q!!!!r<   c                    d|  dS )Nzmetal::exp(rD   r   r   s    r:   expzMetalOverrides.exp   r   r<   c                    d|  dS )Nrf   rD   r   r   s    r:   abszMetalOverrides.abs   r   r<   c                    d|  dS )Nzmetal::signbit(rD   r   r   s    r:   signbitzMetalOverrides.signbit   s    %%%%%r<   c                    d|  dS )Nzmetal::precise::sin(rD   r   r   s    r:   sinzMetalOverrides.sin      *a****r<   c                    d|  dS )Nzc10::metal::sinc(rD   r   r   s    r:   sinczMetalOverrides.sinc  s    '1''''r<   c                    d|  dS )Nzmetal::precise::cos(rD   r   r   s    r:   coszMetalOverrides.cos  r   r<   c                    d|  dS )Nzmetal::tan(rD   r   r   s    r:   tanzMetalOverrides.tan  r   r<   c                    d|  dS )Nzmetal::asin(rD   r   r   s    r:   asinzMetalOverrides.asin      "a""""r<   c                    d|  dS )Nzmetal::acos(rD   r   r   s    r:   acoszMetalOverrides.acos  r   r<   c                    d|  dS )Nzmetal::atan(rD   r   r   s    r:   atanzMetalOverrides.atan  r   r<   r}   c                    d|  d| dS )Nz::metal::atan2(rC   rD   r   )rL   r}   s     r:   atan2zMetalOverrides.atan2   s    ***a****r<   c                    d|  dS )Nzmetal::sqrt(rD   r   r   s    r:   sqrtzMetalOverrides.sqrt$  r   r<   c                    d|  d|  dS )NrT   z)>(-rD   r   r   s    r:   negzMetalOverrides.neg(  s     3q22a2222r<   c                    d|  dS )Nzmetal::rsqrt(rD   r   r   s    r:   rsqrtzMetalOverrides.rsqrt.  r   r<   c                    d|  dS )Nzmetal::tanh(rD   r   r   s    r:   tanhzMetalOverrides.tanh2  r   r<   c                    d|  dS )Nzmetal::atanh(rD   r   r   s    r:   atanhzMetalOverrides.atanh6  r   r<   c                    d|  d| dS )NrB   rC   rD   r   r   s     r:   floordivzMetalOverrides.floordiv:  s     51444444r<   c                    d|  dS )NrE   rD   r   r   s    r:   floorzMetalOverrides.floor?  r   r<   c                    d|  dS )Nzmetal::sign(rD   r   r   s    r:   signzMetalOverrides.signC  r   r<   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrT   rU   rV   rD   zmetal::fmod(rC   r   r   s       r:   fmodzMetalOverrides.fmodG  s[    ;Q;;;;q;;;
;Q;;;;q;;;
9j99J9999r<   c                    d|  dS )Nmetal::trunc(rD   r   r   s    r:   trunczMetalOverrides.truncM  r   r<   c                l    |  d| }| j         | j         j        s|j         |j         j        rd| dS |S )Nz / r   rD   )r   is_floating_point)r]   r^   quots      r:   truncdivzMetalOverrides.truncdivQ  sK    ||||GAG$=GAG$=*4****r<   c                    d|  dS )Nzmetal::ceil(rD   r   r   s    r:   ceilzMetalOverrides.ceilZ  r   r<   seedoffsetc                \    t           j        j                            d           d|  d| dS )Nrandomzc10::metal::rand(rC   rD   r   r   headersaddr  r  s     r:   randzMetalOverrides.rand^  s4    	X&&&444464444r<   c                \    t           j        j                            d           d|  d| dS )Nr	  zc10::metal::randn(rC   rD   r
  r  s     r:   randnzMetalOverrides.randnc  s4    	X&&&5D55F5555r<   lowhighc           	     h    t           j        j                            d           d|  d| d| d| d	S )Nr	  zc10::metal::randint64(rC   rD   r
  )r  r  r  r  s       r:   	randint64zMetalOverrides.randint64h  sF     	
X&&&HHHHH#HHHHHHr<   c                    d|  dS )Nzmetal::rint(rD   r   r   s    r:   roundzMetalOverrides.roundo  r   r<   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrT   rU   rV   rD   zmetal::pow(rC   r   )r]   r^   cast_acast_bs       r:   powzMetalOverrides.pows  s[    777Q771777777Q7717770V00v0000r<   namec                \    t           j        j                            d           d| d| dS )Nspecial_mathc10::metal::rP   rD   r
  )rK   r]   r  s      r:   _special_unaryzMetalOverrides._special_unaryy  s4    	^,,,)d))Q))))r<   c                b    t           j        j                            d           d| d| d| dS )Nr  r  rP   rC   rD   r
  )rK   r]   r^   r  s       r:   _special_binaryzMetalOverrides._special_binary}  s<    	^,,,.d..Q..!....r<   Nonec           
        dD ],}t          | |t          j        | j        |                     -t          j        | j        d          | _        dD ]/}t          | |t          j        | j        |dz                        0dD ],}t          | |t          j        | j        |                     -dD ]/}t          | |t          j        | j        |dz                        0d S )N)erferfinvi0i0ei1i1edigammaspherical_bessel_j0)r  	log_gamma)
	bessel_j0	bessel_j1	bessel_y0	bessel_y1modified_bessel_i0modified_bessel_i1modified_bessel_k0modified_bessel_k1scaled_modified_bessel_k0scaled_modified_bessel_k1_forward)	polygammaigammaigammaczeta)
chebyshev_polynomial_tchebyshev_polynomial_uchebyshev_polynomial_vchebyshev_polynomial_whermite_polynomial_hhermite_polynomial_heshifted_chebyshev_polynomial_tshifted_chebyshev_polynomial_ushifted_chebyshev_polynomial_vshifted_chebyshev_polynomial_w)setattr	functoolspartialmethodr  lgammar!  )clsr  s     r:   _initialize_special_opsz&MetalOverrides._initialize_special_ops  s8   	
 
	W 
	WD Cy6s7IPTUUUVVVV,S-?kRRR

 	 	D '(:
ARSSS   
 	X 	XD Cy6s7JQUVVVWWWW
 	 	D '(;$BSTTT   	 	r<   )NT)
rL   r   r   r   r   r   r   r%   r0   r1   )rL   r   r   r   r   r   r0   r1   )r.   r   r   r   r0   r1   )r?   r@   r   r   r0   r1   )r   r   r   r@   r   r   r0   r1   )r]   r$   r^   r$   r   r$   r0   r1   )r]   r$   r^   r$   r0   r1   )r]   r   r^   r   r0   r1   )rL   r   r0   r1   )rL   r   r}   r   r0   r1   )r  r   r  r   r0   r1   )
r  r   r  r   r  r   r  r   r0   r1   )r]   r   r  r1   r0   r1   )r]   r   r^   r   r  r1   r0   r1   r0   r"  )3r   r   r   r   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r!  classmethodrK  r   r<   r:   r   r      s       bb ,0"&	< < < < \< b b b \b
 # # # \# ( ( ( \( 2 2 2 \2 2 2 2 \2 2 2 2 \2 > > > \>
 > > > \>
    \    \ $ $ $ \$ $ $ $ \$ " " " \" " " " \" " " " \" & & & \& + + + \+ ( ( ( \( + + + \+ " " " \" # # # \# # # # \# # # # \# + + + \+ # # # \# 3 3 3 \3
 $ $ $ \$ # # # \# $ $ $ \$ 5 5 5 \5 $ $ $ \$ # # # \# : : : \:
 $ $ $ \$    \ # # # \# 5 5 5 \5 6 6 6 \6 I I I \I # # # \# 1 1 1 \1
* * * */ / / / = = = [= = =r<   r   mpsc                  T    e Zd ZU dZeZdZdZdZdZ	 e
            j        Z e            j        Z e            j        ZeZ edg          Zded<   g Zd	ed
<   d? fdZd@dZdAdZ	 dBdCdZdDd Zddd! ej                    fdEd+ZdFd0ZdFd1ZdGd4Z dHd5Z!dBdId7Z"dBdJd9Z#dKd>Z$ xZ%S )LMetalKernelz;Implement Metal codegen based on the SIMDKernel abstraction;auto i       utilszOrderedSet[str]r  zlist[IterationRangesEntry]multistage_reduction_entrytilingdict[str, sympy.Expr]kwargsr   r0   r"  c                l     t                      j        |fi | t          j                    | _        d S r   )super__init__	itertoolscountacc_var_ids)rK   rW  rY  	__class__s      r:   r\  zMetalKernel.__init__  s:    
 	**6***$?,,r<   r   r   r1   c                    t           |         S r   r   )rK   r   s     r:   dtype_to_strzMetalKernel.dtype_to_str  s    e$$r<   r  indexr@   r   c                p   | j                             |          }|                     |          }t          j                            |          }| d|                     |           d}|t          j        t          j	        fv rd| d}t          j
        }| j                            | j        ||          S )z"Codegen a load from an InputBuffer[]rx   rD   r   )rH   inputr   r   graph	get_dtyper   r8   float16bfloat16float32r   r   loads)rK   r  rc  r   r   lines         r:   loadzMetalKernel.load  s    iood##%%e,,!!$''33))%00333U]EN333 1000DMEx  T ???r<   Nvaluemoder!   c                   | j                             |          }|                     |          }|                     t          j                            |                    }d| d| d}|| d|                     |           d| d}nc|dk    rK| j        	                    d           d	| d
}	d|	 d| d}
|	 d|
 d|                     |           d| d}nt          d|           | j        r*| j                            t          ||                     d S | j                            t          ||                     d S )Nr   r   rD   re  ] = rR  
atomic_addatomiczc10::metal::AtomicType<>zreinterpret_cast<device z
::type *>(z::atomic_add(rC   );zUnimplemented store mode )rH   outputr   rb  r   ri  rj  r   r  r  rY   inside_reductionr   	writeliner   stores)rK   r  rc  rq  rr  r   	dtype_strcast_valro  atomic_typecast_vars              r:   storezMetalKernel.store  s    it$$%%e,,%%ag&7&7&=&=>>	7)77u777<EED--e44EE(EEEDD\!!LX&&&@I@@@KO+OOOOOH!ddddD<M<Me<T<TddX`dddDDA4AABBB  	<L""<d#;#;<<<<<K!!,tT":":;;;;;r<   c                   | j                             |          }|                     |          }|                     t          j                            |                    }t          d | j        D                       }| d| 	                    |           d| d| d}d|j
         d| }| j                            t          ||                     d S )Nc              3  (   K   | ]}|j         	|V  d S r   is_reduction.0ts     r:   	<genexpr>z.MetalKernel.store_reduction.<locals>.<genexpr>  s)      KK1ANKQKKKKKKr<   re  z] = static_cast<r   rx  if (z == 0) )rH   ry  r   rb  r   ri  rj  nextrange_treesr   r  r|  r{  r   )rK   r  rc  rq  r   r}  reduction_dimro  s           r:   store_reductionzMetalKernel.store_reduction  s    it$$%%e,,%%ag&7&7&=&=>>	KK(8KKKKKXX))%00XX)XXuXXX7m(7777l46677777r<   TUnion[str | torch.dtype]
elem_countOptional[int]default_valueOptional[Any]is_threadgroupr%   r   ValueRanges[Any]c                   t          |t          j                  r|                     |          }dt	          | j                   }t          j                            |||          }|rdnd}|| d| z  }|r|d| 	                    |           dz  }||r
J d            |d| z  }| j
                            || j        z              |S )	Ntmp_acc_zthreadgroup   re  rf  z+Thread group var can not have default value = )r7   r8   r   rb  r  r_  r   r   create_cse_varsexprindexing_coder{  suffix)	rK   r   r  r  r  r   var_namer   var_defs	            r:   _new_idxvarzMetalKernel._new_idxvar  s     eU[)) 	-%%e,,E6d4#34466h%%h>>$2:..e((h((( 	544::j114444G$%TT'TTT%,],,,G$$Wt{%:;;;
r<   r   reduction_typer    +Union[CSEVariable, tuple[CSEVariable, ...]]c                    |||f}|| j         j        v r| j         j        |         S |                     ||||          }|| j         j        |<   |S )z)Caching wrapper around _reduction_nocache)r   reduction_cache_reduction_nocache)rK   r   r   r  rq  	cache_keyr   s          r:   	reductionzMetalKernel.reduction#  s]     6	0008+I66((	>5QQ.4 +r<   c                   | j         sJ | j        rJ dFd}d}d}| j        D ]l}|j        s
|r|dz  }||j         d	| z  }t          |j        t          j                  r||j        z  }J|t          j	        |j
         d
dd          z  }mt          j        || j                  }|                     |          }	t          |t          j                  rt          || j                  n| j        }
|dk    r|                     |          }| j                            | d           | j                            d           | j                            d| d| d           | j                            d           |S | j                            d           |dv rt0          |         }|                     ||
          }| j        s|}nH|dk    rdnd\  }}|                     ||d          }| j                            | d| d| d           | j                            | j        d| d| d| d| d|	 d t0          |         !          S |d"v r|                     ||
          }t8          |         }d#| d$| d }| j        s|}n`|                    d%          rd&nd%}d'| d(| d)}|                     ||d          }| j                            | d*| d| d| d+           | j                            | j        d| d| d| d| d|	 d t0          |         !          S |d,v rj|                     ||
          }|                     ||
          }t8          |         }d#| d$| d }| j        s|}d#t8          |          d$| d }n|                    d%          rd&nd%}d'| d(| d)}|                     ||d          }|                     |d-d          }t=          d. | j                                         D                       }|d/k    rd0nd1}|j!        rd2| d3nd}| j                            d| d| d| | d| d4| d5| d4|j         d6           | j                            | j        d| d| d| d| d| d| d|	 d |!          S |d7k    r| j        sz|                     ||          }| j                            | d8| d9| d           | j                            | j        d| d| d|	 d tD          j#        !          } ||          S |                     d:|          }| d8| d;}| j                            | d<           | j                            | d=| d>| d?           | j                            | j        d@| d| d tD          j#        !          } ||          S |dAk    r/t          |tH                    s
J dB            |                     d:|          }| d8| d;}dC|d-          d|d          d|dD          d }| j                            | d<           | j        rA| j                            | d<           | j                            | d=| d| d+           n | j                            | d4| d           | j                            | j        r| j        n| j        d| d| d|	 d tD          j#        !          } ||          S tK          |          )GzeCodegen a reduction operation.
        Only sum and prod operations are somewhat reasonable optimizedres3r   r0   tuple[CSEVariable, ...]c                D     t          j         fddD                       S )Nc                P    g | ]"}t           d | j        j                  #S )rn   )r   r   r   )r  r  r  s     r:   
<listcomp>zJMetalKernel._reduction_nocache.<locals>._unwrap_helper.<locals>.<listcomp>B  s3    TTT]]q]]DKDDTTTr<   xyz)r   _unwrap)r  s   `r:   _unwrap_helperz6MetalKernel._reduction_nocache.<locals>._unwrap_helper?  s/    %TTTTeTTT  r<   r  r    + rp   numelTintegerpositiveanyz	 = false;z7threadgroup_barrier(metal::mem_flags::mem_threadgroup);z
                if (z) {
                    z' = true;
                }
            reduction_utils)prodsumr  )r   rU   )r   *F)r  r  r  z= rR  zc10::metal::threadgroup_rP   rC   rD   rg  )maxminr   r   r  lowestz::metal::numeric_limits<z>::z()z = ::c10::metal::rx  )argminargmaxr   c              3  (   K   | ]}|j         	|V  d S r   r  r  s     r:   r  z1MetalKernel._reduction_nocache.<locals>.<genexpr>  s=             r<   r  rw  <z || ::metal::isnan(z) r  z;
                    z$;
                }
                welford_reducere  rt  float3rf  z = 0.0;z! = ::c10::metal::welford_combine(z	, float3(z, 0.0, 1.0));z(c10::metal::threadgroup_welford_combine(welford_combinez&Input to welford combine must be tuplezfloat3(r   N)r  r   r0   r  )&rz  
_load_maskr  r  r  r7   r  sympyIntegerSymbolprefixMinmax_threadgroup_sizer  r   simd_group_sizer  r  r{  r   splicer|  r  r  r   rV  r   r   r   endswithr  range_tree_nodesvaluesr  r8   rm  tupleNotImplementedError)rK   r   r   r  rq  r  reduction_idxacc_buf_sizerdacc_buf_size_strshmem_buf_sizeacc	acc_dtypeacc_bufr.   default_valreduction_opsrc_metal_type
cast_valuelim_fn	limit_valdata_acc_bufidx_acc_bufidx_validx_varcmp_op
nan_suffixwf_resacc_thread_var	inp_values                                 r:   r  zMetalKernel._reduction_nocache3  s-
    $$$$?"""	 	 	 	 " 	 	B?  '&::L:::M"(EM22 (y'''! ! !  yt/HII::l33 ,66&GL$"6777% 	 U""""5))C((C):):):;;;((I   L      K!!I   J*+++_,,29=I&&y.AAG2 	G !/% 7 7HHX *\ &&[ '   ##s$E$E\$E$EU$E$E$EFFF8$$r>rrGrrsrrmrr_orrr07 %   
 ^++&&y.AAG+I6NBBB%BBBJ2 
 %3%<%<U%C%CNT~TT&TTT	&&Yu '   ##SS^SScSSZSSS   8$$r>rrGrrsrrmrr_orrr07 %   
 111++I~FFL**5.AAK+I6NBBB%BBBJ2  R)>RR-RRR%3%<%<U%C%CNT~TT&TTT	&&Yu '   **5RW*XX  #4;;==     !/( : : !23%3333 
 ## %% %#% %&)%+5% %% %"% % % % ")% % %    8$$J> J JL J JK J JJ J!J J%2J J6FJ J J	 %    ---2 .**9lCC##w$L$L$L$LE$L$L$LMMM**L^~^^^^K[^^^- +  
 &~f---&&x>>G '::-:::N%%&@&@&@AAAL""!qqNqq]bqqq   X&&U7UUlUUUm '  F
 ">&)))...eU++UU-UUU+&&x>>G '::-:::NE%(EEeAhEE%(EEEI%%&@&@&@AAA. K"))^*D*D*DEEE&&%ggggZcggg    &&.'I'IY'I'I'IJJJX&&#>PDLZ>ZZGZZGWZZZm '  F
 ">&)))!.111r<   entryr   c                2   |                      |j                  }|                     |          }|j        r9t	          |j        j        t          j                  rD|j        j        | j	        k    r/| j
                            | j         d|j         d| d           d S t	          |j        j        t          j                  r|j        j        n#t          j        |j        j         ddd          }| j                            |           |t%          | j	        dz
            z   t%          | j	                  z  }|                     |          }| j                            d|j         d	|j         d
| d|j         d	           | j                                        5  t	          |t          j                  r>| j                            | j         d|j         d| j	         d|j         d| d
           n8| j                            | j         d|j         d| d| d|j         d
           t	          |t          j                  s|| j	        z  |k    r&| j                            d|j         d| d           d d d            d S # 1 swxY w Y   d S )Nr  r  rR  r  Tr  r   z	for(auto z
_cnt = 0; z_cnt < z; ++z_cnt) {rp   z_cnt + r  z_cnt;r  z >= z) break;)rename_indexingr?   r  r  r7   rootr  r  r  r  r  r{  index_dtyper  r  r  rV  appendr+   r   indent)rK   r  r   	index_stracc_size	loop_sizeloop_size_strs          r:   codegen_iteration_ranges_entryz*MetalKernel.codegen_iteration_ranges_entry  s:   ))%*55
JJz**	! 	uz'77	
 D$===((#AAejAAYAAA   F %**EM::XEJ!29994RVWWW 	 	'..u555 d&?!&C D DD%J
 J
 
	 

9--	h
hhejhhhhTYT^hhh	
 	
 	
 Y 	O 	O(EL11 	##'vv%*vv9RvvW\Wavvjsvvv    	##'jj%*jjjj9jjY^Ycjjj   8U\22Ot88HDD	##$M5:$M$M8$M$M$MNNN	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	O 	Os    CJJJc                   | j         r%| j                                        5  | j                            | j                   | j                            | j                   ddd           n# 1 swxY w Y   | j                            dt          | j                   z             | j        	                    t          d | j        j                                        D                                  | j         r2| j                                                                          | j         2n>| j                            | j                   | j                            | j                   | j                            | j                   | j                                         | j                                         | j                                         dS )a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        N}c              3  T   K   | ]#}t          |t                    r|n|fD ]}|V  $d S r   )r7   r  )r  itemvs      r:   r  z+MetalKernel.codegen_body.<locals>.<genexpr>.  sb        &0u&=&=JddD7         r<   )rV  r   r  r  rn  r   r{  rX   r   
invalidater
   r  r  popcache_clearr|  clear)rK   s    r:   codegen_bodyzMetalKernel.codegen_body  s    * 	+!!## / /	  ,,,	  .../ / / / / / / / / / / / / / / Ic$*I&J&J JKKK
 H   $ 8 ? ? A A       1 D/3355AACCC 1 D ITZ(((IT\***	%%%
s   ?A--A14A1Optional[str]c                B   |                                   t                      }t          j        j        r|                    d           n|                    d           |                                 }|                                5  t          j        j        s$| j        D ]}|                    d| d           nhd | j        D             }t          |t          t                    j        j        j        dz  gt                                }|                    |           | j        rrt          j        d | j        D                       }t%          |t&          j                  rt+          || j                  n| j        }|                    d| d	           |                    d
           |                                5  | j        j                                        D ]]\  }	}
|	| j        v r|                     t          j                            |	                    }|                    d| d|
 d           ^| j        j                                        D ]\  }	}
t          j                            |	          }|t<          j        k    rKt          j                             |	          }||!                                g k    rtE          d          d}n|                     |          }|                    d| d|
 d           | j        j#                                        D ]\  }	}
|                    d|
 d           |D ]@}t%          |j$        t&          j                  r"|                    d|j%         d           AtM          |          dk     s
J d            tM          |          dk    rdtM          |           nd}tM          |          dk    r|d         j'        nd}| j        rdnd}|                    | d| d|            | j        r|                    | d           ddd           n# 1 swxY w Y   |                    d           |                                5  tM          |          dk    rFtQ          |          D ]6\  }}|                    d|j'         d tS          d!|z              d"           7|*                    | j+                   |*                    | j,                   ddd           n# 1 swxY w Y   |                    d#           ddd           n# 1 swxY w Y   t          j        j        r|                    d$           n|                    d%           |-                                S )&z3Called at the end to generate a final kernel stringz(R"MTL(zcompile_mps_shader('''#include <c10/metal/.h>c                    g | ]}d | d	S )r  r  r   )r  headers     r:   r  z.MetalKernel.codegen_kernel.<locals>.<listcomp>O  s1       ;A66666  r<   includec              3  2   K   | ]}|j         	|j        V  d S r   )r  r  r  s     r:   r  z-MetalKernel.codegen_kernel.<locals>.<genexpr>Z  s?       1 1 !1G1 1 1 1 1 1r<   z$[[max_total_threads_per_threadgroup(z)]]zkernel void generated_kernel(zdevice z* ,Nzfloat64 is not supported by MPSr+   z	constant zconstant long& znumel,   z%Up to 3 index variables are supportedr   uintr   
thread_posr  r  z [[thread_position_in_grid]]z- group_pos [[thread_position_in_threadgroup]]z) {rS  z = thread_pos.x   rR  r  z)MTL");z''')).r   r   r   ri  cpp_wrapperr{  active_range_treesr  r  r	   r   __file__parentr
   rz  mathr  r  r7   r  r  r  r  rH   output_buffersitemsremoved_buffersrb  rj  input_buffersr8   float64try_get_bufferget_sizerY   sizevarsr  r  rX   r  	enumeratechrr  r  r   getvalue)rK   r  codeidx_varsr  r  header_contentstotal_reduction_sizethreadgroup_sizeouterinnerr}  r   	outer_bufr  thread_pos_dtypethread_pos_var_namethread_pos_suffixidxr   s                       r:   codegen_kernelzMetalKernel.codegen_kernel?  s   7 	5NN9%%%%NN3444**,,[[]] Q	  Q	 7& 0"l G GFNN#E&#E#E#EFFFFG EI\   #1(^^*189DELL# #
 ///$ '+y 1 1%)%51 1 1 ( ($ ""6FF3C,d.GHHH2 !
 P;KPPP   NN:;;; ) )$(I$<$B$B$D$D D DLE5 444  $ 1 1!'2C2CE2J2J K KINN#BY#B#B%#B#B#BCCCC$(I$;$A$A$C$C 
F 
FLE5G--e44E--$%G$:$:5$A$A	$,	0B0B0D0D0J0J"./P"Q"QQ$+		$($5$5e$<$<	NN#Dy#D#DE#D#D#DEEEE$(I$6$<$<$>$> ? ?LE5NN#=U#=#=#=>>>>  ( Q QG!'-?? Q'O'O'O'OPPPP8}}q(((*Q(((.1(mma.?.?*3x==***V ! ),H(:(:HQK$$ $ ,0+@$HCCb!'nn*=nn[lnn   ( NN+ZZZ  O) ) ) ) ) ) ) ) ) ) ) ) ) ) )T NN5!!! ' 'x==1$$$-h$7$7  SMCHMMCc	NNMMM    D.///DI&&&' ' ' ' ' ' ' ' ' ' ' ' ' ' ' NN3cQ	  Q	  Q	  Q	  Q	  Q	  Q	  Q	  Q	  Q	  Q	  Q	  Q	  Q	  Q	 f 7 	#NN9%%%%NN6"""}}s]   E UI9Q UQ	UQ	,U BTUT	U!T	"UU
U
nodec                    t           j        j        } j        j                                        D ]}|                    |            j                                        \  }}}}d t          ||          D             g  j        j	                                         j        j
                                        } fd|D             }|d  j        j                                        D             z  }fd|D             } j        D ]}	t          |	j        t          j        t           f          r)t          |	j        t          j                  r|	j        }
n*t           j        j                            ||	          j        }
|	j        r j        r<|                    t/          |
                     |                    t                      t           j        j        r j        n j        dd
}t7                                                     dk    r[ fd                                 D             }|                     ||d                     |                    t:                     n t           j        j        rt=          d           j        r[ fd                                 D             }|                     ||d                     |                    t:                     n,t           j        j        r|dgz  }|                    d           |                    ||tA          j!        d          d|           dS )z0
        Codegens a call to this kernel
        c                4    i | ]\  }}t          |          |S r   r1   )r  call_argarg_types      r:   
<dictcomp>z+MetalKernel.call_kernel.<locals>.<dictcomp>  s1     
 
 
(:(CMM8
 
 
r<   c                &    g | ]}|j         v|S r   )r  )r  argrK   s     r:   r  z+MetalKernel.call_kernel.<locals>.<listcomp>  s&    GGGs$2F'F'F'F'F'Fr<   c                ,    g | ]}t          |          S r   r.  )r  r  s     r:   r  z+MetalKernel.call_kernel.<locals>.<listcomp>  s    ;;;AQ;;;r<   c                     g | ]
}|         S r   r   )r  r3  arg_name_to_types     r:   r  z+MetalKernel.call_kernel.<locals>.<listcomp>  s    ;;;s%c*;;;r<   threads	list[str]kwargr1   r0   c                    t           j        j        r%d | D             } dd                    |            dS | dd                    |            dS )Nc                    g | ]}d | d	S )zstatic_cast<uint64_t>(rD   r   r  s     r:   r  zCMetalKernel.call_kernel.<locals>.format_threads.<locals>.<listcomp>  s$    JJJQ8A888JJJr<   {rC   r  z=[rf  )r   ri  r  join)r7  r9  s     r:   format_threadsz/MetalKernel.call_kernel.<locals>.format_threads  sb    w" 9JJ'JJJ2DIIg..222288499W#5#58888r<   r   c                |    g | ]8} |j         rt          j        |j        j                  n|j                  9S r   r  r  r  r  r  r  r  expr_printerrK   s     r:   r  z+MetalKernel.call_kernel.<locals>.<listcomp>  s\         ~!EIagt'@AAA   r<   zWe should always have threads?c                r    g | ]3}|j         r( t          j        |j        j                            nd 4S )1r@  rA  s     r:   r  z+MetalKernel.call_kernel.<locals>.<listcomp>  sU         >UYqw0IJJKKK  r<   
group_sizeNrO  F)devicetriton	arg_types)r7  r8  r9  r1   r0   r1   )"r   ri  wrapper_coderH   r  keysensure_size_computedpython_argdefszipr  r  r  r7   r  r  r  r(   r  generate_numel_exprr$  r  rz  r  r1   r  cexprpexprrX   r  listrY   generate_kernel_callr8   rF  )rK   r  r+  wrapperr  _	call_argsrH  rH   treer?   r>  r7  r6  rB  s   `            @@r:   call_kernelzMetalKernel.call_kernel  s    '&#((** 	, 	,A((++++%)Y%=%=%?%?"9a
 
>A)Y>W>W
 
 
 S)..00R493J3O3O3Q3QRGGGGtGGG;;!3!8!8!:!:;;;;;;;;d;;;	 $ 	& 	&D$*u}c&:;; RDJ55 Rzw+??dKKQ$ &(= &CII&&&  %%%%&W%8Htzzdj	9 	9 	9 	9 t&&(())A--     0022  G KKw	::;;;T""""w" E"#CDDD  	'     0022	  G KKw==>>>T""""w" '   &&&$$<&& 	% 	
 	
 	
 	
 	
r<   r?   sizelowerupperc                    |s|sd S |                      |          }|r| dnd}|r| d|                      |           nd}|r|r
d| d| d}nd| | d}| j                            | j        |d	
           d S )Nz < 0r  z > zif ((z) && (z	)) returnr  z) returnF)
assignment)r   r   r   r   )	rK   r?   rX  rY  rZ  expr_str
lower_expr
upper_exprro  s	            r:   check_boundszMetalKernel.check_bounds  s      	 	F $$T***/7&&&&R
BGO>>T%6%6t%<%<>>>R
 	;U 	;B:BBZBBBDD:*:j:::D$,?????r<   )rW  rX  rY  r   r0   r"  )r   r   r0   r1   )r  r1   rc  r@   r0   r   r   )
r  r1   rc  r@   rq  r   rr  r!   r0   r"  )r  r1   rc  r@   rq  r   r0   r"  )r   r  r  r  r  r  r  r%   r   r  r0   r   )
r   r   r   r   r  r    rq  r  r0   r  )r  r   r0   r"  rL  )r  r  r0   r1   )r  r1   r+  r   r0   r"  )
r?   r@   rX  r@   rY  r%   rZ  r%   r0   r"  )&r   r   r   r   r   	overridesr  newvar_prefixr  r  r   rI   rP  r   rO  r>   r  kexprr
   r  __annotations__rV  r\  rb  rp  r  r  r   unknownr  r  r  r  r   r*  rW  r`  __classcell__r`  s   @r:   rQ  rQ    s        EEIFMOMOO#EJLL E&EE)z7)44G4444=?????- - - - - -% % % %@ @ @ @ SW< < < < <*8 8 8 8 %)'+##6;#6#8#8    ,    s2 s2 s2 s2j0O 0O 0O 0Od# # # #Jc c c c cJR
 R
 R
 R
 R
h@ @ @ @ @ @ @ @r<   rQ  c                  ,     e Zd ZeZd fdZddZ xZS )MetalScheduling	schedulerOptional[Scheduler]r0   r"  c                    t                                          |           t          j        j        }|-t          j        j        s|j                            d           d S d S d S )NzDfrom torch._inductor.runtime.runtime_utils import compile_mps_shader)r[  r\  r   ri  rI  r  r  r  )rK   rj  rS  r`  s      r:   r\  zMetalScheduling.__init__  sq    ###'&7& %%Z      r<   src_coder1   node_schedulelist[SchedulerNode]r   rQ  c                6   t           j        j        }||j        v r|j        |         }npd|                                 }| }||j        |<   t           j        j        rd| |z   }t          ||          \  }}| d| }	|                    |||	d           |S )Nmps_lib_z+at::native::mps::DynamicMetalShaderLibrary 
F)gpu)r   ri  rI  src_to_kernelnext_kernel_suffixr  r   define_kernel)
rK   rm  rn  r   rS  kernel_namemps_lib_nameoriginsdetailed_originsmetadata_comments
             r:   rv  zMetalScheduling.define_kernel  s     '&w,,,!/9KK Eg&@&@&B&BDDL)+K.9G!(+w" P,PP 
 )<M7(S(S%G%")??-=??!!,:JPU!VVVr<   )rj  rk  r0   r"  )rm  r1   rn  ro  r   rQ  r0   r1   )r   r   r   rQ  kernel_typer\  rv  rf  rg  s   @r:   ri  ri  
  sV        K            r<   ri  )r.   r/   r0   r1   )I
__future__r   rG  r]  loggingr  pathlibr   typingr   r   r   r  sympy.printing.precedencer   r8   torch.utils._cpp_embed_headersr	   torch.utils._ordered_setr
   torch.utils._sympy.printersr   r   ExprPrinter_torch.utils._sympy.value_rangesr   rU  r   r   r   virtualizedr   r   r   commonr   r   r   r   r   r   simdr   r   r   r   ops_handlerr    r!   rj  r"   r#   r$   	getLoggerr   r   r%   int8int16int32int64uint8r+   r,   rl  r   r;   r>   r   _initialize_pointwise_overridesrK  rQ  ri  r   r<   r:   <module>r     s[   # " " " " "                 / / / / / / / / / /  0 0 0 0 0 0  9 9 9 9 9 9 / / / / / / O O O O O O O O 7 7 7 7 7 7 G G G G G G G G G G , , , , , , , , , ,                C B B B B B B B B B  6666666644444444g!! 
J	J	K	K	K	K	K	J	NH
   Y# Y# Y# Y# Y#| Y# Y# Y#x] ] ] ] ][ ] ] ]@	  . .u 5 5 5  & & ( ( (A	@ A	@ A	@ A	@ A	@* A	@ A	@ A	@H$ $ $ $ $n $ $ $ $ $r<   