
    `i)R                       d dl mZ d dlZd dlmZmZmZ d dlZd dlm	Z	 ddl
mZmZ ddlmZmZmZmZ ddlmZmZmZmZ dd	lmZmZmZmZmZmZ dd
lm Z  erd dl!m"Z" ddlm#Z#  ej$        e%          Z&ej'        j(        Z(ed             Z)ed             Z*dZ+	  ede)de+z   dz   e+z   dz             Z,dZ- ede*de-z   dz   e-z   dz             Z. eej/        dde(j/        j0                  Z1d Z2 ee2d          Z3 G d de          Z4d7d1Z5d2 Z6d3 Z7 ee(j/                  d8d4            Z/ ee(j8                  d5             Z8d6 Z9 ee(j/        e9           dS )9    )annotationsN)OptionalTYPE_CHECKING	TypedDict)CKGroupedConvFwdTemplate   )configir)add_layout_constraintconstrain_to_fx_strides	loweringsregister_lowering)autotune_select_algorithmExternKernelChoiceSymbolicGridFnTritonTemplate)is_onesis_zerospad_listlikesympy_productuse_ck_conv_templateuse_triton_template)V)Sequence)	TensorBoxc               d     || |z  |z  |d                    |||d                   |d         fS NBLOCK_MBLOCK_NGROUPS )nchwmetacdivs         o/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_inductor/kernel/conv.pyconv2d_gridr)   -   sC     	QUQYY((QY  X     c               j     || |z  |z  |z  |d                    |||d                   |d         fS r   r!   )r"   r#   dr$   r%   r&   r'   s          r(   conv3d_gridr-   6   sG     	QUQY]DO,,QY  X r*   a  
        idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
        idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
        idx_x_c = tl.arange(0, BLOCK_K) + k

        x_ptrs = x_base + (
            (idx_x_h * stride_xh)[:, None]
            + (idx_x_w * stride_xw)[:, None]
            + (idx_x_c * stride_xc)[None, :]
        )
        mask_x = (
            (idx_n < BATCH)[:, None]
            & (idx_x_h >= 0)[:, None]
            & (idx_x_h < IN_H)[:, None]
            & (idx_x_w >= 0)[:, None]
            & (idx_x_w < IN_W)[:, None]
            & (idx_x_c < GROUP_IN_C)[None, :]
        )
        matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)

        w_ptrs = w_base + (
            (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww)
        )
        mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
        matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
        acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
convolution2dag  
{{def_kernel("X", "W")}}
    # Tensor dimensions
    BATCH = {{size("X", 0)}}
    IN_C = {{size("X", 1)}}
    IN_H = {{size("X", 2)}}
    IN_W = {{size("X", 3)}}
    OUT_C = {{size(None, 1)}}
    OUT_H = {{size(None, 2)}}
    OUT_W = {{size(None, 3)}}

    # Strides:
    stride_xn = {{stride("X", 0)}}
    stride_xc = {{stride("X", 1)}}
    stride_xh = {{stride("X", 2)}}
    stride_xw = {{stride("X", 3)}}
    stride_wc_out = {{stride("W", 0)}}
    stride_wc_in = {{stride("W", 1)}}
    stride_wh = {{stride("W", 2)}}
    stride_ww = {{stride("W", 3)}}

    nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
    idx_y_w = nhw % OUT_W
    nh = nhw // OUT_W
    idx_y_h = nh % OUT_H
    idx_n = nh // OUT_H
    idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)

{% if GROUPS == 1 %}
    group = 0
    GROUP_IN_C = IN_C
    GROUP_OUT_C = OUT_C
{% else %}
    group = tl.program_id(2)
    GROUP_IN_C = IN_C // GROUPS
    GROUP_OUT_C = OUT_C // GROUPS
{% endif %}

    x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
    w_base = (
        W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
    )

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

{% if UNROLL %}
{% for i in range(KERNEL_H) %}
{% for j in range(KERNEL_W) %}
    i = {{i}}
    j = {{j}}
    for k in range(0, GROUP_IN_C, BLOCK_K):
        a  
{% endfor %}
{% endfor %}
{% else %}
    # Could be simplified, but slightly slower:
    # for i in range(KERNEL_H):
    #     for j in range(KERNEL_W):
    #         for k in range(0, GROUP_IN_C, BLOCK_K):
    BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
    for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
        k = (ijk % BLOCK_K_COUNT) * BLOCK_K
        ij = ijk // BLOCK_K_COUNT
        i = ij // KERNEL_W
        j = ij % KERNEL_W
        a  
{% endif %}

    mask = (
        (idx_n < BATCH)[:, None]
        & (idx_y_h < OUT_H)[:, None]
        & (idx_y_w < OUT_W)[:, None]
        & (idx_y_c < GROUP_OUT_C)[None, :]
    )
    idx_n = idx_n[:, None]
    idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
    idx_h = idx_y_h[:, None]
    idx_w = idx_y_w[:, None]

    # inductor generates a suffix
    {{store_output(("idx_n", "idx_c", "idx_h", "idx_w"), "acc", "mask")}}
)namegridsourcea  
        idx_x_d = d - PADDING_D + idx_y_d * STRIDE_D
        idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
        idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
        idx_x_c = tl.arange(0, BLOCK_K) + k

        x_ptrs = x_base + (
            (idx_x_d * stride_xd)[:, None]
            + (idx_x_h * stride_xh)[:, None]
            + (idx_x_w * stride_xw)[:, None]
            + (idx_x_c * stride_xc)[None, :]
        )
        mask_x = (
            (idx_n < BATCH)[:, None]
            & (idx_x_d >= 0)[:, None]
            & (idx_x_d < IN_D)[:, None]
            & (idx_x_h >= 0)[:, None]
            & (idx_x_h < IN_H)[:, None]
            & (idx_x_w >= 0)[:, None]
            & (idx_x_w < IN_W)[:, None]
            & (idx_x_c < GROUP_IN_C)[None, :]
        )
        matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)

        w_ptrs = w_base + (
            (idx_x_c * stride_wc_in)[:, None] +
            (d * stride_wd) + (i * stride_wh) + (j * stride_ww)
        )
        mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
        matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
        acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
convolution3daH  
{{def_kernel("X", "W")}}
    # Tensor dimensions
    BATCH = {{size("X", 0)}}
    IN_C = {{size("X", 1)}}
    IN_D = {{size("X", 2)}}
    IN_H = {{size("X", 3)}}
    IN_W = {{size("X", 4)}}
    OUT_C = {{size(None, 1)}}
    OUT_D = {{size(None, 2)}}
    OUT_H = {{size(None, 3)}}
    OUT_W = {{size(None, 4)}}

    # Strides:
    stride_xn = {{stride("X", 0)}}
    stride_xc = {{stride("X", 1)}}
    stride_xd = {{stride("X", 2)}}
    stride_xh = {{stride("X", 3)}}
    stride_xw = {{stride("X", 4)}}
    stride_wc_out = {{stride("W", 0)}}
    stride_wc_in = {{stride("W", 1)}}
    stride_wd = {{stride("W", 2)}}
    stride_wh = {{stride("W", 3)}}
    stride_ww = {{stride("W", 4)}}

    ndhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
    idx_y_w = ndhw % OUT_W
    ndh = ndhw // OUT_W
    idx_y_h = ndh % OUT_H
    nd = ndh // OUT_H
    idx_y_d = nd % OUT_D
    idx_n = nd // OUT_D
    idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)

{% if GROUPS == 1 %}
    group = 0
    GROUP_IN_C = IN_C
    GROUP_OUT_C = OUT_C
{% else %}
    group = tl.program_id(2)
    GROUP_IN_C = IN_C // GROUPS
    GROUP_OUT_C = OUT_C // GROUPS
{% endif %}

    x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
    w_base = (
        W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
    )

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

{% if UNROLL %}
{% for d in range(KERNEL_D) %}
{% for i in range(KERNEL_H) %}
{% for j in range(KERNEL_W) %}
    d = {{d}}
    i = {{i}}
    j = {{j}}
    for k in range(0, GROUP_IN_C, BLOCK_K):
        aF  
{% endfor %}
{% endfor %}
{% endfor %}
{% else %}
    # Could be simplified, but slightly slower:
    # for d in range(KERNEL_D):
    #   for i in range(KERNEL_H):
    #     for j in range(KERNEL_W):
    #         for k in range(0, GROUP_IN_C, BLOCK_K):
    BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
    for dijk in range(KERNEL_D * KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
        k = (dijk % BLOCK_K_COUNT) * BLOCK_K
        dij = dijk // BLOCK_K_COUNT
        j = dij % KERNEL_W
        di = dij // KERNEL_W
        i = di % KERNEL_H
        d = di // KERNEL_H
        a  
{% endif %}

    mask = (
        (idx_n < BATCH)[:, None]
        & (idx_y_d < OUT_D)[:, None]
        & (idx_y_h < OUT_H)[:, None]
        & (idx_y_w < OUT_W)[:, None]
        & (idx_y_c < GROUP_OUT_C)[None, :]
    )
    idx_n = idx_n[:, None]
    idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
    idx_d = idx_y_d[:, None]
    idx_h = idx_y_h[:, None]
    idx_w = idx_y_w[:, None]

    # inductor generates a suffix
    {{store_output(("idx_n", "idx_c", "idx_d", "idx_h", "idx_w"), "acc", "mask")}}
zat::convolutionF)has_out_variantop_overloadc          
         t          j        t          j        |d          d          }t          j        |                     dddd          |                    dd          |                    dddd                    S )Nr   r         )out)torchsqueezematmulpermute)xr%   r9   s      r(   conv1x1_via_mmr?   M  so    emAr**B//A<			!Q1qyyACKK1a4K4K   r*   c                  L    e Zd ZU ded<   ded<   ded<   ded<   ded<   ded	<   d
S )ConvLayoutParamstuple[int, ...]stridepaddingdilationbool
transposedoutput_paddingintgroupsN)__name__
__module____qualname____annotations__r!   r*   r(   rA   rA   W  sU         ####KKKKKr*   rA   r>   r   weightbiasOptional[TensorBox]rC   Sequence[int]rD   rB   rE   rG   rF   rH   rJ   rI   return	ir.Layoutc	                B   t           j        j        5  t          j        j                            t          j        | d          t          j        |d          t          j        |d          t           j        j	        
                    |          t           j        j	        
                    |          t           j        j	        
                    |          |t           j        j	        
                    |          |	  	        }	t          j        |	                                          }
t          j        |	                                          }ddd           n# 1 swxY w Y   t          j        |                                 |                                 |
|          S )z)Determine output layout for a convolutionT)guard_shapeN)r   graph	fake_moder:   opsatenconvolutionr
   ir_node_to_tensorsizevars
size_hintsconvert_shape_to_inductorsizerC   FixedLayoutget_device_or_error	get_dtype)r>   rO   rP   rC   rD   rE   rG   rH   rJ   outputsizess              r(   conv_layoutrf   `  s|    
	 ? ?++ 555 T::: 4888G''//G''00G''11G''77

 

 ,V[[]];;-fmmoo>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ? >			  s   D=EE"Ec                    t          t          t          |                               }|                    d|                    d                     |S )Nr8   r6   )listreversedrangeinsertpop)rankorders     r(   channels_last_orderro     s@    %++&&''E	LLEIIbMM"""Lr*   c                <   t          |                                          }t          |dz
            D ]#}t          t          j                 |d          }$t          t          j                 |ddg          }t          j        	                    | t          |                    } t          t          |                    }|                    |                    d                     t          t          j                 | |          } |                                 ^ }}t          t          j                 | t          |          |g          } |!t          t          j                 | |          }n!t          t          j                 || |          }t          t          j                 |g |d          }t          t          |                    }	|	                    d|	                    d                     t          t          j                 ||	          S )Nr   r6   dimr8   r   )lenget_sizerj   LrZ   r;   r=   r
   ExternKernelrequire_stride_orderro   rh   appendrl   reshaper   mmaddmmrk   )
r>   rO   rP   rm   _	x_permutere   in_chanresultresult_permutes
             r(   convert_1x1_conv_to_mmr     s   v  !!D4!8__ 1 14<R000t|_VaV,,F
,,Q0CD0I0IJJAU4[[!!IY]]1%%&&&	$,9%%AjjllOUG	$,M%00':;;A|47Av&&4:tQ//t|_V\u\b\22F%++&&N!^//33444T\?6>222r*   c	                    t          |          }t          |          }t          |          }t          |          }t          |t                    s$t          j        j                            |          }t          |t                    sJ t          t          j        j                            |                    }t          t          j        j                            |                    }||||||dt          j	                   }	t                                                     t                                                    dz
  k    rat          t          j                 t          t          t          j                  dg                                           |fi d          S t          j        j                                                                      ^}
}}t                                                     dk    rt          |          dk    r|	dk    r                    d|z   d|z   d|z   d|z   d	           t          t          j                  d
           t          t          j                 d
          t          t          j                 t           |fi d
          S t          |          t'          |          }t'          |          }t'          |          }t'          |          } fd}t(          j        pt(          j        }t(          j        s|r |            rt1          |          rt1          |          rt3          |          r{t1          |          rl|sjt3          |          r[|dk    rUt          j        j                            t7                                                     d          rt9           |          S |s|	dk    rmt           d fi }t          t          j                 |t          t          j                 ||                                d         gdgz  z                       S                                                                    t          j        j         rnd
k    rht          j        xj!        dz  c_!        t          j"        #                                t          j"        #                              tI           d fi }ntI           d fi }t          j%        t          j        j        &                    |j'                            }t          j"        (                     |           t          j"        (                    |          g d}|  g}d d<   |)                    dd           nc |g}|                                 |*                                 t          j        j                            |                                           g }tV          j,        j-        .                    d          rt_          j0        |||fi g}tV          j,        j-        .                    d          rrtc          |          rbt1          |          rR|sOt3          |          r?t          j        j        2                    |                                 d                   rt1          |          rRt1          |          rCt3          |          r4|dk    r.|3                    th          0                    ||                     t          j5        6                    |	          } |t7                                           d         g                                 d
d                    |
|          D ]0}d
k    rwto          j8        |f f||d         |d         |d         |d         |d         |d         |t1          |          tV          j9        j:        j;        |j<        |j=        d|j>         dk    rt          j8        |fi d fd|d|d         d|d         d|d
         d|d         d|d         d|d
         d|d         d|d         d|d
         d|dt1          |          dtV          j9        j:        j;        d|j<        d |j=        |j>         2t          |          r2t          jB        || f||fnt                      z   ||||!           t          d"|||          S )#N)rC   rD   rE   rG   rH   rJ   r8   r   rq   r7   xpu)r8   )r   )rC   rD   rE   rH   r   c                     t           j        j        rdk    rdS t          d fi } t	          j        t           j        j                            | j                            }|t          j	        k    S )Nr   T)
r   rW   
layout_optrf   r
   get_stride_orderr]   r^   rC   NHWC_STRIDE_ORDER)layoutreq_stride_orderkwargsndimrO   r>   s     r(   channels_last_convz'convolution.<locals>.channels_last_conv  sq    7 	$!))4Q7777.G''66
 
  2#777r*   cpurP   ATENTRITON)input_nodesr   KERNEL_HKERNEL_WSTRIDE_HSTRIDE_W	PADDING_H	PADDING_Wr    UNROLL
ALLOW_TF32
num_stages	num_warpsr   r   KERNEL_Dr   r   STRIDE_Dr   r   	PADDING_Dr   r   r    r   r   r   r   )r   rC   rD   rE   rJ   n_spatial_dimensionsr[   )Dtuple
isinstancerI   r   rW   r]   	guard_intguard_int_seqr
   get_device_typers   rt   ru   rZ   r;   r[   expandupdate	unsqueezer   r	   max_autotunemax_autotune_gemmconv_1x1_as_mmr   r   statically_known_gtr   r   addviewrealizer   num_channels_last_convrv   require_channels_lastrf   r   r^   rC   rw   rk   freeze_layoutr:   	_inductorutils_use_conv_autotune_backendaten_convolutionbindr   statically_known_equalsrx   aten_conv1x1_via_mmchoicesget_conv_configsconv2d_templatemaybe_append_choicebackendscudnn
allow_tf32r   r   r   conv3d_templater   r   add_ck_conv_choicesr   )r>   rO   rP   rC   rD   rE   rG   rH   rJ   device_typeout_chanr~   kernel_shaper   autotuning_gemmr   r   r   ordered_kwargs_for_cpp_kernelargsr   conv_configscfgr   r   s   ``                     @@r(   r[   r[     s	    6]]FGnnGXH>**Nfc"" 4!++F33fc""""" 17#11&99::FAG$227;;<<G  (   F $Q''K
1::<<C 1 122Q666$+q1*<qzz||*<==vtVVvVV
 
 
 	

 ()w'7'E'EfooFWFW'X'X$Hg
 1::<<A#l"3"3q"8"8[E=Q=Q-'> 8O"&"7	 	
 	
 	
 dnaQ'''4>"6q1116422622
 
 
 	

 |D&$''F7D))GHd++H!.$77N8 8 8 8 8 8 8 8 )EV-EO 
	7#277I7I7K7K7L!!7 FOO7 W	7
 H7 7 ^$$7 aKKG00qzz||1L1LaPP  &a666K500Q7777{AdiL(9(9!(<'=s
'JKK
 
 	
 IIKKK
NN
 	w Pdaii	&&!+&&O11!44 66v>>Q7777Q7777.G''66
 
 O004DEE55f>NOO% % %! |6{v%,,Q777764 	&&t}}777G77?? 
!-  	 
 	88BBC''C H	C
 C ^$$C G44Wajjll1oNNC L!!	C	C !!	C !NN.33D&AABBBy11+>><1::<<?>QZZ\\!""-=>??
 
 /	 /	C
 qyy3!"F!)!_)!_#AY#AY%aj%aj! #<00$~3>"~!m! " j#   & 3  !"F "6 *!__	
 *!__ *!__ $AYY $AYY $AYY &ajj &ajj &ajj "6  #<000!"  %~3>>#$  #~~%& "mmj)  , F## 

 4F$2BwwP!%		
 		
 		
 		
 %]GT6JJJr*   c                0    t          | ||||||||	  	        S N)r[   )r>   rO   rP   rC   rD   rE   rG   rH   rJ   	benchmarkdeterministiccudnn_enabledr   s                r(   _convolutionr     s*      	64(JPV  r*   c                    | j         t          j        j        j        j        k    sJ t          j        j        r||fS t          | g|R i |S r   )
targetr:   rY   rZ   r[   defaultr   rW   r   r   )fx_noder   r   s      r(   constrain_conv_to_fx_stridesr     sU    >UY^7?????w AV|&w@@@@@@@r*   )r>   r   rO   r   rP   rQ   rC   rR   rD   rB   rE   rB   rG   rF   rH   rB   rJ   rI   rS   rT   )r>   r   rO   r   rP   rQ   rC   rR   rD   rR   rE   rR   rG   rF   rH   rR   rJ   rI   ):
__future__r   loggingtypingr   r   r   r:   -torch._inductor.codegen.rocm.ck_conv_templater    r	   r
   loweringr   r   r   ru   r   select_algorithmr   r   r   r   r   r   r   r   r   r   r   virtualizedr   collections.abcr   r   	getLoggerrK   logrY   rZ   r)   r-   LOOP_BODY_2Dr   LOOP_BODY_3Dr   r[   r   r   r?   r   rA   rf   ro   r   r   r   r!   r*   r(   <module>r      s   " " " " " "  5 5 5 5 5 5 5 5 5 5  R R R R R R                                                     ((((((g!! y~      8
 !.		3h i4jkCH IDJKUY Y YvB !.		;x y<z{O` aPbccg g gR &%	 (	      )(>>     y          F  3 3 3. 4#$$lK lK lK %$lK^ 4$%%  &%(A A A  d&(D E E E E Er*   