
    `i                        d dl Z d dlZd dlmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ ddlmZ ddlmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0m1Z1 ddl2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ ddlAmBZBmCZCmDZDmEZE 	 d dlFZF eeFjG                  ZHdZIn# eJ$ r  ed          ZHdZIY nw xY w ejK        eL          ZMejN        jO        ZOejN        jP        ZP e6deDejQ        jR        eHdk    rdnddd          ZS e6d eEd!"          ZTd#ZUd$ZVd%ZW e6d&eEeWeUz   eVz   "          ZXe jY        d'             ZZ e4ej[        d(eOj[        j\        )          Z] e4ej^        d*eOj^        j\        )          Z_ e4ej`        d+eOj`        j\        )          Za e4ejb        d,deOjb        jc        -          Zd e4eje        d.eOje        j\        )          Zfd/ Zgdddd0d1ZhdOd3Zi e4ehd          Zjd4 Zk G d5 d6e%          Zl el            Zm G d7 d8e%          Znd9 Zod: Zp end;d<eo          Zq end=d>ep          Zr e1eOj[        d?          dd@dA            Zs e1eOj`        d?          dd@dB            Zt e1eOj^        d?          ddddCdD            Zu e1eOjb        d?          dddEdF            Zv e/eOje        jc        e0            e1eOje        jc        d?          	 	 	 	 	 dPdG            Zwe jY        dHeex         d2eyfdI            ZzdJ Z{	 	 dQdKeex         fdLZ|dM Z}dN Z~dS )R    N)AnyOptional)counters)AutoHeuristicSelectAlgorithm)	AHContextcontext_add_stridescontext_add_using_tf32mm_operations)CppGemmTemplate)gen_best_config)V)make_fx)TorchVersion   )config)CUTLASS2xGemmTemplateCUTLASS3xGemmTemplate)CKTileGemmTemplate)CKGemmTemplate)SubgraphChoiceCallerSubgraphTemplate)BufferChoiceCallerFlexibleLayout	is_tritonLayout)MMKernelInputs)add_layout_constraintconstrain_to_fx_stridesregister_lowering)autotune_select_algorithmExternKernelChoicerealize_inputsTritonTemplate)	_use_cutlass_for_opuse_aten_gemm_kernelsuse_ck_gemm_templateuse_ck_tile_gemm_templateuse_cpp_gemm_templateuse_cutlass_templateuse_decompose_k_choiceuse_triton_templateuse_triton_tma_template   )_is_static_problemmm_argsmm_gridpersistent_mm_gridTz0.0.0Fmmz3.3.0aK
  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)
    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if ((stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1)) and (M >= BLOCK_M and K > 1):
        offs_a_m = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        offs_a_m = rm % M
    if ((stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1)) and (N >= BLOCK_N and K > 1):
        offs_b_n = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        offs_b_n = rn % N
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    for k_idx in range(0, tl.cdiv(K, BLOCK_K)):
        {% if not EVEN_K %}
        a_mask = offs_k[None, :] < (K - k_idx * BLOCK_K)
        b_mask = offs_k[:, None] < (K - k_idx * BLOCK_K)
        {% endif %}
        a_k_idx_vals = offs_k[None, :] + (k_idx * BLOCK_K)
        b_k_idx_vals = offs_k[:, None] + (k_idx * BLOCK_K)

        idx_m = offs_a_m[:, None]
        idx_n = a_k_idx_vals
        {{load_input("A", "a", ("idx_m", "idx_n"), mask=None if EVEN_K else "a_mask", indent_width=8)}}

        idx_m = b_k_idx_vals
        idx_n = offs_b_n[None, :]
        {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}

        {% if USE_FAST_ACCUM %}
        acc = tl.dot(a, b, acc, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% else %}
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% endif %}

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
a
  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)
    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1):
        offs_a_m = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        offs_a_m = rm % M
    if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1):
        offs_b_n = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        offs_b_n = rn % N
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    for k_idx in range(0, tl.cdiv(K, BLOCK_K)):
        {% if not EVEN_K %}
        a_mask = offs_k[None, :] < (K - k_idx * BLOCK_K)
        b_mask = offs_k[:, None] < (K - k_idx * BLOCK_K)
        {% endif %}
        a_k_idx_vals = offs_k[None, :] + (k_idx * BLOCK_K)
        b_k_idx_vals = offs_k[:, None] + (k_idx * BLOCK_K)

        idx_m = offs_a_m[:, None]
        idx_n = a_k_idx_vals
        {{load_input("A", "a", ("idx_m", "idx_n"), mask=None if EVEN_K else "a_mask", indent_width=8)}}

        idx_m = b_k_idx_vals
        idx_n = offs_b_n[None, :]
        {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}
        {% if USE_FAST_ACCUM %}
        acc = tl.dot(a, b, acc, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% else %}
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% endif %}

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
)namegridsource"cache_codegen_enabled_for_templateprologue_loads_all_inputsmm_persistent_tmaa  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return

    start_pid = tl.program_id(0)
    grid_m = tl.cdiv(M, BLOCK_M)
    grid_n = tl.cdiv(N, BLOCK_N)
    k_tiles = tl.cdiv(K, BLOCK_K)
    num_tiles = grid_m * grid_n
    tiles_per_SM = num_tiles // NUM_SMS
    if start_pid < num_tiles % NUM_SMS:
        tiles_per_SM += 1

    tile_id = start_pid - NUM_SMS
    ki = -1

    width = GROUP_M * grid_n
    rk_for_mask = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    {%- if TMA_EXPERIMENTAL_API %}
    workspace_base = ws_ptr + start_pid * 2 * TMA_SIZE
    a_desc_ptr = workspace_base
    b_desc_ptr = workspace_base + TMA_SIZE

    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=a_desc_ptr,
        global_address=A,
        load_size=[BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
        global_size=[M, K] if A_ROW_MAJOR else [K, M],
        element_ty=A.dtype.element_ty,
    )
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=b_desc_ptr,
        global_address=B,
        load_size=[BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
        global_size=[K, N] if B_ROW_MAJOR else [N, K],
        element_ty=B.dtype.element_ty,
    )

    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(a_desc_ptr)
    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(b_desc_ptr)

    {%- else %}
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}
    a_desc = triton.language.make_tensor_descriptor(
        base=A,
        shape=[M, K] if A_ROW_MAJOR else [K, M],
        strides=[stride_am, 1] if A_ROW_MAJOR else [stride_ak, 1],
        block_shape=[BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
    )
    b_desc = triton.language.make_tensor_descriptor(
        base=B,
        shape=[K, N] if B_ROW_MAJOR else [N, K],
        strides=[stride_bk, 1] if B_ROW_MAJOR else [stride_bn, 1],
        block_shape=[BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
    )
    {%- endif %}

    pid_m = 0
    pid_n = 0
    rm = 0
    rn = 0

    for _ in range(0, k_tiles * tiles_per_SM):
        ki = tl.where(ki == k_tiles - 1, 0, ki + 1)
        if ki == 0:
            tile_id += NUM_SMS
            # re-order program ID for better L2 performance
            group_id = tile_id // width
            group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
            pid_m = group_id * GROUP_M + (tile_id % group_size)
            pid_n = (tile_id % width) // (group_size)

            rm = pid_m * BLOCK_M
            rn = pid_n * BLOCK_N

        rk = ki * BLOCK_K

        {%- if TMA_EXPERIMENTAL_API %}
        a = tl._experimental_descriptor_load(
            a_desc_ptr,
            [rm, rk] if A_ROW_MAJOR else [rk, rm],
            [BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
            A.dtype.element_ty,
        )
        b = tl._experimental_descriptor_load(
            b_desc_ptr,
            [rk, rn] if B_ROW_MAJOR else [rn, rk],
            [BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
            B.dtype.element_ty,
        )
        {%- else %}
        a = tl.load_tensor_descriptor(
            a_desc,
            [rm, rk] if A_ROW_MAJOR else [rk, rm],
        )
        b = tl.load_tensor_descriptor(
            b_desc,
            [rk, rn] if B_ROW_MAJOR else [rn, rk],
        )
        {%- endif %}
        acc += tl.dot(
            a if A_ROW_MAJOR else a.T,
            b if B_ROW_MAJOR else b.T,
            allow_tf32=ALLOW_TF32,
        )

        if ki == k_tiles - 1:
            # rematerialize rm and rn to save registers
            rcm = rm + tl.arange(0, BLOCK_M)
            rcn = rn + tl.arange(0, BLOCK_N)
            idx_m = rcm[:, None]
            idx_n = rcn[None, :]
            mask = (idx_m < M) & (idx_n < N)

            # inductor generates a suffix
            {{store_output(("idx_m", "idx_n"), "acc", "mask", indent_width=12)}}
            acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

)r4   r5   r6   a  
@triton.jit
def load_scales(a_scale_ptr, b_scale_ptr, SCALING_ROWWISE: tl.constexpr):
    if SCALING_ROWWISE:
        # For row-wise scaling, we'll return the pointers
        return a_scale_ptr, b_scale_ptr
    else:
        # For per-tensor scaling, we'll load the scalar values
        a_scale = tl.load(a_scale_ptr)
        b_scale = tl.load(b_scale_ptr)
        return a_scale, b_scale
a'  
@triton.jit
def apply_scaling(
    accumulator,
    a_scale,
    b_scale,
    SCALING_ROWWISE: tl.constexpr,
    offs_cm,
    offs_cn,
    M,
    N,
    stride_a_scale_m,
    stride_b_scale_n,
):
    if SCALING_ROWWISE:
        # For row-wise scaling, we need to load the scales for each row/column
        a_scales = tl.load(
            a_scale + (offs_cm * stride_a_scale_m),
            mask=offs_cm < M,
            other=0.0,
        )
        b_scales = tl.load(
            b_scale + (offs_cn * stride_b_scale_n),
            mask=offs_cn < N,
            other=0.0,
        )
        acc_scale = a_scales[:, None] * b_scales[None, :]
    else:
        # For per-tensor scaling, we can directly use the loaded scalar values
        acc_scale = a_scale * b_scale

    return accumulator * acc_scale
a  
{{def_kernel("A", "B", "A_inverse_scale", "B_inverse_scale")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return

    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    if SCALING_ROWWISE:
        stride_a_scale_m = 1
        stride_b_scale_n = 1
    else:
        stride_a_scale_m = 0
        stride_b_scale_n = 0

    start_pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_M)
    num_pid_n = tl.cdiv(N, BLOCK_N)
    k_tiles = tl.cdiv(K, BLOCK_K)
    num_tiles = num_pid_m * num_pid_n

    {%- if TMA_EXPERIMENTAL_API %}
    workspace_base = ws_ptr + start_pid * 2 * TMA_SIZE
    a_desc_ptr = workspace_base
    b_desc_ptr = workspace_base + TMA_SIZE

    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=a_desc_ptr,
        global_address=A,
        load_size=[BLOCK_M, BLOCK_K],
        global_size=[M, K],
        element_ty=A.dtype.element_ty,
    )
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=b_desc_ptr,
        global_address=B,
        load_size=[BLOCK_N, BLOCK_K],
        global_size=[N, K],
        element_ty=B.dtype.element_ty,
    )

    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(a_desc_ptr)
    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(b_desc_ptr)

    {%- else %}
    stride_am = {{stride("A", 0)}}
    stride_bn = {{stride("B", 1)}}
    a_desc = triton.language.make_tensor_descriptor(
        base=A,
        shape=[M, K],
        strides=[stride_am, 1],
        block_shape=[BLOCK_M, BLOCK_K],
    )
    b_desc = triton.language.make_tensor_descriptor(
        base=B,
        shape=[N, K],
        strides=[stride_bn, 1],
        block_shape=[BLOCK_N, BLOCK_K],
    )
    {%- endif %}

    tiles_per_SM = num_tiles // NUM_SMS
    if start_pid < num_tiles % NUM_SMS:
        tiles_per_SM += 1

    tile_id = start_pid - NUM_SMS
    ki = -1

    pid_m = 0
    pid_n = 0
    offs_am = 0
    offs_bn = 0

    num_pid_in_group = GROUP_M * num_pid_n
    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    a_scale, b_scale = load_scales(A_inverse_scale, B_inverse_scale, SCALING_ROWWISE)

    for _ in range(0, k_tiles * tiles_per_SM):
        ki = tl.where(ki == k_tiles - 1, 0, ki + 1)
        if ki == 0:
            tile_id += NUM_SMS
            group_id = tile_id // num_pid_in_group
            first_pid_m = group_id * GROUP_M
            group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
            pid_m = first_pid_m + (tile_id % group_size_m)
            pid_n = (tile_id % num_pid_in_group) // group_size_m

            offs_am = pid_m * BLOCK_M
            offs_bn = pid_n * BLOCK_N

        offs_k = ki * BLOCK_K

        {%- if TMA_EXPERIMENTAL_API %}
        a = tl._experimental_descriptor_load(
            a_desc_ptr, [offs_am, offs_k], [BLOCK_M, BLOCK_K],  A.dtype.element_ty
        )
        b = tl._experimental_descriptor_load(
            b_desc_ptr, [offs_bn, offs_k], [BLOCK_N, BLOCK_K],  B.dtype.element_ty
        )
        {%- else %}
        a = tl.load_tensor_descriptor(a_desc, [offs_am, offs_k])
        b = tl.load_tensor_descriptor(b_desc, [offs_bn, offs_k])
        {%- endif %}
        if USE_FAST_ACCUM:
            accumulator = tl.dot(a, b.T, accumulator)
        else:
            accumulator += tl.dot(a, b.T)

        if ki == k_tiles - 1:
            # Apply inverse scaling
            offs_cm = offs_am + tl.arange(0, BLOCK_M)
            offs_cn = offs_bn + tl.arange(0, BLOCK_N)
            # Apply scaling
            accumulator = apply_scaling(
                accumulator,
                a_scale,
                b_scale,
                SCALING_ROWWISE,
                offs_cm,
                offs_cn,
                M,
                N,
                stride_a_scale_m,
                stride_b_scale_n,
            )

            idx_m = offs_cm[:, None]
            idx_n = offs_cn[None, :]
            mask = (idx_m < M) & (idx_n < N)
            # inductor generates a suffix
            {{store_output(("idx_m", "idx_n"), "accumulator", "mask", indent_width=12)}}
            accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
scaled_mm_device_tmac                      t          |           S N)r"   )fns    m/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_inductor/kernel/mm.pylazy_register_extern_choicer?   &  s    b!!!    z
at::mm_out)op_overloadzat::addmm_outzat::_int_mm_outzat::_sparse_semi_structured_mm)has_out_variantrA   zat::_scaled_mm_outc                 Z    |                                  t          j        t          j        fv S r<   )	get_dtypetorchint8uint8)mats    r>   _is_int8_matrI   A  s    ==??uz5;777r@   outalphabetac                   |                      d          dk    r|                     d          dk    s|                     d          dk    r t          j        | d         |||||          S t          j        | |||||          S )z
    Giving torch.addmm a 1D tensor calls a different (faster) cublasLt
    kernel under the hood.  There are a few shapes where this is slower,
    but they are rare.
    r   r.   rJ   )stridesizerE   addmm)inpmat1mat2rK   rL   rM   s         r>   
bias_addmmrU   E  s     	

1sxx{{a//CHHQKK14D4D{3q643e$OOOO;sD$Cu4HHHHr@   returnc                     dt           fd}dt           fd}dt           fd}t          j         |                                           p |                                            fd           t          j         |                                          p |                                          fd           d S )NrV   c                 X    t           j        j                            | d         d          S )Nr.   r   graphsizevarsstatically_known_equalsrO   s    r>   is_row_majorz.check_supported_striding.<locals>.is_row_majorQ  !    w77q	1EEEr@   c                 X    t           j        j                            | d         d          S Nr   r.   rY   r]   s    r>   is_col_majorz.check_supported_striding.<locals>.is_col_majorT  r_   r@   c                     t          t          j        j                            | d         d          p*t          j        j                            | d         d                    S ra   )boolr   rZ   r[   r\   )rP   s    r>   has_zero_dimz.check_supported_striding.<locals>.has_zero_dimW  sR    G44T!Wa@@ Dw77QCC
 
 	
r@   c                  2    d                                   S )Nz$mat_a must be row_major, got stride 
get_stride)mat_as   r>   <lambda>z*check_supported_striding.<locals>.<lambda>`      Ku7G7G7I7IKK r@   c                  2    d                                   S )Nz$mat_b must be col_major, got stride rg   )mat_bs   r>   rj   z*check_supported_striding.<locals>.<lambda>f  rk   r@   )rd   rE   _checkrh   get_size)ri   rm   r^   rb   re   s   ``   r>   check_supported_stridingrp   P  s   F F F F FF F F F F
d 
 
 
 
 
LU%%''((JLL9I9I,J,JKKKK   
LU%%''((JLL9I9I,J,JKKKK    r@   c                    | j         d         }|j         d         }| j         d         }||z  }|}t          j        |                     |||          d          }|                    |||          }	t          j        ||	t          j                  }
t          j        |
d          }|                    | j                  S )Nr   r.   )r.   r   r   	out_dtype)	shaperE   permutereshapebmmfloat32sumtodtype)abk_splitsmnkk_partsB
a_reshaped
b_reshapedresultreduced_bufs               r>   
decomposeKr   m  s    	
A	
A	
A8mGAqyyAw77CCJ1gq))JYz:GGGF)FA&&K>>!'"""r@   c                   F     e Zd Z fdZdee         dededef fdZ	 xZ
S )DecomposeKSugraphTemplatec                 L    t                                          d           d S )Ndecompose_kr4   )super__init__)self	__class__s    r>   r   z"DecomposeKSugraphTemplate.__init__|  s1     	 	
 	
 	
 	
 	
r@   input_nodeslayoutk_splitrV   c                 ,   ddl m} ddlm} d| d}d|} |            5   |            }t	          t          j        t          |          |          }	t                      	                    ||||	|	          cd d d            S # 1 swxY w Y   d S )
Nr   enable_python_dispatcherr   select_decomp_tabledecompose_k_mm__splitzk_split=)r~   r4   r   r   make_fx_graphdescription)
torch._dispatch.pythonr   decompositionr   r   	functoolspartialr   r   generate)r   r   r   r   r   r   r4   r   decompositionsr=   r   s             r>   r   z"DecomposeKSugraphTemplate.generate  s    	DCCCCC7777770000#mm%%'' 	 	0022N!*w??? B
 77##' ' $  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   AB		BB)__name__
__module____qualname__r   listr   r   intr   r   __classcell__r   s   @r>   r   r   {  s        
 
 
 
 

&\  	
 
         r@   r   c                   P     e Zd Zdededef fdZdee         dede	f fdZ
 xZS )	ContiguousTemplater4   r   r=   c                 v    || _         || _        || _        t                                          |           d S )Nr   )r4   r   r=   r   r   )r   r4   r   r=   r   s       r>   r   zContiguousTemplate.__init__  sG    	& 	 	
 	
 	
 	
 	
r@   r   r   rV   c                    ddl m} ddlm}  |            5   |            }t	          | j        |          }t                                          | j        |||| j	                  cd d d            S # 1 swxY w Y   d S )Nr   r   r   r   r   )
r   r   r   r   r   r=   r   r   r4   r   )r   r   r   r   r   r   r=   r   s          r>   r   zContiguousTemplate.generate  s    
 	DCCCCC777777%%'' 	 	0022N B
 77##Y'  , $  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   AA44A8;A8)r   r   r   strr   r   r   r   r   r   r   r   r   s   @r>   r   r     s        
S 
s 
 
 
 
 
 
 
&\  
	         r@   r   c                 P    t          j        | |                                          S r<   )rE   r3   
contiguous)r|   r}   s     r>   contiguous_mmr     s    8Aq||~~&&&r@   c                 R    t          j        | ||                                          S r<   )rE   rQ   r   )rR   r|   r}   s      r>   contiguous_addmmr     s    ;sAq||~~...r@   r   zcontiguous mmr   zcontiguous addmm)type_promotion_kindr   c                j
   t          | ||          \  }}}}} }t          |          \  }}d}t          | |g          }	t          d         d| d| d| xx         dz  cc<   t                              d||||                                 |                                |           |}
t          j        s-t          j	        s!t          |j        |j        |j                  }
g }t                      r;|                    t           j                            |	|
t&          gd                     t          |          \  }}|rt)          |d	
          r|                    t           j                            |	|t*          gd                     t-          | |          r;|                    t           j                            |	|t.          gd                     t1          |||          r;|                    t           j                            |	|t2          gd                     |                    t           j                            |	|t4          gd                     |rIt7          ||||          r7t9          d          r(t;          j        |||	                                           |r:tA          ||||          r(tC          j"        |||	                                           |r:tG          ||||          r(tI          j%        |||	                                           tM          || |          r(tO          j%        |||	                                           | |g}|r+t)          |          rtP          j)        j*        +                    |          rtY          |           rg }t                      r|-                    d           t]          |          }|                    t           j                            |	|t*          gd                     t_          | |||||||ta                      dd|          tP          j)        j*        1                    |          s."t]                    dk    rfd|D             }n
|d|         }t          j2        D ]J}|-                    tg          |          4                    |	                                |                     Kd}tP          j)        j*        j5        rtm          | |          }to          |||	                                ||          S )z_
    Lowering for autotuning aten.mm with different backends (Aten, Triton, CUTLASS, etc.)
    r   r3   aten_mm_infozaten.mm__r.   zOTuned aten.mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sdevicer{   rP   Fcheck_max_autotune	extern_mmzmm-ahN
   )top_kalways_includedr   c                     g | ]}|v |	S  r   ).0choice
ah_choicess     r>   
<listcomp>ztuned_mm.<locals>.<listcomp>S  s#    PPPf6Z;O;O6;O;O;Or@   )best_config_future)8r0   r/   r   r   loginforD   inductor_configmax_autotunemax_autotune_gemmr   r   r{   rP   r&   extendr   choicesget_mm_configsaten_mmr,   mm_templater-   persistent_tma_mm_templater+   decompose_k_subgraph_templatemm_contiguous_subgraph_templater*   r%   r   add_cutlass_gemm_choicesnodesr'   r   add_ck_gemm_choicesr(   r   add_choicesr)   r   rE   	_inductorr   run_autoheuristicr   appendlenmm_autoheuristicr
   collect_autoheuristicexternal_matmulr?   bindremote_gemm_autotune_cacher   r!   )rS   rT   r   r   r   r   static_shape
is_nonzeror4   kernel_inputsaten_layoutr   r   r    num_choices_before_extra_configsr   r   s                   @r>   tuned_mmr     s    #*$V"D"D"DAq!VT41&99L*D #D$<00M ^333A33334449444HHY			   K( 
O,M 
$=6;
 
 
 #%G 
I$$]K'DQQ	
 	
 	
  2&99L* 
)&UKKK 
I$$]F[M4PP	
 	
 	
 #4.. 	NN	((!6,F+G    "!Q** 	NN	((!6,I+JD   
 	I$$v(G'H$ 	
 	
 	
 	
 Aq11
  %%

 	6V]0022	
 	
 	
  S*61a;; S*7FM<O<O<Q<QRRR O/1a@@ O&w8K8K8M8MNNNVT400 
#!!	
 	
 	
 ,K.E''.E O"44T::.E dOO	.E  "" 	0"";///+.w<<(I$$  
	
 
	
 
	
 &OO+
 
 

 %;;DAA 		E%#j//A*=*=
 QPPPPPP!"C#C"CD, 
 
'**//0C0C0E0EvNN	
 	
 	
 	
  8 9,T488$-   r@   c          	         t          | ||t          j                  \  }}}}} }d}t          d         d| d| d| xx         dz  cc<   t                              d||||                                 |                                |           t          |          \  }}|o|ot          ||||          }	g }
t          | |g          }t                      r;|
                    t          j                            ||t          g|                     |	r:t!          |          r+t#          j        |
||                                dd	           |rMt)          |dd
          r;|
                    t          j                            ||t*          g|                     t-          ||
|                                |          S )Nr   rs   int_mmr   zaten._int_mm_r   r.   zTTuned aten._int_mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sTfuseablenon_fuseableF)enable_int32r   )r0   rE   int32r   r   r   rD   r/   r*   r   r&   r   r   r   r   aten__int_mmr%   r   r   r   r,   r   r!   )rS   rT   r   r   r   r   r4   r   r   use_cutlassr   r   s               r>   tuned_int_mmr   k  s    #*d6U[# # #Aq!VT4 D^8Q8888Q88999Q>999HH^			    2&99L*W:W2FvqRSUV2W2WK"$G #D$<00M 
I$$	 	
 	
 	
  
*400 
6V]0022TPT	
 	
 	
 	
  
)Te   
 	I$$]F[M4PP	
 	
 	
 %T7M4G4G4I4I6RRRr@   )rL   rM   r   c          	         t          ||| |          \  }}}}}}}	t          |          \  }
}d}t          |	||gt          ||                    }g }t          d         d| d| d| xx         dz  cc<   t
                              d	||||                                |                                |           |}|rt          j	        st          j
        sd
dlm}m} t          ||          r ||j        |j        |j                  }t          | ||gt          ||                    }|                    t&          j                            ||t,          g|                     t/          |||                                |          S t3                      rv|                    t&          j                            ||t4          g|                     |                    t&          j                            ||t,          g|                     |rt7          |d          r|                    t&          j                            ||t8          g|                     t;          ||          r;|                    t&          j                            ||t<          g|                     |                    t&          j                            ||t>          gd                     |rPtA          ||||          r>tC          |          r/tE          j#        |||                    g d          ||           |rDtI          ||||          r2tK          j&        |||                    g d          ||g d           tO          |||          r,tQ          j)        |||                                ||d           t/          |||                                |          S )zb
    Lowering for autotuning aten.addmm with different backends (Aten, Triton, CUTLASS, etc.)
    r   rQ   )rL   rM   )scalarsr   zaten.addmm_r   r.   zRTuned aten.addmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr   )FixedLayoutr   r   Fr   )r.   r   r   )reorder)r   r   r.   )rL   rM   input_reorderT)rL   rM   has_bias)*r0   r/   r   dictr   r   r   rD   r   r   r   torch._inductor.irr   r   
isinstancer   r{   rP   r   r   r   r   
aten_addmmr!   r   r&   aten_bias_addmmr,   r   r-   r   "addmm_contiguous_subgraph_templater*   r%   r   r   r'   r   r   r)   r   r   )rR   rS   rT   rL   rM   r   r   r   r   inp_expandedr   r   r4   r   r   r   r   r   s                     r>   tuned_addmmr    s    18dCPV0W0W0W-Aq!VT41&99L*D"	tT"Du4,H,H,H  M #%G ^6166q661667771<777HH\			   K W)W-<-NW
 	CBBBBBBBfk** 	(.}FLv{  K '$t%d'C'C'C
 
 
 	I$$	 	
 	
 	
 )w8K8K8M8MvVVV 
I$$ !	 	
 	
 	
 	I$$	 	
 	
 	
  
)&UKKK 
 	I$$	 	
 	
 	
 #4.. 		NN	((!/0	    	I$$34	 	
 	
 	
 	
 Aq11
  %%

 	6 			22	
 	
 	
 	
  

*61a;; 

* 			22#))		
 		
 		
 		
 VT400 
#!!	
 	
 	
 	
 %T7M4G4G4I4I6RRRr@   )rs   r   c                
   ddl m}  || ||          \  } }}|                                 \  }}|                                \  }}	|                                \  }
}t          j        j                            ||          }t          j        j                            d|z  |
          }|Addlm}  ||	                                |r|n|
                                ||g|dg          }n|
J d            t                      r!t                              | ||f||          gng }||z  dk    r=t          ||||          r+t          d          rt!          j        ||| ||gd	d	
           t%          d|| ||f|          S )Nr   )r#   r   )r   r.   z,out_dtype is ignored if layout is specified.rr   sparse_semi_structured_mmTr   ) torch._inductor.select_algorithmr#   ro   r   rZ   r[   check_equals_and_simplifyr   r   
get_devicerD   r&   aten__sparse_semi_structured_mmr   r*   r%   r   r   r!   )rS   	mat1_metarT   rs   r   r#   m1k1m2r   k2r   r   r   r   r   s                   r>   tuned_sparse_semi_structured_mmr  -  s    @????? +N4DAAD)T]]__FB  EBMMOOEB	222r::A	221r62>>A~222222OO"8II(8(8FF	
 
   "P    !""	+00y$'9 1  	
 	
   	
A

 Aq11 	 ;<< 	 	6VdD)4tRV	
 	
 	
 	
 %#WtY.Ev  r@   c	                    t          | |||          \  }	}
}}} }t          d         d|	 d|
 d| xx         dz  cc<   t                              d|	|
||                                 |                                |           d}t          | |           t          ||          \  }}|s| |||g}nt          |          }| ||||g}t          |dd	          }g }t                      rX|	                    t          j                            ||t          g|t          j        t          ||
          i                     |j        t"          j        k    rt'          ||||          S t)          |          \  }}|rt+          |dd          rt          |          }t-          | |          rK|sI|	                    t          j                            ||t.          g|t.          j        |i                     |	                    t          j                            ||t0          g|t0          j        |i                     |rKt3          ||	|
|          r9t5          |          r*t7          j        |||                                |           |r:t=          ||	|
|          r(t?          j         |||                                           t'          |||                                |          S )a9  
    Performs an optimized matrix multiplication where scaling factors are applied
    to the inputs and/or output.

    Args:
        mat1 (Tensor): First input matrix
        mat2 (Tensor): Second input matrix
        scale1 (Tensor): Scale factor applied to mat1 (supports broadcasting)
        scale2 (Tensor): Scale factor applied to mat2 (supports broadcasting)
        bias (Tensor, optional): Optional bias tensor to add to the result
        layout: Layout hint for optimization

    Returns:
        Tensor: The result of the scaled matrix multiplication
    r   r   zaten._scaled_mm.default_r   r.   z_Tuned aten._scaled_mm.default: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s	scaled_mmr   )mat1_idxmat2_idx)rs   use_fast_accum)kwarg_overridesTF)enable_float8r   )USE_FAST_ACCUM)r  )!r0   r   r   r   rD   rp   r#   r   r&   r   r   r   r   aten__fp8_mmuidr   r{   rE   rx   r!   r/   r,   r-   scaled_mm_device_tma_templater   r*   r%   r   r   r   r'   r   r   )ri   rm   scale_ascale_bbiasscale_resultrs   r  r   r   r   r   r4   scale_a_realscale_b_realr   	bias_realr   r   r   r   
overriderss                         r>   tuned_scaled_mmr"  a  sz   8 %,uVy% % %!Aq!VUE ^CCCACCCCDDDIDDDHHi			   DUE***!/!A!AL,  Le\<@"4((	e\<K #;QGGGM"$G 
I$$ $d"+N' ' '! % 
 
	
 	
 	
 }%%(wVLLL&v..MAz 
)du   
 888
 #5%00 
	 
	NN	((!23%B%F
$S )     	I$$!,* = %  	
 	
 	
 	

 Aq11

  %%


 	6!!)		
 	
 	
 	
  S*61a;; S*7FM<O<O<Q<QRRR$T7M4G4G4I4I6RRRr@   indexc                 Z    t           j                            | pd          }|j        dk    S )Nr      )rE   cudaget_device_propertiesmajor)r#  propss     r>   _is_sm7x_or_older_gpur*    s(    J,,UZa88E;!r@   c                 4    t          d | D                       S )Nc              3   @   K   | ]}t          |t                    V  d S r<   )r   r   )r   dims     r>   	<genexpr>zdims_are_int.<locals>.<genexpr>  s,      44z#s##444444r@   )all)dimss    r>   dims_are_intr1    s    44t444444r@   r   c           	      8   t          | ||||          \  }}}t          |||g          sd S t          | |          \  }}fd}d } ||||| |||          }t          ||||||	          }|
|                    |
|          S |                                S )Nc                 p   t                      }|                    d|            |                    d|           |                    d|           |                    d|j        j        d           |                    d|j        j        d           t	          |d|           t	          |d	|           |                    d
|j                                        d           |                    d|j                                        d           dk    rt          ||j        j                   |S )Nr   r   r   
mat1_dtypeT)is_categorical
mat2_dtyperS   rT   mat1_iscontigmat2_iscontigr3   )r   add_featurer   r{   r   is_contiguousr	   )	r   r   r   rS   rT   mat1_stridemat2_stridecontextr4   s	           r>   get_contextz%mm_autoheuristic.<locals>.get_context  s=   ++C###C###C###L$+*;DQQQL$+*;DQQQGV[999GV[999T[6688 	 	
 	
 	
 	T[6688 	 	
 	
 	
 4<<"7DK,=>>>r@   c                      d S r<   r   r   r@   r>   fallbackz"mm_autoheuristic.<locals>.fallback  s    tr@   )r@  r   r   r=  r4   augment_contextprecondition)r   )get_size_hintsr1  get_size_hints_stridesr   get_top_k_choices_callerget_choice_caller)rS   rT   r   r   r   r   r4   r   opsrB  r   r   r;  r<  r>  r@  r=  autoheuristics         `           r>   r   r     s     T4Aq11GAq!Aq	"" t5dDAAK    &   k!Q4{KHHG0!  M 55? 6 
 
 	
 **,,,r@   c                    t          |t                    rt          |t                    sOt          j        j                            |                                 t          j        j	        j
                  \  }}t          |t                    rt          |t                    sOt          j        j                            |                                t          j        j	        j
                  \  }}|||fS )Nr@  )r   r   r   rZ   r[   
size_hintsro   rE   r   r   unbacked_symint_fallback)rS   rT   r   r   r   s        r>   rC  rC  '  s    a 
Z3%7%7 
!,,MMOO_+D - 
 
A
 a 
Z3%7%7 
!,,MMOO_+D - 
 
A a7Nr@   c                 0   | j         j        }|j         j        }||g}g }|D ]f}t          |t                    s:t          j        j                            |t          j	        j
        j                  }|                    |           g|d         |d         fS )NrJ  r   r.   )r   rO   r   r   r   rZ   r[   rK  rE   r   r   rL  r   )rS   rT   r;  r<  stridesstrides_hintsrO   s          r>   rD  rD  6  s    +$K+$KK(GM % %&#&& 	W%00/H 1  F 	V$$$$]1---r@   )rV   N)NNNFN)NN)r   loggingtypingr   r   rE   torch._dynamo.utilsr   +torch._inductor.autoheuristic.autoheuristicr   1torch._inductor.autoheuristic.autoheuristic_utilsr   r   r	   r
   )torch._inductor.codegen.cpp_gemm_templater   *torch._inductor.remote_gemm_autotune_cacher   torch._inductor.virtualizedr   "torch.fx.experimental.proxy_tensorr   torch.torch_versionr    r   r   codegen.cuda.gemm_templater   r   ,codegen.rocm.ck_tile_universal_gemm_templater   'codegen.rocm.ck_universal_gemm_templater   codegen.subgraphr   r   irr   r   r   r   r   r   r   loweringr   r   r    select_algorithmr!   r"   r#   r$   utilsr%   r&   r'   r(   r)   r*   r+   r,   r-   	mm_commonr/   r0   r1   r2   triton__version__triton_version
has_tritonImportError	getLoggerr   r   rG  atenprimsversionhipr   r   load_scalesapply_scaling
device_tmar  cacher?   r3   rK   r   rQ   r   _int_mmr   _sparse_semi_structured_mmdefaultr  
_scaled_mmr  rI   rU   rp   r   r   r   r   r   r   r   r   r   r   r   r  r  r"  r   rd   r*  r1  r   rC  rD  r   r@   r>   <module>rv     s                         ( ( ( ( ( ( T T T T T T            F E E E E E F F F F F F ) ) ) ) ) ) 6 6 6 6 6 6 , , , , , , ( ( ( ( ( ( U U U U U U U U M M M M M M D D D D D D E E E E E E E E H H H H H H H H H H H H H H * * * * * * X X X X X X X X X X           
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 P O O O O O O O O O O OMMM!\&"455NJJ   !\'**NJJJ g!!y~	n		T M%.G*C*CQG	 G	XFP (,"sZ Z Zx ,^		@D D D L FJ
Z !/		#m3! ! !  " " " 
UX|
M
M
M	Kdjn  
 "!	M$$,2B   #5"4	$$/7	# # #  "!	*8K  
8 8 8 (,11 I I I I I   4 %$Z66# # #          0      F !: 9 ; ;     )   D' ' '/ / / #5"4_m# #  &8%7*,<& & "
 47555#' T T T T 65Tn 4<T:::'+ .S .S .S .S ;:.Sb 4:4888*+!D LS LS LS LS 98LS^ 42MMM(,T- - - - NM-`  do-/F G G G 4?*EEE 
{S {S {S FE{S| # 4    
5 5 5  :- :- C=:- :- :- :-z  . . . . .s   :C C&%C&