
    Pi//                        d dl Z d dlZd dlZd dlmZ d dlmZ  ed  e j	        g dd          D             g           Z
ej        j        j        dk    rd  e j	        g d	d          D             Z
d
 e
D             Z
ej        dej        dej        dej        dej        fd            Zej        ej        fdej        dej        dej        dej        dej        dej        fd            Zd Zd Zej                            dd          Ze                    d           e                    d           ej                            edd          d             Zej                            edd          d             Zej                            edd          d             Zej                            edd          d              ZdS )!    N)get_best_config_fnc                     g | ]b\  }}}|||d d f|||d df|||ddf|||d df|||ddf|||ddf|||ddf|||d df|||ddf|||ddf|||ddf|||ddf|||ddf|||ddf|||ddfgcS )                         ).0ijks       o/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/kernel/intmm_triton.py
<listcomp>r      s       & Q1! 1aO1aO1aO1aO1aO1aO1aO1aO1aO1aO1aO1aO1aO1aO1aO	
      )    @         r   )repeat
EXHAUSTIVEc           	      8    g | ]\  }}}d D ]}dD ]	}|||||f
S ))r   r   r   r   r
   r   r   r	   )r   r   r	   r   )r   BLOCK_MBLOCK_NBLOCK_K
num_stages	num_warpss         r   r   r   )   si       %GWg 3 
 "   
'7J	:    r   )   r   r   r   r   c           	      R    g | ]$\  }}}}}t          j        |||d d||          %S )r	   )r   r   r   GROUP_M)r   r    )tritonConfig)r   r   r   r   sws         r   r   r   C   sZ        	Aq!Q M!a@@    r   r   r   r   r#   c                 N   t          j        d          }t          j        ||          }t          j        ||          }||z  }||z  }||z  }t          ||z
  |          }|||z  z   }||z  |z  }t          j        | ||f||f||z  df||fd          }t          j        |||f||	fd||z  f||fd          }t          j        ||ft           j                  }t          d||          D ]t}t          j        |d          }t          j        |d          }|t          j	        ||          z  }t          j
        |d|f          }t          j
        ||df          }u|}t          j        |||f|
|f||z  ||z  f||fd          }t          j        ||d           dS )	zqKernel for computing the matmul C = A x B.
    A has shape (M, K), B has shape (K, N) and C has shape (M, N)
    r   )axis)r   r   )baseshapestridesoffsetsblock_shapeorderdtype)r   r   )boundary_checkN)tl
program_idcdivminmake_block_ptrzerosint32rangeloaddotadvancestore) a_ptrb_ptrc_ptrMNK	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnr   r   r   r#   pid	num_pid_m	num_pid_nnum_pid_in_groupgroup_idfirst_pid_mpid_mpid_na_block_ptrb_block_ptraccumulatorr   abcc_block_ptrs                                    r   !matmul_kernel_with_block_pointersrZ   M   s   @ -Q


C7##I7##I*&&HW$K)k)733G3=)E##/E #!fI&!$g&  K #!fI&EGO$g&  K (GW-RX>>>K1a!! < < GK777GK777rva||# jq'l;;jwl;;A
 #!fI&%'/2g&  K H[!F333333r   EVEN_KACC_TYPEc                    t          j        d          }||z   dz
  |z  }||z   dz
  |z  }||z  }||z  }t          |||z  z
  |          }||z  ||z  z   }||z  |z  }||z  t          j        d|          z   }||z  t          j        d|          z   }t          j        t          j        ||z  |          |          }t          j        t          j        ||z  |          |          } t          j        d|          }!| |d d d f         |z  |!d d d f         |z  z   z   }"||!d d d f         |	z  | d d d f         |
z  z   z   }#t          j        ||f|          }$t          |d|           D ]}%|r)t          j        |"          }&t          j        |#          }'nJt          j        |"|!d d d f         |%k     d          }&t          j        |#|!d d d f         |%k     d          }'|$t          j	        |&|'          z  }$|"||z  z  }"|#||	z  z  }#||z  t          j        d|          z   }||z  t          j        d|          z   }|d d d f         }(|d d d f         })|(|k     |)|k     z  }*|)||(z  z   }+t          j        |t          j
        |(|*j                  z   |*d          },t          j        |t          j
        |+|*j                  z   |$|,z  |*           d S )Nr   r   r0   g        )maskother
evict_last)eviction_policy)r3   r4   r6   arangemax_contiguousmultiple_ofr8   r:   r;   r<   broadcast_tor+   r>   )-r?   r@   rA   s1_ptrrB   rC   rD   rE   rF   rG   rH   rI   rJ   
stride_s1m
stride_s1nr   r   r   r#   r[   r\   rK   grid_mgrid_nwidthrO   
group_sizerQ   rR   rmrnramrbnrkABaccr   rV   rW   idx_midx_nr^   xindextmp0s-                                                r   (scaled_matmul_kernel_with_block_pointersry      s+   > -

C'kAo')F'kAo')F fEe|HVh00'::Jw#
"23E5[j)E	29Q00	0B	29Q00	0B

BN267;;W
E
EC

BN267;;W
E
EC	1g		BQQQW	)BtQQQwK),CCDAAAAtGy(3tQQQw<)+CCDA
(GW%X
6
6
6C1a'"" 	! 	! 	<

A

AA47as;;;A111d7as;;;Arva||	Wy  	Wy   
29Q00	0B	29Q00	0Bqqq$wKEtQQQwKEAI%!)$D a%i F7"/%445$  D
 HUbofdj99:C$JMMMMMr   c                    | j         \  }|j         \  }fd}t          |         | ||||                     d          |                     d          |                    d          |                    d          |                    d          |                    d          f|j        |j        |j        d|j         |S )Nc                 t    t          j        | d                   t          j        | d                   z  fS Nr   r   r$   r5   METArB   rC   s    r   <lambda>z#int_matmul_kernel.<locals>.<lambda>  2    AtI''&+ai*I*II r   r   r   )r    r   num_ctas)r+   rZ   strider    r   r   kwargs)rV   rW   rX   configrD   gridrB   rC   s         @@r   int_matmul_kernelr      s    7DAq7DAq    D &d+												 "$   -!  $ Hr   c                    | j         \  }|j         \  }fd}t          |         | |||||                     d          |                     d          |                    d          |                    d          |                    d          |                    d          |                    d          |                    d          f|j        |j        |j        |dz  dk    d|j         |S )Nc                 t    t          j        | d                   t          j        | d                   z  fS r|   r}   r~   s    r   r   z*int_scaled_matmul_kernel.<locals>.<lambda>!  r   r   r   r   r   )r    r   r   r[   )r+   ry   r   r    r   r   r   )	rV   rW   scales1rX   r   rD   r   rB   rC   s	          @@r   int_scaled_matmul_kernelr     s
   7DAq7DAq
    D -T2												qq  "$A
' ( -)  , Hr   torchaoFRAGMENTz(int_matmul(Tensor a, Tensor b) -> Tensorz?int_scaled_matmul(Tensor a, Tensor b, Tensor scales1) -> Tensor
int_matmulMetac                 z    | j         \  }}|j         \  }}t          j        ||f| j        t          j                  S Ndevicer1   )r+   torchemptyr   r9   )rV   rW   rB   rD   rC   s        r   int_matmul_metar   B  s7    7DAq7DAq;1vahekBBBBr   CUDAc                 R   | j         d         |j         d         k    s
J d            | j         \  }}|j         \  }}t          j        ||f| j        t          j                  }t          t          | ||gt                    }|t          j        g           S t          | |||          S Nr   r   zIncompatible dimensionsr   )	r+   r   r   r   r9   r   r   int8_mm_kernel_configstensor)rV   rW   rB   rD   rC   rX   best_configs          r   int_matmul_cudar   I  s     71:###%>### 7DAq7DAqQF185;???A$Aq!9&< K |BQ1k222r   int_scaled_matmulc                 p    | j         \  }}|j         \  }}t          j        ||f| j        |j                  S r   )r+   r   r   r   r1   )rV   rW   r   rB   rD   rC   s         r   int_scaled_matmul_metar   ]  s7    7DAq7DAq;1vahgmDDDDr   c                     | j         d         |j         d         k    s
J d            | j         \  }}|j         \  }}t          j        ||f| j        |j                  }t          t          | |||gt                    }t          | ||||          S r   )r+   r   r   r   r1   r   r   r   )rV   rW   r   rB   rD   rC   rX   r   s           r   int_scaled_matmul_cudar   d  s     71:###%>### 7DAq7DAqQF187=AAAA$ 1a!"46L K $Aq'1kBBBr   )	itertoolsr   r$   triton.languagelanguager3   torchao.kernel.autotunerr   sumproductr   	_inductorr   max_autotune_gemm_search_spacejit	constexprrZ   r9   ry   r   r   libraryLibrarylibdefineimplr   r   r   r   r   r   r   <module>r      s*               7 7 7 7 7 7  & +*+=+=+=aHHH'  * -  2 	?8LHH ):):"""1*
 *
 *
  4  2    _4& \'_4( \)_4* \+_4, \-_4 _4 _4 _4D 8  X7KN KN, \-KN. \/KN0 \1KN2 \3KN4 L5KN6 l7KN KN KN KN\  6     F mIz22 

5 6 6 6 

L M M M Cv..C C /.C Cv..3 3 /.3& C,f55E E 65E C,f55C C 65C C Cr   