
    PiT              !          d dl Z d dlmZ d dlZd dlmZ d dlmZmZm	Z	m
Z
mZ d dlmZmZmZ d dlmZ  e j        dd          Zd	d	dddddddd
	dZ	 	 	 	 	 	 	 	 d)dZd	d	ddddddddej        dej        dej        deej                 deej                 deej                 dedeeee         ee         ee         f                  dee         fdZ e            rd dlZd dlmZ ej         dej!        dej!        dej!        dej!        dej!        dej!        dej!        dej!        d ej!        d!ej!        d"ej!        d#ej!        d$ej!        d%ej!        d&ej!        d'ej!        f d(            Z"dS dZ"dS )*    N)Optional)	warn_once)broadcast_batch_dimslaunch_kernelprepare_inputsptr_stride_extractortile_to_blocksize)get_metaminimizeupdate)
has_tritonBSR_AUTOTUNEF   )	betaalpha
left_alpharight_alphaoutstoreverboseforceopnamec       	           #$ ddl $|d}|j        d         }|                                }|                                }|                                dz
  }|j        ||dz            \  }}|j        |dz   |dz            \  }}t          dddt          ||z  d          	          }t          d|                                |z  |z  ||z  z  z
  d          }|j	        }||}n|j	        }||u r|}n||f}d||f}|||||dk    dk    |dk    f}t          |||d
          }|d}t          ||d|dfd
          }||}n|
s|S d
}| ||||f$fd	}|||||fd}t          ||||d|	          \  #}} }!|	rt          d|! d|dd| dd           |rc|r
#|k    r||usWt          j                                        }"t!          ||"||t#          #fdt%          #          D                                  #S )zTune bsr_dense_addmm kernel parameters against the given inputs.

    When store is True, the tuning results will be stored in the
    database of kernel parameters.
    r   Nbsr_dense_addmmr            )GROUP_SIZE_ROW
num_stages	num_warpsSPLIT_NT)versionexactF      ?c           	      d      	f	d}
j                             |dd          S )Nc                  4   	 t           	  	        S )N)r   r   r   r   metar   )r   )	r   r   bsrdenseinputr   r(   r   r   s	   q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/kernel/bsr_triton_ops.py	test_funcz6tune_bsr_dense_addmm.<locals>.bench.<locals>.test_func_   s5    "%'
 
 
 
    i  d   )warmuprep)testingdo_bench)r(   r+   r)   r*   r   r   r-   r   r   r   tritons   `````` r,   benchz#tune_bsr_dense_addmm.<locals>.bench^   si    	 	 	 	 	 	 	 	 	 	 	 	 	 ~&&y#&FFFr.   c	                    | dv }	t          dddd          |          }
t          t          ||z  d                                        |           }t          dddd          |          }|	r$|dk    r|||z  z  n||t          |          z  z  }n|||z  z   }|
t          ||
          }|t	          ||          }| dk    r||z  dk    r|S |S )N>   r"   r!   r   )r"   r!   r    r   )r"   r   r   r"   )dictmaxgetabsmin)namevalue	directionr(   MNKBMBKis_log	min_value	max_value
value_step
next_values                 r,   step_meta_parameterz1tune_bsr_dense_addmm.<locals>.step_meta_parametero   s    11aAaPPPQUV	Q"Wa11155d;;	!qQqQQQRVW
 	8 q== 
I---zS^^;< J i!77J Z33J Z33J9Z1!4!4Lr.   )max_stepr   z-> z
, speedup=z.1fz %, timing=z.3fz msc              3   (   K   | ]}|         V  d S N ).0kr(   s     r,   	<genexpr>z'tune_bsr_dense_addmm.<locals>.<genexpr>   s'      00a$q'000000r.   )r4   shapevaluescrow_indicesdimr7   r8   round_nnzdtyper
   r   printtorchcudaget_device_namer   tuplesorted)%r+   r)   r*   r   r   r   r   r   r   r   r   r   r@   rR   rS   
batch_ndimr?   rA   rB   rC   reference_metasparsityrW   	out_dtypeversion_dtyper#   keyinitial_metamay_skip_updater5   rI   speeduptimingsensitivity_messagedevice_namer(   r4   s%      ` ``                            @@r,   tune_bsr_dense_addmmrj      s   ( MMM~"BAZZ\\F##%%L!!##a'J9Z*q.01DAq\*q.:>9:FB Q!Sb!__  N
 Qb2-Q77;;HIE
{		I	E	*-*GaBDAItqy%1*
=C FCEEELa_DQQQ)L   SU G G G G G G G G G" =>aBSU    2 2:2 2 2.D'6.  OM'MM7MMMVMMMMNNN 



 L00\5W5Wj002200006$<<00000	
 	
 	
 Kr.   c                    |t           j        }||}|d}||	|
|hd hk    rt           j                                        }| |||||dk    |dk    |dk    f}||u r|}n||f}t	          d|||||f          }||dk    rt	          d||||df          }|||urt	          d||||df          }|t	          dg |d d         d|dd          R |||df          }|1||ur-t	          dg |d d         d|dd          R |||df          }t          |pi           D ]E}||         }|d         }|d	         }||z  }||z  dk    r||k    rt          |          }||z  |d	<   F| |j        di | |S t          d
| d|d|d|d|d|d|d|d|d           |pt          ||z  d          }|pd}|
pd}
|	pd}	t          d|||
|	d|S )Nr%   r   r   r   )r#   r   *r   r"   z@bsr_dense_addmm uses non-optimal triton kernel parameters for M=z K=z N=z Ms=z, Ks=z beta=z alpha=z dtype=z out_dtype=zC. To find optimal triton kernel parameters, run with BSR_AUTOTUNE=1r   )r"   r   r    r!   rM   )
rY   float16rZ   r[   r
   r]   r7   r   r   r8   )r?   rA   r@   MsKsr   r   r"   r   r!   r    r`   rW   ra   _versionextrari   rc   rb   r(   matching_metamkeymeta_nsplit_ncs                             r,   bsr_dense_addmm_metarx      sj   * }	J7D6AAj0022!QB	419eqjAI!MM!9,M}h7	
 
 
 <HOO!!=#6	  D <E22!3hs=S  D <$!)#bqb')3)QRR))!=#6	  M $i)?)? (%-c"1"g-s-SW--%uc2	! ! ! }233 - -%d+G	*Lq5A::!q&&;;D&'1fDODK  %   KTT TT T"#T T')T T.0T T48T T<AT TEJT TNWT T T   (Q"WaG#(qNqJQI %	 
   r.   )r   r   r   r   r   skip_checksmax_gridr(   r+   r)   r*   r   r   r   ry   rz   r(   c                 
  
 !"#$% d}|                                 }|                                }|                                }|                                dz
  }|j        ||dz            \  }}|j        |dz   |dz            }|j        d         }t          |||          }||                    |||fz             }|                                dk    sdk    s|dk    s|dk    s|dk    rMdk    r|                                 n0|	                    |            dk    r|
                               |S 
t          d|                                |d         z  |d         z  ||z  z  z
  d          }t          rt          | |||||dd	dd
          
n/t          ||||d         |d         ||j        |j        
  
        
d	$d	%|'d$ |                    d          j        g |||R  }n   |j        g ||dR  j        g |||R  }|'d% |                    d          j        g |||R  }n   |j        g |d|R  j        g |||R  }|                                d         dk    sJ |                                d         dk    sJ |}t'          || ||||          \  }}}} }}}}|\  ! 
                    dt+          |!z  d                    }||z  "|}t-          |!"f          }t-          | "f          }t-          | !"f          } t-          |!"f          }t-          |!"f          }t.          j        t2          j        t.          j        t2          j        t.          j        t2          j        t.          j        t2          j        t.          j        t2          j        t.          j        t2          j        i|j                 #|                    d          }|                    d          dz
  }|                    d          }|||f}|	?tA          |	dd         ddd                   ddtC          |	dd                   z
  z  z   }nd}|d|d|d| d|d|d|d|di}dk    sJ  !"#$
%f	d}tE          ||||           |#                                |#                                k    r-|	                    |                    |j                             |S )a  Compute

      out = beta * input + left_alpha.reshape(-1, 1) * (alpha * (bsr @ dense)) * right_alpha.reshape(1, -1)

    where left_alpha, right_alpha are (* + 1)-D tensors when
    specified, otherwise, these are treated as tensors filled with
    ones.
    r   r   r   r   r   Nr   TF)	r   r   r   r   r   r   r   r   r   )r`   rW   ra   rM   r"   rL   )r   NN)r   Nr   )r   r}   )r   r}   Nc                    	 t          |          g t          | R dk    dk    dk    
t          j        k    d
	 d S )Nr   r   )
beta_is_onebeta_is_nonzeroalpha_is_oneleft_alpha_is_oneright_alpha_is_oneBLOCKSIZE_ROWBLOCKSIZE_INNERBLOCKSIZE_COL
allow_tf32	acc_dtype)_bsr_strided_addmm_kernelr   tlfloat32)gridsliced_tensorsrC   rB   BNr   r   dot_out_dtyper   r(   r   s     r,   kernelzbsr_dense_addmm.<locals>.kernel  s    !$' 	
!>2	
	
 	
 	
 	 AI!/1$
2#	
 	
 	
 	
 	
 	
 	
r.   )$rR   rS   col_indicesrT   rQ   r   	new_emptyrV   zero_copy_mul_rU   AUTOTUNErj   rx   rW   expandviewstrider   r9   r8   r	   rY   rm   r   r   bfloat16float64int8int32sizer\   lenr   data_ptr)&r+   r)   r*   r   r   r   r   r   ry   rz   r(   f_namerR   rS   r   r^   r?   rA   	blocksizer@   original_batch_dims_broadcastedr`   
out_backupr"   out_untiled	n_batchesn_block_rowsn_block_cols	full_gridgrid_blockstensor_dims_mapr   rC   rB   r   r   r   r   s&      ``     `                     @@@@@@r,   r   r     s   , FZZ\\F##%%L//##K!!##a'J9Z*q.01DAqZ!^j1n<=IBA&:63&N&N#
{oo=AFGG
xxzzQ%1**Q!q&&AFF199IIKKKKIIeqyy
|SXXZZ)A,61EQOOQRSS 	'%'(  DD (!!!k)  D  /U__R((/ 
,
./
12
 
 


 T_Z_L&ELqL!LLLS 
,
./
12
 
 

 !0eoob))0 
,
./
12
 
 
 V&k&N(GNNANNNU 
,
./
12
 
 
 r"a''''#q((((J 	sE5*k3GG	 FBhhy#a2gq//22G	
gBK
C"b
*
*Ceb"X..Eeb"X..E":Bx88J#K"b::K 	rz
rzrz
BHRX 
iM 

1I$$R((1,L::b>>LL,7IHRaRL2.//'QXbqb\ARAR=R2SS 	m_{}K[[	O A::::
 
 
 
 
 
 
 
 
 
 
 
 
$ &/9kBBB
||~~,,.... 	))**:;;<<<r.   left_alpha_tiled_col_strideleft_alpha_col_block_strideright_alpha_tiled_row_strideright_alpha_row_block_strider   r   r   r   r   r   r   r   r   r   r   r"   c7                    |dk    sJ |dk    sJ |dk    sJ |!dk    sJ t          j        d          }7t          j        d          }8t          j        d          }9t          j        d          }:t          j        d          };t          j        |8|9|:|;|5          \  }8}9|||7z  z   ||8z  z   }<t          j        |<          }=t          j        |<|z             }>|>|=z
  }?t          j        d|0          }@t          j        d|2          }A|1dk     s	|1dz  dk    rd}Bn|1}Bt          j        d|B          }C| ||7z  z   ||=z  z   ||@d d d f         z  z   ||Ad d d f         z  z   }D|||7z  z   ||9z  z   ||Ad d d f         z  z   ||Cd d d f         z  z   }E|#|$|7z  z   |%|8z  z   |&|9z  z   |'|@d d d f         z  z   |(|Cd d d f         z  z   }F||	|7z  z   |
|=z  z   }Gt          j        |0|Bf|3          }Ht          |?          D ]y}It          j        |D          }Jt          j        |G          }Kt          j        |E||Kz  z   |Cd d d f         |1k               }L|Ht          j        |J|L|4|3          z  }H|D|z  }D|G|
z  }Gz|-s|H|*z  }H|.sK|||7z  z   ||8z  z   ||9z  z   ||@d d d f         z  z   ||Cd d d f         z  z   }M|Ht          j        |M          z  }H|/sK|||7z  z   ||8z  z   | |9z  z   |!|@d d d f         z  z   |"|Cd d d f         z  z   }N|Ht          j        |N          z  }H|,rh|||7z  z   ||8z  z   ||9z  z   ||@d d d f         z  z   ||Cd d d f         z  z   }O|+r|Ht          j        |O          z  }Hn|H|)t          j        |O          z  z  }Ht          j	        |F|H
                    |#j        j                  |Cd d d f         |1k                d S )	Nr   r   )axisr      )rW   )mask)r   ra   )r   
program_idnum_programs	swizzle2dloadarangezerosrangedotr   torW   
element_ty)P
values_ptrvalues_batch_stridevalues_nnz_stridevalues_row_block_stridevalues_col_block_stridecrow_indices_ptrcrow_indices_batch_stridecrow_indices_stridecol_indices_ptrcol_indices_batch_stridecol_indices_stride	input_ptrinput_batch_strideinput_tiled_row_strideinput_tiled_col_strideinput_row_block_strideinput_col_block_stride	dense_ptrdense_batch_stridedense_tiled_row_stridedense_tiled_col_stridedense_row_block_stridedense_col_block_strideleft_alpha_ptrleft_alpha_batch_strideleft_alpha_tiled_row_strider   left_alpha_row_block_strider   right_alpha_ptrright_alpha_batch_strider   right_alpha_tiled_col_strider   right_alpha_col_block_stride
output_ptroutput_batch_strideoutput_tiled_row_strideoutput_tiled_col_strideoutput_row_block_strideoutput_col_block_strider   r   r   r   r   r   r   r   r   r   r   r   r   r"   	batch_pidrow_block_pidcol_block_pidr   r   crow_indices_offset_ptr
nnz_offsetnnz_offset_nextrow_nnzrow_block_arangeinner_block_arangePADDED_BLOCKSIZE_COLcol_block_arangevalues_block_ptrsdense_block_ptrsoutput_ptrscol_index_nnz_ptroutput_acc_block_values_blockdense_row_idxdense_blockleft_alpha_ptrsright_alpha_ptrs
input_ptrssP                                                                                   r,   r   r     s   V +a////*a////+q0000+q0000Mq)))	1---1---A...A...')|=,n(
 (
$}
 ')34!M12 	 
 W455
'"9<O"OPP "J.9Q66Yq/::2!3q!8!813  1> 9Q(<== !I-.*,- &(8D(AAB &(:47(CC	D 	  9,-$}45 %'9!!!T''BBC %'7aaa'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8qqq(AAB 	 &23 :-. 	 801
 
 
 w 	4 	4A7#455L G$566M' #9M#II%dAAAg.>  K kjI! ! ! 
 !22!33 	&%  		9)I56-=> .=> .0@D0II	J
 .0@qqq0IIJ   8 88! 		:*Y67.>? />? /1A!!!T'1JJ	K
 /1A$'1JJK  (8 9 99 	?$y01(=89 )=89 )+;AAAtG+DD	E
 )+;D!!!G+DDE   ? BGJ$7$77   D27:+>+>$>>  	
 0 ;<<!$'*]:	
 	
 	
 	
 	
 	
r.   )NNNNNNNr   )#ostypingr   rY   torch._dynamo.utilsr   torch.sparse._triton_opsr   r   r   r   r	   torch.sparse._triton_ops_metar
   r   r   torch.utils._tritonr   getenvr   rj   rx   Tensorboolr\   intr7   r   r4   triton.languagelanguager   jit	constexprr   rM   r.   r,   <module>r	     sm   
			        ) ) ) ) ) )              E D D D D D D D D D * * * * * *29^U++ 



E E E E E` 
^ ^ ^ ^L 

)-*."&MQu u u<u	u <u &u %,'u 
%,	u u uXc]HSM8C=HIJu 4.u u u up :<< ]%MMM      ZU
L &(\MU
P &(\QU
Z ')l[U
^ ')l_U
x \yU
z {U
| l}U
~ <U
@ LAU
B |CU
D |EU
F GU
H <IU
J LKU
L MU
N OU
 U
 U
 ZU
 U
 U
p !%r.   