
    `iP             !       l	   d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dlm	Z	 d dl
mZ ddlmZ  e ej        dd	                    Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd ZdudZd Zd Zd Z d Z!dddZ"	 	 	 	 	 	 dvdZ#	 	 	 	 	 	 	 	 dwdZ$ G d d           Z% ee!          d"             Z&	 dxd$Z'dyd%Z(dddddd&ddd'd(ej)        d)ej)        d*ej)        d+eej)                 d,eej)                 d-eej)                 d.e*d/ee+ee         ee         ee         f                  d0ee,         fd1Z-dddddd&ddd'd(ej)        d)ej)        d*ej)        d+eej)                 d,eej)                 d-eej)                 d.e*d/ee+ee         ee         ee         f                  d0ee,         fd2Z. e            rd dl/Z/d dl0m1Z2 e/j3        d3e2j4        d4e2j4        d5e2j4        d6e2j4        d7e2j4        d8e2j4        fd9            Z5e/j3        d4e2j4        d5e2j4        d7e2j4        d8e2j4        d:e2j4        f
d;            Z6d< Z7d=d=dd&dd>d(ej)        d?ej)        d@ej)        d-eej)                 d.e*d/ee+ee         ee         ee         f                  fdAZ8dd&dddBd)ej)        d*ej)        d-eej)                 d.e*d/ee+ee         ee         ee         f                  d0ee,         fdCZ9e/j3        dDe2j4        dEe2j4        fdF            Z:dudGZ;	 	 	 dzdIej)        dJej)        dKej)        dLeej)                 dMe<dNe*dOee<         fdPZ=e/j3        dQe2j4        dRe2j4        dSe2j4        dTe2j4        dUe2j4        dVe2j4        d8e2j4        fdW            Z>dXej)        dYej)        dZej)        d[ej)        d\ej)        f
d]Z?e/j3        d^e2j4        d_e2j4        dTe2j4        d`e2j4        dUe2j4        dVe2j4        dae2j4        d8e2j4        fdb            Z@	 d{dXej)        dYej)        ddej)        deej)        dfej)        dgej)        d0e,d\ej)        dhe*fdiZAe/j3        dje2j4        dke2j4        dle2j4        dme2j4        dne2j4        doe2j4        dpe2j4        dqe2j4        dre2j4        d4e2j4        d5e2j4        dse2j4        d7e2j4        d8e2j4        d:e2j4        d`e2j4        f dt            ZBdS dZ;dZ9dZ8dZ=dZ?dZAdZBdS )|    N)	lru_cache)Optional)	warn_once)
has_triton   )get_meta*TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE   c                 (    | st          |          d S N)
ValueError)condmsgs     l/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/sparse/_triton_ops.pycheckr      s      oo     c                 R    t          |j        t          j        k    |  d           d S )Nz@(): only BSR sparse format is supported for the sparse argument.)r   layouttorch
sparse_bsr)f_namets     r   check_bsr_layoutr      s7    		E$$SSS    r   c                 ^    t          |j        |k    o|j        j        dk    |  d           d S )Ncudaz9(): all inputs are expected to be on the same GPU device.)r   devicetype)r   r   r   s      r   check_devicer   !   sB    		F6qx}6LLL    r   c           	      ^   t          |                                dk    o|                                dk    |  d|                                 d|                                 d           |j        dd          \  }}|j        dd          \  }}t          ||k    |  d| d| d           d S )Nr
   zc(): all inputs involved in the matrix product are expected to be at least 2D, but got lhs.dim() == z and rhs.dim() == .zw(): arguments' sizes involved in the matrix product are not compatible for matrix multiplication, got lhs.shape[-1] == z( which is not equal to rhs.shape[-2] == )r   dimshape)r   lhsrhs_mklkr_ns          r   check_mm_compatible_shapesr*   (   s    			Q)37799> 	J 	J #			J 	J=@WWYY	J 	J 	J   Yrss^FBYrss^FB	
b 	R 	R "	R 	RLN	R 	R 	R    r   c           	          t          |j        |k    o3|j        t          j        t          j        t          j        ft          | z   v |  d| d|j         d           d S )Nz\(): all inputs are expected to be of the same dtype and one of (half, bfloat16, float32) or z, but got dtype == r    )r   dtyper   halfbfloat16floattuple)r   r   r,   additional_dtypess       r   check_dtyper2   9   s    		5 	SGZ5?P8QQS 	' 	'3D	' 	'G	' 	' 	'	    r   c           	          t          |          dk    sJ d fd}t           ||          |  d|d          d|d          d           d S )	Nr
   c                     | | dz
  z   S Nr    )vs    r   is_power_of_twoz(check_blocksize.<locals>.is_power_of_twoG   s    QK  r   c                 <    d}| D ]}|dk    o |          o|}|S )NT   r6   )bres	blocksizer8   s      r   is_compatible_blocksizez0check_blocksize.<locals>.is_compatible_blocksizeJ   s=     	K 	KI?Ay'A'AJsCC
r   z(): sparse inputs' blocksize (r   z, r   z;) should be at least 16 and a power of 2 in each dimension.)lenr   )r   r=   r>   r8   s      @r   check_blocksizer@   D   s    y>>Q! ! !     
	** 	D 	D1 	D 	D1 	D 	D 	D    r   c                 x    t          |                                           dk    r|                                 S | S )a  Return input as a triton-contiguous tensor.

    A triton-contiguous tensor is defined as a tensor that has strides
    with minimal value smaller than or equal to 1.

    While triton kernels support triton-non-contiguous tensors (all
    strides being greater than 1) arguments, a considerable slow-down
    occurs because tensor data is copied element-wise rather than
    chunk-wise. Zero strides is assumed to not have this defect.
    r   )minstride
contiguous)r   s    r   make_triton_contiguousrE   X   s2     188:: ||~~r   c                 |    	 t          j        d |D              S # t          $ r t          d|  d           Y d S w xY w)Nc              3   4   K   | ]}|j         d d         V  d S Nr!   r#   .0r   s     r   	<genexpr>z'broadcast_batch_dims.<locals>.<genexpr>m   s,      'F'F'F'F'F'F'F'Fr   Fz3(): inputs' batch dimensions are not broadcastable!)r   broadcast_shapes	Exceptionr   )r   tensorss     r   broadcast_batch_dimsrP   k   sf    U%'F'Fg'F'F'FGG U U UeSSSTTTTTTUs    ;;c              '   |   K   |D ]6}t          d           g|                                z  }||| <   ||         V  7d S r   )slicer"   )r"   slice_rangerO   r   slicess        r   slicerrU   r   sQ        ++(!si r   c              '      K   |D ][}t          d           g|                                z  }t          | |          D ]\  }}||||<   |t          |                   V  \d S r   )rR   r"   zipr0   )dimsrT   rO   r   sdd_slices          r   multidim_slicerr\   y   s~        4[[MAEEGG#dF++ 	 	JAw}!ak r   c               '   P   K   | D ] }|V  |                                 E d {V  !d S r   )rC   )rO   r   s     r   ptr_stride_extractorr^      sK        88:: r   c              #   n   K   dt                     cxk    rdk    sn J dt                    cxk    rdk    sn J dd l} fd}fd} |j         |             D ]Q}d t           |          D             }d t          ||          D             }|d d d         g ||          R V  Rd S )Nr      c               3   ^   K   t                    D ]\  } }t          d| |          V  d S )Nr   )rW   range)fgmg	full_gridgrid_blockss     r   generate_grid_pointsz.grid_partitioner.<locals>.generate_grid_points   sI      )[11 	# 	#FB2r""""""	# 	#r   c              3      K                                    D ]%\  }}t          t          || |                    V  &d S r   )itemsnextr\   )rT   r   t_dimstensor_dims_maps      r   generate_sliced_tensorsz1grid_partitioner.<locals>.generate_sliced_tensors   sS      (..00 	; 	;IAvvvq99::::::	; 	;r   c                 <    g | ]\  }}}t          ||z
  |          S r6   )rB   )rK   rc   gprd   s       r   
<listcomp>z$grid_partitioner.<locals>.<listcomp>   s9     
 
 
!+RCR
 
 
r   c                 :    g | ]\  }}t          |||z             S r6   )rR   )rK   ro   gs      r   rp   z$grid_partitioner.<locals>.<listcomp>   s*    GGGA%BF##GGGr   )r?   	itertoolsproductrW   )	re   rf   rl   rt   rg   rm   
grid_pointgridrT   s	   ```      r   grid_partitionerrx      sO     I####!######K  %%%%A%%%%%%# # # # # #; ; ; ; ; (i')=)=)?)?@ ; ;

 
/29j+/V/V
 
 
 HGZ1F1FGGG 44R4j:226::::::::; ;r   c                     dd d d         }||}n,d t          fdt          ||          D                       }t          |||          D ]^}} | |g|R   d S )N)i  rz   rs   c                 F    | |S t          dt          | |                    S r5   )maxrB   )rr   rd   s     r   valid_grid_dimz%launch_kernel.<locals>.valid_grid_dim   s&    y	 1c!Rjj)))r   c              3   6   K   | ]\  }} ||          V  d S r   r6   )rK   rr   rd   r}   s      r   rL   z launch_kernel.<locals>.<genexpr>   sD       
 
&+aNN1b!!
 
 
 
 
 
r   )r0   rW   rx   )kernelrl   re   rf   cuda_max_gridrw   sliced_tensorsr}   s          @r   launch_kernelr      s    .ttt4M#	* 	* 	*  
 
 
 
/2;/N/N
 
 
 
 
 "2;" " & &~ 	t%n%%%%%& &r   c                    |                                                      d          }|                                                     d          }t          |                                                     d                    }d |D             }t          j        |j        d d         gd |D             R  d  |d          } |d          } ||j        dd                    }fd|D             }|||g|R S )Nr   c                 R    g | ]$}t          |                    d                     %S r   )rE   	unsqueezerJ   s     r   rp   z"prepare_inputs.<locals>.<listcomp>   s+    MMM!%akk!nn55MMMr   c              3   4   K   | ]}|j         d d         V  d S rH   rI   rJ   s     r   rL   z!prepare_inputs.<locals>.<genexpr>   s,      ;;aQWSbS\;;;;;;r   c                 z    |                      ||z                                 dt          |          dz
            S )Nr   r   )broadcast_toflattenr?   )r   
batch_dimsinvariant_dimss      r   batch_broadcast_and_squashz2prepare_inputs.<locals>.batch_broadcast_and_squash   s;    ~~j>9::BBs:"
 
 	
r   rs   c           	      D    g | ]} ||j         d d                   S )r!   NrI   )rK   r   r   batch_dims_broadcasteds     r   rp   z"prepare_inputs.<locals>.<listcomp>   sC        	#"1&<agbcclKK  r   )crow_indicesr   col_indicesrE   valuesr   rM   r#   )bsrdense_tensorsr   r   r   rO   r   r   s         @@r   prepare_inputsr      s`   ##%%//22L//##--a00K#CJJLL$:$:1$=$=>>FMM}MMMG #3SbS;;7;;;  
 
 

 .-,e L -,[:PRWXXK''&RSS(9 F      G
 f6w666r   c                    t          | |g|R  }|                                                    |dz             }|                                                    |dz             }|                                                    ||                                j        dd          z             }||j        dd          z   }t          j        |||||j                  S )Nr   r   r!   sizer   )	rP   r   r   r   r   r#   r   sparse_compressed_tensorr   )r   r   rO   batch_shaper   r   r   r   s           r   broadcast_batch_dims_bsrr      s    &vs=W===K##%%22;3FGGL//##00u1DEEKZZ\\&&{SZZ\\5G5L'LMMF233'D)k6SZ   r   c                     | j         ^ }}}|||d         z  |d         ||d         z  |d         gz   }|                     |                              dd          S )Nr   r   r   r!   )r#   view	transpose)r   r=   restmn	new_shapes         r   tile_to_blocksizer      sh    'KT1a	Yq\!	Yq\!	 I 66)&&r2...r   c                     | j         dk     r |                     d          } | j         dk      | j         dk    r|                     d| j         dz
            } | j         dk    sJ | j                    | S )zReturn tensor as 3D tensor by either prepending new dimensions to
    the tensor shape (when ``tensor.ndim < 3``), or by collapsing
    starting dimensions into the first dimension (when ``tensor.ndim >
    3``).
    r`   r   )ndimr   r   r#   )tensors    r   	as1Dbatchr      su     +//!!!$$ +//{Q6;?33;!V\Mr   accumulatorsc                2   |d         }| j         dk    sJ | j        \  }}}|dk    r|dd         \  }}	|j         dk    sJ |j        \  }
}}||k    sJ |5|j        d         dz
  }t          j        |||f| j        | j                  }n|j        \  }}}||k    sJ ||k    sJ |dz  s|dz  s|dz  st          qt          |j        d         dz
            D ]R}||         }||dz            }t          ||          D ],}|	|         \  }}||xx         | |         ||         z  z  cc<   -Snt          | |||	|           |S |dk    rE|j        }t          |          }|j        \  }}}||z  dk    sJ |dd         \  }}}}}|d	         }|`||	                                
                                dz   |z  z   } t          j        g |dd
         | |R | j        | j                  }n|j        d
d         \  } }!|!|k    sJ |j        }"t          |          }||z  }|dz  s|dz  s|dz  st          (|                                 t          |          D ]}#t          |j        d                   D ]}||         
                                }$||         
                                }||dz            
                                }t          |$|          \  }%}&||#|%|%|z   |&|&|z   f         }'t          ||          D ]X}||         ||         }}t          |
                                |          \  }(})|'| |         ||#|(|(|z   |)|)|z   f         z  z  }'Ynt          | |||||||           |                    |"          S |dk    rt|j        }t          |          }|j        \  }}}||z  dk    sJ |dd         \  }}}}|d	         }|`||	                                
                                dz   |z  z   } t          j        g |dd
         | |R | j        | j                  }n|j        d
d         \  } }!|!|k    sJ |j        }"t          |          }||z  }|dz  s|dz  s|dz  st          7t          |          D ]%}#t          t          |                    D ]}*t          ||*         
                                |          \  }%}&|%|z  }+|&|z  },||+         
                                }-||+dz            
                                }.||#|%|%|z   |&|&|z   f         }'t!          t          |-|.                    D ]b\  }/}||,|.z  ||,z
  |-z  z   |/z            
                                }t          ||          \  }(})|'| |         ||#|(|(|z   |)|)|z   f         z  z  }'c'n7t          j        d|j        |j                  }t          | |||||||           |                    |"          S t%          |          )aw  Scattered matrix multiplication of tensors.

    A scattered matrix multiplication is defined as a series of matrix
    multiplications applied to input tensors according to the input
    and output mappings specified by indices data.

    The following indices data formats are supported for defining a
    scattered matrix multiplication operation (:attr:`indices_data[0]`
    holds the name of the indices data format as specified below):

    - ``"scatter_mm"`` - matrix multiplications scattered in batches
      of tensors.

      If :attr:`blocks` is a :math:`(* 	imes M 	imes K) tensor,
      :attr:`others` is a :math:`(* 	imes K 	imes N)` tensor,
      :attr:`accumulators` is a :math:`(* 	imes M 	imes N)` tensor,
      and :attr:`indices = indices_data['indices']` is a :math:`(*
      	imes 3)` tensor, then the operation is equivalent to the
      following code::

        c_offsets, pq = indices_data[1:]
        for r in range(len(c_offsets) - 1):
            for g in range(c_offsets[r], c_offsets[r + 1]):
                p, q = pq[g]
                accumulators[r] += blocks[p] @ others[q]

    - ``"bsr_strided_mm"`` - matrix multiplications scattered in
      batches of tensors and a tensor.

      If :attr:`blocks` is a :math:`(Ms 	imes Ks) tensor,
      :attr:`others` is a :math:`(* 	imes K 	imes N)` tensor,
      :attr:`accumulators` is a :math:`(* 	imes M 	imes N)` tensor, then
      the operation is equivalent to the following code::

        c_indices, r_offsets, p_offsets, q_offsets, meta = indices_data[1:]
        for b in range(nbatches):
            for i, r in enumerate(r_offsets):
                r0, r1 = divmod(r, N)
                acc = accumulators[b, r0 : r0 + Ms, r1 : r1 + Ns]
                for g in range(c_indices[i], c_indices[i + 1]):
                    p = p_offsets[g]
                    q0, q1 = divmod(q_offsets[g], N)
                    acc += blocks[p] @ others[b, q0 : q0 + Ks, q1 : q1 + Ns]

      where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are
      integer multiples of ``Ms`` and ``Ks``, respectively.

    - ``"bsr_strided_mm_compressed"`` - matrix multiplications
      scattered in batches of tensors and a tensor. A memory and
      processor efficient version of ``"bsr_strided_mm"`` format.  If
      :attr:`blocks` is a :math:`(Ms 	imes Ks) tensor, :attr:`others`
      is a :math:`(* 	imes K 	imes N)` tensor, :attr:`accumulators`
      is a :math:`(* 	imes M 	imes N)` tensor, then the operation is
      equivalent to the following code::

        c_indices, r_offsets, q_offsets, meta = indices_data[1:]
        for b in range(nbatches):
            for r in r_offsets:
                m = (r // N) // Ms
                n = (r % N) // Ns
                r0, r1 = divmod(r, N)
                c0, c1 = c_indices[m], c_indices[m + 1]
                acc = accumulators[b, r0 : r0 + Ms, r1 : r1 + Ns]
                for i, p in enumerate(range(c0, c1)):
                    q = q_offsets[n * c1 + (SPLIT_N - n) * c0 + i]
                    q0, q1 = divmod(q, N)
                    acc += blocks[p] @ others[b, q0 : q0 + Ks, q1 : q1 + Ns]

      where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are
      integer multiples of ``Ms`` and ``Ks``, respectively.

      Notice that the order of ``r_offsets`` items can be arbitrary;
      this property enables defining swizzle operators via
      rearrangements of ``r_offsets`` items..

    Auxiliary functions are provided for pre-computing
    :attr:`indices_data`. For example,
    :func:`bsr_scatter_mm_indices_data` is used to define indices data
    for matrix multiplication of BSR and strided tensors.

    Parameters
    ----------
    blocks (Tensor): a 3-D tensor of first matrices to be multiplied

    others (Tensor): a tensor of second matrices to be multiplied. If
      ``indices_data[0]=="scatter_mm"``, the tensor is a 1-D batch
      tensor of second input matrices to be multiplied. Otherwise, the
      second input matrices are slices of the :attr:`others` tensor.
    indices_data (tuple): a format data that defines the inputs and
      outputs of scattered matrix multiplications.

    Keyword arguments
    -----------------

    accumulators (Tensor, optional): a tensor of matrix product
      accumulators. If ``indices_data[0]=="scatter_mm"``, the tensor
      is a 1-D batch tensor of output matrices. Otherwise, output
      matrices are slices of the :attr:`accumulators` tensor.
    r   r`   
scatter_mmr   Nr,   r   r:   bsr_strided_mmSPLIT_Nr!   bsr_strided_mm_compressedr   )r   r#   r   zerosr,   r   _scatter_mm2rb   r   r|   item_scatter_mm6zero_divmodr   r?   	enumerateemptyNotImplementedError)0blocksothersindices_datar   indices_format_PMsKs	c_offsetspq_QKs_NsRMs_Ns_rg0g1rr   pqothers_shapeBKN	c_indices	r_offsets	p_offsets	q_offsetsmetar   MN_accumulators_shaper;   r_r0r1accq0q1jr   r   c0c1is0                                                   r   r   r     s   H "!_N;!JBB%%$QRR(	2{alCSyyyy"Q&A ;B6<  LL ',KAsC"9999"99997 	Fb2g 	Fb 	FL,@9?1-122 = =q\q1u%r2 = =Aa5DAq OOOvay6!9'<<OOOO== BEEE	+	+	+|6"",1a2v{{{{;G;K8	9iDy/immoo**,,q0Q66A ;*,ss#*Q***&,v}  LL !&rss+EAr7777)/ ..'\7 	b2g 	b 	L,@   1XX 
Q 
Qyq122 	Q 	QA"1**,,B"1**,,B"1q5)..00B#B]]FB&q"rBw,R"W'DEC"2r]] Q Q(|Yq\1!'!!4!4Bvay6!R"r'\2R<2O+PPPQ	Q
Q 	 	 	   !3444	6	6	6|6"",1a2v{{{{0<QRR0@-	9iy/immoo**,,q0Q66A ;*,ss#*Q***&,v}  LL !&rss+EAr7777)/ ..'\7 	b2g 	b 	L,@1XX Q Qs9~~.. 
Q 
QA#IaL$5$5$7$7;;FBbAbA"1**,,B"1q5)..00B&q"rBw,R"W'DEC )%B-- 8 8 Q Q1%a"f!r/A&AA&EFKKMM!'1Bvay6!R"r'\2R<2O+PPPQ
QQ IOI4D  I 	 	 	   !3444 ".111r   c           
      .   ||||	|
|hd hk    rBt           j                                        }t          d| ||||f|dt           j        df          }| |j        d$i | |S | ||fdk    rX||fdk    rd}d}d}d	}d}
d	}	n||fd
k    rd}d}d}d	}d}
d	}	n||fdk    rd}d}d}d	}d}
d	}	n||fdk    rd}d}d}d}d}
d	}	n| ||fdk    rX||fdk    rd}d}d}d}d}
d}	nn||fd
k    rd}d}d}d	}d}
d}	nX||fdk    rd	}d}d}d	}d}
d	}	nB||fdk    rd}d}d}d	}d}
d	}	n,| ||fdk    rj||fdk    rd	}d}d}d}d}
d}	n||fd
k    rd}d}d}d}d}
d}	n||fdk    rd}d}d}d	}d}
d}	n||fdk    rd}d}d}d	}d}
d	}	n||fdk    rd}d}d}d}d}
d	}	n| ||fdk    ri||fdk    rd	}d}d}d}d}
d}	n||fd
k    rd	}d}d}d	}d}
d}	n||fdk    rd	}d}d}d	}d}
d	}	nq||fdk    rd}d}d}d	}d}
d	}	n\||fdk    rd	}d}d}d}d}
d	}	nG| ||fdk    r>||fdk    rd}d}d}d}d}
d}	n)||fd
k    rd}d}d}d}d}
d}	n||fdk    rd}d}d}d}d}
d	}	|.ddd	ddddddd	                    |d          }|dk    r|dk    rd}||z  }|t          |dk     rdnd|          }|t          |dk     rdnd|          }|
pd}
|	t          | |          dk    rdddd                    |d	          }	nvt          | |          dk    rdddd                    |d	          }	nGt          | |          dk    rdd	d                    |d	          }	nddd                    |d	          }	|pd	}||k    sJ t          ||                      ||k    sJ t          ||                      || k    sJ t          | |                       ||k    sJ t          ||!                      ||k    sJ t          ||"                      t          d$||||
|	|d#|S )%Nr   r         ?version)   r   r   )r:   r:   r   r:      )    r   r
   r   )@   r   )   r   )   r   r      r   r   )   r   r   )r   r   )   r   r   )   r   r   r   )	r:   r   r   r   r   r   r   r   i    r   r   r   )r:   r   r   )r:   r   )TILE_Mr   )TILE_Nr   )r   r   )r   r   )r   r   )r   r   
GROUP_SIZE
num_stages	num_warpsr   r6   )	r   r   get_device_namer   float16updategetrB   dict)r   r   r   r   r   r   r   r   r   r   r   extradevice_namer   r   s                  r   scatter_mm_metar     s    	J
CvMMj00221b"s+	
 
 
 DK  %   K q!9
""Bx8##

		bX%%

		bX%%

		bZ''

	AY*$$Bx8##

		bX%%

		bX%%

		bZ''

	AY+%%Bx8##

		bX%%

		bX%%

		bZ''

		bZ''

	AY+%%Bx8##

		bX%%

		bX%%

		bZ''

		bZ''

	AY+%%Bx8##

		bX%%

		bX%%

	 

 

 #a** 	 99dG	
gB~288RRR00~288RRR00qJq!99tA1--11"a88IIAYY$A1--11"a88IIAYY#A**2q11IIA**2q11IqJR<<<V333<<<R<<<V333<<<777D1$$$777777D1$$$777777D1$$$777     r   c                    |t           j        }||}|d}||	|
|hd hk    rt           j                                        }| |||||dk    |dk    |dk    f}||u r|}n||f}t	          d|||||f          }||dk    rt	          d||||df          }|||urt	          d||||df          }|t	          dg |d d         d|dd          R |||df          }|1||ur-t	          dg |d d         d|dd          R |||df          }t          |pi           D ]E}||         }|d         }|d	         }||z  }||z  dk    r||k    rt          |          }||z  |d	<   F| |j        di | |S t          d
| d|d|d|d|d|d|d|d|           |pt          ||z  d          }|pd}|
pd}
|	pd}	t          d|||
|	d|S )Nr   r   r   bsr_dense_addmmr   r
   *r`   r   z@bsr_dense_addmm uses non-optimal triton kernel parameters for M=z K=z N=z Ms=z, Ks=z beta=z alpha=z dtype=z out_dtype=r   )r   GROUP_SIZE_ROWr   r   r6   )
r   r   r   r   r   sortedr   r   r   r|   )r   r   r   r   r   betaalphar   r  r   r   sparsityr,   	out_dtype_versionr   r   keyversion_dtyper   matching_metamkeymeta_r   split_ncs                             r   bsr_dense_addmm_metar    sf   * }	J7D6AAj0022!QB	419eqjAI!MM!9,M}h7	
 
 
 <HOO!!=#6	  D <E22!3hs=S  D <$!)#bqb')3)QRR))!=#6	  M $i)?)? (%-c"1"g-s-SW--%uc2	! ! ! }233 - -%d+G	*Lq5A::!q&&;;D&'1fDODK  %   K
 [[ [[ ["#[ [')[ [.0[ [48[ [<A[ [EJ[ [NW[ [  
 (Q"WaG#(qNqJQI %	 
   r   c                   :    e Zd ZdZd Zd Zd Zed             ZdS )TensorAsKeyaR  A light-weight wrapper of a tensor that enables storing tensors as
    keys with efficient memory reference based comparison as an
    approximation to data equality based keys.

    Motivation: the hash value of a torch tensor is tensor instance
    based that does not use data equality and makes the usage of
    tensors as keys less useful. For instance, the result of
    ``len({a.crow_indices(), a.crow_indices()})`` is `2`, although,
    the tensor results from `crow_indices` method call are equal, in
    fact, these share the same data storage.
    On the other hand, for efficient caching of tensors we want to
    avoid calling torch.equal that compares tensors item-wise.

    TensorAsKey offers a compromise in that it guarantees key equality
    of tensors that references data in the same storage in the same
    manner and without accessing underlying data. However, this
    approach does not always guarantee correctness. For instance, for
    a complex tensor ``x``, we have ``TensorAsKey(x) ==
    TensorAsKey(x.conj())`` while ``torch.equal(x, x.conj())`` would
    return False.
    c                 \   d }t          j        |          | _        |j        t          j        u r ||          | _        n|j        t          j        t          j        hv r@ ||	                                           ||
                                          f| _        ns|j        t          j        t          j        hv r@ ||                                           ||                                          f| _        nt          |j                  t!          | j                  | _        d S )Nc                     | j         j        s| j         j        rJ | j                     |                                 |                                 | j        |                                 | j         fS r   )r,   is_floating_point
is_complexdata_ptrstorage_offsetr#   rC   )objs    r   get_tensor_keyz,TensorAsKey.__init__.<locals>.get_tensor_key]  sb     	3Wsy7KWWciWWL""$$	

	 r   )weakrefref_obj_refr   r   stridedr
  
sparse_csrr   r   r   
sparse_csc
sparse_bscccol_indicesrow_indicesr   hash_hash)selfr  r  s      r   __init__zTensorAsKey.__init__\  s   	 	 	&  C((:&&%~c**DHHZE,e.>???s//1122s0011DHH ZE,e.>???s//1122s0011DHH
 &cj111$(^^


r   c                     | j         S r   )r&  r'  s    r   __hash__zTensorAsKey.__hash__  s
    zr   c                 t    t          |t                    sdS | j        |j        | |u S | j        |j        k    S )NF)
isinstancer  r  r
  )r'  others     r   __eq__zTensorAsKey.__eq__  sC    %-- 	58uy0 5= x59$$r   c                 *    |                                  S )z'Return object if alive, otherwise None.)r  r*  s    r   r  zTensorAsKey.obj  s     }}r   N)	__name__
__module____qualname____doc__r(  r+  r/  propertyr  r6   r   r   r  r  E  sg         ,#$ #$ #$J  % % %   X  r   r  )maxsizec	           	      |	   |j         }	|	J |	                                |	                                }}
|
j        }t          j        }| dk    rd||z  }g }t	          j        |||          |z  }t          ||z            D ]}|
|                                         }|
|dz                                            }||k    r@|	                    |||         ||z  z  
                    |          |                    ||z
            z              t	          j        |          }|
                                }|                                }|||z  z  }||z                       d          }|
}||                             |          }|                    dd          \  }}||         }| |||fS | dk    r||z  }g }g }t	          j        |||          |z  }t          ||z            D ]}|
|                                         }|
|dz                                            }||k    r@|	                    t	          j        ||||          
                    |                     |	                    |||         ||z  z  
                    |          |                    ||z
            z              t	          j        |          }|
                                }|                                }|||z  z  }||z                       d          }t	          j        |
d d         t	          j        ||                             |          d          f          }t	          j        |          }| ||||fS | d	k    r'|}dg}g }t          |          D ]}t          ||z            D ]}|
|                                         }|
|dz                                            }t          ||z            D ]}|	                    |d         |z   |z
             t          ||z
            D ]J} || z   }!||!                                         |||z  z  z   ||z  z  |z   }"|	                    |!|"g           K͌| t	          j        |||          t	          j        |||          fS t'          d
| d          )Nr   r   r   rs   T)
descendingstabler   r   r   zInvalid indices_format=z>. Expected bsr_strided_mm_compressed|bsr_strided_mm|scatter_mm)r  r   r   r   r   int32arangerb   r   appendrepeatrepeat_interleavecatdiffnonzeror   sortcumsumr   r   )#r   r   r   r   r   r   nbatchesr   compressed_sparse_tensor_as_keyr   r   r   r   indices_dtyper   q_offsets_lstr;   r   r   r   r   crow_indices_diffnon_zero_row_indicesar   r   nnz_per_rowindicesp_offsets_lstr   
pq_offsetsr   r   r   r   s#                                      r   _bsr_scatter_mm_indices_datarO    sI    *
-C??? # 0 0 2 2COO4E4E+L FKM444'\LfEEEJqBw 	 	Aa%%''Ba!e$))++BRxx  RU#rAv.66w??%%b2g../    Im,,	(--//088:: BF+ULL$$	 	'(<=OOPWXX*//4/MMWg&		9i@@	+	+	+'\LfEEEJqBw 	 	Aa%%''Ba!e$))++BRxx  R=HHHOOPWXX     RU#rAv.66w??%%b2g../    Im,,	(--//088:: BF+ULL$$	IRaR %&:;MMgVV 
 
	 Im,,		9iKK	<	'	'C	
x 		2 		2A17^^ 2 2!!_))++!!a%(--//qBw 2 2A$$Yr]R%7"%<==="27^^ 2 2F(^0022Q!r']BqBwORSS"))1a&1111222 L-GGGL=HHH
 	
 f~fff
 
 	
r   r   c                    |                                  dk    sJ | j        dk    sJ |                                 j        dd         }| j        \  }}|\  }}|j        dd         \  }	}
|	|k    sJ |j        dd                                         }t          |||
||fi |}d|vr3|                    | j        t          j	        t          j
        hv            |d         }t          ||||
||||t          |           	  	        }|dk    r|                    d	
           ||fz   S |dk    r|                    d
           ||fz   S |S )zkComputes indices data for :func:`scatter_mm` used in BSR and
    strided tensor matrix multiplication.
    r   r
   r!   N
allow_tf32rQ  r   r   T)is_compressedr   F)	dense_dimr   r   r#   numelr   r   r,   r   r   r.   rO  r  )r   r.  r   
meta_inputr=   r   r   r   r   K_r   rD  r   r   r   s                  r   bsr_scatter_mm_indices_datarX    sw    ==??a8q====

"233'I9DAqFBKEB7777{3B3%%''H1aB99j99D:%%syU]EN,KKLLL9oG/1aR7K<L<L L 444$'''tg%%	+	+	+%(((tg%%r   c           
         | j         dk    sJ |j         dk    sJ | j        d         | j        d         |j        d         }}}|                                 j        dd         }|t          | |d          }|d         }|5t	          j        g |j        dd         ||R | j        | j                  }|j        }	t          |          }| 	                                dk    r|
                                 n|d	v r;|
                                 t          |                                 |||
           n|dk    r|j        dd                                         }
t	          j        |
|z  |d         z  |z  |d         z  |d         |d         f| j        | j                  }t          |                              dd                              |
||d         z  |d         ||d         z  |d                                       dd                              dd          }t          |                                 |||
           |                    |                    d|
||d         z  ||d         z  f                              dd                              |
||                              dd                     nt+          |          |                    |	          S )zBSR @ strided -> stridedr
   r!   rs   Nr   )r   r   r   >   r   r   r   r   r   )r`   r   r   r
   )r   r
   r`   r   )r   r#   r   rX  r   r   r,   r   r   _nnzr   r   rU  r   r   r   movedimr   copy_	unflattenreshaper   )r   r.  r   outr   r   r   r=   r   	out_shaperD  r   r   s                r   bsr_scatter_mmra    s    8q====:????2	"u{2BB

"233'I2'B
 
 
 "!_N
{k'ek#2#''B''sy
 
 
 	I
C..C
xxzzQ			J	J	J		3::<<3GGGGG	<	'	';ss#))++{21-2ilB!!
 ):
 
 
 eYr2Til"!il"!  Wl  WQ]] 	 	3::<<LQQQQ		""HbIaL0"	!2DE  Wl  WXr2&&Yr2		
 		
 		
 		
 ".11188Ir   Fr  r  
left_alpharight_alphar_  skip_checksmax_gridr   inputr   denserc  rd  r_  re  rf  r   c                f   ||j         t          j        u rd}|                                }|                                dz
  }|j        |         }|j        d         }t          |||          }t          j        |||fz   t          j        |j	                  }t          | |||||||||	|
          S )N_int_bsr_dense_addmmr   rs   r   rb  )r,   r   int8r   r"   r#   rP   r   r:  r   r  )rg  r   rh  r  r  rc  rd  r_  re  rf  r   r   r   
batch_ndimr   r   original_batch_dims_broadcasteds                    r   rj  rj  \  s     {u{ej00'''))!%%''!+
Ij!KO*>vsE*R*R'k+q!f4+<
 
 

    r   c                	  
 !"#$% d}|                                 }|                                }|                                }|                                dz
  }|j        ||dz            \  }}|j        |dz   |dz            }|j        d         }t          |||          }||                    |||fz             }|                                dk    sdk    s|dk    s|dk    s|dk    rMdk    r|                                 n0|	                    |            dk    r|
                               |S d$d%|'d	$ |                    d
          j        g |||R  }n   |j        g ||dR  j        g |||R  }|'d	% |                    d
          j        g |||R  }n   |j        g |d|R  j        g |||R  }|                                d         dk    sJ |                                d         dk    sJ 
lt          d|                                |d         z  |d         z  ||z  z  z
  d          }t          ||||d         |d         ||j        |j        
  
        
|}t#          || ||||          \  }}}} }}}}|\  ! 
                    d|!z            }||z  "|}t'          |!"f          }t'          | "f          }t'          | !"f          } t'          |!"f          }t'          |!"f          }t(          j        t,          j        t(          j        t,          j        t(          j        t,          j        t(          j        t,          j        t(          j        t,          j        t(          j        t,          j        i|j                 #|                    d          }|                    d          dz
  }|                    d          }|||f}|	?t;          |	dd         ddd                   ddt=          |	dd                   z
  z  z   }nd}|d|d|d| d|d|d|d|di}dk    sJ  !"#$
%f	d}t?          ||||           |                                 |                                 k    r-|	                    |                    |j                             |S )a  Compute

      out = beta * input + left_alpha.reshape(-1, 1) * (alpha * (bsr @ dense)) * right_alpha.reshape(1, -1)

    where left_alpha, right_alpha are (* + 1)-D tensors when
    specified, otherwise, these are treated as tensors filled with
    ones.
    r  r   r
   r`   rs   Nr   FTr6   r!   )r  r,   r  r   r   r   r   NNr   Nrs   )r   r   )r   r   Nc                    	 t          |          g t          | R dk    dk    dk    
t          j        k    d
	 d S )Nr   r   )
beta_is_onebeta_is_nonzeroalpha_is_oneleft_alpha_is_oneright_alpha_is_oneBLOCKSIZE_ROWBLOCKSIZE_INNERBLOCKSIZE_COLrQ  	acc_dtype)_bsr_strided_addmm_kernelr^   tlfloat32)rw   r   BKBMBNr  r  dot_out_dtyperv  r   rw  s     r   r   zbsr_dense_addmm.<locals>.kernel  s    !$' 	
!>2	
	
 	
 	
 	 AI!/1$
2#	
 	
 	
 	
 	
 	
 	
r   )!r   r   r   r"   r#   rP   	new_emptyrZ  r   r\  mul_expandr   rC   roundr  r,   r   r   r   r   r   r}  r~  r.   float64rk  r:  r   r0   r?   r   r  )&rg  r   rh  r  r  rc  rd  r_  re  rf  r   r   r   r   r   rl  r   r   r=   r   rm  r  
out_backupr   out_untiled	n_batchesn_block_rowsn_block_colsre   rf   rl   r   r  r  r  r  rv  rw  s&      ``     `                     @@@@@@r   r  r    s   , FZZ\\F##%%L//##K!!##a'J9Z*q.01DAqZ!^j1n<=IBA ';63&N&N#
{oo=AFGG
xxzzQ%1**Q!q&&AFF199IIKKKKIIeqyy
 /U__R((/ 
,
./
12
 
 


 T_Z_L&ELqL!LLLS 
,
./
12
 
 

 !0eoob))0 
,
./
12
 
 
 V&k&N(GNNANNNU 
,
./
12
 
 
 r"a''''#q((((|SXXZZ)A,61EQOOQRSS#aLaL+i
 
 
 J 	sE5*k3GG	 FBhhy!r'**G	
gBK
C"b
*
*Ceb"X..Eeb"X..E":Bx88J#K"b::K 	rz
rzrz
BHRX 
iM 

1I$$R((1,L::b>>LL,7IHRaRL2.//'QXbqb\ARAR=R2SS 	m_{}K[[	O A::::
 
 
 
 
 
 
 
 
 
 
 
 
$ &/9kBBB
||~~,,.... 	))**:;;<<<r   IS_BETA_ZEROrx  rz  TILE_Kr{  rQ  c            
         t          j        d          } t          j        d          }!||| z  z   ||!z  z   }"t          j        |"          }#t          j        |"|z             }$|$|#z
  }%|%dk    rd S t          j        d|          }&t          j        d|          }'||| z  z   |	|#z  z   |
|&d d d f         z  z   ||'d d d f         z  z   }(||| z  z   ||#z  z   })||| z  z   ||!z  z   ||&d d d f         z  z   }*||| z  z   ||'d d d f         z  z   }+t          j        d|          },t	          |%          D ]2}-t          j        ||f|          }.t          j        |)          }/t	          d||          D ]}0|0|,z   }1|1|k     }2t          j        |*||1d d d f         z  z   |2d d d f         d          }3t          j        |+||/z  z   ||1d d d f         z  z   |2d d d f         d          }4|.t          j        |3|4||          z  }.|r|.| z  }.n| |.z  |t          j        |(          z  z   }.t          j        |(|.                    |j	        j
                             |(|	z  }(|)|z  })4d S )Nr   axisr   r,           maskr.  rQ  r  )r}  
program_idloadr;  rb   r   dotstoretor,   
element_ty)5r  r  r  rx  rz  kr  
values_ptrvalues_batch_stridevalues_nnz_stridevalues_row_block_stridevalues_col_block_stridecrow_indices_ptrcrow_indices_batch_stridecrow_indices_stridecol_indices_ptrcol_indices_batch_stridecol_indices_stridemat1_ptrmat1_batch_stridemat1_tiled_row_stridemat1_tiled_col_stridemat1_row_block_stridemat1_col_block_stridemat2_ptrmat2_batch_stridemat2_tiled_row_stridemat2_tiled_col_stridemat2_row_block_stridemat2_col_block_strider{  rQ  	batch_pidrow_block_pidcrow_indices_offset_ptr
nnz_offsetnnz_offset_nextrow_nnzrow_block_arangecol_block_arangevalues_block_ptrscol_index_nnz_ptrmat1_block_ptrsmat2_block_ptrsk_tile_arange_	acc_block	col_blockk_tile	k_offsetsmask_k
mat1_block
mat2_blocks5                                                        r   _sampled_addmm_kernelr  2  sv   F Mq)))	1--- ')34!M12 	 
 W455
'"9<O"OPP "J.a<<F9Q669Q66 !I-.*,- &(8D(AAB &(8qqq(AA	B 	 &23 :-. 	 )+,#m34 $&6qqq$w&??@ 	 )+,#&6tQQQw&??@ 	 	!V,,w &	4 &	4A-!?yQQQI  122I1f--  "]2	"QW#&;iaaa>P&PPaaa  
  W#+i78+i4.@@A  4  
 RV
zY   		  RU"		!I-rw?P7Q7Q0QQ	 H&	Z5E5P(Q(QRRR !22!33M&	4 &	4r   r  c                    t          j        d          }t          j        d          }t          j        d          }t          j        d          }t          j        d          } t          j        |||| |          \  }}|||z  z   ||z  z   }!t          j        |!          }"t          j        |!|z             }#|#|"z
  }$|$dk    rd S t          j        d|          }%t          j        d|          }&| ||z  z   ||"z  z   ||%d d d f         z  z   ||&d d d f         z  z   }'|||z  z   ||z  z   ||&d d d f         z  z   ||%d d d f         z  z   }(|||z  z   ||z  z   ||z  z   ||%d d d f         z  z   ||%d d d f         z  z   })||	|z  z   |
|"z  z   }*t          j        ||f|          }+t          |$          D ]i},t          j        |'          }-t          j        |*          }.t          j        |(||.z  z             }/|+t          j        |-|/||          z  }+|'|z  }'|*|
z  }*jt          j	        |)|+
                    |j        j                             d S )Nr
   r  r   r   r  r  r}  r  num_programs	swizzle2dr  r;  r   rb   r  r  r  r,   r  )0r  r  r  r  r  r  r  r  r  r  r  	dense_ptrdense_batch_stridedense_tiled_row_stridedense_tiled_col_stridedense_row_block_stridedense_col_block_stride
output_ptroutput_batch_strideoutput_tiled_row_strideoutput_tiled_col_strideoutput_row_block_strideoutput_col_block_striderx  rz  r{  rQ  r  r  r  col_block_pidr  r  r  r  r  r  r  r  r  dense_block_ptrsoutput_ptrsr  output_acc_blockr  values_blockdense_row_idxdense_blocks0                                                   r   "_bsr_strided_dense_rowspace_kernelr    sD   \ Mq)))	1---1---A...A...')|=,n(
 (
$}
 ')34!M12 	 
 W455
'"9<O"OPP "J.a<<F9Q669Q66 !I-.*,- &(8D(AAB &(8qqq(AA	B 	  9,-$}45 %'74'@@A %'7aaa'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8qqq(AAB 	 &23 :-. 	 8]M$B)TTTw 	4 	4A7#455L G$566M' #9M#II K
 kjI! ! ! 
 !22!33 	.11*2B2MNNOOOOOr   c           
          |                     d          }|                     d          dz
  }||f}|?t          |d d         d d d                   ddt          |d d                   z
  z  z   }nd }|d|d|d|	d|
di}|j        t          j        t          j        fv rt          j        d	nt          j	        d
 fd}t          ||||           d S )Nr   rs   r   r
   r   )r   N)r   rs   )r   rq  TFc                 X    t          |          g	t          | R ddd d S )Nr   r   )r{  rQ  r   r   )r  r^   )
rw   r   r{  rQ  r  r  r=   is_beta_zeror  tile_ks
     r   r   z)_run_sampled_addmm_kernel.<locals>.kernelX  sw    !$' 	
   &~6  $%     r   )r   r0   r?   r,   r   r-   r.   r}  r~  r  r   )r  r  r  r=   r  r  r   r   r   mat1mat2rf  r  r  re   rf   rl   r   r{  rQ  s   ``````            @@r   _run_sampled_addmm_kernelr  4  s6    KKNN	#((,,q0-	!TTrT 233gSRTSTRTEVEVAV6WWKKKI'')
 <EJ777
IJJ
IJ	 	 	 	 	 	 	 	 	 	 	 	 	foy+FFFFFr   g      ?)r  r  r_  re  rf  r  r  c                   d}t          ||            t          || ||          }	|st          ||| j                   t          ||| j                   |dk    r)| j        t
          j        u rt          d| d| d           | j        t
          j        ur-t          ||| j                   t          ||| j                   nt          |||j                   t          |||           |t          ||           t          |||j                   t          ||| j                   t          |j
        |	j
        k    o)|                                |                                 k    | d|	j
         d|	                                 d|j
         d	|                                 	           ||	                    |j        d
          }n|                    |	           |                                dk    s|                                dk    r|S |                                j
        dd          }
|                    d          }|dk    s|dk    r)|                                                    |           |S |}t%          |||          \  }}}}}t'          ||
d         |f          }t'          |||
d         f          }t)          |
 }t+          |||dk    |
||||||||           |                                                                dd          |                                dd          k    rQ|                                                    |                    |                                j
                             |S )Nsampled_addmmr  Fz(): having beta == z3 not equal to 0.0 with boolean mask is not allowed.z!(): Expects `out` to be of shape z and with nnz equal to z but got out.shape = z and out.nnz = T)copyr   r!   rs   r   r   )r   r   r   r   r,   r   boolr   r2   r*   r#   rZ  r  r\  rU  r   r   r  r   r   r|   r  rC   r^  )rg  r  r  r  r  r_  re  rf  r   input_broadcastedr=   r  r  r   r   r   r  s                    r   r  r  i  s    !'''4VUD$OO 	u|444u|444s{{u{ej88kk$kkk   {%*,,FD%+666FD%+6666FD$*555&vtT::: ---VS$+666FC555I!2!88WSXXZZ5::<<=W R R@Q@W R R->-C-C-E-ER R+.9R REHXXZZR R   ;#&&tz&==CCII'(((99;;!sxxzzQJJJLL&rss+	IIbMM C<<166JJLLd###J 
8FsDRV8W8W5k64 	!a'899 9Q<'899i!CK	
 	
 	
$ %%'',0DDD%%fnnZ5F5F5H5H5N&O&OPPPr   )r_  re  rf  r   c                   d}| j         dd          \  }}|st          ||            t          || |j                   t	          || |j        t          j        f           t          || |           |	                    d          }	| 
                                j         dd          \  }
}t          ||
|f           t          |	dz   | d|	 d           n|j         dd          \  }}	t          || |          }|x|sv|||	fz   }t          |j         |k    d| d|j          d	           t          |                                p'|                    dd                                          d
           ||                    |||	fz             }|                                 dk    r|                                S t'          || |dd|          S )Nbsr_dense_mmr!   rs   r:   z(): dense.size(-1) == z should be divisible by 16z9bsr_dense_mm(): `out` argument has wrong shape, expected z
, but got r    zbsr_dense_mm(): only row-major/col-major `out` arguments are supported, i.e. (out.is_contiguous() or out.transpose(-2, -1).is_contiguous()) should be True.r   r   )r  r  r_  )r#   r   r   r   r2   r,   r   rk  r*   r   r   r@   r   rP   is_contiguousr   r  rZ  r   r  )r   rh  r_  re  rf  r   r   r   _klr   	row_blockr  _krrm  expected_out_shapes                  r   r  r    s     2333 	&VS)))el333U[5:-@@@&vsE:::

2A#&::<<#5bcc#: IyFY	$:;;;F
NNNNN   
 [%FC*>vsE*R*R'?;?!@Aq6!I	//G.G G:=)G G G  
 !!##Ls}}R'<'<'J'J'L'L"   ;//"AQF"JKKC 88::??99;; sCaaSIIIIr   MAX_ROW_NNZTILEc                    t          j        d          }t          j        d          }t          j        d          }| ||z  z   ||z  z   }t          j        |          }t          j        ||z             }||z
  }|dk    rd S t          j        d|
          }|||z  k     }|||z  z   ||z  z   ||z  z   }t          j        ||z   |t	          d                                         t           j                  }t          j        |d          }t          |
|	|
          D ]}||
z  }|||z  k     }t          j        ||z   |t	          d                                         t           j                  }t          j        |d          }t          j	        ||k    ||          }t          j
        ||z
            }t          j        |d          }t          |
|	|
          D ]}||
z  }|||z  k     }t          j        ||z   |t	          d                                         t           j                  }t          j
        ||z
            }|t          j        |d          z  }t          j        ||z   ||z                      |j        j                  |           t          |
|	|
          D ]}||
z  }|||z  k     }t          j        ||z   |t	          d                                         t           j                  }t          j
        ||z
            }t          j        ||z   ||z                      |j        j                  |           d S )Nr
   r  r   r   infr  )r  )r}  r  r  r;  r/   r  r~  r|   rb   whereexpsumr  r,   r  )r  r  r  r  r  r  values_nnz_col_block_strider  r  r  r  r  row_block_offset_pidr  r  r  r  r  
row_aranger  curr_row_values_ptrsrow_tilemax_row_valuer  curr_max_row_valuenumdenoms                              r   _bsr_softmax_kernelr    s    Mq)))	!}!4441--- ')34!M12 	 
 W455
'"9<O"OPP "J.a<<FYq$''
Gi// !I-.%(<<= 9$% 	 7 :-Du
 
 

"RZ.. 	 xa000t[$// 		 		A$J) 33Dw$z1U5\\M  bnn  "$q!9!9!9H 22MCU MM
 fX-..s###t[$// 	) 	)A$J) 33Dw$z1U5\\M  bnn  &M122CRVCa((((EE 	 :-5[Z-899	
 	
 	
 	

 t[$// 	 	A$J) 33Dw$z1U5\\M  bnn  &M122CH$z1u  !1!<==    	 	r   c                    d}t          ||            t          || | j                   |                                 dk    s|                                 dk    r|                                 S | j        dd          \  }}|                                 }|                                 j        dd          \  t          j	        |          nt          j	                  | 
                                                    d                              dd          }|                                                     dd                                          r'|                                                                 }n|                                 }|                    dd                                                              d                              dd                              d|z            }|j        d         |z  f}d }	|dd df         d|d	i}
fd
}t#          ||
||	            |                    d|                              dd          j        |                                 j         }t%          j        | 
                                                                |                                                                 || j        | j                  S )Nbsr_softmaxr   r!   r   rq  rs   .rp  ro  c                 f    t          |          g t          | t          d          R   d S )Ni   )r  r^   rB   )rw   r   r  max_row_nnzr  s     r   r   zbsr_softmax.<locals>.kernel}  sf    % %~6  	 E;''     r   r   )r   r2   r,   rZ  rU  cloner#   r   tritonnext_power_of_2r   r   r   r   r  rD   r^  r   r   r   r   r   )rg  r
  r   r   r   nnzr   r   re   rf   rl   r   r  r  s    `          @@r   r  r  P  s   '''FE5;///::<<1 2 2;;== {2331jjll$||~~3BCC8	9 033KK 0==K))++55a88@@BGG <<>>##B++99;; 	$\\^^))++FF\\^^FR$$Z\\Yq\\WQ^^WRC)O44 	 \!_ii@	 crc"MO	
	 	 	 	 	 	 	 	foy+FFFFNN2y#y99Yr2ellnn*, 	 -  &&((%%''<
 
 
 	
r   r  queryr
  value	attn_mask	dropout_p	is_causalscalec           	          d}t          | | d           t          |d u| d           |J t          |j        t          j        k    | dt          j         d|j         d           t	          ||| j                   t	          ||| j                   t	          ||| j                   t          ||| j                   t          ||| j                   |j        t          j        urt          ||| j                   t          || |
                    dd          d	d
          }||                     d          dk    s|d	k    rt          d
| d| d           |*dt          j        |                     d                    z  n|}	|                                                    |	           t!          |          }t          j        j                            |                                |d           t)          ||          }|S )N_scaled_dot_product_attentionz'(): is_causal == True is not supported.z'(): attn_mask == None is not supported.z(): attn_mask.layout must be z, but got attn_mask.layout == r    r!   rs   r  F)r  re  r   z(): current value of scale == z results in division by zero.r   T)r   inplace)r   r   r   r   r   r   r2   r,   r  r  r   r   mathsqrtr   r  r  nn
functionaldropoutr  )
r  r
  r  r  r  r  r  r   sdpascale_factors
             r   r  r    s:    1)mOOOPPPit#%W%W%WXXX$$$ 00 7 7(-(87 7#,#37 7 7	
 	
 	
 	VS%,///VUEL111VY555FC---FE5;///?%*,,	5;777ucmmB33#5
 
 
 =UZZ^^q00ESLL / / / / /  
 9>q49UZZ^^44445<(((4  ##DKKMMY#MMMD%((r   r   r   r   r  r   r   c                 *   | |z  }t          j        d          }t          j        d          }||z  }||z  }||z  t          j        d|          z   }||z  t          j        d|          z   }t          j        d|          }||d d d f         |z  |d d d f         |z  z   z   } ||d d d f         |	z  |d d d f         |
z  z   z   }!t          j        |||z  z             }"t          j        ||dz   |z  z             }#|"|#k    rd S t          j        ||f|          }$t          |"|#          D ]}%t          j        ||%|z  z             }&t          j        ||%|z  z   |z             }'t          j        | |&|z  z             }(t          j        |!|'|z  z             })|$t          j        |(|)||          z  }$|||z  z   |d d d f         |z  |d d d f         |z  z   z   }*t          j        |*|$                    |j	        j
                             d S Nr   r  r   r  )r  rQ  )r}  r  r;  r  r   rb   r  r  r  r,   r  )+r   r   r   
blocks_ptrblocks_stride_Pblocks_stride_Mblocks_stride_K
others_ptrothers_stride_Qothers_stride_Kothers_stride_Naccumulators_ptraccumulators_stride_Raccumulators_stride_Maccumulators_stride_Npq_offsets_ptrpq_offsets_stridepq_ptrpq_stride_Tpq_stride_1r  r   r   rQ  r   pid_tpidpid_mpid_nrmrnrkA_ptrB_ptrr   r   r  r   r   r   Ar   C_ptrs+                                              r   _scatter_mm2_kernelr=    sk   6 &[1%%%m###r	bV^bi6222V^bi6222Yq!__qqq$wK/)BtQQQwK/,II
 qqq$wK/)BtQQQwK/,II
 W^e.?&??@@W^uqy4E&EEFF88FHff-]CCC	r2 	V 	VA[011A[0;>??AO 3344AO 3344A1*UUUUII ++, 111d733T111W+ 556 	 		%5%;%FGGHHHHHr   r   r   rN  
pq_indicesr   c                    | j         \  }}|j         \  }}t          t          ddz            t          ddz            dd          }	fd}
t          j        t
          j        t          j        t
          j        t          j        t
          j        t          j        t
          j        i|j	                 }d|	vr$|	
                    |t
          j        k               t          |
         || |                     d	          |                     d          |                     d          ||                    d	          |                    d          |                    d          ||                    d	          |                    d          |                    d                              d	          ||                    d	          |                    d          fd
|i|	 d S )Nr:   r   r   r
   )r   r   r   r   c                     j         d         dz
  t          j        | d                   t          j        | d                   z  dfS )Nr   r   r   r   r#   r  cdiv)METAr   r   rN  s    r   rw   z_scatter_mm2.<locals>.grid  sH     #a'AtH~..QX1O1OO r   rQ  rR  r   r  )r#   r   r|   r   r   r}  r~  r.   r  r,   r   r=  rC   )r   r   rN  r>  r   r   r   r   r  r   rw   r  r   r   s     `         @@r   r   r     s    <Aq<Aqr16??3r16??qTU
 
 
	 	 	 	 	 	 	 M2:NBJM2:M2:	

 
 t##KK=BJ#>K???D!MM!MM!MM!MM!MM!MM!""""""a  a  a  )	
 	
* (+	
, -	
 	
 	
 	
 	
r   r   rS  r   r   c                    ||z  }||z  }||z  }t          j        d          }t          j        d          }|| z  } || z  }!||z  }"||"z  }#|#|z  }$t          ||$z
  |          }%|$||%z  z   }&||"z  |%z  }'|&|z  t          j        d|          z   }(|'|z  t          j        d|          z   })t          j        d|          }*||(d d d f         |z  |*d d d f         |z  z   z   }+|| |	z  z   |*d d d f         |
z  |)d d d f         |z  z   z   },t          j        ||!z             }-|rU|-|z  |z  }.|-|z  |z  }/t          j        ||.z             }0t          j        ||.z   dz             }1|/|1z  ||/z
  |0z  z   }2|1|0z
  }3n6t          j        ||!z             }2t          j        ||!z   dz             }4|4|2z
  }3||2z   }5t          j        ||f|          }6|r|+|0|z  z  }+t          |3          D ]f}7t          j        |5          }8t          j        |,|8z             }9t          j        |+          }:|6t          j        |:|9||          z  }6|+|z  }+|5dz  }5gn||2z   };t          |3          D ]}7t          j        |5          }8t          j        |,|8z             }9t          j        |;          }<t          j        |+|<|z  z             }:|;dz  };|5dz  }5|6t          j        |:|9||          z  }6||-z   | |z  z   |(d d d f         |z  |)d d d f         |z  z   z   }=t          j        |=|6	                    |j
        j                             d S r   )r}  r  rB   r;  r  r   rb   r  r  r  r,   r  )>rD  r   r   r   r!  r"  r#  r$  r%  others_stride_Br'  r(  r)  accumulators_stride_Br+  r,  c_indices_ptrr_offsets_ptrp_offsets_ptrq_offsets_ptrrS  r  r   r   r   r   rQ  r   BLOCKS_MBLOCKS_Npid_t_r3  pid_br2  num_pid_in_groupgroup_idfirst_pid_mgroup_size_mr4  r5  r6  r7  r8  r9  r:  r   r   r   r   r   r   r  r   q_ptrr  r  r   r   r;  p_ptrr   r<  s>                                                                 r   _scatter_mm6_kernelrU  B  s	   < '\<<A&&&m###!("%0**+8k1:>>s\12''L8V^bi6222V^bi6222Yq"qqq$wK/)BtQQQwK/,II
 o%&!!!T'{_,r$'{_/LLN 	 GME)** 
	aBAQ2A*++B*Q.//BR7Q;",,Br'CC.//B.233Br'C"Hff-]CCC	 	R/))E3ZZ  GENNGEAI&&GENNRVqMj   	 (
 "B&E3ZZ 	 	GENNGEAI&&GENNGEA$7788

RVqMj   		
 ++, 111d733T111W+ 556	 	 		%5%;%FGGHHHHHr   Tr   r   r   r   force_contiguousc	                 @   |d         }	| j         \  }
}|j         \  }}|j         \  }}}||k    sJ ||	z  |k    sJ fd}t          j        t          j        t          j        t          j        t          j        t          j        t          j        t          j        i|j                 }d|vr$|                    |t          j        k               |	                    d          dk    sJ 	                    d          dk    sJ |	                    d          dk    sJ |	                    d          dk    sJ |rT| 
                                } |
                                }|                                s|
                                }n|}n|}t          |         ||| | 	                    d          | 	                    d          | 	                    d          ||	                    d          |	                    d          |	                    d          ||	                    d          |	                    d          |	                    d          |||fd|i| |r+|                                s|                    |           d S d S d S )	Nr   c                     j         d         z  t          j        | d                   t          j        | d                   z  fS )Nr   r   r   rA  )rC  r   r   r   r   s    r   rw   z_scatter_mm6.<locals>.grid  sD    "Q&BX//&+b$x.2Q2QQ r   rQ  rR  r   r   r
   r  )r#   r   r   r}  r~  r.   r  r,   r   rC   rD   r  rU  r\  )r   r   r   r   r   r   r   r   rV  r   r   r   _Kr   B__Mr   rw   r  accumulators_r   r   r   s      `                @@@r   r   r     s    y/\
B<2q!'
BQwwww'\Qwwww	 	 	 	 	 	 	 	 M2:NBJM2:M2:	

 
 t##KK=BJ#>K???""a''''""a''''""a''''""a''''  	)&&((F&&((F--// - , 7 7 9 9 ,(MD!MM!MM!MM!MM!MM!MM!  ##  ##  ##)	
 	
* (+	
, -	
 	
 	
2  	.L$>$>$@$@ 	.}-----	. 	. 	. 	.r   left_alpha_tiled_col_strideleft_alpha_col_block_strideright_alpha_tiled_row_strideright_alpha_row_block_striders  rt  ru  rv  rw  ry  c7                     |dk    sJ |dk    sJ |dk    sJ |!dk    sJ t          j        d          }7t          j        d          }8t          j        d          }9t          j        d          }:t          j        d          };t          j        |8|9|:|;|5          \  }8}9|||7z  z   ||8z  z   }<t          j        |<          }=t          j        |<|z             }>|>|=z
  }?t          j        d|0          }@t          j        d|2          }At          j        d|1          }B| ||7z  z   ||=z  z   ||@d d d f         z  z   ||Ad d d f         z  z   }C|||7z  z   ||9z  z   ||Ad d d f         z  z   ||Bd d d f         z  z   }D|#|$|7z  z   |%|8z  z   |&|9z  z   |'|@d d d f         z  z   |(|Bd d d f         z  z   }E||	|7z  z   |
|=z  z   }Ft          j        |0|1f|3          }Gt          |?          D ]i}Ht          j        |C          }It          j        |F          }Jt          j        |D||Jz  z             }K|Gt          j        |I|K|4|3          z  }G|C|z  }C|F|
z  }Fj|-s|G|*z  }G|.sK|||7z  z   ||8z  z   ||9z  z   ||@d d d f         z  z   ||Bd d d f         z  z   }L|Gt          j        |L          z  }G|/sK|||7z  z   ||8z  z   | |9z  z   |!|@d d d f         z  z   |"|Bd d d f         z  z   }M|Gt          j        |M          z  }G|,rh|||7z  z   ||8z  z   ||9z  z   ||@d d d f         z  z   ||Bd d d f         z  z   }N|+r|Gt          j        |N          z  }Gn|G|)t          j        |N          z  z  }Gt          j	        |E|G
                    |#j        j                             d S )Nr   r
   r  r   r  r  r  )Or  r  r  r  r  r  r  r  r  r  r  	input_ptrinput_batch_strideinput_tiled_row_strideinput_tiled_col_strideinput_row_block_strideinput_col_block_strider  r  r  r  r  r  left_alpha_ptrleft_alpha_batch_strideleft_alpha_tiled_row_strider]  left_alpha_row_block_strider^  right_alpha_ptrright_alpha_batch_strider_  right_alpha_tiled_col_strider`  right_alpha_col_block_strider  r  r  r  r  r  r  r  rs  rt  ru  rv  rw  rx  rz  ry  r{  rQ  r  r   r  r  r  r  r  r  r  r  r  r  inner_block_aranger  r  r  r  r  r  r  r  r  r  left_alpha_ptrsright_alpha_ptrs
input_ptrssO                                                                                  r   r|  r|  	  si   V +a////*a////+q0000+q0000Mq)))	1---1---A...A...')|=,n(
 (
$}
 ')34!M12 	 
 W455
'"9<O"OPP "J.9Q66Yq/::9Q66 !I-.*,- &(8D(AAB &(:47(CC	D 	  9,-$}45 %'9!!!T''BBC %'7aaa'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8qqq(AAB 	 &23 :-. 	 8]M$B)TTTw 	4 	4A7#455L G$566M' #9M#II K
 kjI! ! ! 
 !22!33 	&%  		9)I56-=> .=> .0@D0II	J
 .0@qqq0IIJ   8 88! 		:*Y67.>? />? /1A!!!T'1JJ	K
 /1A$'1JJK  (8 9 99 	?$y01(=89 )=89 )+;AAAtG+DD	E
 )+;D!!!G+DDE   ? BGJ$7$77   D27:+>+>$>>  	.11*2B2MNNOOOOOr   r   )NNNNNN)NNNNNNNr   )r   )NN)r  FN)T)Cr  osr  	functoolsr   typingr   r   torch._dynamo.utilsr   torch.utils._tritonr   _triton_ops_metar   intgetenvr	   r   r   r   r*   r2   r@   rE   rP   rU   r\   r^   rx   r   r   r   r   r   r   r   r  r  rO  rX  ra  Tensorr  r0   r   rj  r  r  triton.languagelanguager}  jit	constexprr  r  r  r  r  r  r  r/   r  r=  r   rU  r   r|  r6   r   r   <module>r     s    				               ) ) ) ) ) ) * * * * * * & & & & & & .1SBI:A>>. . *
  
      "    (  &U U U      ; ; ;2& & & &0 7  7  7F	 	 	
/ 
/ 
/   >B m2 m2 m2 m2 m2l k k k kl 
` ` ` `FK K K K K K K K\ =>>>]
 ]
 ?>]
B  ;   BD D D DX 

)-*."&MQ& & &<&	& <& && %,'& 
%,	& & uXc]HSM8C=HIJ& 4.& & & &\ 

)-*."&MQf f f<f	f <f &f %,'f 
%,	f f uXc]HSM8C=HIJf 4.f f f fR :<< s%MMM      Z{4 l{4 |	{4
 |{4 {4> <?{4@ LA{4 {4 {4 Z{4z ZAPN |OAPP |QAPR <SAPT LUAPV WAP AP AP ZAPF3G 3G 3Gt &*!QUU U U|UlU lU el#U U 5#x}!LMNU U U Uv '+!QU#5J 5J 5J\5J|5J el#	5J
 5J 5#x}!LMN5J tn5J 5J 5J 5Jn ZV \V lV V V ZVpE
 E
 E
 E
X !%, ,|,\, |, EL)	,
 , , , , , ,\ ZDI<DI<DI <DI* |+DI, -DI. /DI0 L1DI DI DI ZDIL4
4
4
 L4
 L	4

 l4
 4
 4
 4
l ZoI LoI* |+oI, |-oI. /oI0 1oI2 3oI4 L5oI6 L7oI oI oI ZoIt "&X. X.X.X. <X. <	X.
 <X. <X. X. lX. X. X. X. X.t ZIPL &(\MIPP &(\QIPZ ')l[IP^ ')l_IPx \yIPz {IP| l}IP~ <IP@ LAIPB |CIPD |EIPF GIPH <IIPJ LKIPL MIPN OIP IP IP ZIP IP IPX KLM$(!LL $r   