
    PiWE                        d dl mZmZ d dlmZmZ d dlZ ed           G d d                      Z e            Zde	d	e	fd
Z
ej        fdej        de	de	dej        de	d	ej        fdZej        fdej        de	de	dej        de	d	ej        fdZde	d	eej        ee	         ee	         f         fdZde	d	eej        ej        ej        f         fdZd Zd Zd ZdS )    )	dataclassfield)ListTupleNT)frozenc                       e Zd ZU dZeed<   dZeed<   dZeed<    ed           Z	e
e         ed	<    ed
           Ze
e         ed<   dS )Marlin24Constants   TILE   MIN_THREAD_N@   MAX_PARALLELc                  
    ddgS )N       r       q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/sparsity/marlin/utils.py<lambda>zMarlin24Constants.<lambda>   s
    1a& r   )default_factorySUPPORTED_NUM_BITSc                  
    g dS )N)    r   r   r   r   r   r   r   zMarlin24Constants.<lambda>   s    EVEVEV r   SUPPORTED_GROUP_SIZESN)__name__
__module____qualname__r   int__annotations__r   r   r   r   r   r   r   r   r   r	   r	      s         D#NNNL#L# %*E..$I$I$IS	III',u=V=V'W'W'W49WWWWWr   r	   num_bitsreturnc                 B    | t           j        v sJ d|              d| z  S )zCompute the packing factor for a given number of bits.

    Args:
        num_bits (int): Number of bits to pack.
    Returns:
        int: The packing factor.
    zUnsupported num_bits = r   )constr   )r"   s    r   get_pack_factorr&      s3     u////1U81U1U///>r   q_wsize_ksize_npermtilec                    | j         ||fk    sJ ||z  dk    sJ d| d|             ||z  dk    sJ d| d|             |                     ||z  |||z  |f          } |                     d          } |                     ||z  ||z  f          } |                     d|                                f          dd|f                             | j                   } | S )a  Permute weights to 16x64 Marlin tiles.

    Args:
        q_w (torch.Tensor): Quantized weights.
        size_k (int): Number of input features.
        size_n (int): Number of output features.
        perm (torch.Tensor): The computed permutation tensor to be applied.
        tile (int, optional): Tile size. Defaults to `TILE`.
    Returns:
        torch.Tensor: Weight tensor permuted to Marlin tiles.
    r   	size_k = 	, tile = r            r   N)shapereshapepermutenumel)r'   r(   r)   r*   r+   s        r   marlin_permute_weightsr7   '   s   & 9(((((D=AB6BBDBBD=AB6BBDBB ++v~tVt^TB
C
CC
++l
#
#C
++v~v}5
6
6C
++r4::<<(
)
)!!!T'
2
:
:39
E
ECJr   q_w_unpackedreverse_permc                    | j         d         |f||z  | j         d         |z  fk    sJ ||z  dk    sJ d| d|             ||z  dk    sJ d| d|             |                     d|                                f          dd|f                             | j                   }|                    ||z  ||z  ||f          }|                    d          }|                    ||f          }|S )a  Reverse permute weights from 16x64 Marlin tiles.
    Args:
        q_w_unpacked (torch.Tensor): Unpacked quantized weights.
        size_k (int): Number of input features.
        size_n (int): Number of output features.
        reverse_perm (torch.Tensor): The computed reverse permutation tensor to be applied.
        tile (int, optional): Tile size. Defaults to `TILE`.
    Returns:
        torch.Tensor: Weight tensor reverse permuted from Marlin tiles.
    r   r1   r-   r.   r   Nr/   )r3   r4   r6   r5   )r8   r(   r)   r9   r+   q_w_comps         r   reverse_marlin_permute_weightsr<   H   sA   $ q!6*$1%/     D=AB6BBDBBD=AB6BBDBB ##R););)=)=$>??	<gl !!  44t LMMH--H 011HOr   c           	         g }t          d          D ]g }dz  }|dz  }dD ]\}ddz  z  ddz  z  dz   ddz  dz   z  ddz  dz   z  dz   fD ]/}|                    d|z  |dz  z   d|dz  z  z   d|z  z              0]t          d          D ]#|                    fd	|D                        $t          j        |t          j        
          }| dk    r#t          j        g dt          j        
          }nK| dk    r#t          j        g dt          j        
          }n"t          d                    |                     |                    dt          |                    dd|f         
                    d          }g }	t          d          D ]#|	                    fddD                        $g }
t          d          D ]#|
                    fddD                        $||	|
fS )a  Precompute permutations for Marlin24 weight and scale shuffling

    Marlin works on [16*2,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible
    with the tensor-core format that is described here:
    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type

    As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core
    (without the need to use ldmatrix instructions)

    Args:
        num_bits (int): Number of bits to pack.
    Returns:
        Tuple[torch.Tensor, List[int], List[int]]: The weight permutation tensor, scale permutation list, and
        scale permutation list for a single group.
    r   r   r0   )r   r1   r1   r
      r   c                      g | ]
}|d z  z   S )r1   r   ).0pjs     r   
<listcomp>z get_perms_24.<locals>.<listcomp>   s!    777Aa!a%i777r   )dtype)r   r0   r      r1   r2         r/   znum_bits must be 4 or 8, got {}r   Nc                      g | ]
}d z  |z   S r   r   r@   rB   is     r   rC   z get_perms_24.<locals>.<listcomp>   s!    GGG1q519GGGr   )r   r   r1   rF   r0   rE   r2   rG   c                      g | ]
}d z  |z   S rI   r   rJ   s     r   rC   z get_perms_24.<locals>.<listcomp>   s!    !N!N!N!a%!)!N!N!Nr   )r   r1   r0   r2   r   rF   rE   rG   )rangeappendextendtorchtensorint32
ValueErrorformatviewlenr4   )r"   	perm_listperm1colcol_oblockrowr*   
interleave
scale_permscale_perm_singlerK   rB   s              @@r   get_perms_24r`   l   sy     I2YY 9 91fq 	Q 	QEQUQUaQUQYQUQY!#	 Q Q R#X3a37mCa%iOPPPPQ q 	9 	9A77777778888	9 <	555D1}}\":":":%+NNN

	Q\,,,ekBBB

:AA(KKLLL 99RZ))!!!Z-8@@DDDJ1XX I IGGGG.FGGGHHHH#%1XX P P  !N!N!N!N5M!N!N!NOOOO...r   c                     t          |           \  }}}|                                }t          j        |                                          }t          j        |                                          }|||fS )aJ  Reverse permutation for Marlin24 weight and scale shuffling from `get_perms_24`.

    Args:
        num_bits (int): Number of bits to pack.
    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: The reversed weight permutation tensor, scale permutation list and
        scale permutation list for single group.
    )r`   argsortrP   rQ   )r"   perm_24scale_perm_24scale_perm_single_24r*   r^   r_   s          r   get_reverse_perms_24rf      so     4@3I3I0G]0??Dm,,4466J%9::BBDD...r   c                    t          j        d| |          d d d f                             d|          }t          j        d||                              | d          }d}|j        dk    rdnd}||z  |z  |dz  dz  z   |dz  d	z  z   ||z  d	z  dz  dz  z   ||z  dz  d	z  z   }|dz  dk    |dz  dk    z                      t           j                  }|dz  dk    |dz  dk    z                      t           j                  }	|||	z
  z  }|||	z
  z  }d}
||
z  }||
z  }|| z  |
z  ||
z  z   |z                       d
          S )Nr   devicer1   r   r0   r   r
   r   r   r   )rP   arangerepeatitemsizetoint8rU   )m
meta_ncols
meta_dtyperi   dst_rowsdst_colsgroup_xgroup_ytopright
bottomleftr]   cols_majcols_mins                r   *_calculate_meta_reordering_scatter_offsetsrz      s   |Aq000D9@@JOOH|Az&999@@AFFH G'1,,bb"G 	Gg%a<1
	a<A
	 w!#
)B
.	/ w1$
)		*  A"x!|q'89==ejIIHa<1$A):;??
KKJ:%%H:%%H
 J:%H*$HqL:%:(==HNNrRRRr   c                 	   |                                  dk    r%t          d|                                   d          | j        \  }}| j        }t          j        }| j        t          j        k    rt          j        }nZ| j        t          j        t          j	        t          j
        t          j        fv rt          j        }nt          d| j         d          |j        dz  dz  }|dvrt          d	          |t          j        k    r|d
z  dk    rt          d| d          n|dz  dk    rt          d| d          |d|z  z  dk    rt          d| dd|z             | j        t          j
        k    r;d}|                     d||z  |          }|dk                        d          \  }}	}
}n=d}|                     d||z  |          }|dk                        d          x\  }}
\  }	}|||z  z  }||	z  }| |	z  }| |	 z  }|}|}||z  |z  }||	 z  }||                    t          j                  dz  z  }||                    t          j                  dz  z  }| j        t          j
        k    r|                    d|                    d                    }|                    d|                    d                    }t	          j        ||fd                              ||dz            }nC|                    d|                    d          dz                                ||dz            }||dz  z  }|                    d||f                              |          }|dk    rI|d d d d df         |d d d d df         dz  z  |d d d d df         dz  z  |d d d d df         dz  z  }n|dk    r|d d d d df         |d d d d df         dz  z  |d d d d df         dz  z  |d d d d df         dz  z  |d d d d df         d
z  z  |d d d d df         dz  z  |d d d d df         dz  z  |d d d d df         dz  z  }|                    ||z  f          }t+          ||||          }|                    d||                    d                     ||                    ||          fS )Nr0   z)Expected 2-dimensional dense tensor, got -dimensional tensorInvalid datatype z of dense matrixr   r   )r   r   z6Invalid number of elements per meta element calculatedr
   r   zNumber of rows of dense matrix z must be divisible by 16r   z must be divisible by 32z"Number of columns of dense matrix z must be divisible by r   r1   )dimr2      rF      rE      rG      )r~   RuntimeErrorr3   ri   rP   rn   rD   rR   halfbfloat16floatint16rl   rU   unbindrm   int64gather	unsqueezestack	new_emptyrz   scatter_)densero   kri   rq   quadbits_per_meta_elemksparsedense_4m0m1m2m3dense_2rp   expr0expr1expr2bit0bit1bit2bit3idxs0idxs1sparse0sparse1sparsemeta_4meta_nmetameta_reorderedmeta_offsetss                                  r   )sparse_semi_structured_from_dense_cutlassr      s   yy{{aX		XXX
 
 	
 ;DAq\FJ{ej  [

	U^U[%+N	N	N[

Lu{LLLMMM'0149V++STTTU[  r6Q;;M!MMM   
 r6Q;;M!MMM   	A&&'1,,fff!NdJdff
 
 	
 {ek!!**Rgw77!Q,..r22BBB**Rgw77"a<//333BRw!778JH GEC"HEC2#IEDD5=2DB3;DDGGEK((A-.EDGGEK((A-.E{ek!!..U__R%8%899..U__R%8%899gw/R888==aaHHEOOB$7$71$<==BB1a1fMMeqj!F[["j*@ABBEEjQQF""111aaa7OaaaAg!#%aaaAg!#% aaaAg"$& 	 
 1	$	$111aaa7OaaaAg!#%aaaAg!#% aaaAg"$& aaaAg"$	&
 aaaAg"$& aaaAg"$& aaaAg"$& 	 ^^Q^$566N=	:z6 L A|TYYr]];;;N'':6677r   c                 	   |                                  dk    r%t          d|                                   d          | j        \  }}| j        }|                                 dk    r%t          d|                                  d          |j        |k    rt          d| d|j         d          |j        }|t
          j        t
          j        fvrt          d| d	          |j        d
z  dz  }| j        t
          j	        k    rdnd}|j        \  }}	||k    rt          d| d|           |	|z  |z  d|z  k    rt          d| d|	|z  |z  dz   d          t          ||	||          }
t          j        |                    d          d|
                              ||	          }t          j        ||	d|z  f||          }|dk    r|dz  |d d d d df<   |dz	  dz  |d d d d df<   |dz	  dz  |d d d d df<   |dz	  dz  |d d d d df<   |d
z	  dz  |d d d d df<   |dz	  dz  |d d d d df<   |dz	  dz  |d d d d df<   |dz	  dz  |d d d d df<   n$|d
k    r|dz  |d d d d df<   |dz	  dz  |d d d d df<   |dz	  dz  |d d d d df<   |dz	  dz  |d d d d df<   |d
z	  dz  |d d d d df<   |dz	  dz  |d d d d df<   |dz	  dz  |d d d d df<   |dz	  dz  |d d d d df<   |dz	  dz  |d d d d d
f<   |dz	  dz  |d d d d df<   |dz	  dz  |d d d d df<   |d z	  dz  |d d d d d!f<   |d"z	  dz  |d d d d df<   |d#z	  dz  |d d d d d$f<   |d%z	  dz  |d d d d df<   |d&z	  dz  |d d d d d'f<   |                    d          t          j        dd|z  |z  |z  |(          dz                      dd                              dd                              d          z   }t          j        |dz  |z  f| j        |          }| j        t
          j	        k    r+|                    d||                     d                     nd|                    t
          j                                      d||                     t
          j                                      d                     |                    |d|z            S ))Nr0   z*Expected 2-dimensional sparse tensor, got r|   z(Expected 2-dimensional meta tensor, got zExpected meta matrix to be on z device, got matrix on z devicer}   z of meta matrixr   r   zNumber of rows of meta matrix z5 must be equal to number of columns of sparse matrix z#Number of columns of sparse matrix z different from the z<, expected according to the number of columns of meta matrixr   r   )rD   ri   r2   r1   rE   
   rF   r      rG   r
      	   r         r         r         rh   )r~   r   r3   ri   rD   rP   r   rR   rl   r   rz   r   rU   emptyrj   rk   zerosr   r4   r   )r   r   ro   r   ri   rq   r   r   
meta_nrowsrp   r   r   meta_2dense_offsetsr   s                  r   'sparse_semi_structured_to_dense_cutlassr   j  s~   zz||qZZZZ
 
 	
 <DAq]Fq  `~7I7I7K7K```
 
 	
 &&jVjjNLajjj
 
 	
  %J%+u{333JzJJJKKK'0149<5;..aaAG+1J
QqZqqnoqq
 
 	
 G44A==I! I IV]I]`vIvz{I{ I I I
 
 	
 >	:z6 L <++B//LAAFFq*UUD [	
J223  F
 ""+qqq!!!Qw19,qqq!!!Qw19,qqq!!!Qw19,qqq!!!Qw19,qqq!!!Qw2:-qqq!!!Qw2:-qqq!!!Qw2:-qqq!!!Qw	1	$	$+qqq!!!Qw19,qqq!!!Qw19,qqq!!!Qw19,qqq!!!Qw19,qqq!!!Qw2:-qqq!!!Qw2:-qqq!!!Qw2:-qqq!!!Qw2:-qqq!!!Qw2:-qqq!!!Qw BJ$.qqq!!!Rx BJ$.qqq!!!Rx BJ$.qqq!!!Rx BJ$.qqq!!!Rx BJ$.qqq!!!Rx BJ$.qqq!!!RxKKOOQA	W,V<<<q@
d2qkk&&A,,ttBxx(M KQFLHHHE|u{""q-););<<<<

5:''}fkk%*55::2>>	
 	
 	
 ::aQr   )dataclassesr   r   typingr   r   rP   r	   r%   r    r&   r   Tensorr7   r<   r`   rf   rz   r   r   r   r   r   <module>r      s.   ) ( ( ( ( ( ( (          $X X X X X X X X 	
c 
c 
 
 
 
$ 
 	  ,	
  \   L 
! !,!! ! ,	!
 ! \! ! ! !H5/3 5/5tCy$s))K#L 5/ 5/ 5/ 5/p//
5<u|34/ / / /BS S SB}8 }8 }8H^  ^  ^  ^  ^ r   