
    Pi-                        d dl mZ d dlZd dlmc mc mZ d dlmZ d dlm	Z	 g dZ
dej        deded	eej        ej        f         fd
Zej        ej        fdededed	ej        fdZdej        dej        deded	eej        ej        ej        f         f
dZdej        dej        dej        dej        deded	eej        ej        f         fdZdej        dededed	eej        ej        f         f
dZdej        dej        dededed	ej        fdZdej        dededed	ej        f
dZdej        dededed	ej        f
dZdej        dedededed	ej        fd Zdej        dedededed	ej        fd!ZdS )"    )TupleN)const)mask_creator)	inject_24marlin_24_workspacepack_to_marlin_24unpack_from_marlin_24wsize_ksize_nreturnc                 $   | j         ||fk    sJ t          |                                                                                                                                           }|| z                                  |                                fS )a  Injects 2:4 sparsity into a weight tensor. The sparsity is applied in a 2:4 ratio, where for every
    group of 4 weights, 2 will be pruned based on their value. The mask will be created based on the
    ranked weight values.

    Args:
        w (torch.Tensor): The weight tensor to inject sparsity into.
        size_k (int): The number of input features.
        size_n (int): The number of output features.
    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The pruned weight tensor and the mask tensor.
    )shaper   tcudabool
contiguous)r
   r   r   masks       t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/sparsity/marlin/__init__.pyr   r      sz     7vv&&&&&  ""''))..00D1H  ""DOO$5$555    out_featuresmin_thread_nmax_parallelc                     | |z  dk    sJ d|  d|             | |z  |z  }t          j        |t           j        d          S )a  Creates a workspace for marlin 2:4 quantization. The workspace is used to coordinate the locks
    during the execution of the kernel.

    Args:
        out_features (int): The number of output features.
        min_thread_n (int, optional): The minimum number of threads per block. Defaults to `MARLIN_24_MIN_THREAD_N`.
        max_parallel (int, optional): The maximum number of parallel threads. Defaults to `MARLIN_24_MAX_PARALLEL`.
    Returns:
        torch.Tensor: The workspace tensor fully initialized with zeros.
    r   zout_features = z, min_thread_n = r   dtypedevice)torchzerosint)r   r   r   max_workspace_sizes       r   r   r   $   sa     ,&!+++G,GGGG ,++ ',6,F;)6JJJJr   q_w_24scalesnum_bits
group_sizec                     | j         \  }}t          | |||          \  }}|dz  }t          ||||          }	t          |||||          }
|	|
|fS )a  Packs the quantized weights and scales into the marlin 2:4 format.

    Args:
        q_w_24 (torch.Tensor): The quantized weight tensor with 2:4 sparsity applied.
        scales (torch.Tensor): The scale tensor.
        num_bits (int): The number of bits used for quantization.
        group_size (int): The group size that was applied during quantization.
    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: The packed quantized weights, the packed scales, and the meta tensor.
       )r   _compress_quantized_24_weight_to_marlin_weights_to_marlin_scales)r"   r#   r$   r%   in_featuresr   q_w_24_compmetain_features_compmarlin_24_q_w_compmarlin_24_ss              r   r   r   :   s      !'K 6\8 K #a' ,%|X  $\:x K {D00r   r,   r-   original_shapec                     |\  }}t          |g|||R  }|dz  }	t          | |	||          }
t          |
||	||          }||fS )a7  Unpacks the quantized weights and scales from the marlin 2:4 format.
    Args:
        q_w_24_comp (torch.Tensor): The packed quantized weights.
        scales (torch.Tensor): The packed scales.
        meta (torch.Tensor): The meta tensor.
        original_shape (torch.Size): The original shape of the weight tensor.
        group_size (int): The group size that was applied during quantization.
        num_bits (int): The number of bits used for quantization.
    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The unpacked quantized weights and scales.
    r'   )_from_marlin_scale_from_marlin_weights_decompress_quantized_24_weight)r,   r#   r-   r1   r%   r$   r+   r   unpacked_scalesr.   unpacked_q_w_24_compunpacked_q_w_24s               r   r	   r	   _   s    & !/K )W.W*WhWWWO"a' 0%|X 
 6d$4lH O O++r   q_24c                    | j         ||fk    sJ d|z  dz
  }|dz   dz  }| |z
  }|                                                                }t          j        |          \  }}|                                                                }||z   }	|                    |j         d         dz  |j         d         dz            }|	|fS )a  Compresses the quantized weights to a 2:4 sparse format. Normalizes the weights over 0
    before compressing them.

    Args:
        q_24 (torch.Tensor): The quantized weight tensor.
        size_k (int): The number of input features.
        size_n (int): The number of output features.
        num_bits (int): The number of bits used for quantization.
    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The compressed quantized weight tensor and the meta tensor.
       r'   r   )r   r   r   utils)sparse_semi_structured_from_dense_cutlassresize_)
r9   r   r   r$   	max_q_valzp
q_24_no_zpq_24_no_zp_compr-   	q_24_comps
             r   r(   r(      s     :&&))))) h!#I
a-A	BJ **,,J!KJWWOT%''))4466O  "$I <<
1*DJqMA,=>>Dd?r   rC   c                    | j         ||fk    sJ |                    |j         d         dz  |j         d         dz            }d|z  dz
  }|dz   dz  }| |z
  }|                                                                }t	          j        ||          }|                                                                }|                    |j         d         dz  |j         d         dz            }||z   }	|	S )a  Decompresses the quantized weights from a 2:4 sparse format and restores the original shape.

    Args:
        q_24_comp (torch.Tensor): The compressed quantized weight tensor in 2:4 sparse format.
        meta (torch.Tensor): The meta tensor.
        size_k (int): The number of input features.
        size_n (int): The number of output features.
        num_bits (int): The number of bits used for quantization.
    Returns:
        torch.Tensor: The decompressed quantized weight tensor.
    r;   r'   r   )r   r>   r   r   r<   'sparse_semi_structured_to_dense_cutlass)
rC   r-   r   r   r$   r?   r@   rB   rA   r9   s
             r   r5   r5      s     ?vv..... <<
1*DJqMA,=>>D h!#I
a-A	B"nO &''))4466O>PTUUJ**,,J <<
1*DJqMA,=>>D ?DKr   q_wc                    t          j        |          \  }}}t          j        | |||          } t          j        |          }|                     t
          j                  } t          j        | j        d         | j        d         |z  ft
          j        | j	                  }t          |          D ]}|| dd|d|f         ||z  z  z  }|                    t
          j                  }|S )a  Converts a quantized and 2:4 sparse format weight tensor to the marlin 2:4 format.

    Args:
        q_w (torch.Tensor): The quantized weight tensor in 2:4 sparse format.
        size_k (int): The number of input features.
        size_n (int): The number of output features.
        num_bits (int): The number of bits used for quantization.
    Returns:
        torch.Tensor: The weight tensor in the marlin 2:4 format.
    r   r;   r   Nr   )r<   get_perms_24marlin_permute_weightsget_pack_factortor   int64r   r   r   rangeint32)	rF   r   r   r$   perm_24_pack_factorq_packedis	            r   r)   r)      s    " &x00MGQ

&sFFG
D
DC '11K &&

C{	1sy|{23kz  H
 ; = =C1>k>)*x!|<<{{{--HOr   rS   c                    t          j        |          \  }}}t          j        |          }|                     t          j                  } t	          j        | j        d         | j        d         |z  ft          j        | j                  }t          |          D ]}| ||z  z	  d|z  dz
  z  |dd|d|f<   |                    t          j
                  }t          j        ||||          }	|	S )a  Converts a weight tensor in the marlin 2:4 format to a regular quantized 2:4 sparse format.

    Args:
        q_packed (torch.Tensor): The weight tensor in the marlin 2:4 format.
        size_k (int): The number of input features.
        size_n (int): The number of output features.
        num_bits (int): The number of bits used for quantization.
    Returns:
        torch.Tensor: The weight tensor in the quantized 2:4 sparse format.
    r   r;   r   NrH   )r<   get_reverse_perms_24rK   rL   r   rM   r   r   r   rN   rO   reverse_marlin_permute_weights)
rS   r   r   r$   rP   rQ   rR   q_w_unpackedrT   q_w_comps
             r   r4   r4      s    .x88MGQ'11K
 {{5;''H;		HN1-;<k  L
 ; 
 
+31+E(]a+
QQQ;&''  ???55L3ffg H Or   c                 Z   t          j        |          \  }}}||k     r5|dk    r/|                     dt          |          f          dd|f         } n.|                     dt          |          f          dd|f         } |                     d|f                                          } | S )a  Converts a scale tensor to the format necessary for marlin.
    Args:
        scales (torch.Tensor): The scale tensor.
        size_k (int): The number of input features.
        size_n (int): The number of output features.
        group_size (int): The group size that was applied during quantization.
        num_bits (int): The number of bits used for quantization.

    Returns:
        torch.Tensor: The scale tensor in the marlin format.
    N)r<   rI   reshapelenr   r#   r   r   r%   r$   rQ   scale_perm_24scale_perm_single_24s           r   r*   r*     s     .3-?-I-I*A}*FzR//S%7%7 899!!!]:JKS)=%>%> ?@@AA##
 ^^RL))4466FMr   c                 `   t          j        |          \  }}}||k     rN|dk    rH|                     dt          |          f          dd|f         } |                     ||z  |f          S |                     dt          |          f          dd|f         } |                     d          S )a  Converts a scale tensor from the marlin format to their original format.

    Args:
        scales (torch.Tensor): The scale tensor in the marlin format.
        size_k (int): The number of input features.
        size_n (int): The number of output features.
        group_size (int): The group size that was applied during quantization.
        num_bits (int): The number of bits used for quantization.
    Returns:
        torch.Tensor: The scale tensor in their original format
    r[   N)r;   r[   )r<   rV   r\   r]   r^   s           r   r3   r3   5  s     .3-G-Q-Q*A}*FzR//S%7%7 899!!!]:JK~~v3V<===S)=%>%> ?@@AA##
 ~~g&&&r   )typingr   r   torchao.sparsity.marlin.utilssparsitymarlinr<   r   torchao.sparsity.utilsr   __all__Tensorr    r   MIN_THREAD_NMAX_PARALLELr   r   Sizer	   r(   r5   r)   r4   r*   r3    r   r   <module>rm      s          - - - - - - - - - - - - / / / / / / / / / / / /  6|6 6*-6
5<%&6 6 6 6* **K KKK K \	K K K K,"1L"1L"1 "1 	"1
 5<u|34"1 "1 "1 "1J$,$,L$, ,$, J	$,
 $, $, 5<%&$, $, $, $,N 
,  # -0 <? 
5<%&       F#|##(<#9<#FI#UX#
\# # # #L#	## # 	#
 \# # # #L$l$$'$14$@C$
\$ $ $ $NL"%/2@COR
\   2'L'"%'/2'@C'OR'
\' ' ' ' ' 'r   