
    Pi              &       `   d dl Z d dlmZ d dlZd dlmZ ej                            dd          Ze                    d           e                    d           e                    d           e                    d	           e                    d
           e                    d           e                    d           e                    d           e                    d           e                    d           e                    d           e                    d           e                    d           e                    dej	        j
        j        g           e                    dej	        j
        j        g           e                    d           e                    d           e                    d           e                    d           e                    d           e                    d           d Zd Ze j        d             Z	 dd ed!ed"ed#ed$ed%ed&efd'Z ed(          	 dd ed!ed"ed#ed$ed%ed&efd)            Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd-ed.ed/ed0ee         d1ed2ed3ee         d4ed5ed6ed7ed8ed9ed:ed;ed<ed=ed&ef$d>Z ed?          	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd-ed.ed/ed0ee         d1ed2ed3ee         d4ed5ed6ed7ed8ed9ed:ed;ed<ed=ed&ef$d@            ZdAedBed&efdCZ edD          dAedBed&efdE            ZdAedFedGedBed&ef
dHZ edI          dAedFedGedBed&ef
dJ            ZdKedLedMedNedOedPedQedRedSed&efdTZ edU          dKedLedMedNedOedPedQedRedSed&efdV            ZdKedLedWedXedYedOedQedRedSed&efdZZ ed[          dKedLedWedXedYedOedQedRedSed&efd\            Z	 	 dd]ed^ed_ed`edaee         dbeej                 d&efdcZ edd          	 	 dd]ed^ed_ed`edaee         dbeej                 d&efde            Z	 	 dd]ed^ed_ed`edaee         dbeej                 d&efdfZ edg          	 	 dd]ed^ed_ed`edaee         dbeej                 d&efdh            Z	 	 dd]ed^ed_edied`edaee         dbeej                 d&efdjZ edk          	 	 dd]ed^ed_edied`edaee         dbeej                 d&efdl            Zd_ed&eeffdmZ edn          d_ed&eeffdo            Z	 	 ddpedqedredsed&eeff
dtZ  edu          	 	 ddpedqedredsefdv            Z	 	 	 	 	 ddzedMed{ed:ee         d|ee         d}ed~eded&efdZ! ed          	 	 	 	 	 ddzedMed{ed:ee         d|ee         d}ed~edefd            Zdedededed&ef
dZ" ed          dedededed&ef
d            Zdededededededaee         dee         dbeej                 d&efdZ# ed          dededededededaee         dee         dbeej                 d&efd            Z e j                    d             Z$d Z% ed          dedededefd            Z&dedededefdZ' ed          dedededefd            Z(d_ededed&efdZ) ed          d_ededed&efd            Zd]ededed_ededededaee         dbej        fdZ* ed          d]ededed_ededededaee         dbej        d&efd            Z ed          dedededed<edededbej        d&efd            Zd_eded&efdZ+ ed          d_eded&efd            Zd]eded_ededaee         dbej        fdZ, ed          d]eded_ededaee         dbej        d&efd            ZdS )    N)Optional)TensortorchaoFRAGMENTzuquant_llm_linear(int EXPONENT, int MANTISSA, Tensor _in_feats, Tensor _weights, Tensor _scales, int splitK) -> TensorzMunpack_tensor_core_tiled_layout(Tensor packed_w, int inner_k_tiles) -> Tensorzzdequantize_tensor_core_tiled_layout(Tensor packed_w, Tensor scales_and_zeros, int group_size, int inner_k_tiles) -> Tensorzmarlin_24_gemm(Tensor x, Tensor weight_marlin, Tensor meta, Tensor s, Tensor workspace, int bits, int size_m, int size_n, int size_k) -> Tensorzmarlin_qqq_gemm(Tensor x, Tensor weight_marlin, Tensor s_tok, Tensor s_ch, Tensor s_group, Tensor workspace, int size_m, int size_n, int size_k) -> Tensorzrowwise_scaled_linear_cutlass_s8s4(Tensor input, Tensor input_scale, Tensor weight, Tensor weight_scale, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensorzrowwise_scaled_linear_cutlass_s4s4(Tensor input, Tensor input_scale, Tensor weight, Tensor weight_scale, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensorzrowwise_scaled_linear_sparse_cutlass_f8f8(Tensor input, Tensor input_scale, Tensor weight, Tensor weight_meta, Tensor weight_scale, Tensor? bias=None, ScalarType? out_dtype=None) -> TensorzLto_sparse_semi_structured_cutlass_sm9x_f8(Tensor weight) -> (Tensor, Tensor)zsparse24_sm90_sparsify(Tensor input, str metadata_fmt, str activation, str sp_selection_algo, *, ScalarType? dtype = None, Tensor? scale=None) -> (Tensor, Tensor)zsparse24_fp8_sm90_cutlass_gemm(Tensor a, Tensor a_mdata, Tensor b, *, Tensor? a_scale = None, Tensor? b_scale = None, int swizzle_size=8, str swizzle_axis='n', int sm_count=128) -> Tensorz\swizzle_mm(Tensor mat1, Tensor mat2, bool mat1_is_swizzled, bool mat2_is_swizzled) -> Tensorzswizzle_scaled_mm(Tensor mat1, Tensor mat2, bool mat1_is_swizzled, bool mat2_is_swizzled, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None) -> TensorzImx_fp8_bf16(Tensor a, Tensor b, Tensor a_scale, Tensor b_scale) -> Tensor)tagszImx_fp4_bf16(Tensor a, Tensor b, Tensor a_scale, Tensor b_scale) -> Tensora6  qscaled_dot_product(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, float? scale=None, float q_scale=1.0, int q_zp=0, float k_scale=1.0, int k_zp=0, float v_scale=1.0, int v_zp=0, float a_scale=1.0, int a_zp=0, float o_scale=1.0, int o_zp=0) -> Tensorzida8w4_linear_prepack_cpu(Tensor weight, Tensor scales, Tensor qzeros) -> (Tensor, Tensor, Tensor, Tensor)zda8w4_linear_cpu(Tensor input, Tensor input_scales, Tensor input_qzeros, Tensor weight, Tensor weight_scales, Tensor weight_qzeros, Tensor compensation, Tensor? bias, ScalarType output_dtype) -> Tensorz_scaled_embedding_bag(Tensor qweight, Tensor indices, Tensor offsets, Tensor weight_scale, float o_scale, int mode, bool include_last_offset, ScalarType output_dtype) -> TensorzKfloat8_linear_prepack_cpu(Tensor weight, Tensor scales) -> (Tensor, Tensor)zfloat8_linear_cpu(Tensor input, Tensor input_scales, Tensor weight, Tensor weight_scales, Tensor? bias, ScalarType output_dtype) -> Tensorc                       fd}|S )Nc                 T    t          j                                       |           S N)torchlibraryregister_fakefuncnames    _/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/ops.py	decoratorz%register_custom_op.<locals>.decoratorS   s#    }**d955d;;;     r   r   s   ` r   register_custom_opr   R   s$    < < < < < r   c                       fd}|S )Nc                 X    t          j                             d          |           S )Nr   )mutates_args)r   r   	custom_opr   s    r   r   z*register_custom_op_impl.<locals>.decoratorZ   s(    }&&$yr&BB4HHHr   r   r   s   ` r   register_custom_op_implr   Y   s)    I I I I I r   c                      t           j                            t           j                                                  } | j        dz  | j        z   }|S )N
   )r   cudaget_device_propertiescurrent_devicemajorminor)device_propscompute_capabilitys     r   cached_compute_capabilityr%   `   sA    :33EJ4M4M4O4OPPL%+b0<3EEr      EXPONENTMANTISSA	_in_feats_weights_scalessplitKreturnc                     t                      t          j        dk    fd           t          j        j        j                            | |||||          S )as  
    Quant-LLM linear layer A @ W.T. See https://arxiv.org/abs/2401.14112 for more details.

    Arguments
        EXPONENT: number of exponent bits
        MANTISSA: number of mantissa bits
        _in_feats: input activations in FP16
        _weights: packed Floatx weights
        _scales: scale
        splitK: split K

    Returns
        output of linear layer
    K   c                      d  S )NzLquant_llm_linear requires sm7.5+ GPU architecture, but current device has smr   )r$   s   r   <lambda>z"quant_llm_linear.<locals>.<lambda>   s    s_qss r   )r%   r   _checkopsr   quant_llm_lineardefault)r'   r(   r)   r*   r+   r,   r$   s         @r   r4   r4   g   sh    . 344	Lb ssss   9-55(Ix&  r   ztorchao::quant_llm_linearc                 H   t          j                                        dk    fd           t          j        j        t           j        t           j        fv fd           t          j                                        dk    fd           t          j        j        t           j        u fd           t          j                                        dk    fd           t          j        j        t           j        t           j        fv fd           j        \  }}j        \  }}	d| z   |z   }
t          j        |d	z  |
z  j        d         k    d
            t          j        |j        d         k    d                                ||f          S )N   c                  4    d                                   dS )Nz!input should be a 2d tensor, got Ddimr)   s   r   r1   z_.<locals>.<lambda>   s    FIMMOOFFF r   c                      d j          S )Nz!weight must be FP16 or BF16, got dtyper<   s   r   r1   z_.<locals>.<lambda>   s    EIOEE r   c                  4    d                                   dS )Nz"weight should be a 2d tensor, got r9   r:   r*   s   r   r1   z_.<locals>.<lambda>   s    FX\\^^FFF r   c                      d j          S )Nzweight must be UINT8, got r>   rA   s   r   r1   z_.<locals>.<lambda>       =X^== r   r&   c                  4    d                                   dS )Nz!scale should be a 2d tensor, got r9   r:   r+   s   r   r1   z_.<locals>.<lambda>   s    $X$X$X$X r   c                      d j          S )Nz scale must be FP16 or BF16, got r>   rE   s   r   r1   z_.<locals>.<lambda>   s    B7=BB r      c                      dS NzDimensions mismatchedr   r   r   r   r1   z_.<locals>.<lambda>   s    @W r   r   c                      dS rI   r   r   r   r   r1   z_.<locals>.<lambda>   s    1H r   )	r   r2   r;   r?   float16bfloat16uint8shape	new_empty)r'   r(   r)   r*   r+   r,   BSICOC_N_BITSs     ```      r   rS   rS      s    
L1FFFF   
LEM5>::EEEE   
L!FFFF   
L%+%====   
LXXXX   
L%-88BBBB  
 _FBNEB\H$F	Lq6!X^A%668W8WXXX	Lw}Q'')H)HIIIBx(((r           F      ?querykeyvalue	attn_mask	dropout_p	is_causalscaleq_scaleq_zpk_scalek_zpv_scalev_zpa_scalea_zpo_scaleo_zpc                 t    t           j        j        j                            | |||||||||	|
||||||          S )a  
    Quantized SDPA with quantized inputs and outputs.
    Arguments
        query: input query tensor,
        key: input key tensor,
        value: input value tensor,
        attn_mask: attention mask tensor,
        dropout_p: dropout probability,
        is_causal: causal flag,
        scale: scaling factor applied prior to softmax,
        q_scale: scale for query from linear quantization,
        q_zp: zero point for query from linear quantization,
        k_scale: scale for key from linear quantization,
        k_zp: zero point of key from linear quantization,
        v_scale: zero point for value from linear quantization,
        v_zp: zero point of value from linear quantization,
        a_scale: scale for attention from softmax quantization,
        a_zp: zero point for attention from softmax quantization,
        o_scale: scale for output from linear quantization,
        o_zp: zero point for output from linear quantization,
    Returns
        output of quantized SDPA
    )r   r3   r   qscaled_dot_productr5   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   s                    r   ri   ri      sX    T 9088#  r   ztorchao::qscaled_dot_productc                     | S r
   r   rj   s                    r   rS   rS      s	    ( Lr   packed_winner_k_tilesc                 X    t           j        j        j                            | |          S )a   
    Unpacks weights that were packed with `torch.ops.aten._convert_weight_to_int4pack` to original tensor of shape `N x K`.

    Assumes that the packed weights were generated with `torch.ops.aten._convert_weight_to_int4pack` with `inner_k_tiles = 2 | 4 | 8`"

    Args:
        packed_w: torch.tensor: 4D tensor with shape (N / 8) x (K / (inner_k_tiles * 16)) x 32 x inner_k_tiles, dtype is torch.int32
        inner_k_tiles: int

    Returns:
        torch.tensor of shape is N x K, dtype is torch.int32

    rl   rm   )r   r3   r   unpack_tensor_core_tiled_layoutr5   ro   s     r   rp   rp     s/     9<DD E   r   z(torchao::unpack_tensor_core_tiled_layoutc                 n    t          j                                         dk     fd           t          j         j        t           j        u  fd           t          j        |dk    p|dk    p|dk    d            t          j                             d          dk    d            t          j                             d	          |dz  k    d
                                 d          dz  }                     d          |z  dz  }t          j        ||ft           j         j                  S )N   c                  4    d                                   dS )Nz*packed weight should be a 42d tensor, got r9   r:   rl   s   r   r1   z_.<locals>.<lambda>  s    NX\\^^NNN r   c                      d j          S Nzweight must be INT32, got r>   rt   s   r   r1   z_.<locals>.<lambda>#  rC   r   r7   rG   c                      dS Nz inner_k_tiles must be 2, 4, or 8r   r   r   r   r1   z_.<locals>.<lambda>'      2 r       c                      dS Nz#packed weight must have 32 at dim 2r   r   r   r   r1   z_.<locals>.<lambda>)      1V r      c                      dS Nz0packed weight must have inner_k_tiles/2 at dim 3r   r   r   r   r1   z_.<locals>.<lambda>,      B r   r   r&      r?   device)r   r2   r;   r?   int32sizeemptyr   )rl   rm   NKs   `   r   rS   rS     sG   	L!NNNN   
L%+%====   
LFmq0FMQ4F22   
Lq!!R')V)VWWW	LaMA--BB   	a1Aa=(2-A;1vU[IIIIr   scales_and_zeros
group_sizec                 Z    t           j        j        j                            | |||          S )a  
    Dequantizes by:
    - Unpacking weights that were packed with `torch.ops.aten._convert_weight_to_int4pack` to original tensor of shape `N x K`
    - Upcasting to bfloat16
    - Dequantizing with the scales_and_zeros that were packed with `torchao.quantization.utils.pack_tinygemm_scales_and_zeros`

    Assumes:
    - packed weights were generated with `torch.ops.aten._convert_weight_to_int4pack` with `inner_k_tiles = 2 | 4 | 8`"
    - packed scales_and_zeros were generated with `torchao.quantization.utils.pack_tinygemm_scales_and_zeros`
    - qGroupSize is 32 | 64 | 128 | 256

    Args:
        packed_w: torch.tensor: 4D tensor with shape `(N / 8) x (K / (inner_k_tiles * 16)) x 32 x inner_k_tiles / 2`, dtype is torch.int32
        scales_and_zeros: torch.tensor: 3D tensor with shape `numQGroups x N x 2`, dtype is torch.bfloat16 where numQGroups is K / qGroupSize
        group_size: int
        inner_k_tiles: int

    Returns:
        torch.tensor of shape is N x K, dtype is torch.bfloat16

    )r   r3   r   #dequantize_tensor_core_tiled_layoutr5   )rl   r   r   rm   s       r   r   r   4  s.    0 9@HH"J  r   z,torchao::dequantize_tensor_core_tiled_layoutc                     t          j                                         dk     fd           t          j         j        t           j        u  fd           t          j        |dk    p|dk    p|dk    d            t          j                             d          dk    d            t          j                             d	          |dz  k    d
                                 d          dz  }                     d          |z  dz  }t          j        |j        t           j        u d            t          j        |                                d	k    d            t          j        |dk    p|dk    p|dk    p|dk    d            t          j        |                    d          ||z  k    d            t          j        |                    d          |k    d            t          j        |                    d          dk    d            t          j        ||ft           j         j                  S )Nrr   c                  4    d                                   dS )Nz)packed weight should be a 4d tensor, got r9   r:   rt   s   r   r1   z_.<locals>.<lambda>X  s    MHLLNNMMM r   c                      d j          S rv   r>   rt   s   r   r1   z_.<locals>.<lambda>\  rC   r   r7   rG   c                      dS rx   r   r   r   r   r1   z_.<locals>.<lambda>`  ry   r   rz   c                      dS r|   r   r   r   r   r1   z_.<locals>.<lambda>b  r}   r   r~   c                      dS r   r   r   r   r   r1   z_.<locals>.<lambda>e  r   r   r   r&   r   c                      dS )Nz!scales_and_zeros must be bfloat16r   r   r   r   r1   z_.<locals>.<lambda>m  s    3 r   c                      dS )Nz9scales_and_zeros must be 3D, got {scales_and_zeros.dim()}r   r   r   r   r1   z_.<locals>.<lambda>q  s    K r   @         c                      dS )Nz&qGroupSize must be 32, 64, 128, or 256r   r   r   r   r1   z_.<locals>.<lambda>u  s    8 r   c                      dS )Nz3scales_and_zeros must have K // qGroupSize at dim 0r   r   r   r   r1   z_.<locals>.<lambda>y  s    E r   c                      dS )Nz%scales_and_zeros must have N at dim 1r   r   r   r   r1   z_.<locals>.<lambda>|      /V r   c                      dS )Nz%scales_and_zeros must have 2 at dim 2r   r   r   r   r1   z_.<locals>.<lambda>  r   r   r   )	r   r2   r;   r?   r   r   rL   r   r   )rl   r   r   rm   r   r   s   `     r   rS   rS   Q  s   
 
L!MMMM   
L%+%====   
LFmq0FMQ4F22   
Lq!!R')V)VWWW	LaMA--BB   	a1Aa=(2-A 
L%.033   
L!#KK   
LbVJ",V
c0AVZSVEV88   
La  AO3EE   
La  A%'V'V   
La  A%'V'V   ;1vU^HOLLLLr   xweight_marlinmetas	workspacebitssize_msize_nsize_kc	                 d    t           j        j        j                            | ||||||||	  	        S )a  
    Sparse Marlin 2:4 matrix multiplication. Reference: https://github.com/IST-DASLab/Sparse-Marlin/tree/main
    Args:
        x: input matrix of shape `(n, k/2)` in column-major layout.
        weight_marlin: weight matrix of original shape `(m, k)` in Marlin format; see `Layer.pack()`.
        meta: metadata information for 2:4 sparsity.
        s: scales of shape `(n / groupsize / 2, m)`.
        workspace: tensor with at least `m / 128 * max_par` entries that are all zero.
        bits: number of bits for quantization.
        size_m: number of rows in input matrix.
        size_n: number of columns in weight matrix.
        size_k: number of columns in input matrix.
    Returns:
        output matrix of shape `(n, m)` in column-major layout.
    )r   r3   r   marlin_24_gemmr5   )	r   r   r   r   r   r   r   r   r   s	            r   r   r     s7    4 9+33	=$9dFFF  r   ztorchao::marlin_24_gemmc	                 f    ddd}	t          j        dk    pdk    fd           dz  }
t          j                             d          k     fd	           t          j                             d
          k     fd           t          j        z  dk    fd           t          j        z  dz                      d          k    fd           t          j                            d
          k    fd           t          j                            d
          z  dk    fd                               d
          z  |
z  t          j        k    fd           t          j                            d          dz  dz  dz  k    fd           t          j                            d
          dz  k    fd           t          j         j        d            t          j                                         d            t          j        j        d            t          j                                        d            t          j        j        d            t          j                                        d            t          j        j        d            t          j                                        d            d                    d          d
k    rPt          j                            d          z  dk    fd                               d          z  dz  t          j        dk    pdk    fd           t          j        z  dk    fd           z  |	z  t          j                                        k    fd            t          j                             d                              d
          f j         j        !          S )"Nr   r   r   rr   rG   c                      d  S )Nznum_bits must be 4 or 8. Got = r   )r   s   r   r1   z_.<locals>.<lambda>  s    (P$(P(P r   rz   r   c                  :    d                     d           d  S NzShape mismatch: x.size(0) = r   , size_m = r   r   r   s   r   r1   z_.<locals>.<lambda>  "    MqvvayyMMVMM r   r&   c                  :    d                     d           d  S NzShape mismatch: x.size(1) = r&   , size_k = r   r   r   s   r   r1   z_.<locals>.<lambda>  r   r   c                      d d  S N	size_k = ! is not divisible by tile_size = r   	TILE_SIZEr   s   r   r1   z_.<locals>.<lambda>      PFPPYPP r   r7   c                  @    d                     d           d d  S Nz(Shape mismatch: weight_marlin.size(0) = r   r   z, tile_size = r   r   r   r   s   r   r1   z_.<locals>.<lambda>  0    ~=;M;Ma;P;P~~]c~~s|~~ r   c                  :    d                      d           d S )Nzs.size(1) = r&   , size_n = r   )r   r   s   r   r1   z_.<locals>.<lambda>  s"    %RAFF1II%R%R&%R%R r   c                  :    d                     d           d  S Nzweight_marlin.size(1) = r&   r   r   r   r   s   r   r1   z_.<locals>.<lambda>  '    n=+=+=a+@+@nnclnn r   c                      d d  S )N	size_n = , actual_size_n = r   actual_size_nr   s   r   r1   z_.<locals>.<lambda>  s    EFEEmEE r   c                  L    d                      d           ddz  dz  dz   S )Nzmeta.size(0) = r   z is not size_k / 8 / 2 / 2 = rG   r7   r   )r   r   s   r   r1   z_.<locals>.<lambda>  s3    d$))A,,ddVWX[\]M]abMbdd r   c                  @    d                      d           ddz   S )Nzmeta.size(1) = r&   z is not size_n * 2 = r7   r   )r   r   s   r   r1   z_.<locals>.<lambda>  s&    Q$))A,,QQVaZQQ r   c                      dS Nzx is not on GPUr   r   r   r   r1   z_.<locals>.<lambda>      $5 r   c                      dS Nzx is not contiguousr   r   r   r   r1   z_.<locals>.<lambda>      ,A r   c                      dS Nzweight_marlin is not on GPUr   r   r   r   r1   z_.<locals>.<lambda>      0M r   c                      dS Nzweight_marlin is not contiguousr   r   r   r   r1   z_.<locals>.<lambda>      /P r   c                      dS )Nzmeta is not on GPUr   r   r   r   r1   z_.<locals>.<lambda>      '; r   c                      dS )Nzmeta is not contiguousr   r   r   r   r1   z_.<locals>.<lambda>      /G r   c                      dS )Nzs is not on GPUr   r   r   r   r1   z_.<locals>.<lambda>  r   r   c                      dS )Nzs is not contiguousr   r   r   r   r1   z_.<locals>.<lambda>  r   r   c                  :    d d                      d           S )Nr   z! is not divisible by s.size(0) = r   r   )r   r   s   r   r1   z_.<locals>.<lambda>  s#    TTTPQTT r   c                      d  S NzUnexpected groupsize = r   	groupsizes   r   r1   z_.<locals>.<lambda>  s    5)55 r   c                      d d  S Nr   z$ is not divisible by min_thread_n = r   MIN_THREAD_Nr   s   r   r1   z_.<locals>.<lambda>
      VFVVVV r   c                  8    d                                  d  S )Nzworkspace.numel =  is below min_workspace_size = numelmin_workspace_sizer   s   r   r1   z_.<locals>.<lambda>  s#    kY__%6%6kkWikk r   r   )	r   r2   r   is_cudais_contiguousr   r   r?   r   )r   r   r   r   r   r   r   r   r   MAX_PARALLELISMpack_factorr   r   r   r   r   s   `````````  @@@@@r   rS   rS     s    ILO 
L	TQY P P P P   *K 
L!&&))MMMMM   
L!&&))MMMMM   
LaPPPPP   
L	9		!m&8&8&;&;;~~~~~~   
L	q		VRRRRR   
L1	)Q.nnnnn  
 #''**i7;FM	L-EEEEE   
L		!!q(A--ddddd   
L		!
"QQQQQ   
L55666	L""$A$ABBB 
L&(M(MNNN	L##%%'P'P  
 
L;;<<<	L##%%'G'GHHH 
L55666	L""$A$ABBB Ivvayy1}}QVVAYY!#TTTTT	
 	
 	
 affQii'	a		LR*9?5555   
L"VVVVV   !L0OC	L//kkkkk  
 ;q		166!99-QWQXNNNNr   s_toks_chs_groupc	                 d    t           j        j        j                            | ||||||||	  	        S )a  
    Marlin for W4A8 mixed precision matrix multiplication.
    See https://arxiv.org/pdf/2406.09904 for more details.
    Reference: https://github.com/HandH1998/QQQ/tree/main
    Args:
        x: `torch.int8` input matrix of shape `(m, k)` in standard row-major layout.
        weight_marlin: `torch.int32` weight matrix of original shape `(k, n)` in the specified format.
        s_tok: `torch.float32` activation per-token quantization scales of shape `(m, 1)`.
        s_ch: `torch.float32` weight per-channel quantization scales of shape `(1, n)`.
        s_group: `torch.half` weight per-group quantization scales of shape `(m / groupsize, n)`, it should be empty when group_size != -1.
        workspace: `torch.int32` tensor with at least `n / 128 * max_par` entries that are all zero.
        size_m: number of rows in input matrix.
        size_n: number of columns in weight matrix.
        size_k: number of columns in input matrix.
    Returns:
        `torch.half` out matrix of shape `(m, n)` in standard row-major layout.
    )r   r3   r   marlin_qqq_gemmr5   )	r   r   r   r   r   r   r   r   r   s	            r   r   r     s7    8 9,44	=%w	666  r   ztorchao::marlin_qqq_gemmc	                 "	    ddd}	d}
t          j                             d          k     fd           t          j                                        k    fd           t          j                             d          k     fd           t          j        z  dk    fd	           t          j        z                      d          k    fd
                                           dk    rdn                    d          z  t          j        dv fd           t          j                                        k    fd           t          j                            d          z  dk    fd           dk    rct          j                            d          k    fd           t          j                            d          z  dk    fd                               d          z  |
z  t          j        k    fd           t          j         j        d            t          j                                         d            t          j        j        d            t          j                                        d            t          j        j        d            t          j                                        d            t          j        j        t           j        k    d            t          j        j        d            t          j                                        d            t          j        j        t           j        k    d            t          j        j        d            t          j                                        d            t          j        j        t           j        k    d           t          j        z  dk    fd            z  |	z  t          j                                        k    fd!           t          j	        ft           j         j
        "          S )#Nr   r   rG   r   c                  :    d                     d           d  S r   r   r   s   r   r1   z_.<locals>.<lambda>J  r   r   c                  8    d                                   d S )Nz Shape mismatch: s_tok.numel() = r   r   )r   r   s   r   r1   z_.<locals>.<lambda>N  s     U5;;==UUVUU r   r&   c                  :    d                     d           d  S r   r   r   s   r   r1   z_.<locals>.<lambda>T  r   r   c                      d d  S r   r   r   s   r   r1   z_.<locals>.<lambda>X  r   r   c                  @    d                     d           d d  S r   r   r   s   r   r1   z_.<locals>.<lambda>\  r   r   r   )r   r   c                      d  S r   r   r   s   r   r1   z_.<locals>.<lambda>a  s    1V91V1V r   c                  8    d                                   d S )NzShape mismatch: s_ch.numel() = r   r   )r   r   s   r   r1   z_.<locals>.<lambda>f  s     S$**,,SS6SS r   c                  :    d                     d           d  S r   r   r   s   r   r1   z_.<locals>.<lambda>j  r   r   c                  :    d                      d           d S )Nz"Shape mismatch: s_group.size(1) = r&   r   r   )r   r   s   r   r1   z_.<locals>.<lambda>o  s#    ]a]]U[]] r   c                  :    d d                      d           S )Nr   z' is not divisible by s_group.size(0) = r   r   )r   r   s   r   r1   z_.<locals>.<lambda>s  s#    ```w||\]`` r   c                      d d  S )NzShape mismatch: size_n = r   r   r   s   r   r1   z_.<locals>.<lambda>y  s    UFUUmUU r   c                      dS r   r   r   r   r   r1   z_.<locals>.<lambda>}  r   r   c                      dS r   r   r   r   r   r1   z_.<locals>.<lambda>~  r   r   c                      dS r   r   r   r   r   r1   z_.<locals>.<lambda>  r   r   c                      dS r   r   r   r   r   r1   z_.<locals>.<lambda>  r   r   c                      dS )Nzs_tok is not on GPUr   r   r   r   r1   z_.<locals>.<lambda>  s    (= r   c                      dS )Nzs_tok is not contiguousr   r   r   r   r1   z_.<locals>.<lambda>  s    0I r   c                      dS )Nzs_tok's dtype is not float32r   r   r   r   r1   z_.<locals>.<lambda>  s    7U r   c                      dS )Nzs_ch is not on GPUr   r   r   r   r1   z_.<locals>.<lambda>  r   r   c                      dS )Nzs_ch is not contiguousr   r   r   r   r1   z_.<locals>.<lambda>  r   r   c                      dS )Nzs_ch's dtype is not float32r   r   r   r   r1   z_.<locals>.<lambda>  s    6S r   c                      dS )Nzs_group is not on GPUr   r   r   r   r1   z_.<locals>.<lambda>  s    *A r   c                      dS )Nzs_group is not contiguousr   r   r   r   r1   z_.<locals>.<lambda>  s    2M r   zs_group's dtype is not float16c                      d d  S r   r   r   s   r   r1   z_.<locals>.<lambda>  r   r   c                  8    d                                  d  S )Nzworkspace.numel() = r   r   r   s   r   r1   z_.<locals>.<lambda>  s#    my'8'8mmYkmm r   r   )r   r2   r   r   r   r   r?   float32rK   r   r   )r   r   r   r   r   r   r   r   r   r   PACK_FACTORr   r   r   r   r   s   `````````  @@@@@r   rS   rS   6  s    ILOK 
L!&&))MMMMM   
L%++--UUUUU   
L!&&))MMMMM   
LaPPPPP   
L	9	!3!3A!6!66~~~~~~   mmoo**',,q//0II	Li')V)V)V)VWWW 
L

SSSSS   
L1	)Q.nnnnn   BLLOOv%]]]]]	
 	
 	
 	W\\!__$)`````	
 	
 	

 #''**i7;FM	L-UUUUU   
L55666	L""$A$ABBB 
L&(M(MNNN	L##%%'P'P  
 
L = =>>>	L$$&&(I(IJJJ	L-/U/UVVV 
L;;<<<	L##%%'G'GHHH	Lu},.S.STTT 
L"A"ABBB	L&&((*M*MNNN	L%-/1QRRR 
L"VVVVV   !L0OC	L//mmmmm  
 ;'u}QXNNNNr   inputinput_scaleweightweight_scalebias	out_dtypec                 ^    t           j        j        j                            | |||||          S )a  
    CUTLASS-based row-wise scaled W4A8 linear operator.
    Args:
        input: quantized input tensor, in row-major layout.
        input_scale: scale factors for input tensor, has to be tensor of the same shape as the input tensor, minus the last dimension.
        weight: quantized weight matrix, in row-major layout.
        weight_scale: scale factors for weight tensor, one value per row of weight matrix (thus also tensor of the same shape as the weight tensor, minus the last dimension).
        bias: an optional vector of size equal to number of rows of weight tensor, or None.
        out_dtype: optional data type for output tensor.
    Returns:
        output: result tensor, in row-major layout.
    )r   r3   r   "rowwise_scaled_linear_cutlass_s8s4r5   r  r  r  r  r  r  s         r   r  r    s6    * 9?GG  r   z+torchao::rowwise_scaled_linear_cutlass_s8s4c                     ||n|j         }| j        }t          j        g | j        d d         |j        d         R ||          S Nr   r   r   r?   r   r   r   rN   r  r  r  r  r  r  r?   r   s           r   rS   rS     S     #.IIK4EE\F;;SbS);6<?;;5QWXXXXr   c                 ^    t           j        j        j                            | |||||          S )a  
    CUTLASS-based row-wise scaled W4A4 linear operator.
    Args:
        input: quantized input tensor, in row-major layout.
        input_scale: scale factors for input tensor, has to be tensor of the same shape as the input tensor, minus the last dimension.
        weight: quantized weight matrix, in row-major layout.
        weight_scale: scale factors for weight tensor, one value per row of weight matrix (thus also tensor of the same shape as the weight tensor, minus the last dimension).
        bias: an optional vector of size equal to number of rows of weight tensor, or None.
        out_dtype: optional data type for output tensor.
    Returns:
        output: result tensor, in row-major layout.
    )r   r3   r   "rowwise_scaled_linear_cutlass_s4s4r5   r  s         r   r$  r$    s1    * 9?GG{FL$	  r   z+torchao::rowwise_scaled_linear_cutlass_s4s4c                     ||n|j         }| j        }t          j        g | j        d d         |j        d         R ||          S r  r   r!  s           r   rS   rS     r"  r   weight_metac           	      `    t           j        j        j                            | ||||||          S )a.  
    CUTLASS-based row-wise scaled F8F8 linear operator, for sparsified weight case.
    Args:
        input: quantized input tensor, in row-major layout.
        input_scale: scale factors for input tensor, has to be tensor of the same shape as the input tensor, minus the last dimension.
        weight: sparsified quantized weight matrix, in row-major layout.
        weight_meta: sparsify metadata for weight tensor.
        weight_scale: scale factors for weight tensor, one value per row of weight matrix (thus also tensor of the same shape as the weight tensor, minus the last dimension).
        bias: an optional vector of size equal to number of rows of weight tensor, or None.
        out_dtype: optional data type for output tensor.
    Returns:
        output: result tensor, in row-major layout.
    )r   r3   r   )rowwise_scaled_linear_sparse_cutlass_f8f8r5   )r  r  r  r&  r  r  r  s          r   r(  r(    s3    . 9FNN{FKtY  r   z2torchao::rowwise_scaled_linear_sparse_cutlass_f8f8c                     ||n|j         }| j        }t          j        g | j        d d         |j        d         R ||          S r  r   )	r  r  r  r&  r  r  r  r?   r   s	            r   rS   rS     sS     #.IIK4EE\F;;SbS);6<?;;5QWXXXXr   c                 T    t           j        j        j                            |           S )a  
    CUTLASS-based conversion from sparsified input tensor to corresponding compressed tensor, along with corresponding metadata tensor.
    Args:
        weight: input tensor, in row-major layout.
    Returns:
        weight_compressed: compressed weight tensor, with sparsity eliminated, in row-major layout.
        weight_meta: metadata tensor, describing the sparsity structure of the input tensor, also in row-major layout.
    )r   r3   r   )to_sparse_semi_structured_cutlass_sm9x_f8r5   r  s    r   r+  r+  ,  s      9FNNvVVVr   z2torchao::to_sparse_semi_structured_cutlass_sm9x_f8c                     |                      | d         | d         dz            |                      | d         t          | d         dz  d          t          j                  fS )Nr   r&   r7   rG   r   r>   )rO   maxr   charr,  s    r   rS   rS   ;  s]     	F1IN33Cq	Q$;$;5:NN r   input_tensormetadata_format
activation	algorithmc                 V    t           j        j                            | |||||          S )N)r?   r]   )r   r3   r   sparse24_sm90_sparsify)r0  r1  r2  r3  r?   r]   s         r   r5  r5  H  s4     933oz9EQV 4   r   ztorchao::sparse24_sm90_sparsifyc                    ||n| j         }t          j        | j        d         | j        d         dz  f|| j                  t          j        | j        d         | j        d         dz  ft          j        | j                  fS )Nr   r&   r7   r   rG   )r?   r   r   rN   r   rM   )r0  r1  r2  r3  r?   r]   r  s          r   rS   rS   U  s     *0BI"L$6q$9Q$>?&	
 	
 	

 	"L$6q$9Q$>?+&	
 	
 	
 r   rG   nr   abb_scaleswizzle_sizeswizzle_axissm_countc           
      Z    t           j        j                            | |||||||          S )N)rd   r:  r;  r<  r=  )r   r3   r   sparse24_fp8_sm90_cutlass_gemmr8  r   r9  rd   r:  r;  r<  r=  s           r   r?  r?  m  s>     9;;		!! < 	 	 	r   z'torchao::sparse24_fp8_sm90_cutlass_gemmc                 ~    t          j        | j        d         |j        d         ft           j        | j                  S )Nr   r&   r   )r   r   rN   rL   r   r@  s           r   rS   rS     s0     ;
AGAJ/u~ahWWWWr   mat1mat2mat1_is_swizzledmat2_is_swizzledc                 Z    t           j        j        j                            | |||          S zP
    Similar to torch.mm but Tensor inputs can be SwizzleTensor instances.

    )r   r3   r   
swizzle_mmr5   rB  rC  rD  rE  s       r   rH  rH    s/     9'//d$&6  r   ztorchao::swizzle_mmc                 Z    |                      | j        d         |j        d                   S Nr   r&   rO   rN   rI  s       r   rS   rS     s$     >>$*Q-A777r   scale_ascale_bscale_resultc	                 d    t           j        j        j                            | ||||||||	  	        S rG  )r   r3   r   swizzle_scaled_mmr5   	rB  rC  rD  rE  rM  rN  r  rO  r  s	            r   rQ  rQ    s?     9.66
 
 
r   ztorchao::swizzle_scaled_mmc	                 Z    |                      | j        d         |j        d                   S rK  rL  rR  s	            r   rS   rS     s$     >>$*Q-A777r   c                  v    t          t          d          rt          j        t          j        fS t          j        fS )zGTODO: when e8m0 is hardened and major release lets remove uint8 supportfloat8_e8m0fnu)hasattrr   rM   rU  r   r   r   _get_dtypesrW    s1     u&'' 3U122K>r   c                      t                      }t          j         j        |v  fd           t          j        j        |v fd           d S )Nc                      d j          S )Nz4A_scale tensor must be uint8 or float8_e8m0fnu, got r>   )A_scales   r   r1   z%_check_scale_dtypes.<locals>.<lambda>      Vw}VV r   c                      d j          S )Nz4B_scale tensor must be uint8 or float8_e8m0fnu, got r>   )B_scales   r   r1   z%_check_scale_dtypes.<locals>.<lambda>  r[  r   )rW  r   r2   r?   )rZ  r]  allowed_dtypess   `` r   _check_scale_dtypesr_    sm     ]]N	L'VVVV   
L'VVVV    r   ztorchao::mx_fp8_bf16ABrZ  r]  c                     t          j        |                     d          |                    d          ft           j        | j                  S )zMeta impl for mx_fp8_bf16r   r&   r   r   r   r   rL   r   r`  ra  rZ  r]  s       r   meta_mx_fp8_bf16re    s8     ;q		166!99-U^AHUUUUr   c                 z    t          ||           t          j        j        j                            | |||          S )a=  Defines a matmul between two fp4 tensors w/ MX scales in E8MO and returns a bf16 tensor.

    The expected format is fp4_e2m1 specified:
    https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final.pdf (Section 5.3.3)

    Note: The mx scales are E8MO tensors stored in uint8 tensors (for now).
        The layout of the scales is very particular, see:
        https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout


    Args:
        A: fp4 tensor (2 fp4 elements are packed into 1 byte -> elem0|elem1)
        B: fp4 tensor (2 fp4 elements are packed into 1 byte -> elem0|elem1)
        A_scale: E8M0 scale tensor for A with groupsize=32 in swizzled layout
        B_scale: E8M0 scale tensor for B with groupsize=32 in swizzled layout

    Returns:
        MXN bf16 Tensor

    )r_  r   r3   r   mx_fp4_bf16r5   rd  s       r   rg  rg    s6    * )))9(00AwHHHr   ztorchao::mx_fp4_bf16c                     t          j        |                     d          |                    d          ft           j        | j                  S )zMeta impl for mx_fp4_bf16r   r&   r   rc  rd  s       r   meta_mx_fp4_bf16ri    s8     ;q		166!99-U^AHUUUUr   scalesqzerosc                 X    t           j        j        j                            | ||          S )z
    Prepack weights for DA8W4 linear operator on CPU.
    Args:
        weight: weight tensor.
        scales: scales for weight tensor.
        qzeros: zero points for weight tensor.
    Returns:
        packed weight, scales, and zero points.
    )r   r3   r   da8w4_linear_prepack_cpur5   r  rj  rk  s      r   rm  rm  
  s$     95==fffUUUr   z!torchao::da8w4_linear_prepack_cpuc                 0    | ||t          j                    fS r
   )r   r   rn  s      r   rS   rS     s    665<>>11r   input_scalesinput_qzerosweight_scalesweight_qzeroscompensationc	                 d    t           j        j        j                            | ||||||||	  	        S )a  
    DA8W4 linear operator on CPU.
    Args:
        input: input tensor.
        input_scales: scales for input tensor.
        input_qzeros: zero points for input tensor.
        weight: weight tensor.
        weight_scales: scales for weight tensor.
        weight_qzeros: zero points for weight tensor.
        compensation: compensation tensor for weight.
        bias: optional bias tensor.
        out_dtype: output data type.
    Returns:
        output tensor in out_dtype.
    )r   r3   r   da8w4_linear_cpur5   )	r  rp  rq  r  rr  rs  rt  r  r  s	            r   rv  rv     s?    4 9-55
 
 
r   ztorchao::da8w4_linear_cpuc	                     |                                 dk    sJ |                    d          |                    d          z  dz  }	 | j        g | j        d d         |	R d|iS )Nrr   r   r~   r7   r   r?   r;   r   rO   rN   )
r  rp  rq  r  rr  rs  rt  r  r  r   s
             r   rS   rS   G  sp     ::<<1AQ'!+A5?AEK,AaAAAyAAAr   ztorchao::_scaled_embedding_bagqweightindicesoffsetsw_scalesmodeinclude_last_offsetc                 x    |dk    sJ |j         d         dz
  }|                     || j         d         |          S )NTr   r&   r>   )rN   rO   )	ry  rz  r{  r|  rf   r}  r~  r  
batch_sizes	            r   rS   rS   X  sH     $&&&&q!A%JZq)9KKKr   c                 V    t           j        j        j                            | |          S )z
    Prepack weights for float8 linear operator on CPU.
    Args:
        weight: weight tensor.
        scales: scales for weight tensor.
    Returns:
        packed weight, packed scales
    )r   r3   r   float8_linear_prepack_cpur5   r  rj  s     r   r  r  i  s"     96>>vvNNNr   z"torchao::float8_linear_prepack_cpuc                 
    | |fS r
   r   r  s     r   rS   rS   x  s    6>r   c                 ^    t           j        j        j                            | |||||          S )aH  
    float8 linear operator on CPU.
    Args:
        input: input tensor.
        input_scales: scales for input tensor.
        weight: weight tensor.
        weight_scales: scales for weight tensor.
        bias: optional bias tensor.
        out_dtype: output data type.
    Returns:
        output tensor in out_dtype.
    )r   r3   r   float8_linear_cpur5   )r  rp  r  rr  r  r  s         r   r  r  }  s6    ( 9.66  r   ztorchao::float8_linear_cpuc                     |                                 dv sJ |                                 dk    r+|                    d          |                    d          z  n|                    d          } | j        g | j        d d         |R d|iS )N)r7   rr   rr   r   r~   r   r?   rx  )r  rp  r  rr  r  r  r   s          r   rS   rS     s     ::<<6!!!!+1::<<1+<+<AQ''&++a..A5?AEK,AaAAAyAAAr   )r&   )NrU   FNrV   r   rV   r   rV   r   rV   r   rV   r   )NN)NNrG   r7  r   )-	functoolstypingr   r   r   r   Librarylibdefine_CTagneeds_fixed_stride_orderr   r   	lru_cacher%   intr4   rS   floatboolri   rp   r   r   r   r?   r  r$  r(  r+  strr5  r?  rH  rQ  rW  r_  re  rg  ri  rm  rv  r  r  r   r   r   <module>r     s                   mIz22 

{   

S   

 A   

 V   

 a   

 h   

 h   

 C   

R   

 i   

 B   

b   

 T  
 

O
(,
/	0     

O
(,
/	0     

 }   

o   

 P   

 w   

Q   

 Q  
           	
      B /00 &) &)&)&) &) 	&)
 &) &) &) &) &) 10&)Z #'!#< <<	< < 	<
 < < E?< < < < < < < < <  !<" #<$ %< < < <~ 233
 #'!# 	  	
   E?          !" #$ %   43,f S V    & >??J Js Jv J J J @?J0(.<?PS   : BCC0M0M(.0M<?0MPS0M0M 0M 0M DC0Mf  	
         > -..mOmOmO mO 	mO
 mO mO mO mO mO mO mO mO /.mO`  	
         B .//iOiOiO iO 	iO
 iO iO iO iO iO iO iO iO 0/iOb "'+   	
 6
 $    > ABB "'+Y YYY Y 	Y
 6
Y $Y Y Y Y CBY* "'+   	
 6
 $    4 ABB "'+Y YYY Y 	Y
 6
Y $Y Y Y Y CBY, "'+   	
  6
 $    8 HII "'+Y YYY Y 	Y
 Y 6
Y $Y Y Y Y JIY"WWfW W W W HII		f	 	 	 JI	" 

 


 
 	
 f
 
 
 
 566 
   	   766 !% $ 
  f	
 f       , =>>
 !% $
X 
X
X

X 
X f	
X
 f
X 
X 
X 
X 
X 
X ?>
X	
		26	JN		 	 	 	 )**8
88268JN88 8 8 +*8

  	
   6
 6" $    8 0118
8
8 8 	8
 8 8 6
8 6"8 $8 8 8 8 218   
 
 
 *++V V6 VF VV V V V ,+V
I6 If Iv I I I I I2 *++V V6 VF VV V V V ,+VVVV V 	V V V V" 7882f 2f 2f 2 2 2 2 982$$$ $ 	$
 $ $ $ 6
$ {$ $ $ $N /00BBB B 	B
 B B B 6
B {B B B B 10B  455LLL L 	L
 L L L {L L L L 65L OOO O O O O 899f f     :9  	
 6
 {   < 011
B
B
B 
B 	
B
 6

B {
B 
B 
B 
B 21
B 
B 
Br   