
    Pi0                     j   d dl Z d dlmZ d dlZd dlmZ  e            rd dlZd dlmZ	 d dlm
Z
 d dD             Z ej        eg d          ej        d	e	j        d
e	j        de	j        de	j        de	j        de	j        fd                        Zej                            dd          	 d)dej        dej        dej        dej        dedej        fd            Zej        d)d            Zej        de	j        fd            Zdej        fdej        dedej        deej        ej        f         fd Zej        de	j        fd!            Zdej        fdej        dedeej        ej        f         fd"Zej        de	j        fd#            Z	 d)dej        d$ej        dedej        fd%ZdS 	 d)dej        dej        dej        dej        dedej        fd&Zdej        fdej        dedej        deej        ej        f         fd'Zdej        fdej        dedeej        ej        f         fd(ZdS )*    N)Tuple)
has_tritonConfigc           	      L    g | ]!}d D ]}dD ]}t          ||d|d          "S ))    @      )            )BLOCK_SIZE_MBLOCK_SIZE_N   )
num_stages	num_warpsr   ).0block_mblock_nr   s       y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/kernel/blockwise_quantization.py
<listcomp>r      sx     	 	 	 $	 	 &	 	  	$g>>!	
 	
 	
	 	 	 	 	    )   r   r	   r
   )NKM_BUCKETBLOCK_SIZE_K)configskeyr   r   r   r   r   r   c                    t          j        d          }t          j        d          }t          j        ||          }||	z  t          j        d|	          z   |z  }||
z  t          j        d|
          z   |z  }t          j        d|          }| |d d d f         |z  z   |d d d f         z   }||d d d f         |z  z   |d d d f         z   }|||z  z   }|||z  |z  z   }t          j        |	|
ft           j                  }t          |          D ]}t          j        ||d d d f         |||z  z
  k     d          }t          j        ||d d d f         |||z  z
  k     d          }t          j        |          }t          j        |          }|t          j        ||          |d d d f         z  |d d d f         z  z  }||z  }||z  }|dz  }|dz  }|	                    |j
        j                  }||	z  t          j        d|	          z   }||
z  t          j        d|
          z   }||d d d f         |z  z   |d d d f         z   }|d d d f         |k     |d d d f         |k     z  }t          j        |||           d S )Nr   axis   dtypeg        )maskotherr'   )tl
program_idcdivarangezerosfloat32rangeloaddottor&   
element_tystore)a_ptrb_ptrc_ptra_s_ptrb_s_ptrMr   r   r   r   r   r   pid_mpid_nkoffs_moffs_noffs_ka_ptrsb_ptrsa_s_ptrsb_s_ptrsaccumulatoriaba_sb_scc_ptrsr'   s                                  r   blockwise_fp8_gemm_kernelrN      s   $ 1%%%1%%%GA|$$,&1l)C)CCqH,&1l)C)CCqH1l++41,,vdAAAg>aaa1,,vaaag>VaZ'f499hl;2:NNNq 		 		AVD!!!G_q1|;K7K%KSVWWWAVAAAtG_q1|;K7K%KSVWWWA'(##C'(##C26!Q<<#aaag,6T111WEEKl"Fl"FMHMHHNN5;122%	!\(B(BB%	!\(B(BB41,,vdAAAg>qqq$w!#tQQQw!(;<
&&&&&&r   zao::blockwise_fp8_gemm )mutates_argsr
   rH   rJ   rI   rK   
block_sizereturnc                 >  	
 |                                  sJ |                                 sJ |                                 sJ |                                 sJ |                     d          }|                                 |z  	|                    d          
t          j        t          j        	                    } | j        g |                                 d d         
R dt          j        i}	
fd}t          |         | ||||	
|||
  
         |S )Nr   r&   c                 p    t          j        | d                   t          j        | d                   fS )Nr   r   tritonr,   )METAr;   r   s    r   <lambda>z$blockwise_fp8_gemm.<locals>.<lambda>`   s2    K4/00K4/00
 r   )r   )
is_contiguoussizenumelmathceillog2	new_emptytorchbfloat16rN   )rH   rJ   rI   rK   rQ   r   r   rL   gridr;   r   s            @@r   blockwise_fp8_gemmrd   O   s3                """""  """""FF2JJGGIINFF1II9TYq\\**AK@#2#@@@@@@
 
 
 
 
 	"$'q!S#q!Qz	
 	
 	
 	
 r   c                     |                     d          } | j        g |                                  d d         |R dt          j        i}|S )Nr   rT   r&   )r[   r`   ra   rb   )rH   rJ   rI   rK   rQ   r   rL   s          r   _rf   i   sL    FF1IIAK@#2#@@@@@@r   
BLOCK_SIZEc                    t          j        d          }||z  t          j        d|          z   }t          j        | |z                                 t           j                  }t          j        t          j        |                    dz  }||z  }|                    |j        j	                  }t          j
        ||z   |           t          j
        ||z   |           dS )a!  
        Quantizes the input tensor `x_ptr` and stores the result in `y_ptr` and the scaling factor in `s_ptr`.

        Args:
            x_ptr (triton.Pointer): Pointer to the input tensor.
            y_ptr (triton.Pointer): Pointer to the output tensor where quantized values will be stored.
            s_ptr (triton.Pointer): Pointer to the output tensor where scaling factors will be stored.
            BLOCK_SIZE (tl.constexpr): The size of the block to be processed by each program instance.

        Returns:
            None
        r   r"         |@N)r*   r+   r-   r1   r3   r/   maxabsr&   r4   r5   )	x_ptry_ptrs_ptrrg   pidoffsxsys	            r   fp8_blockwise_act_quant_kernelrt   o   s     m###Z")Az":"::GEDL!!$$RZ00F26!99%EDD'((
q!!!
a     r   rq   r&   c                                                       s
J d                                 d          |z  dk    sJ d| d            |t          j        t          j        fv s
J d            t          j         |          }  j        g                                  dd                              d          |z  R d	t          j        i} fd
}t          |          |||           ||fS )a  
        Quantizes the input tensor `x` using block-wise quantization with block size being BLOCK_SIZEx1.

        Args:
            x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
            block_size (int, optional): The size of the blocks to be used for quantization. Default is 128.
            dtype (torch.dtype, optional): The dtype to use for the quantized tensor. Default is `torch.float8_e4m3fn`.


        Returns:
            Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
                - The quantized tensor with dtype `dtype`.
                - A tensor of scaling factors with dtype `torch.float32`.
        Input tensor must be contiguousrT   r   z@Last dimension size must be divisible by block_size (block_size=)6dtype must be torch.float8_e4m3fn or torch.float8_e5m2r%   Nr&   c                 `    t          j                                        | d                   fS Nrg   )rW   r,   r\   )metarq   s    r   rY   z)fp8_blockwise_act_quant.<locals>.<lambda>   s#    V[D4FGGI r   rg   )	rZ   r[   ra   float8_e4m3fnfloat8_e5m2
empty_liker`   r/   rt   )rq   rQ   r&   rs   rr   rc   s   `     r   fp8_blockwise_act_quantr      s#   "   CC"CCC vvbzzJ&!+++\z\\\ ,++ 
 
 
 
 D
 
 
 Qe,,,AKV#2#Vr

j(@VVVVVIIII&t,Q1LLLL!tr   c                    t          j        d          }t          j        d          }t          j        ||          }||z  t          j        d|          z   }	||z  t          j        d|          z   }
|	dddf         |z  |
dddf         z   }|	dddf         |k     |
dddf         |k     z  }t          j        | |z   |                              t           j                  }t          j        t          j        |                    dz  }||z  }|                    |j	        j
                  }t          j        ||z   ||           t          j        |||z  z   |z   |           dS )aj  
        Quantizes the input tensor `x_ptr` and stores the result in `y_ptr` and the scaling factors in `s_ptr`.

        Args:
            x_ptr (tl.pointer): Pointer to the input tensor.
            y_ptr (tl.pointer): Pointer to the output tensor where quantized values will be stored.
            s_ptr (tl.pointer): Pointer to the output tensor where scaling factors will be stored.
            M (int): Number of rows in the weight matrix.
            N (int): Number of columns in the weight matrix.
            BLOCK_SIZE (tl.constexpr): The size of the block to be processed by each program instance.
        r   r"   r$   Nr)   ri   )r*   r+   r,   r-   r1   r3   r/   rj   rk   r&   r4   r5   )rl   rm   rn   r;   r   rg   r<   r=   nr?   r@   rp   r'   rq   rr   rs   s                   r   !fp8_blockwise_weight_quant_kernelr      sf    1%%%1%%%GAz""#bi:&>&>>#bi:&>&>>aaag"VD!!!G_4qqq$w!#tQQQw!(;<GEDLt,,,//
;;F26!99%EDD'((
qt,,,,
"U*A.....r   c                 N   |                                  s
J d            |                                 dk    s
J d            |                     d          |z  dk    r|                     d          |z  dk    sJ d| d            |t          j        t          j        fv s
J d            |                                 \  t          j        | |	          }|                     |z  |z  t          j        	          }fd
}t          |         | |||           ||fS )a  
        Quantizes the given weight tensor using block-wise quantization with block size being BLOCK_SIZExBLOCK_SIZE.

        Args:
            x (torch.Tensor): The weight tensor to be quantized.
            block_size (int, optional): The block size to use for quantization. Defaults to 128.
            dtype (torch.dtype, optional): The dtype to use for the quantized tensor. Defaults to `torch.float8_e4m3fn`.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
                - The quantized weight tensor with dtype `dtype`.
                - A tensor of scaling factors with dtype `torch.float32`.
        rv      z#Input tensor must have 2 dimensionsr   r$   zABoth dimensions of x must be divisible by block_size (block_size=rw   rx   r%   c                 p    t          j        | d                   t          j        | d                   fS rz   rV   r{   r;   r   s    r   rY   z,fp8_blockwise_weight_quant.<locals>.<lambda>   2    K4-..K4-..
 r   r|   )
rZ   dimr[   ra   r}   r~   r   r`   r/   r   )rq   rQ   r&   rs   rr   rc   r;   r   s         @@r   fp8_blockwise_weight_quantr      s[       CC"CCC uuww!|||B|||vvayy:%**qvvayy:/E/J/J/J]PZ]]] 0K/JJ 
 
 
 
 D
 
 
 vvxx1Qe,,,KKZjKNN
 
 
 
 
 	*$/1aA*UUUU!tr   c                 N   t          j        d          }t          j        d          }t          j        ||          }||z  t          j        d|          z   }	||z  t          j        d|          z   }
|	dddf         |z  |
dddf         z   }|	dddf         |k     |
dddf         |k     z  }t          j        | |z   |                              t           j                  }t          j        |||z  z   |z             }||z  }t          j        ||z   ||           dS )a%  
        Dequantizes weights using the provided scaling factors and stores the result.

        Args:
            x_ptr (tl.pointer): Pointer to the quantized weights.
            s_ptr (tl.pointer): Pointer to the scaling factors.
            y_ptr (tl.pointer): Pointer to the output buffer for dequantized weights.
            M (int): Number of rows in the weight matrix.
            N (int): Number of columns in the weight matrix.
            BLOCK_SIZE (tl.constexpr): Size of the block for tiling.

        Returns:
            None
        r   r"   r$   Nr)   )r*   r+   r,   r-   r1   r3   r/   r5   )rl   rn   rm   r;   r   rg   r<   r=   r   r?   r@   rp   r'   rq   rr   rs   s                   r   #fp8_blockwise_weight_dequant_kernelr      s1   $ 1%%%1%%%GAz""#bi:&>&>>#bi:&>&>>aaag"VD!!!G_4qqq$w!#tQQQw!(;<GEDLt,,,//
;;GEEAI%-..E
qt,,,,,,r   rr   c                    |                                  r|                                 s
J d            |                                 dk    r|                                dk    s
J d            |                                 \  t          j        | t          j                              }fd}t          |         | |||           |S )a'  
        Dequantizes the given weight tensor using the provided scale tensor.

        Args:
            x (torch.Tensor): The quantized weight tensor of shape (M, N).
            s (torch.Tensor): The scale tensor of shape (M, N).
            block_size (int, optional): The block size to use for dequantization. Defaults to 128.

        Returns:
            torch.Tensor: The dequantized weight tensor of the same shape as `x`.

        Raises:
            AssertionError: If `x` or `s` are not contiguous or if their dimensions are not 2.
        z Input tensors must be contiguousr   z$Input tensors must have 2 dimensionsr%   c                 p    t          j        | d                   t          j        | d                   fS rz   rV   r   s    r   rY   z.fp8_blockwise_weight_dequant.<locals>.<lambda>  r   r   r|   )rZ   r   r[   ra   r   get_default_dtyper   )rq   rr   rQ   rs   rc   r;   r   s        @@r   fp8_blockwise_weight_dequantr     s    "    	
Q__%6%6 	
 	
.	
 	
6 uuww!||1.T,vvxx1Qe&=&?&?@@@
 
 
 
 
 	,D1!Q1aJWWWWr   c                      t          d          Nzunsupported without tritonAssertionError)rH   rJ   rI   rK   rQ   s        r   rd   rd   #  s     9:::r   c                      t          d          r   r   rq   rQ   r&   s      r   r   r   ,       9:::r   c                      t          d          r   r   r   s      r   r   r   1  r   r   )r
   )r]   typingr   ra   torch.utils._tritonr   rW   triton.languagelanguager*   r   fp8_gemm_configsautotunejit	constexprrN   library	custom_opTensorintrd   register_fakerf   rt   r}   r&   r   r   r   r   r   rO   r   r   <module>r      sY           * * * * * *:<< g;MMM      	 	 )	 	 	 V_ &L&L&L   Z*' <*' <*' ,*' l*' l*' l*' *' *' Z *'X ]5BGG  <\ < \	
  
   HG2 %   &%
 Z! ! ! ! Z!. ,/UEX <%(7<{	u|U\)	*   > Z//1|/ / / Z/: ,/e6I! !<!%(!	u|U\)	*! ! ! !F Z-/1|- - - Z-< =@ <!L69	     J ; ;<;\; <; \	;
 ; 
; ; ; ; ,/UEX; ;<;%(;7<{;	u|U\)	*; ; ; ; ,/e6I; ;<;%(;	u|U\)	*; ; ; ; ; ;r   