
    )`i                     >   d dl Z d dlmZ d dlmZmZ d dlZddlmZ ddl	m
Z
 ddlmZmZmZ dd	Ze j        d
             Ze	 	 	 ddej        dededee         deej        ej        f         f
d            Ze	 ddej        dej        dedej        fd            ZdS )    N)SimpleNamespace)OptionalTuple   )flashinfer_api)#gen_mxfp8_quantization_sm100_module)device_support_pdlregister_custom_opregister_fake_op   c                 >    | |z   dz
  |z  |z  }|dz   dz  dz  }||z  S )Nr          )	total_rowtotal_columnrow_size
padded_rowpadded_columns        o/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/fp8_quantization.py _compute_swizzled_layout_sf_sizer      s:    h&*x7(BJ!A%!+a/M%%    c                     t                                                      t          dd          	 	 	 ddt          j        dt
          dt          d	t          t
                   d
t          t          j        t          j        f         f
fd            } t          d          	 	 ddt          j        dt
          dt          d
t          t          j        t          j        f         fd            }t          dd          	 ddt          j        dt          j        dt
          d
t          j        ffd            }t          d          	 ddt          j        dt          j        dt
          d
t          j        fd            }t          | |          S )Nz flashinfer::mxfp8_quantize_sm100 )mutates_argsT    inputis_sf_swizzled_layout	alignment
enable_pdlreturnc                 f   | j         j        dk    rt          j        | j        t          j        | j                   }|r+t          | j        d         | j        d         dz  d          }n|                                 dz  }t          j        |ft          j        | j                   }
                    | |||           ||fS |t          | j                   }|                                 | j        d         z  }| j        d         }||z   dz
  |z  |z  }	t          j        g | j        dd         |	R t          j
        | j                   }|rt          ||	dz  d          }n||	z  dz  }t          j        |ft          j        | j                   }
                    | |||||           ||fS )	aK  Quantize input tensor to MxFP8 format.

        Args:
            input (torch.Tensor): Input tensor of shape [M, K] with dtype fp16/bf16/fp8_quantized.
            is_sf_swizzled_layout (bool, optional): Whether to use swizzled layout for scale factors. Defaults to True.
            alignment (int, optional): sfVecSize. Defaults to 32. Note that alignment is not used in the host kernel.
            enable_pdl (Optional[bool], optional): Whether to enable PDL (Programmatic Dependent Launch).
                If None, automatically detects based on device capability. Defaults to None.
        Returns:
            Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
                - Quantized tensor of shape [M, K] with dtype FLOAT8_E4M3
                - Scale factors tensor with shape determined by layout and sf_vec_size
        cpudtypedevicer   r   r   r   N)r&   typetorchemptyshapeuint8r   numelmxfp8_quantize_hostr	   float8_e4m3fnmxfp8_quantize)r   r   r   r    out_valout_sf_sizeout_sfmkpadded_kmodules             r   mxfp8_quantize_sm100zAget_mxfp8_quantization_sm100_module.<locals>.mxfp8_quantize_sm100   s   . <%%k%+U[VVVG$ 2>KNEKNb$8#  $kkmmr1[+u{5<XXXF&&%	   F?"!/==
R0ABAI)i7)CHk-%+crc"-H--)|  G
 % 1>q(b.RUVV(lb0[+u{5<XXXF!!%   F?"r   c                     | j         \  }}|                     ||gt          j                  |                     ||z  dz  gt          j                  fS )Nr%   r   )r+   	new_emptyr)   int64int32)r   r   r   r4   r5   s        r   _fake_mxfp8_quantize_sm100zGget_mxfp8_quantization_sm100_module.<locals>._fake_mxfp8_quantize_sm100[   sQ     {1OOQF%+O66OOQUb[MO==
 	
r   z'flashinfer::mxfp8_dequantize_host_sm100)r   scale_tensorc                     t          j        | j        t           j        | j                  }                    | |||           |S )a  Dequantize input tensor from MxFP8 format.

        Args:
            input (torch.Tensor): Input tensor of shape [M, K] with dtype FLOAT8_E4M3.
            scale_tensor (torch.Tensor): Scale factors tensor with shape determined by layout and sf_vec_size.
            is_sf_swizzled_layout (bool, optional): Whether to use swizzled layout for scale factors. Defaults to True.

        Returns:
            torch.Tensor: Dequantized float tensor of shape [M, K] with dtype float32.
        r$   )r)   r*   r+   float32r&   mxfp8_dequantize_host)r   r?   r   outr7   s       r   mxfp8_dequantize_host_sm100zHget_mxfp8_quantization_sm100_module.<locals>.mxfp8_dequantize_host_sm100g   sL    & k%+U]5<PPP$$!		
 	
 	
 
r   c                 t    |                      | j        d         | j        d         gt          j                  S )Nr   r   r:   )r;   r+   r)   rA   r   r?   r   s      r   !_fake_mxfp8_dequantize_host_sm100zNget_mxfp8_quantization_sm100_module.<locals>._fake_mxfp8_dequantize_host_sm100   s-     AA?u}UUUr   )r8   rD   Tr   N)Tr   T)r   build_and_loadr
   r)   Tensorboolintr   r   r   r   )r8   r>   rD   rG   r7   s       @r   #get_mxfp8_quantization_sm100_modulerN      s   022AACCF*   '+%)	;# ;#|;##;# ;# TN	;#
 
u|U\)	*;# ;# ;# ;# ;#	 ;#z 899 '+	
 	
|	
#	
 	
 
u|U\)	*		
 	
 	
 :9	
 1   '+ |l  $ 
	    	 0 ?@@ '+V V|VlV  $V 
	V V V A@V 1$?   r   Tr   r   r   r   r    r!   c                     d}| j         d         |z  dk    sJ |t          | j                  }t                                          | |||          \  }}||fS )a  Quantize input tensor to MxFP8 format.

    This function implements MxFP8 quantization that converts input tensors to a compressed MxFP8 format
    with associated scale factors. It supports various input data types and scale factor layouts.

    Args:
        input (torch.Tensor): Input tensor of shape [M, K] with dtype fp16/bf16/fp8_quantized.
        is_sf_swizzled_layout (bool, optional): Whether to use swizzled layout for scale factors. Defaults to True.
        alignment (int, optional): sfVecSize. Defaults to 32.
        enable_pdl (Optional[bool], optional): Whether to enable PDL (Programmatic Dependent Launch).
            If None, automatically detects based on device capability. Defaults to None.
    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - Quantized tensor of shape [M, K] with dtype FLOAT8_E4M3
            - Scale factors tensor with shape determined by layout and sf_vec_size
    r   r'   r   )r+   r	   r&   rN   r8   )r   r   r   r    sf_vec_sizex_qsfs          r   r0   r0      sp    . K;r?[(A----'55
133HH	 GC 7Nr   r?   c                 H    t                                          | ||          S )av  Dequantize input tensor from MxFP8 format.

    This function performs dequantization by converting a packed FP8 tensor in MxFP8 format
    back to float values using the associated scale factors.

    Args:
        input (torch.Tensor): Packed FP8 tensor in MxFP8 format of shape [M, K] with dtype FLOAT8_E4M3.
        scale_tensor (torch.Tensor): Scale factors tensor with shape determined by layout and sf_vec_size.
        is_sf_swizzled_layout (bool, optional): Whether scale factors use swizzled layout. Defaults to True.

    Returns:
        torch.Tensor: Dequantized float tensor of shape [M, K] with dtype float32.

    )rN   rD   rF   s      r   rB   rB      s+    * /00LL  r   )r   rH   rI   )	functoolstypesr   typingr   r   r)   api_loggingr   jit.fp8_quantizationr   utilsr	   r
   r   r   cacherN   rK   rL   rM   r0   rB   r   r   r   <module>r[      s       ! ! ! ! ! ! " " " " " " " "  ' ' ' ' ' ' E E E E E E         & & & & x x xv  #'!%	! !<!! ! 	!
 5<%&! ! ! !H  #' <,   \	     r   