
    )`i                        d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	 ddl
Z
ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZmZ ddlmZ ddlmZmZmZmZmZm Z  dBdZ!	 dCde
j"        de#de#de#de
j"        f
dZ$defdZ%defdZ&defdZ'defdZ(defdZ)defdZ*dee+         de+defdZ,ej-        dDd e+fd!            Z.e	 	 	 	 	 	 dEd$e
j"        d%ee
j"                 de#d&e/d'e/d(e/d)ee/         de	e
j"        e
j"        f         fd*            Z0ede
j"        de
j"        fd+            Z1e1Z2e	 	 	 	 dFd,e
j"        d-e
j"        d.ee
j"                 de#d/e#d'e/de
j"        fd0            Z3ed1e
j"        d2e#de
j"        fd3            Z4e	 dCd1e
j"        d2e#d4e#fd5            Z5 G d6 d7e          Z6ee6j7        d"ddfd8            Z8ed9             Z9ed:             Z:e	 dGd<e
j"        d=e
j"        d>e#de
j"        fd?            Z;e	 dCd@            Z<edA             Z=dS )Ha3  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)Enum)SimpleNamespace)ListOptionalTuple   )flashinfer_api)JitSpec)env)gen_jit_specsm121a_nvcc_flagssm120a_nvcc_flagssm110a_nvcc_flagssm103a_nvcc_flagssm100a_nvcc_flagssm90a_nvcc_flags)is_cuda_version_at_least)device_support_pdl get_shuffle_matrix_a_row_indices#get_shuffle_matrix_sf_a_row_indicesregister_custom_opregister_fake_opget_compute_capability   c                 >    | |z   dz
  |z  |z  }|dz   dz  dz  }||z  S )Nr          )	total_rowtotal_columnrow_size
padded_rowpadded_columns        o/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/fp4_quantization.py _compute_swizzled_layout_sf_sizer%   /   s:    h&*x7(BJ!A%!+a/M%%       unswizzled_sfmnsf_vec_sizereturnc                     |dz  }|dz   dz
  dz  dz  }||z   dz
  |z  |z  }||z
  }||z
  |z  }|dk    r|dk    r| S t           j        j                            | d|d|fdd                                          S )ag  Pad scale factors tensor to meet alignment requirements.

    Args:
        unswizzled_sf (torch.Tensor): Input scale factors tensor with dtype uint8.
        m (int): M dimension.
        n (int): N dimension.
        sf_vec_size (int, optional): Scale factor vector size. Defaults to 16.

    Returns:
        torch.Tensor: Padded scale factors tensor.
    r   r   r   r   constant)modevalue)torchnn
functionalpad
contiguous)	r(   r)   r*   r+   factorr"   
padded_colpad_rowspad_colss	            r$   _pad_scale_factorsr:   5   s     1_Fs7Q;3&#-Jv:>f,6J A~HQ;.H1}}Qx"&&AxH5Ja ' 
 

*,,	r&   c                  ,    t          t          d          S )N100)gen_fp4_quantization_moduler   r   r&   r$   !gen_fp4_quantization_sm100_moduler>   R       &'8%@@@r&   c                  ,    t          t          d          S )N103)r=   r   r   r&   r$   !gen_fp4_quantization_sm103_modulerB   V   r?   r&   c                  ,    t          t          d          S )N90)r=   r   r   r&   r$    gen_fp4_quantization_sm90_modulerE   Z   s    &'7>>>r&   c                  ,    t          t          d          S )N110)r=   r   r   r&   r$   !gen_fp4_quantization_sm110_modulerH   ^   r?   r&   c                  ,    t          t          d          S )N120)r=   r   r   r&   r$   !gen_fp4_quantization_sm120_modulerK   b   r?   r&   c                  ,    t          t          d          S )N121)r=   r   r   r&   r$   !gen_fp4_quantization_sm121_modulerN   f   r?   r&   
nvcc_flagsdevice_archc                    t          d| t          j        dz  t          j        dz  t          j        dz  t          j        dz  t          j        dz  t          j        dz  t          j        dz  g| d	d
t          d          rdndgz   d	d
t          d          rdndgt          j        dz  t          j        dz  dz  g          S )Nfp4_quantization_z-nv_internal/tensorrt_llm/thop/fp4Quantize.cppz'nv_internal/tensorrt_llm/thop/fp4Op.cppz'nv_internal/cpp/kernels/quantization.cuz#nv_internal/cpp/common/envUtils.cppz!nv_internal/cpp/common/logger.cppz&nv_internal/cpp/common/stringUtils.cppz(nv_internal/cpp/common/tllmException.cppz-DENABLE_BF16z-DENABLE_FP8z12.8z-DENABLE_FP4 nv_internalinclude)extra_cuda_cflagsextra_cflagsextra_include_paths)r   jit_envFLASHINFER_CSRC_DIRr   )rO   rP   s     r$   r=   r=   j   s    )K))'=>'*SS'*SS'*OO'*MM'*RR'*TT		
 %6v>>FNNB

 6v>>FNNB
 '-7'-7)C
/   r&   r<   backendc                    t           t          t          t          t          t
          d}| |vrt          d|             ||                                                      t          dd          	 	 	 	 	 	 d4d	t          j
        d
t          t          j
                 dt          dt          dt          dt          dt          t                   dt          t          j
        t          j
        f         ffd            }t          d          	 	 	 	 d5d	t          j
        d
t          t          j
                 dt          dt          dt          dt          t          j
        t          j
        f         fd            }t          dd          	 d6dt          j
        dt          j
        dt          dt          j
        ffd            }t          d          	 d6dt          j
        dt          j
        dt          dt          j
        fd            }t          dd          dt          j
        dt          j
        ffd            }t          d          dt          j
        dt          j
        fd            }t          dd          	 	 	 d7d	t          j
        d
t          t          j
                 dt          dt          dt          t          j
        t          j
        f         f
fd             }t          d          	 	 	 d7d	t          j
        d
t          t          j
                 dt          dt          dt          t          j
        t          j
        f         f
d!            }	t          d"d          	 d8d	t          j
        d#t          j
        d
t          t          j
                 dt          t          j
        t          j
        f         ffd$            }
t          d"          	 d8d	t          j
        d#t          j
        d
t          t          j
                 dt          t          j
        t          j
        f         fd%            }t          d&d          d't          j
        d(t          j
        d#t          j
        dt          t          j
        t          j
        f         ffd)            }t          d&          d't          j
        d(t          j
        d#t          j
        dt          t          j
        t          j
        f         fd*            }t          d+d          	 	 	 	 d9d-t          j
        d.t          j
        d/t          t          j
                 dt          d0t          dt          dt          j
        ffd1            }t          d+          	 	 	 	 d9d-t          j
        d.t          j
        d/t          t          j
                 dt          d0t          dt          dt          j
        fd2            }t!          ||||||
|3          S ):N)rM   rJ   rG   rA   r<   rD   zInvalid backend: zflashinfer::fp4_quantize_sm100rS   )mutates_argsr'   FTinputglobal_scaler+   sf_use_ue8m0is_sf_swizzled_layoutis_sf_8x4_layout
enable_pdlr,   c                    |t          | j                  }t          j        g | j        dd         | j        d         dz  R t          j        | j                  }|                                 | j        d         z  }| j        d         }	|rt          ||	|z  |rdnd          }
n||	z  |z  }
t          j        |
ft          j        | j                  }                    | ||||||||	  	         ||fS )a~  Quantize input tensor to FP4 format.

        Args:
            input (torch.Tensor): Input tensor of shape [M, K] with dtype fp16/bf16/fp8_quantized.
            global_scale (torch.Tensor, optional): Global scale factor of shape [1] and dtype float32.
            sf_vec_size (int, optional): Scale factor vector size. Defaults to 16.
            sf_use_ue8m0 (bool, optional): Whether to use UE8M0 format for scale factors. Defaults to False.
            is_sf_swizzled_layout (bool, optional): Whether to use swizzled layout for scale factors. Defaults to True.
            is_sf_8x4_layout (bool, optional): Whether to use 8x4 layout or 128x4 layout for scale factors. Defaults to False.
            enable_pdl (Optional[bool], optional): Whether to enable PDL (Programmatic Dependent Launch).
                If None, automatically detects based on device capability. Defaults to None.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
                - Quantized tensor of shape [M, K/2] with dtype FLOAT4_E2M1X2
                - Scale factors tensor with shape determined by layout and sf_vec_size
        N   dtypedevice   r   )	r   ri   r1   emptyshapeuint8numelr%   fp4_quantize)r^   r_   r+   r`   ra   rb   rc   out_valr)   kout_sf_sizeout_sfmodules               r$   fp4_quantize_sm100z7get_fp4_quantization_module.<locals>.fp4_quantize_sm100   s   < +EL99J+5ek#2#5B1 455+<
 
 

 KKMMU[_,KO  	/:1#*:%CQQ KK a%;.Kk^5;u|TTT!
	
 
	
 
	
 r&   c                     | j         \  }}|                     ||dz  gt          j                  |                     ||z  |z  gt          j                  fS )Nrf   rh   )rl   	new_emptyr1   int64int32)r^   r_   r+   r`   ra   r)   rq   s          r$   _fake_fp4_quantize_sm100z=get_fp4_quantization_module.<locals>._fake_fp4_quantize_sm100   sW     {1OOQQKu{O;;OOQUk12%+OFF
 	
r&   z!flashinfer::mxfp4_dequantize_host    weightscale
group_sizec                     t          j        | j        d         | j        d         dz  ft           j        | j                  }                    | |||           |S )Nr   r   rf   rg   )r1   rk   rl   float32ri   mxfp4_dequantize_host)r}   r~   r   outrt   s       r$   r   z:get_fp4_quantization_module.<locals>.mxfp4_dequantize_host   sk     k\!_fl1o12-=
 
 

 	$$		
 	
 	
 
r&   c                 z    |                      | j        d         | j        d         dz  gt          j                  S Nr   r   rf   rw   rx   rl   r1   r   )r}   r~   r   s      r$   _fake_mxfp4_dequantize_hostz@get_fp4_quantization_module.<locals>._fake_mxfp4_dequantize_host   s>     \!_fl1o12%-   
 
 	
r&   z(flashinfer::block_scale_interleave_sm100)rS   r(   c                    |                                  dk    r| j        d         nd}t          | j        d         | j        d         d          }t          j        ||z  f| j        | j                  }                    | |           |S )a  Swizzle block scale tensor for FP4 format.

        Args:
            unswizzled_sf (torch.Tensor): unswizzled block scale tensor with dtype uint8 or bfloat16.

        Returns:
            torch.Tensor: output tensor for swizzled block scale with dtype uint8 or bfloat16.
        r   r   r   re   r   rg   )dimrl   r%   r1   rk   rh   ri   block_scale_interleave_sm100)r(   num_expertsexpert_out_sizer   rt   s       r$   r   zAget_fp4_quantization_module.<locals>.block_scale_interleave_sm100  s     1>0A0A0C0Cq0H0Hm)!,,a:#]%8%<c
 
 k?*,% '
 
 

 	++M3???
r&   c                 ~    |                      | j        d         | j        d         z  dz  gt          j                  S )Nr   r   r'   rw   )rx   rl   r1   rm   )r(   s    r$   "_fake_block_scale_interleave_sm100zGget_fp4_quantization_module.<locals>._fake_block_scale_interleave_sm100  sE     && #m&9!&<<BC5; ' 
 
 	
r&   z&flashinfer::fp4_batched_quantize_sm100c                 "   | j         \  }}}t          j        |||dz  ft          j        | j                  }t          j        |t          |||z  d          ft          j        | j                  }	                    | |||||           ||fS )u  Quantize a batched tensor to FP4 (E2M1x2) with per-block scale factors.

        This function converts a float/bfloat16 (or FP8-quantized) input tensor into a
        packed FP4 tensor using the E2M1 format (two 4-bit values per byte), along with
        per-block scale factors. Scale factors are encoded as UE4M3 by default, or UE8M0
        when requested, and an optional global scale can be applied.

        Args:
            input (torch.Tensor): Input tensor of shape [B, M, K] with dtype torch.float16,
                torch.bfloat16, or an FP8-quantized dtype supported by the kernel.
            global_scale (torch.Tensor, optional): Global scale factor of shape [1] and
                dtype float32.
            sf_vec_size (int, optional): Scale-factor vector size and alignment unit along K.
                Supported/expected values:
                - 16 (NVFP4 path; supported)
                - 32 (MXFP4 path; not supported yet)
                Defaults to 16.
            sf_use_ue8m0 (bool, optional): Scale-factor encoding type.
                False → UE4M3 (default), True → UE8M0.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]:
                - self_fp4 (torch.Tensor): Packed FP4 tensor in E2M1x2 format of shape
                [B, M, K // 2] with dtype torch.uint8 (two FP4 lanes per byte).
                - self_block_scale_factors (torch.Tensor): Block scale factors with dtype
                uint8 (UE4M3 or UE8M0), laid out as a flat buffer of shape
                [B, ceil(M / 128) * 128 * ceil(K / sf_vec_size / 4) * 4].

        Notes:
            - K must be even (because outputs pack two FP4 values per byte).
            - For best performance, K should be a multiple of sf_vec_size; the scale-factor
            buffer is aligned to sf_vec_size along K, pads M to multiples of 128, and
            rounds (K / sf_vec_size) up to a multiple of 4 for storage.
            - The batch dimension B is preserved for both outputs.
        rf   rg   r   )rl   r1   rk   rm   ri   r%   fp4_batched_quantize)
r^   r_   r+   r`   br)   rq   rp   rs   rt   s
            r$   fp4_batched_quantize_sm100z?get_fp4_quantization_module.<locals>.fp4_batched_quantize_sm100%  s    Z +1a+16N+<
 
 

 0A4DcJJK+<
 
 

 	##	
 	
 	
 r&   c           	          | j         \  }}}|                     |||dz  gt          j                  |                     |t	          |||z  d          gt          j                  fS )Nrf   rw   r   )rl   rx   r1   rm   r%   )r^   r_   r+   r`   r   r)   rq   s          r$    _fake_fp4_batched_quantize_sm100zEget_fp4_quantization_module.<locals>._fake_fp4_batched_quantize_sm100g  so     +1aOOQ16N%+O>>OO4Q[8H#NNOk   
 	
r&   z<flashinfer::silu_and_mul_scaled_nvfp4_experts_quantize_sm100maskc                    | j         }| j        \  }}}|dz  }d}||z  dk    sJ d| d            ||z  }	|	dz   dz  dz  }
|
dz  }|dz   d	z  d	z  }t          j        |||dz  |t          j        
          }t          j        ||||t          j        
          }                    |                    ||z  |dz            |                    ||z  |          |                     ||z  |          ||d           |                    ddd          }|                    t          j	                                      ||d	z  |
dz  ddd          }|                    dddddd          }||fS )a  Quantize a silu and matmul with masked batched tensor to FP4 (E2M1x2) with per-block scale factors.

        This function first does silu and matmul to a float/bfloat16 input tensor then convect the result
        into a packed FP4 tensor using the E2M1 format (two 4-bit values per byte), along with
        per-block scale factors. Scale factors are encoded as UE4M3 by default, or UE8M0
        when requested, and an optional global scale can be applied.

        Args:
            input (torch.Tensor): Input tensor of shape [B, M, K] with dtype torch.float16,
                torch.bfloat16, or an FP8-quantized dtype supported by the kernel.
            mask (torch.Tensor): mask tensor of shape [B] with dtype torch.int32.
            global_scale (torch.Tensor, optional): Global scale factor of shape [1] and
                dtype float32.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]:
                - self_fp4 (torch.Tensor): Packed FP4 tensor in E2M1x2 format of shape
                [B, M, K // 2] with dtype torch.uint8 (two FP4 lanes per byte).
                - self_block_scale_factors (torch.Tensor): Block scale factors with dtype
                uint8 (UE4M3 or UE8M0), laid out as a flat buffer of shape
                [B, ceil(M / 128) * 128 * ceil(K / sf_vec_size / 4) * 4].

        Notes:
            - K must be even (because outputs pack two FP4 values per byte).
            - For best performance, K should be a multiple of sf_vec_size; the scale-factor
            buffer is aligned to sf_vec_size along K, pads M to multiples of 128, and
            rounds (K / sf_vec_size) up to a multiple of 4 for storage.
            - The batch dimension B is preserved for both outputs.
        rf   r'   r   "k must be multiple of 16, but got .r   r      r   ri   rh   Tr   r|      
ri   rl   r1   rk   rm   rz   *silu_and_mul_scaled_nvfp4_experts_quantizeviewpermutefloat8_e4m3fn)r^   r   r_   ri   lr)   k_by_2rq   r+   scale_kpadded_kpadded_k_int32padded_moutputoutput_scalesrt   s                  r$   0silu_and_mul_scaled_nvfp4_experts_quantize_sm100zUget_fp4_quantization_module.<locals>.silu_and_mul_scaled_nvfp4_experts_quantize_sm100w  s   L {1faK;!###%N!%N%N%N###{"u%!+a/!QMc)C/Q16&LLLxek
 
 
 	99KKAqAv&&q8|^<<JJq1uf%%	
 	
 	
 1a((%**5+>??DDx3Ar1a
 
 &--aAq!Q??}$$r&   c                    | j         }| j        \  }}}|dz  }d}||z  dk    sJ d| d            ||z  }	|	dz   dz  dz  }
|
dz  }|dz   d	z  d	z  }t          j        |||dz  |t          j        
          }t          j        ||||t          j        
          }|                    t          j                                      ||d	z  |
dz  ddd          }|                    dddddd          }||fS )Nrf   r'   r   r   r   r   r   r   r   r   r|   r   r   )	ri   rl   r1   rk   rm   rz   r   r   r   )r^   r   r_   ri   r   r)   r   rq   r+   r   r   r   r   r   r   s                  r$   6_fake_silu_and_mul_scaled_nvfp4_experts_quantize_sm100z[get_fp4_quantization_module.<locals>._fake_silu_and_mul_scaled_nvfp4_experts_quantize_sm100  s3    {1faK;!###%N!%N%N%N###{"u%!+a/!QMc)C/Q16&LLLxek
 
 
 &**5+>??DDx3Ar1a
 
 &--aAq!Q??&&r&   z*flashinfer::scaled_fp4_grouped_quant_sm100input_tensorinput_global_scalec                    | j         }| j        \  }}}d}||z  dk    sJ d| d            ||z  }|dz   dz  dz  }	|	dz  }
|dz   dz  dz  }t          j        |||d	z  |t          j        
          }t          j        |||
|t          j        
          }                    |                    ||z  |d	z            |                    ||z  |
          |                     ||z  |          ||d           |                    dd	d          }|                    t          j	                                      ||dz  |	dz  ddd          }|                    ddddd	d          }||fS )af  
        Quantize input tensor to FP4 and return quantized tensor and scale, for
        grouped gemm inputs (e.g., grouped_gemm_nt_masked for flashinfer).
        Args:
            input: The input tensor to be quantized to FP4, with shape (l, m, k)
                l is number of groups, m is number of tokens per group, k is number of features.
            input_global_scale: A scalar scaling factor for the entire tensor, with
                shape (l,).
        Outputs:
            output: The quantized tensor in FP4, with shape (m, k // 2, l) but the physical
                layout is (l, m, k // 2). `// 2` is because two fp4 values are packed into
                an uint8.
            output_scales: The blockscale tensor in FP8-E4M3, with shape (32, 4, rm, 4, rk, l)
                but the physical layout is (l, rm, rk, 32, 4, 4).
        Note:
            For the shape of output_scales, `32 * 4 * rm` is a padded m to nearest multiple of 128.
            `4 * rk` is a padded `k // 16` to nearest multiple of 4. These layout constants are
            required by the NVIDIA Blackwell MMA operations.
        r'   r   r   r   r   r   r   r   rf   r   Fr   r|   r   r   )r   r   r   ri   r   r)   rq   r+   r   r   r   r   r   r   rt   s                 r$   scaled_fp4_grouped_quant_sm100zCget_fp4_quantization_module.<locals>.scaled_fp4_grouped_quant_sm100  s   8 $$1a;!###%N!%N%N%N###{"u%!+a/!QMc)C/Q16&LLLxek
 
 
 	99KKAqAv&&q8|^<<a!eQ''	
 	
 	
 1a(( &**5+>??DDx3Ar1a
 
 &--aAq!Q??}$$r&   c                    | j         }| j        \  }}}d}||z  dk    sJ d| d            ||z  }|dz   dz  dz  }	|	dz  }
|dz   dz  dz  }t          j        |||d	z  |t          j        
          }t          j        |||
|t          j        
          }|                    dd	d          }|                    t          j                                      ||dz  |	dz  ddd          }|                    ddddd	d          }||fS )Nr'   r   r   r   r   r   r   r   rf   r   r   r|   r   )	ri   rl   r1   rk   rm   rz   r   r   r   )r   r   r   ri   r   r)   rq   r+   r   r   r   r   r   r   s                 r$   $_fake_scaled_fp4_grouped_quant_sm100zIget_fp4_quantization_module.<locals>._fake_scaled_fp4_grouped_quant_sm100  sA    $$1a;!###%N!%N%N%N###{"u%!+a/!QMc)C/Q16&LLLxek
 
 
 1a((%**5+>??DDx3Ar1a
 
 &--aAq!Q??}$$r&   z0flashinfer::e2m1_and_ufp8sf_scale_to_float_sm100r   e2m1_tensorufp8_scale_tensorglobal_scale_tensor	ufp8_typec           	      H   t          j        | j        d         | j        d         dz  ft           j        d          }                    |                                 |                                                    d          |                                ||||           |S )a  Convert E2M1 format tensor and UFP8 scale factors to float tensor.

        This function performs dequantization by converting a packed FP4 tensor in E2M1 format
        back to float values using the associated UFP8 scale factors and global scale.

        Args:
            e2m1_tensor (torch.Tensor): Packed FP4 tensor in E2M1 format of shape [M, K/2] with dtype uint8.
            ufp8_scale_tensor (torch.Tensor): Scale factors tensor in UFP8 format with dtype uint8.
            global_scale_tensor (torch.Tensor, optional): Global scale factor of shape [1] and dtype float32.
            sf_vec_size (int, optional): Scale factor vector size. Defaults to 16.
            ufp8_type (int, optional): UFP8 scale factor type (0 for UE8M0, 1 for E4M3). Defaults to 1.
            is_sf_swizzled_layout (bool, optional): Whether scale factors use swizzled layout. Defaults to True.

        Returns:
            torch.Tensor: Dequantized float tensor of shape [M, K] with dtype float32.
        r   r   rf   cpurg   re   )r1   zerosrl   r   $e2m1_and_ufp8sf_scale_to_float_sm100r   reshape)r   r   r   r+   r   ra   r   rt   s          r$   r   zIget_fp4_quantization_module.<locals>.e2m1_and_ufp8sf_scale_to_float_sm100/  s    8 kq!;#4Q#7!#;<-
 
 

 	33OO!!##++B//##%%!	
 	
 	
 
r&   c                 z    |                      | j        d         | j        d         dz  gt          j                  S r   r   )r   r   r   r+   r   ra   s         r$   *_fake_e2m1_and_ufp8sf_scale_to_float_sm100zOget_fp4_quantization_module.<locals>._fake_e2m1_and_ufp8sf_scale_to_float_sm100[  sB     $$q!;#4Q#7!#;<EM % 
 
 	
r&   )ru   r   r   r   r   r   r   Nr'   FTFN)Nr'   FTr|   )Nr'   F)NNr'   r   T)rN   rK   rH   rB   r>   rE   
ValueErrorbuild_and_loadr   r1   Tensorr   intboolr   r   r   )r[   backend_modulesru   r{   r   r   r   r   r   r   r   r   r   r   r   r   rt   s                   @r$   get_fp4_quantization_moduler      sm    10000. O o%%6W66777%_W%''6688F(   04"&*!&%)5 5|5u|,5 5 	5
  $5 5 TN5 
u|U\)	*5 5 5 5 5	 5n 677 04"&*
 
|
u|,
 
 	

  $
 
u|U\)	*
 
 
 87
 +    |  
	    	 $ 9:: 
 

|
 
 
	
 
 
 ;:
 2  |	    	 . @AA
|
	
 
 
 BA
 0   04"	< <|<u|,< < 	<
 
u|U\)	*< < < < <	 <| >?? 04"	
 
|
u|,
 
 	

 
u|U\)	*
 
 
 @?
 F   04>% >%|>%l>% u|,>% 
u|U\)	*	>% >% >% >% >%	 >%@ TUU 04' '|'l' u|,' 
u|U\)	*	' ' ' VU'4 4  8%l8%!L8% l8% 
u|U\)	*	8% 8% 8% 8% 8%	 8%t BCC%l%!L% l% 
u|U\)	*	% % % DC%4 :   7;&*& &\& <& &el3& 	&
 &  $& 
& & & & &	 &P HII 7;&*

 

\

 <

 &el3

 	


 

  $

 


 

 

 JI

 -%A-Q3#=9i'E   r&   FTr^   r_   r`   ra   rb   rc   c           	      ,   |dk    r|dk    rt          d          |                     d          dk    }|r|                     dd          } | j        d         |z  dk    sJ |t	          | j                  }t          | j                  \  }}	t          | |	                               | ||||||          \  }
}|	                    d| j        d         |z  f          }|r,|
                    dd          }
|                    dd          }|
|fS )a  Quantize input tensor to FP4 format.

    This function implements FP4 quantization that converts input tensors to a compressed FP4 format
    with associated scale factors. It supports various input data types and scale factor layouts.

    Args:
        input (torch.Tensor): Input tensor of shape [M, K] with dtype fp16/bf16/fp8_quantized.
        global_scale (torch.Tensor, optional): Global scale factor of shape [1] and dtype float32.
        sf_vec_size (int, optional): Scale factor vector size. Defaults to 16.
        sf_use_ue8m0 (bool, optional): Whether to use UE8M0 format for scale factors. Defaults to False.
        is_sf_swizzled_layout (bool, optional): Whether to use swizzled layout for scale factors. Defaults to True.
        is_sf_8x4_layout (bool, optional): Whether to use 8x4 layout or 128x4 layout for scale factors. Defaults to False.
        enable_pdl (Optional[bool], optional): Whether to enable PDL (Programmatic Dependent Launch).
            If None, automatically detects based on device capability. Defaults to None.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - Quantized tensor of shape [M, K/2] with dtype FLOAT4_E2M1X2
            - Scale factors tensor with shape determined by layout and sf_vec_size

    Raises:
        NotImplementedError: If any of the following features are requested but not implemented:
            - BFloat16 input when BFloat16 is not enabled
            - FP8 input when FP8 is not enabled
            - sf_vec_size other than 16 or 32
    r'   r|   z sf_vec_size can only be 16 or 32r   r   re   r   )
NotImplementedErrorstride	transposerl   r   ri   r   r   ru   r   )r^   r_   r+   r`   ra   rb   rc   is_column_majormajorminorx_qsfs               r$   ro   ro   t  s;   H b[B..!"DEEE ll2&&!+O (B'';r?[(A----'55
)%,77LE5)U*;E*;*;<<OO GC 
RRK78	9	9B "mmB##\\"b!!7Nr&   c                     | j         t          j        k    s'| j         t          j        k    sJ d| j                      t	          | j                  \  }}|dz  |z    }t          |                              |           S )a  Swizzle block scale tensor for FP4 format.

    This function swizzles the block scale tensor to optimize memory access patterns
    for FP4 operations. The output needs to be padded in the m dimension to be a multiple of 128.

    Args:
        unswizzled_sf (torch.Tensor): Input tensor with dtype uint8 or bfloat16.

    Returns:
        torch.Tensor: Swizzled tensor with the same shape as input.

    Raises:
        AssertionError: If input dtype is not uint8 or bfloat16.
    z+Input dtype must be uint8 or bfloat16, got 
   )rh   r1   rm   bfloat16r   ri   r   r   )r(   r   r   rP   s       r$   block_scale_interleaver     s    $ 	u{**m.AU^.S.S.SJ]5HJJ /T.SS *-*>??LE5RZ%')K&{33PP  r&   r   r   r   r   c                     t          t          j        d                    \  }}|dz  |z    }t          |                              | |||||          S )a  Convert E2M1 format tensor and UFP8 scale factors to float tensor.

    This function performs dequantization by converting a packed FP4 tensor in E2M1 format
    back to float values using the associated UFP8 scale factors and global scale.

    Args:
        e2m1_tensor (torch.Tensor): Packed FP4 tensor in E2M1 format of shape [M, K/2] with dtype uint8.
        ufp8_scale_tensor (torch.Tensor): Scale factors tensor in UFP8 format with dtype uint8.
        global_scale_tensor (torch.Tensor, optional): Global scale factor of shape [1] and dtype float32.
        sf_vec_size (int, optional): Scale factor vector size. Defaults to 16.
        ufp8_type (int, optional): UFP8 scale factor type (0 for UE8M0, 1 for E4M3). Defaults to 1.
        is_sf_swizzled_layout (bool, optional): Whether scale factors use swizzled layout. Defaults to True.

    Returns:
        torch.Tensor: Dequantized float tensor of shape [M, K] with dtype float32.

    cuda:0r   )r   r1   ri   r   r   )	r   r   r   r+   r   ra   r   r   rP   s	            r$   e2m1_and_ufp8sf_scale_to_floatr     sn    6 *X LE5 RZ%')K& ** 	r&   r   epilogue_tile_mc                 b    t          | |          }| |                    | j                           S )z;
    PyTorch equivalent of trtllm-gen `shuffleMatrixA`
    )r   tori   )r   r   row_indicess      r$   shuffle_matrix_ar     s-    
 3<QQK|':;;<<r&   num_elts_per_sfc                     t          | |          }| |                    | j                           }t          |          S )a  
    Cuda implementation of trtllm-gen `shuffleMatrixSfA` but with a caveat.
    `shuffleMatrixSfA` expects the input to be in 128x4 layout and then
    apply the same shuffling in `shuffleMatrixA` and writes out in 128x4
    layout.
    This function expects the input to be in linear layout. It's done this
    way because the scaling factors in the NVFP4 checkpoints are quantized
    and are in linear layout.
    This function doesn't add padding.
    )r   r   ri   r   )r   r   r   r   
w_shuffleds        r$   shuffle_matrix_sf_ar     s<    " 6lOTTKknn\-@AABJ "*---r&   c                       e Zd ZdZdZdZdZdS )SfLayoutz,
    Layout of scale factors for NVFP4.
    r   r   rf   N)__name__
__module____qualname____doc__layout_128x4
layout_8x4layout_linearr   r&   r$   r   r   %  s)          LJMMMr&   r   c           	      ,   |r|t           j        k    sJ t          |                                 |                                |ddd|          \  }}d}t	          |                    t          j                  |          }t          |                    t          j                  |          	                    |j
                  }nKt          |                                 |                                |dd|t           j        k    |          \  }}||fS )a  
    Quantize input tensor to NVFP4 format.

    Parameters:
        a (torch.Tensor): Input tensor of shape [M, K] with dtype fp16/bf16.
        a_global_sf (torch.Tensor): Global scale factor of shape [1] with dtype float32.
        sfLayout (SfLayout, optional): Scale factor layout. Defaults to SfLayout.layout_128x4.
        do_shuffle (bool, optional): Whether to shuffle the scale factors. Defaults to False. Only TRTLLM backend needs to shuffle the tensor B scale factors.
        sf_vec_size (int, optional): Scale factor vector size. Defaults to 16.
        enable_pdl (Optional[bool], optional): Whether to enable PDL (Programmatic Dependent Launch).
            If None, automatically detects based on device capability. Defaults to None.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - Quantized tensor of shape [M, K/2] with dtype FLOAT4_E2M1X2
            - Scale factors tensor with shape determined by layout and sf_vec_size
    F)r`   ra   rb   rc   r   T)r   r   ro   cudar   r   r1   rm   r   r   rl   r   )	aa_global_sfsfLayout
do_shuffler+   rc   a_fp4a_sfr   s	            r$   nvfp4_quantizer   /  s   6  
800000"FFHH"'"!
 
 
t  EK!8!8/JJ"499U[#9#9?KKSSJ
 
 #FFHH"&%)<<!
 
 
t $;r&   c                    d|                                                                                                                                  z  }t	          |                                 |                                ddd          \  }}||fS )a  
    Quantize input tensor to MXFP4 format.

    Parameters:
        a (torch.Tensor): Input tensor of shape [M, K] with dtype fp16/bf16.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - Quantized tensor of shape [M, K/2] with dtype uint8 (FLOAT4_E2M1X2)
            - Scale factors tensor with shape determined by layout and sf_vec_size (uint8)
    i
  r|   T)floatabs
nan_to_nummaxro   r   )r   r   r   r   s       r$   mxfp4_quantizer   l  sm     aggiimmoo88::>>@@@Kqvvxx)9)9););RtLLKE4$;r&   c                 @   t          |                                                     t          j                  |                                                    t          j                                      d          t          j        dg| j                  ddd          S )aj  
    Dequantize input tensor from MXFP4 format.

    Parameters:
        a_fp4 (torch.Tensor): Quantized tensor of shape [M, K/2] with dtype uint8 (FLOAT4_E2M1X2)
        a_sf (torch.Tensor): Scale factors tensor with shape determined by layout and sf_vec_size (uint8)

    Returns:
        torch.Tensor: Dequantized tensor of shape [M, K] with dtype float.
    re   g      ?)ri   r|   r   T)r   r   r   r1   rm   r   tensorri   )r   r   s     r$   mxfp4_dequantizer   ~  sw     *		%%

$$,,R00cU5<000
	  r&   r|   r}   r~   r   c                     t          t          j        d                    \  }}|dz  |z    }t          |                              | ||          S )a  
    Dequantize input tensor from MXFP4 format on host.

    Parameters:
        weight (torch.Tensor): Quantized tensor of shape [M, K/2] with dtype uint8 (FLOAT4_E2M1X2)
        scale (torch.Tensor): Scale factors tensor with shape determined by layout and sf_vec_size (uint8)
        group_size (int, optional): Group size for dequantization. Defaults to 32.

    Returns:
        torch.Tensor: Dequantized tensor of shape [M, K] with dtype float.
    r   r   )r   r1   ri   r   r   )r}   r~   r   r   r   rP   s         r$   r   r     s`    $ *X LE5 RZ%')K&{33II  r&   c                     t          | j                  \  }}|dz  |z    }t          |                              | ||d          \  }}||fS )a.  
    Quantize batched input tensor to NVFP4 format.

    Parameters:
        a (torch.Tensor): Input tensor of shape [B, M, K] with dtype fp16/bf16.
        a_global_sf (torch.Tensor): Global scale factor of shape [1] with dtype float32.
        sf_vec_size (int, optional): Scale factor vector size. Defaults to 16.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - Quantized tensor of shape [B, M, K/2] with dtype FLOAT4_E2M1X2
            - Scale factors tensor with shape determined by layout and sf_vec_size
    r   F)r   ri   r   r   )r   r   r+   r   r   rP   r   r   s           r$   nvfp4_batched_quantizer     sa    & *!(33LE5RZ%')K-k::UU		 KE4 $;r&   c                     t          | j                  \  }}|dz  |z    }t          |                              | ||          \  }}||fS )a.  
    quantize batched input tensor to NVFP4 format with mask.
    Parameters:
        a (torch.Tensor): Input tensor of shape [B, M, K] with dtype fp16/bf16.
        a_global_sf (torch.Tensor): Global scale factor of shape [1] with dtype float32.
        mask (torch.Tensor): Mask tensor to apply before quantization.
    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - Quantized tensor of shape [B, M, K/2] with dtype FLOAT4_E2M1X2
            - Scale factors tensor with shape determined by layout and sf_vec_size
    r   )r   ri   r   r   )r   r   r   r   r   rP   r   r   s           r$   scaled_fp4_grouped_quantizer     sf    " *!(33LE5RZ%')K- $$	  E4 $;r&   )r   )r'   )r<   r   r   r   )>r   	functoolsenumr   typesr   typingr   r   r   r1   api_loggingr	   jitr
   r   rY   r   r   r   r   r   r   r   jit.cpp_extr   utilsr   r   r   r   r   r   r%   r   r   r:   r>   rB   rE   rH   rK   rN   strr=   cacher   r   ro   r   nvfp4_block_scale_interleaver   r   r   r   r   r   r   r   r   r   r   r   r&   r$   <module>r     s               ! ! ! ! ! ! ( ( ( ( ( ( ( ( ( (  ' ' ' ' ' '                              2 1 1 1 1 1               & & & & EG <$',/>A
\   :A7 A A A AA7 A A A A?' ? ? ? ?A7 A A A AA7 A A A AA7 A A A ADI C G    > g g g g g gT  ,0"&"!%> ><>5<(> > 	>
  > > > 5<%&> > > >B %, 5<    :  6   37"&' ''|' "%,/' 	'
 '  ' \' ' ' 'T =5< =# =%, = = = =  . .,.. . . . .0    t     "9 9 9 9x   "   *   L<  \	   8     :     r&   