
    )`ia              
       f   d Z ddlmZ ddlmZmZ ddlZddlmZ ddl	m
Z
mZ ddlmZ ddlZddlmZmZmZmZmZmZ dd	lmZmZ dd
lmZ ddlmZ ej        d             Zdej        dej        dej        dej        ddf
dZ edej        deej!        ej        f         dej        fd            Z"dS )a3  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    )SimpleNamespace)DictListN)flashinfer_api)convert_to_block_layout!get_w2_permute_indices_with_cache)"gen_trtllm_low_latency_gemm_module)	AutoTunerTuningConfigDynamicTensorSpecConstraintSpecTunableRunnerOptimizationProfile)&get_last_power_of_2_num_tokens_bucketslast_positive_power_of_2)setup_cubin_loader)_get_cache_bufc                      t                      } |                                 t          t          |                                                       G fddt
                    fd}t          |          S )Nc            	           e Zd Zdeej                 dedee         f fdZ	 	 ddeej                 dede	dej        f fd	Z
d
S )Fget_trtllm_low_latency_gemm_module.<locals>.TrtllmLowLatencyGemmRunnerinputsprofilereturnc           	      
   d}d}|                                 |         }|                                 |         }|d         }|d         }|d         }	|\  }}}
}d}d}t                              |||	||                    }|S )Nr         )get_opt_shapeslisttrtllm_low_latency_gemm_tactics)selfr   r   a_tensor_indexb_tensor_indexabmnkglobal_scaleout	type_e4m3	type_bf16valid_tacticsops                  v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/trtllm_low_latency_gemm.pyget_valid_tacticszXget_trtllm_low_latency_gemm_module.<locals>.TrtllmLowLatencyGemmRunner.get_valid_tactics5   s    
 NN &&((8A&&((8A!A!A!A II 221aIyQQ M !     Ftacticdo_preparationc                    |\  }}}}|dk     r|S |j         d         }	|j         d         }
|j         d         }                    |	|
||          }t          d||j                  }                    ||||||           |S )Nr   r   trllm_low_latency_gemm)shapeget_workspace_size_in_bytesr   devicetrtllm_low_latency_gemm)r    r   r2   r3   kwargsr#   r$   r(   r)   r%   r&   r'   workspace_sizeworkspace_bufferr-   s                 r.   forwardzNget_trtllm_low_latency_gemm_module.<locals>.TrtllmLowLatencyGemmRunner.forwardP   s     zz

A
A
A;;Aq!VLLN-(.!(    &&    Jr0   N)r1   F)__name__
__module____qualname__r   torchTensorr   intr/   boolr=   )r-   s   r.   TrtllmLowLatencyGemmRunnerr   4   s        	!&	! )	! #Y		! 	! 	! 	! 	! 	!< #(		 	&	 	 !		 \	 	 	 	 	 	 	 	r0   rE   c                                    S )N )rE   s   r.   gemm_runnerz7get_trtllm_low_latency_gemm_module.<locals>.gemm_runnerp   s    ))+++r0   )rH   )r	   build_and_loadr   strget_library_pathr   r   )modrH   rE   r-   s     @@r.   "get_trtllm_low_latency_gemm_modulerM   .   s    
,
.
.C					Bs3//1122333: : : : : : :] : : :x, , , , ,    r0   ABr(   r)   r   c           	      |   t          j                    }dd}t          t          fdt          t
                    ft          |dfd          f          }| |||g}g }|                    t                      	                                           |
                    d|||          \  }	}
 |	||
           |S )	a  GEMM optimized for low M dimension. B needs to be shuffled and its layout needs to be adjusted.
    Only supported on Blackwell GPUs.

    Parameters
    ----------
    A: torch.Tensor
        Input tensor, shape (m, k), fp8 e4m3.

    B: torch.Tensor
        Mat2 tensor, shape (k // block_size, n, block_size), fp8 e4m3. block_size is 128 for e4m3.

    global_scale: torch.Tensor
        Scale tensor for the output, float.

    out: torch.Tensor
        Out tensor, shape (m, n), bf16.

    Examples
    --------
    >>> import torch
    >>> from flashinfer import mm_fp8, prepare_low_latency_gemm_weights
    >>> m = 16
    >>> n = 2560
    >>> k = 32768
    >>> a = torch.randn([m, k], device="cuda", dtype=torch.bfloat16)
    >>> a_fp8, a_inv_s = to_float8(a, dtype=torch.float8_e4m3fn)
    >>> b = torch.randn([n, k], device="cuda", dtype=torch.bfloat16)
    >>> b_fp8, b_inv_s = to_float8(b, dtype=torch.float8_e4m3fn)
    >>> prepared_b = prepare_low_latency_gemm_weights(b_fp8, _cache_permute_indices)
    >>> prepared_b.shape
    torch.Size([256, 16, 128])
    >>> global_scale = a_inv_s * b_inv_s
    >>> out = torch.zeros([m, n], device="cuda", dtype=torch.bfloat16)
    >>> mm_fp8(a_fp8, prepared_b, global_scale, out)
    >>> out.shape
    torch.Size([16, 2560])
    r      )rR   c                      |          d         S )NrR   rG   )shapesr!   s    r.   <lambda>z)trtllm_low_latency_gemm.<locals>.<lambda>   s    VN5KB5O r0   )dynamic_tensor_specsconstraint_specsr9   )r   r2   )r
   getr   r   r   r   r   appendrM   rH   
choose_one)rN   rO   r(   r)   tunerout_tensor_indextuning_configr   runnersrunnerr2   r!   s              @r.   r9   r9   y   s    X MOOEN !6(	 
  "&O&O&O&O 
  M L#&F#%GNN577CCEEFFF%%!	 NFF F&((((Jr0   wpermutation_indices_cachec                     d}t          || |          }| |                    | j                                                           }d}t	          ||          }|S )a  Helper method to prepare the input weight tensor for low-latency TRTLLM GEMM. It includes shuffling and converting to block layout.

    Parameters
    ----------
    w: torch.Tensor
        The weight tensor to shuffle, shape (n, k), fp8 e4m3.

    permutation_indices_cache: dict
        Some location to cache permutation indices. Calculating them is expensive.

    Returns
    -------
    block_layout_shuffled_weights: torch.Tensor
        The shuffled and block-layout weight tensor, shape (k // 128, n, 128), fp8 e4m3.
       )r8   )r   tor8   
contiguousr   )r`   ra   epilogue_tile_mpermute_indicesshuffled_weightsblock_kblock_layout_weightss          r.    prepare_low_latency_gemm_weightsrk      sg    ( O7!1o O ++18+<<=HHJJG23CWMMr0   )#__doc__typesr   typingr   r   	functoolsflashinfer.api_loggingr   flashinfer.fused_moe.corer   r   flashinfer.jit.gemm.corer	   rA   flashinfer.autotunerr
   r   r   r   r   r   flashinfer.fused_moe.utilsr   r   flashinfer.jitr   flashinfer.utilsr   cacherM   rB   r9   Sizerk   rG   r0   r.   <module>ry      s     " ! ! ! ! !             1 1 1 1 1 1        H G G G G G                        . - - - - - + + + + + + G G GTI|I|I ,I 
	I
 
I I I IX  | 04UZ5M0N 
\           r0   