
    )`i                        d Z ddlZddlZddlZddlmZmZmZmZ ddl	Z	ddl
Z
ddlZddlZddlZddlmZmZmZ ddlmZ dEdefdZdeej                 defd	Zdeej                 fd
Z	 dFdeej                 dedefdZd Zdedededeeeef                  fdZdGdZdej        fdZdej        deej        ej        f         fdZ dej        deej        ej        f         fdZ!d Z"d Z#d Z$d Z%d Z&d Z'd Z(d  Z)ej*        ej*        ej*        fd!Z+ej*        ej*        ej*        fd"Z,	 	 	 	 	 	 	 	 	 	 	 dHd(ed)ed*ed+ed,ee-         d-ee         d.ee.         d/e-dedee         d0e-fd1Z/	 	 	 	 	 	 	 	 	 	 	 	 dId(ed)ed*ed+ed,ee-         d-ee         d.ee.         d/e-d2e-dedee         d0e-fd3Z0	 	 	 	 	 	 	 	 	 	 	 	 dJd(ed)ed*ed+ed5ed,ee-         d-ee         d.ee.         d/e-dedee         d0e-fd6Z1	 	 	 	 	 	 	 	 	 	 	 	 	 	 dKd(ed)ed*ed+ed,ee-         d-ee         d.ee.         d/e-d7e-d2e-d5ededee         d0e-fd8Z2 G d9 d:          Z3 G d; d<          Z4	 	 	 	 	 dLd>ed?e-d@e.dAe-dBe-f
dCZ5dD Z6dS )Ma3  
Copyright (c) 2023 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)TupleAnyListOptional)	rearrangereducerepeat)round_upreturnc                     | t           j                                        } t           j                            |           }|j        S )z
    Get L2 cache size in bytes for the given CUDA device.

    Args:
        device: CUDA device (int, torch.device, or None for current device).

    Returns:
        L2 cache size in bytes.
    )torchcudacurrent_deviceget_device_propertiesL2_cache_size)devicepropss     l/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/testing/utils.pyget_l2_cache_sizer   &   s:     ~**,,J,,V44E    tensorsc                     d}| D ]O}t          |t          j                  r3|j        r,||                                |                                z  z  }P|S )z
    Calculate total bytes of tensors residing on GPU.
    Assumes all tensors are on the same device.

    Args:
        tensors: List of torch.Tensor objects.

    Returns:
        Total bytes occupied by GPU tensors (CPU tensors are ignored).
    r   )
isinstancer   Tensoris_cudanumelelement_sizer   totalts      r   _calculate_tensor_bytesr!   6   s\     E 2 2a&& 	219 	2QWWYY!1!111ELr   c                    g }t          | t          j                  r| j        r|                    |            nt          | t
          t          f          r(| D ]$}|                    t          |                     %nNt          | t                    r9| 
                                D ]$}|                    t          |                     %|S )a  
    Recursively extract all GPU-resident tensors from a nested structure
    of lists, tuples, and dicts.

    Args:
        obj: Object to extract tensors from (can be tensor, list, tuple, dict, or other).

    Returns:
        Flat list of tensors on GPU found in the structure.
    )r   r   r   r   appendlisttupleextend_extract_gpu_tensorsdictvalues)objr   itemvs       r   r'   r'   H   s     G#u|$$ 4 4s	C$	'	' 4 	7 	7DNN/556666	7	C		 4 	4 	4ANN/223333Nr      min_rotationsc                     t          |          }t          |           }|dk    rdS |dz  }||k    rdS t          j        ||z            dz   }t	          ||          S )a!  
    Calculate the number of buffer copies needed to ensure cold L2 cache.

    The function uses conservative thresholds to account for:
    - LRU eviction being gradual (not all data evicted when capacity exceeded)
    - Cache associativity effects (some data may persist in non-conflicting sets)
    - Hardware prefetching behavior

    Returns 1 (no rotation needed) only when tensor size substantially exceeds
    L2 cache (>= 5x), ensuring cache effects are truly negligible.

    Args:
        tensors: List of tensors to consider for rotation (must be on GPU).
        device: Device for L2 cache query (None for current device).
        min_rotations: Minimum number of rotations when rotation is needed.

    Returns:
        Number of buffer copies needed (1 means no rotation needed).
    r         )r   r!   mathceilmax)r   r   r.   l2_sizetotal_bytessafe_cache_thresholdnum_rotationss          r   calculate_rotation_countr9   _   sw    ,  ''G)'22Kaq #Q;***q
 I2[@AAAEM}m,,,r   c                    t          | t          j                  r| j        r|                                 r&|                                                                 S t          j        |                                 | 	                                | j
        | j                  }|                    |                                            |S | S t          | t                    rd | D             S t          | t                    rt          d | D                       S t          | t                    rd |                                 D             S | S )a(  
    Deep clone a nested structure, cloning GPU tensors with detach().clone()
    while preserving scalars, booleans, and other non-tensor values.

    For non-contiguous tensors (e.g., created with as_strided), this function
    preserves the stride pattern using torch.empty_strided() + copy_(). This is
    important for backends like cuDNN that expect specific memory layouts.

    Args:
        obj: Object to clone (tensor, list, tuple, dict, or other).

    Returns:
        Cloned structure with GPU tensors cloned, other values preserved.
    dtyper   c                 ,    g | ]}t          |          S  _clone_structure.0r+   s     r   
<listcomp>z$_clone_structure.<locals>.<listcomp>   s!    7774 &&777r   c              3   4   K   | ]}t          |          V  d S Nr?   rA   s     r   	<genexpr>z#_clone_structure.<locals>.<genexpr>   s+      <<%d++<<<<<<r   c                 4    i | ]\  }}|t          |          S r>   r?   )rB   kr,   s      r   
<dictcomp>z$_clone_structure.<locals>.<dictcomp>   s'    ???41a#A&&???r   )r   r   r   r   is_contiguousdetachcloneempty_stridedsizestrider<   r   copy_r$   r%   r(   items)r*   results     r   r@   r@      s9    #u|$$ ; 	  "" zz||))+++ ,HHJJJJLL):	   SZZ\\***J	C		 7737777	C		 <<<<<<<<	C		 ??399;;???? 
r   
input_argsinput_kwargsr8   c                     |dk    r| |fgS g }|                     | |f           t          |dz
            D ]7}t          |           }t          |          }|                     ||f           8|S )a  
    Create multiple copies of input_args and input_kwargs for buffer rotation.

    The first copy (index 0) uses the original args/kwargs.
    Subsequent copies clone all GPU tensors while preserving other values.

    Args:
        input_args: Positional arguments tuple.
        input_kwargs: Keyword arguments dict.
        num_rotations: Number of buffer copies to create.

    Returns:
        List of (args, kwargs) tuples, one for each rotation index.
    r0   )r#   ranger@   )rS   rT   r8   copies_cloned_argscloned_kwargss          r   _create_rotated_buffer_copiesr[      s    " \*++F
MM:|,--- =1$%% 4 4&z22(66{M23333Mr   r   c                 j    |i }t          |           t          |          z   }|r|d         j        S |S )a-  
    Infer CUDA device from GPU tensors in input_args/input_kwargs.

    Args:
        input_args: Positional arguments tuple.
        input_kwargs: Keyword arguments dict (can be None).
        default: Default device if no GPU tensors found.

    Returns:
        Device string or torch.device.
    Nr   )r'   r   )rS   rT   defaultgpu_tensorss       r   _infer_device_from_tensorsr_      sD     &z225I,5W5WWK %1~$$Nr   xc           	         |                      d                                                                          dk    sJ t          j        dt          j        t          j        |                                                               S )imported from DeepGEMMr          @)viewamaxr+   r   powr3   log2abs)r`   s    r   _ceil_to_ue8m0rj      sa    66"::??!!##a''''9S%*UZ%8%899:::r   c                 2   |                                  dk    r|                     d          dz  dk    sJ | j        \  }}|                     |dd          }|                                                                                    d                              |d                              d          }t          |dz            }|d	|	                    d          z  z  
                    t          j                                      ||          |fS )
rb   r-   r0      r   rc   )dim-C6?      |@      ?)rm   rN   shapere   ri   floatrf   clamprj   	unsqueezetor   float8_e4m3fn)r`   mnx_viewx_amaxsfs         r   per_token_cast_to_fp8r|      s    5577a<<AFF1IIOq00007DAqVVAr3FZZ\\!!&&1&--221b99??EEF		'	'BcBLLOO+,001DEEJJ1aPPRTTTr   c                 "   |                                  dk    sJ | j        \  }}t          j        t	          |d          t	          |d          f| j        | j                  }| |d|d|f<   |                    dd|                    d          dz  d          }|	                                
                                                    dd	                              d
          }t          |dz            }|d|z  z                      t          j                  }|                    |          d|d|f                                         |                    |                    d          |                    d                    fS )rb   r-   rl   r;   Nrc   r0   )r0      T)rm   keepdimrn   ro   rp   r   )rm   rq   r   zerosr
   r<   r   re   rN   ri   rr   rf   rs   rj   ru   rv   view_as
contiguous)r`   rw   rx   x_paddedry   rz   r{   x_scaleds           r   per_block_cast_to_fp8r      sg   5577a<<<<7DAq{	!S		8As++,AGAH  H HRaR!V]]2sHMM!$4$4$;SAAFZZ\\!!&&64&@@FFtLLF		'	'B#(#''(;<<HH%%bqb"1"f-88::BGGAA= =  r   c           	         | j         }|dv sJ d|             |t          |          cxk    rt          |          k    sn J t          j        t          j                  }t          j        |j        | j        t          j                  }|dk    r||\  }}|\  }	}
|dk    rt          | d||          }t          |                                dd	                              d
          }||z  }t          j        dt          j        t          j        |                                                              }t!          |d|	|
          }nDt          | d||          }t          |                                dd	                              d
          }||z  }t          j        dt          j        t          j        |                                                              }t          |d          }t!          |d|	|
          }n|dk    r|\  }}}|\  }	}
}|dk    rt          | d|||          }t          |                                dd	                              d
          }||z  }t          j        dt          j        t          j        |                                                              }t!          |d|	|
|          }nt          | d|||          }t          |                                dd	                              d
          }||z  }t          j        dt          j        t          j        |                                                              }t          |d          }t!          |d|	|
|          }| |dz   z  }|                    t          j                  }||fS )  
    Quantizes a 2D or 3D tensor to FP8.

    Args:
        x (torch.Tensor): The 2D or 3D input tensor.
        scale_shape (tuple): The shape of the scale tensor.
        tile_shape (tuple): The shape of the tiles.
        scale_major_mode (str): The tiling order, "K" for row-major like,
                                or another value for column-major like.

    Returns:
        tuple: A tuple containing the quantized FP8 tensor and the
               calculated float32 scales.
    r-   r~   x.ndim must be 2 or 3, but got r   r<   r-   K(s0 t0) (s1 t1) -> s0 s1 t0 t1s0s1zs0 s1 t0 t1 -> s0 s1r4   rn   rd   zs0 s1 -> (s0 t0) (s1 t1))t0t1z(s1 t0) (s0 t1) -> s0 s1 t0 t1zs0 s1 -> s1 s0zs1 s0 -> (s1 t0) (s0 t1)r~   z,(s0 t0) (s1 t1) (s2 t2) -> s0 s1 s2 t0 t1 t2r   r   s2zs0 s1 s2 t0 t1 t2 -> s0 s1 s2z#s0 s1 s2 -> (s0 t0) (s1 t1) (s2 t2))r   r   t2z,(s0 t0) (s2 t1) (s1 t2) -> s0 s1 s2 t0 t1 t2zs0 s1 s2 -> s0 s2 s1z#s0 s2 s1 -> (s0 t0) (s2 t1) (s1 t2)g:0yE>)ndimlenr   finforv   tensorr4   r   float32r   r   ri   rs   rg   r3   rh   r	   ru   )r`   scale_shape
tile_shapescale_major_moder   fp8_infofp8_amaxr   r   r   r   x_tiledabs_maxx_scalescales_repeatedscales_permutedr   r   x_fp32x_fp8s                       r   quantize_fp8r   
  s     6D6>>>CTCC>>>3{##6666s:666666{5.//H|HLOOOH qyyBBs""#CrRRRGW[[]],BEJJPPQUVVG(GiUZ
7;;==0I0I%J%JKKG %W.HRTVWWWOO  #CrRRRGW[[]],BEJJPPQUVVG(GiUZ
7;;==0I0I%J%JKKG (1ABBO$!;r  OO 
 
B
Bs""AbRTV  G > eDkk  (GiUZ
7;;==0I0I%J%JKKG %>2"QS  OO
  AbRTV  G > eDkk  (GiUZ
7;;==0I0I%J%JKKG (1GHHO$5  O /D()F IIe)**E'>r   c                    | j         }|dv sJ d|             |t          |j                  k    sJ |dk    r|dk    r|j        \  }}n
|j        \  }}t          |                     t
          j                  d||          } |dk    rt          |d          }nt          |d          }t          | |z  d	          }n|d
k    r|dk    r|j        \  }}}n|j        \  }}}t          |                     t
          j                  d|||          } |dk    rt          |d          }nt          |d          }t          | |z  d          }|S )r   r   r   r-   r   r   r   zs0 s1 -> s0 s1 1 1zs0 s1 -> s1 s0 1 1zs0 s1 t0 t1 -> (s0 t0) (s1 t1)r~   z+(s0 t0) (s1 t1) (s2 t2)-> s0 s1 s2 t0 t1 t2r   zs0 s1 s2 -> s0 s1 s2 1 1 1zs0 s1 s2 -> s0 s2 s1 1 1 1z,s0 s1 s2 t0 t1 t2 -> (s0 t0) (s1 t1) (s2 t2))r   r   rq   r   ru   r   r   )r`   r   r   r   r   r   outr   s           r   dequantize_fp8r   l  s     6D6>>>CTCC>>>3w}%%%%%% qyys""]FB]FBDD!AbR
 
 
 s"")=>>GG)=>>GG%EFF	s"" JBBB JBBDD9
 
 
 s"")EFFGG)EFFGG%STTJr   c                 P   t          j        |            t          j        |            t          j                            |            t           j                                        r@t           j                            |            t           j                            |            dS dS )z
    Set random seed for reproducibility during testing.

    Args:
        random_seed (int): Random seed to set.

    Returns:
        None
    N)r   manual_seedrandomseednpr   is_availablemanual_seed_all)random_seeds    r   set_seedr     s     
k"""
KINN;z   0
{+++
"";/////0 0r   c                     t          j        |           st          j        | dz  dg          }nd}t	          j        |           dS )z
    Sleep after kernel run. Dynamically adjust sleep time up to 1 sec based on execution time.

    Args:
        execution_time (float): Kernel execution time in milliseconds.

    Returns:
        None
       rp   g{Gz?N)r2   isinfr   mintimesleep)execution_time
sleep_times     r   sleep_after_kernel_runr     sK     :n%% V^c13788


Jz
Fr   c                     ||k    rt          d          |r)| d|z  |z
  z  |z  |z  |z  }| d|z  |z
  z  |z  |z  |z  }n"d| z  |z  |z  |z  |z  }d| z  |z  |z  |z  |z  }||z   }	|	S )a  
    Calculate FLOPs for a given attention layer. Assumes all sequence lengths are the same within the batch

    Args:
        batch_size (int): Batch size.
        qo_seqlen (int): Sequence length of the query. Assumed same within the batch.
        kv_seqlen (int): Sequence length of the key and value. Assumed same within the batch.
        head_dim_qk (int): Head dimension of the query and key.
        head_dim_vo (int): Head dimension of the value.
        num_qo_heads (int): Number of query heads.
        causal (bool): Whether to use causal masking. FLOPs is halved for causal masking.

    Returns:
        total_flops (int): Total FLOPs for the layer.
    zFqo_seqlen must be less than or equal to kv_seqlen for causal attentionr-   )
ValueError)

batch_size	qo_seqlen	kv_seqlenhead_dim_qkhead_dim_vonum_qo_headscausal
bmm1_flops
bmm2_flopstotal_flopss
             r   attention_flopsr     s    2 9T
 
 	
  Y9}y(*  	 	 9}y(*  	 	
 ^i/);lJ[X
^i/);lJ[X
z)Kr   c                    |r'| |k                                     rt          d          |rt          j        d|                    t          j                  z  |                     t          j                  z
  |                     t          j                            |z  |z  }t          j        d|                    t          j                  z  |                     t          j                  z
  |                     t          j                            |z  |z  }ndt          j        |                    t          j                  |                     t          j                            z  |z  |z  }dt          j        |                    t          j                  |                     t          j                            z  |z  |z  }||z   }|S )a  
    Calculate FLOPs for a given attention layer with actual sequence lengths where
    actual sequence lengths are provided as 1D tensors.

    Args:
        actual_seq_lens_q (torch.Tensor): Array of actual sequence lengths of the query.
        actual_seq_lens_kv (torch.Tensor): Array of actual sequence lengths of the key and value.
        head_dim_qk (int): Head dimension of the query and key.
        head_dim_vo (int): Head dimension of the value.
        num_qo_heads (int): Number of query heads.
        causal (bool): Whether to use causal masking.
        Note: Causal must be false for decode as this function assumes qo_seqlen == kv_seqlen.

    Returns:
        total_flops (int): Total FLOPs for the layer.
    zWactual_seq_lens_q must be less than or equal to actual_seq_lens_kv for causal attentionr-   )anyr   r   dotru   r   )	actual_seq_lens_qactual_seq_lens_kvr   r   r   r   r   r   r   s	            r   $attention_flops_with_actual_seq_lensr     s   4  
$'99>>@@ 
e
 
 	
  &
I&))%-888#&&u}556!$$U]33 
   	 I&))%-888#&&u}556!$$U]33 
   	
 i"%%em44!$$U]33 
   	 i"%%em44!$$U]33 
   	 z)Kr   c           	      h    t          | ||||||          }t          j        |          s||z  dz  ndS )a  
    Calculate TFLOPS per second for a given attention layer. Assumes all sequence lengths are the same within the batch.

    Args:
        batch_size (int): Batch size.
        qo_seqlen (int): Sequence length of the query.
        kv_seqlen (int): Sequence length of the key and value.
        head_dim_qk (int): Head dimension of the query and key.
        head_dim_vo (int): Head dimension of the value.
        num_qo_heads (int): Number of query heads.
        causal (bool): Whether to use causal masking.
        time (float): Execution time in milliseconds.

    Returns:
        tflops_per_sec (float): TFLOPS per second for the layer.
        eA        )r   r2   isnan)	r   r   r   r   r   r   r   r   fs	            r   attention_tflops_per_secr   G  sK    4 		 	A "&D!1!1:1t8c>>s:r   c                     t          | |||||          }t          j        |          s|                                |z  dz  ndS )a  
    Calculate TFLOPS per second for a given attention layer with actual sequence lengths.
    Does not assume all sequence lengths are the same within the batch.

    Args:
        actual_seq_lens_q (torch.Tensor): Array of actual sequence lengths of the query.
        actual_seq_lens_kv (torch.Tensor): Array of actual sequence lengths of the key and value.
        head_dim_qk (int): Head dimension of the query and key.
        head_dim_vo (int): Head dimension of the value.
        num_qo_heads (int): Number of query heads.
        causal (bool): Whether to use causal masking.
        ms (float): Execution time in milliseconds.

    Returns:
        tflops_per_sec (float): TFLOPS per second for the layer.
    r   r   )r   r2   r   r+   )r   r   r   r   r   r   msr   s           r   -attention_tflops_per_sec_with_actual_seq_lensr   m  sP    2 	-	 	A '+jnn=16688b=3#=r   c                     | |z  |z  |z  |j         z  }| |z  |z  |z  |	j         z  }| |z  |z  |z  |	j         z  }| |z  |z  |z  |
j         z  }||z   |z   |z   }|dz  }|dz  }t          j        |          s||z  ndS )aA  
    Calculate TB per second perf achieved for a given attention layer. Assumes all sequence lengths are the same within the batch.

    Args:
        batch_size (int): Batch size.
        qo_seqlen (int): Sequence length of the query.
        kv_seqlen (int): Sequence length of the key and value.
        head_dim_qk (int): Head dimension of the query and key.
        head_dim_vo (int): Head dimension of the value.
        num_qo_heads (int): Number of query heads.
        num_kv_heads (int): Number of key and value heads.
        time (float): Execution time in milliseconds.
        q_dtype (torch.dtype): Data type of the query.
        kv_dtype (torch.dtype): Data type of the key and value.
        o_dtype (torch.dtype): Data type of the output.

    Returns:
        tb_per_sec (float): TB per second for the layer.
         @@   mBr   )itemsizer2   r   )r   r   r   r   r   r   num_kv_headsr   q_dtypekv_dtypeo_dtypeq_bytesk_bytesv_byteso_bytesr6   time_in_secbytes_in_tbs                     r   attention_tb_per_secr     s    @ 9$|3kAGDTTG9$|3kAHDUUG9$|3kAHDUUG9$|3kAGDTTGG#g-7K*K$K,0Jt,<,<E;$$#Er   c
                    t          j        |           |z  |z  |j        z  }
t          j        |          |z  |z  |j        z  }t          j        |          |z  |z  |j        z  }t          j        |           |z  |z  |	j        z  }|
|z   |z   |z                                   }|dz  }|dz  }t	          j        |          s||z  ndS )a  
    Calculate TB per second perf achieved for a given attention layer with actual sequence lengths.
    Does not assume all sequence lengths are the same within the batch.

    Args:
        actual_seq_lens_q (torch.Tensor): Array of actual sequence lengths of the query.
        actual_seq_lens_kv (torch.Tensor): Array of actual sequence lengths of the key and value.
        head_dim_qk (int): Head dimension of the query and key.
        head_dim_vo (int): Head dimension of the value.
        num_qo_heads (int): Number of query heads.
        num_kv_heads (int): Number of key and value heads.
        time (float): Execution time in milliseconds.
        q_dtype (torch.dtype): Data type of the query.
        kv_dtype (torch.dtype): Data type of the key and value.
        o_dtype (torch.dtype): Data type of the output.

    Returns:
        tb_per_sec (float): TB per second for the layer.
    r   r   r   )r   sumr   r+   r2   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r6   r   r   s                    r   )attention_tb_per_sec_with_actual_seq_lensr     s    @ 		#$$|3kAGDTT  		$%%4{BXEVV  		$%%4{BXEVV  		#$$|3kAGDTT  W$w.8>>@@K*K$K,0Jt,<,<E;$$#Er      d   Fr>   Tdry_run_itersrepeat_itersdry_run_time_msrepeat_time_msl2_flushl2_flush_size_mbl2_flush_devicesleep_after_runcold_l2_cachec                    	
 
i 
t          d |||fD                       r/t          j        dt          d           ||nd}||nd}||nd}n*|}t	          	
d          }t          |          }|dz  d	z  }t          	          pt          
           	
fd
}t          j        	                    d          }t          j        	                    d          }d}|r6t          |          dz  dz  }t          j        ||t          j                  }d}t          j                                          |             t          j                                         |                                 t          |          D ]"}|r|                                  |             #|                                 t          j                                         |                    |          |z  }| t%          dt          ||z                      }| t%          dt          ||z                      }t          j                                         t          |          D ]"}|r|                                  |             #t          j                                         d t          |          D             }d t          |          D             }t          j                                         t          |          D ]g}|r|                                 ||                                           |             ||                                          |rt'          |           ht          j                                         g }t          |          D ]6}|                    ||                             ||                              7|S )a	  
    Benchmark kernel execution time using CUDA events (no CUDA graphs).

    This is the simplest benchmarking method. Best suited for kernels where launch overhead
    is negligible compared to execution time.

    The function performs:
    1. A quick estimation phase (5 iterations) to determine iteration counts
    2. Dry-run warmup iterations (not measured)
    3. Measured iterations with per-iteration timing via CUDA events

    Iteration counts can be specified directly or derived from target durations:
    - If dry_run_iters/repeat_iters are provided, those counts are used directly.
    - Otherwise, counts are computed from dry_run_time_ms/repeat_time_ms.

    Args:
        fn (Callable): The kernel function to benchmark.
        dry_run_iters (int, optional): Number of warmup iterations (not timed).
            If None, computed from dry_run_time_ms.
        repeat_iters (int, optional): Number of measured iterations.
            If None, computed from repeat_time_ms.
        dry_run_time_ms (int): Target warmup duration in ms (default: 25).
        repeat_time_ms (int): Target measurement duration in ms (default: 100).
        sleep_after_run (bool): If True, sleep briefly after each iteration to
            reduce thermal throttling (default: False).
        input_args (tuple): Positional arguments to pass to fn.
        input_kwargs (dict, optional): Keyword arguments to pass to fn.
        cold_l2_cache (bool): If True, flush L2 cache before each iteration to
            ensure cold-cache performance measurements (default: True).

    Returns:
        List[float]: Per-iteration execution times in milliseconds.

    Example:
        Basic usage:

        >>> def my_kernel(a, b):
        ...     return torch.matmul(a, b.T)
        >>> q = torch.randn(1024, 128, device="cuda")
        >>> k = torch.randn(1024, 128, device="cuda")
        >>> times = bench_gpu_time_with_cuda_event(
        ...     fn=my_kernel,
        ...     input_args=(q, k),
        ... )
        >>> print(f"Median time: {np.median(times):.3f} ms")

    Note:
        This method does NOT use CUDA graphs, so each iteration incurs kernel
        launch overhead. For microbenchmarking where launch latency matters,
        consider using ``bench_gpu_time_with_cudagraph`` instead.

    .. deprecated::
        The ``l2_flush``, ``l2_flush_size_mb``, and ``l2_flush_device`` parameters
        are deprecated. Use ``cold_l2_cache`` instead.
    Nc              3      K   | ]}|d uV  	d S rE   r>   rB   ps     r   rF   z1bench_gpu_time_with_cuda_event.<locals>.<genexpr>8  &      
P
PQ1D=
P
P
P
P
P
Pr   Zl2_flush, l2_flush_size_mb, and l2_flush_device are deprecated. Use cold_l2_cache instead.r-   category
stacklevelT   r      c                  4    r
  i  d S                d S rE   r>   fnhas_argsrS   rT   s   r   call_fnz/bench_gpu_time_with_cuda_event.<locals>.call_fnM  5     	B
+l+++++BDDDDDr   enable_timing   r   r1   r0   c                 N    g | ]"}t           j                            d           #S Tr  r   r   EventrB   rX   s     r   rC   z2bench_gpu_time_with_cuda_event.<locals>.<listcomp>y  +    VVVQEJ$$4$88VVVr   c                 N    g | ]"}t           j                            d           #S r
  r  r  s     r   rC   z2bench_gpu_time_with_cuda_event.<locals>.<listcomp>z  +    TTT1%*"""66TTTr   )r   warningswarnDeprecationWarningr_   r   boolr   r   r  intemptyint8synchronizerecordrV   zero_elapsed_timer4   r   r#   )r  r   r   r   r   r   r   r   r   rS   rT   r   _do_l2_flush_l2_flush_size_mb_l2_flush_devicer5   r  start_event	end_eventbufferl2_flush_sizemeasurement_itersrX   estimated_kernel_execution_timestart_events
end_eventsiter_idxmeasured_timesr  s   `        ``                 @r   bench_gpu_time_with_cuda_eventr)    s)   J  
P
P8-="O
P
P
PPP ;)'		
 	
 	
 	
 $,#7xxT0@0L,,RU.=.I??v$5j,PVWW#$455$q[k: J54#5#5H        *"""66K
  t 44IF W-..5<]3C5:VVV 	JGIII	J$%%   	LLNNN					J  ++.?? $
 As?5T#TUUVV1c.3R"RSSTT 
J=!!   	LLNNN					J WV%BUBUVVVLTTl@S@STTTJ	J,'' D D 	LLNNNX%%'''			8##%%% 	D"#BCCC 
JN,'' Y Yl84AA*XBVWWXXXXr   use_cuda_graphc                    
789:;<=>? i t          d |||fD                       r/t          j        dt          d           ||nd}||nd}||nd}n*|}t	          
d          }t          |          }|dz  d	z  }	 d
dlm8 d
dlm}  |d          }t          |
                    d          d
                   dk     rt          d          d
dlm} n# t          t          f$ r}t          |t                    rt          j        dt           d           nt          j        | dt           d           |	rt#           |||||
|	  	        cY d}~S t%           |||||
|	  	        cY d}~S d}~ww xY wd }8fd>8fd:8fd;8fd<:;<>fd7dt&          t(          t*          t*          t          t          t          f                  dt&          t(          t,          t*          t*          t          t          t          t          t          f                  dt&          f78fd}t/          
          pt/                    = =
fd}d}|r6t          |          d z  d z  }t1          j        ||t0          j        !          }|}d}|	rXt0          j                                         t0          j                                        }|                    t0          j                                                   t0          j                             |          5  tC          d"          D ]} |             	 ddd           n# 1 swxY w Y   t0          j                                                            |           t0          j        "                                }t0          j        #                    |          5   |             ddd           n# 1 swxY w Y   |j$        }d#}t0          j                                          |             t0          j                                         t0          j        %                    d$          }t0          j        %                    d$          } |&                                 tC          |          D ]"}|r|'                                  |             #| &                                 t0          j                                         |(                    |           |z  }!| tS          d%t          ||!z                      }| tS          d%t          ||!z                      }t0          j                                         tC          |          D ]"}|r|'                                  |             #t0          j                                         g }"g }#g }$8*                    8j+        j,                   8*                    8j+        j-                   8*                    8j+        j.                   8*                    8j+        j/                   8*                    8j+        j0                   81                    | |||"|#                     tC          |          D ]}|r|'                                 82                                }% |             82                                }&t0          j                                         |$3                    |%|&f           |rti          |!           85                    d
           86                    8j+        j,                   86                    8j+        j-                   86                    8j+        j.                   86                    8j+        j/                   86                    8j+        j0                   87                                 d& 9d
dl8}'ts          |"d' (          ?d) ?D             }(i })|#D ].}*|*d"         }+|+|)vrg |)|+<   |)|+         3                    |*           /g },d}-tu          |$          D ]"\  }.\  }%}&|';                    |(|%          }/|'<                    |(|&          }0t{          ?fd*tC          |/|0          D                       }1g }2|1D ]!}+|+|)v r|2>                    |)|+                    "|2st          d+|.           t{          9fd,|2D                       }3|-|3}-n|-|3k    rt          d-|- d.|3           t          d/ |2D                       }4tS          d0 |2D                       }5|5|4z
  d1z  }6|,3                    |6           $|,S )2am  
    Benchmark GPU time using CUPTI activity tracing for precise kernel timing.

    CUPTI (CUDA Profiling Tools Interface) provides hardware-level profiling that
    measures actual GPU kernel execution time, excluding CPU-side launch overhead.
    This gives the most accurate kernel performance measurements.

    Cold L2 cache is achieved via L2 flush between iterations. CUPTI measures
    per-iteration, so L2 flush works correctly regardless of ``use_cuda_graph``.

    Behavior:
    - Uses CUPTI (requires version >= 13, i.e., CUDA 13+) to trace kernel activities
      and compute per-iteration GPU time from recorded start/end timestamps.
    - Optionally captures operations in a CUDA graph (use_cuda_graph=True) for
      reduced launch overhead during measurement.
    - If CUPTI is unavailable, falls back to:
      - ``bench_gpu_time_with_cudagraph`` if use_cuda_graph=True (uses rotating buffers
        for cold L2)
      - ``bench_gpu_time_with_cuda_event`` otherwise (uses L2 flush for cold L2)

    Args:
        fn (Callable): The kernel function to benchmark.
        dry_run_iters (int, optional): Number of warmup iterations (not timed).
            If None, computed from dry_run_time_ms.
        repeat_iters (int, optional): Number of measured iterations.
            If None, computed from repeat_time_ms.
        dry_run_time_ms (int): Target warmup duration in ms (default: 25).
        repeat_time_ms (int): Target measurement duration in ms (default: 100).
        sleep_after_run (bool): If True, sleep briefly after each iteration (default: False).
        use_cuda_graph (bool): If True, capture and replay a CUDA graph (default: False).
        input_args (tuple): Positional arguments to pass to fn.
        input_kwargs (dict, optional): Keyword arguments to pass to fn.
        cold_l2_cache (bool): If True, flush L2 cache before each iteration to
            ensure cold-cache performance measurements (default: True).

    Returns:
        List[float]: Per-iteration GPU kernel execution times in milliseconds.

    Example:
        Basic CUPTI benchmarking (requires cupti-python >= 13):

        >>> def my_kernel(a, b):
        ...     return torch.matmul(a, b.T)
        >>> q = torch.randn(1024, 128, device="cuda")
        >>> k = torch.randn(1024, 128, device="cuda")
        >>> times = bench_gpu_time_with_cupti(
        ...     fn=my_kernel,
        ...     input_args=(q, k),
        ... )
        >>> print(f"Median GPU time: {np.median(times):.3f} ms")

    Note:
        Requires ``cupti-python`` package version >= 13.0.0:
        ``pip install -U cupti-python``

        If CUPTI is not available, a warning is issued and the function
        automatically falls back to CUDA event or CUDA graph timing.

    .. deprecated::
        The ``l2_flush``, ``l2_flush_size_mb``, and ``l2_flush_device`` parameters
        are deprecated. Use ``cold_l2_cache`` instead.
    Nc              3      K   | ]}|d uV  	d S rE   r>   r   s     r   rF   z,bench_gpu_time_with_cupti.<locals>.<genexpr>  r   r   r   r-   r   Tr   r   r   r   )cupti)versionzcupti-python.   z?CUPTI needs to be >= 13.0.0. Try 'pip install -U cupti-python'.)partialzhCUPTI is not installed. Try 'pip install -U cupti-python'. Falling back to CUDA events for benchmarking.z. Falling back to CUDA events for benchmarking.	r  r   r   r   r   r   rS   rT   r   c                      d} d}| |fS )Ni   r   r>   )buffer_sizemax_num_recordss     r   func_buffer_requestedz8bench_gpu_time_with_cupti.<locals>.func_buffer_requested&  s    %O++r   c                     | j         j        j        k    r| j        S | j         j        j        k    rdS | j         j        j        k    rdS d S )NMEMCPYMEMSET)kindActivityKindCONCURRENT_KERNELnamer8  r9  activityr-  s    r   set_kernel_namez2bench_gpu_time_with_cupti.<locals>.set_kernel_name+  sV    =E.@@@= ]e07778]e07778 87r   c                 T    | j         j        j        j        j        fv r| j        S dS Nr   )r:  r;  r8  r9  bytesr>  s    r   	get_bytesz,bench_gpu_time_with_cupti.<locals>.get_bytes3  s.    =U/68J8QRRR>!1r   c                 @    | j         j        j        k    r| j        S dS rB  )r:  r;  r8  	copy_kindr>  s    r   get_copy_kindz0bench_gpu_time_with_cupti.<locals>.get_copy_kind9  s$    =E.555%%1r   c                 @    | j         j        j        k    r| j        S dS rB  )r:  r;  r9  valuer>  s    r   	get_valuez,bench_gpu_time_with_cupti.<locals>.get_value?  s#    =E.555>!1r   c           	           |           | j         | j        | j         |            |            |           | j        fS rE   )startendcorrelation_idr:  )r?  rD  rG  rJ  r@  s    r   collect_kernel_infoz6bench_gpu_time_with_cupti.<locals>.collect_kernel_infoE  sV    OH%%NL#M(##IhIhM	
 		
r   launcheskernels
activitiesc                 H   |D ]}|j         j        j        j        j        j        j        fv r|                     |                     K|j         j        j        j        j        fv r3|                     |j        |j	        |j
        |j        |j         f           d S rE   )r:  r;  r<  r8  r9  r#   RUNTIMEDRIVERrL  rM  rN  cbid)rP  rQ  rR  r?  rO  r-  s       r   func_buffer_completedz8bench_gpu_time_with_cupti.<locals>.func_buffer_completedQ  s    
 # 	 	H}"4")")!   228<<===="*")#  
    /    	 	r   c                  4    r
  i  d S                d S rE   r>   r  s   r   r  z*bench_gpu_time_with_cupti.<locals>.call_fnp  r  r   r  r   r~   r1   r  r0   c           
      ^    | d          d| d          d| d          d| d          d| d          	S )Nr   rX      r1         r>   )kernels    r   generate_kernel_stringz9bench_gpu_time_with_cupti.<locals>.generate_kernel_string  sB    )MMfQiMM&)MMfQiMM&)MMMr   c                     | d         S rB  r>   )ls    r   <lambda>z+bench_gpu_time_with_cupti.<locals>.<lambda>  s
    QqT r   )keyc                     g | ]
}|d          S )r   r>   )rB   r`  s     r   rC   z-bench_gpu_time_with_cupti.<locals>.<listcomp>  s    333aQqT333r   c              3   4   K   | ]}|         d          V  dS r-   Nr>   )rB   isorted_launchess     r   rF   z,bench_gpu_time_with_cupti.<locals>.<genexpr>  s-      QQq)!,QQQQQQr   z,No kernel activities recorded for iteration c              3   .   K   | ]} |          V  d S rE   r>   )rB   rH   r^  s     r   rF   z,bench_gpu_time_with_cupti.<locals>.<genexpr>  s/      "S"S#9#9!#<#<"S"S"S"S"S"Sr   zInconsistent kernel names: z != c              3   &   K   | ]}|d          V  dS )r0   Nr>   rB   rH   s     r   rF   z,bench_gpu_time_with_cupti.<locals>.<genexpr>  s&      33!333333r   c              3   &   K   | ]}|d          V  dS re  r>   rj  s     r   rF   z,bench_gpu_time_with_cupti.<locals>.<genexpr>  s&      11qad111111r       .A)Ar   r  r  r  r_   r   r-  importlib.metadatar.  r  split	Exception	functoolsr1  ModuleNotFoundErrorr   UserWarningbench_gpu_time_with_cudagraphr)  r$   r%   rr   strr  r   r  r  r   r  Streamwait_streamcurrent_streamstreamrV   	CUDAGraphgraphreplayr  r  r  r  r4   activity_enabler;  rT  r<  rU  r8  r9  activity_register_callbacksget_timestampr#   r   activity_flush_allactivity_disablefinalizebisectsorted	enumeratebisect_leftbisect_rightsetr&   r   r   )@r  r   r   r   r   r   r   r   r   r*  rS   rT   r   r  r  r  r5   importlib_metadata_versioncupti_versionr1  er6  rW  r  r!  r"  runnergsrX   r#  r  r   r$  rP  rQ  iter_timestamps	start_cpuend_cpur  launch_startscorr_id_to_kernelsrH   corr_idr(  kernel_namesidxleft_idx	right_idxcorr_idsiter_kernelscurrent_kernel_names	min_startmax_endspan_msrO  r-  r^  rD  rG  rJ  r  r@  rg  s@   `         ``                                           @@@@@@@@@r   bench_gpu_time_with_cuptir    s   Z  
P
P8-="O
P
P
PPP ;)'		
 	
 	
 	
 $,#7xxT0@0L,,RU.=.I??v$5j,PVWW#$455$q[k:1LLLLLL22>BB}""3''*++b00Q   	&%%%%%%+ ' ' 'a,-- 	Mz$     MDDD$     	0+) /- /%)+
 
 
 
 
 
 
 
 
 2+) /- /%)+
 
 
 
 
 
 
 
 
;'T, , ,
                

 

 

 

 

 

 

 

uUE3S89:eCsCc3FGH       : J54#5#5H        F W-..5<]3C5:VVV FA 
   J	ej//11222Zq!! 	 	1XX  					 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
!!##//222 J  ""Za   	 	GIII	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	JGIII	J*"""66K
  t 44I$%%   	LLNNN	J  ++.?? $
 As?5T#TUUVV1c.3R"RSSTT 
J=!!   	LLNNN	J :<HGIGO	%,4555	%,>???	%,3444	%,3444	%,3444	%%ww'<hPP   <   	D 	D 	LLNNN''))	%%''
   	73444 	D"#BCCC	Q	5-5666	5-?@@@	5-4555	5-4555	5-4555	NNN N N
 MMM X>>:::O33?333M
 	   . .A$,,,*,w'7#**1----NL%.%?%? ' '!!i%%mY??''w??	 QQQQeHi6P6PQQQQQ  	A 	AG,,,##$6w$?@@@ 	SQCQQRRR""S"S"S"Sl"S"S"SSS/LL333 Z,ZZDXZZ   33l33333	11L11111Y&#-g&&&&sP   AC   F1A*E>F!E>8F>F$MMMOO"%O"
   num_iters_within_graphc                 
   
  i t          d |||fD                       r#t          j        dt          d           ||nd}n|}t	          
d          }t          
          pt                    t          j                            d          }t          j                            d          }d	}d |r{t          
          t                    z   }t          |          d
k    rt          j        dt          d           d}n*t          ||          }|d	k    rt          
|           nd} 
fd}dt          f  fd}t          j                                         t          j                                        }|                    t          j                                                   t          j                            |          5  t)          d          D ]} |             	 ddd           n# 1 swxY w Y   t          j                                                            |           t          j                                        }t          j                            |          5  |r)|d	k    r#t)          |          D ]}||z  } ||           nt)          |          D ]} |             ddd           n# 1 swxY w Y   t          j                                         d}|                                 t)          |          D ]}|                                 |                                 t          j                                         |                    |          |z  }| t5          d	t          ||z                      }| t5          d	t          ||z                      }t          j                                         t)          |          D ]}|                                 t          j                                         d t)          |          D             }d t)          |          D             }t          j                                         t)          |          D ][}||                                          |                                 ||                                          |	rt7          |           \t          j                                         g }t)          |          D ]9}|                    ||                             ||                   |z             :|S )a  
    Benchmark GPU time using CUDA graphs with amortized kernel launch overhead.

    CUDA graphs capture a sequence of GPU operations and replay them with minimal
    CPU overhead. By running multiple iterations within a single graph, kernel
    launch latency is amortized, yielding measurements closer to pure GPU time.

    **Cold-L2 Benchmarking**:

    When ``cold_l2_cache=True``, the function uses **rotating buffers** to ensure
    cold L2 cache for each kernel invocation within the graph. Multiple copies of
    the GPU tensors in ``input_args``/``input_kwargs`` are created and rotated
    through during graph capture, ensuring each kernel invocation operates on
    different memory regions. The number of buffer copies is automatically
    calculated based on the device's L2 cache size.

    Args:
        fn (Callable): The kernel function to benchmark.
        dry_run_iters (int, optional): Number of warmup iterations (not timed).
            If None, computed from dry_run_time_ms.
        repeat_iters (int, optional): Number of measured iterations (graph replays).
            If None, computed from repeat_time_ms.
        dry_run_time_ms (int): Target warmup duration in ms (default: 25).
        repeat_time_ms (int): Target measurement duration in ms (default: 100).
        num_iters_within_graph (int): Number of kernel calls captured in the graph
            (default: 10). Higher values better amortize launch overhead but use
            more memory when rotating buffers.
        sleep_after_run (bool): If True, sleep briefly after each iteration (default: False).
        input_args (tuple): Positional arguments to pass to fn. GPU tensors in
            this structure will be cloned when ``cold_l2_cache=True``.
        input_kwargs (dict, optional): Keyword arguments to pass to fn. GPU tensors
            in this structure will be cloned when ``cold_l2_cache=True``.
        cold_l2_cache (bool): If True, use rotating buffers to ensure cold L2 cache
            for each kernel invocation within the graph (default: True).

    Returns:
        List[float]: Per-iteration execution times in milliseconds. Each time is
        the graph replay duration divided by ``num_iters_within_graph``.

    Example:
        Cold-L2 benchmarking (default, for memory-bound kernels):

        >>> def run_attention(q, k, v, o):
        ...     flashinfer.single_prefill_with_kv_cache(q, k, v, o)
        ...
        >>> q = torch.randn(batch, heads, seq_len, head_dim, device="cuda")
        >>> k = torch.randn(batch, heads, seq_len, head_dim, device="cuda")
        >>> v = torch.randn(batch, heads, seq_len, head_dim, device="cuda")
        >>> o = torch.empty_like(q)
        >>> times = bench_gpu_time_with_cudagraph(
        ...     fn=run_attention,
        ...     input_args=(q, k, v, o),
        ... )
        >>> print(f"Cold-L2 median time: {np.median(times):.3f} ms")

    Example:
        Hot L2 benchmarking (for compute-bound kernels):

        >>> times = bench_gpu_time_with_cudagraph(
        ...     fn=lambda: torch.matmul(q, k.T),
        ...     cold_l2_cache=False,
        ... )

    Note:
        - When using ``input_args``/``input_kwargs``, the function must accept the
          tensors as arguments (not capture them from closure).
        - GPU tensors are automatically detected and cloned. Non-tensor arguments
          (scalars, booleans, etc.) are preserved across all copies.
        - Memory usage scales with the number of rotations needed to exceed L2 cache.

    See Also:
        - ``calculate_rotation_count``: Computes required buffer copies for cold-L2.

    .. deprecated::
        The ``l2_flush``, ``l2_flush_size_mb``, and ``l2_flush_device`` parameters
        are deprecated. Use ``cold_l2_cache`` instead.
    Nc              3      K   | ]}|d uV  	d S rE   r>   r   s     r   rF   z0bench_gpu_time_with_cudagraph.<locals>.<genexpr>c  r   r   zl2_flush, l2_flush_size_mb, and l2_flush_device are deprecated. Use cold_l2_cache instead. For CUDA graphs, cold_l2_cache uses rotating buffers (not L2 flush) to ensure cold cache.r-   r   Tr   r  r0   r   zfcold_l2_cache=True but no GPU tensors found in input_args/input_kwargs. Cold L2 benchmarking disabled.Fc                  4    r
  i  d S                d S rE   r>   r  s   r   r  z.bench_gpu_time_with_cudagraph.<locals>.call_fn  r  r   buf_idxc                 .    |          \  }} |i | d S rE   r>   )r  argskwargsr  rotated_copiess      r   call_fn_with_rotationz<bench_gpu_time_with_cudagraph.<locals>.call_fn_with_rotation  s,    %g.f
DFr   r~   r1   c                 N    g | ]"}t           j                            d           #S r
  r  r  s     r   rC   z1bench_gpu_time_with_cudagraph.<locals>.<listcomp>  r  r   c                 N    g | ]"}t           j                            d           #S r
  r  r  s     r   rC   z1bench_gpu_time_with_cudagraph.<locals>.<listcomp>  r  r   )r   r  r  r  r_   r  r   r   r  r'   r   rr  r9   r[   r  r  ru  rv  rw  rx  rV   ry  rz  r  r{  r  r4   r   r#   )!r  r   r   r   r   r  r   r   r   r   rS   rT   r   
_do_rotate_devicer  r   r8   r^   r  r  r  rX   r  r'  r  r#  r$  r%  r&  r(  r  r  s!   `         ``                   @@r   rs  rs    s6   x  
P
P8-="O
P
P
PPP #D (	
 	
 	
 	
 "*!5XX4

"
 )\6JJG J54#5#5H*"""66K
  t 44I MN #*:669M:
 :
 
 {q  M1$	    JJ4['JJMq  !>m" "
 #
       s       
 
J
AMM%*++--...			1		  q 	 	AGIIII	               
J++A... 	
A			!		 	 	 	-!++!"899 / /"]2%%g..../
 122  					 	 	 	 	 	 	 	 	 	 	 	 	 	 	 
J $%%  	



	J  ++.?? $
 As?5T#TUUVV1c.3R"RSSTT 
J=!!  	



	J WV%BUBUVVVLTTl@S@STTTJ	J,'' D DX%%'''	


8##%%% 	D"#BCCC 
JN,'' 
 
"//
80DEE$%	
 	
 	
 	
 s%   HHH AKKKenable_cuptic                     t          d |||fD                       r#t          j        dt          d           ||n|}n|}|	rt	          | ||||||
|||
  
        S |
rt          | |||||||||
  
        S t          | ||||||||	  	        S )	a  
    Unified GPU benchmarking interface with configurable timing backends.

    This is the recommended entry point for GPU kernel benchmarking. It provides
    a single interface that dispatches to the appropriate timing implementation
    based on the configuration flags.

    **Timing Backends** (in order of precedence):

    1. **CUPTI** (``enable_cupti=True``): Most accurate, measures pure GPU kernel
       time via hardware profiling. Requires cupti-python >= 13.
    2. **CUDA Graphs** (``use_cuda_graph=True``): Amortizes launch overhead by
       capturing and replaying multiple kernel calls. Good balance of accuracy
       and availability.
    3. **CUDA Events** (default): Simplest method, measures launch + execution.
       Available everywhere but includes CPU overhead.

    **Cold-L2 Strategy** (automatically selected based on timing backend):

    .. list-table::
       :header-rows: 1

       * - Timing Backend
         - Cold-L2 Strategy
         - How it Works
       * - CUPTI
         - L2 Flush
         - Flush L2 cache before each iter
       * - CUDA Events (no CUDA Graphs)
         - L2 Flush
         - Flush L2 cache before each iter
       * - CUDA Events + CUDA Graphs
         - Rotating Buffers
         - Clone GPU tensors in input_args/input_kwargs and rotate through them
        use_cuda_graph (bool): If True, use CUDA graph timing (default: False).
        num_iters_within_graph (int): Kernel calls per graph (CUDA graph mode only,
            default: 10).
        input_args (tuple): Positional arguments to pass to fn.
        input_kwargs (dict, optional): Keyword arguments to pass to fn.
        cold_l2_cache (bool): If True, ensure cold L2 cache for each iteration
            (default: True). The strategy is automatically selected based on timing
            backend.

    Returns:
        List[float]: Per-iteration execution times in milliseconds.

    Example:
        Simple benchmarking with CUDA events (default):

        >>> times = bench_gpu_time(fn=lambda: my_kernel())
        >>> print(f"Median: {np.median(times):.3f} ms")

    Example:
        CUDA graph benchmarking for reduced launch overhead:

        >>> def run_kernel(x, y, out):
        ...     my_memory_bound_kernel(x, y, out)
        >>> times = bench_gpu_time(
        ...     fn=run_kernel,
        ...     input_args=(x, y, out),
        ...     use_cuda_graph=True,
        ... )

    Example:
        CUPTI benchmarking for most accurate GPU kernel time:

        >>> times = bench_gpu_time(
        ...     fn=run_kernel,
        ...     input_args=(x, y, out),
        ...     enable_cupti=True,
        ... )

    See Also:
        - ``bench_gpu_time_with_cuda_event``: Direct CUDA event timing.
        - ``bench_gpu_time_with_cudagraph``: Direct CUDA graph timing.
        - ``bench_gpu_time_with_cupti``: Direct CUPTI timing.

    .. deprecated::
        The ``l2_flush``, ``l2_flush_size_mb``, and ``l2_flush_device``
        parameters are deprecated. Use ``cold_l2_cache`` instead.
    c              3      K   | ]}|d uV  	d S rE   r>   r   s     r   rF   z!bench_gpu_time.<locals>.<genexpr>G  r   r   r   r-   r   N)
r  r   r   r   r   r   r*  rS   rT   r   )
r  r   r   r   r   r  r   rS   rT   r   r2  )r   r  r  r  r  rs  r)  )r  r   r   r   r   r   r   r   r   r  r*  r  rS   rT   r   _cold_l2_caches                   r   bench_gpu_timer    s   F 
P
P8-="O
P
P
PPP 
')'		
 	
 	
 	
 &.%9}& 
('%+)+)!%(
 
 
 	
  
,'%+)#9+!%(
 
 
 	
 *#!'%'!$
 
 
 
r   c                       e Zd Zd Zd ZdS )empty_suppressc                     | S rE   r>   selfs    r   	__enter__zempty_suppress.__enter__{  s    r   c                     d S rE   r>   r  rX   s     r   __exit__zempty_suppress.__exit__~  s    r   N__name__
__module____qualname__r  r  r>   r   r   r  r  z  s2              r   r  c                       e Zd Zd Zd ZdS )suppress_stdout_stderrc                 .   t          t          j        d          | _        t          t          j        d          | _        t
          j                                        | _        t
          j	                                        | _
        t          j        t
          j                                                  | _        t          j        t
          j	                                                  | _        t
          j        | _        t
          j	        | _        t          j        | j                                        | j                   t          j        | j                                        | j
                   | j        t
          _        | j        t
          _	        | S )Nw)openosdevnulloutnull_fileerrnull_filesysstdoutfilenoold_stdout_fileno_undupstderrold_stderr_fileno_undupdupold_stdout_filenoold_stderr_fileno
old_stdout
old_stderrdup2r  s    r   r  z suppress_stdout_stderr.__enter__  s	    S11 S11'*z'8'8':':$'*z'8'8':':$!#
(9(9(;(;!<!<!#
(9(9(;(;!<!<**
!((**D,HIII
!((**D,HIII&
&
r   c                    | j         t          _        | j        t          _        t          j        | j        | j                   t          j        | j	        | j
                   t          j        | j                   t          j        | j	                   | j                                         | j                                         d S rE   )r  r  r  r  r  r  r  r  r  r  r  closer  r  r  s     r   r  zsuppress_stdout_stderr.__exit__  s    _
_

&(DEEE
&(DEEE
'(((
'(((!!!!!!!!r   Nr  r>   r   r   r  r    s2          (" " " " "r   r     	num_testssuppress_kineto_output
trace_pathflush_l2with_multiple_kernelsc           
         t          t          j                            dd                    }t          d          } |              |r	|st          nt
          }	 |	            5  |s#t          j                            dddd          nd }
|s6t          j        	                    t          j        j
        j        g|
          nt                      }|5  t          d          D ]i}t          |          D ]A}|r3t          j        |t          j         d	                                            |              B|s|                                 j	 d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |rdS t!          |t"          t$          f          sJ t!          |t$                    }|                                                    d
d                              d          }t!          |t"                    r|fn|}t-          d |D                       sJ |s2|D ]/t/          fd|D                       dk    sJ d d            0||                    |           ddd}g }|D ]Ɗd}d}|D ]}|v r|                                d         }|                                d         }|                                D ]V\  }}||v rM|t5          |                    |d                    |z  t          |          z  z  }|t          |          z  } nW|                    ||z             |rt%          |          n|d         S )NDG_NSYS_PROFILINGr   g    eAr0   )waitwarmupactiver	   )rR  scheduler-   r   r;   cuda_time_totalr   )sort_bymax_name_column_width
c                 8    g | ]}t          |t                    S r>   )r   rt  )rB   r=  s     r   rC   z bench_kineto.<locals>.<listcomp>  s"    ???$
4%%???r   c                     g | ]}|v S r>   r>   )rB   liner=  s     r   rC   z bench_kineto.<locals>.<listcomp>  s    <<<<<<r   zErrors of the kernel z in the profiling tabler   rl  )r   usr   rc    )r  r  environgetr  r  r   profilerr  profileProfilerActivityCUDArV   r  r  stepr   rt  r%   key_averagestablern  allr   export_chrome_tracerQ   rr   replacer#   )r  r  r  r  r  r  r  
using_nsysflush_l2_sizesuppressr  r  _irX   is_tuple
prof_linesunitskernel_times
total_time	total_numr  time_strnum_strunitscaler=  s                            @r   bench_kinetor    sY    RZ^^$7;;<<J MMM BDDD
 "	*4	 
 
 $ $ EN##1Qq#III 	 "EN""!N;@AH #     !! 	  
	$ 
	$Ahh 	$ 	$y))  A ")6  %'''BDDDD! $MMOOO	$
	$ 
	$ 
	$ 
	$ 
	$ 
	$ 
	$ 
	$ 
	$ 
	$ 
	$ 
	$ 
	$ 
	$ 
	$$ $ $ $ $ $ $ $ $ $ $ $ $ $ $4  q lS%L11111,..H	(	D	D	t 
 '1s&C&CUL??L??,???@@@@@    	 	D<<<<<<<==BBBEEEE CBBB
 $$Z000 c""EL 4 4
	 
	 
	Dt||::<<+**,,r*#(;;==  KD%x''"!("2"24"<"<==EGT
 "S\\1	 ( 	J23333"*?5Q?s8   $A0E2A:EE2E	E2"E	#E22E69E6c                      d}| D ]Y}t          |t          t          f          r|t          | z  }+|,||                                |                                z  z  }Z|S rB  )r   r%   r$   count_bytesr   r   r   s      r   r	  r	    sh    E 2 2a%'' 	2[!_$EE]QWWYY!1!111ELr   rE   )Nr-   )r   )NNr   r   NNNFr>   NT)NNr   r   NNNFFr>   NT)NNr   r   r  NNNFr>   NT)NNr   r   NNNFFFr  r>   NT)r  FNTF)7__doc__r2   r   r   typingr   r   r   r   r  r  r  numpyr   r   einopsr   r   r	   flashinfer.utilsr
   r  r   r   r!   r'   r9   r@   r(   r[   r_   rj   r|   r   r   r   r   r   r   r   r   r   bfloat16r   r   r  rt  r)  r  rs  r  r  r  r  r	  r>   r   r   <module>r     sf        - - - - - - - - - - - - 				 



       , , , , , , , , , , % % % % % % c     T%,%7 C    $el!3    0 DE'- '-%,'-=@'-'- '- '- '-T( ( (V%):=	%t
   B   (;el ; ; ; ;UU\ UeEL%,4N.O U U U UU\ eEL%,4N.O    "_ _ _D4 4 4n0 0 0&  $1 1 1hH H HV#; #; #;L!> !> !>Z N^N(F (F (F (Ff N^N0F 0F 0F 0Fj #&*%)!#'\ \\ \ 	\
 \ tn\ sm\ c]\ \ \ 4.\ \ \ \ \B #&*%)! #'r rr r 	r
 r tnr smr c]r r r r 4.r r r r rn "$#&*%)!#'^ ^^ ^ 	^
 ^  ^ tn^ sm^ c]^ ^ ^ 4.^ ^ ^ ^ ^F #&*%)! "$#'S SS S 	S
 S tnS smS c]S S S S  S S 4.S S S S Sl        "  "  "  "  "  "  "  "N #("'\@ \@ \@ !	\@
 \@ \@  \@ \@ \@ \@~    r   