
    )`iB|                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dl	m
Z
mZ d dlmZ d dlmZmZmZmZmZmZmZmZ d dlZd dlmZ ddlmZ d	Zd
efdZ e
d           G d d                      Z e
dd           G d d                      Z  e
dd           G d d                      Z! e
d           G d d                      Z" e
d           G d d                      Z#ee#e"f         Z$e
 G d d                      Z%e
 G d d                      Z& G d d e          Z'e j(        d)d!efd"            Z)e
 G d# d$                      Z* ed%          d&             Z+ G d' d(          Z,dS )*    N)ABCabstractmethod)	dataclassfield)	lru_cache)AnyCallableDictListSetTupleUnionOptional)delay_kernel   )loggerz0.1	is_modulec                    t           j                            d                              dd          }t                              dd          }d| d| }| rd| S t
          j                            t
          j                            t
          j        	                    t                              d|d	z             S )
Nr    _.v_trtllm_fused_moe_zflashinfer.tuning_configs.tuning_configsz.py)torchcudaget_device_namereplace_nvfp4_cutlass_versionospathjoindirnamerealpath__file__)r   dev_namecutlass_verconfig_names       h/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/autotuner.pyget_config_pathr*      s    z))!,,44S#>>H(00c::K?k??X??K 
9K999w||GOOBG,,X6677%
 
 	
    T)slotsc                       e Zd ZU dZeedf         ed<   eedf         ed<   eeedf         ef         ed<   eed<    e	d           Z
ee         ed	<   d
 ZdefdZdS )DynamicTensorSpecat  
    A specification for a dynamic tensor dimension.
    Args:
        input_idx: A list of the indices of the input tensors.
        dim_idx: A list of the indices of the dimensions to tune.
            The length of input_idx and dim_idx must be the same.
            For every tensor mapped to the input_idx, their dimension mapped to the dim_idx must be the same.
        gen_tuning_buckets: A tuple of values to try or a function generating values.
        map_to_tuning_buckets: A function to map dimensions to valid values during inference.
        tensor_initializers: A list of functions to initialize the tensors.
    .	input_idxdim_idxgen_tuning_bucketsmap_to_tuning_bucketsc                      d S N r5   r+   r)   <lambda>zDynamicTensorSpec.<lambda>:   s     r+   default_factorytensor_initializersc                 x    | j         2d t          t          | j                            D             | _         d S d S )Nc                     g | ]}d  S )c                 `    t          j        | |          dz  dz
                      |          S N)device
      r   randtoshapesdtyper>   s      r)   r6   z<DynamicTensorSpec.__post_init__.<locals>.<listcomp>.<lambda>@   s.    Jvf555:Q>"U)) r+   r5   ).0r   s     r)   
<listcomp>z3DynamicTensorSpec.__post_init__.<locals>.<listcomp>?   s/     ( ( (  ( ( (r+   )r9   rangelenr/   selfs    r)   __post_init__zDynamicTensorSpec.__post_init__<   sN    #+( ( s4>2233	( ( (D$$$ ,+r+   returnc                     t          | j        | j        t          | j        t
                    r| j        nt          | j                  t          | j                  f          S r4   )hashr/   r0   
isinstancer1   tupleidr2   rK   s    r)   __hash__zDynamicTensorSpec.__hash__F   s^     d5u==1''/004-..

 

 
	
r+   N)__name__
__module____qualname____doc__r   int__annotations__r   r	   r   r9   r   rM   rT   r5   r+   r)   r.   r.   (   s         
 
 S#X38_eCHox78888####*/%*M*M*MhMMM  
# 
 
 
 
 
 
r+   r.   )r,   unsafe_hashc                   2    e Zd ZU dZeed<   eed<   eed<   dS )ConstraintSpecz
    A specification for a constraint on a tensor dimension.
    Args:
        input_idx: The index of the input tensor.
        dim_idx: The index of the dimension to constrain.
        infer_shape: A function to infer the shape of the dimension.
    r/   r0   infer_shapeN)rU   rV   rW   rX   rY   rZ   r	   r5   r+   r)   r]   r]   U   s<           NNNLLLr+   r]   )kw_onlyr[   c                   P    e Zd ZU dZdZeedf         ed<   dZee	df         ed<   dS )TuningConfigaY  Configuration for autotuning.

    This class specifies all the tuning configurations for a single tuning process.
    Args:
        dynamic_tensor_specs (Tuple[DynamicTensorSpec]): Specifications for how different tensor dimensions
            should be tuned to optimize performance. Each spec defines:
            - Which input tensor dimension is dynamic
            - How to generate tuning values
            - How to map dimensions to valid values during inference

            Example:
                >>> config = TuningConfig(
                ...     dynamic_tensor_specs=(
                ...         DynamicTensorSpec(
                ...             input_idx=[0],
                ...             dim_idx=[1],
                ...             gen_tuning_buckets=(32, 64, 128),
                ...             map_to_tuning_buckets=lambda x: ((x + 31) // 32) * 32
                ...         ),
                ...     )
                ... )
        constraint_specs (Tuple[ConstraintSpec]): Specifications for constraints on tensor dimensions.
            Each spec defines:
            - Which input tensor dimension is constrained
            - How to infer the shape of the dimension based on other dimensions

            Example:
                >>> config = TuningConfig(
                ...     constraint_specs=(
                ...         ConstraintSpec(
                ...             input_idx=1,
                ...             dim_idx=2,
                ...             infer_shape=lambda shapes: shapes[0][0] * 2
                ...         ),
                ...     )
                ... )
    r5   .dynamic_tensor_specsconstraint_specsN)
rU   rV   rW   rX   rb   r   r.   rZ   rc   r]   r5   r+   r)   ra   ra   d   sV         $ $L ;=% 13 67<<<35eNC/055555r+   ra   )r[   c                        e Zd ZU eed<   d ZdS )	StaticDimvalc                     | j         S r4   )rf   rK   s    r)   _optzStaticDim._opt   	    xr+   N)rU   rV   rW   rY   rZ   rh   r5   r+   r)   re   re      s.         	HHH    r+   re   c                   8    e Zd ZU dZeed<   eed<   eed<   d ZdS )
DynamicDimzRange of one dimensionminoptmaxc                     | j         S r4   )rm   rK   s    r)   rh   zDynamicDim._opt   ri   r+   N)rU   rV   rW   rX   rY   rZ   rh   r5   r+   r)   rk   rk      sF           	HHH	HHH	HHH    r+   rk   c                   d    e Zd ZU dZeee                  ed<   eee                  ed<   d Z	d Z
dS )OptimizationProfilez$Ranges of all tensors, all dimensionrE   r9   c                 *    |                                  S r4   )get_opt_shapesrK   s    r)   get_hash_keyz OptimizationProfile.get_hash_key   s    ""$$$r+   c                     g }| j         D ].}|                    t          d |D                                  /t          |          S )z.Only the opt shapes are considered as hash keyc                 6    g | ]}|                                 S r5   )rh   rG   ds     r)   rH   z6OptimizationProfile.get_opt_shapes.<locals>.<listcomp>   s     $9$9$9!QVVXX$9$9$9r+   )rE   appendrR   )rL   
opt_shapests      r)   rs   z"OptimizationProfile.get_opt_shapes   sV     
 	< 	<Ae$9$9q$9$9$9::;;;;Z   r+   N)rU   rV   rW   rX   r   DimrZ   r   r	   rt   rs   r5   r+   r)   rq   rq      sd         ..cOhx01111% % %! ! ! ! !r+   rq   c                   N    e Zd ZU ej        ed<   ej        ed<   ee         ed<   dS )
FakeTensorrF   r>   shapeN)	rU   rV   rW   r   rF   rZ   r>   r   r|   r5   r+   r)   r~   r~      s>         ;L9r+   r~   c            
           e Zd Zedeej                 dedee         fd            Z	d Z
e	 	 ddeej                 ded	edefd
            Zd ZdS )TunableRunnerinputsprofilerN   c                     dgS )ar  One tactic corresponding to one cuda kernel normally, but how to interpret the meaning
        of tactic is pure internal details of the runner.

        The autotuner will just pass the tactic value to the forward w/o any knowledge on what the tactic
        means.

        tactic==-1 has special meaning, means the fallback kernel which should be able to implement any shapes
        This fallback tactic is needed for 2 reasons:
            * when the autotuner cannot find a valid tactic in it's cache.
            * in eager mode, w/o autotunning the custom op should have at least one kernel, which makes the autotuning
              process an optional process, such that user can opt out.

        We choose not to have a standalone can_implement function, the tactics returned by get_valid_tactics should return
        valid kernel for these given input tensors.
        r5   )rL   r   r   s      r)   get_valid_tacticszTunableRunner.get_valid_tactics   s    & tr+   c                      | j         |fi |S r4   )forward)rL   r   kwargss      r)   __call__zTunableRunner.__call__   s    t|F--f---r+   r   Ftacticdo_preparationc                     t           )ad  Forward pass for tunable runners.

        Args:
            inputs: List of input tensors (position-only argument)
            tactic: Integer ID specifying which implementation tactic to use.
                   -1 (default) represents the fallback tactic that must be implemented
                   to handle any input shapes when autotuning is disabled.
            do_preparation: When True, allows one-time setup operations to be performed
                          before tactic evaluation begins. These operations are excluded
                          from the performance measurements during autotuning. Notice that
                          anything prepared in this phase should be persistent in the forward
                          and can be accessed by the following forward calls.

        Returns:
            Any: Output of the forward pass

        )NotImplementedError)rL   r   r   r   r   s        r)   r   zTunableRunner.forward   s
    2 "!r+   c                 h    t          t          | j                                                            S r4   )rP   rR   __dict__valuesrK   s    r)   rT   zTunableRunner.__hash__   s&    E$-..0011222r+   N)r   F)rU   rV   rW   r   r   r   Tensorrq   rY   r   r   boolr   r   rT   r5   r+   r)   r   r      s        5<(3F	c   ^(. . .  $	" "U\"" " 	" 
" " " ^"43 3 3 3 3r+   r   	tune_modec              #     K   t                                           j        }| t                                           _        | o| }|rt          j        d           	 d V  |t                                           _        |rt          j        d           d S d S # |t                                           _        |rt          j        d           w w xY w)Nz*[Autotuner]: Autotuning process starts ...z$[Autotuner]: Autotuning process ends)	AutoTunergetis_tuning_moder   info)r   old_modeautotune_enableds      r)   autotuner      s      }}-H%.IMMOO" 1\ B@AAA@)1	& 	@K>?????	@ 	@ *2	& 	@K>????	@s   B 7Cc                       e Zd ZU dZdZeed<    ee          Z	e
eee         f         ed<    ee          Ze
eeeeeef                  f         ed<    ee          Ze
eef         ed<    ee          Ze
eef         ed<   d	efd
ZdS )AutoTunerStatisticsa  Statistics collected by the AutoTuner.

    Attributes:
        cache_misses (int): Number of cache misses requiring fallback
        cache_miss_config_collection (Dict[str, Set[OptimizationProfile]]): Collection of configs that caused cache misses
        failed_profiling_count (Dict[str, int]): Number of failed profiling attempts per operation
        tuned_op_total_configs (Dict[str, int]): Total configurations tried per operation
        tuned_op_successful_configs (Dict[str, int]): Successful configurations per operation
    r   cache_missesr7   cache_miss_config_collectionfailed_profiling_counttuned_op_total_configstuned_op_successful_configsrN   c                    d}|d| j          dz  }| j        r\|dz  }t          | j                                                  D ]0\  }}|d| dz  }t          |t                    D ]}|d| dz  }1| j        r|d	z  }t          | j                                                  D ]}| j        |         }| j                            |d
          }t          | j
                            |t                                          }|d
k    r||z  dz  nd
}|d| dz  }|d| dz  }|d| dz  }|d| dz  }|d
k    r|dz  }| j
        |         D ]}	|d|	 dz  }|d|ddz  }|S )z7Return a string representation of collected statistics. zCache misses: 
zCache miss config collection:
z  z:
)keyz    - Config: zTuned operations:
r   d   z    - Total configs tried: z    - Successful configs: z    - Failed profiling count: z%    - Failed profiling combinations:
z      - z    - Success rate: z.1fz%
)r   r   sorteditemsstrr   keysr   r   rJ   r   set)
rL   	stats_stropprofilesr   total
successfulfailedsuccess_rate
failed_keys
             r)   __str__zAutoTunerStatistics.__str__  s	   	;d&7;;;;	, 	>::I &t'H'N'N'P'P Q Q > >H\"\\\)	%hC888 > >G!='!=!=!==II> & 	J..IT8==??@@ J J3B7!=AA"aHH
T8<<RGGHH=BQYY
U 2S 8 8A\"\\\)	D5DDDD	H*HHHH	HfHHHH	A::!III&*&A"&E ? ?
!%>
%>%>%>>		ILIIIII		r+   N)rU   rV   rW   rX   r   rY   rZ   r   dictr   r
   r   r   rR   r   r   r   rq   r   r   r   r5   r+   r)   r   r   	  s          L#:?%PT:U:U:U $sCJ"7UUU 	d### DSsM+>>?@@ $ $ $ .3U4-H-H-HDcNHHH27%2M2M2Mc3hMMM      r+   r   maxsizec                    t          d          }	 t          j        |          }|j        }n# t          t
          f$ r d }Y nw xY w|^t          | d         | d         | d         f          }||v r6t          j        d| d           d||         d         ||         d         d fS t          j        d|  d           d	S )
NT)r   r   r      z![Autotuner]: Loading configs for z from file.z1 from file failed; Using default configs instead.Fr   r   N)	r*   	importlibimport_modulebest_configsImportErrorAttributeErrorr   r   r   )r   module_namemoduler   ks        r)   load_from_filer   <  s    !D111K(55*(   QQQ())KJAJJJKKKa+\!_Q-?EE
KbCbbb   s   . AAc                      e Zd ZdZdZd!dZed             Zded	e	e
         d
eej                 dedeeeeef         f
dZded	e	e
         dede	ej                 dee
ef         f
dZde	ej                 de	ej                 fdZde
de	ej                 dedefdZdede	ej                 de	e         fdZe ed          deej                 dedefd                        Zedede
d
eej                 dedef
d            Zdej        de	e         dedej        fdZdede	ej                 de	ej                 fdZd"dZ d"d Z!dS )#r   a  AutoTuner for optimizing TensorRT-LLM operations.

    This class handles automatic performance tuning of tensor operations by profiling
    different implementations and caching the best performing configurations.

    Args:
        warmup (int): Number of warmup iterations before profiling (default: 3)
        repeat (int): Number of profiling iterations for averaging (default: 10)
        stream_delay_micro_secs (int): Delay on CUDA stream before the profiled kernel runs in microseconds (default: 1000)
    Nr   r?     c                     || _         || _        || _        i | _        d| _        t                      | _        d| _        d S )NFT)repeatwarmupstream_delay_micro_secsprofiling_cacher   r   statsprofiling_debug)rL   r   r   r   s       r)   __init__zAutoTuner.__init__]  sE    '>$!# )**
#r+   c                 D    | j         t                      | _         | j         S r4   )	_instancer   )clss    r)   r   zAutoTuner.geti  s    = %KKCM}r+   	custom_oprunnersinput_shapestuning_configrN   c                     |D ]y}t                               ||||          }t          j                            dd          dk    r| j        st          |          }|c S || j        v rdg| j        |         R c S zdS )a  Search for cached profiling results matching the current configuration.

        Args:
            custom_op (str): The name of the custom operation to be tuned
            runners (List[TunableRunner]): List of candidate implementations to profile
            profile (OptimizationProfile): Optimization profile

        Returns:
            A tuple containing:
            [is_cache_hit, runner_id, tactic, stored_profile]
        #FLASHINFER_AUTOTUNER_LOAD_FROM_FILE01Tr   )r   _get_cache_keyr    environr   r   r   r   )rL   r   r   r   r   r	cache_keyoutputs           r)   search_cachezAutoTuner.search_cacheo  s    $  	> 	>A!001lM I 
DcJJcQQ+ R (	22d222=d29====== 3 "!r+   r   c                    t          |                     |                    }| j        s}|                     ||||          \  }}}	}
||         }|sRt	          j        d| d|            t	          j        dt                              ||d         ||                      ||	fS t          |          dk    s
J d            t          d |D                       s
J d            | 
                    ||          }t          |          | j        j        |<   i }|D ]?}d t          j        |j                  j                                        D             ||<   @|D ]f}|                     ||          }|                     |||                                |          \  }}}	}|st)          d	          }d
\  }}	t+          |          D ]_\  }}|                    ||          }||         }d|v rt          |          dk    r ||fddd| |D ]}	  | j        |||fi |}n# t0          $ r}|                     |          }t	          j        d| d| d|            t	          j        d| d| d| d|            || j        j        vrt7                      | j        j        |<   | j        j        |                             t                              |||                                |                     t)          d	          }Y d}~nd}~ww xY w||k     r|}||}	}a|t                              |||         |                                |          }||	|f| j        |<   | j        j                            |d          dz   | j        j        |<   t	          j        d||          d|	 d|            h|                     ||||          \  }}}	}||         |	fS )a  Choose the best runner and tactic combination through performance profiling.

        Args:
            custom_op (str): The name of the custom operation to be tuned
            runners (List[TunableRunner]): List of candidate implementations to profile
            tuning_config (TuningConfig): Configuration for the tuning process
            inputs (List[torch.Tensor]): Input tensors for profiling
            **kwargs: Arbitrary keyword arguments, will be passed to get_valid_tactics and forward method of each runner

        Returns:
            Tuple[TunableRunner, int]: A tuple containing:
                - The selected runner implementation
                - The best tactic ID for that runner (-1 if using fallback)

        Note:
            The method profiles different implementations and tactics to find the
            optimal combination based on performance measurements. It caches results
            to avoid redundant profiling of the same configuration.
            Although runners[0] with tactic=-1 is always treated as the fallback runner.
            Runner authors are suggested to provide a fallback implementation for each runner to avoid potential issues.
        z([AutoTunner]: Using fallback tactic for z with input shapes z[AutoTunner]: Generated keyr   zAt least one runner is requiredc                 8    g | ]}t          |t                    S r5   )rQ   r   )rG   r   s     r)   rH   z(AutoTuner.choose_one.<locals>.<listcomp>  s"    BBBQJq-00BBBr+   z3All Given runners must be subclass of TunableRunnerc                     h | ]	}|j         
S r5   )name)rG   params     r)   	<setcomp>z'AutoTuner.choose_one.<locals>.<setcomp>  s'     ' ' '$
' ' 'r+   inf)NNr   r   T)r   r   z[Autotuner]: Skipping tactic r   z", due to failure while profiling: z#[Autotuner]: Failed when profiling 	, shapes=z. Error occurred: Nr   z&[Autotuner]: profiling chosen runner: z for ) rR   _get_input_sizesr   r   r   debugr   r   rJ   all_generate_optimization_profilesr   r   inspect	signaturer   
parametersr   _prepare_input_tensorsrs   float	enumerater   _profile_single_kernel	Exceptionwarningr   r   addr   r   r   )rL   r   r   r   r   r   r   is_cache_hit	runner_idr   stored_profilerunnerr   runner_arg_names_mapr   ptensorsr   min_timer_idvalid_tacticsrunner_arg_namestactime_measurederE   r   s                              r)   
choose_onezAutoTuner.choose_one  s"   < T226::;; " 	">B>O>O7L-? ?;L)V^ Y'F   kykk]ikk    A)2J2J9V]^_V`bnp}2~2~  A  A   6>!7||a!BBB'BBBCC 	
 	
A	
 	
C 77vNN7:8}}
))4  " 	 	A' '(/(9!)(D(D(O(V(V(X(X' ' ' ##  :	 :	A11!V<<G151B1B7A$4$4$6$62 2.L)VQ   5 <<$.!	6(11 $: $:GD!$%$7$7$C$CM';A'>$'+;;;M@R@RUV@V@V'L"TLLVLLL, : :9,GD,G !7C- -39- -MM  ) 9 9 9%)%:%:7%C%CF"N n n nC n nkl n n  
 #L ua u u# u uX^ u urs u u  
  )
0QQQORuu
 A) L J=iHLL ) 8 8$-q!2B2B2D2Dm!" !"   -2%LLMMMMMM-9. )833'4H04cvI=:> ( ) 8 8!79#5q7G7G7I7I=! !I 8A&!6LD(3
>BB9aPPSTT J:9E LnASnnV\nnclnn   #'"3"3wm#
 #
9fa y!6))s   H++
L	5CL	L	c                     d |D             }|S )Nc                     g | ]D}t          |t          j                  r|                                nt          j        d           ES ))r   )rQ   r   r   sizeSize)rG   inputs     r)   rH   z.AutoTuner._get_input_sizes.<locals>.<listcomp>  sO     
 
 
 'uel;;QEJJLLLDAQAQ
 
 
r+   r5   )rL   r   sizess      r)   r   zAutoTuner._get_input_sizes  s'    
 

 
 

 r+   r   r   c           
         t           j                                        }t          | j                  D ]} ||fd|i| |                                 | j        dk    rt          | j                   t           j                            d          }t           j                            d          }|	                    |           t          | j
                  D ]} ||fd|i| |	                    |           |                                 |                    |          | j
        z  }	|                     |          }
t          j        d| d| d|
 d	|	            |	S )
aO  Profile a single kernel implementation for performance measurement.

        Args:
            runner (TunableRunner): The runner implementation to profile
            inputs (List[torch.Tensor]): Input tensors for the kernel
            tactic (int): Tactic ID to use for this profiling run

        Returns:
            Average execution time in milliseconds

        Note:
            The method performs warmup runs, then measures multiple iterations
            to get an average execution time. Stream synchronization and delays
            are used to ensure accurate timing.
        r   r   T)enable_timing)streamz[Autotuner]: profiling r   r   z, avg_time )r   r   current_streamrI   r   synchronizer   r   Eventrecordr   elapsed_timer   r   r   )rL   r   r   r   r   r  r   startendavg_timerE   s              r)   r   z AutoTuner._profile_single_kernel!  s   $ **,,t{## 	4 	4AF633&3F3333
 '!++5666
  t 44jT22F###t{## 	4 	4AF633&3F3333

&
!!!%%c**T[8&&v..]f]]v]]]]S[]]	
 	
 	
 r+   c           
      t   t          d |D             dgt          |          z            }g }g }|j        D ]L}t          j        |j                  s+t          |j        t          t          f          s
J d            t          |j	                  t          |j
                  k    s4J dt          |j	                   dt          |j
                               t          |j                  t          |j	                  k    s4J dt          |j                   dt          |j	                               t          |j	                  D ]\  }}|j        |         |j        |<   t          j        |j                  rO|                    |j        |j	        d                  |j
        d                                                            }	n|j        }	t          t          t!          |	                              }	t          |	          dk    s
J d            d	 t#          |	t          |	d
d                   t%          d          fz   d          D             }
|                    |j	        |j
        |
|	f           Nt)          j        d |D              }|D ]}t-          j        |          }t          |          D ]f\  }\  }}}
}||         }|}|
|         }t1          t          |                    D ]-}t3          |||          |j        ||                  ||         <   .g|j        D ]V}|                    |                                          x}x}}t3          |||          |j        |j	                 |j
        <   W|                    |           t;          j        d|            |S )a  Generate optimization profiles for autotuning.

        Args:
            tuning_config (TuningConfig): Tuning configuration
            inputs (List[torch.Tensor]): List of input tensors

        Returns:
            List of OptimizationProfile objects representing different configurations

        Note:
            This method performs a cartesian product of all possible dimension
            combinations specified in dynamic_tensor_specs.
        c                     g | ]J}t          |t          j                  rd  |                                D             nt	          d          gKS )c                 ,    g | ]}t          |          S r5   )re   )rG   xs     r)   rH   zHAutoTuner._generate_optimization_profiles.<locals>.<listcomp>.<listcomp>g  s    444aYq\\444r+   r   )rQ   r   r   r  re   )rG   r{   s     r)   rH   z=AutoTuner._generate_optimization_profiles.<locals>.<listcomp>e  sb         "!U\22(44166884444#A,,	  r+   Nz`The given dynamic dimension must provide a opt value generation function or a list of opt valueszHThe number of input indices and dimension indices must be the same, got z and zJThe number of tensor initializers and input indices must be the same, got r   z$Empty tuning buckets are not allowedc                     i | ]\  }}||	S r5   r5   )rG   v1v2s      r)   
<dictcomp>z=AutoTuner._generate_optimization_profiles.<locals>.<dictcomp>  s.       B B  r+   r   r   T)strictc                     g | ]
}|d          S )r   r5   rw   s     r)   rH   z=AutoTuner._generate_optimization_profiles.<locals>.<listcomp>  s    'D'D'D!"'D'D'Dr+   z [Autotuner]: generated profile: )rq   rJ   rb   r   
isfunctionr1   rQ   listrR   r/   r0   r9   r   rE   rh   r   r   zipr   ry   	itertoolsproductcopydeepcopyrI   rk   rc   r^   rs   r   r   )rL   r   r   base_profilegenerated_profilesdynamic_dimsspeciidxrz   opt_shapes_max	dim_grids	opt_pointr   posr/   r0   _opt_shapes	opt_value	min_value	max_valueconstraint_specs                         r)   r   z)AutoTuner._generate_optimization_profilesP  s}   ( +      FS[[ 

 

 9;.0!6 "	 "	D%d&=>> *'$C C   s  
 t~&&#dl*;*;;;; I[^_c_m[n[n  I  Iuxy}  zF  vG  vG  I  I <;; t/00C4G4GGGG W]`aeay]z]z  W  W  BE  FJ  FT  BU  BU  W  W HGG $DN33 T T38<8PQR8S055!$"9:: 5!44 'q(9:4<?KPPRR 

 "4
 vc*oo6677Jz??Q&&&(N&&& !jn 5 5u GPT    N ~zJ   
 %'D'D|'D'D'DE	" 	A 	AIl++AJSK K 
 
FFi.+ &cN	%	*95	s9~~..  A9C!9i: :AHYq\*71:66 $1#A  4C4O4O$$&&5 5 	 I	 y)Y?? 23O4KLL %%a(((L?A??@@@@!!r+   r   rE   c                 d   t          d |D                       }|j        D ]X}|                    ||j        d                  |j        d                            ||j        d                  |j        d         <   Y|j        D ]}d||j                 |j        <   t          d |D                       S )a  Find the nearest optimization profile for given inputs
        User can define their own nearest profile generation method to reduce the host overhead.

        Args:
            shapes: Tuple of input tensor shapes
            tuning_config: Tuning configuration

        Return:
            Tuple: A tuple containing:
                - attributes: Tuple of runner attributes, sorted.
                - profile: Tuple of input tensor shapes
        c              3   4   K   | ]}t          |          V  d S r4   )r"  rG   r   s     r)   	<genexpr>z2AutoTuner._find_nearest_profile.<locals>.<genexpr>  s(      <<EDKK<<<<<<r+   r   r   c              3   4   K   | ]}t          |          V  d S r4   )rR   r9  s     r)   r:  z2AutoTuner._find_nearest_profile.<locals>.<genexpr>  s(      <<eU5\\<<<<<<r+   )r"  rb   r2   r/   r0   rc   rR   )r   rE   r   r(  r+  r6  s         r)   _find_nearest_profilezAutoTuner._find_nearest_profile  s    " <<V<<<<<!6 	 	D** !23DLOD  *+DLO<<  -= 	R 	ROOQL23O4KLL<<|<<<<<<r+   c                 d    ||j         j        t          |          |                     ||          fS r4   )	__class__rU   rP   r<  )r   r   r   r   r   s        r)   r   zAutoTuner._get_cache_key  s6     %LL%%lMBB	
 	
r+   origin_tensordimsinitializerc                    |j         }|j        }g }|D ]c}t          |t                    r|                    |j                   2t          |t                    sJ |                    |j                   d ||||          S )a  Create a new tensor matching the properties of the original tensor.

        Args:
            origin_tensor (torch.Tensor): Template tensor to match
            dims (List[Dim]): List of dimensions for the new tensor

        Returns:
            New tensor with specified dimensions and matching properties

        Note:
            Creates a zero tensor with the same dtype and device as the original,
            but with dimensions specified by the dims parameter.
        )rF   r>   rQ   re   ry   rf   rk   rm   )rL   r?  r@  rA  rF   r>   rE   rx   s           r)   _create_tensor_likezAutoTuner._create_tensor_like  s      #% 	% 	%A!Y'' %ae$$$$ "!Z00000ae$$$${65&111r+   r   c                    d }g }t          |j                  D ]f\  }}t          d |D                       r+|                     ||         ||j        |         p|          }n||         }|                    |           g|S )Nc                 `    t          j        | |          dz  dz
                      |          S r=   rA   rD   s      r)   r6   z2AutoTuner._prepare_input_tensors.<locals>.<lambda>  s.    Jvf---2Q6
"U)) r+   c              3   @   K   | ]}t          |t                    V  d S r4   )rQ   rk   rw   s     r)   r:  z3AutoTuner._prepare_input_tensors.<locals>.<genexpr>  s,      88:a,,888888r+   )r   rE   anyrC  r9   ry   )rL   r   r   default_initializerr   r,  r   tensors           r)   r   z AutoTuner._prepare_input_tensors  s      gn-- 		# 		#DAq88a88888 #111I/2I6I   NN6""""r+   c                 8    | j                                          dS )zClear the profiling cache.N)r   clearrK   s    r)   clear_cachezAutoTuner.clear_cache  s    ""$$$$$r+   c                 ,    t                      | _        dS )zReset all statistics counters.N)r   r   rK   s    r)   reset_statisticszAutoTuner.reset_statistics  s    (**


r+   )r   r?   r   )rN   N)"rU   rV   rW   rX   r   r   classmethodr   r   r   r   r   r   r	  ra   r   rY   rq   r   r   r  r   r   r   r   r   r<  r   r|   r	   rC  r   rL  rN  r5   r+   r)   r   r   O  s       	 	 I
$ 
$ 
$ 
$   [
"" m$" EJ'	"
 $" 
tS#22	3" " " "BF*F* m$F* $	F*
 U\"F* 
}c!	"F* F* F* F*PtEL'9 d5:>N    -#--1%,-?-IL-	- - - -^b")b"373Eb"	!	"b" b" b" b"H Yt=5:&=7C=	= = =  [=: 

 
 EJ'	

 $
 

 
 
 [
2"\215c2IQ2	2 2 2 28*484F	el	   &% % % %+ + + + + +r+   r   )T)-
contextlibr&  r   r   r$  r    abcr   r   dataclassesr   r   	functoolsr   typingr   r	   r
   r   r   r   r   r   r   flashinfer.tllm_utilsr   jit.corer   r   r   r*   r.   r]   ra   re   rk   r|   rq   r~   r   contextmanagerr   r   r   r   r5   r+   r)   <module>rX     s                 				 # # # # # # # # ( ( ( ( ( ( ( (       I I I I I I I I I I I I I I I I I I I I  / . . . . .      
  
t 
 
 
 
 )
 )
 )
 )
 )
 )
 )
 )
X 4(((       )( 4T***(6 (6 (6 (6 (6 (6 (6 +*(6V t        t        J	!" ! ! ! ! ! ! ! !&        53 53 53 53 53C 53 53 53p @ @ @ @ @ @ / / / / / / / /d 4  $H+ H+ H+ H+ H+ H+ H+ H+ H+ H+r+   