
    `ic1                        d dl Z d dlZd dlmZmZ d dlmZ d dlmZ d dl	m
Z
mZmZmZ d dlmZmZmZmZ d dlZd dlmZmZ d dlmZ ej                            ed	          Zeoej                                        Zd
Z ed          Z  ed          Z!deee
e f         e!f         deee
e f         e!f         fdZ" G d d          Z# G d de#          Z$ G d de$          Z%er
 e%            n	 e$            Z&dS )    N)cached_propertywraps)chain)median)AnyCallableOptionalUnion)Concatenate	ParamSpecSelfTypeVar)countersdynamo_timed)use_experimental_benchmarkerbenchmarkingi  PTfnreturnc           	           t                     dt          dt          j        dt          j        dt
          f fd            }|S )zWraps `fn` with `dynamo_timed` context, and increments the appropriate dynamo
    counters. It is expected that `fn` is a method of `Benchmarker` or one of its
    subclasses; typing limitations prevent us from declaring this directly.
    selfargskwargsr   c                     | j         j         dj         }t          d         d| xx         dz  cc<   t          |d          5   | g|R i |cd d d            S # 1 swxY w Y   d S )N.inductorzbenchmarking.   F)log_pt2_compile_event)	__class____name__r   r   )r   r   r   fn_qual_namer   s       x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_inductor/runtime/benchmarking.pywrapperztime_and_count.<locals>.wrapper"   s    .1AABKAA;\;;<<<A<<<,eDDD 	- 	-2d,T,,,V,,	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	-s   A  A$'A$)r   r   r   r   r   r   )r   r$   s   ` r#   time_and_countr%      sZ     2YY-c -!& -AH - - - - - - Y- N    c                       e Zd ZdeddfdZedededef         deedf         de	e
ef         d	edefd
            Ze	 ddedeg ef         dededef
d            Zededed	edefd            ZdS )Benchmarkerr   r   Nc                     d S N )r   s    r#   __init__zBenchmarker.__init__-   s    r&   r   .fn_args	fn_kwargsr   c                 l   d}t                                                    D ]A}t          |t          j                  s||j        }'|j        |k    rt          d          B|t          d          fd}|t          j        d          k    r | j        |fi |S  | j        |fi |S )a  Benchmark `fn(*fn_args, *fn_kwargs)` and return the runtime, in milliseconds (the
        actual runtime calculation is dictated by the benchmarking implementation, but may be
        one of [mean, median, minimum, etc.]). Functions as a convenience wrapper around
        device-specific implementations, like `benchmark_cpu` and `benchmark_gpu`. Raises
        `ValueError(...)` if we can't safely infer the device type of `fn`; for example,
        if multiple device types are found in `fn_args` and `fn_kwargs`, or if no device
        types are found.

        Arguments:
        - fn: The function to benchmark.
        - fn_args: The function's arguments.
        - fn_kwargs: The function's kwargs.

        Keyword Arguments:
        - **kwargs: The benchmarking implementation's kwargs.

        Returns:
        - The runtime of `fn(*fn_args, **fn_kwargs)`, in milliseconds.
        NzcCan't safely infer the device type of `fn` with multiple device types in `fn_args` and `fn_kwargs`!zCan't safely infer the device type of `fn` with no device types in `fn_args` or `fn_kwargs`! You should be calling `.benchmark_cpu` or `.benchmark_gpu` directly.c                        i S r*   r+   )r   r-   r.   s   r#   <lambda>z'Benchmarker.benchmark.<locals>.<lambda>Y   s    BB5955 r&   cpu)	r   values
isinstancetorchTensordevice
ValueErrorbenchmark_cpubenchmark_gpu)r   r   r-   r.   r   inferred_devicearg_or_kwarg	_callables    ```    r#   	benchmarkzBenchmarker.benchmark0   s   6 !'9+;+;+=+=>> 	 	LlEL99 &"."5$77 y   8 " t   655555	el51111%4%i::6::: "t!)66v666r&      d   r=   warmuprepc                     dt           dt          t                   ffd} ||           t           ||                    S )a  Benchmark the CPU callable, `_callable`, and return the median runtime,
        in milliseconds.

        Arguments:
        - _callable: The CPU callable to benchmark.

        Keyword Arguments:
        - warmup: Optionally, the duration, in milliseconds, to run `_callable`
        before benchmarking starts.
        - rep: Optionally, the duration, in milliseconds, to run `_callable`
        during benchmarking.

        Returns:
        - The median runtime of `_callable`, in milliseconds.
        msr   c                     g }t          j                    }	 t          j                    }              t          j                    }|                    ||z
  t          z             ||z
  t          z  | k    rnc|S r*   )timeperf_counterappendMILLISECONDS_PER_SECOND)rD   timingsrun_start_tstart_tend_tr=   s        r#   run_forz*Benchmarker.benchmark_cpu.<locals>.run_foru   s    G+--K+--	)++3JJKKK[(,CCrII Nr&   )intlistfloatr   )r   r=   rA   rB   rN   s    `   r#   r9   zBenchmarker.benchmark_cpua   sX    (
	 
	U 
	 
	 
	 
	 
	 
	 	ggcll###r&   r   c                     t           r*   )NotImplementedError)r   r   r   s      r#   r:   zBenchmarker.benchmark_gpu   s    !!r&   )r?   r@   )r!   
__module____qualname__r   r,   r%   r   r   tupledictstrrQ   r>   rO   r9   r:   r+   r&   r#   r(   r(   ,   s>       t      .7.7S#X.7 sCx.7 S>	.7
 .7 
.7 .7 .7 ^.7` OR $  $ $'C0 $:= $IL $	 $  $  $ ^ $D "D " " " " " " ^" " "r&   r(   c            	       v    e Zd Zedededef         fd            Zededeg ef         dede	fd            Z
dS )	TritonBenchmarkerr   r   .c                 Z    	 ddl m} n"# t          $ r}t          d          |d}~ww xY w|S )z"Lazily import Triton's `do_bench`.r   )do_benchzrequires TritonN)triton.testingr\   ImportErrorrS   )r   r\   es      r#   triton_do_benchz!TritonBenchmarker.triton_do_bench   sV    	@/////// 	@ 	@ 	@%&788a?	@s   	 
(#(r=   r   c                    t          j        | j                  j        }t	          |                                          D ]	}||vr||= 
d|v r | j        |fi |d         S d|v r | j        |fi |S  | j        |fi |ddiS )a  Benchmark the GPU callable, `_callable`, and return the runtime, in milliseconds.

        Arguments:
        - _callable: The GPU callable to benchmark.

        Keyword Arguments:
        - quantiles: Optionally, a tuple of floats denoting the requested quantiles.
        - return_mode: Optionally, the requested return mode. Currently, Triton's
        `do_bench` supports min, max, mean, and median return modes.
        - **kwargs: Additional kwargs passed to Triton's `do_bench`.

        Returns:
        - The runtime of `callable`, in milliseconds. If `kwargs["quantiles"]` is specified,
        this is the first requested quantile. Else, if `kwargs["return_mode"]` is specified,
        this is the requested return mode. Otherwise, this is the median.
        	quantilesr   return_moder   )inspect	signaturer`   
parametersrP   keys)r   r=   r   do_bench_paramskwargs        r#   r:   zTritonBenchmarker.benchmark_gpu   s    $ "+D,@AAL&++--(( 	" 	"EO++5M&  '4'	<<V<<Q??f$$'4'	<<V<<<#t#INNNNXNNNNr&   N)r!   rT   rU   r   r   r   r   r`   r%   rQ   r:   r+   r&   r#   rZ   rZ      s        d xS'9    _ OD OXb#g-> O# ORW O O O ^O O Or&   rZ   c                   z   e Zd Zededefd            Zdededeee	j
        j        e	j
        j        f                  fdZdedeee	j
        j        e	j
        j        f                  defdZe	 	 	 	 	 	 ddedeg ef         dedededededeee	j                          dedeeee         f         fd            ZdS )InductorBenchmarkerr   r   c                     t           j                                        }t           j                            |          }|j        S )z7Get the L2 cache size, in bytes, of the current device.)r5   cudacurrent_deviceget_device_propertiesL2_cache_size)r   r7   propss      r#   rp   z!InductorBenchmarker.L2_cache_size   s5     **,,
0088""r&   itersc                 4    d t          |          D             S )z!Get `iters` pairs of CUDA events.c                     g | ]B}t           j                            d           t           j                            d           fCS )T)enable_timing)r5   rm   Event).0_s     r#   
<listcomp>z7InductorBenchmarker.get_event_pairs.<locals>.<listcomp>   sX     
 
 

  
  t 44
  t 44
 
 
r&   )range)r   rr   s     r#   get_event_pairsz#InductorBenchmarker.get_event_pairs   s+    
 

 5\\
 
 
 	
r&   event_pairsc                 4    t          d |D                       S )zIGet the minimum timing, in milliseconds, for a group of CUDA event pairs.c                 >    g | ]\  }}|                     |          S r+   elapsed_timerw   start_event	end_events      r#   ry   zBInductorBenchmarker.get_event_pairs_min_timing.<locals>.<listcomp>   s:       *K ((33  r&   )min)r   r|   s     r#   get_event_pairs_min_timingz.InductorBenchmarker.get_event_pairs_min_timing   s2      .9  
 
 	
r&      r@      r   Nr=   estimation_itersmemory_warmup_itersbenchmark_itersmax_benchmark_durationrc   grad_to_noner   c           	         t           j                                          |             t           j                                         t          j        | j        dz  t           j        d          }	|	                                 |                     |          }
|
D ]Y\  }}||D ]	}d|_        
|	                                 |	                                  |             |	                                 Zt           j                                         | 
                    |
          }t          t          |t          ||z                      d          }t          |          D ]}|	                                 |                     |          }
|
D ]Y\  }}||D ]	}d|_        
|	                                 |	                                  |             |	                                 Zt           j                                         ~	|dk    rd |
D             }|S |dk    r%| 
                    |
          }t          ||          S t          d	| d
          )a  Benchmark a GPU callable using a custom benchmarking implementation.

        Arguments:
        - _callable: The callable to benchmark.

        Keyword Arguments:
        - estimation_iters: Optionally, the number of iterations to run `_callable`
        during runtime estimation.
        - memory_warmup_iters: Optionally, the number of iterations to flush the L2
        cache before starting benchmarking.
        - benchmark_iters: Optionally, the number of iterations to run `_callable`
        during the benchmarking.
        - max_benchmark_duration: Optionally, the maximum duration of the benchmarking,
        in milliseconds. An estimated duration is calculated based on the values
        of `memory_warmup_iters` and `benchmark_iters`, along with the estimated
        runtime of `_callable` and various other factors, and we then shrink
        `benchmark_iters` to fit in the allotted maximum duration.
        - return_mode: Return mode for benchmark results. Options are "min" (default),
        "all" (returns all measurements).
        - grad_to_none: Optionally, a list of tensors whose gradients should be cleared
        before each benchmark iteration.
        - **kwargs: Additional kwargs that may be passed to the fallback.

        Returns:
        - If return_mode="min": The minimum runtime of `_callable`, in milliseconds.
        - If return_mode="all": List of all runtime measurements, in milliseconds.
           rm   )dtyper7   Nr   allc                 >    g | ]\  }}|                     |          S r+   r   r   s      r#   ry   z5InductorBenchmarker.benchmark_gpu.<locals>.<listcomp>,  s:       *K ((33  r&   r   zUnsupported return_mode: z. Use 'min' or 'all'.)r5   rm   synchronizeemptyrp   rO   zero_r{   gradrecordr   maxr   rz   r8   )r   r=   r   r   r   r   rc   r   r   bufferr|   r   r   xestimated_timingrx   all_timingsbenchmarked_timings                     r#   r:   z!InductorBenchmarker.benchmark_gpu   s   P 	
    		
    T/14EIfUUU **+;<<&1 	 	"K'% " "A!AFFLLNNN   IKKK
   ::;GG %;?O%O!P!PQQST
 

 *++ 	 	ALLNNNN **?;;&1 	 	"K'% " "A!AFFLLNNN   IKKK
     % .9  K E!!!%!@!@!M!M ');<<<NKNNN  r&   )r   r@   r@   r   r   N)r!   rT   rU   r   r   rO   rp   rP   rV   r5   rm   rv   r{   rQ   r   r%   r   r   rX   r	   r6   r
   r:   r+   r&   r#   rk   rk      s       #D #S # # # _#





	eEJ$ej&667	8

 

 

 

	
	
!%eEJ,<ej>N,N&O!P	
		
 	
 	
 	
  !"#&"&( 59i iiBG$i i !	i
 i !$i i tEL12i i 
ud5k!	"i i i ^i i ir&   rk   )'rd   rF   	functoolsr   r   	itertoolsr   
statisticsr   typingr   r   r	   r
   typing_extensionsr   r   r   r   r5   torch._dynamo.utilsr   r   torch._inductor.configr   _logginggetArtifactLoggerr!   loggerrm   is_availablerI   r   r   r%   r(   rZ   rk   benchmarkerr+   r&   r#   <module>r      s2     , , , , , , , ,             1 1 1 1 1 1 1 1 1 1 1 1 C C C C C C C C C C C C  6 6 6 6 6 6 6 6 ? ? ? ? ? ? 
	)	)(N	C	C >UZ%<%<%>%> 
  IcNNGCLLS!V$a'(k#q&!1$%   $Z" Z" Z" Z" Z" Z" Z" Z"z$O $O $O $O $O $O $O $ONI I I I I+ I I IZ :R?P?P?R?R r&   