
    `iy                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlmZmZ d dlmZ d dlmZmZmZmZ d dlmZmZmZmZmZmZ d dlZd dl Zd dl!m"Z" d dl#m$Z$ d d	l%m&Z& d d
l'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 erd dl6m7Z7 d dl8m9Z9m:Z: ddl;m<Z< ddl=m>Z> ddl?m@Z@ dZA e3eBd          ZC G d deD          ZE G d d          ZF G d d          ZGee&jH        e&jI        f         ZJejK         G d d                      ZLejK         G d d                      ZM G d  d!eM          ZN G d" d#          ZO G d$ d%          ZP G d& d'eM          ZQ G d( d)eOeQ          ZR G d* d+ePeQ          ZS G d, d-eOeM          ZT G d. d/ePeM          ZU G d0 d1eOeM          ZVejW        d8d3            ZXd9d7ZYdS ):    )annotationsN)IterableSequence)ThreadPoolExecutor)byrefc_size_tc_void_pCDLL)AnyCallableIOOptionalTYPE_CHECKINGUnion)get_interface_for_device)rand_strided)ir)CppCodeCacheCUDACodeCache
DLLWrapperget_hashPyCodeCache)get_gpu_typeget_ld_library_pathis_gpupython_subprocess_env)getArtifactLogger)
OrderedSet)
ModuleType)PartialRenderTritonTemplateCaller   )config)benchmarker)VCUDA_VISIBLE_DEVICES
autotuningc                      e Zd ZdS )!NonzeroWorkspaceNotSupportedErrorN__name__
__module____qualname__     t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/_inductor/autotune_process.pyr)   r)   ;           Dr/   r)   c                      e Zd ZdZedd            Zed d
            Zed!d            Zd"dZd Z	d#dZ
d$dZd%d&dZd'd(dZd)dZd)dZd)dZdS )*TuningProcesszF
    Class to launch and interact with a benchmarking subprocess.
    	read_pipe	IO[bytes]
write_pipereturnNonec                     t                               dt          j                    t          j                            t                                fd}	  |             dS # t          $ r Y dS w xY w)z4
        Entry point for the child process.
        z3Started autotune subprocess %s. Visible devices: %sc                     	 t                                         } | d S 	  |             }n# t          $ r}|}Y d }~nd }~ww xY wt                               |           _N)r3   recv	Exceptionsend)jobresulter4   r6   s      r0   workloopz,TuningProcess.process_main.<locals>.workloopO   s    	7#((33;E SUUFF    FFFFFF""6:666	7s   
- 
A>AN)autotuning_logdebugosgetpidenvirongetr&   EOFError)r4   r6   rB   s   `` r0   process_mainzTuningProcess.process_mainD   s    
 	AIKKJNN/00	
 	
 	

	7 
	7 
	7 
	7 
	7 
	7	HJJJJJ 	 	 	DD	s   
A% %
A32A3objr   c                X    t          j        | |           |                                 d S r;   )pickledumpflush)rK   r6   s     r0   r>   zTuningProcess.senda   s,    C$$$r/   c                *    t          j        |           S r;   )rM   load)r4   s    r0   r<   zTuningProcess.recvf   s    {9%%%r/   deviceOptional[int]c                <    || _         |                                  d S r;   )rR   start)selfrR   s     r0   __init__zTuningProcess.__init__j   s    

r/   c                   t           j                            t           j                            t                    d          }t          j                    \  }}t          j                    \  }}t          j        |d          | _        t          j        |d          | _        t          j
                    | _        | j                            | j        t          j                   t          j        |dt          j                     dt#          |           dt#          |           g}i t%                      dt'                      t(          j        rdndd	}| j        t#          | j                  |t.          <   t1          j        ||||f          | _        t          j        |           t          j        |           d| _        d
S )z4
        Start the benchmarking subprocess.
        z__autotune_main__.pywbrbz	--parent=z
--read-fd=z--write-fd=01)TORCH_WARM_POOLLD_LIBRARY_PATH3TORCHINDUCTOR_PROFILE_WITH_DO_BENCH_USING_PROFILINGN)envpass_fdsT)rE   pathjoindirname__file__pipefdopenr6   r4   	selectorsDefaultSelectorselectorregister
EVENT_READsys
executablerF   strr   r   r#   /profile_bandwidth_with_do_bench_using_profilingrR   r&   
subprocessPopenprocesscloserunning)rV   entrysubproc_read_fdwrite_fdread_fdsubproc_write_fdcmdr`   s           r0   rU   zTuningProcess.startn   s    RW__X668NOO$&GII!$&GII!!)Hd337D11!133t~y/CDDD N%	%%/_--//1#.//11

#%%
  #244 ED33
 
 
 ;"(+DK(8(8C$%!'%'78
 
 

 	!!!
!"""r/   boolc                F    | j         o| j                                        du S )z:
        True if the subprocess is still running.
        N)ru   rs   pollrV   s    r0   alivezTuningProcess.alive   s$     |; 1 1 3 3t ;;r/   reqc                    |                                  s|                                  t                              || j                   dS )z8
        Push a work item to the child process.
        N)r   rU   r3   r>   r6   )rV   r   s     r0   putzTuningProcess.put   s?     zz|| 	JJLLL300000r/         ^@timeoutfloatc                   	 | j                             |          st          d| j        j                   t
                              | j                  }n# t          $ r |                                   t          $ r | 
                                  t          $ r; t                              d| j        j                   |                                   w xY wt          |t                    r||S )z
        Get a response from the child process. Raises TimeoutError on timeout;
        raises EOFError if the subprocess crashes.
        zTimeout in autotune subprocess z.Unexpected exception in autotune subprocess %s)rj   selectTimeoutErrorrs   pidr3   r<   r4   killrI   rt   r=   rC   	exception
isinstance)rV   r   r@   s      r0   rH   zTuningProcess.get   s   
	=''00 Y"#WT\EU#W#WXXX"''77FF 	 	 	IIKKK 	 	 	JJLLL 	 	 	$$@$,BR   IIKKK	 fi(( 	Ls   AA BCTwaitc                    |                                  r t                              d| j                   |r|                                  dS dS )zC
        Signal the child process to shut down gracefully.
        N)r   r3   r>   r6   r   )rV   r   s     r0   shutdownzTuningProcess.shutdown   sO     ::<< 	6tT_555 	IIKKKKK	 	r/   c                    |                                  r| j                                         |                                  dS )z5
        Wait for the child process to exit.
        N)r   rs   r   rt   r   s    r0   r   zTuningProcess.wait   s9     ::<< 	 L

r/   c                    | j                                          | j                                         | j                                         d| _        dS )z"
        Close resources.
        FN)rj   rt   r4   r6   ru   r   s    r0   rt   zTuningProcess.close   sK     	r/   c                    |                                  r>t                              d| j        j                   | j                                         |                                  dS )z6
        Send a SIGKILL to the child process.
        z)Sending SIGKILL to autotune subprocess %dN)r   rC   errorrs   r   r   rt   r   s    r0   r   zTuningProcess.kill   s^     ::<< 	   ;    L

r/   N)r4   r5   r6   r5   r7   r8   )rK   r   r6   r5   r7   r8   )r4   r5   r7   r   )rR   rS   )r7   r|   )r   r   r7   r8   )r   )r   r   r7   r   )T)r   r|   r7   r8   r7   r8   )r+   r,   r-   __doc__staticmethodrJ   r>   r<   rW   rU   r   r   rH   r   r   rt   r   r.   r/   r0   r3   r3   ?   s8            \8    \ & & & \&   + + +Z< < < <1 1 1 1    6          
 
 
 
 
 
r/   r3   c                  J    e Zd ZdZddZedd            ZddZddZddZ	dS )TuningProcessPoolz
    Maintains a pool of TuningProcesses to benchmark kernels in parallel
    across devices. By default, we create one TuningProcess per device and
    set the sub-process environment to make only that device visible.
    r7   r8   c                B   |                                  }t                              d|           d |D             | _        t	          j                    | _        | j        D ]}| j                            |           t          t          |                    | _
        dS )z,
        Start the child processes.
        z$Sub-process autotune device list: %sc                .    g | ]}t          |           S ))rR   )r3   ).0rR   s     r0   
<listcomp>z.TuningProcessPool.__init__.<locals>.<listcomp>   s#    MMM6-v666MMMr/   )max_workersN)get_device_listrC   rD   	processesqueueQueueprocess_queuer   r   lenexecutor)rV   devicesps      r0   rW   zTuningProcessPool.__init__   s     &&((CWMMM NMWMMM9> 	& 	&A""1%%%%
 +s7||DDDr/   Sequence[Optional[int]]c                 v   t           j        sdgS t                      } t          |           }|                                }t
          t          j        v rKd t          j        t
                                       d          D             }t          |          |k    sJ |S t          t          |                    S )zD
        Gather the list of devices to be used in the pool.
        Nc                ,    g | ]}t          |          S r.   )int)r   ds     r0   r   z5TuningProcessPool.get_device_list.<locals>.<listcomp>  s    SSS!s1vvSSSr/   ,)r#   autotune_multi_devicer   r   device_countr&   rE   rG   splitr   listrange)gpu_typedevice_interfacecountr   s       r0   r   z!TuningProcessPool.get_device_list  s    
 + 	6M>>3H== --//  2:--SSrz2F'G'M'Mc'R'RSSSGw<<5((((NE%LL!!!r/   c                    | j                                          | j        D ]}|                    d           | j        D ]}|                                 dS )z5
        Signal all child processes to exit.
        F)r   N)r   r   r   r   )rV   r   s     r0   r   zTuningProcessPool.shutdown  sj     	    	# 	#AJJEJ"""" 	 	AFFHHHH	 	r/   choicer!   r   c                p   |j         J | j                                        }|                    |j         j                   	 |                    t
          j                  | j                            |           S # t          $ rD t          j	        d| d           t          d          cY | j                            |           S t          $ rD t          j	        d| d           t          d          cY | j                            |           S w xY w# | j                            |           w xY w)z
        Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
        remove it from the queue, execute the benchmark in that subprocess, and return
        the TuningProcess to the queue.
        NzTimed out benchmarking choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.infzFailed to benchmark choice ')bmreqr   rH   r   	benchmarkr#   +max_autotune_subproc_result_timeout_secondsr   warningswarnr   r=   )rV   r   rs   s      r0   targetzTuningProcessPool.target%  sz    |'''$((**FL*+++	,;;B $ ""7++++  	  	  	 MW& W W W  
 << ""7++++  	  	  	 MWv W W W  
 <<""7++++	  ""7++++s0   A< <1D-D 	0D9D DD D5choiceslist[TritonTemplateCaller]!dict[TritonTemplateCaller, float]c           	     |    t          t          || j                            | j        |                              }|S )z>
        Benchmark each choice in a separate process.
        )dictzipr   mapr   )rV   r   resultss      r0   r   zTuningProcessPool.benchmarkD  s4     s7DM$5$5dk7$K$KLLMMr/   Nr   )r7   r   )r   r!   r7   r   r   r   r7   r   )
r+   r,   r-   r   rW   r   r   r   r   r   r.   r/   r0   r   r      s         E E E E& " " " \"(	 	 	 	, , , ,>     r/   r   c                  p    e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   dZded<   edd            ZddZdS )
TensorMetaztorch.devicerR   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetNzOptional[str]nameirnodes/Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]r7   #Union[TensorMeta, list[TensorMeta]]c           
          t          |t                    r+ fd|D             }t          d |D                       sJ |S |}t          |t          j                  rt          j        d|          }|                                }|J |                                }|J t          ||t          j
        j                            |                                t          j                  t          j
        j                            |                                t          j                  t          j
        j                            |                                j        t          j                  |                                          S )Nc                :    g | ]}                     |          S r.   )from_irnodes)r   xclss     r0   r   z+TensorMeta.from_irnodes.<locals>.<listcomp>d  s'     F F F!1!1!!4!4 F F Fr/   c              3  @   K   | ]}t          |t                    V  d S r;   )r   r   r   r   s     r0   	<genexpr>z*TensorMeta.from_irnodes.<locals>.<genexpr>e  s,      AAQz!Z00AAAAAAr/   fake)r   layout)fallback)rR   r   r   r   r   r   )r   r   allr   LayoutBuffer	get_dtype
get_devicer   r%   graphsizevars
size_hintsget_sizer#   unbacked_symint_fallback
get_stride	size_hint
get_layoutr   get_name)r   r   r@   noder   rR   s   `     r0   r   zTensorMeta.from_irnodes_  sz    gx(( 	 F F F Fg F F FFAA&AAAAAAAAMdBI&& 	79&666D     ""!!!'"--8 .   G$//!!8 0   7#--!!(8 .   
 
 
 	
r/   torch.Tensorc                \    t          | j        | j        | j        | j        | j                  S )N)rR   r   
extra_size)r   r   r   rR   r   r   r   s    r0   	to_tensorzTensorMeta.to_tensor  s2    JL;*{
 
 
 	
r/   )r   r   r7   r   )r7   r   )r+   r,   r-   __annotations__r   classmethodr   r   r.   r/   r0   r   r   V  s         ((((++++KKKD!
 !
 !
 [!
F
 
 
 
 
 
r/   r   c                  F    e Zd ZdZddZddZddZddddZddddZdS )BenchmarkRequesta1  
    Only handle triton template benchmark for now. The extern kernel benchmark
    can be done inside the same process since they usually don't cause crash.

    Important: Instances of this class and subclasses have to be serializable
    across process boundaries. Do not put CUDA Tensors in here!
    kernel_namero   input_tensor_metar   output_tensor_meta
extra_argsIterable[Any]r7   r8   c                   || _         t          |t                    r|g}|| _        t          t          t
          f          r8t                    dk    rt          fdD                       sJ d         | _        || _	        d S )Nr"   c              3  r   K   | ]1}d D ],}t          d         |          t          ||          k    V  -2dS ))rR   r   r   r   r   r   N)getattr)r   r   attrr   s      r0   r   z,BenchmarkRequest.__init__.<locals>.<genexpr>  sm         Q   .q1488GAt<L<LL      r/   r   )
r   r   r   r   tupler   r   r   r   r   )rV   r   r   r   r   s      ` r0   rW   zBenchmarkRequest.__init__  s     ''44 	4!2 3!2(5$-88 	7%&&**    /       
 "4A!6"4$r/   input_tensorsr   outCallable[[], None]c                   t           r;   NotImplementedErrorrV   r   r   s      r0   make_run_fnzBenchmarkRequest.make_run_fn  s
     "!r/   c                    d S r;   r.   r   s    r0   cleanup_run_fnzBenchmarkRequest.cleanup_run_fn  s    r/   Nr   Optional[torch.Tensor]r   c                   t           r;   r  rV   fnr   r   s       r0   do_benchzBenchmarkRequest.do_bench  s
     "!r/   c                  t                               t          j                  }|rt	          j                    }|Lt          |          dk    sJ t          d | j        D                       }| j        	                                }|r)t	          j                    |z
  }t	          j                    }	  | j
        |d|i}n9# t          $ r, t                               d           t          d          cY S w xY w|r)t	          j                    |z
  }t	          j                    } | j        |g||R  }|rAt	          j                    |z
  }	t                               dt!          |           |||	           |                                  |S )Nr   c              3  >   K   | ]}|                                 V  d S r;   )r   r   s     r0   r   z-BenchmarkRequest.benchmark.<locals>.<genexpr>  s*      !P!PA!++--!P!P!P!P!P!Pr/   r   z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)rC   isEnabledForloggingDEBUGtimer   r   r   r   r   r  r)   infor   r  rD   ro   r  )
rV   r   r   rD   start_tscreate_tensor_elapser  load_elapseresbench_elapses
             r0   r   zBenchmarkRequest.benchmark  s   
 ++GM:: 	#y{{H ;}%%****!!P!P9O!P!P!PPPM)3355C 	##'9;;#9 y{{H	 !!=:c::BB0 	  	  	  RSSS<<	 
  	#)++0Ky{{HdmB44444 	9;;1L  HD		$   	
s   4C 3C87C8)
r   ro   r   r   r   r   r   r   r7   r8   r   r   r   r   r7   r  r   r   r   r   r
  r7   r   )	r+   r,   r-   r   rW   r  r  r  r   r.   r/   r0   r   r     s         % % % %6" " " "
    '+	" " " " " " '+) ) ) ) ) ) ) )r/   r   c                  4    e Zd ZdZ	 	 	 	 	 dddZddddZdS )_TestBenchmarkRequestz
    Supports unit testing. Defined in this file instead of the test file so the
    TuningProcess sub-process can unpickle these objects.
            NFr@   r   rR   rS   sleepOptional[float]excOptional[Exception]crashr|   c                L    || _         || _        || _        || _        || _        d S r;   )r@   rR   r   r"  r$  )rV   r@   rR   r   r"  r$  s         r0   rW   z_TestBenchmarkRequest.__init__  s+     



r/   r	  r   r   r   r
  r7   c               *   | j         =t          j                            t          d           t          | j                   k    sJ | j        rt          j        | j                   | j        r| j        | j	        rt          j        d           | j        S )Nr"   )rR   rE   rG   rH   r&   ro   r   r  r"  r$  rm   exitr@   r  s      r0   r   z_TestBenchmarkRequest.benchmark  s     ;":>>"6==T[AQAQQQQQ: 	#Jtz"""8 	(N: 	HQKKK{r/   )r  NNNF)
r@   r   rR   rS   r   r!  r"  r#  r$  r|   r  )r+   r,   r-   r   rW   r   r.   r/   r0   r  r    sm           $!%#'     KO       r/   r  c                      e Zd Zddd
d	ZdS )GPUDeviceBenchmarkMixinNr	  r   r   r   r
  r7   r   c                  t          d g ||D                       }t          |          dk    sJ d|             t          d |D             d          }t          |          }t          |          dk    rt          t	          |                    }n|                                }|                    |          5  t          j        |          }|	                                 d d d            n# 1 swxY w Y   |S )Nc              3     K   | ]O}t          |t          j                  r3t          |j        j                  r|j        j        A|j        j        V  Pd S r;   )r   torchTensorr   rR   typeindexr   tensors     r0   r   z3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>  sm       $
 $
&%,//$
 v})**	$

 #/	 M 0///$
 $
r/   r"   zCan not mix devices c              3  `   K   | ])}t          |j        j                  |j        j        V  *d S r;   )r   rR   r.  r0  s     r0   r   z3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>   sN        &-,--"     r/   cuda)
r   r   nextr   itercurrent_devicerR   r$   benchmark_gpusynchronize)	rV   r  r   r   device_idx_setdevice_typer   
device_idxr  s	            r0   r  z GPUDeviceBenchmarkMixin.do_bench  sr    $ $
 $
/M/3/$
 $
 $
 
 
 >""a''')P)P)P''' +  
 
 
 4K@@~!##d>2233JJ)88::J$$Z00 	+ 	++B//C((***	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 
s    )C55C9<C9r  r+   r,   r-   r  r.   r/   r0   r)  r)    s7        
 '+	       r/   r)  c                      e Zd Zddd
d	ZdS )CPUDeviceBenchmarkMixinNr	  r   r   r   r
  r7   r   c               *    t          j        |          S r;   )r$   benchmark_cpur  s       r0   r  z CPUDeviceBenchmarkMixin.do_bench4  s     (,,,r/   r  r<  r.   r/   r0   r>  r>  3  s7        
 '+	- - - - - - - -r/   r>  c                  B     e Zd Z	 	 	 	 	 dd fdZddZd Zd dZ xZS )!TritonBenchmarkRequestr   r   ro   r   r   r   r   r   module_pathmodule_cache_key
num_stagesr   	num_warpsnum_consumer_groupsnum_buffers_warp_specmatrix_instr_nonkdimwaves_per_eukpackr7   r8   c                    t                                          ||||           || _        || _        || _        || _        |	| _        |
| _        || _        || _	        || _
        d S r;   )superrW   rC  rD  rE  rF  rG  rH  rI  rJ  rK  )rV   r   r   r   r   rC  rD  rE  rF  rG  rH  rI  rJ  rK  	__class__s                 r0   rW   zTritonBenchmarkRequest.__init__@  sn      	&79KZXXX& 0$"#6 %:"$8!(


r/   r   r   r   r  c                  t          j        | j        | j                  }t                              d| j        | j                   t          || j                  j        }t          | j
                  }d|j        _        i }dd l}d|                    |          j        v rd|d<   |j        j        dk    rd}n?|j        j        }	t%          |	          }
|
                    | j        j        j                  }t-          t          || j                  t.          j        j        j        j                  rt9          j        |g|||R i |d|iS t9          j        |g|||R i ||ddS )	Nz"benchmark module key: %s, path: %sFr   warmupcpustreamT)rR  benchmark_run)r   load_by_key_pathrD  rC  rC   rD   r   r   runr   r   __self__with_bandwidth_infoinspect	signature
parametersrR   r.  r   get_raw_streamr   r/  r   r,  	_inductorruntimetriton_heuristicsDebugAutotuner	functoolspartial)rV   r   r   mod
run_methodr   
warmup_argrX  rR  r:  r   s              r0   r  z"TritonBenchmarkRequest.make_run_fn[  s    *4+@$BRSS0!	
 	
 	
 S$"2337
$/**
27
/ 
w((44???#(Jx :?e##FF*/K7DD%44'.4 F C)**O#5D
 
 	 $  	  
       $  	  
  "    r/   c                    t          j        | j        | j                  }t	          || j                                                   d S r;   )r   rT  rD  rC  r   r   
precompile)rV   rb  s     r0   rf  z!TritonBenchmarkRequest.precompile  s?    *4+@$BRSST%&&1133333r/   c                6    d| j         d| j        d| j        S )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r   rC  rD  r   s    r0   __str__zTritonBenchmarkRequest.__str__  s)    T$"TTt'7TTD<QTTTr/   )r   r   r   r   r   )r   ro   r   r   r   r   r   r   rC  ro   rD  ro   rE  r   rF  r   rG  r   rH  r   rI  r   rJ  r   rK  r   r7   r8   r  r7   ro   )r+   r,   r-   rW   r  rf  ri  __classcell__rN  s   @r0   rB  rB  =  s         $%%&$%      64 4 4 4l4 4 4U U U U U U U Ur/   rB  c                      e Zd ZdS )TritonGPUBenchmarkRequestNr*   r.   r/   r0   rn  rn    r1   r/   rn  c                      e Zd ZdS )TritonCPUBenchmarkRequestNr*   r.   r/   r0   rp  rp    r1   r/   rp  c                  P     e Zd ZdZd fdZd ZddZddZd ZddZ	ddZ
 xZS )CUDABenchmarkRequestae  
    A class to handle CUDA (CUTLASS) benchmark requests. This class is for
    managing the lifecycle of a CUDA kernel benchmark, including compiling
    the source code, managing workspace memory, and executing the kernel.

    Important: Instances of this class have to be serializable across
    process boundaries. Do not put CUDA Tensors in here!
    r   ro   r   r   r   r   r   source_coder7   r8   c                    t                                          ||||           || _        d| _        d | _        d | _        d| _        d| _        d| _        t          j
        | j        d          \  | _        | _        d S )Nr   F so)rM  rW   rs  workspace_size	workspaceDLL_workspace_size_updatedhash_keysource_filer   writerV   r   r   r   r   rs  rN  s         r0   rW   zCUDABenchmarkRequest.__init__  s     	&79KZXXX&#$15)-',$ "*7*=d>NPT*U*U't'''r/   c                    t                               d|            t          j        | j        d           t                               d|            dS )z
        Precompile the CUDA source code to populate the CUDACodeCache.
        This may happen in a separate thread pool.
        Precompiling %srv  Done precompiling %sN)rC   rD   r   compilers  r   s    r0   rf  zCUDABenchmarkRequest.precompile  sN    
 	.555d.5553T:::::r/   r   r   r   r  c          	     \  
 |                                   |                                  d t          |          |gz   D             }t                              d| j        | j        | j        | j        || j	                   t          t          j                                        j                  }t          | j        | j                  }t          d          }| j        dk    r\t          j        | j        dz   dz  t          j        |j                  | _        t          | j                                                  }t-          j        |g|| j	        d||R  }	  |             nA# t0          $ r4}t3          |          

fd}	|                                  |	cY d}~S d}~ww xY w|S )	zc
        Create a function to run the CUDA kernel with the given input and output tensors.
        c                P    g | ]#}t          |                                          $S r.   )r	   data_ptrr0  s     r0   r   z4CUDABenchmarkRequest.make_run_fn.<locals>.<listcomp>  s*    VVV**++VVVr/   zqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         )r   rR   Nc                 "    t                     r;   )RuntimeError)err_msgs   r0   raise_runtime_errorz=CUDABenchmarkRequest.make_run_fn.<locals>.raise_runtime_error  s    "7+++r/   )ensure_dll_loadedupdate_workspace_sizer   rC   rD   r   r|  r{  ry  r   r	   r,  r3  current_streamcuda_streamr   rw  zerosfloat64rR   rx  r  r`  ra  r  ro   r  )rV   r   r   args
stream_ptrrc  workspace_ptrretrA   r  r  s             @r0   r  z CUDABenchmarkRequest.make_run_fn  s    	   ""$$$VV$}:M:MQTPU:UVVVMHO	
 	
 	
 ej7799EFF
TXt'788
 """[$q(Q.mz  DN
 %T^%<%<%>%>??M 

 _
 	

 
 
 
 
		'CEEEE 	' 	' 	'!ffG, , , , , !!!&&&&&&&	' 
s    
E+ +
F)5)F$F)$F)c           
        | j         rd S |                                  t          t                              d | j        D                                 }d t          |dz             D             }t          t          j	        
                                j                  }t          | j        | j                  }t                      } |g || j        t#          |          d |R   t          j	                                         |j        | _        t*                              d| j        | j        | j        | j        | j        || j                   d| _         d S )Nc              3  $   K   | ]}|j         V  d S r;   )r   )r   metas     r0   r   z=CUDABenchmarkRequest.update_workspace_size.<locals>.<genexpr>  s$      GG$)GGGGGGr/   c                ,    g | ]}t          d           S r;   )r	   )r   _s     r0   r   z>CUDABenchmarkRequest.update_workspace_size.<locals>.<listcomp>  s    FFF1FFFr/   r"   zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)rz  r  r   r   fromkeysr   r   r	   r,  r3  r  r  r   ry  r   r   r   r   r8  valuerw  rC   rD   r|  r{  )rV   unique_input_countr  r  rc  c_workspace_sizes         r0   r  z*CUDABenchmarkRequest.update_workspace_size  s~   ' 	F    MMGG0FGGGGG
 
 GF.@1.D(E(EFFFej7799EFF
TXt'788
#::
 	
	
_	
   	
 	
 	
 	
 	
 	
 	
   .4 hMHO		
 		
 		
 (,$$$r/   c                r    | j         /t          j        | j        d          \  | _         | _        | _        d S d S )Nrv  )ry  r   rQ   rs  r{  r|  r   s    r0   r  z&CUDABenchmarkRequest.ensure_dll_loaded   s@    88E8J $9 95DHdmT%5%5%5 r/   c                b    | j          | j                                          d | _         d | _        d S r;   )ry  rt   rx  r   s    r0   r  z#CUDABenchmarkRequest.cleanup_run_fn&  s.    8HNNDHr/   c                6    d| j         d| j        d| j        S )Nrh  z, self.source_file=z, self.hash_key=)r   r|  r{  r   s    r0   ri  zCUDABenchmarkRequest.__str__,  s(    L$"LLt'7LLDMLLLr/   r   ro   r   r   r   r   r   r   rs  ro   r7   r8   r  r   rj  )r+   r,   r-   r   rW   rf  r  r  r  r  ri  rk  rl  s   @r0   rr  rr    s         V V V V V V$; ; ;4 4 4 4l", ", ", ",H     M M M M M M M Mr/   rr  c                  >     e Zd Zd fdZd ZddZddZddZ xZS )CppBenchmarkRequestr   ro   r   r   r   r   r   rs  r7   r8   c                    t                                          ||||           || _        t          |          | _        d | _        d S r;   )rM  rW   rs  r   r{  ry  r~  s         r0   rW   zCppBenchmarkRequest.__init__4  sG     	&79KZXXX& --6:r/   c                    t                               d|            t          j        | j        d           t                               d|            d S )Nr  rQ  r:  r  )rC   rD   r   rQ   rs  r   s    r0   rf  zCppBenchmarkRequest.precompileA  sQ     	.555$*>>>>3T:::::r/   r   r   r   r  c                  t          j        | j        d          | _        d t	          |          |gz   D             }t
                              d| j        | j        || j                   t          | j        | j                  }t          d | j        D                       sJ t          j        gt          |          t          t	          | j                            z   z  |_        t          j        |g|| j        R  S )NrQ  r  c                6    g | ]}|                                 S r.   )r  r0  s     r0   r   z3CppBenchmarkRequest.make_run_fn.<locals>.<listcomp>M  s"    LLLf!!LLLr/   zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%sc              3  J   K   | ]}t          |t          j                  V  d S r;   )r   ctypesc_ulonglong)r   args     r0   r   z2CppBenchmarkRequest.make_run_fn.<locals>.<genexpr>W  s/      RR3:c6#566RRRRRRr/   )r   rQ   rs  ry  r   rC   rD   r   r   r   r   r  r  r   argtypesr`  ra  )rV   r   r   r  rc  s        r0   r  zCppBenchmarkRequest.make_run_fnH  s     $T%55IIILL]0C0Cse0KLLLXHO	
 	
 	
 TXt'788
RR$/RRRRRRRR%12IID11222


  

 _
 
 
 	
r/   c                z    | j         1	 t          | j         d          r| j                                          d S d S d S )Nrt   )ry  hasattrrt   r   s    r0   r  z"CppBenchmarkRequest.cleanup_run_fnc  sO    8 tx)) !       ! !r/   c                    d| j         S )Nrh  )r   r   s    r0   ri  zCppBenchmarkRequest.__str__k  s    %$"%%%r/   r  r  r   rj  )	r+   r,   r-   rW   rf  r  r  ri  rk  rl  s   @r0   r  r  0  s        ; ; ; ; ; ;; ; ;
 
 
 
6! ! ! !& & & & & & & &r/   r  c                  4     e Zd ZdZd fdZddZddZ xZS )CuteDSLBenchmarkRequestz;Benchmark request for CuteDSL (CUTLASS Python DSL) kernels.r   ro   r   r   r   r   tuple[Any, ...]rs  r    r7   r8   c                    t                                          ||||           |                                }t          j        |          \  | _        | _        d S r;   )rM  rW   finalize_allr   r}  rD  rC  )rV   r   r   r   r   rs  finalized_coderN  s          r0   rW   z CuteDSLBenchmarkRequest.__init__r  sW     	&79KZXXX$11332=2CN2S2S/t///r/   r   r   r   r  c                  t          j        | j        | j                  ddlm} | j         d| }t          |          s0fdt                    D             }t          d| d|           t          |          fd}|S )z
        Create a function to run the CuteDSL kernel with the given input and output tensors.
        Similar to TritonBenchmarkRequest.make_run_fn but for CuteDSL kernels.
        r"   )MAIN_SUFFIXr  c                N    g | ]!}t          t          |                    |"S r.   )callabler   )r   r   rb  s     r0   r   z7CuteDSLBenchmarkRequest.make_run_fn.<locals>.<listcomp>  s0    SSS$hwsD?Q?Q6R6RSSSSr/   z-Could not find CuteDSL main kernel function 'z'. Available callables: c                 z    t          d          } |                     j        j                  } g R d|iS )Nr3  rR  )r   r[  rR   r/  )r   rR  r   kernel_funcr   s     r0   
run_kernelz7CuteDSLBenchmarkRequest.make_run_fn.<locals>.run_kernel  sL    7??%44SZ5EFFF;BBsBBB6BBBr/   )r   rT  rD  rC  codegen.cutedsl.cutedsl_kernelr  r   r  dirr  r   )	rV   r   r   r  main_func_name	availabler  r  rb  s	    ``    @@r0   r  z#CuteDSLBenchmarkRequest.make_run_fn  s     *4+@$BRSS 	@????? ,<<{<<sN++ 	SSSS#c((SSSIssshqss   c>22	C 	C 	C 	C 	C 	C 	C
 r/   c                    dS )z*Clean up any resources used by the kernel.Nr.   r   s    r0   r  z&CuteDSLBenchmarkRequest.cleanup_run_fn  s      r/   )r   ro   r   r   r   r   r   r  rs  r    r7   r8   r  r   )r+   r,   r-   r   rW   r  r  rk  rl  s   @r0   r  r  o  sq        EET T T T T T   :9 9 9 9 9 9 9 9r/   r  r7   c                 T    t                      } t          j        | j                   | S r;   )r   atexitrk   r   )pools    r0   get_tuning_process_poolr    s$    D
ODM"""Kr/   r   r   r   c                D    t                                          |           S )zO
    Do benchmarking in a subprocess and return the perf number (latency).
    )r  r   )r   s    r0   benchmark_in_sub_processr    s     #$$..w777r/   )r7   r   r   )Z
__future__r   r  r  dataclassesr`  r  rE   rM   r   rh   rq   rm   r  r   collections.abcr   r   concurrent.futuresr   r   r   r	   r
   typingr   r   r   r   r   r   r,  torch._inductor.async_compiletorch._dynamo.device_interfacer   torch._dynamo.testingr   torch._inductorr   torch._inductor.codecacher   r   r   r   r   torch._inductor.utilsr   r   r   r   torch._loggingr   torch.utils._ordered_setr   typesr    torch._inductor.select_algorithmr    r!   ru  r#   runtime.benchmarkingr$   virtualizedr%   r&   r+   rC   r=   r)   r3   r   r   r   LayoutOrBuffer	dataclassr   r   r  r)  r>  rB  rn  rp  rr  r  r  cacher  r  r.   r/   r0   <module>r     s   " " " " " "            				           



   . . . . . . . . 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 D D D D D D D D D D D D D D D D  $ $ $ $ C C C C C C . . . . . .                               - , , , , , / / / / / /  U      TTTTTTTT       - - - - - -       . ""8\::	 	 	 	 		 	 	 	i i i i i i i iXe e e e e e e eP ry")+, 3
 3
 3
 3
 3
 3
 3
 3
l ] ] ] ] ] ] ] ]@    ,   D               F- - - - - - - -YU YU YU YU YU- YU YU YUx	 	 	 	 	 79O 	 	 		 	 	 	 	 79O 	 	 	LM LM LM LM LM24D LM LM LM^<& <& <& <& <&13C <& <& <&~.9 .9 .9 .9 .957G .9 .9 .9b    8 8 8 8 8 8r/   