
    -`if                        d dl Z d dlZd dlZd dlmZ d dl mZ d dlmZmZ d dl	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ  ee          Z G d d          Z G d d          Z de!e         fdZ"de#e$ef         de%fdZ& G d de          Z' G d de          Z(de#e$ef         deddfdZ)ddZ* G d de          Z+dS )    N)Callable)	ExitStack)AnyLiteral)patch)compilation_counter)
VllmConfig)Range)init_logger)	safe_hash)is_torch_equal_or_newerc                      e Zd ZU dZeed<   	 ddedededd	fd
ZdedefdZ		 dde
j        dee         deeef         deded	z  deedef         d	z  ed	z  f         fdZdede
j        dee         dedededef         fdZd	S )CompilerInterfacez@
    The interface for a compiler that can be used by vLLM.
    nameF 	cache_dirdisable_cacheprefixreturnNc                     dS )a\  
        when the vLLM process uses `cache_dir` as the cache directory,
        the compiler should initialize itself with the cache directory,
        e.g. by re-directing its own cache directory to a sub-directory.

        prefix can be used in combination with cache_dir to figure out the base
        cache directory, e.g. there're multiple parts of model being compiled,
        but we want to share the same cache directory for all of them.

        e.g.
        cache_dir = "/path/to/dir/backbone", prefix = "backbone"
        cache_dir = "/path/to/dir/eagle_head", prefix = "eagle_head"
        N selfr   r   r   s       w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/compilation/compiler_interface.pyinitialize_cachez"CompilerInterface.initialize_cache#   s	      	    vllm_configc                     dS )a  
        Gather all the relevant information from the vLLM config,
        to compute a hash so that we can cache the compiled model.

        See [`VllmConfig.compute_hash`][vllm.config.VllmConfig.compute_hash]
        to check what information
        is already considered by default. This function should only
        consider the information that is specific to the compiler.
        r   r   )r   r   s     r   compute_hashzCompilerInterface.compute_hash5   s	     rr   graphexample_inputscompiler_configcompile_rangekey.c                     dS )ab  
        Compile the graph with the given example inputs and compiler config,
        with a range. The `compile_range` specifies the range of the inputs,
        it could be concrete size (if compile_sizes is provided), e.g. [4, 4]
        or a range [5, 8].
        Right now we only support one variable in ranges for all inputs,
         which is the batchsize (number of tokens) during inference.

        Dynamo will make sure `graph(*example_inputs)` is valid.

        The function should return a compiled callable function, as well as
        a handle that can be used to directly load the compiled function.

        The handle should be a plain Python object, preferably a string or a
        file path for readability.

        If the compiler doesn't support caching, it should return None for the
        handle. If the compiler fails to compile the graph, it should return
        None for the compiled function as well.

        `key` is required for StandaloneInductorAdapter, it specifies where to
        save the compiled artifact. The compiled artifact gets saved to
        `cache_dir/key`.
        NNr   r   r    r!   r"   r#   r$   s         r   compilezCompilerInterface.compileA   s
    @ zr   handlegraph_indexc                      t          d          )z
        Load the compiled function from the handle.
        Raises an error if the handle is invalid.

        The handle is the second return value of the `compile` function.
        zcaching is not supported)NotImplementedError)r   r)   r    r!   r*   r#   s         r   loadzCompilerInterface.loadc   s     ""<===r   Fr   N)__name__
__module____qualname____doc__str__annotations__boolr   r	   r   fxGraphModulelistr   dictr
   tupler   r(   intr-   r   r   r   r   r      sq          III JL -1CF	   $

 
s 
 
 
 
$    ~  S	  c3h	 
   4Z  
xS!D(#*4	5       D>> ~> S		>
 > > 
#s(	> > > > > >r   r   c                   z    e Zd ZdZddZdededed         fdZdededee         fd	Z	dededed
         fdZ
dS )AlwaysHitShapeEnva  
    Why do we need this class:

    For normal `torch.compile` usage, every compilation will have
    one Dynamo bytecode compilation and one Inductor compilation.
    The Inductor compilation happens under the context of the
    Dynamo bytecode compilation, and that context is used to
    determine the dynamic shape information, etc.

    For our use case, we only run Dynamo bytecode compilation once,
    and run Inductor compilation multiple times with different shapes
    plus a general shape. The compilation for specific shapes happens
    outside of the context of the Dynamo bytecode compilation. At that
    time, we don't have shape environment to provide to Inductor, and
    it will fail the Inductor code cache lookup.

    By providing a dummy shape environment that always hits, we can
    make the Inductor code cache lookup always hit, and we can
    compile the graph for different shapes as needed.

    The following dummy methods are obtained by trial-and-error
    until it works.
    r   Nc                     g | _         d S r/   )guards)r   s    r   __init__zAlwaysHitShapeEnv.__init__   s    !#r   argskwargsTc                     dS )NTr   r   rB   rC   s      r   evaluate_guards_expressionz,AlwaysHitShapeEnv.evaluate_guards_expression   s    tr   c                     g S r/   r   rE   s      r   get_pruned_guardsz#AlwaysHitShapeEnv.get_pruned_guards   s    	r   r   c                     dS )Nr   r   rE   s      r   produce_guards_expressionz+AlwaysHitShapeEnv.produce_guards_expression   s    rr   r   N)r0   r1   r2   r3   rA   r   r   rF   r9   rH   rJ   r   r   r   r>   r>   t   s         0$ $ $ $ s wt}    s c d3i    s c gbk      r   r>   r   c                      g } ddl m} |                                }|                     |           ddl m}  |            }|                     |           | S )Nr   )	CacheBase)	torch_key)torch._inductor.codecacherM   
get_systemappendrN   )factorsrM   system_factorsrN   torch_factorss        r   get_inductor_factorsrU      sr    G333333))++NNN>""" 433333IKKMNN=!!!Nr   vllm_additional_inductor_configc                 |    |                      dd          }t          j         ot          j        j        j         o| S )Nforce_disable_cachesF)getenvsVLLM_DISABLE_COMPILE_CACHEtorch	_inductorconfigrX   )rV   "vllm_inductor_config_disable_caches     r   is_compile_cache_enabledr`      sN     *I)L)L* *& ++ 	3&;;	322r   c                       e Zd ZdZdZded         ddfdZdedefd	Z		 ddede
deddfdZ	 ddej        dee         deeef         dededz  deedef         dz  edz  f         fdZdedej        dee         dedededef         fdZdS )InductorStandaloneAdaptorz
    The adaptor for the Inductor compiler.
    Requires PyTorch 2.8+.
    This is not on by default yet, but we plan to turn it on by default for
    PyTorch 2.8.

    Use VLLM_USE_STANDALONE_COMPILE to toggle this on or off.
    inductor_standalonesave_format)binaryunpackedr   Nc                     || _         d S r/   )rd   )r   rd   s     r   rA   z"InductorStandaloneAdaptor.__init__   s    &r   r   c                     t                      }t          t          |                                          d                                          d d         }|S NF)usedforsecurity
   rU   r   r4   encode	hexdigestr   r   rR   hash_strs       r   r   z&InductorStandaloneAdaptor.compute_hash   S    &((!LL!!5
 
 

)++crc r   Fr   r   r   r   c                     || _         d S r/   )r   r   s       r   r   z*InductorStandaloneAdaptor.initialize_cache   s     #r   r    r!   r"   r#   r$   .c                    t           xj        dz  c_        i }||                    |           t          ||           t	                       |                                rd}nd}ddlm} t          d          }	|	s&t          j
        rt                              d           |d|id	}
|	ot          j
        }|rd
|
d<    |||fi |
}|r.ddlm} t          ||          sJ t!          |d          sJ |d fS |J t"          j                            | j        |          }t+          |          r1|                    || j                   t           xj        dz  c_        |||ffS )N   from_example_inputs
from_graphr   )standalone_compilez
2.10.0.devzCRITICAL: VLLM_USE_MEGA_AOT_ARTIFACT is enabled but PyTorch version does not support 'aot' parameter in standalone_compile. This requires PyTorch 2.10.0+. Falling back to non-AOT mode.config_patches)dynamic_shapesoptionsTaot)AOTCompiledArtifact	serializepathformat)r   num_inductor_compilesupdateset_inductor_configset_functorch_configis_single_sizetorch._inductorrw   r   rZ   VLLM_USE_MEGA_AOT_ARTIFACTloggererror"torch._inductor.standalone_compiler|   
isinstancehasattrosr   joinr   r`   saverd   num_compiled_artifacts_saved)r   r    r!   r"   r#   r$   current_configry   rw   supports_aotcompile_kwargsuse_aotcompiled_graphr|   r   s                  r   r(   z!InductorStandaloneAdaptor.compile   s    	11Q611&!!/222NM:::'')) 	*2NN)N666666.|<< 	 ? 	LL9   - .
 
 %H)H  	)$(N5!++E>TT^TT 	(NNNNNNn.ABBBBB>;77777 "4'' w||DNC00#O44 	BT$2BCCC<<A<<T{**r   r)   r*   c                   	
 t          |t                    sJ t          |d         t                    sJ t          |d         t                    sJ |d         }t          j        j                            || j                  	ddlm	}  ||          
dt          dt          t          df         t          z  f	
fd}|S )	Nr   rt   r~   graph_returns_tuplerB   r   .c                  &     |  }r|S |d         S Nr   r   )rB   graph_outputinductor_compiled_graphreturns_tuples     r   compiled_graph_wrapperz>InductorStandaloneAdaptor.load.<locals>.compiled_graph_wrapper,  s*    22D9L  '###A&r   )r   r;   r4   r\   r]   CompiledArtifactr-   rd   torch._inductor.compile_fxr   r   )r   r)   r    r!   r*   r#   r   r   r   r   r   s            @@r   r-   zInductorStandaloneAdaptor.load  s     &%(((((&)S)))))&)S)))))ay"'/"B"G"Gd. #H #
 #
 	CBBBBB++E22	'# 	'%S/C2G 	' 	' 	' 	' 	' 	' 	' &%r   r.   r/   )r0   r1   r2   r3   r   r   rA   r	   r4   r   r6   r   r7   r8   r9   r   r:   r
   r;   r   r(   r<   r-   r   r   r   rb   rb      s         !D'G,@$A 'd ' ' ' '
 s     JL# ##-1#CF#	# # # # A+ A+~A+ S	A+ c3h	A+
 A+ 4ZA+ 
xS!D(#*4	5A+ A+ A+ A+F&& ~& S		&
 & & 
#s(	& & & & & &r   rb   c                   &   e Zd ZdZdZdedefdZ	 dded	ed
eddfdZ		 dde
j        dee         deeef         dededz  deedef         dz  edz  f         fdZdede
j        dee         dedededef         fdZdej        e         fdZdS )InductorAdaptorzG
    The adaptor for the Inductor compiler, version 2.5, 2.6, 2.7.
    inductorr   r   c                     t                      }t          t          |                                          d                                          d d         }|S ri   rl   ro   s       r   r   zInductorAdaptor.compute_hash@  rq   r   Fr   r   r   r   Nc                    || _         || _        |r|d t          |                    n|| _        |rd S t          j                            | j        d          }t	          j        |d           |t          j        d<   t          j                            | j        d          }t	          j        |d           |t          j        d<   d S )Ninductor_cacheT)exist_okTORCHINDUCTOR_CACHE_DIRtriton_cacheTRITON_CACHE_DIR)	r   r   lenbase_cache_dirr   r   r   makedirsenviron)r   r   r   r   r   r   s         r   r   z InductorAdaptor.initialize_cacheG  s     #;APi3v;;,77y 	F
 d&9;KLL
NT22220>
,-w||D$7HH
L40000)5
%&&&r   r    r!   r"   r#   r$   .c                     t           xj        dz  c_        ddlm} i }||                    |           d|d<   d|d<   t          ||           t                       t          j        |          }d\  dd	l	m
}m t          j                            d
          r=|j        d}	dt           dt           dt           f fd}
t          j        j        j        }n/t          j        dk    rd }	dt           dt           dt           f fd}dt           dt           dt           ffd}dt           dt           dd fd}dt&          fd}t)                      5 }|	#|                    t-          |	|
                     |                    t-          d|                     |                    t-          d|                     ddlm} t3          |d          r#|                    t-          d|                     |                    t-          d|                     |                                                                t7          d          r|                    t          j        j                            d                     |                    t          j        j                            d                     |                    t          j        j                            d                      |||||          }d d d            n# 1 swxY w Y   t=          |          rt?          d          
J d             |ffS )!Nrt   r   )
compile_fxTfx_graph_cacheFfx_graph_remote_cacher&   )FxGraphCachecompiled_fx_graph_hash2.5z+torch._inductor.codecache.FxGraphCache.loadrB   rC   r   c                  6    | i |}|j         }|j        j                            j                  sb|j        [|j        D ]S}t          |j                  s|j        j        j                            j                  r|j        j        j         nT|S r/   )current_callable__code__co_filename
startswithr   __closure__callablecell_contents)rB   rC   r   compiled_fncell	file_pathoriginal_loadr   s        r   hijack_loadz,InductorAdaptor.compile.<locals>.hijack_load  s    *7-*H*H*H'5F'0<	!,,T-@AA"#/; !, 7 " "'(:;; %$-6BMM /  " )-(:(C(OI!E" /.r   2.6c                  b   t          j        j        j        | i |}|}||j        }|j        j                            	j                  sZ|j	        S|j	        D ]K}t          |j                  s|j        j        }|j                            	j                  r	|j         nL|j        |S r/   )r\   r]   r   compile_fx_innerr   r   r   r   r   r   r   r   _fx_graph_cache_key)
rB   rC   outputr   r   r   coder   rp   r   s
          r   hijacked_compile_fx_innerz:InductorAdaptor.compile.<locals>.hijacked_compile_fx_inner  s    3DdUfUU*0'*6"9"JK + 4 @I%001DEE&'3? %0$; & &D#+D,>#?#? ) (#'#5#>D#/::4;NOO & -1,<	 %	&
  7JHr   c                  (     | i |}|d         |S r   r   )rB   rC   outr   rp   s      r   hijack_compiled_fx_graph_hashz>InductorAdaptor.compile.<locals>.hijack_compiled_fx_graph_hash  s&    (($9&99C1vHJr   c                      d S r/   r   rB   rC   s     r   _check_can_cachez1InductorAdaptor.compile.<locals>._check_can_cache  s	     Fr   c                      t                      S r/   r>   r   r   r   _get_shape_envz/InductorAdaptor.compile.<locals>._get_shape_env  s    $&&&r   z0torch._inductor.codecache.compiled_fx_graph_hash5torch._inductor.codecache.FxGraphCache._get_shape_envAOTAutogradCacher   Mtorch._functorch._aot_autograd.autograd_cache.AOTAutogradCache._get_shape_envz7torch._inductor.codecache.FxGraphCache._check_can_cache)r   )enable_autograd_cache)enable_remote_autograd_cache)inner_compilerx   zvLLM failed to compile the model. The most likely reason for this is that a previous compilation failed, leading to a corrupted compilation artifact. We recommend trying to remove ~/.cache/vllm/torch_compile_cache and try again to see the real issue. z1failed to get the file path of the compiled graph) r   r   r   r   r   r   r   copydeepcopyrO   r   r   r\   __version__r   r-   r   r]   r   r>   r   enter_contextr   -torch._functorch._aot_autograd.autograd_cacher   r   metrics_contextr   r^   
_functorchr`   RuntimeError)r   r    r!   r"   r#   r$   r   r   r   original_load_namer   r   r   r   r   stackr   r   r   r   rp   r   s   `                 @@@@r   r(   zInductorAdaptor.compileZ  s    	11Q611999999&!!/222 ,0'(27./NM::: e$$ ))RRRRRRRR''.. 7	(-M!N/3 /# /# / / / / / / / /, ).(B(S%%%''!%          4	 	 	 	 	 	 	 	 	 		C 	3 	4 	 	 	 		' 1 	' 	' 	' 	' [[ E	E!-##E*<k$J$JKKK F1    K"    WVVVVV ')9:: ##g&    M$     4 4 6 6777 'u-- ##O*00u0MM   ##$+111NN   ##$+11u1UU   (Z7-	  NAE	 E	 E	 E	 E	 E	 E	 E	 E	 E	 E	 E	 E	 E	 E	P $O44 	".   ((C )(( )444s   F5LLLr)   r*   c                    t          |t                    sJ t          |d         t                    sJ t          |d         t                    sJ |d         }ddlm} ddlm} t                      5 }	|	                    t          dd                      t          |d          r$|	                    t          dd	                      |	                    |                                            t          j                            d
          r%|                    ||dd          
J d            nIt          j        dk    r9ddlm}
  |
|          }|                    ||dd |          \  }
J d            d d d            n# 1 swxY w Y   ddlm}  ||          dt(          dt          t(          df         t(          z  ffd}|S )Nr   rt   r   )r   r   c                      t                      S r/   r   r   s     r   <lambda>z&InductorAdaptor.load.<locals>.<lambda>2  s    ,=,?,? r   r   r   c                      t                      S r/   r   r   s     r   r   z&InductorAdaptor.load.<locals>.<lambda>:  s    0A0C0C r   r   TFzNInductor cache lookup failed. Please remove the cache directory and try again.r   )CompiledFxGraphConstantsWithGmr   rB   r   .c                  P    t          |           } |          }r|S |d         S r   )r9   )rB   	list_argsr   r   r   s      r   r   z,InductorAdaptor.load.<locals>.compiled_graph`  s6    T

I229==L '###A&r   )r   r;   r4   r   r   rO   r   r   r   r   r   r   r\   r   r   _lookup_graphtorch._inductor.output_coder   r   r   r   )r   r)   r    r!   r*   r#   rp   r   r   
exit_stackr   	constants_r   r   r   r   s                  @@r   r-   zInductorAdaptor.load  s    &%(((((&)S)))))&)S)))))!9RRRRRR::::::[[ %	J$$K??    ')9:: ((gCC    $$T%9%9%;%;<<< ++E22 *6*D*DndE+ +' /::: ;::: "e++VVVVVV::5AA	-9-G-GndD). .*' /::: ;::E%	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	Z 	CBBBBB++E22	'# 	'%S/C*? 	' 	' 	' 	' 	' 	' 	' s   6DFFFc                     t          d          r"ddl}|j        j                                        S t          j                    S )a  
        This method returns the Dynamo metrics context (if it exists,
        otherwise a null context). It is used by various compile components.
        Present in torch>=2.6, it's used inside FxGraphCache in
        torch==2.6 (but not after). It might also be used in various other
        torch.compile internal functions.

        Because it is re-entrant, we always set it (even if entering via Dynamo
        and the context was already entered). We might want to revisit if it
        should be set at a different mode of compilation.

        This is likely a bug in PyTorch: public APIs should not rely on
        manually setting up internal contexts. But we also rely on non-public
        APIs which might not provide these guarantees.
        r   r   N)r   torch._dynamo.utils_dynamoutilsget_metrics_context
contextlibnullcontext)r   r\   s     r   r   zInductorAdaptor.metrics_contextl  sF      #5)) 	,&&&&=&::<<<)+++r   r.   r/   )r0   r1   r2   r3   r   r	   r4   r   r6   r   r7   r8   r9   r   r:   r
   r;   r   r(   r<   r-   r   AbstractContextManagerr   r   r   r   r   r   9  s         D
 s     JL6 66-16CF6	6 6 6 62 B5 B5~B5 S	B5 c3h	B5
 B5 4ZB5 
xS!D(#*4	5B5 B5 B5 B5HLL ~L S		L
 L L 
#s(	L L L L\,!B3!G , , , , , ,r   r   r^   r#   c                 n    |                                 r t          j        | d<   t          j        | d<   d S d S )Nmax_autotunecoordinate_descent_tuning)r   rZ   !VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE.VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING)r^   r#   s     r   r   r     sF    ##%% 
 "&!G~? 	*+++	
 
r   c                  N    t           j        sdt          j        j        _        d S d S )NF)rZ   r   r\   r   r^   bundled_autograd_cacher   r   r   r   r     s+    * ?9>666? ?r   c                       e Zd ZdZ	 ddej        dee         dee	ef         de
de	dz  deed	ef         dz  edz  f         fd
ZdS )EagerAdaptoreagerNr    r!   r"   r#   r$   r   .c                 4    t           xj        dz  c_        |d fS )Nrt   )r   num_eager_compilesr'   s         r   r(   zEagerAdaptor.compile  s$     	..!3.. d{r   r/   )r0   r1   r2   r   r7   r8   r9   r   r:   r4   r
   r;   r   r(   r   r   r   r   r     s        D  ~ S	 c3h	
  4Z 
xS!D(#*4	5     r   r   rK   ),r   r   r   collections.abcr   r   typingr   r   unittest.mockr   r\   r   torch.fxr7   	vllm.envsrZ   vllm.compilation.counterr   vllm.configr	   vllm.config.utilsr
   vllm.loggerr   vllm.utils.hashingr   vllm.utils.torch_utilsr   r0   r   r   r>   r9   rU   r:   r4   r6   r`   rb   r   r   r   r   r   r   r   <module>r     s        				 $ $ $ $ $ $                            ! ! ! !             8 8 8 8 8 8 " " " " " " # # # # # # # # # # # # ( ( ( ( ( ( : : : : : :	X		W> W> W> W> W> W> W> W>t# # # # # # # #Ld3i     %)#s(^	   "{& {& {& {& {& 1 {& {& {&|H, H, H, H, H,' H, H, H,V

S#X 
u 
 
 
 
 
? ? ? ?
    $     r   