
    .`i8+                     j    d dl mZ d dlmZmZ d dlmZ d dlmZ  ee	          Z
 G d d          ZdS )    )product)CUDAGraphMode
VllmConfig)BatchDescriptor)init_loggerc                       e Zd ZdZdefdZddZdeded	ede	fd
Z
dede	fdZ	 ddedefdZ	 	 	 ddeded	ededeee	f         f
dZdeeeee	         f                  fdZdS )CudagraphDispatchera  
    Runtime cudagraph dispatcher to dispatch keys for multiple set of
    cudagraphs.

    The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one
    for FULL cudagraph runtime mode. The keys are initialized depending on
    attention support and what cudagraph mode is set in CompilationConfig. The
    keys stored in dispatcher are the only source of truth for valid
    cudagraphs that can be dispatched at runtime.

    At runtime, the dispatch method generates the runtime cudagraph mode (FULL,
    PIECEWISE, or NONE for no cudagraph) and the valid key (batch descriptor)
    based on the input key. After dispatching (communicated via forward
    context), the cudagraph wrappers will trust the dispatch key to either
    capture or replay (if the mode matches), or pass through to the underlying
    runnable without cudagraph (if the mode does not match or mode is NONE).
    vllm_configc                    || _         |j        | _        | j         j        sdnd| j         j        j        z   | _        t
          j        t                      t
          j        t                      i| _	        | j        j
                                        rJ| j                                        s1J d| j        j
         d| j        j         d| j        j                     d| _        t
          j        | _
        d S )N   zCompilation mode should be CompilationMode.VLLM_COMPILE when cudagraph_mode piecewise cudagraphs is used, and attention should be in splitting_ops or inductor splitting should be used. cudagraph_mode=z, compilation_mode=z, splitting_ops=F)r
   compilation_configspeculative_confignum_speculative_tokensuniform_decode_query_lenr   	PIECEWISEsetFULLcudagraph_keyscudagraph_moderequires_piecewise_compilationis_attention_compiled_piecewisemodesplitting_opskeys_initializedNONE)selfr
   s     p/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/cudagraph_dispatcher.py__init__zCudagraphDispatcher.__init__   s   &"-"@ #6PAAT%8OO 	% #SUUJ
 '6UUWW	
&FFHH	
 	
E #5D	E E
 !% 7 <E E "4BE E		
 	
I !&+0    returnNc                    | j         j        }| j         j        }dg|dz   z  | _        t	          ||dz   gz   dg|z             D ]3\  }}t          ||          D ]}||k    r|| j        |<   || j        |<   4| j         j        r]| j        t          j	        k    rJ| j         j        D ]?}|| j         j        k    r)| j        |         }||k    rt          d| d| d          <dS dS dS )z=Pre-compute the mapping from batch size to padded graph size.r   r   zcompile_sizes contains z which would be padded to zw. All compile_sizes must be values that won't be changed by cudagraph padding. Use values from cudagraph_capture_sizes.N)r   max_cudagraph_capture_sizecudagraph_capture_sizes_bs_to_padded_graph_sizeziprangecompile_sizesr   r   r   
ValueError)r   max_sizecapture_sizesendstartbssizepaddeds           r    _compute_bs_to_padded_graph_sizez4CudagraphDispatcher._compute_bs_to_padded_graph_size?   s\   *E/G453(Q,3G%X\N*C-
 
 	< 	<JC E3'' < <;;8=D1"558;D1"55	< #1	#}'999/= 	 	42MMM!:4@F~~(Gd G G)/G G G  	 	99	 	r   
num_tokensuniform_decodehas_lorac                    | j         j        j        }| j        }| j        |         }|r5| j                            t          j                  r||z  }||z  dk    sJ nd}t          ||          }t          ||||          S )Nr   F)r1   num_reqsuniformr3   )r
   scheduler_configmax_num_seqsr   r$   r   has_moder   r   minr   )r   r1   r2   r3   r8   r   num_tokens_paddedr5   s           r   _create_padded_batch_descriptorz3CudagraphDispatcher._create_padded_batch_descriptor_   s     '8E#'#@  9*E 	<d1::=;MNN 	<(,DDH$'??1DDDDD"N,l;;H("	
 
 
 	
r   runtime_modebatch_descriptorc                     |t           j        t           j        fv sJ d|             | j        |                             |           d S )Nz)Invalid cudagraph runtime mode for keys: )r   r   r   r   add)r   r=   r>   s      r   add_cudagraph_keyz%CudagraphDispatcher.add_cudagraph_keyt   sY      79KLLLLFFF MLL 	L)--.>?????r   r   r   r   c           	      F   || _         |t          j        k    r	d| _        d S |                                  | j        j        r| j        j        rddg}ndg}ndg}|	                                t          j        k    rot          | j        j        |          D ]T\  }}|                     |	                                |                     |d|                                                     U|                                t          j        k    r|                                rx| j        j        j        z  fd| j        j        D             }t          ||          D ]:\  }}|                     t          j        |                     |d|                     ;d| _        d S )NTFc                 ,    g | ]}|k    |k    |S  rD   ).0xmax_num_tokensr   s     r   
<listcomp>zACudagraphDispatcher.initialize_cudagraph_keys.<locals>.<listcomp>   s9     2 2 2&&10H+H+H +H+H+Hr   )r   r   r   r   r0   r
   lora_configr   cudagraph_specialize_lora
mixed_moder   r#   rA   r<    relax_for_mixed_batch_cudagraphsdecode_moder   separate_routiner7   r8   )r   r   r   
lora_casesr-   r3   "cudagraph_capture_sizes_for_decoderG   s     `    @r   initialize_cudagraph_keysz-CudagraphDispatcher.initialize_cudagraph_keys|   s   
 - ]///$(D!F--/// ' 	!&@ $"E]

"V

J
 $$&&-*<<< ''?! !  H &&"--//88E8 6688	    &&((M,>>>//11 ? )"3@A 2 2 2 2 20H2 2 2.
 !((JJ W W  H&&!&88T8LL   
 !%r   Fdisable_fullc                 
   | j         r%| j        t          j        k    s|| j        j        k    rt          j        t          |          fS |                     |||          }|                                }|sN|| j	        t          j
                 v rt          j
        |fS || j	        t          j
                 v rt          j
        |fS || j	        t          j                 v rt          j        |fS t          j        t          |          fS )a!  
        Given conditions(e.g.,batch descriptor and if using piecewise only),
        dispatch to a cudagraph runtime mode and the valid batch descriptor.
        A new batch descriptor is returned as we might dispatch a uniform batch
        to a graph that supports a more general batch (uniform to non-uniform).

        Args:
            num_tokens: Number of tokens in the batch.
            uniform_decode: Whether the batch is uniform decode (i.e. uniform and query
                length is uniform_decode_query_len).
            has_lora: Whether LoRA is active.
            disable_full: If True, skip FULL cudagraph checks and
                return PIECEWISE or NONE only. (can be used for features like
                cascade attention that are not supported by full cudagraphs)
        )r   r   r   r   r   r"   r   r<   rL   r   r   r   )r   r1   r2   r3   rR   
batch_descrelaxed_batch_descs          r   dispatchzCudagraphDispatcher.dispatch   s
   . %	C"m&888D3NNN %z'B'BBB99
 

 (HHJJ 	>T01CDDD$):55 "T%89K%LLL$)+=== !4]5L!MMM *,>>> !?:#>#>>>r   c                    | j         r| j        t          j        k    rg S g }t          j        t          j        fD ]M}t          | j        |                   }|r/|                    d d           |	                    ||f           N|S )a  
        Returns capture descriptors for cudagraph capturing.

        Returns:
            List of (runtime_mode, batch_descriptors) tuples, ordered PIECEWISE
            first then FULL. Batch descriptors are sorted largest-first for
            memory efficiency.
        c                     | j         S )N)r1   )ds    r   <lambda>z7CudagraphDispatcher.get_capture_descs.<locals>.<lambda>   s     r   T)keyreverse)
r   r   r   r   r   r   listr   sortappend)r   resultr   descss       r   get_capture_descsz%CudagraphDispatcher.get_capture_descs   s     $ 	(;}?Q(Q(QI",m.@A 	- 	-D,T233E -

55t
DDDtUm,,,r   )r    N)r   )FFF)__name__
__module____qualname____doc__r   r   r0   intboolr   r<   r   rA   rQ   tuplerV   r]   rb   rD   r   r   r	   r	      sl        $1J 1 1 1 1@   @

/3
?C
	
 
 
 
*@)@=L@ @ @ @ NO:% :%+:%GJ:% :% :% :%~  %"1? 1?1? 1? 	1?
 1? 
}o-	.1? 1? 1? 1?f4mT/=R.R(S#T      r   r	   N)	itertoolsr   vllm.configr   r   vllm.forward_contextr   vllm.loggerr   rc   loggerr	   rD   r   r   <module>ro      s          1 1 1 1 1 1 1 1 0 0 0 0 0 0 # # # # # #	X		t t t t t t t t t tr   