
    .`iWI                     p   d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlm	Z	 d dl
mZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZmZmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z!m"Z"  ee#          Z$e G d d                      Z%e G d d                      Z& G d d          Z' G d d          Z(dS )    N)Callable)	dataclass)Any)CUDAGraphWrapper)CUDAGraphMode
VllmConfig)get_ep_group)set_graph_pool_id)
DPMetadatacreate_forward_contextget_forward_contextoverride_forward_context)init_logger)current_platform)IntermediateTensors)has_deep_gemm)UBatchContextmake_ubatch_contextsc                   v    e Zd ZU eed<   ej        ed<   ej        ed<   ej        dz  ed<   edz  ed<   eed<   dS )UbatchMetadatacontext	input_ids	positionsNinputs_embedsintermediate_tensors
num_tokens)	__name__
__module____qualname__r   __annotations__torchTensorr   int     u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/worker/gpu_ubatch_wrapper.pyr   r      se         ||<$&&&&-4444OOOOOr%   r   c                   L    e Zd ZU ej        j        ed<   eed<   dZe	dz  ed<   dS )CUDAGraphMetaData	cudagraphubatch_metadataNoutputs)
r   r   r   r!   cuda	CUDAGraphr    r   r+   r   r$   r%   r&   r(   r(   )   sC         z########GS4Zr%   r(   c                   R    e Zd Zdedeegdf         deegdf         fdZd Zd ZdS )SMControlContextManagercomm_smsset_comm_smsNset_compute_smsc                     t          j                    s
J d            t          j                            t          j                                                  }|j        }||k     sJ || _        ||z
  | _        || _	        || _
        || _        dS )a  
        Context manager for controlling SM (Streaming Multiprocessor)
        allocation. Upon entering the context, it sets the number of SMs
        allocated for communication and computation to comm_sms and
        total_sms - comm_sms respectively. Upon exiting, it restores the
        allocation to use all available SMs (i.e. total_sms).

        Args:
            comm_sms (int): The number of SMs to allocate for communication.
                (The remainder will be used for computation.)
            set_comm_sms (Callable[[int], None]):
                A function that sets the number of SMs for communication.
            set_compute_sms (Callable[[int], None]):
                A function that sets the number of SMs for computation.
        z.SM control is currently only supported on CUDAN)r   is_cudar!   r,   get_device_propertiescurrent_devicemulti_processor_count	total_smscompute_smsr0   r1   r2   )selfr0   r1   r2   propsr8   s         r&   __init__z SMControlContextManager.__init__1   s    ,  ')) 	
 	
<	
 	
) 
001J1J1L1LMM/	)####"$x/ (.r%   c                 n    |                      | j                   |                     | j                   d S N)r1   r0   r2   r9   r:   s    r&   	__enter__z!SMControlContextManager.__enter__U   s5    $-(((T-.....r%   c                 n    |                      | j                   |                     | j                   d S r>   )r1   r8   r2   )r:   exc_type	exc_value	tracebacks       r&   __exit__z SMControlContextManager.__exit__Y   s4    $.)))T^,,,,,r%   )r   r   r   r#   r   r<   r@   rE   r$   r%   r&   r/   r/   0   sy        "/"/ ud{+"/ "3%+.	"/ "/ "/ "/H/ / /- - - - -r%   r/   c                       e Zd Zdedededej        j        fdZ	e
defd            ZdefdZd	efd
Zd	ej        fdZd	ej        fdZd	ee         fdZdefdZd ZdS )UBatchWrapperrunnablevllm_configruntime_modedevicec                    || _         || _        |j        | _        t          j                            |          | _        t          j        | j        j	        j
        dz             | _        i | _        d | _        d | _        |t          j        ur/t#          |||          | _        t%          j                    | _        |                     |          | _        || _        d S )N)rK      )rJ   )rH   rI   compilation_configr!   r,   Streamcomm_stream	threadingBarrierparallel_confignum_ubatchesready_barrier
cudagraphscudagraph_wrapper
graph_poolr   NONEr   r   get_global_graph_pool_create_sm_control_context
sm_controlrK   )r:   rH   rI   rJ   rK   s        r&   r<   zUBatchWrapper.__init___   s     !&"-"@ :,,F,;;&.,9A=
 
 9;!%}111%5+L& & &D" /DFFDO99+FFr%   c                 J   t           j        }d }| j        j        rUt	                      }|j        }d ||j        &                                }|t          ||          }|dk    rfd}d }t                      r|dk    r	dd l
fd}t          |||          S )Nc                     d S r>   r$   smss    r&   <lambda>z:UBatchWrapper._create_sm_control_context.<locals>.<lambda>   s    4 r%   r   c                 .                         |           S r>   set_num_sms)r`   all2all_managers    r&   ra   z:UBatchWrapper._create_sm_control_context.<locals>.<lambda>   s    ?+F+Fs+K+K r%   c                     d S r>   r$   r_   s    r&   ra   z:UBatchWrapper._create_sm_control_context.<locals>.<lambda>   s    d r%   c                 .                         |           S r>   rc   )r`   dgs    r&   ra   z:UBatchWrapper._create_sm_control_context.<locals>.<lambda>   s    "..*=*= r%   )r0   r1   r2   )envsVLLM_DBO_COMM_SMSrS   enable_expert_parallelr	   device_communicatorre   max_sms_usedminr   	deep_gemmr/   )	rI   r0   r1   ep_grouprl   rm   r2   re   rh   s	          @@r&   r[   z(UBatchWrapper._create_sm_control_context|   s    .''&= 	L $~~H"*">"O"."5"E*.;;==+"8\::H!|| ;KKKK +*?? 	>x!||""""====O&%+
 
 
 	
r%   keyc                     t          | j        |          rt          | j        |          S t          d| d| j                   )Nz
Attribute z2 not exists in the runnable of cudagraph wrapper: )hasattrrH   getattrAttributeError)r:   rq   s     r&   __getattr__zUBatchWrapper.__getattr__   sZ    4=#&& 	/4=#...2 2 2"&-2 2
 
 	
r%   returnc                     | j         S r>   )rH   r?   s    r&   unwrapzUBatchWrapper.unwrap   s
    }r%   c                     t          j                     fd            }g }|d         j        j        }|d         j        |d         j        z   }t          d          5  g }|D ]C}t          j        |||f          }	|                    |	           |		                                 D j
                                         t          t           j                                        |          }
 j        t!           j                   n t!          t#          j                               t           j                            |
j        | j                  5  |d         j        j                                         |D ]}	|	                                 d t1          |          D             }t          j        |d	          }||
_        ddd           n# 1 swxY w Y   |
 j        |<   ddd           n# 1 swxY w Y   |
j        S )
a:  
        Capture a cudagraph for a microbatched run.

        The logic here is somewhat complicated because we need to make sure that
        each of the ubatch threads initialize the cuda context before we start
        the graph capture.

        The flow is as follows:
        1. The main thread starts up each ubatch thread. Each thread will
        initialize its cuda context (torch.cuda.current_blas_handle())
        before going to sleep upon entering the ubatch_context.

        2. The main thread starts the graph capture and wakes up the first
        ubatch thread.

        3. Each ubatch thread runs the model to completion and returns the
        completed output tensors back to the main thread.

        4. The main thread stores the captured cudagraph along with its metadata
        and returns
        c                    t           j                            j                   |j        }t           j                            |j                  5  t           j                                        }d d d            n# 1 swxY w Y   t           j                            |j                  5  t           j                                        }d d d            n# 1 swxY w Y   |5   |j	        |j
        |j        |j                  }d d d            n# 1 swxY w Y   |                     |j        j        |f           d S N)r   r   r   r   )r!   r,   
set_devicerK   r   streamcompute_streamcurrent_blas_handlerP   r   r   r   r   appendid)resultsr*   ubatch_context_model_outputmodelr:   s        r&   _capture_ubatch_threadz?UBatchWrapper._capture_ubatches.<locals>._capture_ubatch_thread   s   J!!$+...,4N"">#@AA 5 5J22445 5 5 5 5 5 5 5 5 5 5 5 5 5 5"">#=>> 5 5J22445 5 5 5 5 5 5 5 5 5 5 5 5 5 5  $u-7-7)8)M"1"?	                    NNO36EFFFFFs6   A<<B B +CCC#$DDDr   rM   Ntargetargs)r)   r*   )r~   poolc                     g | ]\  }}|S r$   r$   .0positionvalues      r&   
<listcomp>z3UBatchWrapper._capture_ubatches.<locals>.<listcomp>   s    !O!O!OOHe%!O!O!Or%   dim)r!   inference_moder   r   r   r   rQ   Threadr   startrU   waitr(   r,   r-   rX   r
   r   graph_pool_handlegraphr)   cpu_wait_eventsetjoinsortedcatr+   rV   )r:   r*   r   r   r   r   r   ubatch_threadsmetadatathreadcudagraph_metadatasorted_resultsresults   ` `          r&   _capture_ubatcheszUBatchWrapper._capture_ubatches   s   . 
				G 	G 	G 	G 	G 
 		G" 35(+3B$Q'2_Q5G5RR
 &d++ "	= "	=N+ 	 	")1    %%f---##%%% "3*..00 /" " " *!$/2222!"2"D"F"FGGG!!",%_ "   
4 
4
  "*9==???, " "FKKMMMM!O!Ovg!O!O!O>q999-3"*
4 
4 
4 
4 
4 
4 
4 
4 
4 
4 
4 
4 
4 
4 
4 +=DOJ'E"	= "	= "	= "	= "	= "	= "	= "	= "	= "	= "	= "	= "	= "	= "	=F "))s8    C7G8A4GG8G	G8G	G88G<?G<c                 .   t          j                    d             }g }t          d           5  g }|D ]D}t          j        ||||f          }|                    |           |                                 E| j                                         |d         j	        j
                                         |D ]}|                                 	 d d d            n# 1 swxY w Y   d t          |          D             }t          j        |d          }	|	S )Nc                     |j         5   ||j        |j        |j        |j                  }d d d            n# 1 swxY w Y   |                     |j         j        |f           d S r|   )r   r   r   r   r   r   r   )r   r   r*   r   s       r&   _ubatch_threadz3UBatchWrapper._run_ubatches.<locals>._ubatch_thread  s     (  $u-7-7)8)M"1"?	                    NNO36EFFFFFs   $8<<r   r   c                     g | ]\  }}|S r$   r$   r   s      r&   r   z/UBatchWrapper._run_ubatches.<locals>.<listcomp>"  s    GGGOHe%GGGr%   r   )r!   r   r   rQ   r   r   r   rU   r   r   r   r   r   r   r   )
r:   r*   r   r   r   r   r   r   r   r   s
             r&   _run_ubatcheszUBatchWrapper._run_ubatches   s   					G 	G 
 		G 35
 &d++ 	 	N+ 
 
"))    %%f---##%%%A&599;;;(  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	" HGvgGGG>q111s   B CC"Cc                 D   g }|ot          |t                    }t          |          D ]L\  }}|                    t	          |||         nd | j        |	|         |
||r||         nd                      Mt          t          |          | j        ||| j	                  }g }t          |          D ]m\  }}| 
                    |j        ||||          \  }}}}|                    t          ||         |||||j        j        |j        j        z
                       n|S )N)dp_metadatabatch_descriptorcudagraph_runtime_modeslot_mapping)num_micro_batchesrP   r   forward_contextsrU   )r   r   r   r   r   r   )
isinstancelist	enumerater   r   rI   r   lenrP   rU   _slice_model_inputstoken_slicer   stopr   )r:   ubatch_slicesattn_metadatar   r   r   r   r   r   r   r   r   r   has_slot_mappingiubatch_sliceubatch_ctxsr*   sliced_input_idssliced_positionssliced_inputs_embedssliced_intermediate_tensorss                         r&   _make_ubatch_metadataz#UBatchWrapper._make_ubatch_metadata&  s     (JJ|T,J,J(77 
	 
	OA|##&(5(AM!$$t$ +A%5+A4D!Na$  	 	 	 	 +!-00()-,
 
 
 13(77 	 	OA| ((($   $+ ""'N.."6)D+7<".4 5  
 
 
 
 r%   tokens_slicec                     ||         }|j         dk    r|d d |f         }n||         }|r||         nd }|r||         nd }	||||	fS )N   )ndim)
r:   r   r   r   r   r   r   r   r   r   s
             r&   r   z!UBatchWrapper._slice_model_inputsi  s     %\2 >Q(L9(6>KU}\::QU2FP ..D 	$
  '	
 	
r%   c                 ^   t                      }|j        }|j        }|j        }|i|t          j        u r|J |j        | j        v rt          j        }|t          j        t          j	        fv r | j
        |i |S | j        J  | j        |i |S |j        }|j        }|d         j        j        |d         j        j        z
  dz  }	|d         }
|d         }|d         }|d         }t"          j                                        }|j        }|J g }|D ]u}| j        j        j        }t#          j        |j        g|z  dt"          j                  }|                    t7          j        | j        j        |j        |                     v|	| j        vrt|t          j        u rf|                     ||||
||||||t          j        	          }| j        5  |                     || j                   cd d d            S # 1 swxY w Y   d S |	| j        v r;|t          j        u r-| j        |	         }|j!        "                                 |j#        S |                     ||||
||||||t          j        	          }| j        5  | $                    || j                   cd d d            S # 1 swxY w Y   d S )
Nr   r   r   r   r   r   cpu)rK   dtype)r   r   r   r   r   r   r   r   r   r   r   )%r   r   r   r   r   FULLr   rV   rY   	PIECEWISErH   rW   r   r   r   r   r   r!   r,   current_streamr   rI   rS   data_parallel_sizetensorint32r   r   maker   r\   r   r   r)   replayr+   r   )r:   r   kwargsforward_contextr   r   r   r   r   r   r   r   r   r   r   r   ubatch_dp_metadatar   dp_sizeubatch_num_tokens_across_dpr*   r   s                         r&   __call__zUBatchWrapper.__call__  s   -//*;'5!0!G   &);;;'333#.$/AA-:-?*%-*<m>U)VVV$t}d5f555-999-t-t>v>>>'5&3!(-a0@0L0RR
 ;'	;'	%&<=/2244%1 &&&) 	 	L&6IG*/,()G3E+ + +' %%$4 +/     do--&-*<<<"88++)##%9+-.!1'4'9 9  O  K K--otzJJK K K K K K K K K K K K K K K K K K $/))&-*<<<!%!<(//111%--"88++)##%9+-.!1'4'9 9  O  G G))/4:FFG G G G G G G G G G G G G G G G G Gs$   G88G<?G<:J""J&)J&N)r   r   r   r   r   r   r!   r,   rK   r<   staticmethodr[   strrv   ry   r"   r   r   r   r   r   slicer   r   r$   r%   r&   rG   rG   ^   sP          $	
 
!   :  

  
  
  
 \ 
D
s 
 
 
 
    R*5< R* R* R* R*h$u| $ $ $ $LA 
n	A A A AF

 
 
 
6_G _G _G _G _Gr%   rG   ))rQ   collections.abcr   dataclassesr   typingr   r!   	vllm.envsri   vllm.compilation.cuda_graphr   vllm.configr   r   vllm.distributedr	   6vllm.distributed.device_communicators.pynccl_allocatorr
   vllm.forward_contextr   r   r   r   vllm.loggerr   vllm.platformsr   vllm.sequencer   vllm.utils.import_utilsr   vllm.v1.worker.ubatchingr   r   r   loggerr   r(   r/   rG   r$   r%   r&   <module>r      s6       $ $ $ $ $ $ ! ! ! ! ! !              8 8 8 8 8 8 1 1 1 1 1 1 1 1 ) ) ) ) ) ) T T T T T T            $ # # # # # + + + + + + - - - - - - 1 1 1 1 1 1 H H H H H H H H	X		                +- +- +- +- +- +- +- +-\EG EG EG EG EG EG EG EG EG EGr%   