
    .`ix                     j   d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZ ddlZddlZddlZddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
l m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z;m<Z<m=Z= ddl>m?Z? ddl@mAZAmBZB ddlCmDZDmEZE ddlFmGZGmHZH ddlImJZJmKZKmLZL ddlMmNZNmOZO ddlPmQZQ ddlRmSZS ddlTmUZU ddlVmWZW  e*eX          ZYe	rdd lZm[Z[ dd!l\m]Z]  G d" d#eS          Z^	 	 	 d-d&ed'e_d(e`dz  d)e_d*e`d+dfd,ZadS ).zA GPU worker class.    N)AbstractContextManagernullcontext)NoneType)TYPE_CHECKINGAnycast)CUDAGraphMode
VllmConfigset_current_vllm_config)CompilationMode)!ensure_model_parallel_initializedinit_distributed_environmentset_custom_all_reduce)ensure_ec_transfer_initialized)ensure_kv_transfer_initializedensure_kv_transfer_shutdownget_kv_transfer_grouphas_kv_transfer_group)get_pcp_groupget_pp_groupget_tp_group)init_logger)LoRARequest)is_mixture_of_experts)kernel_warmup)current_platform)CudaProfilerWrapperTorchProfilerWrapper)IntermediateTensors)SupportedTask)MemorySnapshot
format_gibmemory_profiling)set_random_seed)GrammarOutputSchedulerOutput)ReconfigureDistributedRequestReconfigureRankType)KVCacheConfigKVCacheSpec)AsyncModelRunnerOutputDraftTokenIdsModelRunnerOutput)compute_iteration_detailsreport_usage_stats)is_residual_scattered_for_sp)
WorkerBase)init_workspace_manager   )request_memory)TensorizerConfigGPUModelRunnerc                       e Zd Z	 dJdededededef
 fdZdKd	ed
dfdZdLde	e         dz  d
dfdZ
ded
efdZdeded
dfdZd ZdMdZdeeef         d
dfdZdMdZ ej                    d
efd            Zd
edz  fdZd
eeef         fdZded
dfdZded
dfdZdMd ZdMd!Zd
ej         fd"Z!d
e"e#d#f         fd$Z$d
eeeee%ez  f         f         fd%Z&d& Z' ej                    d'd(d
e(e)z  fd)            Z* ej                    d*d+d
e(e)z  dz  fd,            Z+d
e,dz  fd-Z-dNd/efd0Z.dMd1Z/d2e0d
efd3Z1d4ed
efd5Z2d
e3e         fd6Z4d4ed
efd7Z5dMd8Z6d9ed:ed
dfd;Z7d9ed:ed<e	ej8                 dz  d
dfd=Z9d>e:d
dfd?Z;d9ed:ed
e	ej8                 dz  fd@Z<d>e:d
dfdAZ=	 	 dOdBedCedz  dDedz  d
dfdEZ>	 	 	 	 dPdHZ?dMdIZ@ xZAS )QWorkerFvllm_config
local_rankrankdistributed_init_methodis_driver_workerc                    t                                          |||||           t          j        }t	          j        |           i | _        d | _        |j        }|j        dk    r1|j	         d| j
         }t          ||| j        ddg          | _        n'|j        dk    rt          |          | _        nd | _        t          j        | _        d S )N)r:   r;   r<   r=   r>   torchz-rank-CPUCUDA)worker_namer;   
activitiescuda)super__init__envsVLLM_FLOAT32_MATMUL_PRECISIONr@   set_float32_matmul_precision_sleep_saved_buffersprofilerprofiler_configinstance_idr<   r   r;   r   VLLM_USE_V2_MODEL_RUNNERuse_v2_model_runner)
selfr:   r;   r<   r=   r>   	precisionrM   rC   	__class__s
            m/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/worker/gpu_worker.pyrG   zWorker.__init__E   s     	#!$;- 	 	
 	
 	
 6	*9555 >@! %)%5#w..(4GGDIGGK0'?!6?	  DMM %///@@DMM DM#'#@       r3   levelreturnNc                 "   ddl m} t          j                                        d         }|dk    r/| j        j        }d |                                D             | _        |	                                }|
                    |dk    rdnt                                 t          j                                        \  }}||z
  }||z
  }	|dk    s
J d            t                              d	t          |          t          |	                     d S )
Nr   CuMemAllocator   c                 b    i | ],\  }}||                                                                 -S  )cpuclone).0namebuffers      rT   
<dictcomp>z Worker.sleep.<locals>.<dictcomp>v   s?     ) ) )/;tVfjjll((**) ) )rU   r3   )weights)offload_tagsz&Memory usage increased after sleeping.z>Sleep mode freed %s GiB memory, %s GiB memory is still in use.)vllm.device_allocator.cumemrZ   r@   rE   mem_get_infomodel_runnermodelnamed_buffersrK   get_instancesleeptupleloggerinfor"   )
rQ   rV   rZ   free_bytes_before_sleepri   	allocatorfree_bytes_after_sleeptotalfreed_bytes
used_bytess
             rT   rl   zWorker.sleepn   s)   >>>>>>"'*"9"9";";A"> A::%+E) )?D?R?R?T?T) ) )D% #//11	UaZZ\\UWWMMM(-
(?(?(A(A%,/FF33
a!IL{##z""	
 	
 	
 	
 	
rU   tagsc                    ddl m} |                                }|                    |           t	          | j                  r`| j        j        }|                                D ]8\  }}|| j        v r*|j	        
                    | j        |         j	                   9i | _        |d|v rQ| j        j                            d          r0t          | j        d          r| j                                         d S d S d S d S )Nr   rY   kv_cachefp8init_fp8_kv_scales)rf   rZ   rk   wake_uplenrK   rh   ri   rj   datacopy_cache_configcache_dtype
startswithhasattrrz   )rQ   rv   rZ   rq   ri   ra   rb   s          rT   r{   zWorker.wake_up   s&   >>>>>>"//11	$ t()) 	+%+E % 3 3 5 5 L Lf4444K%%d&?&E&JKKK(*D% \Z4//!-88?? 0)+?@@ 0 0022222	 0/////rU   tagc                     | j         j        j        rXddlm} |                                }|dk    r"|                                dk    s
J d            |                    |          S t                      S )Nr   rY   rd   z9Sleep mode can only be used for one instance per process.r   )	r:   model_configenable_sleep_moderf   rZ   rk   get_current_usageuse_memory_poolr   )rQ   r   rZ   rq   s       rT   _maybe_get_memory_pool_contextz%Worker._maybe_get_memory_pool_context   s    (: 
	!BBBBBB&3355Ii 2244999O :99 ,,,555== rU   num_gpu_blocksnum_cpu_blocksc                 6    || j         _        || j         _        d S N)r   r   r   )rQ   r   r   s      rT   initialize_cachezWorker.initialize_cache   s    +9(+9(((rU   c                    | j         j        dk    rt          j                            dd            | j        }|j        dvr|j        dk    r|j        dk    r| j        j	        }|| j        j
        }| j        j        | j        j        z  }| xj        ||z  z  c_        | j        t          j                                        k     sJ d| j         d            t          j                                        rt          j                                        nd}| j        j        |k    sJ d	| j        j         d
| d            t          j        d| j                   | _        t)          j        | j                   t)          j        | j        j                   t3          | j        | j        | j        | j        t(          j                   t=          | j        j                   tA          j!                     t          j        "                                 tG          | j                  x| _$        }tK          || j&                  | _'        tP          )                    d| j$                   tP          )                    dtU          | j'                             ntW          d| j         j                   | j        j        j,        rdnd}t[          | j        |           | j.        r"ddl/m0}  || j        | j                  | _1        n!ddl2m0}  || j        | j                  | _1        | j        dk    rtg          | j                   d S d S )NrE   NCCL_ASYNC_ERROR_HANDLING)rayexternal_launcherr   r3   zDP adjusted local rank z is out of bounds. r   zlocal_world_size (z?) must be less than or equal to the number of visible devices (z).zcuda:)devicezworker init memory snapshot: %rzworker requested memory: %sGiBzNot support device type: r[   r6   )4device_configdevice_typeosenvironpopparallel_configdistributed_executor_backenddata_parallel_backendnnodes_within_dpdata_parallel_rank_localdata_parallel_indexpipeline_parallel_sizetensor_parallel_sizer;   r@   rE   device_countis_availablelocal_world_sizer   r   
set_devicecheck_if_supports_dtyper   dtype#init_worker_distributed_environmentr:   r<   r=   dist_backendr$   seedgccollectempty_cacher!   init_snapshotr4   r   requested_memoryrn   debugr"   RuntimeError
enable_dbor2   rP   vllm.v1.worker.gpu.model_runnerr7   rh   vllm.v1.worker.gpu_model_runnerr/   )	rQ   r   dp_local_ranktp_pp_world_sizevisible_device_countr   num_ubatchesGPUModelRunnerV2GPUModelRunnerV1s	            rT   init_devicezWorker.init_device   s   )V33JNN6==="2O<34 4#9UBB#499 !% 4 M ($($8$LM (?*?@ ! =3C#CC)@)@)B)BBBBRdoRRR CBB 271H1H1J1JQEJ++---PQ % +<@TTTT1)=)N 1 1,1 1 1 UTT
  ,'@t'@'@AADK'4444T5F5LMMM 0 	, -   D-2333 JLLLJ""$$$ 2@t{1S1S1SSD$2=$BS$T$TD!LL:D<NOOOLL0*T=R2S2S    V4;M;TVVWWW !,<GNqqQt{L999 # 	P     
 1A0@ $+1 1D      !1 01A4; O OD9>>t/00000 >rU   c                    t           j                            d          dk    }|                     d          ot	          | j                  5  | j                            |           d d d            d S # 1 swxY w Y   d S )NVLLM_ELASTIC_EP_SCALE_UP_LAUNCH1rd   r   )eep_scale_up)r   r   getr   r   r:   rh   
load_model)rQ   r   s     rT   r   zWorker.load_model  s    z~~&GHHCO00 1 
 
 8%d&677	D 	D ((l(CCC	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	Ds   A77A;>A;	overridesc                 :    | j                             |           d S r   )rh   update_config)rQ   r   s     rT   r   zWorker.update_config  s    ''	22222rU   c                 8    | j                                          d S r   )rh   reload_weightsrQ   s    rT   r   zWorker.reload_weights      ((*****rU   c                    | j         j        x}rb| j                                         dt	          | j        j                   dt	          |           d}t                              |           |S t          | j        t          | j        j                            5 }| j                                         ddd           n# 1 swxY w Y   |j        | _        |j        | _        |j        j        }| j        j        |k    s5J dt	          | j        j                   dt	          |           d            | j        |j        z
  | _        | j        j        | j        z
  }t                              d	t	          | j        j                  | j         j        t	          | j                             t                              d
t	          |          t	          ||z
                       t                              |           t                              dt	          | j                  d           t          | j                  S )a  Profiles the peak memory usage of the model to determine how much
        memory can be used for KV cache without OOMs.

        The engine will first conduct a profiling of the existing memory usage.
        Then, it calculates the free memory that can be used for KV cache in
        bytes.

        Tip:
            You may limit the usage of GPU memory
            by adjusting the `gpu_memory_utilization` parameter.
        zInitial free memory z GiB, reserved a   GiB memory for KV Cache as specified by kv_cache_memory_bytes config and skipped memory profiling. This does not respect the gpu_memory_utilization config. Only use kv_cache_memory_bytes config when you want manual control of KV cache memory size. If OOM'ed, check the difference of initial free memory between the current run and the previous run where kv_cache_memory_bytes is suggested and update it correspondingly.)weights_memoryNz/Error in memory profiling. Initial free memory z GiB, current free memory z GiB. This happens when other processes sharing the same container release GPU memory while vLLM is profiling during initialization. To fix this, ensure consistent GPU memory allocation or isolate vLLM in its own container.z@Initial free memory: %s GiB; Requested memory: %f (util), %s GiBzFFree memory after profiling: %s GiB (total), %s GiB (within requested)z!Available KV cache memory: %s GiBlocal)scope)r   kv_cache_memory_bytesrh   profile_runr"   r   free_memoryrn   ro   r#   intmodel_memory_usagenon_torch_increasenon_torch_memorytorch_peak_increasepeak_activation_memoryafter_profiler   non_kv_cache_memoryavailable_kv_cache_memory_bytesr   gpu_memory_utilization	info_once)rQ   r   msgprofile_resultfree_gpu_memoryunrequested_memorys         rT   determine_available_memoryz!Worker.determine_available_memory  s    %)$5$KK  	) ))+++	#z$2D2P'Q'Q 	# 	#!+,A!B!B	# 	# 	#  KK(( t0CDD
 
 
 	, ))+++		, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, !/ A&4&H#(6B !-???1#-d.@.L#M#M1 1#-o#>#>1 1 1 @?? !N$FF 	, "/;d>SSNt)5664t,--		
 	
 	
 	T'');;<<	
 	
 	

 	^$$$/t;<< 	 	
 	
 	
 47888s   CCCc                     t                      sdS t                      }|                                x}dS t                      j        }||iS )z8Get KV connector metadata from this worker if available.N)r   r   get_handshake_metadatar   rank_in_group)rQ   	connectormetadatatp_ranks       rT   #get_kv_connector_handshake_metadataz*Worker.get_kv_connector_handshake_metadatal  sW     %&& 	4)++	 "88:::HC4...""rU   c                 4    | j                                         S r   )rh   get_kv_cache_specr   s    rT   r   zWorker.get_kv_cache_spec{  s     22444rU   max_model_lenc                     || j         _        | j        | j                            |           t                              d|           dS )a7  Update max_model_len after auto-fit to GPU memory.

        This is called when max_model_len=-1 is used and the engine
        automatically determines the maximum context length that fits
        in GPU memory. Workers need to update their cached max_model_len
        to match the engine's decision.
        NzUpdated max_model_len to %d)r   r   rh   update_max_model_lenrn   r   )rQ   r   s     rT   r   zWorker.update_max_model_len~  sJ     +8'(22=AAA2MBBBBBrU   kv_cache_configc                 P   t          | j        |           | j        j        j        reddlm} |                                }|                    d          5  | j        	                    |           ddd           dS # 1 swxY w Y   dS | j        	                    |           dS )z9Allocate GPU KV cache with the specified kv_cache_config.r   rY   rx   r   N)
r   r:   r   r   rf   rZ   rk   r   rh   initialize_kv_cache)rQ   r   rZ   rq   s       rT   initialize_from_configzWorker.initialize_from_config  s    	't'7III(: 	CBBBBBB&3355I**z*:: G G!55oFFFG G G G G G G G G G G G G G G G G G 11/BBBBBs   A??BBc                    g }| j         j        j        t          j        k    r| j         j        j        }||                                ng }g | j         j        j        t          j	        k    r%| j         j        j
        }|g n|fd|D             }| j         j                                        }t                    }|                    d |D                        |D ]7t          fd|D                       s|                    j                   8t#          |d          D ]:}t$                              d|           | j                            |dd           ;| j                            | j        j                   t1          |            d	}| j        j        s| j                                        }| j        j        `t=          | d
          rOd}| j        j        | j         z   | j!        z   |z   }	| j"        j#        |	z
  |z
  }
tI          | j%                  |	z
  |z
  }dtM          | j"        j#                   dtM          | j"        j'                   d| j        j(         dtM          | j%                   dtM          | j        j                   dtM          | j                    dtM          | j!                   dtM          |           d| dtM          |           d|
 dtM          |
           dtM          | j)                   d}t$          *                    |           tW                      j,        rt[          | j.        j/        | j.        j0                  }| j                            |dt          j	                  \  }}| j        j1        r| j        2                    |           n| j        3                    |           ti          | j        j5                   d S )Nc                     g | ]}|v|	S r]   r]   )r`   xcg_capture_sizess     rT   
<listcomp>z3Worker.compile_or_warm_up_model.<locals>.<listcomp>  s$    UUUa1DT;T;T;T;T;TrU   c                 <    g | ]}t          |t                    |S r]   )
isinstancer   )r`   r   s     rT   r   z3Worker.compile_or_warm_up_model.<locals>.<listcomp>  s'    LLLAAs9K9KLaLLLrU   c              3       K   | ]}|v V  	d S r   r]   )r`   r   compile_ranges     rT   	<genexpr>z2Worker.compile_or_warm_up_model.<locals>.<genexpr>  s(      AA!1-AAAAAArU   T)reversez(Compile and warming up model for size %dF)	skip_eplbremove_lorar   r   i  `	zFree memory on device (/z5 GiB) on startup. Desired GPU memory utilization is (z, z GiB). Actual usage is z GiB for weight, z GiB for peak activation, z GiB for non-torch memory, and zY GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=z` (z: GiB) to fit into requested memory, or `--kv-cache-memory=zE GiB) to fully utilize gpu memory. Current kv cache memory in use is z GiB.)
num_tokensr   cudagraph_runtime_mode)hidden_states)6r:   compilation_configmoder   VLLM_COMPILEcompile_sizescopycudagraph_moder	   NONEcudagraph_capture_sizesget_compile_rangessetupdateanyappendendsortedrn   ro   rh   
_dummy_runmaybe_remove_all_loraslora_configr   r   enforce_eagercapture_modelr   r   r   r   r   r   r   r   r   r   r"   total_memoryr   r   r   r   is_last_rankminscheduler_configmax_num_seqsmax_num_batched_tokensis_pooling_model_dummy_pooler_run_dummy_sampler_runr$   r   )rQ   warmup_sizesr  cg_sizescompile_ranges	all_sizessizecuda_graph_memory_bytesredundancy_buffer_memoryr   "kv_cache_memory_bytes_to_gpu_limit(kv_cache_memory_bytes_to_requested_limitr   max_num_reqsr   last_hidden_statesr   r   s                   @@rT   compile_or_warm_up_modelzWorker.compile_or_warm_up_model  s   .37SSS !,?MM3@3L=--///RTL*,2A]EWWW+>V)1)922x UUUU<UUU!-@SSUUN ,--ILLLLLMMM!/ ; ;AAAAyAAAAA ; ''(9::: <666 	R 	RDKKBDIII((5(QQQQ001B1NOOO 	d"# . 	H&*&7&E&E&G&G#2:w*@
 @
: (7$!4-.'( **   ".%&*+ / D)**%&*+ 5Kt1=>>K Kd0=>>K K %<	K K
 d344K K $.d.?.R#S#SK K $.d.I#J#JK K )343H(I(IK K .88O-P-PK K <K K GHHK K 6K K  ABB!K K$ dBCC%K K K , LL >>& 	W%2%< L 150A0L0L''4'9 1M 1 1-M-
  1 W!33MBBBB!44CU4VVV 	)./////rU   c                 8    | j                                          d S r   )rh   reset_mm_cacher   s    rT   r+  zWorker.reset_mm_cache  r   rU   c                 4    | j                                         S r   )rh   	get_modelr   s    rT   r-  zWorker.get_model  s     **,,,rU   .c                 4    | j                                         S r   )rh   get_supported_tasksr   s    rT   r/  zWorker.get_supported_tasks  s     44666rU   c                 4    | j                                         S )z+Get encoder timing stats from model runner.)rh   get_encoder_timing_statsr   s    rT   r1  zWorker.get_encoder_timing_stats!  s     99;;;rU   c                 |   | j         st                      S | j                                          t          |          }d                    dt          |j                  dt          |j                  dt          |j                  dt          |j	                  dg	          }| j         
                    |          S )N execute_context_(z)_generation_))rL   r   stepr.   joinstrnum_ctx_requestsnum_ctx_tokensnum_generation_requestsnum_generation_tokensannotate_context_manager)rQ   scheduler_outputiteration_details
annotations       rT   annotate_profilezWorker.annotate_profile%  s     } 	!== 56FGGWW"%677%455%=>>%;<<

 

 }55jAAArU   grammar_outputzGrammarOutput | Nonec                 6    | j                             |          S r   )rh   sample_tokens)rQ   rC  s     rT   rE  zWorker.sample_tokens?  s      ..~>>>rU   r?  r&   c                    d }|j         dk    }|j         }i }| j        j        }| j        j        }|j        dk    r|j        j        r|r| j        rJ t          j	        t          |j                                                  t          j                  }| j                            |t!          |          ||                                d          \  }	}
}	}	}	dt%          | j        |
j                   i}|rUt)                      j        sBt)                                          t/                      |          }|J t1          |          }|                     |          5  | j                            ||          }t7          |t8          t:          z  t<          z            r|cd d d            S 	 d d d            n# 1 swxY w Y   t7          |t0                    sJ | j        j        }|j        dk    rt)                      j         rJ t)                      !                    |j"        t/                      |           d S )	Nr   r3   )r   F)r   num_reqsnum_scheduled_tokens_npmax_num_scheduled_tokensuse_cascade_attnresidual)all_gather_groupall_gather_tensorsr   )#total_num_scheduled_tokensr:   r  r   r   pass_config	enable_sprP   nparraylistnum_scheduled_tokensvaluesint32rh   &_determine_batch_execution_and_paddingr|   maxr0   r   r   is_first_rankrecv_tensor_dictr   r   rB  execute_modelr   r-   r+   r   r   r  send_tensor_dicttensors)rQ   r?  intermediate_tensorsforward_passrT  rM  r  r   rH  _
batch_desctensor_dictoutputs                rT   r[  zWorker.execute_modelE  s     $'BQF/J!-@*: 2Q66".8 7 7
 ////&(h%:AACCDDh' ' '# !HH3 !899,C-D-H-H-J-J%* I   #Az1a  <$j&;! ! "  	D < 	D&..99!-#5 :  K ***#6{#C#C ""#344 	 	&44 "6 F ),BBXM   	 	 	 	 	 	 	 			 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &"566666*:8<OOO NN/ PO 0 	''N)^^1 	( 	
 	
 	
 ts   "AF>>GGc                 4    | j                                         S r   )rh   take_draft_token_idsr   s    rT   re  zWorker.take_draft_token_ids  s     55777rU   Tis_startc                     | j         t          d          |r| j                                          d S | j                                          d S )NzProfiling is not enabled. Please set --profiler-config to enable profiling. Example: '--profiler-config.profiler=torch --profiler-config.torch_profiler_dir=YOUR_DIR_PATH_TO_DUMP_TRACE')rL   r   startstop)rQ   rf  s     rT   profilezWorker.profile  s_    = 0    	!M!!!!!M     rU   c                 >    | j                             dd           d S )Nr3   T)uniform_decode)rh   r  r   s    rT   execute_dummy_batchzWorker.execute_dummy_batch  s$    $$Qt$<<<<<rU   lora_requestc                 6    | j                             |          S r   )rh   add_lora)rQ   rn  s     rT   rp  zWorker.add_lora  s     )),777rU   lora_idc                 6    | j                             |          S r   )rh   r   rQ   rq  s     rT   r   zWorker.remove_lora  s     ,,W555rU   c                 4    | j                                         S r   )rh   
list_lorasr   s    rT   ru  zWorker.list_loras  s     ++---rU   c                 6    | j                             |          S r   )rh   pin_lorars  s     rT   rw  zWorker.pin_lora  s     ))'222rU   c                     d S r   r]   r   s    rT   check_healthzWorker.check_health  s    rU   old_ep_sizenew_ep_sizec                    ddl m}  |            j        dk    rt                              d           fdt          |          D             }| j        j        J | j        j                            dd |           t          j
                                          |            j        dk    rt                              d           d S d S )Nr   get_ep_groupz>[Elastic EP] Starting expert resharding before scaling down...c                 &    i | ]}||k     r|nd S )r]   )r`   old_ep_rankr{  s     rT   rc   z2Worker._eplb_before_scale_down.<locals>.<dictcomp>  s9     
 
 
 k(A(Ar
 
 
rU   Texecute_shuffleglobal_expert_loadsrank_mapping)[Elastic EP] Expert resharding completed!)vllm.distributed.parallel_stater~  r<   rn   ro   rangerh   
eplb_state	rearranger@   rE   synchronize)rQ   rz  r{  r~  r  s     `  rT   _eplb_before_scale_downzWorker._eplb_before_scale_down  s    @@@@@@<>>!##KKP  
 
 
 
$[11
 
 
  +777$..  $% 	/ 	
 	
 	

 	
   <>>!##KKCDDDDD $#rU   r  c                 \   ddl m}  |            j        dk    rt                              d           d t          |          D             }| j        j        J | j        j                            d||            |            j        dk    rt                              d           d S d S )Nr   r}  z;[Elastic EP] Starting expert resharding after scaling up...c                     i | ]}||S r]   r]   )r`   r  s     rT   rc   z/Worker._eplb_after_scale_up.<locals>.<dictcomp>  s    WWW[[WWWrU   Tr  r  )	r  r~  r<   rn   ro   r  rh   r  r  )rQ   rz  r{  r  r~  r  s         rT   _eplb_after_scale_upzWorker._eplb_after_scale_up  s     	A@@@@@<>>!##KKUVVVWWE+DVDVWWW +777$..  3% 	/ 	
 	
 	

 <>>!##KKCDDDDD $#rU   reconfig_requestc                     | j         j        }|j        |_        |j        t
          j        k    r|j        |_        |j        t
          j        k    r|j        |_	        |j
        |_        |j        |_        dS )zG
        Update parallel config with provided reconfig_request
        N)r:   r   new_data_parallel_sizedata_parallel_sizenew_data_parallel_rankr(   KEEP_CURRENT_RANKdata_parallel_ranknew_data_parallel_rank_localr   new_data_parallel_master_ipdata_parallel_master_ipnew_data_parallel_master_portdata_parallel_master_port)rQ   r  r   s      rT   _reconfigure_parallel_configz#Worker._reconfigure_parallel_config  s     *:-=-T*3"45 5 2B1XO.9"45 5 != 4 8 	/ : 	111rU   c                 X   ddl mm}m} ddlm}m | j        j        dt          j
        j        dt          |         fd}dt          |         dt          ffd	} || j        j                  }|d         j        j        }	 |||	           d
}
t%          | j        d          r+t%          | j        j        d          r| j        j        j        }
|
Ft)          |
          r7 ||
          }|d         j        j        |	k    s
J d             |||	           |k     rX|	}| j        j        J | j        j        j        j        d         }|| j        j        j        j        d         z
  j        _        d
}nt          j        |	gt          j        d          }t          j                            | |            j        d           t          |                                           }|z  }| j        j        J | j        j        !                    d          }tE          t          t          j#                 |          }||d         j        d         z
  j        _         || j        j                   |
 ||
           | j        j        $                    ||           |S )z
        Reconfigure MoE modules with provided reconfig_request

        Return the global expert load if new_ep_size > old_ep_size,
        otherwise None
        r   )get_dp_groupr~  &prepare_communication_buffer_for_model)FusedMoEFusedMoEParallelConfigri   rW   c                 >    d |                                  D             S )Nc                 R    g | ]$}|j         j        d k    s|j         j        dk    "|%S )r  SharedFusedMoE)rS   __name__)r`   modules     rT   r   zDWorker._reconfigure_moe.<locals>.get_moe_modules.<locals>.<listcomp>
  sH       $-;;'04DDD	  EDDrU   )modules)ri   s    rT   get_moe_modulesz0Worker._reconfigure_moe.<locals>.get_moe_modules	  s+     #mmoo   rU   moe_modulesnum_local_expertsc                 Z   t          fd| D                       s
J d            | D ]}z  |j        _        |j        j        |_                            t                      j        t                      j                     j                  |_        |j        |j        _        | S )Nc              3   8   K   | ]}|j         j        k    V  d S r   )
moe_configr  )r`   r  r  s     rT   r   zFWorker._reconfigure_moe.<locals>.update_moe_modules.<locals>.<genexpr>  sC         !37HH     rU   z4All MoE modules must have the same number of experts)tp_size_	pcp_size_dp_size_vllm_parallel_config)	allr  num_expertsglobal_num_expertsmaker   
world_sizer   moe_parallel_config)r  r  r  r  r  r{  r   s    ` rT   update_moe_modulesz3Worker._reconfigure_moe.<locals>.update_moe_modules  s        )     F F FF F  & 	S 	S0AK0O!-,2,=,I)-C-H-H)^^6+oo8)\^^6)8	 .I . .* 9?8R!55rU   Ndrafterz,Drafter and model configs should be the samer3   r^   )r   r   )group	group_srcF)r  )num_physical_expertsnum_local_physical_experts)%r  r  r~  r  *vllm.model_executor.layers.fused_moe.layerr  r  r:   r   r@   nnModulerS  r   rh   ri   r  r  r   r  r   r  physical_to_logical_mapshapelogical_replica_counteplb_confignum_redundant_expertstensorrV  distributed	broadcast	cpu_groupitemr  r   Tensor update_physical_experts_metadata)rQ   rz  r{  r~  r  r  r  r  model_moe_modulesr  drafter_modeldrafter_moe_modulesr  new_physical_expertsr  !num_local_physical_experts_tensorglobal_expert_loads_anyr  r  r   s     `              @@@rT   _reconfigure_moezWorker._reconfigure_moe  s   	
 	
 	
 	
 	
 	
 	
 	
 	
 	

	
 	
 	
 	
 	
 	
 	
 	

 *:	58? 	tH~ 	 	 	 		DN 	s 	 	 	 	 	 	 	 	 	" ,OD,=,CDD-a0;M,.?@@@4$i00 	<W%w6
 6
 	< !-5;M$)>})M)M$"1/-"@"@ $A&1CGXXXX= YXX24EFFF$$):&$/;;;!,DJ1M ! %#.DJ1MN '= #'05"#5;u1 1 1- ''1"lnn. (   
 *--N-S-S-U-U)V)V&#=#K $/;;;&*&7&B&L&L % 'M ' '# #'tEL'9;R"S"S$':1'='CA'FF '= 	/.t/@/FGGG$22=AAA@@!5'A 	A 	
 	
 	
 #"rU   c                 ~   ddl m} ddlm}m}  |            j        } |            j        }|j        t                      j        z  t                      j        z  }||k     r| 
                    ||            |             |j        t          j        k    r
||k    sJ d S |                     |            || j                  5  t!          | j        | j        | j        | j                   d d d            n# 1 swxY w Y   |                     ||          }||k    r|J |                     |||           d S d S )Nr   )r   )cleanup_dist_env_and_memoryr~  )vllm.configr   r  r  r~  r  r<   r  r   r   r  r  r(   SHUTDOWN_CURRENT_RANKr  r:   r   r=   r;   r  r  )	rQ   r  r   r  r~  rz  r  r{  r  s	            rT   reinitialize_distributedzWorker.reinitialize_distributed\  s    	877777	
 	
 	
 	
 	
 	
 	
 	

 #lnn/"lnn)3nn'(nn'( 	
 $$((kBBB##%%% 3"89 9 +----F))*:;;;$$T%566 	 	/ 	,	  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 #33KMM$$&222%%k;@STTTTT %$s   'C99C= C=pathpatternmax_sizec                 X    ddl m} |                    | j        j        |||           d S )Nr   )ShardedStateLoader)r  r  ) vllm.model_executor.model_loaderr  
save_modelrh   ri   )rQ   r  r  r  r  s        rT   save_sharded_statezWorker.save_sharded_state  sP     	HGGGGG%%#	 	& 	
 	
 	
 	
 	
rU   tensorizer_configr5   c                 <    | j                             |           d S )N)r  )rh   save_tensorized_model)rQ   r  s     rT   r  zWorker.save_tensorized_model  s1     	/// 	0 	
 	
 	
 	
 	
rU   c                 t    t           t                       | j        | j                                         d S d S r   )r   rL   shutdownr   s    rT   r  zWorker.shutdown  s>    &2')))=$M""$$$$$ %$rU   )F)r3   r   )rW   N)T)NN)r  r5   rW   N)Br  
__module____qualname__r
   r   r9  boolrG   rl   rS  r{   r   r   r   r   r   dictr   r   r   r@   inference_moder   r   r*   r   r   r)   r   r)  r+  r  r  r-  rm   r    r/  floatr1  rB  r-   r+   rE  r[  r,   re  rj  rm  r   rp  r   r
  ru  rw  ry  r  r  r  r'   r  r  r  r  r  r  __classcell__)rS   s   @rT   r9   r9   D   s$        "''A 'A'A 'A 	'A
 "%'A 'A 'A 'A 'A 'A 'AR
 
3 
t 
 
 
 
03 3DI, 3 3 3 3 30!# !:P ! ! ! !:s :C :D : : : :[1 [1 [1~D D D D3tCH~ 3$ 3 3 3 3+ + + + UN9C N9 N9 N9 N9`#TD[ # # # #54[(8#9 5 5 5 5C# C$ C C C CCm C C C C C&x0 x0 x0 x0t+ + + +-29 - - - -7U=#+=%> 7 7 7 7<$sDeck9I4J/J*K < < < <B B B4 U?4?	3	3? ? ? ?
 UE 1E	3	3d	:E E E EN8md&: 8 8 8 8! ! ! ! ! != = = =8[ 8T 8 8 8 863 64 6 6 6 6.CH . . . .3 3 3 3 3 3   E3 ES ET E E E E*EE E "%,/$6	E
 
E E E E(
 =
	
 
 
 
6f#f#-0f#	el	d	"f# f# f# f#P+U =+U	+U +U +U +U` ##	
 

 t
 *	

 

 
 
 

-
 

 
 
 
% % % % % % % %rU   r9   r  ncclr:   r<   r=   r;   backendrW   c                    | j         }| j        }ddlm}  ||j                   t          |j                    |pd}t          |j        ||||           t          |j
        |j        |j        |j                   t          |            dS )z'Initialize the distributed environment.r   )init_batch_invariancezenv://N)attention_configr   *vllm.model_executor.layers.batch_invariantr  r  r   disable_custom_all_reducer   r  r   r   r   prefill_context_parallel_sizedecode_context_parallel_sizer   )	r:   r<   r=   r;   r  r  r   r  init_methods	            rT   r   r     s     #3!1OPPPPPP*2333oGGHHH)5XK "D+z7   &,.54	   #;/////rU   )Nr  r  )b__doc__r   r   
contextlibr   r   typesr   typingr   r   r   numpyrQ  r@   torch.distributedtorch.nnr  	vllm.envsrH   r  r	   r
   r   vllm.config.compilationr   vllm.distributedr   r   r   vllm.distributed.ec_transferr   vllm.distributed.kv_transferr   r   r   r   r  r   r   r   vllm.loggerr   vllm.lora.requestr   %vllm.model_executor.models.interfacesr   (vllm.model_executor.warmup.kernel_warmupr   vllm.platformsr   vllm.profiler.wrapperr   r   vllm.sequencer   
vllm.tasksr    vllm.utils.mem_utilsr!   r"   r#   vllm.utils.torch_utilsr$   vllm.v1.core.sched.outputr%   r&   vllm.v1.enginer'   r(   vllm.v1.kv_cache_interfacer)   r*   vllm.v1.outputsr+   r,   r-   vllm.v1.utilsr.   r/   vllm.v1.worker.utilsr0   vllm.v1.worker.worker_baser1   vllm.v1.worker.workspacer2   utilsr4   r  rn   +vllm.model_executor.model_loader.tensorizerr5   r   r7   r9   r   r9  r   r]   rU   rT   <module>r     s     				 				 : : : : : : : :       + + + + + + + + + +                      J J J J J J J J J J 3 3 3 3 3 3         
 H G G G G G                    
 $ # # # # # ) ) ) ) ) ) G G G G G G B B B B B B + + + + + + K K K K K K K K - - - - - - $ $ $ $ $ $ M M M M M M M M M M 2 2 2 2 2 2 D D D D D D D D M M M M M M M M A A A A A A A A         
 H G G G G G G G = = = = = = 1 1 1 1 1 1 ; ; ; ; ; ; ! ! ! ! ! !	X		 ?LLLLLL>>>>>>a% a% a% a% a%Z a% a% a%N +/0 00
0 !4Z0 	0
 0 
0 0 0 0 0 0rU   