
    .`i                    .   U d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	 d dl
mZ d dlmZmZ d dlmZ d dl mZ d dlmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d d	lmZ d dlmZ d d
l m!Z!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ d dlAmBZBmCZC d dlDmEZE d dlFmGZGmHZH d dlImJZJ d dlKmLZL d dlMmNZNmOZO d dlPmQZQmRZR d dlSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z] d dl^m_Z_m`Z`maZa d dlbmcZc d dldmeZemfZfmgZg d dlhmiZi d d ljmkZk d d!llmmZm d d"lnmoZo d d#lpmqZqmrZrmsZs d d$ltmuZu d d%lvmwZw d d&lxmyZymzZz d d'l{m|Z|m}Z} d d(l~mZ d d)lmZ d d*lmZmZ d d+lmZmZmZmZmZmZmZ d d,lmZ d d-lmZmZmZ d d.lmZ d d/lmZ d d0lmZmZmZmZmZmZmZmZmZmZmZ d d1lmZmZmZmZmZmZmZmZmZmZmZ d d2lmZmZ d d3lmZmZ d d4lmZ d d5lmZ d d6lmZ d d7lmZ d d8lmZ d d9lmZ d d:lmZ d d;lmZ d d<lmZ d d=lmZ d d>lmZ d d?lmZmZ d d@lmZ d dAlmZmZ d dBlmZ d dClmZ d dDlmZmZ d dElmZ d dFlmZ d dGlmZ d dHlmZmZmZmZ d dIlmZ d dJlmZ dKdLlmZmZmZmZmZ erd dMlmZ d dNlmZmZ  eEe          Zeeef         ZeedO<   ee         ez  ZeedP<    G dQ dRe          Z G dS dTe          Z G dU dVe          Z G dW dXeee֦          Ze G dY dZ                      ZdS )[    N)defaultdict)IteratorSequence)contextmanager)copydeepcopy)	dataclass)reduce)TYPE_CHECKINGAny
NamedTuple	TypeAliascast)tqdm)	AttentionMLAAttention)compilation_counter)CUDAGraphStatCUDAGraphWrapper)set_cudagraph_capturing_enabled)CompilationModeCUDAGraphMode
VllmConfigget_layers_from_vllm_configupdate_config)get_ec_transferhas_ec_transfer)	EplbState)get_kv_transfer_grouphas_kv_transfer_group)copy_kv_blocks)get_dcp_groupget_pp_groupget_tp_groupgraph_captureis_global_first_rank&prepare_communication_buffer_for_model)BatchDescriptorset_forward_context)init_logger)LoRAMappingLoRAMappingType)AttentionLayerBase)RoutedExpertsCapturer)MRotaryEmbeddingXDRotaryEmbedding)TensorizerLoaderget_model_loader)
MultiModalEmbeddingsSupportsMRoPESupportsMultiModalSupportsXDRoPEis_mixture_of_expertssupports_eagle3supports_mropesupports_multimodal_pruningsupports_transcriptionsupports_xdrope)VllmModelForPoolingis_pooling_modelis_text_generation_model)MULTIMODAL_REGISTRY)BatchedTensorInputsMultiModalKwargsItemPlaceholderRange)group_mm_kwargs_by_modality)PoolingParams)SamplingType)IntermediateTensors)GenerationTaskPoolingTaskSupportedTask)&length_from_prompt_token_ids_or_embeds)json_map_leaves)cdivround_up)DeviceMemoryProfiler
format_gib)PytHooks)is_pin_memory_available)get_dtype_sizekv_cache_dtype_str_to_dtype)AttentionBackendAttentionCGSupportAttentionMetadataAttentionMetadataBuilderAttentionTypeCommonAttentionMetadata
MultipleOf)GDNAttentionMetadataBuilder)"create_fast_prefill_custom_backendget_dcp_local_seq_lens+reorder_batch_to_split_decodes_and_prefills)NewRequestData)CudagraphDispatcher)AttentionSpecChunkedLocalAttentionSpecCrossAttentionSpecEncoderOnlyAttentionSpecFullAttentionSpecKVCacheConfigKVCacheGroupSpecKVCacheSpec	MambaSpecSlidingWindowSpecUniformTypeKVCacheSpecs)EMPTY_MODEL_RUNNER_OUTPUTAsyncModelRunnerOutputDraftTokenIdsECConnectorOutputKVConnectorOutputLogprobsListsLogprobsTensorsModelRunnerOutputPoolerOutputSamplerOutput&make_empty_encoder_model_runner_output)PoolingMetadataPoolingStates)LogitsProcessorsbuild_logitsprocs)LogitsProcessor)SamplingMetadata)RejectionSampler)Sampler)DraftModelProposer)EagleProposer)MedusaProposer)SpecDecodeMetadata)NgramProposer)SuffixDecodingProposer)apply_grammar_bitmask)CpuGpuBufferrecord_function_or_nullcontext)mamba_utils) check_attention_cp_compatibilityget_total_cp_world_size)coordinate_batch_across_dp)ECConnectorModelRunnerMixin)CachedRequestState
InputBatch)UBatchWrapper)KVConnectorModelRunnerMixin)LoRAModelRunnerMixin)UBatchSlicescheck_ubatch_thresholdsmaybe_create_ubatch_slicessplit_attn_metadata)is_residual_scattered_for_sp)lock_workspace   )AttentionGroupMultiModalBudget(add_kv_sharing_layers_to_kv_cache_groupsbind_kv_cachesanity_check_mm_encoder_outputs)TensorizerConfig)GrammarOutputSchedulerOutputAttnMetadataDictPerLayerAttnMetadatac                   j    e Zd Zdedej        dedz  dee         dej	        j
        defdZd	efd
ZdS )AsyncGPUModelRunnerOutputmodel_runner_outputsampled_token_idslogprobs_tensorsNinvalid_req_indicesasync_output_copy_stream
vocab_sizec                    || _         || _        t          j                    | _        || _        || _        || _        t          j        	                                }t          j        
                    |          5  |                    |           | j                            dd          | _        | j        r| j                                        nd | _        | j                                         d d d            d S # 1 swxY w Y   d S NcpuTnon_blocking)_model_runner_output_invalid_req_indicestorchEventasync_copy_ready_event_sampled_token_idsr   _logprobs_tensorscudacurrent_streamstreamwait_streamtosampled_token_ids_cputo_cpu_nonblocking_logprobs_tensors_cpurecord)selfr   r   r   r   r   r   default_streams           s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/worker/gpu_model_runner.py__init__z"AsyncGPUModelRunnerOutput.__init__   sJ    %8!$7! ',kmm# #4$!1 2244Z788 
	1 
	1$00@@@)-)@)C)CD *D * *D&
 )&99;;; &
 '..000
	1 
	1 
	1 
	1 
	1 
	1 
	1 
	1 
	1 
	1 
	1 
	1 
	1 
	1 
	1 
	1 
	1 
	1s   9A7C==DDreturnc                    | j         j        d         }| j                                         | `| `|dk    r`| j                                         }| j        D ]}||                                          d}| j	        | j	        
                                }n/t          j        | j         | j        | j        | j	                  \  }}| j        }||_        ||_        |S )zCopy the device tensors to the host and return a ModelRunnerOutput.

        This function blocks until the copy is finished.
        r   Nr   )r   shaper   synchronizer   r   tolistr   clearr   tolistsr~   parse_outputr   r   r   logprobs)r   max_gen_lenvalid_sampled_token_idsilogprobs_listsoutputs         r   
get_outputz$AsyncGPUModelRunnerOutput.get_output   s    
 06r:#//111 "#!&*&@&G&G&I&I#. 3 3'*002222!N)5!%!;!C!C!E!E6F6S*)!%!;	7 7 73#^ *#: (    )__name__
__module____qualname__rt   r   Tensorrs   listintr   Streamr   r    r   r   r   r      s        !1.!1 !<!1 *D0	!1
 "#Y!1 #(*"3!1 !1 !1 !1 !1F-      r   r   c                   R    e Zd Zdededee         dej        j	        fdZ
defdZdS )	 AsyncGPUPoolingModelRunnerOutputr   raw_pooler_outputfinished_maskr   c                    || _         t          j                    | _        || _        t          j                                        }t          j                            |          5  |                    |           t          d | j                  }| j        
                                 d t          ||          D             | j         _        d d d            d S # 1 swxY w Y   d S )Nc                 8    | d n|                      dd          S r   r   xs    r   <lambda>z;AsyncGPUPoolingModelRunnerOutput.__init__.<locals>.<lambda>  s    !)$$e$1O1O r   c                      g | ]\  }}|r|nd S Nr   .0outincludes      r   
<listcomp>z=AsyncGPUPoolingModelRunnerOutput.__init__.<locals>.<listcomp>#  s5     7 7 7 C (D7 7 7r   )r   r   r   r   _raw_pooler_outputr   r   r   r   rL   r   zippooler_output)r   r   r   r   r   r   raw_pooler_output_cpus          r   r   z)AsyncGPUPoolingModelRunnerOutput.__init__
  s,    %8! ',kmm# #4 2244Z788 
	 
	$00@@@$3OO'% %! '..0007 7$'(=}$M$M7 7 7D%3
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	s   $A)CC!Cr   c                 F    | j                                          | `| j        S )zCopy the device tensors to the host and return a ModelRunnerOutput.
        This function blocks until the copy is finished.
        )r   r   r   r   r   s    r   r   z+AsyncGPUPoolingModelRunnerOutput.get_output(  s*     	#//111 #((r   N)r   r   r   rt   ru   r   boolr   r   r   r   r   r   r   r   r   r   	  ss        . ( Dz	
 #(*"3   <)- ) ) ) ) ) )r   r   c                      e Zd ZU dZded<   ej        ed<   edz  ed<   edz  ed<   ej        ed<   ej        ed	<   e	ej                 dz  ed
<   e
dz  ed<   edz  ed<   eeej        f         e	eeej        f                  z  dz  ed<   dS )ExecuteModelStatezwEphemeral cached state transferred between execute_model() and
    sample_tokens(), after execute_model() returns None.r   scheduler_outputlogitsNspec_decode_metadata spec_decode_common_attn_metadatahidden_statessample_hidden_statesaux_hidden_statesec_connector_outputcudagraph_statsslot_mappings)r   r   r   __doc____annotations__r   r   r   rZ   r   rp   r   dictstrr   r   r   r   r   3  s         < < ('''L,t3333&=&DDDD<,&&&EL)D0000*T1111"T))))U\)*T$sEL7H2I-JJTQQQQQQr   r   c                   2   e Zd Zdedej        fdZdeddfdZddZ	 ej
                    dd	            Zd
efdZdddeej        z  dej        dedefdZd ZddZddZddZddZdej        ddddfdZdededefdZdefdZdefd Zddde fd!Z!d"ede fd#Z"	 dd
e#j$        d$e#j        dz  de%e#j$        e#j$        f         fd%Z&ddd&ed'e#j$        ddfd(Z'd)e(eef         d*e)d+ede%ej        dz  e#j$        dz  f         fd,Z*ddd)e#j$        de%ej        e+dz  f         fd-Z,	 	 	 	 	 	 	 	 	 dd
ed+ed/ed0edz  d1edz  d2e-dz  d3ej        dz  d4ed5ed)e(eef         dz  d6e.e.e                  dz  d7e(eej        f         dz  de%e/e0dz  f         fd8Z1d)e#j$        d9e#j$        d:e.e         de.e.e                  dz  fd;Z2d)e#j$        d9e#j$        d:ed*e)d<e3defd=Z4dd>Z5dd?Z6d@e#j$        dAe#j$        de+fdBZ7d3ej        dej        fdCZ8ddde%e.e         e.e9         e.e%ee:f                  f         fdDZ;ddde.ej                 fdEZ<	 ddddGede%e.ej                 ej        f         fdHZ=de>j?        fdIZ@de.eA         fdJZBde.eC         fdKZDde%eEdLf         fdMZFd
edNeGdz  dOedeGfdPZHddQedReddfdSZIdTej        d)edUe#j$        dVeJdz  deKeLz  f
dWZMd)edefdXZNd
ede%ej        dz  ej        f         fdYZO	 ddddZedNeGdz  de%ej        dz  ej        dz  ej        eGdz  e(eef         ePdz  f         fd[ZQd\ej        dz  d]e+dz  deRfd^ZSddd_eRd\ej        dz  dTej        d)ed]e+dz  de%e(eef         eTdz  e.e.e                  e(eeUdz  f         e.e         e(eef         e.e         f         fd`ZVeWda             ZX	 	 	 	 ddbej        dz  dcej        dz  dNeGdz  ddej        dz  dee(eef         defdfZYeZ	 ddgedhed
ed+ediedz  defdj            Z[	 	 	 	 	 dd
ed+edUe#j$        dgedkedledmediedz  dnedz  doede%e\e]eej        dz  e^dz  f         fdpZ_ddqZ`	 dd0ed1edred2dsde%e(eej        f         dz  e(eej        f         e.e(eej        f                  z  dz  f         f
dtZa ej
                    	 ddddNeGdz  deKeLz  eGz  dz  fdu            Zbej
        dvdwdeKeLz  eGz  fdx            Zcdeddz  fdyZe	 ddddzeddfd{Zfde%e.e.e                  e.e         f         fd|Zgd}ej        d~ej        ddfdZhde.e         fdZidddej        e.e.e                  z  dejdTej        dej        de.ej                 dz  d]e+dz  de0d7e(eej        f         e.e(eej        f                  z  dz  de.e.e                  ej        z  fdZkde(eef         ddfdZlddeddfdZmde%edLf         dz  fdZnddZo	 	 	 	 ddZpdTej        d)e(eef         de(eeUdz  f         fdZqd\ej        dz  de(eef         fdZreWdbej        dz  ddej        dz  fd            Zsdedede fdZt ej
                    	 	 	 	 	 	 	 	 	 	 dd
ede\dz  dededlededRededededede%ej        ej        f         fd            Zu ej
                    dTej        dej        fd            ZvdTej        deCdewfdZx ej
                    dTej        dewfd            ZyddZzdefdZ{de.e]         de\fdZ|de}ddfdZ~de}de.e         ddfdZde.eee                           de.e         ddfdZddZeZdede.e         defd            Zde}de.e         ddfdZde}de(eej        f         fdZdee         fdZdee         fdZde}de.e         fdZde}de(eej        f         de.e         de(eej        f         fdZde(eej        f         ddfdZde}de.e         de(eej        f         fdZde}ddfdZde}ddfdZd ZddZde(ee)f         fdZdej        de.e.e                  fdZde(ee(eeez  f         f         fdZeWdede.e%eef                  dedefdǄ            ZdS )GPUModelRunnervllm_configdevicec                 $   || _         |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j	        | _	        ddl
m}  |t          | j        j        dz                       | j        }| j        }| j        }| j        }|| _        t                      | _        | j        j        | _        t%          |j        | j                  | _        |j        dk    | _        |j        | _        |j        | _        d| _        |j        | _        | j        j        | _        | j        j        | _        | j        dk    rdnt=                      j        | _         |j!        | _"        |j#        | _$        | j        j%        dk    o#tM          tO                      j(                  dk    | _)        |*                    |          | _+        |,                                | _-        |j.        | _.        |j/        | _0        | j        j1         | _2        | j        j3        | _3        th          | _5        |j6        | _6        |j7        | _7        | j5        8                    |          | _9        | j        j:        r|j;        | _<        nd| _<        | j        j=        | _>        t          | j        j@                  | _A        d | _B        	 g | _C        d | _D        d | _E        g | _F        i | _G        d| _H        | j        rgtO                      jI        rS|  | j        jJ        d	k    rt          | j                   | _L        n| j        M                                r"t          | j         | j        | 
          | _L        n| j        jJ        dk    rt          | j                   | _L        n| j        P                                rBt          | j         | j        |           | _L        | j        jJ        dk    r| jL        jR        | _H        nM| j        jJ        dk    r!t          | j         | j                  | _L        nt          d| j        jJ                   t          | jA                  | _V        d| _W        | j        r?| j        jX        | _W        | j        jY        }||j        |j        | _Z        n| j        | _Z        i | _[        i | _\        t          j^        _                                | _`        |ja        }	|	t          |	          nd}
t          | j$        t          | j        | j<                  | j"        | j        | j        | j        e                                | j        jf        g| j        jf        gt          | j         j                  t          | j         | j        | j        | j        |
          t          |
          | j        | j        ji                  | _j        d | _k        d | _l        | j>        r;t          j^        _                                | _k        t          jm                    | _l        | j        jn        r8| j        jo        t          jq        k    rt          | j        jn                  | _s        | t                                 i | _u        t          jw                    | _x        | y                    | j"        t          jz                  | _{        | y                    | j"        t          j|                  | _}        | y                    | j$        dz   t          jz                  | _~        | y                    | j$        t          jz                  | _        | y                    | j$        t          jz                  | _        | j        dk    r+| y                    | j$        t          jz                  | _        | y                    | j"        | j-        | j        d          | _        | y                    | j"        t          jg                  | _        | y                    | j$        t          jg                  | _        | y                    | j$        t          jz                  | _        | y                    | j$        t          j|                  | _        | j9        rX| y                    | j"        t          jg                  | y                    | j"        t          jg                  g| _        d| _        | j6        r0| y                    d| j"        dz   ft          j|                  | _        | j7        dk    r5| y                    | j7        | j"        dz   ft          j|                  | _        d | _        t          j        t          | j$        dz   | j        | j"                  t          j|                  | _        i | _        t!                      | _        d | _        | j        j        r0t          j        | j"        t          jz        | j                  | _        d| jW        z   | _        t-          | j                   | _        | j9        rt1          | j         | j5                  nd | _        d | _        t!                      | _        d | _        d | _        t          jm                    | _        t          j        | j$        dft          j|        d| j                  | _        d | _        d | _        d | _        d | _        d | _        d | _        | jW        rt          jm                    | _        t          j^        _                                | _        t          j        | j$        | jW        ft          j|        d| j                  | _        | j>        rlt          jm                    | _        t          j^        _                                | _        t          j        | j$        t          j|        d| j                  | _        d | _        d | _        i | _        d| _        d S )Nr   )set_cpu_offload_max_bytes   @poolingFr   external_launcher)logprobs_modengram)r	  r
  runnersuffixeagle3medusa)r	  r
  z%Unknown speculative decoding method: r   )max_num_reqsmax_model_lenmax_num_batched_tokensr
  
pin_memoryr   block_sizeskernel_block_sizesis_spec_decodelogitsprocs!logitsprocs_need_output_token_idsr>   cp_kv_cache_interleave_sizedtype)r!  numpy   r!  r
  r   )r!  r
  r  )r	  model_configcache_configcompilation_configlora_configload_configparallel_configscheduler_configspeculative_configobservability_config vllm.model_executor.models.utilsr  r   cpu_offload_gbr
  rR   r  r!  rT   cache_dtypekv_cache_dtyperunner_typer>   enable_prompt_embeds"is_multimodal_raw_input_only_modelis_multimodal_pruning_enabledr  calculate_kv_scalesdecode_context_parallel_sizedcp_world_sizer"   rank_in_groupdcp_rankr  max_num_tokensmax_num_seqsr  distributed_executor_backendlenr#   ranksbroadcast_pp_outputget_num_attention_headsnum_query_headsget_inputs_embeds_sizeinputs_embeds_sizeattention_chunk_size
uses_alibi	use_alibidisable_cascade_attncascade_attn_enabledis_mm_prefix_lmr@   mm_registry
uses_mropeuses_xdrope_dimsupports_multimodal_inputssupports_mm_inputsis_encoder_decodermax_num_encoder_input_tokensmax_encoder_lenasync_schedulinguse_async_schedulingr   r  sampler
eplb_state	kv_cachescross_layers_kv_cachecross_layers_attn_backendattn_groupsencoder_cacheuse_aux_hidden_state_outputsis_last_rankmethodr   drafteruses_draft_modelr   r   	use_eagler   eagle3_use_aux_hidden_stater   
ValueErrorr~   rejection_samplernum_spec_tokensnum_speculative_tokensdraft_model_configeffective_drafter_max_model_lenrequestsnum_prompt_logprobsr   r   r   comm_streamlogits_processorstupler   maxget_vocab_size
block_sizer   r{   r  input_batchr   prepare_inputs_eventr   cudagraph_capture_sizescudagraph_moder   NONEsortedcudagraph_batch_sizes_init_device_propertiesencoder_timing_registry	threadingLock_encoder_timing_lock_make_bufferint32	input_idsint64	positionsquery_start_locseq_lensencoder_seq_lensdcp_local_seq_lensinputs_embedsis_token_idsdiscard_request_masknum_decode_draft_tokensnum_accepted_tokensis_mm_embed_buffersis_mm_embed_idxmrope_positionsxdrope_positionsintermediate_tensorsnparange	arange_npshared_kv_cache_layersset'kv_sharing_fast_prefill_eligible_layers&kv_sharing_fast_prefill_logits_indiceskv_sharing_fast_prefillzerosuniform_decode_query_lenra   cudagraph_dispatcherr   	mm_budgetreorder_batch_thresholdrunner_only_attn_layers_draft_token_ids_draft_token_req_idstransfer_eventemptysampled_token_ids_pinned_cpuvalid_sampled_token_count_event%valid_sampled_token_count_copy_streamdraft_token_ids_eventdraft_token_ids_copy_streamvalid_sampled_token_count_cpudraft_token_ids_cpuexecute_model_statekv_connector_outputmamba_state_idxlayerwise_nvtx_hooks_registered)r   r	  r
  r  r%  r&  r+  r*  draft_configrl  custom_logitsprocss              r   r   zGPUModelRunner.__init__F  s   
 ''4'4"-"@&2&2*: + <"-"@$/$D!NNNNNN!!#d&7&F&P"Q"QRRR((0.133&,
9$d&7
 
 !- 8I E$0$E!; 	/ .3*)7 $(#4#H "2O!0A55=??;X.E,9  =ATT .LNN())A- 	   ,CCOTT"."E"E"G"G$0$E!%0(,(9(N$N!#0@ /&1+;"&"2"M"M#
 #
 / 	% $4#PD  #$D  %)$9$J! T->-LMMM,0	 .0:>"HL&79 79,1)
 " !	D|~~'B !	D  &-88,T-=>>(99;; 1 $ 0;     
 (/8;;5d6FGG(2244 ,T-=t{DQQ*1X==@ 5 (/8;;- $ 0      !8.58 8   &6dl%C%CD" " 	J#'#:#QD 2EL'L,F,R7C7Q447;7I4 8: 46  :,,.. ):(9(EE#$$$2 	 &* d0$2FGG#'#6;(7799*56 $ 1 <= 0 CDD) %"  /33E.F.F!2(,(<(X/
 
 
8 CG% 9=!$ 	6,1J,=,=,?,?D)(-D% #;	'6-:LLL)/'?* *D&
 	$$&&& GI$$-N$4$4! **4+>ek*RR**4+>ek*RR#00!  1  
  
 ))$*;5;)OO $ 1 1$2C5; 1 W W""&*&7&7! '8 ' 'D# "..!8
RW / 
 
 !--d.A-TT$($5$5UZ %6 %
 %
! (,'8'8U[ (9 (
 (
$ $(#4#4U[ $5 $
 $
 
 " 	% !!$"5UZ!HH!!$"5UZ!HH(D$ $%D  ? 	 $(#4#4D'!+,EK $5 $ $D 
 !##$($5$5%t':Q'>?u{ %6 % %D!
 AE! !A%t'94;NOO(
 
 
 79#AD46:34 	:?+#5;t{; ; ;D7 )*D,@(@% %88H$I$I! &T-t/?@@@ 	 48$
 25$ HL6:!#kmm,1K"+	-
 -
 -
) DH,OS2 :>"EI(BF*8<  	).D&/4z/@/@/B/BD,',{"D$89k?	( ( (D$ ( 7<{}}4=BZ=N=N=P=P:5:[%+ #	6 6 62 >B =A /1/4,,,r   r  r   Nc                 l    || _         | j        r#| j        j        }||j         | j         | _        d S d S d S r   )r  r,  rg  rh  )r   r  r  s      r   update_max_model_lenz#GPUModelRunner.update_max_model_len  sR    *" 	J2EL#|'A'I7;7I444	J 	J'I'Ir   c                 J    | j         r| j                                          d S d S r   )r  reset_cacher   s    r   reset_mm_cachezGPUModelRunner.reset_mm_cache  s0    > 	)N&&(((((	) 	)r   c                    | j         j                            d          sdS t          | dg           }|D ]}||                                 d}d}| j        j        }|                                D ]\  }}t          |t          t          f          rd\  }}	|D ]Q}
t          ||
          r?t          ||
          }t          |t          j                  r|                    |           R|D ]Q}
t          ||
          r?t          ||
          }t          |t          j                  r|                    |	           RdS )a  
        Re-initialize the KV cache and FP8 scales after waking from sleep.
        1. Zero out the KV cache tensors to remove garbage data from re-allocation.
        2. Reset Attention layer scaling factors (_k_scale, _v_scale) to 1.0.
          If these are left at 0.0 (default after wake_up), all KV cache values
          become effectively zero, causing gibberish output.
        fp8NrW  )_k_scalek_scale)_v_scalev_scale)      ?r  )r&  r0  
startswithgetattrzero_r'  static_forward_contextitems
isinstancer   r   hasattrr   r   fill_)r   rW  cache_tensork_attr_namesv_attr_namesattn_layersnamemodulek_scale_valv_scale_valattrparams               r   init_fp8_kv_scalesz!GPUModelRunner.init_fp8_kv_scales  sr     ,77>> 	FD+r22	% 	% 	%L'""$$$..-D'--// 	5 	5LD&&9l";<< 5 ,4([ ) 5 5Dvt,, 5 ' 5 5%eU\:: 5!KK444 ) 5 5Dvt,, 5 ' 5 5%eU\:: 5!KK444+	5 	5r   
num_tokensc                 x   t          |t                    rV| j        r| j        j        d d d |f         S | j        dk    r| j        j        d d d |f         S | j        j        d |         S | j        r| j        j        d d |f         S | j        dk    r| j        j        d d |f         S | j        j        |         S Nr   )r  r   rL  r  gpurM  r  r  )r   r  s     r   _get_positionszGPUModelRunner._get_positions  s    j#&& 	2 @+/;J;??#a'',0KZK@@>%kzk22 ?+/:>>#a'',0J??>%j11r   T)r"  sizer!  r"  c                4    t          ||| j        | j        |dS )N)r!  r
  r  
with_numpy)r   r
  r  )r   r!  r"  r  s       r   r}  zGPUModelRunner._make_buffer  s.     ;
 
 
 	
r   c                    t          t          t          f                     }| j        s|S | j        j        }| j                                        }t          t          t          f                     }t          |          D ].\  }}|j	        "|j	        
                    d          x}	 |||<   /t          |          dk    r|S | j        j        d |         }g }	t          |          D ]c}|
                    |||                   }
t          j        ||                   |
k                                    }|	                    |           dt          j        |	                              | j                  |d<   |S )Ncompressed_token_type_idsr   r
  token_type_ids)r  r  r   r>   rq  num_reqsget_pooling_paramsr   	enumerateextra_kwargsgetr>  r  r  ranger   r  appendconcatr   r
  )r   model_kwargsr  pooling_paramstoken_type_id_requestsr   r  token_typesr  r  posidss               r   _init_model_kwargsz!GPUModelRunner._init_model_kwargs  s   CH~''$ 	 #,)<<>>!%c3h!1!1!.11 	8 	8HAu".$)$6$:$:;V$W$WW[ -8&q)%&&!++=$YhY/x 	' 	'A(,,Q<<C<,,388::C!!#&&&&).n)E)E)H)H; *I *
 *
%& r   r   r   c                     t          | j        j                  dk    rdS | j        t	          | j        || j                   dS dS )a[  
        Update the order of requests in the batch based on the attention
        backend's needs. For example, some attention backends (namely MLA) may
        want to separate requests based on if the attention computation will be
        compute-bound or memory-bound.

        Args:
            scheduler_output: The scheduler output.
        r   N)decode_threshold)r>  kv_cache_configkv_cache_groupsr  r_   rq  )r   r   s     r   _may_reorder_batchz!GPUModelRunner._may_reorder_batch3  sc     t#34499F'37  !%!=      43r   c                 z    t           j                            | j                  | _        | j        j        | _        dS )z;Initialize attributes from torch.cuda.get_device_propertiesN)r   r   get_device_propertiesr
  device_propertiesmulti_processor_countnum_smsr   s    r   rx  z&GPUModelRunner._init_device_propertiesM  s-    !&!A!A$+!N!N-Cr   c                 B    t           j                                         d S r   )r   r   r   r   s    r   _sync_devicezGPUModelRunner._sync_deviceS  s    
     r   c                    |j         D ]8}| j                            |d           | j                            |d           9|j         D ]}| j                            |           |j        D ]}| j                            |d           |j        	                                }| j        j
        	                                }|j        j        }|||z
  z
  }|D ]}| j                            |           g }|j        D ]}	|	j        }|| j        v r,|                     ||	          }
|                    |
           ?|	j        }|	j        }|rJ|j        t(          j        k    r5t-          j        | j                  }|                    |j                   nd}| j        rm|J |j        }|
J d            t;          t<          |                                           }|j         !                    |          }|"                    |           tG          ||	j$        |	j%        |	j&        ||||	j'        |	j(        g |	j)                  }
|
| j        |<   |r-|j*        &|j*        dk    r| j        j+        n|j*        | j        |<   | j,        r| -                    |
           | j.        dk    r| /                    |
           |                    |
           ta                      j1        }|j        }|j2        }| 3                                }ti          |j5                  D ]\  }}| j        |         }
|j(        |         }|j6        |         }||j        v }|j7        |         }| j        j
        8                    |          }|
j9        ri| j:        rb|d|
_9        nX| j        j;        J | j        j;        |         }||         dz
  }|
j9        |z
  }||z  }|
j<        =                    dg|z             ||
_(        |sx|j>        |         }|t          |          z   |
j@        z
  }|dk    r!|
j<                            |d                    nr|dk    r#|
j<        =                    || d                    nH|t          |
j<                  k     r0|
j<        |d= |$| j        jA        |         |z   } | | j        jB        |<   |s3|0t          |
j'        |          D ]\  }!}"|!=                    |"           n|J |J ||
_'        |A| j:        r#|dk    r|jD        |         }#|#| d         |
_<        |                    |
           "|| j        jE        |<   | | j        jF        G                    ||           |s6|}$|t          |          z   }%|| j        jH        ||$|%f<   |%| j        jB        |<   | j        I                    |
|           |D ]7}&| j        J                    |&           | j        I                    |&|           8| j        K                                 | L                    |           | j        M                                 dS )ab  Update the cached states and the persistent batch with the scheduler
        output.

        The updated states are used by the `_prepare_inputs` function to create
        the input GPU tensors for the model.

        The SamplingMetadata is updated and copied to the GPU if there is a
        new/resumed/paused/finished request in the batch.
        Nr  z!You did not set `task` in the API)req_idprompt_token_idsprompt_embedsmm_featuressampling_paramsr  	generator	block_idsnum_computed_tokensoutput_token_idslora_requestr   r   r   )Nfinished_req_idsri  poprj  rq  remove_requestfree_encoder_mm_hashesr[  num_scheduled_tokenskeysreq_id_to_indexscheduled_cached_reqsresumed_req_idsscheduled_new_reqsr  _update_streaming_requestr  r  r  sampling_typerF   RANDOM_SEEDr   	Generatorr
  manual_seedseedr>   taskr   r=   	get_modelpoolerget_pooling_updatesapplyr   r  r  r  r  r  r  prompt_logprobsr   rL  _init_mrope_positionsrM  _init_xdrope_positionsr#   r]  scheduled_spec_decode_tokens_get_valid_sampled_token_countr  req_idsnew_block_idsnum_output_tokensr  prev_num_draft_lenrT  prev_req_id_to_indexr  extendnew_token_idsr>  r  num_prompt_tokensnum_tokens_no_specr   all_token_idsnum_computed_tokens_cpublock_table
append_rowtoken_ids_cpuupdate_req_spec_token_idsadd_requestcondenser  refresh_metadata)'r   r   r  mm_hashscheduled_req_idscached_req_idsr  unscheduled_req_idsreqs_to_addnew_req_data	req_stater  r  r  r  model	to_updater]  req_datascheduled_spec_tokensvalid_sampled_token_countr   r  r  resumed_from_preemptionr  	req_indexprev_req_indexnum_acceptednum_rejectedr   num_new_tokensend_idxr  new_idsresumed_token_idsstart_token_indexend_token_indexrequests'                                          r   _update_stateszGPUModelRunner._update_statesV  s    '7 	7 	7FMfd+++$((6666 '7 	4 	4F++F3333 (> 	2 	2G""7D1111 -AFFHH)9>>@@*@P -0AO0ST
 * 	4 	4F++F333302,? ;	* ;	*L!(F&& ::6<PP	""9---*:O)8N  !#1\5MMM!O4;???	%%o&:;;;; 	$ 0%111%*'')L'''0$..2B2BCC!L<<TBB	///*!-!>*8(4 /-#&0$0$D!#)6  I %.DM&! ?#B#N '6"<< $//(8 (0  6**9555 #a''++I666y)))) $~~2#9 0 M %)$G$G$I$I!"8#344 i	Y i	YIAvf-I"*">q"A$215M&,0H&H# ( :1 =(8<<VDDI+ K0I K $34I00+@LLL%)%5%J6%RN#<^#Lq#PL#,#?,#NL'<7'.55rd\6IJJJ -@I) M !) 6q 9 (#m*<*<<y?SS  "Q&&.55mB6GHHHH#a''.55m^ODTDT6UVVV"S)C%D%DDD ./@/A/AB((:9E+,  FMD$7	B + 
4 ,.1)2E}.U.U 2 2*	7!((1111 ((($000 '4	# 
 , X1BQ1F1F )1(>v(F%1BDUCUCVCV1WI.""9--- CVD4Y?( ,77yQQQ   Q$7!"5M8J8J"J "  .0@@ BQ 3I> 66yBWXXXX # 	W 	WG((11166w@UVVVV 	!!### 0111))+++++r   r  c           
      b   | j         r| j        j        sdS t          j        |t          j        |                    d          dfd|j                  gd          dk                                    	                    d          
                                                                }t          |          D ]\  }}|| j        j        |<   | j        j        dk    rQt#          j        || j        | j        | j        | j        | j        j        | j                                                   dS dS )a  Update the cached states after model execution.

        This is used for MTP/EAGLE for hybrid models, as in linear attention,
        only the last token's state is kept. In MTP/EAGLE, for draft tokens
        the state are kept util we decide how many tokens are accepted for
        each sequence, and a shifting is done during the next iteration
        based on the number of accepted tokens.
        Nr   r   r   r  dimalign)r,  r%  	is_hybridr   catfullr  r
  r   argmaxr   r"  r  rq  num_accepted_tokens_cpur&  mamba_cache_moder   postprocess_mambar  ri  r  r'  r  r3  get_mamba_state_copy_func)r   r  r   r  r   r  s         r   "_update_states_after_model_executez1GPUModelRunner._update_states_after_model_executeG  sM    & 	d.?.I 	F
 	(
-22155q9#3#:   
 
 
  SUUVBZZSUUUWW% 	( '':;; 	E 	EMAz:DD4Q77-88) $ $'>
4466     98r   r  r1  c                    | j                             |           | j        |         }|j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |j	        |_	        t          |j        |j                  |_        |j                                         | j        r|                     |           |S )aj  Updates streaming session request from `scheduled_new_reqs`.

        Removes the request from InputBatch (if present), updates the cached
        state, and prepares it for re-addition to the batch.

        NOTE: prompt_token_ids includes intermediate output tokens - tokens
        previously generated but now are input context (part of the prompt).
        )rq  r  ri  r  r  r  r  r  r  r  rK   r!  r  r   rL  r  )r   r  r1  r2  s       r   r
  z(GPUModelRunner._update_streaming_requestw  s     	''///M&)	%1%B	" , 8	"."<	$0$@	!#/#>	 *4	(4(H	%&L&	(?'
 '
	# 	"((***? 	2&&y111r   r2  c                    |                                  }t          |          s
J d            |j        
J d            t          t          |          }|                    |j        |j                  \  |_        |_        d S )Nz"M-RoPE support is not implemented.z1M-RoPE requires prompt_token_ids to be available.)	r  r9   r  r   r4   get_mrope_input_positionsr  r  mrope_position_delta)r   r2  r3  mrope_models       r   r  z$GPUModelRunner._init_mrope_positions  s      e$$JJ&JJJ$)55? 655 =%00 11*%  	B	!9#A#A#Ar   c                     |                                  }t          t          |          }|j        
J d            t	          |          s
J d            |                    |j        |j                  |_        d S )Nz2XD-RoPE requires prompt_token_ids to be available.z#XD-RoPE support is not implemented.)r  r   r6   r  r<   get_xdrope_input_positionsr  r  )r   r2  r3  xdrope_models       r   r  z%GPUModelRunner._init_xdrope_positions  s      NE22)55@ 655 u%%LL'LLL%%1%L%L&!&
 &
	"""r   c                 ,   |r| j         si S t          t                               }|j        D ]-}|j        D ]#}|j        |                    |j                   $.i }t          || j        | j	                  D ]\  }}}|
                    |           |S )Nr
  r  )r4  r   rB   r	  r  datar  rD   r
  r  update)r   r   	mm_kwargsreqfeaturemm_kwargs_combined_mm_kwargs_groups           r   _extract_mm_kwargsz!GPUModelRunner._extract_mm_kwargs  s       	t'N 	I-.00	#6 	3 	3C? 3 3<+$$W\2223
 35%@;&
 &
 &
 	7 	7!Aq/
 %%o6666!!r   num_seqsc                 ~    | j         si S | j        }|J |                                }|                     ||          S r   )r4  r  get_modality_with_max_tokens_get_mm_dummy_batch)r   re  r  dummy_modalitys       r   _dummy_mm_kwargszGPUModelRunner._dummy_mm_kwargs  sL    6 	IN	$$$"??AA''AAAr   cumsum_dtypec                     t          j        ||          }|d         }t          j        ||z
  |          }| j        d|         |z
  }||fS )zGet the cumulative sum and batched arange of the given array.
        # E.g., [2, 5, 3] -> ([2, 7, 10], [0, 1, 0, 1, 2, 3, 4, 0, 1, 2])
        # Equivalent to but faster than:
        # np.concatenate([np.arange(n) for n in num_tokens])
        r   r   N)r  cumsumrepeatr  )r   r  rk  cu_num_tokenstotal_num_tokenscumsums_offsetsr  s          r   _get_cumsum_and_arangez%GPUModelRunner._get_cumsum_and_arange  s\     	*LAAA(,)MJ$>
KK 1!1 12_Df$$r   total_num_scheduled_tokensro  c                    | j         j        W| j                            |           | j        r4| j                            |           | j                            |           dS | j         j        }|J g }g }g }g }d}	d}
d}|j        }| j         j	        
                                D ]\  }}|                    |          x}|                    |           t          |                    |d                    }||z  }||                                         dz
  }|                    ||z
             |                    t!          ||z
  dz   |dz                        || j        z  }|                    t!          |||z                        |	||k    z  }	t%          |
|          }
t          |          }||z
  }||k     rU| j                            |           | j        r4| j                            |           | j                            |           |dk    rdS |	r`|
|dz
  k    rW| j        j        d|                             | j         j        d|df         d           | j        rd| j        j        d|<   dS t+          j        |t*          j        | j                                      | j        d          }t+          j        |t*          j        | j                                      | j        d          }| j        j                            d|| j         j        |df         	           | j        |sdS t;          | j        t*          j                  sJ t+          j        |t*          j        | j                                      | j        d          }t+          j        |t*          j        | j                                      | j        d          }| j                            t*          j        
          }| j        j                            d||                                 |         	           dS )a  Prepare the input IDs for the current batch.

        Carefully handles the `prev_sampled_token_ids` which can be cached
        from the previous engine iteration, in which case those tokens on the
        GPU need to be copied into the corresponding slots into input_ids.NTr   r   r   r   r   )r!  r  )rG  indexsrcr   )!rq  prev_sampled_token_idsr  copy_to_gpur3  r  r  r  r  r  r  r  r  r>  itemr  r  re  rn  r  copy_r   tensorr  r  r   r
  scatter_r  r  r   r~  flatten)r   r   rs  ro  r  sample_flattened_indicesspec_flattened_indicesprev_common_req_indicesprev_draft_token_indicesindices_matchmax_flattened_indextotal_num_spec_tokensr6  r  	cur_index
prev_index	draft_lenflattened_indexstartnum_commmon_tokenstotal_without_specsampled_tokens_index_tensorprev_common_req_indices_tensordraft_tokens_index_tensorprev_draft_token_indices_tensordraft_token_idss                             r   _prepare_input_idsz!GPUModelRunner._prepare_input_ids  s    2:N&&'ABBB( J"../IJJJ!--.HIIIF
  $/D#///.0 ,.-/.0   ! 0 M!%!1!A!G!G!I!I 	P 	PFI266v>>>
K'..z:::   5 9 9&" E EFF	%2%"/	":"?"?"A"AA"E )//)0KLLL&--/I59?Q;NOO   #T%99 )//eUY=N0O0OPPP!>>&)*=&O&O# !9::7:OO 222 N&&'ABBB( J"../IJJJ!--.HIII"" F 	05G!5KLL
 N2 22399 78K9K8KQ8NO! :    ( B=A!%&9'9&9:F&+l$EKDO'
 '
 '

"T[t"
,
, 	$ */#5;4?*
 *
 *

"T[t"
,
, 	' 	##- 7.1 	$ 	
 	
 	
  (0F(F$/>>>>>$)L"%+$/%
 %
 %

"T[t"
,
, 	" +0,$EKDO+
 +
 +

"T[t"
,
, 	( /222EE##+''))*IJ 	$ 	
 	
 	
 	
 	
r   r  kv_cache_specr  c                    t          |t                    sdS d| j        j        d |<   |D ]e}| j        j        |         }| j        |         }|j        d| j        j        |<   8t          d |j        D                       }|| j        j        |<   f| j        	                    |           | j        j
        d |         }| j        j        d |         }	||	fS )NNNr   c              3   .   K   | ]}|j         j        V  d S r   )mm_positionlength)r   r`  s     r   	<genexpr>z7GPUModelRunner._get_encoder_seq_lens.<locals>.<genexpr>{  s9       ' '/6#*' ' ' ' ' 'r   )r  rd   r  r  rq  r  ri  r  sumrx  r  )
r   r  r  r  r  r9  r2  encoder_input_tokensr  encoder_seq_lens_cpus
             r   _get_encoder_seq_lensz$GPUModelRunner._get_encoder_seq_lensd  s    -);<< 	: /0 (+ + 	G 	GF(8@If-I$,67%(3
 $' ' ':C:O' ' ' $ $  3GD!$Y//))(33304YhY?#47		B!555r   c                     |j         }|dk    sJ  j        j        }|dk    sJ  j        j                            |           t          j         j        d|         |          }                     |          \  }} j	        j        d|         }t          j
         j        j        |         ||            j        r                     |            j        dk    r                     |           || j        j        j        d         z  z   }	t%          j        |	          }
t%          j         j        j                                        d|
 j        j        d|                     j        rH j        j                                        }t%          j        |d|
 j        j        d|                     j        j        rd}t;          |          D ]}||         }| j        j        vr||z  }|dk    r||z  }* j        j        |         } j        j        |         }||j        d         k    r||z  }e||z   }t=          ||j        d                   }||z
  }|dk    r2 j        j        |||z                                 |||                    ||z  }ȉ j        j        !                    ||            j        j        "                    |           d j#        j        d<   | j#        j        d|dz   <    j#        j        |dz   d         $                    |d                     j#        %                                  j#        j&        d|dz            } j        j        d|         |z    j'        j        d|<    j'        j        |d         $                    d            j'        %                                  fd j        j(        D             }t          j)        |t
          j*                  } j'        j        d|         |k      j+        j        d|<    j+        %                    |            ,                    |||            j        rD j-        j&        ddd|f                               j-        j        ddd|f         d	           ni j        dk    rD j.        j&        ddd|f                               j.        j        ddd|f         d	           n j	        %                    |           t_          |j0                  dk    }|s1|dd         dz
  }d}t          j1        |t
          j*                  }n/t          j2        |t
          j*                  }t          j3        |dt
          j*                  }|j0        4                                D ]a\  }} j        j5        |         }t_          |          ||<    j        j        |          j        j6        |         k    rt_          |          ||<   b 7                    ||          }|j8        }|dz   }| j9        j        d|<    j9        j        |d         $                    d            j9        %                                  j:        rEt          j;        |           j<        j=        j>        k    sJ  ?                     j        ||           ||fS )
z]
        :return: tuple[
            logits_indices, spec_decode_metadata,
        ]
        r   N)r   r   r   c                 4    g | ]}j         |         j        S r   )ri  r  )r   rr   s     r   r   z2GPUModelRunner._prepare_inputs.<locals>.<listcomp>
  s#    TTTadmA&1TTTr   r   Tr   )@rs  rq  r  r%  commit_block_tabler  rn  r  rr  r  addr$  rL  _calc_mrope_positionsrM  _calc_xdrope_positionsr'  r   r   
from_numpyindex_selecttoken_ids_cpu_tensorr}  r  r   r3  is_token_ids_tensorr  req_prompt_embedsr  minr  rz  compute_slot_mappingcommit_slot_mappingr  fillrx  r  r  r  arrayr~  r  r  r  r  r>  r  onesr  rK  r  r  r!  _calc_spec_decode_metadatalogits_indicesr  r(  r  r	  r+  r  set_active_loras)r   r   r  rs  r  req_indicesro  r  positions_nptoken_indicestoken_indices_tensorr  
output_idxreq_idx	num_sched
req_embeds	start_posend_pos
actual_endactual_num_schedr  r  num_tokens_npuse_spec_decoder  r   num_sampled_tokensnum_draft_tokensr  r  r  s   `                              r   _prepare_inputszGPUModelRunner._prepare_inputs  s    &6%P")A----#,!|||| 	$77AAA iyy 9;OPP !% ; ;<P Q Qv ~()D*D)DE
4[A	
 	
 	
 	
 ? 	9&&'7888 !##''(8999 ;)9)G)Ma)PPP 	  %/>>
 	199;; "#>$>#>?		
 	
 	
 	
 $ 	+?GGIIL$%)*E+E*EF	    - !	(J ?? ( (09	 $"2"DDD)+J >>)+J!-?H
 ,DWM	 
 0 333)+J $i/ **:1*=>>
#-	#9 #a''&*"Z2B%BBeJy';<===i'

$99+|TTT$889STTT &'"4AHqL 01 	1/44]25FGGG((***.2>X\>B 4YhY?BVV 	(# 	#((+++!!###TTTT4;K;STTT
28<<<
 MYhY'-7 	!$YhY/ 	!--h777 	&	
 	
 	
 ? 	C $QQQ(C)C(C%CDJJ$(,G-G,G)GH! K     !A%%!%aaa)D*D)D&DEKK%)!!!-H.H-H*HI! L     N&&'ABBB.KLLqP $	7 -QRR014N#' !#!B!B!B
  "xAAA ')gh"(&K&K&K# ">DDFF
L 
L *:6B,/,@,@ )$<WE'9'BC C 8;?7K7K+G4#'#B#B -$ $  2@N!1A!59PD(+IXI6(+HII6;;B???(44666  	)**#4KL L L L !! "68J  
  
 	
r   Fmax_query_lennum_tokens_paddednum_reqs_paddedubatch_slicesr  r  for_cudagraph_capturecascade_attn_prefix_lensr  c                 
   	$%& t           j        j                  dk    ri dfS p|pJ i $|&d t          t          |                    D             $	r j        }n8 j        j        d                                                                         }rc j	        j
        d          j        j        d<    j        j        d                             d            j                                          j        j        &dt          f& fd}|J  |d          }|d         } j        j        r3|d|                                                                          _        t)           j        j        ddz             j        j        ddz             j        j        d          j        j        d          j	        j        d         ||||d          } j        dk    rt3           j        j        d          j         j         j        j                   j        j        d<    j        j        d                             d            j                                        j        j        d         |_         j        j        d         |_        |@ j         j!        r4|"                    d          |_#         $                    |          |_%        i %	 ddt          d	t          d
t(          dt          dz  ddf
$%	& fd}d}tM          &          D ]\  }}tO          |          } (                    |
pi |j)                  \  |_*        |_+        |dk    r ||          |_,        ||         |_         j-        r:|8t]           j/        t`                    r j/        j1        d         |j2        v r|}n|}t          t           j3        |                             D ]C}|2tM          ti          ||                    D ]\  }} |||||           6 ||||           D j5        ri } j	        j6        D ]b}g } j7        |         }|j8        D ]2}|j9        }|:                                } |;                    |            3 j	        j<        |         }!|||!<   ct]          $tz                    r$$D ] }"|">                                D ]	}#||#_?        
!n$>                                D ]	}#||#_?        
|"k    s|k    r|@                    |          }$|fS )zQ
        :return: tuple[attn_metadata, spec_decode_common_attn_metadata]
        r   Nc                 *    g | ]}t                      S r   )r  r   rb  s     r   r   z<GPUModelRunner._build_attention_metadata.<locals>.<listcomp>}  s    GGGTVVGGGr   r   kv_cache_gidc                 2   J |          j         }t          |t                    r)t          j        dft          j        j                  }n'j        j        |          }|	                              }|         
                    d           |S )Nr   r$  r   )r  r  re   r   r  r~  r
  rq  r%  get_device_tensorr  )	r  r  blk_table_tensor	blk_tabler  r  r  r  r   s	       r   _get_block_tablezBGPUModelRunner._build_attention_metadata.<locals>._get_block_table  s    ".3D3P3PP+L9GM-)ABB P#(;$a(+;$ $ $   !,8F	#,#>#>#O#O  Xo56<<R@@@##r   T)r  query_start_loc_cpur  _seq_lens_cpu_num_computed_tokens_cpur  num_actual_tokensr  max_seq_lenblock_table_tensorslot_mappingcausalattn_gidcommon_attn_metadataubidr   c                 ,   j         |          |         }|                    |pd          }|          j        }t          |t                    r|j        |j        d                  }|t          |          f}r|          |         nd}i }	rVt          |t                    rA|
J d            t          j
        j        d          j        j        d                    }	r|                    |          }
nO|v r/|j        r(|                    |         |j        |j                  }
n |j        d||d|	}
|j        r|
|<   |t          t                    sJ }nt          t(                    sJ |         }|j        D ]}|
||<   d S )Nr   z$UBatching not supported with GDN yet)r  num_decode_draft_tokens_cpu)common_prefix_lenr  r   )rZ  get_metadata_builderr  r  rl   kv_cache_specslayer_namestyper\   r  r  r  r  r   build_for_cudagraph_capturesupports_update_block_tableupdate_block_tabler  r  buildr   )r  r  r  r  
attn_groupbuilderr  	cache_keycascade_attn_prefix_lenextra_attn_metadata_argsattn_metadata_iattn_metadata_dict
layer_nameattn_metadatacached_attn_metadatar  r  r  r  r   r  s                r   _build_attn_group_metadatazLGPUModelRunner._build_attention_metadata.<locals>._build_attn_group_metadata  s.    ),7AJ 55dia@@G+L9GM-)@AA X - <Z=STU=V W&W6I ,(6x@@ $ (*$ :g7R#S#S ||%K|||+/(,(@(DEUoEU(V040L0P((1, , ,( % F")"E"E(# # 1117 2 #*"<"<(3(;(5# # #0'- #&=)=# # /# #
 6 F6E(3|!-66666%2""!-66666%24%8"(4 A A
1@":..A Ar   r   )Ar>  r  r  r  r  r  r  rn  ry  rq  rM  r  r  rx  r   r%  enable_return_routed_expertsr   r"  r  rZ   r  r  num_computed_tokens_cpu_tensorr8  r^   r:  r*  r  r  r  dcp_local_seq_lens_cpur&  r  r  num_logits_indices _prepare_kv_sharing_fast_prefilllogits_indices_paddedr  r   r  r  r  r  r  r,  r  r_  r   attn_layer_namesr  rZ  r   rJ  r  ri  r  r  extract_embeds_ranger  r  r   valuesmm_prefix_rangeunpadded)'r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  block_table_gid_0slot_mapping_gid_0cm_baser  r   r  kv_cache_groupcmr  r  _cmreq_doc_rangesr  image_doc_rangesr2  
mm_featurepos_infoimg_doc_ranger  ub_metadata	_metadatar  r  r  s'   ` ` ``  `` `                        @@@r   _build_attention_metadataz(GPUModelRunner._build_attention_metadataa  s   & t#34499t8O-;)5X*/@/L/LL.0$GGU3}3E3E-F-FGGGM  	C ,KK-*9H9599;;@@BBK 	3 8(C $'		2 $'		277:::$00222.>	$3 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$$ (((,,Q//*1-9 	N 2;J; ? C C E E K K M MD) 045J7J5JK $ 4 89N?Q;N9N O]&'7'78-+,<_,<=%)%5%T  & %/'#0+
 
 
" ""5K!)8),#$@	6 6D#'		2 #'		288;;;#//@@@)-)@)DEUoEU)VG&-1-D-H  .G* %$*;*S%)7)<)<Q)?)?G&,0,Q,Q- -G)  	  $	;	A ;	A;	A;	A #:;	A *	;	A
 ;	A ;	A ;	A ;	A ;	A ;	A ;	A ;	A ;	A ;	A ;	A ;	A ;	A~ ,0(,5o,F,F 	K 	K(L.gB <@;U;U$*,< <8B!8
 a(8(8(F(F%"/"=& :+K+SdlM:: :|4Q7>;UUU;=8794!#d&6|&D"E"EFF K K ,%./B=RT/U/U%V%V V V	c22<3PTUUUUV /.|XrJJJJK  	?N*2 ; ;#%  M&1	"+"7 ; ;J)5H$,$A$A$C$CM$++M::::*:6B*:w''-.. ?#0 C CK%0%7%7%9%9 C C	4B	11CC "/!5!5!7!7 ? ?I0>I--+7'':9J+J+J 199*hOO - >>>r   r  num_common_prefix_blocksc           
         d}t          | j        j                  }d t          |          D             }t          |          D ]}| j        |         D ]y}t          |j        t                    rd}	n6|                     ||||         |j        |	                                          }	||         
                    |	           ||	dk    z  }z|r|ndS )z
        :return: Optional[cascade_attn_prefix_lens]
            cascade_attn_prefix_lens is 2D: ``[kv_cache_group_id][attn_group_idx]``,
            None if we should not use cascade attention
        Fc                     g | ]}g S r   r   r  s     r   r   zDGPUModelRunner._compute_cascade_attn_prefix_lens.<locals>.<listcomp>b  s%     5
 5
 5
B5
 5
 5
r   r   N)r>  r  r  r  rZ  r  r  re    _compute_cascade_attn_prefix_lenr  r  )
r   r  r  r  use_cascade_attnnum_kv_cache_groupsr  r  r  r  s
             r   !_compute_cascade_attn_prefix_lensz0GPUModelRunner._compute_cascade_attn_prefix_lensT  s    !!$"6"FGG5
 5
1225
 5
 5
  ""566 	@ 	@L".|< @ @
j68PQQ 
./++ /3.S.S,+0>"0"7799/ /+ )6==>UVVV $;a$??  @ ,<E''Er   attn_metadata_builderc                    ||j         z  }|dk    rdS t          ||                                          }||j         z  |j         z  }t          |t                    pt          |t                    o|j        du}t          |t                    pt          |t                    o|j        du}t          |t                    sJ |	                    ||| j
        |j        | j        ||| j        | j        	  	        }	|	r|ndS )a6  Compute the length of the common prefix for cascade attention.

        NOTE(woosuk): The common prefix length returned by this function
        represents the length used specifically for cascade attention, not the
        actual number of tokens shared between requests. When cascade attention
        is disabled (use_cascade=False), this function returns 0 even if
        requests share common tokens. Additionally, the common prefix length is
        truncated to a multiple of the block size and may be further truncated
        due to implementation details explained below.

        Args:
            num_scheduled_tokens: Number of tokens scheduled per request.
            num_common_prefix_blocks: Number of shared KV cache blocks.

        Returns:
            int: Length of common prefix in tokens.
        r   N)	r  
query_lensrB  num_kv_headsrG  use_sliding_windowuse_local_attentionr  r8  )rp  r  r  rk   rf   sliding_windowrc   rE  rb   use_cascade_attentionrB  r  rG  r  r8  )
r   r  r  r  r  r  r  r  r  use_cascades
             r   r  z/GPUModelRunner._compute_cascade_attn_prefix_lenx  s;   4 5}7OO!!1R   13F3J3J3L3LMM !99M<TT 	 (7HII 
}&788 9,D8 	 )8QRR 
}&788 ?2$> 	 -77777+AA/+ 0&3n1 3L. B 

 

 %06  Q6r   c                 r   d}t          | j        j                  D ]\  }}| j        |         }|j        J | j        j        |         }|j        |         }t          |j        |j	                  }||z   |k    r't          d||z
            }	t          d||	z
            }
n|}	d}
||	|
z   k    sJ |	dk    r9|}||	z   }|}||	z   }|j        d d ||f         | j        j        d d ||f<   ||	z  }|
dk    r@|}||
z   }|j        J t          j        | j        j        ||j        ||	z   |
           ||
z  }d S )Nr   )r   
out_offsetrU  context_lenr=  )r  rq  r  ri  r  r$  r  rK   r  r  rn  r   rU  r/   get_next_input_positions_tensorr  )r   r   mrope_pos_ptrru  r  r_  r  r  r!  prompt_part_lencompletion_part_len	dst_startdst_end	src_startsrc_ends                  r   r  z$GPUModelRunner._calc_mrope_positions  s   &t'7'?@@ -	5 -	5ME6-'C&222"&"2"J5"Q#3#H#P  F$c&7! ! #%99<MMM"%a):=P)P"Q"Q&)!-AO-S&T&T##"6&'#'?=P+PPPPP"")	'/9/	-?ADATAAy((B$(Ig,=)=> 0"Q&&)	'*==/;;; @,/(),)A 3o E#6    !44[-	5 -	5r   c                 T   d}t          | j        j                  D ]
\  }}| j        |         }|j        J | j        j        |         }|j        |         }t          |j        |j	                  }||z   |k    r't          d||z
            }	t          d||	z
            }
n|}	d}
||	|
z   k    sJ |	dk    r9|}||	z   }|}||	z   }|j        d d ||f         | j        j        d d ||f<   ||	z  }|
dk    r1|}||
z   }t          j        | j        j        |||	z   |
           ||
z  }d S )Nr   )r   r  r  r=  )r  rq  r  ri  r  r$  r  rK   r  r  rn  r   r0   r  r  )r   r   xdrope_pos_ptrru  r  r_  r  r  r!  r!  r"  r#  r$  r%  r&  s                  r   r  z%GPUModelRunner._calc_xdrope_positions
  s   &t'7'?@@ +	6 +	6ME6-'C'333"&"2"J5"Q#3#H#P  F$c&7! ! #%99<MMM"%a):=P)P"Q"Q&)!-AO-S&T&T##"6&'#'?=P+PPPPP""*	(?:/	-?BEBVAAy((C%)!!!Yw->*>? /1"Q&&*	(+>>!A-0( 3o E#6	    "55W+	6 +	6r   r  cu_num_scheduled_tokensc           	      p   |dz   }|                      |t          j                  \  }}t          j        ||z
  |          }||z  }|dz
  }|                      |t          j                  \  }}t          j        ||z
  |          }	|	|z  }	t	          j        |                              | j        d          }t	          j        |                              | j        d          }t	          j        |                              | j        d          }t	          j        |	                              | j        d          }	t	          j        |                              | j        d          }| j        j	        |         }
|
|	dz            }
t          |
|                                |||	||          S )Nr   )rk  Tr   )r  r  cu_num_draft_tokenscu_num_sampled_tokenstarget_logits_indicesbonus_logits_indicesr  )rr  r  r~  rn  r   r  r   r
  r  r  r   r   )r   r  r)  r  r,  r  r  r.  r+  r-  r  s              r   r  z)GPUModelRunner._calc_spec_decode_metadata9  s   " .1 )-(C(CRX )D )
 )
%v #&88:L
 
 	&   5q8
 '+&A&A28 'B '
 '
#V !#	!$668H!
 !
 	' $./BCCFFKd G 
 
 !& 01F G G J JKd !K !
 !
 ).99<<Kd = 
 
 !& 01F G G J JKd !K !
 !
  %/0DEEHHKd  I  
  
 .,^<)*?!*CD!+-4466 3"7"7!5)
 
 
 	
r   c                 d   | j         J |j        d         }|dk    sJ | j         d |                             |           | j         |d                              |d                                                    | j                            |d          \  }}|j        }| j         d |         }|S )Nr   r   Tdisable_full)r  r   rz  r  ry  r  dispatchr  )r   r  
num_logitsrb  
batch_descnum_logits_paddedr  s          r   r  z/GPUModelRunner._prepare_kv_sharing_fast_prefill  s     :FFF#)!,
A~~~~3KZK@FF~VVV
 	3JKK@FF2##%%	
 	
 	
 1::T ; 
 
: '1 $ K!
 %$r   c                    |j         }|sg g g fS t          t                               }t          t                               }t          t          t          t
          f                              }|                                D ]|\  }}| j        |         }|D ]g}	|j        |	         }
|
j	        |
                    |
j                   |
                    |
j	                   |
                    ||
j        f           h}|||fS )a  Batch multimodal inputs from scheduled encoder inputs.

        Args:
            scheduler_output: The scheduler output containing scheduled encoder
                inputs.

        Returns:
            A tuple of (mm_hashes, mm_kwargs, mm_lora_refs) where:
            - mm_hashes: List of multimodal hashes for each item
            - mm_kwargs: List of multimodal kwargs for each item
            - mm_lora_refs: List of (req_id, placeholder_range) for each item
        )scheduled_encoder_inputsr   r  rB   rm  rC   r  ri  r  r\  r  
identifierr  )r   r   r7  	mm_hashesr^  mm_lora_refsr  encoder_input_idsr2  mm_input_idr  s              r   _batch_mm_inputs_from_schedulerz.GPUModelRunner._batch_mm_inputs_from_scheduler  s   ( $4#L ' 	r2:IKK	-.00	 E#'7"789;;)A)G)G)I)I 
	F 
	F%F%f-I0 F F&2;?
?*  !6777  111##VZ-C$DEEEEF )\11r   c           
                                |          \  }}}|sg S t           j        o j        j        o|j                  }t          t           j                  } j        r/ j	        
                                rg }g }t                      }	g }
|D ]\  }} j        j        |         }t           j        j        |                   } j                            |j                  }|                    |           |                    |g|z             |
                    |           |dk    r6 j        j                            |          }||	                    |           t/          t1          |          t1          |          dt2          j                  } j	                            |	|           t9           j        d          rÈ fd|
D             }t;          j        t;          j        |t:          j                   t;          j        |t:          j                             }t/          t1          |!                                          t1          |          dt2          j"                  } j	                            |	|           g }d}tG          | j$         j%                  D ]C\  }}} j&        r|d	k    r|d
k    rtO          tP          j)                             }tU          |          D ]}|||z            } +                    ||||z   d
          5  tY          tG          |g j$         j%                            \  }}} |j-        di |}|                    |           d d d            n# 1 swxY w Y   |} n= +                    ||||          5   |j-        di |} d d d            n# 1 swxY w Y   t]          | |           |                    |            ||z  }Et_          ||          D ]E\  }!}"|" j0        |!<   tb          2                    d|!            3                     j0        |!           F|S )Nr   T)
is_prefillr  get_num_mm_connector_tokensc                 D    g | ]}j                             |          S r   )r3  r@  )r   r  r   s     r   r   z6GPUModelRunner._execute_mm_encoder.<locals>.<listcomp>	  s9     " " "" J:::FF" " "r   r   )index_mappingprompt_mappingr?  r  r[  videor   expected_num_itemszFinish execute for mm hash %sr   )4r=  r   r-  enable_mm_processor_statsr7  r   r5   r3  r(  lora_managersupports_tower_connector_lorar  rq  r  r   request_lora_mappingget_num_mm_encoder_tokensget_num_embedsr  r  lora_id_to_lora_requestr  r  r+   rm  r,   TOWERset_active_adaptersr  r  rn  r  r~  r   	CONNECTORrD   r
  r  r5  r   r   r   r  timed_encoder_operationnextembed_multimodalr   r   r[  loggerdebugmaybe_save_ec_to_connector)#r   r   r9  r^  r:  should_timer3  prompt_lora_mappingtoken_lora_mappinglora_requestsencoder_token_countsr  r  r  lora_idr  r  tower_mappingpost_op_countsconnector_token_mappingconnector_mappingencoder_outputscurrent_item_idxmodality	num_itemsrc  curr_group_outputs_lst	video_idxvideo_mm_kwargs_itemrb  micro_batch_mm_inputsmicro_batch_outputscurr_group_outputsr,  r   s#   `                                  r   _execute_mm_encoderz"GPUModelRunner._execute_mm_encoder  s    .2-Q-Q.
 .
*	9l  	I% :)C: 9
 
 '44 6	 1 O O Q Q 6	 #%!#EEM#% $0 8 8 *:6Bd.CGLMM "ZAA+ 
 $**7333"))7)j*@AAA$++J777Q;;#'#3#K#O#OPW#X#XL#/%)),777 (()))**$*	  M 11-OOOtz#@AA " " " "&:" " "
 +-)H0AAAH^28<<<+ +' %0"'(?(F(F(H(H"I"I#()<#=#=#(2	% % %! !55!%  
 /14O;5
 5
 5
 >	* >	*0Hi" 2&S''MM)-el);)=)=&!&y!1!1 K KI+45E	5Q+R(55#\3Ci3OQR  K K 7;7!5 6'+{+/?  7 7313 /Ee.D / /3/ /+ /556IJJJK K K K K K K K K K K K K K K" &<"" 11/?  S S *@)?)R)R/)R)R&S S S S S S S S S S S S S S S
 ,"#,    ""#5666	)  #9o>> 	I 	IOGV*0Dw'LL8'BBB++D,>HHHHs%   &ANN
N*OO	O	r   shift_computed_tokensc                 p   |j         }d| j        z
  | _        | j        | j                 }t          t          j                             }|j        }d|d |<   d}d}d}	| j        j        D ]}
g }|j	        |
         }| j
        |
         }|j        |z   }|j        D ]}|j        }|j        }|j        }|||z   k    r n||z   |k    r-t!          ||z
  d          }t#          ||z
  |z   |          }||k     sJ |                    ||          \  }}||k    r~|j        }| j                            |d           }|J d| d            |j        x}|||         }|||         }n
|||         }||z   |z
  }|d|||z   ||z   <   n|||z   ||z   xx         |z  cc<   |                    |           | j        rd| j        r]|j        J d}| j                            |j        ||j        |j                  \  }}}|j                            |           ||_        |                     |           ||z  }|!                    |          }|r/| "                    |           | j        !                    |           |	r/| #                    |           | j$        !                    |           ||fS )Nr   Fr   zEncoder cache miss for .T)r  multimodal_embeddingsr  r  )%rs  r  r  r   r   r   r   rq  r  r  ri  r  r  r  offsetr  rn  r  get_embeds_indices_in_ranger8  r[  r  is_embedr  r5  rL  r  r3  recompute_mrope_positionsr  rz  rU  r  rx  r  r  r  )r   r   rl  rs  is_mm_embed_buf	mm_embedsis_mm_embedreq_start_idxshould_sync_mrope_positionsshould_sync_xdrope_positionsr  mm_embeds_reqr  r2  r  r  r  r  num_encoder_tokens	start_idxr>  curr_embeds_startcurr_embeds_endr,  encoder_outputrr  mm_embeds_itemreq_start_posnew_mrope_positions	new_deltas                                 r   _gather_mm_embeddingsz$GPUModelRunner._gather_mm_embeddingsh	  s   
 &6%P"  !4#77243GH&((	%)38///0&+#',$&. K	2 K	2F02M#3#H#P f-I"+"?BW"W'3 35 35
%1$O	%-_"  36J JJJE115HHH  3i ?CC	')36JJ&  !7****88GLL 3!?
 %77$/!%!3!7!7!F!F%113WW3W3W3W111 ( 11H>'	'(9:H%34Eo4U%VNN%3Ig4E%FN -	 9<O O#  	 9MG<S STT  %	1MG4KK  !"    $$^44441 ;do ; 0<<<.2+J88"+"<.;(1(A,5,I	 9   >2I )//0CDDD1:	.]+++11MM%112LMM& 	I&&'7888 ,,-GHHH' 	J''(8999!--.HIII+%%r   c                     t          | j        t          t          f          r| j                                        S | j        S r   )r  r3  r   r   unwrapr   s    r   r  zGPUModelRunner.get_model	  s8    dj#3]"CDD 	':$$&&&zr   c                    |                                  }t          t                               }t          |          r|                    d           t          |          r|j        rdgS |                    d           |S )Ngeneratetranscription)r  r   rH   r?   r  r;   supports_transcription_only)r   r3  supported_taskss      r   get_supported_generation_tasksz-GPUModelRunner.get_supported_generation_tasks	  s      ~.00#E** 	/"":...!%(( 	40 )'((""?333r   c                 D   |                                  }t          |          sg S t          |j                                                  }d|v rPt          | j        j        dd          }|dk    r/|                    d           t          
                    d           |S )Nscore
num_labelsr   r   z.Score API is only enabled for num_labels == 1.)r  r>   r   r  get_supported_tasksr  r%  	hf_configremoverT  
debug_once)r   r3  r  r  s       r   get_supported_pooling_tasksz*GPUModelRunner.get_supported_pooling_tasks	  s      && 	Iu|??AABBo%% !2!<lANNJQ&&w///!!"RSSSr   .c                 .   t          t                               }| j        j        dk    r'|                    |                                            | j        j        dk    r'|                    |                                            t          |          S )Nr  r  )r   rJ   r%  r2  r  r  r  rm  )r   taskss     r   r  z"GPUModelRunner.get_supported_tasks	  s|    ]#%%(J66LL<<>>???(I55LL99;;<<<U||r   r  	sync_selfc                   	 | j         J | j        j        j        	t	          | j                  |ra|J |                                D ]H\  }}|dk    o}|r	z  n}| j         |         d |                             |d |         d           It          	fd| j                                         D                       S )NresidualTr   c                 V    i | ]%\  }}||d k    rr|dz           n	|d         &S )r  Nr   )r   kvis_rsr  tps      r   
<dictcomp>zFGPUModelRunner.sync_and_slice_intermediate_tensors.<locals>.<dictcomp>
  s_        Aq 
??u? 'zR''(({
{^  r   )r  r	  r*  tensor_parallel_sizer   r  rz  rG   )
r   r  r  r  r  r  is_scatteredcopy_lenr  r  s
    `      @@r   #sync_and_slice_intermediate_tensorsz2GPUModelRunner.sync_and_slice_intermediate_tensors
  s"    (444-B,T-=zJJ  	'333,2244  1 J85/;K:++)!,YhY7==ixiLt >     #      !5;;==	  
 
 	
r   is_dummy
is_profilec                     | j         j        sdS | j        J |                                 }t	          |          sJ | j                            ||| j         j        j                   dS )zN
        Step for the EPLB (Expert Parallelism Load Balancing) state.
        N)	log_stats)r*  enable_eplbrV  r  r7   stepeplb_configlog_balancedness)r   r  r  r3  s       r   	eplb_stepzGPUModelRunner.eplb_step"
  s     #/ 	F***  $U+++++*6G 	 	
 	
 	
 	
 	
r   r   num_scheduled_tokens_npr  c                 J   | j         j        }|t          | j         j                  k    s
J d            |d |         }| j        j        d |         }| j                                         }|                    |||j                   t          t          | j                  }|                    ||          }	d t          ||j                  D             }
t          | j         j                                        | j         j                                        |          }|	t'          |
          sd g|z  |_        |S | j        rt-          ||	|
| j                  S t1          d |	          }	d t          |	|
          D             |_        |                                  |S )	NzEEither all or none of the requests in a batch must be pooling requestr  r   pooling_metadatac                      g | ]\  }}||k    S r   r   )r   seq_len
prompt_lens      r   r   z(GPUModelRunner._pool.<locals>.<listcomp>K
  s1     
 
 
# z!
 
 
r   )r  r  r  )r   r   r   r   c                 8    | d n|                      dd          S r   r   r   s    r   r   z&GPUModelRunner._pool.<locals>.<lambda>c
  s    aiddQTT%dT-K-K r   c                      g | ]\  }}|r|nd S r   r   r   s      r   r   z(GPUModelRunner._pool.<locals>.<listcomp>f
  s5     -
 -
 -
W $CC-
 -
 -
r   )rq  r  r>  r  r  r   get_pooling_metadatabuild_pooling_cursorr
  r   r=   r3  r  r   prompt_lensrt   r  r   r  anyr   rT  r   r   rL   r  )r   r   r  r  r  r  seq_lens_cpur  r3  r   r   r   s               r   _poolzGPUModelRunner._pool2
  s    #,3t/>?????S @?? &&;';&;<}((3+@@BB--#\-:N 	. 	
 	
 	
 ($*55*/,,':J +7 +
 +

 
'*<9I9U'V'V
 
 

 0$,1133 ,<AACC 3
 
 
 $C,>,>$150A-&&$ 	3$7"3+)-)F	    ,KK
 
-
 -
 #$5} E E-
 -
 -
) 	""r   c                 v    | j         j        j        }| j        j        j        r|dk    rt          ||          S |S Nr   )r	  r*  r  r'  pass_config	enable_sprN   )r   r  tp_sizes      r   _pad_for_sequence_parallelismz,GPUModelRunner._pad_for_sequence_parallelismn
  sB     "2G".8 	;Wq[[0':::##r   c                 x    | j         j        r| j        j        d |         }nd }| j        j        d |         }||fS r   )r3  requires_raw_input_tokensr  r  r  )r   r  r  r  s       r   _prepare_mm_inputsz!GPUModelRunner._prepare_mm_inputsv
  sJ     :/ 	*;J;7III*.{
{;-''r   num_input_tokensc                    |j         }t                      j        }| j        j        }d }| j        r|r|s|                     || j                  5 }|                     |           | 	                    |          \  }}	d d d            n# 1 swxY w Y   | j
                            | j        j        d |         ||	          }
| j        j        d |                             |
           |                     |          \  }}i |                                 |                     |          }n| j        r|r| j        j        d |                             d                              d          }|                                dk    r<| j        j        |         }| j
                            |          }|| j        j        |<   | j        j        d |         }|                                 }d }n*| j        j        d |         }d }|                                 }| j        r| j        j        d d d |f         }n8| j        dk    r| j        j        d d d |f         }n| j        j        d |         }|rd }n|J |                     ||d          }|r3|j        r,|                     |          }|                    d	|i           ||||||fS )
Nr[  )ro  is_multimodalF)as_tupler   r   )r  Tra  ) rs  r#   is_first_rankr%  rP  rO  maybe_get_ec_connector_outputr[  rk  r  r3  embed_input_idsr  r  r  rz  r  r  rd  r3  r  nonzerosqueezenumelrL  r  rM  r  r  r  r7  r]  )r   r   r  r  r  r  rP  r   ru  rv  inputs_embeds_scheduledr  r  r  token_ids_idx	token_idstokens_to_embedsr  ra  s                      r   _preprocesszGPUModelRunner._preprocess
  s     0J$4!.A #" <	5} <	5=O <	533 "0 4   V %(()9:::)-)C)CDT)U)U&	;V V V V V V V V V V V V V V V '+j&@&@"#8$8#89&/) 'A ' '# "#8$8#89??@WXXX'+'>'>?O'P'P$I}))++))*:;;LL & "	5= "	5 !%&;';&;<%((  ""$$q(( N.}=	#':#=#=	#=#R#R 8H"&}5 .23D4D3DEM2244LII *+<,<+<=I M2244L? 	>,04E5E4E1EFII!A%%-1!!!5F6F5F2FGII*+<,<+<=I 	#'  '333#'#K#K "6$ $   	F"2"K 	F #667GHHO!2O DEEE  
 	
s   .B

BBr   r   c                 0   | j         j        }| j                                          ||                     ||          S | j        r8| j        1|                                 \  }}| j                             |           |                     |d ||          }|S )Nr   sampling_metadata)	rq  r  update_async_output_token_idsrU  rT  r  _get_draft_token_ids_cpuupdate_async_spec_token_idsrd  )r   r   r   r  r  rb  sampler_outputs          r   _samplezGPUModelRunner._sample
  s     !,> 	66888'<<"3      $ 	N)B)N%)%B%B%D%D"889LMMM// 	
 
 r   r  c                    i }t           j        r|                     |          }| j        j        }t          j        | j        j        d |                   d         }	|	D ]Z}
| j        j        	                    t          |
                    }|*|                    |                                dz
             [| j        j                                        }| j        j                                        }|j        j        d         }|j        }|j        }g }d }| j        s|j        d         }|dk    rX|                     |          }|	D ])}
|t          |
                                                    *||                                }nt/          j        || j        j        |	|          \  }}ng }|	                                }t7          |          | j        j        |j        d         dk    sJ || j        _        fdt;          | j        j                  D             | j        _        | j        j        }t?          |          D ]}| j        r
|vrdgnd }n||         }|rtA          |          nd}|s1| j        j!        |         }||z   }|| j"        k    sJ d| d| j"                     || j        j#        |||f<   d	| j        j$        |||f<   || j        j!        |<   ||         }| j%        |         }|j&        '                    |           | (                    |d |         |j)                  }|||||||fS )
Nr      r   r   r   c                 $    i | ]\  }}|v	||S r   r   )r   r   r  invalid_req_indices_sets      r   r  z4GPUModelRunner._bookkeeping_sync.<locals>.<dictcomp>[  s5     5 5 5Av333 333r   zGSampled token IDs exceed the max model length. Total number of tokens: z > max_model_len: T)*envsVLLM_COMPUTE_NANS_IN_LOGITS_get_nans_in_logitsrq  r  r  r  r  
generatorsr  r   
set_offset
get_offsetr  r   r  r   r   r   rT  _to_listr   r   r~   r   r   r   r  rw  r  r  r  r>  r"  r  r'  r  ri  r  r  _get_prompt_logprobs_dictr  )r   r   r  r   r   r  r   num_nans_in_logitsr  "discard_sampled_tokens_req_indicesr   genreq_ids_output_copyreq_id_to_index_output_copyr  r   r   r   r   r   r   r  r  sampled_idsnum_sampled_idsr|  r>  r  r2  prompt_logprobs_dictr  s                                 @r   _bookkeeping_syncz GPUModelRunner._bookkeeping_sync  s   "  + 	B!%!9!9&!A!A#,-/Z%((3.
 .

.* 4 	5 	5A"-11#a&&99Cs~~//!3444 #.6;;==&*&6&F&K&K&M&M#+=CAF*<): ( $	+1"5Ka*.--8I*J*J'; < <A+CFF399;;;;#/%5%=%=%?%?N ;K:W%$/6%5	; ; ;7' ')#"D"K"K"M"M&)*=&>&># 6>(.r2a7777:K 75 5 5 5!*4+;+C!D!D5 5 5D1 "*/00 	; 	;G( ?&-5L&L&LrddRV5g>7B#I3{#3#3#3O (;GDI/1Gd0000(+2( (%( ( 100 JUD*7Ig4E+EFHLD)'9W3D*DE;BD/8W%Ff-I&--k::::  $==///01 
  
 # '
 	
r   c              #      K   | j         d V  d S | j                                          	 d V  | j                                          d S # | j                                          w xY wr   )rr  r   r   r   s    r   synchronize_input_prepz%GPUModelRunner.synchronize_input_prep  sx      $,EEEF
 	!--///	/EEE%,,.....D%,,....s   A	 	A$r  r  r  r  c                 &     | j         d||||d|S )aM  Helper method to call the model forward pass.

        This method can be overridden by subclasses for model execution.
        Motivation: We can inspect only this method versus
        the whole execute_model, which has additional logic.

        Args:
            input_ids: Input token IDs
            positions: Token positions
            intermediate_tensors: Tensors from previous pipeline stages
            inputs_embeds: Input embeddings (alternative to input_ids)
            **model_kwargs: Additional model arguments

        Returns:
            Model output tensor
        r  r  r  r  r   )r3  )r   r  r  r  r  r  s         r   _model_forwardzGPUModelRunner._model_forward  s:    0 tz 
!5'	
 

 
 
 	
r   max_num_scheduled_tokensr  force_uniform_decodec                 (    || |k    o|| |z  k    n|S )zn
        Checks if it's a decode batch with same amount scheduled tokens
        across all requests.
        r   r  r  r  r  r  s        r   _is_uniform_decodez!GPUModelRunner._is_uniform_decode  s4    " $+ *-EE H#;h#FF &	
r   r  allow_microbatchingforce_eagerforce_has_loranum_encoder_reqsc           
                                | j        |||           j        j        o|
dk    }|	t	           j        j                  dk    n|	                     |           fd} ||p|          \  }}|j         j	        j
        j        r'|j         j        j        j        z  dk    s
J d            d\  }} j        j        j        dk    r j	        j        t"          j        k    }t'          | j        ||||j                  \  }}}|c j        j        }t-          ||                                                    ||t"          j        j        k              \  }}|j        k    sJ d } j        j        j        r-t7          ||j        |j        |z
  t9          |          	          }|||||fS )
Nr  r   c                 z    sj                             | |          nt          j        t	                    fS )N)r  has_lorauniform_decoder1  )r  r2  r   ru  r(   )r  r1  r  r  r  r   r  s     r   r   zGGPUModelRunner._determine_batch_execution_and_padding.<locals>.<lambda>   sU     .JT-F-O-O%!-)	 .P . . .  $o6G&H&HI r   zQSequence parallelism requires num_tokens to be a multiple of tensor parallel size)FNr   )num_tokens_unpaddedr*  r  allow_dp_paddingr  r   num_scheduled_tokens_per_requestrt  r0  )num_unpadded_tokensnum_padded_tokensnum_paddingsruntime_mode)r  r  r%  rP  r>  rq  rM  r  r  r'  r  r  r	  r*  r  data_parallel_sizert  r   ru  r   valuedata_parallel_rankr   ry  	PIECEWISEr-  cudagraph_metricsr   r  )r   r  r  r  r  r  r  r  r  r  r  has_encoder_outputdispatch_cudagraphrt  batch_descriptorshould_ubatchnum_tokens_across_dpr  synced_cudagraph_modedp_rankr  r  r  r  s   `      `             @@@r   &_determine_batch_execution_and_paddingz5GPUModelRunner._determine_batch_execution_and_padding  s   * 00%=%)%B!!5 1 
 
 0I5E5I 	 %  899A== 	 !>>zJJJ J J J J J J J 	 ,>+=/E3E,
 ,
(( -7".8 	 +"2GH  5	   /:+++>BB '6-:LL 
 +(2$($8(;%5&7#15L#1#7	 	 	 GM/1F $/.A$'(<W(E(J(J(L(L$M$M!3E3E%!6-:Q:W!W4 4 40 0 (26GGGGG0B 	+$."2"=-8:E 00	  O  
 	
r   c                    | j         j        j        r| j        s| j        j        t          j        k    rt          	                    d           | j         j        j
        t          j        k    rt          	                    d           dS t                      }|                    | j        | j        j        j                   d| _        dS dS dS )z
        Register layerwise NVTX hooks if --enable-layerwise-nvtx-tracing is enabled
        to trace detailed information of each layer or module in the model.
        zlayerwise NVTX tracing is not supported when CUDA graph is turned off; you may observe part or all of the model missing NVTX markerszylayerwise NVTX tracing is not supported when CompilationMode is STOCK_TORCH_COMPILE, skipping function hooks registrationTN)r	  r-  enable_layerwise_nvtx_tracingr  r'  rt  r   ru  rT  r  moder   STOCK_TORCH_COMPILErQ   register_hooksr3  	__class__r   )r   	pyt_hookss     r   _register_layerwise_nvtx_hooksz-GPUModelRunner._register_layerwise_nvtx_hooksO  s     1O	<8	< &59KKK!!+    38"67 7 !!2     %JJ	((TZ5I5RSSS7;4449	< 	< 	< 	<r   r  zUBatchSlices | Nonec                     t           d          r$ j        t           j        j                  dk    sdS dt          f fdfdt           j        j                  D             }i }t           j        j                  D ]\  }}||         }	|j        D ]}
|	||
<   |Lg }|D ]C}i }|                                D ]\  }
}	|	|j                 ||
<   |	                    |           D||fS ||fS )a2  
        Build slot mappings in both formats needed by the system.

        Args:
            num_tokens_padded: Total number of tokens (padded)
            num_reqs_padded: Total number of requests (padded)
            num_tokens_unpadded: Actual number of tokens (unpadded)
            ubatch_slices: Optional ubatch slicing info for DBO

        Returns:
            A tuple of:
            - slot_mappings_by_gid: dict[int, torch.Tensor] for attention metadata
            - slot_mappings_by_layer: dict[str, torch.Tensor] or list for ForwardContext
        r  Nr   r  r  c                 B   J j         j        |          j        }t          |t                    r(t          j        ft
          j        j                  }n&j	        j
        |          }|j        j        d          }|                             d           |S )Nr$  r   )r  r  r  r  re   r   r  r  r
  rq  r%  r  r  r  )r  r  r  r  r  r  r  r   s       r   _get_slot_mappingz<GPUModelRunner._get_slot_mappings.<locals>._get_slot_mapping  s    ".3D3P3PP 0@  -)ABB N${&(+;      !,8F	(59:L;L:LM ,->>?EEbIIIr   c                 .    i | ]\  }}| |          S r   r   )r   gidrb  r"  s      r   r  z5GPUModelRunner._get_slot_mappings.<locals>.<dictcomp>  s;      
  
  
Q ""3'' 
  
  
r   )
r  r  r>  r  r   r  r  r  token_slicer  )r   r  r  r  r  slot_mappings_by_gidslot_mappings_by_layerr$  r  r  r  resultubatchsliced_mappingsr"  s   ````          @r   _get_slot_mappingsz!GPUModelRunner._get_slot_mappingss  s   2 D+,,	$0D(899A==:	 C 	  	  	  	  	  	  	  	  	 * 
  
  
  
#D$8$HII 
  
  

 ;=#,T-A-Q#R#R 	B 	BC/4L,8 B B
5A&z22B $46F' / /;=0F0L0L0N0N S S,J2>v?Q2ROJ//o....'//#%;;;r   c                      j         t          d           j        j        j        rDt          j                    }||                                 nt          	                    d           j
        r4t                      r&t                                          j
                   j        }t          d          5                                   5                                  t%                      rt'                      j        rp                      j                  5 }                                t1                    cd d d            cd d d            cd d d            S # 1 swxY w Y   |s j        j        dk    r% j        j        dk    r                     d           t                      st:          cd d d            cd d d            S                       j                  cd d d            cd d d            S  j        j         r j!        r
J d             j"        j#        } j"        j$        }fd|D             }tK          j&        |tJ          j'        	          }	tQ          |	)                                          }
j        } *                    |	          \  }}d } j+        r: j        j,        s. -                    |	 j"        j.        d |         j/                  } 0                    |||	|
|d utc          j2                  
          \  }}}}}t          3                    d||||           |j4        }|j#        |j#        n|}tk          ||	|| j        j6                  \  }}t          3                    d||           to           fdtq           j9        j:                  D                        }|tv          j<        k    } j        j=        dk    rUt}          j?         j9         j         j@         j"         jA         jB        jC         jD        E                                           tc          jF                  dk    }|r|n|} G                    |s|r|n||s|r|n|||          \  }} H                    ||r|nd ||r|nd |
|||jI        ||          \  }} J                    ||          \  } }!}"}}#}d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y    jK        rtv          jL        }d _K        tc          j2                  }$ j        jM        o|$dk    }%t          | j        |||||||%	  	        5  t          d          5   O                              5 }&  jP        d| |"||!d|#}'d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   t          d          5   jQ        r|'\  }(})n|'}(d }) jR        st                      jT        s3t          |(t                    sJ |&|(_W        |& _W        |(cd d d            S  jX        r$ Y                    |(||	|&          cd d d            S |(|         }* jD        Z                    |*          }+n jX        rJ |(|         }*t                      jT        sPdt           j        |           i},t                      \                    |(j]        t                      |,           d }+n jD        Z                    |*          }+i }-|+|+_                                |-d<   t                      `                    |-tc          t                      ja                  dz
            }.|.J |.d         }+d d d            n# 1 swxY w Y   t          |+|||(|*|)|||
  
         _         |& _W        d S )NzOState error: sample_tokens() must be called after execute_model() returns None.&RoutedExpertsCapturer not initialized.zgpu_model_runner: preprocessr  r  r   z--kv-sharing-fast-prefill produces incorrect logprobs for prompt tokens, tokens, please disable it when the requests need prompt logprobsc                 *    g | ]}j         |         S r   )r  )r   r   r   s     r   r   z0GPUModelRunner.execute_model.<locals>.<listcomp>   s"    PPP1&;A>PPPr   r   )r  r  r  r  r  r  zhRunning batch with cudagraph_mode: %s, batch_descriptor: %s, should_ubatch: %s, num_tokens_across_dp: %s+ubatch_slices: %s, ubatch_slices_padded: %sc              3      K   | ]E\  }}t          |j        t                    t          d  j        |         D                       V  FdS )c              3   .   K   | ]}|j         j        V  d S r   )backend forward_includes_kv_cache_update)r   gs     r   r  z9GPUModelRunner.execute_model.<locals>.<genexpr>.<genexpr>B  s;         I>     r   N)r  r  re   allrZ  )r   idspecr   s      r   r  z/GPUModelRunner.execute_model.<locals>.<genexpr>A  s       - -
 B!$"46NOO-  !-b1    - - - - - -r   rH  r   r  r  r  r  )r  r  r  r  r  r  r  r  r  r  r  F)r  r  cudagraph_runtime_moder  r  r  skip_compiledzgpu_model_runner: forwardr  zgpu_model_runner: postprocessr  )all_gather_groupall_gather_tensorsr   )rv  r   )cr  RuntimeErrorr	  r%  r  r.   get_instanceclear_bufferrT  errorpreempted_req_idsr    r   handle_preemptionsrs  r   r  rD  r   r   is_producerr  r[  rk  rw   r*  r=  r  
_dummy_runrm   kv_connector_no_forwardr&  r  rj  rq  r  r  r  r  r~  r   rn  r  rI  use_ubatchingr  r$  r  r  r>  r7  rU  r  r   num_ubatchesr5  r  r  r  r   FULLrN  r   preprocess_mambar  ri  r'  r  r3  rP  r  r+  r  r  r  r6  ru  rP  r)   maybe_get_kv_connector_outputr  r\  r@  r#   r]  r  rG   r  r>   r  compute_logitsr   send_tensor_dicttensorsr$   
contiguousbroadcast_tensor_dictr?  r   )/r   r   r  capturerr  r   r  r  tokensr  r  r  r  r   r  rt  r4  r  r  r  r  r  r  ubatch_slices_paddedhas_separate_kv_updatepad_attnr  ubatch_slices_attnslot_mappings_by_groupr  r  r   r  r  r  r  r  has_encoder_inputr  model_outputr   r   r   r   r<  model_output_broadcast_databroadcasteds/   ``                                             r   execute_modelzGPUModelRunner.execute_model  sx    #/6  
 (E 	G,9;;H#%%''''EFFF- 	2G2I2I 	!##66 2    0J*+IJJh	 h	''))h	 h	
  0111   T_%6%6%B T77$"&"4 8   T ),,-=>>>ABRSST T T T T T Th	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	T T T T T T T T ( X(E*+ +,?!CC OOA&&&,.. 54=h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	> 334DdFVWW?h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	B  8 3  @ 3 '0H&.GPPPPPPPF&(hvRX&F&F&F#'*+B+F+F+H+H'I'I$"2"M373G3G '4 40N0
 (,$( 1E1S +/+Q+Q+$<YhYG$=, ,( ;;.!(?)A!9!E!$%5%N!O!O <  $ LL>$   !+ 5'1':'F
##H  3M'!$13 3/M/ LL=$   *- - - - -
 !*$*>*N O O- - - * * &" &);;H 1W<<,$(%($M+BJ88::	 	 	 ""2"OPPSTTO9A!T!5!5}484K4K#)5#)"3"3('/W3IWOOx$72 5L 	5 	51"M ..2;C&M&7&7%7?$IOOT":"4#1$3)9)N-E"8 /   <M;.    "35I $#Kh	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	 h	Z # 	-*/N',D$ /HII0I5E5I 	   ,%9'5!+2*/
 
 
	 	 ++FGG	 	 ../?@@	 EX.4. ##%9+	 
  L	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	. ,,KLL 9	/ 9	/0 )3?000 !-$(!+ 0/#~~2 )%m5HIIIII8KM5/BD,(!9	/ 9	/ 9	/ 9	/ 9	/ 9	/ 9	/ 9	/$ ( ::%,/+	 )9	/ 9	/ 9	/ 9	/ 9	/ 9	/ 9	/ 9	/6 (5^'D$223GHH  0000'4^'D$#~~2 M"(D ,.?) ) %*&
 !NN33%-)5+= 4   
 "FF!Z667KLLF>@+%<B<M<M<O<O/9*nnBB/S9M5N5NQR5R C   #...$X.s9	/ 9	/ 9	/ 9	/ 9	/ 9	/ 9	/ 9	/ 9	/ 9	/ 9	/ 9	/ 9	/ 9	/ 9	/v $5 , $
 $
  $7 ts   ?S>AS''$E0S'S>0E44S'7E48AS'S>S'9S>K	S'S>'S+	+S>.S+	/S>>TT&W6WV+W+V//W2V/3W6WW	W	W	
WW W3A^%^%=D^%%^),^)grammar_outputzGrammarOutput | Nonec                      j         }d  _          j        <|sd S |                                rt          S t	          t                    }||_         |S  j        \
  }}}d  _        |t          | j        |           t          d          5                       |          }d d d            n# 1 swxY w Y    	                    |j
                   d  _        d  _        d  j        _         fd} j        }	d}
|	Sd uoj         j        z    j        k    }|	                                p|	                                o|	j         }|rt+           j        t.          t0          z            sJ |j
        }|r ||           nˉ j        J  j                            | j         j         j        j                  \  }}                     ||           t?          j         d j!        t>          j"                  #                    tI           j        j%                   j                   _         &                    d           n|}
t          d          5   '                    ||j(                  \  }}}}}}}d d d            n# 1 swxY w Y   |
r ||           t          d	          5   )                                 d d d            n# 1 swxY w Y   t          d
          5   j*        j+        rKtY          j-                    }||.                     j/                   nt`          1                    d           te          |||||| j3        r|nd ||	  	        }d d d            n# 1 swxY w Y    j4        s|S t          d          5  tk          ||j
        |j6        | j7         j        j8                  }d d d            n# 1 swxY w Y   t          d          5   j        9                    |j:        |j;                   d d d            n# 1 swxY w Y   |S )Nzgpu_model_runner: samplec                     J t          d          5                      | j        j        	  	        _                                       d d d            d S # 1 swxY w Y   d S )Nzgpu_model_runner: draft)r   propose_draft_token_idsrq  r  r  _copy_draft_token_ids_to_cpu)	r   r   r   r   r   r   r  r   r   s	    r   r_  z=GPUModelRunner.sample_tokens.<locals>.propose_draft_token_ids(  s    3???/0IJJ D D(,(D(D$%$6!(%(4!
) 
)% 112BCCCD D D D D D D D D D D D D D D D D Ds   AA$$A(+A(Fr   r
  r!  T)
zeros_onlyzgpu_model_runner: bookkeepzgpu_model_runner: eplbz#gpu_model_runner: ModelRunnerOutput)indicesr-  )	r  r  r   r   r  r  r   r  r  z+gpu_model_runner: AsyncGPUModelRunnerOutput)r   r   r   r   r   r   z-gpu_model_runner: set_async_sampled_token_ids)<r  r  is_emptyrm   r   r   rq  r   r  rQ  r   r  r  rw  r,  r  re  rh  ra  r`  disable_padded_drafter_batchr  r_  r   r   r  prepare_next_token_ids_paddedri  r  r  _copy_valid_sampled_token_countr   r  r
  r~  expandr>  r  r`  r  rs  r  r%  r  r.   r>  save_captured_expertsr  rT  r@  rt   rO  rT  r   r   r   r   set_async_sampled_token_idsr   r   ) r   r\  r  r   r   r   r  r  r_  spec_config propose_drafts_after_bookkeepinginput_fits_in_drafteruse_gpu_toksr   next_token_idsvalid_sampled_tokens_countr  r   r   r  r  r  r   rP  async_outputr   r   r   r   r  r   r   s    `                        @@@@@@@r   sample_tokenszGPUModelRunner.sample_tokens  s    #6#' #+& t #++-- 100344F)<F&M $	
 ,  $(  %! .$2BF   ,,FGG 	H 	H!\\&2FGGN	H 	H 	H 	H 	H 	H 	H 	H 	H 	H 	H 	H 	H 	H 	H 	//,.>	
 	
 	
 !%$(!26/	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D  -+0("$DD$P %0<t?SS78 "
 %%''I;+G+G+I+I?!>>   I "$,@R0RSSSSS$2$D!( Y++,=>>>>9E;GGGBB<- M , 59  ?N$> 88&(B  
 -2K$+U[- - -fS!1!9::D<PQQ ) 556FSW5XXX3H0+,HII 	 	 &&  ;$ "'$#+#	 	 	 	 	 	 	 	 	 	 	 	 	 	 	$ , 	= $#$;<<<+,DEE 	 	NN	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ,,QRR 	 	 = K0=??'224;L2MMMMLL!IJJJ&+ ;"9'%9$7*%$7$7#5 /  F	 	 	 	 	 	 	 	 	 	 	 	 	 	 	, ( 	M+9
 
 
	 
	 5$*"0"B!/!@$7)-)F+6  L
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 ,;
 
 	 	
 8823  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 sm   B>>CC8(J,,J03J0K44K8;K8A9NNN6/O11O58O5&P==QQc                 p    | j         r| j        sd S |                                 \  }}t          ||          S r   )re  r  r  ro   )r   r  r  s      r   take_draft_token_idsz#GPUModelRunner.take_draft_token_ids  sB    # 	4+D 	4#'#@#@#B#B Wo666r   rb  c                    | j         r|j        s| j        j        j        sd S | j        j                                        | _        | j        }t          j
        |          sd S | j        J | j        J | j        J t          j                                        }|j        d         }t          j                            | j                  5  |s?| j                            |           | j        d |                             |d           nd| j        d |<   | j                                         d d d            d S # 1 swxY w Y   d S Nr   Tr   )rT  has_structured_output_requestsrq  r  r  r  r   r  r  r   	is_tensorr  r  r  r   r   r   r   r   rz  r   )r   r   rb  r  r   r  s         r   r`  z+GPUModelRunner._copy_draft_token_ids_to_cpu  s   
 $ 	;	1B	 F$($4$<$A$A$C$C!(,(=// 	F)555/;;;'3332244"(+Zt?@@ 
	0 
	0 80<<^LLL((399#$ :    
 78((3&--///
	0 
	0 
	0 
	0 
	0 
	0 
	0 
	0 
	0 
	0 
	0 
	0 
	0 
	0 
	0 
	0 
	0 
	0s   A'E  EEc                 ,   t          | j        t                    r| j        | j        j        fS | j        }|g g fS | j        J | j        J | j                                         | j        d t          |                   
                                |fS r   )r  r  r   rq  r  r  r  r  r   r>  r   )r   r  s     r   r  z'GPUModelRunner._get_draft_token_ids_cpu  s    d+T22 	C($*:*BBB+?r6M)555'333"..000'#g,,7>>@@'IIr   ro  rp  c                    | j         d S t          j                                        }t          j                            | j                  5  | j                            |           |}| j        }|J |d |j        d                  	                    |d           | j         
                                 d d d            n# 1 swxY w Y   |                    d          | j        _        d S )Nr   Tr   r   )r  r   r   r   r   r  r   r  r   rz  r   	unsqueezerq  rw  )r   ro  rp  r   counts
counts_cpus         r   rg  z.GPUModelRunner._copy_valid_sampled_token_count  s&    /7F2244 ZtIJJ 	: 	:6BB>RRR/F;J)))(a()//T/JJJ077999	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 3A2J2J12M2M///s   A+CC
Cc                     | j         j        }| j        }||g S | j        }|J |                                 |d |j        d                                                  S r  )rq  rw  r  r  r   r   r   )r   rw  sampled_count_eventr}  s       r   r  z-GPUModelRunner._get_valid_sampled_token_count  st    !%!1!H"B&*@*HI7
%%%'')));28;;<CCEEEr   r   r  r   r   r  c
                 r	   |j         | j        }
|
J |
j        dk    rgt          |t                    sJ t          | j        t                    sJ | j                            || j        j	        | j        j
        |	          }n/|
j        dk    rWt          |t                    sJ t          | j        t                    sJ | j                            | j        ||	          }n|
j        dk    rt          |t                    sJ t          | j        t                    sJ |j        d         t          |          k    r|}n~g }d}|
J d            t          |j        |          D ]5\  }}|                    |t          |          z   dz
             ||dz   z  }6t%          j        || j                  }||         }| j                            |||		          }n|
                                s|
                                rt          | j        t.          t0          z            sJ |
j        rLt          |t                    s
J d
            | j                            || j        | j        |j                  }not          |t$          j                  s
J d            | j                            ||| j        | j        | j        j                   \  }}| !                    ||           d }|fd }| j"        j         d          }| #                              }| j$        r(|J t%          j%        fd|D             d          }n|d          }n|
j        rd }| j        &                    |||j                  \  }| j"        j                  }| #                              }| j$        r'|J t%          j%        fd|D             d          }n|         }n| j        '                    |||          \  }}}|j(        | j"        j         d          }| #                              }| j$        r'|J t%          j%        fd|D             d          }n
|d          }| j)        r| *                    |d          }nd }| j                            ||||||||||	
  
        }|S )Nr  )r  r  r  r   z"No spec decode metadata for medusar   r  )target_hidden_statesr  r  zGsampled_token_ids should be a python list whenpadded-batch is disabled.zGsampled_token_ids should be a torch.Tensor whenpadded-batch is enabled.c                 $    g | ]}|d          S r   r   )r   hr  s     r   r   z:GPUModelRunner.propose_draft_token_ids.<locals>.<listcomp>c  s%    MMMa0001MMMr   r   rF  c                      g | ]
}|         S r   r   )r   r  r  s     r   r   z:GPUModelRunner.propose_draft_token_ids.<locals>.<listcomp>t  s    III!Q}-IIIr   c                 $    g | ]}|d          S r   r   )r   r  rp  s     r   r   z:GPUModelRunner.propose_draft_token_ids.<locals>.<listcomp>  s%    MMMaQ0 001MMMr   )rl  )
target_token_idstarget_positionsr  ro  last_token_indicesr  r  mm_embed_inputsnum_rejected_tokens_gpur  )+rs  r,  r^  r  r   r_  r   proposerq  r"  r'  r   r   r   r>  r   r  r  r   r{  r
  ra  r`  r   r   re  prepare_next_token_ids_cpuri  r  r   rf  r  r  rg  r  r  r\  rJ  prepare_inputsprepare_inputs_paddedr  rO  r  )r   r   r   r  r   r   r   r   r  r  rk  r  rc  rp  	num_draftrQ  ro  rp  r  token_indices_to_sampler  r  r  r  r  r  rp  s                           @@@r   r_  z&GPUModelRunner.propose_draft_token_ids  s     0J-&&&((/66666dlM:::::"l22! 3 .+	 3  OO 8++/66666dl,BCCCCC"l22 "3= 3  OO 8++/66666dlN;;;;;#)!,4E0F0FFF 4+778 877 *-(9;L* * , ,%Iv NN6CKK#7!#;<<<i!m+FF,wt{CCC 4W ="l22%2"3+ 3  OO
 ""$$ n	(D(D(F(F n	dlM<N,NOOOOO7 " ""3T::  0 : "&!H!H%M$$9	" " ""3U\BB  / B
 L>>,)(15  ; : 44"$>   '+##+*.'#'>#56K7K6K#L #'#6#67K#L#L 4 P,888+09MMMM;LMMMSU, , ,(( ,99N:N9N+O((; $P.2+:>,:U:U,),=; ;7(-
 (,~'9-'H$'+':':='I'I$8 L0<<</4yIIII7HIIIr0 0 0,, 0=]/K,, ::,,2 	,// (<'M$'+~'9:K;K:K'L$'+':':;K'L'L$8 P0<<</4yMMMM;LMMMSU0 0 0,, 0==N>N=N/O,& '"&"<"<$*+ #= # #
 #'"l22!1!1%9-#:"3%9 /(?+ 3  O r   	overridesc                     ddh}|                                 D ]J\  }}||v sJ d| d|             t          | |          }t          ||          }t          | ||           Kd S )Nr)  r%  zConfig `z"` not supported. Allowed configs: )r  r  r   setattr)r   r  allowed_config_namesconfig_nameconfig_overridesconfig
new_configs          r   r   zGPUModelRunner.update_config  s     -~>-6__->-> 	3 	3)K)"6666;; ; ;$8; ; 766 T;//F&v/?@@JD+z2222	3 	3r   eep_scale_upc                 @   t                               d| j        j        d           |rt	          j        | j                  nd\  }}}| j        j        r!t	          | j        | j                  | _	        d}	 t                      5 }t          j                    }t          | j                  }|                    | j        | j                  | _        | j        r+|                     | j        | j        | j                  | _        t'          | d          r!t                               d           | j                            | j                   t'          | j        d	          rt+          | j        j                  r| j        j        r| j        j        }	|	J |	j        J t                               d|	j        j                   |r||         nd
}
|r||         nd
}| j	        t	          | j        | j                  | _	        | j	                            | j        j        |	j        |
||           |dz  }| j        rt5          |                                           st9          d          |                                 }|rt                               d|           n| j                                        }| j                             |           t          j                    }d
d
d
           n# 1 swxY w Y   |j!        | _"        nB# tF          j$        j%        $ r+}d}| d| d}t           &                    |           |d
}~ww xY wt                               dtO          | j"                  ||z
  d           tQ          | j                   tS          | dd
          x}r"tS          |d	d
          x}rtQ          |           | j        j*        }tW          |                                           o|d
uo|,                                | _,        t+          | j                  r| j        j        rt                               d| j        j                   |r||         nd
}
|r||         nd
}| j	        J | j	                            | j        | j        |
||           | j	        j-        r| j	        .                    |           | j        j/        j0        tb          j2        k    rW| j        j/        3                    | j                  }th          xj5        dz  c_5        | j        6                    d|           d
S | j/        j7        }|J |8                                r9| j        j9        s-tu          | j        | j        tv          j<                  | _        d
S | j        j9        rx|8                                r2t{          | j        | j        tv          j<        | j                  | _        d
S t{          | j        | j        tv          j>        | j                  | _        d
S d
S )z_
        Args:
            eep_scale_up: the model loading is for elastic EP scale up.
        zStarting to load model %s...globalscope)NNNr   )r	  r%  r_  zLoading drafter model...r3  Nz%EPLB is enabled for drafter model %s.r   zRModel does not support EAGLE3 interface but aux_hidden_state_outputs was requestedz2Using auxiliary layers from speculative config: %szFailed to load model - not enough GPU memory. Try lowering --gpu-memory-utilization to free memory for weights, increasing --tensor-parallel-size, or using --quantization. See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ for more tips.z (original error: )z1Model loading took %s GiB memory and %.6f secondslocalzEPLB is enabled for model %s.)rank_mappingT)	fullgraphr2  )r
  )?rT  	info_oncer%  r3  r   get_eep_stater*  r  r
  rV  rO   timeperf_counterr2   r)  
load_modelr	  r(  load_lora_modelr  r_  r7   r,  rg  	add_modelr\  r8   r  r=  "_get_eagle3_aux_layers_from_configinfo"get_eagle3_aux_hidden_state_layersset_aux_hidden_state_layersconsumed_memorymodel_memory_usager   r   OutOfMemoryErrorr@  rP   r'   r  multimodal_configr:   r5  is_asyncstart_async_loopr'  r  r   r  init_backendr   stock_torch_compile_countcompilert  has_full_cudagraphsrF  r   r   rH  r   ru  )r   r  global_expert_loads#old_global_expert_indices_per_modelr  eplb_modelsmtime_before_loadmodel_loaderrk  global_expert_loadold_global_expert_indices
aux_layerstime_after_loademsgcombined_msgr_  drafter_model	mm_configr2  rt  s                         r   r  zGPUModelRunner.load_model  s
   
 	*# 	 	
 	
 	
 $I#D$8999# 	O@, + 	'(<dkJJDOKQ	%'' D61#'#4#6#6 /0@AA)44 $ 0t?P 5  
 # !%!5!5
D$4dk" "DJ 4++ %)$$%?@@@L++DJ777g66")1$,2DEE") !0<")
 '+&6&I*666*=III((C':@    3&/<<!% +  C&?LL!% 2
  ?2.7 $ 4dk/ /DO 11 L.':.5(   $q(4 G*4>>+;+;<< *E   "&!H!H!J!JJ! UP&   
 &*Z%R%R%T%T
J:::FFF"&"3"5"5ID6 D6 D6 D6 D6 D6 D6 D6 D6 D6 D6 D6 D6 D6 D6J '(&7D##z* 
	 
	 
	!  "99Q999LLL&&&G
	 	?t.//..	 	 	
 	
 	
 	/tz:::tY555G 	B$Wgt<<<M	B 3=AAA%7	'(8(899 :%:7799 	* !,, 	L1E1Q 	L<d>O>UVVV4GQ#K00T 
 73K@@ &
 ?...O%%
!")   ' L00l0KKK /423 3 &9FFtGWXXG99Q>99Jw???F
 0?)))..00	(6	 *
D,=;M  DJJJ !/ 	1133 *J 0-2Ddk 


 +J 0-2Ddk 


	 	s=   7L
 I!K2&L
 2K66L
 9K6:L
 
M	&MM	c                     | j         r| j         j        sdS | j         j        j        }t          |d          sdS |j        }|r+t          |t          t          f          rt          |          S dS )am  Extract Eagle3 auxiliary layer indices from speculative config.

        These indices specify which hidden states from the base model should
        be used as auxiliary inputs for the Eagle3 drafter model during
        speculative decoding.

        Returns:
            Tuple of layer indices if found in draft model config,
            None otherwise.
        N eagle_aux_hidden_state_layer_ids)r,  rg  r  r  r  r  r   rm  )r   r  	layer_idss      r   r  z1GPUModelRunner._get_eagle3_aux_layers_from_config\  s     ' 	D,C,V 	4+>H	y"DEE 	4>	 	$Ie}== 	$###tr   c                     t          | dd           
J d            t          | j                  }t                              d           |                    |                                 | j                   d S )Nr3  z-Cannot reload weights before model is loaded.zReloading weights inplace...)r%  )r  r2   r)  rT  r  load_weightsr  r%  )r   r  s     r   reload_weightszGPUModelRunner.reload_weightst  sv    tWd++77; 877 ((8992333!!$.."2"2AR!SSSSSr   tensorizer_configr   c                 b    t          j        |                                 || j                   d S )N)r  r%  )r1   
save_modelr  r%  )r   r  s     r   save_tensorized_modelz$GPUModelRunner.save_tensorized_model|  s>     	#NN/*	
 	
 	
 	
 	
 	
r   c                    | j         }|si S | j        j        }i }g }|                                D ]\  }}|                    |          }	|	| j        |         }
|
j        3t          |
j                  }t          j	        |
j                  
                    | j        d          }|                    |          }|s t          j        |dz
  |dz             }|||<   |
j        }|dz   }||z
  }|	|k    r|	}n|}|                    |           |||<   |dk    r| j        j        |         }| j        j        |                                         }||||z            }| j                            |          }||||z            }| j                            |          }| j                            |||          \  }}}t3          |||z             }|j        |                             |d           |j        |                             |d           |j        |                             |d           |D ]}||= ||= 	|r|                                  |S )NTr   r   r   )rj  rq  in_progress_prompt_logprobs_cpur  r  ri  r  r>  r   r{  r   r
  rs   	empty_cpur  r  r  r  r  ry  r3  rK  rU  compute_logprobsgather_logprobsslicelogprob_token_idsrz  r   selected_token_ranksr  )r   r   r  num_prompt_logprobs_dictin_progress_dictr  completed_prefill_reqsr  rj  r  rC  r!  r  r   r|  	start_toknum_remaining_tokensr3  r  rp  prompt_hidden_statesr   tgt_token_idsr   r  r?  chunk_slices                              r   r  z(GPUModelRunner._get_prompt_logprobs_dict  s   
 $(#; ' 	I+KBD "$+C+I+I+K+K L	 L	'F'-11&99J! mF+G'/ #G$< = =$|G,DEEHH$  I    
  033F;;# < $3#<%)+>+B$ $  ,< (  3I!AI#4y#@ 111
 (

 2
&--f555/?$V,Q 
 &6v>G),W5::<<F#0&::M1M#N Z../CDDF
 -YZ9O-OPM |44V<<H)-)E)E-}* *&Ix
  	9z+ABBK.{;AA B    %k288PT8UUU1+>DDD E     - 	) 	)F(0 ((   	 ##r   c                    	 |d | j         j        D             S i }|                                                    d                                                                          }| j         j        D ]A}| j         j        |         }|&||j        d         k     rt          ||                   nd||<   B|S # t          $ r i cY S w xY w)Nc                     i | ]}|d S r   r   )r   r  s     r   r  z6GPUModelRunner._get_nans_in_logits.<locals>.<dictcomp>  s    IIIfIIIr   r   rF  r   )
rq  r  isnanr  r   r"  r  r   r   
IndexError)r   r   r  num_nans_for_indexr  r9  s         r   r  z"GPUModelRunner._get_nans_in_logits  s    	~II0@0HIIII!#!'!3!3!3!;!;!?!?!A!A!G!G!I!I*2   ,<VD	 *5)flSTo:U:U *95666 #6**
 &% 	 	 	III	s   B8 BB8 8CCc              #      K    j         j        j        }t          j        o|dk    }|sdV  dS |t
          j        dt          j        f fd            }t          
                    d           |                     |            d|                    d                   d           dV  |                    d           dS t
          j        dt          j        f fd	            }|J t          
                    d
           |                     |            d|                    d                   d           dV  |                    d           dS )z
        Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set.
        This is to help balance expert-selection
         - during profile_run
         - during DP rank dummy run
        r   Nr   c                  t    t          j         j        j        d j                                                  S )Nr   )lowhigh)r   randint_liker  r  r%  ro  r   s   r   rand_input_idsz=GPUModelRunner.maybe_randomize_inputs.<locals>.rand_input_ids  s:    )N&*99;;   r   z'Randomizing dummy input_ids for DP Rankr   Tr   c                  @    t          j         j        j                  S r   )r   
randn_liker  r  r   s   r   rand_inputs_embedszAGPUModelRunner.maybe_randomize_inputs.<locals>.rand_inputs_embeds#  s!    '&*  r   z+Randomizing dummy inputs_embeds for DP Rank)r	  r*  r  r  VLLM_RANDOMIZE_DP_DUMMY_INPUTS	functoolscacher   r   rT  r  rz  r  r  )r   r  r  dp_sizerandomize_inputsr  r  s   `      r   maybe_randomize_inputsz%GPUModelRunner.maybe_randomize_inputs  s      "2E>N7Q; 	#EEEEE"_EL      _ GHHHOONN,,-@y~~a/@/@-@APTOUUUEEEOOA _      _
 !,,,KLLL""$$%<}'9'9!'<'<%<=D      EEE"""""r   rc  max_items_per_batchc                 "   | j         J | j                            | j        |di| j         j                  }|d         |         d         }|
J d            |g|z  }t          d t          || j        | j                  D                       S )	z<Dummy data for profiling and precompiling multimodal models.Nr   )	mm_countsr  r^  r   z!Item should not already be cachedc              3   "   K   | ]
\  }}}|V  d S r   r   )r   rb  rc  s      r   r  z5GPUModelRunner._get_mm_dummy_batch.<locals>.<genexpr>G  s<       
 
%1o 
 
 
 
 
 
r   r[  )	r  rK  get_dummy_mm_inputsr%  r  rR  rD   r
  r  )r   rc  r  dummy_mm_inputsdummy_mm_itemdummy_mm_itemss         r   rh  z"GPUModelRunner._get_mm_dummy_batch1  s     ~))) *>>m.& ? 
 

 (4X>qA ((*M((('+>> 
 
)D{?* * *
 
 
 
 
 	
r   r9  force_attentionr  	skip_eplbcreate_mixed_batchremove_loraactivate_lorais_graph_capturingc                    | j         j        j        }|r/|j        r(t	          j        g           t	          j        g           fS ||                                sJ |r| j        n|}|| j        j	        k    sJ | j        j
        }|r1|rJ t          |dz
  |dz            }||z
  }|dz   }dg|z  |gz   }|}nj|r:|rJ t          |t          ||                    }|g|z  }||z  dk    r||z  |d<   n.t          ||          }||z  }|g|z  }|dxx         ||z  z  cc<   t          |          |k    sJ t          |          |k    sJ t          j        |t          j                  }t%          |                                          }t          j        |t          j                  }|                     ||||d||p|t*          j        k    ||
	  	        \  }}}}}||}n||k    sJ d	| d
| d            |j        }|j        |j        n|}t3          ||||| j         j        j                  \  }}t8                              d||           d} |                     ||||          \  }!}"|s|t*          j        k    r|rdg|z  |dz   gz   }#n|}#|#| j         j        d|<   d| j         j        |d<   | j         !                                 | "                    |          \  }$}|$| j#        j        d|dz   <   | j#        !                                 |t*          j        k    }%| $                    ||||%r|n|||!          \  } }| %                    | j&        |||
|	          5  || j'        k    sJ | (                                }&| j)        r>| j        j*        s2| +                    |          \  }'}(i |&| ,                    |          }&nH| j-        r+d}'| j.        j/        d|         }(| (                                }&n| j0        j/        d|         }'d}(| j1        r| j2        j/        ddd|f         })n8| j3        dk    r| j4        j/        ddd|f         })n| j5        j/        d|         })tm                      j7        rd}*nT| j8        6| j9        :                    | j'        | j        j;        | j<                  | _8        | =                    |dd          }*||d         j        }|||dd<   | >                    |'|(          5  t          | | j         ||||||"          5   | j9        d|'|)|*|(d|&}+ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   | j@        r|+\  },}n|+},| jA        r| jA        B                                s| jA        C                                rt          | jE        t          t          z            sJ | jA        J |o|t*          jH        k    p| o|t*          j        k    o| jA        jI         }-| jJ        jK        r|
rd}-| jE        L                    ||-||"           ddd           n# 1 swxY w Y   | M                                 |s| N                    d|           t          jO        |          dz
  }.t	          jP        |.          Q                    | j<        d          }/|,|,|/         fS )a  
        Run a dummy forward pass to warm up/profile run or capture the
        CUDA graph for the model.

        Args:
            num_tokens: Number of tokens to run the dummy forward pass.
            cudagraph_runtime_mode: used to control the behavior.
                - if not set will determine the cudagraph mode based on using
                    the self.cudagraph_dispatcher.
                - CUDAGraphMode.NONE: No cudagraph, for warm up and profile run
                - CUDAGraphMode.PIECEWISE: Piecewise cudagraph.
                - CUDAGraphMode.FULL: Full cudagraph, attention metadata is
                    needed.
            force_attention: If True, always create attention metadata. Used to
                warm up attention backend when mode is NONE.
            uniform_decode: If True, the batch is a uniform decode batch.
            skip_eplb: If True, skip EPLB state update.
            is_profile: If True, this is a profile run.
            create_mixed_batch: If True, create a mixed batch with both decode
                (1 token) and prefill (multiple tokens) requests.
            remove_lora: If False, dummy LoRAs are not destroyed after the run
            activate_lora: If False, dummy_run is performed without LoRAs.
        Nr      r   r   r   F)	r  r  r  r  r  r  r  r  r  z7Cudagraph runtime mode mismatch in dummy_run. Expected z
, but got rn  r/  r8  )r  r  r  r  r  r  )
batch_sizer!  r
  )r  r  r9  r  r  r  r  )use_cudagraphsr  r  T)r  r  r   r   )Rr	  r%  r  mm_encoder_onlyr   r{  valid_runtime_modesr  r+  r  r<  r  rM   r  r>  r  r  r~  r   r  r  r   ru  r  r  r   r*  rG  rT  rU  r+  rH  r  rx  rr  r  r  maybe_dummy_run_with_lorar(  r;  r  rO  rP  r  rj  r3  r  r  r  rL  r  rM  r  r  r#   r  r  r3  make_empty_intermediate_tensorsr!  r
  r  r  r)   r\  r,  ra  r`  r  r_  r   r   r  enforce_eagerr'  cudagraph_specialize_lora	dummy_runr  r  rm  r  r   )0r   r  r9  r  r  r  r  r  r  r   r  r  r  r  r  num_decode_tokensnum_prefill_tokensr  num_scheduled_tokens_listmin_tokens_per_reqr  r  r  _cudagraph_moder4  r  r  rb  r  r  r  rR  r  rV  r  r  cum_num_tokensrT  r  r  r  r  r  outputsr   r  logit_indiceslogit_indices_devices0                                                   r   rD  zGPUModelRunner._dummy_runP  s	   L $1C	 	62 	6 <##U\"%5%555 #*%99;; +* <" :HW55Z
 T2IIIII,9 	C%%%% !$L1$4jAo F F!+.?!?(1,H *+.?(?CUBV(V%.MM 
	C))))<j-)H)HIIH)6((B%M)Q..0:]0J)"-:|44H!+x!7);(<x(G%%b)))Z(-BB))),--;;;;,--9999!x(ARRR!"6":":"<"<==WXRX>>> 77.!(<)6!&$7& B*m.@@
 &4  -# 8   	L]4H!, ")%4"")_<<<Q+Q Q7MQ Q Q =<<
 '1#-#6#BJ 	 /I ,9/
 /
++ 	9 	
 	
 	
 6:040G0G(+ 3.	 1H 1
 1
-  	48JJJ! ) 3!226H16L5MM(*2DMYhY'*+DMXYY'M%%''' $ ; ;<P Q QNA8FD #A1$45 ,,...-1CCH#==.(+6>Q22M&84  >    M1 ++ 
 
 r	 r	 %(;;;;;2244L& %t/@/S %+/+B+BCT+U+U(	= " ++H55  * % 	 $ 2 67I8I7I J#6688 N./A0A/AB	 $ C 04QQQ8J9J8J5JK		%)) 15aaa9K:K9K6KL		 N./A0A/AB	~~+ '+$$,4
BB'+':"&"3"9#'; C   - (,'O'O%tU( ($ $/ %9$;$F!'3.?(+ ++I}EE #!$0)=+A%/"6!.	 	 	  %$* '')="/	 
 #                              * 0 (#* qq '&  '1133 *;;==  "$,@R0RSSSSS.::: + N2m6MM
 /. I2m6HH	"@ 1??  *D + +%*N&&#1'9"/	 '   [r	 r	 r	 r	 r	 r	 r	 r	 r	 r	 r	 r	 r	 r	 r	z 	++---  	ANNDZN@@@	"677!;$/>>AAKd  B  
  
 m,@AAAs\   &FYU"U5UUUU	UYU	YU	 CYYYc           
      \     j         j        j        }|r|j        rt	          j        g           S t	          j        |          } j                            |          }|	                    d           fd}t          d$i d |d          ddddd |d	          d
 ||	                    d          dz
            di dd dddd d |d          d |d          d |d          dd t                    D             dd t                    D             dd di dt                      }	                      ||          }n9# t          $ r,}dt          |          v rt          d d          ||d }~ww xY w j        rd  t                    D             }t#          j        | j                  }	t)          d! |D                       }
d }t	          j        |
z   |j        d"          j        |j        #          }                     |	|||           |S )%Nr   c                 >    t          j        f| j                  S )Nr  )r   rK  r
  )r  r  r   s    r   r   z3GPUModelRunner._dummy_sampler_run.<locals>.<lambda>  s    %*h[!DK"P"P"P r   temperatureg      ?
all_greedyF
all_randomtop_pg?top_kr   r  max_num_logprobsno_penaltiesTr  frequency_penaltiesg?presence_penaltiesrepetition_penaltiesr  c                     g | ]}g S r   r   r  s     r   r   z5GPUModelRunner._dummy_sampler_run.<locals>.<listcomp>  s    :::Qb:::r   spec_token_idsc                     g | ]}g S r   r   r  s     r   r   z5GPUModelRunner._dummy_sampler_run.<locals>.<listcomp>  s    8881B888r   allowed_token_ids_maskbad_words_token_idsr  r  out of memoryz9CUDA out of memory occurred when warming up sampler with m dummy requests. Please try lowering `max_num_seqs` or `gpu_memory_utilization` when initializing the engine.c                     g | ]}d gS r  r   r  s     r   r   z5GPUModelRunner._dummy_sampler_run.<locals>.<listcomp>  s    <<<qs<<<r   c              3   4   K   | ]}t          |          V  d S r   )r>  )r   r  s     r   r  z4GPUModelRunner._dummy_sampler_run.<locals>.<genexpr>  s(      AA#SXXAAAAAAr   r   ra  r   )r	  r%  r  r  r   r{  	rand_liker3  rK  r  r}   r  rz   rU  r=  r  r,  r   
make_dummyr
  r  randnr   r!  rd  )r   r   r  r   dummy_tensorsdummy_metadatar  r  r  dummy_spec_decode_metadatar  draft_probsr  s   `           @r   _dummy_sampler_runz!GPUModelRunner._dummy_sampler_run  s(    $1C	 	$2 	$<###66**=99;;q>>PPPPP) 
 
 
%c***
u
 u
  -$$$	

  -A 2333
 r
 "T
 
 "T
 !.c 2 2 2
  -}S111
 "/s!3!3!3
 ;:%//::::
 98h8888
 $(4
  !#!
" )***#
&	!\\ *  NN  		 		 		#a&&(("// / / 
  		 " 	<<E(OO<<<O);)F* *& AAAAAAAJ K[X%R {l	  F ""*	   s   >E 
F 'FFr  c           	         |j         d         }| j        j        }t          ||          }||z  }t	          j        ||          }|dxx         ||z  z  cc<   t	          j        |          |k    sJ t          |          |k    sJ ||z  }t          j	        |          }	t          j
        ||ft          j        | j                  }
t          t          |                                           }t!          |          }|                    || j                   |j                            |          }|                    |           t-          |	|
|g|z  d t/          |          D                       }|                    ||	|j                   	 |                    ||	          S # t2          $ r/}d
t5          |          v rt3          d|d| d          ||d }~ww xY w)Nr   r   r$  )r  )r  r%  c                 *    g | ]}t                      S r   )ry   )r   r   s     r   r   z9GPUModelRunner._dummy_pooler_run_task.<locals>.<listcomp>  s    EEEMOOEEEr   )r  r  r  pooling_states)r  r
  r  r(  z9CUDA out of memory occurred when warming up pooler (task=z) with r)  )r   r+  r<  r  r  rK  r  r>  r   r  r  r~  r
  r   r=   r  rE   verifyr%  r  r  r  rx   r  r  r=  r  )r   r   r  r  r  r  r  r  req_num_tokensdummy_prompt_lensdummy_token_idsr3  dummy_pooling_paramsr4  r0  r  s                   r   _dummy_pooler_run_taskz%GPUModelRunner._dummy_pooler_run_task  sS   
 #(+
,9z<00'83"$'(4F"G"G###zH'<<###v-..*<<<<*++x7777#x/!,-DEE+~&ek$+
 
 
 ($..*:*:;;,$777##D<M#NNNL44T::	,---(),01H<EEU8__EEE	
 
 
 	++#* ' 	, 	
 	
 	
	<<+n       		 		 		#a&&(("// /&./ / / 
  		s   F, ,
G%6*G  G%c                    | j         j        j        }|r|j        rt	          j        g           S |                                 }|st          d| j        j         d          t          t          t          f                     }|D ]5}|                     ||          }t          d |D                       ||<   ~6t          |                                d           d         }|                     ||          S )NzModel zq does not support any pooling tasks. See https://docs.vllm.ai/en/latest/models/pooling_models.html to learn more.c              3   (   K   | ]}||j         V  d S r   )nbytes)r   os     r   r  z3GPUModelRunner._dummy_pooler_run.<locals>.<genexpr>6  s$      #N#NAH#N#Nr   c                     | d         S r  r   r   s    r   r   z2GPUModelRunner._dummy_pooler_run.<locals>.<lambda>9  s
    !A$ r   )keyr   )r	  r%  r  r  r   r{  r  r=  r3  r  rI   floatr<  r  rn  r  )r   r   r  supported_pooling_tasksoutput_sizer  r   max_tasks           r   _dummy_pooler_runz GPUModelRunner._dummy_pooler_run  s   
 $1C	 	$2 	$<### #'"B"B"D"D& 	!*0 ! ! !   ;-.00+ 	 	D00EEF ##N#Nf#N#N#N N NK{((**???B**=(CCCr   c                 B   | j         r| j        j        }|"|j        rt                              d           n| j        }|J |                                x}dk    r|                                }|j	        |         }t                              d|||           | 
                    ||          } | j        j        di |}t          ||           t          |          D ]\  }}	|	| j        d| <   |                     | j        d          \  }
}t%                      j        r3| j        r|                     |
          }	n|                     |          }	nd }	|                                  ~
~	| j                                         t3          j                     d S )	NzCSkipping memory profiling for multimodal encoder and encoder cache.r   zxEncoder cache will be initialized with a budget of %s tokens, and profiled with %s %s items of the maximum feature size.rE  tmp_T)r  r   )rO  r%  r  skip_mm_profilingrT  r  r  get_encoder_budgetrg  max_items_per_batch_by_modalityrh  r3  rS  r   r  r[  rD  r;  r#   r]  r>   rG  r3  r  r   gccollect)r   r  r  encoder_budgetri  max_mm_items_per_batchbatched_dummy_mm_inputsdummy_encoder_outputsr   r   r   last_hidden_statess               r   profile_runzGPUModelRunner.profile_run<  s   " -	@);I$)D$%   
 !N	 ,,,&/&B&B&D&DDNII &/%K%K%M%MN-6-V&.* KK0 '.&   /3.F.F&./ /+ -HDJ,G - -1- -) 4-+A    &//D%E%E @ @	69?*:!::66 -1OOD -< -
 -
)) >>& 	$ E//>>001CDDF6  """

r   c                    | j         j        t          j        k    rt                              d           dS t          xj        dz  c_        t          j	                    }t          d             }t          d            |            5  t          | j                  5  t          j                                        d         }| j                                        D ]\  }}|                     ||           t          j                                         t          j                                        d         }d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   t          d           t+                       t          j	                    }||z
  }||z
  }	t                              d	||	d
z  d           |	S )NzrSkipping CUDA graph capture. To turn on CUDA graph capture, ensure `cudagraph_mode` was not manually set to `NONE`r   r   c               3   .  K   t          j                     t          j         } | rt          j                     	 d V  | r(t          j                     t          j                     d S d S # | r't          j                     t          j                     w w xY wr   )rM  rN  r  VLLM_ENABLE_CUDAGRAPH_GCfreezeunfreeze)should_freezes    r   	freeze_gcz/GPUModelRunner.capture_model.<locals>.freeze_gc  s      
 JLLL $ ==M 	!  !KMMMJLLLLL! != !KMMMJLLLL!s   A) )+BTr  )batch_descriptorsr9  Fz4Graph capturing finished in %.0f secs, took %.2f GiBr  r  r  )r'  rt  r   ru  rT  warningr   num_gpu_runner_capture_triggersr  r  r   r   r%   r
  r   r   mem_get_infor  get_capture_descs_capture_cudagraphsr   r   r  )
r   
start_timer[  start_free_gpu_memoryr
  batch_descsend_free_gpu_memoryend_timeelapsed_timecuda_graph_sizes
             r   capture_modelzGPUModelRunner.capture_model}  s\   "1]5GGGNNI   1;;q@;;&((
		! 	! 
	!$ 	(---Y[[ 	? 	?-t{;;; 	? 	?$)J$;$;$=$=a$@!
 *<<>>  ((&1+7 )    
 J""$$$"'*"9"9";";A">	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	? 	?( 	(... 	$&&*,/2EEBw'	 	 	
 	
 	
 s7   EBE;EE	EE	EE"%E"r\  c                    |t           j        k    r|                                sJ d|             |sd S |d         j        }|t           j        k    }t          j        | j        |dd|          }t                      r:t          || j
        j         d                    |rdnd|j                  	          }|D ]}|j        }|j        }| j        j        o-|t           j        k    o|ot%          | j        j        ||
          }	t)          | j        j                  D ]}
 ||t           j        |	|            ||||	|d           |                     | j                   d S )Nz Invalid cudagraph runtime mode: r   TF)r  r  r   r  zCapturing CUDA graphs ({}, {})decodezmixed prefill-decode)disabledesc)r  r  r  )r9  r  r  )r9  r  r  r  )r   ru  r  uniformrH  r  partialrD  r&   r   r)  use_tqdm_on_loadformatr  r  r  r*  rF  r   r	  r  r'  cudagraph_num_of_warmupsmaybe_remove_all_lorasr(  )r   r\  r9  r  r  r  r4  r  r  r  rb  s              r   ra  z"GPUModelRunner._capture_cudagraphs  s    #m&888&::<< 98F.DFF 98 = ! 	F*1-50M4FF%O)+
 
 
	  !! 	 $! ,==5<< .JHH4J*/ ! ! ! , '	 '	J#.J&/M $2 *m.@@" ,+;)#1  	   42KLL   	+8+=(;"/	     I'=$7+#'     	##D$455555r   r  c                    
 t           j                  dk    s
J d             G d dt                    
dt          dt          t
          
t          t                   f         t          t          t                            f         f
 fd}dt
          
t          t                   f         d	t          dt          t                   fd
}g }g }|j        D ]C} ||          }|                    |d                    |                    |d                    D                     ||j                   t!           j                   t%          |          D ])\  }}	 j                             ||	|                     *dS )zT
        Initialize the attention backends and attention metadata builders.
        r   z*Attention backends are already initializedc                   0    e Zd ZU ee         ed<   eed<   dS )AGPUModelRunner.initialize_attn_backend.<locals>.AttentionGroupKeyattn_backendr  N)r   r   r   r  rU   r  ri   r   r   r   AttentionGroupKeyrv    s0         /0000&&&&&&r   rx  kv_cache_group_specr   c                   	 t          t          t                   t                    }t	          j        || j                  }i 	t          t                    }| j        D ]}||         	                                }|j
        v rt          d|          }|                                }| j        }t          |t                    r|j        |         }||f} 
||          	|<   ||                             |           	fd|                                D             t'          d 	                                D                       fS )NFastPrefillc                 (    i | ]\  }}|         |S r   r   )r   r  r  attn_backendss      r   r  z_GPUModelRunner.initialize_attn_backend.<locals>.get_attn_backends_for_group.<locals>.<dictcomp><  s$    MMMAq!1MMMr   c              3   $   K   | ]}|j         V  d S r   )rw  )r   	group_keys     r   r  z^GPUModelRunner.initialize_attn_backend.<locals>.get_attn_backends_for_group.<locals>.<genexpr>=  s%      SSyI*SSSSSSr   )r   r  r   r-   r   r	  r  r   r   get_attn_backendr  r]   full_cls_namer  r  rl   r  r  r  r  r  )ry  
layer_typelayersattn_backend_layersr  rw  r  layer_kv_cache_specrB  r}  rx  r   s            @r   get_attn_backends_for_groupzKGPUModelRunner.initialize_attn_backend.<locals>.get_attn_backends_for_group  sr    d3i);<<J0 *.A.M F M"-d"3"3 2= < <
%j1BBDD!MMM#E%$$ $L
 !- : : < <&9&G#13JKK Y*=*LZ*X'$&9:%6%6 "5& &c" $C(//
;;;;MMMM1D1J1J1L1LMMMSSM<P<P<R<RSSSSS r   attn_backends_mapkv_cache_group_idc                     g }|                                  D ]/\  \  }}}t          ||||          }|                    |           0|S r   )r  r   r  )r  r  rZ  rw  r  r  r  s          r   create_attn_groupszBGPUModelRunner.initialize_attn_backend.<locals>.create_attn_groups@  sh     13K>O>U>U>W>W / /:-}{+ !%	 
 "":....r   r   N)r>  rZ  r   rh   rm  r  r   r  r  r  rU   r   r   r  r   _check_and_update_cudagraph_moder   r	  r  )r   r  r  r  attention_backend_mapsattention_backend_listry  r}  r   attn_backend_maprx  s   `         @r   initialize_attn_backendz&GPUModelRunner.initialize_attn_backend  s    4#$$)))+W)))	' 	' 	' 	' 	'
 	' 	' 	'#	!1#	4)4945s4@P;Q7RRS#	 #	 #	 #	 #	 #	 #	J	#$5tCy$@A	"	 .!	 	 	 	  "$!##2#B 	< 	<778KLLM"))-*:;;;"))-*:;;;; 	--"O$C	
 	
 	

 	))9:::#,-C#D#D 	M 	MA##$6$67G$K$KLLLL	M 	Mr   r  c           
      B   t          t          |j                            D ]h}| j        |         D ]X}|                    | j        | j        |t          |          k     r||         nd| j        j        sdn| j        j	                   Yi| 
                                 dS )zW
        Create the metadata builders for all KV cache groups and attn groups.
        Nr   )num_metadata_builders)r  r>  r  rZ  create_metadata_buildersr	  r
  r*  rF  rG  !calculate_reorder_batch_threshold)r   r  r  r  r  s        r   initialize_metadata_buildersz+GPUModelRunner.initialize_metadata_buildersb  s     "'s?+J'K'K!L!L 	 	"./@A 
 

33$K(3/A+B+BBB ''899/=+;!!-: 4 	 	 	 	
 	..00000r   attention_backendsr  c           	         t           j        }d}t          ||          D ]W\  }}|D ]O}|                                }|                    | j        |j                  }	|	j        |j        k     r	|	}|j        }PX| j	        j
        }
|
J |
                                t          j        k    r|t           j        k    rd|
j         d| d| d}|t           j        k    r|dz  }t!          |          | j	                                        r|dz  }t          j        x}
| j	        _
        n|dz  }t          j        x}
| j	        _
        t(                              |           |
                                t          j        k    r|t           j        k    rd|
j         d| d| d}| j	        j        t0          j        k    rC| j	                                        s| j	        j        r|d	z  }t          j        x}
| j	        _
        n|d
z  }t          j        x}
| j	        _
        t(                              |           |
                                t          j        k    r| j        dk    r|j        t           j        j        k     rd|
j         d| d| d}| j	                                        r|dz  }t          j        x}
| j	        _
        n|dz  }t          j        x}
| j	        _
        t(                              |           |
                                r.|t           j        k    rt!          d|
j         d| d| d          |
                                t          j        k    r`|
                                 rL| j        dk    rA| j	        !                    | j        | j"        j#                   | j	        j$        }||ng | _%        |
| j	        _
        | j&        '                    |
| j                   | j(        rQ| j(        )                                r:tU          | j+        tX                    sJ | j+        '                    |
           dS dS dS )z
        Resolve the cudagraph_mode when there are multiple attention
        groups with potential conflicting CUDA graph support.
        Then initialize the cudagraph_dispatcher based on the resolved
        cudagraph_mode.
        NzCUDAGraphMode.z is not supported with z backend (support: r  zU; please try cudagraph_mode=PIECEWISE, and make sure compilation mode is VLLM_COMPILEz+; setting cudagraph_mode=FULL_AND_PIECEWISEz); setting cudagraph_mode=FULL_DECODE_ONLYzJ; setting cudagraph_mode=PIECEWISE because attention is compiled piecewisezI; setting cudagraph_mode=NONE because attention is not compiled piecewiser   z9 is not supported with spec-decode for attention backend z (support: z"; setting cudagraph_mode=PIECEWISEz; setting cudagraph_mode=NONEz backend (support:zW) ; please try cudagraph_mode=PIECEWISE, and make sure compilation mode is VLLM_COMPILE)-rV   ALWAYSr   get_builder_clsget_cudagraph_supportr	  r  r  r   r'  rt  
mixed_moder   rH  r  NEVERrc  splitting_ops_contain_attentionFULL_AND_PIECEWISEFULL_DECODE_ONLYrT  r]  decode_moder  r   VLLM_COMPILEuse_inductor_graph_partitionr  ru  r  UNIFORM_BATCHr  separate_routine&adjust_cudagraph_sizes_for_spec_decoder*  r  rs  rw  r  initialize_cudagraph_keysr,  ra  r  r_  r   )r   r  r  min_cg_supportmin_cg_backend_nameattn_backend_setr  rw  builder_cls
cg_supportrt  r  capture_sizess                r   r  z/GPUModelRunner._check_and_update_cudagraph_modey  sU    ,2"031
 1
 	@ 	@,n !1 @ @*::<<(>>$n&B 
 #n&:::%/N*6*?'@ 0?))) %%''=+==="4";;;%!4 % %+% %!% % % 
 !3!999A !oo% &FFHH 	DD!4!8!G!G BB!2!8!G NN3 &&((M,>>>"4":::%!4 % %+% %!% % % 
 &+/KKK'GGII L*G L 6
 "+!8!G!G :
 "&!8!G NN3
 &&((M,>>>-11$'9'G'MMME!4 E E&E E3AE E E 
 &FFHH 	;;!+!8!G!G 66!&!8!G NN3
 ..00
	"4":::A!4 A A"5A A)A A A   &&((M,>>>//11 ?-11#JJ-t/C/X   !3KM!.!: & 2@.!;;D9	
 	
 	

 " 	Ct'>'H'H'J'J 	CdlM:::::L22>BBBBB	C 	C 	C 	Cr   c                     d }d |                                  D             }t          |          dk    r	d| _        dS t          ||          | _        dS )a  
        Choose the minimum reorder batch threshold from all attention groups.
        Backends should be able to support lower threshold then what they request
        just may have a performance penalty due to that backend treating decodes
        as prefills.
        c                 2    || n| |nt          | |          S r   )r  )abs     r   r   zBGPUModelRunner.calculate_reorder_batch_threshold.<locals>.<lambda>   s     !)QQaiSQRTUYY r   c                 @    g | ]}|                                 j        S r   )r  r  r   groups     r   r   zDGPUModelRunner.calculate_reorder_batch_threshold.<locals>.<listcomp>"  s7     6
 6
 6
 &&((@6
 6
 6
r   r   N)_attn_group_iteratorr>  r  r
   )r   min_none_highreorder_batch_thresholdss      r   r  z0GPUModelRunner.calculate_reorder_batch_threshold  st     WV6
 6
22446
 6
 6
  '((A--+/D(F'-m=U'V'V$$$r   kv_manager_block_sizerZ  c                 H   dt           t          t                            dt          dt          fd}d |D             } |||           r| S t          d |D                       }t          |d          D ]}| |z  d	k    r |||          r|c S t          d
|  d          )a  
        Select a block size that is supported by all backends and is a factor of
        kv_manager_block_size.

        If kv_manager_block_size is supported by all backends, return it directly.
        Otherwise, return the max supported size.

        Args:
            kv_manager_block_size: Block size of KV cache
            attn_groups: List of attention groups

        Returns:
            The selected block size

        Raises:
            ValueError: If no valid block size found
        backendsrp  r   c                     | D ]u}d}|                                 D ]W}t          |t                    r	||k    rd} t          |t                    r||j        z  dk    rd}Ft          d|           |s dS vdS )zO
            Check if the block size is supported by all backends.
            FTr   zUnknown supported size: ) get_supported_kernel_block_sizesr  r   r[   baserc  )r  rp  r2  is_supportedsupported_sizes        r   block_size_is_supportedzHGPUModelRunner.select_common_block_size.<locals>.block_size_is_supportedC  s     $ ! !$&-&N&N&P&P V VN!.#66 V%77+/L#NJ?? V%(;;q@@+/L()TN)T)TUUU# ! 55!4r   c                     g | ]	}|j         
S r   )r2  r  s     r   r   z;GPUModelRunner.select_common_block_size.<locals>.<listcomp>X  s    ;;;eEM;;;r   c              3   r   K   | ]2}|                                 D ]}t          |t                    |V  3d S r   )r  r  r   )r   r2  r  s      r   r  z:GPUModelRunner.select_common_block_size.<locals>.<genexpr>g  si       &
 &
")"J"J"L"L&
 &
 .#..	&
&
 &
 &
 &
 &
 &
 &
r   T)reverser   zNo common block size for z. )r   r  rU   r   r   r  rv  rc  )r  rZ  r  r  all_int_supported_sizesr  s         r   select_common_block_sizez'GPUModelRunner.select_common_block_size-  s   ,	4 012	@C		 	 	 	* <;{;;; #"8-BCC 	)(( #& &
 &
#&
 &
 &
 #
 #
 %%<dKKK 	& 	&N$~5::&&x@@ &%%%%&N5JNNNOOOr   c                 @   d |j         D             }g }t          | j        | j                  }t	          |j                   D ]\  }}t          |j        t                    r t          |||         t                      z            }t          |j        t                    r-| j        j        r|nd|j        j        z   }	t          ||	          }|                    |           || j        j        gk    s|| j        j        gk    r| j        j        dk    s
J d            t#          | j        || j        | j        | j        | j                                        |||t1          | j        j                  | j        j        | j        j        | j                  | _        dS dS )a]  
        Re-initialize the input batch if the block sizes are different from
        `[self.cache_config.block_size]`. This usually happens when there
        are multiple KV cache groups.

        Args:
            kv_cache_config: The KV cache configuration.
            kernel_block_sizes: The kernel block sizes for each KV cache group.
        c                 Z    g | ](}t          |j        t                    |j        j        )S r   )r  r  re   rp  )r   r  s     r   r   z?GPUModelRunner.may_reinitialize_input_batch.<locals>.<listcomp>  sB     
 
 
n:<TUU
(3
 
 
r   r   r   zCannot re-initialize the input batch when CPU weight offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 for more details.)r  r  r  r
  r  r   r  r  max_num_blocks_per_reqr  r  r  r>   N)r  rn  r  rR  r  r  r  re   rM   r   rj   r&  enable_prefix_cachingnum_speculative_blocksr  rp  r/  r   r  r;  r
  r  r%  ro  r   r	  r,  rq  r  r  r>   )
r   r  r  r  max_num_blocksr  r   r  r  mamba_blocks_per_reqs
             r   may_reinitialize_input_batchz+GPUModelRunner.may_reinitialize_input_batchu  s   
 
"1"A
 
 

 D.0DEE!*?+J!K!K 	: 	:A~.68PQQ %){1~0G0I0II& &" .6	BB  (>**"0G	(H$
 *-*,@* *& !!"899994,7888<N(S
 =
 =
 $3q888$ 988
  *!.+'+':{?,;;=='#5'5#D$4$GHH ,8262B2d!%!6     D=
 =
r   c                 r   i }|j         D ]<}t          j        |j        t          j        | j                  }|j        D ]}|||<   =t                      }|j        D ]+}|j	        D ]!}|| j
        v r|                    |           ",|t          |                                          k    s
J d            |S )au  
        Initializes the KV cache buffer with the correct size. The buffer needs
        to be reshaped to the desired shape before being used by the models.

        Args:
            kv_cache_config: The KV cache config
        Returns:
            dict[str, torch.Tensor]: A map between layer names to their
            corresponding memory buffer for KV cache.
        r$  z)Some layers are not correctly initialized)kv_cache_tensorsr   r  r  int8r
  	shared_byr  r  r  r  r  r  )r   r  kv_cache_raw_tensorskv_cache_tensorr{  r  r  r  s           r   _allocate_kv_cache_tensorsz)GPUModelRunner._allocate_kv_cache_tensors  s     9;.? 	: 	:O[$EJt{  F .7 : :
39$Z00: ee$4 	, 	,E#/ , ,
!===
++++, c"6";";"="=>>>>>7 ?>> $#r   c                 J    t           j                            | j                  S r   )	itertoolschainfrom_iterablerZ  r   s    r   r  z#GPUModelRunner._attn_group_iterator  s    ,,T-=>>>r   c              #   J   K   | j         j        sd S | j        D ]
}|E d {V  d S r   )r  r  rZ  )r   rZ  s     r   "_kv_cache_spec_attn_group_iteratorz1GPUModelRunner._kv_cache_spec_attn_group_iterator  sP      #3 	F+ 	# 	#K"""""""""	# 	#r   c                 J   g }t          |j                  D ]
\  }}|j        }t          |t                    r3t          t          |j                                                            }t          |t                    rkt          |t                    rE| j        |         }|j        j        }|                     ||          }|                    |           t          |t                    r|                    |j                   t!          d|j                   |S )a  
        Generate kernel_block_sizes that matches each block_size.

        For attention backends that support virtual block splitting,
        use the supported block sizes from the backend.
        For other backends (like Mamba), use the same block size (no splitting).

        Args:
            kv_cache_config: The KV cache configuration.

        Returns:
            list[int]: List of kernel block sizes for each cache group.
        zunknown kv cache spec )r  r  r  r  rl   rR  iterr  r  re   rb   rZ  rp  r  r  rj   NotImplementedError)	r   r  r  r  r  r  rZ  r  selected_kernel_sizes	            r   _prepare_kernel_block_sizesz*GPUModelRunner._prepare_kernel_block_sizes  s9     ,5o6U,V,V 	 	(L.*8M-)@AA R !%T-*F*M*M*O*O%P%P Q Q-)ABB M=99  #.|<(6(D(O%'+'D'D);( ($ #))*>????M955  #))-*BCCCC)K^-IKK   "!r   r  c           
      <   i }d\  }}|                                  D ]}|j        }|j        }	|j        t	          |          k    r*||j                 }
|j        D ]}|| j        v r||         }|                                |j        z  dk    sJ |                                |j        z  }t          |t                    r<d}|j        |
z  }||z  }|	                    ||
|j        |j        | j        j                  |j        }	 |	                                t	                    t	                    k    sJ n@# t&          t(          f$ r, t+          t-          t	                                        Y nw xY wt+          fdD                       fdt-          t	                              D             } ||                             |                                        j        | ||<   t          |t2                    rd}||         }g }d}t5          |j        |j                  D ]\  }}t;          |          }|j        |z  }|g|R }t=          j        |                                           }|g|dd         R }||z  dk    sJ t=          j!        |                    |          ||||z  	          }|"                    |           ||d         |z  z  }|||<   t(          |r|r| #                    |           |S )
a  
        Reshape the KV cache tensors to the desired shape and dtype.

        Args:
            kv_cache_config: The KV cache config
            kv_cache_raw_tensors: The KV cache buffer of each layer, with
                correct size but uninitialized shape.
            kernel_block_sizes: The kernel block sizes for each KV cache group.
        Returns:
            Dict[str, torch.Tensor]: A map between layer names to their
            corresponding memory buffer for KV cache.
        FFr   T)cache_dtype_strc              3   (   K   | ]}|         V  d S r   r   )r   r   kv_cache_shapes     r   r  z;GPUModelRunner._reshape_kv_cache_tensors.<locals>.<genexpr>?  s9       + +./q)+ + + + + +r   c                 :    g | ]}                     |          S r   )ru  )r   r   kv_cache_stride_orders     r   r   z<GPUModelRunner._reshape_kv_cache_tensors.<locals>.<listcomp>C  s7     ! ! ! .33A66! ! !r   r   N)r  stridestorage_offset)$r  r  r2  r  r>  r  r  r  page_size_bytesr  rb   rp  get_kv_cache_shaper  	head_sizer&  r0  r!  get_kv_cache_stride_orderAttributeErrorr  rm  r  viewpermuterj   r   shapesdtypesrS   r   r  r  
as_stridedr  %_update_hybrid_attention_mamba_layout)r   r  r  r  rW  has_attn	has_mambar  r  rw  kernel_block_sizer  
raw_tensor
num_blocksnum_blocks_per_kv_blockkernel_num_blocksr!  	inv_orderstate_tensorsstorage_offset_bytesr   
dtype_sizenum_element_per_pagetarget_shaper  target_strider{  r  r  s                              @@r   _reshape_kv_cache_tensorsz(GPUModelRunner._reshape_kv_cache_tensors  s   $ .0	*)<<>> M	. M	.E!/M =L&#.@*A*AAA 253J K#/ F. F.
!===1*=
!''))M,IIQNNNN'--//=3PP
m];; @.#H%04EE , )35L(L%%1%D%D))%2%/(,(9(E &E & &N */ER0<0V0V0X0X-"#899S=P=PPPPPP*,?@ R R R05eC<O<O6P6P0Q0Q---R &+ + + + +3H+ + + & &N! ! ! !!&s+@'A'A!B!B! ! !I
!,Z8en-- )- j))  y99 . $I!5j!AJ$&M+,((+M,@-BV(W(W G Gu%3E%:%:
)9ZG - )3';U';';!&\!:!:!A!A!C!C)=(Kqrr
(K(K3j@AEEEE!&!1&OOE22!-#0+?:+M	" " " &,,V444,q	J0FF,,,9Ij))--MF.P  	B	 	B66yAAAs   6E:F ?F rW  c           	         |                                  D ]}|j        }|j        D ]}||         }t          |t                    r|j        d         dk    r|j        d         dk    sJ d|j                     |j        dd                                         }|                    |j        |d|z  g|                                dd         R            dS )z
        Update the layout of attention layers from (2, num_blocks, ...) to
        (num_blocks, 2, ...).

        Args:
            kv_caches: The KV cache buffer of each layer.
        r   r  r   zkFail to determine whether the layout is (2, num_blocks, ...) or (num_blocks, 2, ...) for a tensor of shape N)r  r  )	r  r  r  r  rb   r   r  as_strided_r  )r   rW  r  r  r  kv_cachehidden_sizes          r   r  z4GPUModelRunner._update_hybrid_attention_mamba_layoutm  s    <<>> 	 	E!/M#/  
$Z0m];; 
q@QUV@V@V#>!,111>-5^> > 211
 #+."4":":"<"<K((%^ +Q_Ux?P?PQRQSQS?TUU )   	 	r   c                    | j         j        }|                     | j        |          r6|                     || j        || j        |          \  }}}|| _        || _        n,|                     |          }| 	                    |||          }| j
                                        D ],\  }}	t                              d||	           ||	         ||<   -| j        j        j        dk    rdnd}
t#          || j        j        | j        |
           |S )a\  
        Initialize the memory buffer for KV cache.

        Args:
            kv_cache_config: The KV cache config
            kernel_block_sizes: The kernel block sizes for each KV cache group.

        Returns:
            Dict[str, torch.Tensor]: A map between layer names to their
            corresponding memory buffer for KV cache.
        z%s reuses KV cache of %slongcat_flashr  r   )r&  r0  use_uniform_kv_cacherZ  allocate_uniform_kv_cachesr
  rX  rY  r  r   r  r  rT  rU  r%  r  
model_typer   r'  r  rW  )r   r  r  r0  rW  rX  rw  r  r  target_layer_namenum_attn_modules              r   initialize_kv_cache_tensorsz*GPUModelRunner.initialize_kv_cache_tensors  sM     '3$$T%5{CC 	//#$K&  ;I,l *?D&-9D** $(#B#B?#S#S  66!57I I
 .2-H-N-N-P-P 	A 	A)J)LL3ZARSSS$-.?$@Ij!! ",7?JJAAPQ 	 	#:N		
 	
 	
 r   c                    | j         sdS t          | j         |j        | j                   | j        j        rRt          | j        t                    }t          |          D ]*}|| j         v r| j
                            |           & dS dS dS )z
        Add layers that re-use KV cache to KV cache group of its target layer.
        Mapping of KV cache tensors happens in `initialize_kv_cache_tensors()`
        N)r  r   r  r  r&  r  r   r	  r   reversedr  r  )r   r  r  r  s       r   .maybe_add_kv_sharing_layers_to_kv_cache_groupsz=GPUModelRunner.maybe_add_kv_sharing_layers_to_kv_cache_groups  s     * 	F0'+(	
 	
 	
 4 		 6d6F	RRK&{33  
!<<<@DDZPPPPEE		 		
 r   c                    t          |          }|| _        |                                  |                     |           |                     |           |                     |          }|                     ||           |                     ||           |                     ||          }| j	        rp| j	        
                                s| j	                                        r>t          | j        t          t          z            sJ | j                            |           t#                      rnt%                      }| j        *| j        J |                    | j        | j                   n|                    |           |                    t0                     | j        j        r|                                  dS dS )z
        Initialize KV cache based on `kv_cache_config`.
        Args:
            kv_cache_config: Configuration for the KV cache, including the KV
            cache size of each layer
        N)r   r  .may_add_encoder_only_layers_to_kv_cache_configr  r  r  r  r  r  r,  ra  r`  r  r_  r   r   validate_same_kv_cache_groupr    r   rX  rY  register_cross_layers_kv_cacheregister_kv_cachesset_host_xfer_buffer_opsr!   r%  r  init_routed_experts_capturer)r   r  r  rW  kv_transfer_groups        r   initialize_kv_cachez"GPUModelRunner.initialize_kv_cache  s    #?33.;;===;;OLLL$$_555 "==oNN 	))/;MNNN 	))/;MNNN44/
 
	 " 	G#--//	G&7799	G dlM<N,NOOOOO L55oFFF "" 		G 5 7 7)55AAA!@@.0N    "44Y???66~FFF9 	0--/////	0 	0r   c                 H   t                               d| j        j                   t	          j                    }| j        j        }| j        j	        t          | j        j                  z  dz   |z  | _        |                    | j        j        | j        | j                   d S )NzFInitializing routed experts capturer, enable_return_routed_experts: %sr   )r  max_num_kv_tokensr	  )rT  r  r%  r  r.   creater&  rp  r  r  r>  r  r  init_bufferr+  r  r	  )r   routed_experts_capturerrp  s      r   r  z+GPUModelRunner.init_routed_experts_capturer  s    T:	
 	
 	
 #8">"@"@&1
 +s43G3W/X/XX" 	 ++#'#8#O"4( 	, 	
 	
 	
 	
 	
r   c                 ~   | j         j        j        }t          t                    }t          | j         t                    }|                                D ]q\  }}|j        t          j
        k    rWt          ||j        |j        | j                  }||                             |           | j                            |           rt%          |          dk    rdt%          |          dk    s
J d            |                                \  }}| j        j                            t-          ||                     dS dS )zA
        Add encoder-only layers to the KV cache config.
        )rp  r  r  r!  r   r   z0Only support one encoder-only attention spec now)r  r  N)r	  r&  rp  r   r   r   r   r  	attn_typerY   ENCODER_ONLYre   r  r  r1  r  r  r  r>  popitemr  r  rh   )	r   rp  encoder_only_attn_specsr  r  attn_module	attn_specr7  r  s	            r   r  z=GPUModelRunner.may_add_encoder_only_layers_to_kv_cache_config  sP    %2=
BMdBSBS1$2BINN'2'8'8':': 		= 		=#J$(BBB+C)!,!9)3-	, , ,	 (	299*EEE,00<<<&''!++.//1444B 544 !8 ? ? A AD+ 077 [MMM     ,+r   c                    t                      rt                      j        ri S i }t          t          t
                   t                    }t          | j        |          }|	                                D ]O\  }}t          |t                    r|j        x}r|| j        |<   .|                    | j                  x}r|||<   P|S )a0  
        Generates the KVCacheSpec by parsing the kv cache format from each
        Attention module in the static forward context.
        Returns:
            KVCacheSpec: A dictionary mapping layer names to their KV cache
            format. Layers that do not need KV cache are not included.
        )r   r   rC  r   r  r   r-   r   r	  r  r  r   kv_sharing_target_layer_namer  get_kv_cache_spec)r   r  r  r  r  r#  kv_tgt_layerr7  s           r   r'  z GPUModelRunner.get_kv_cache_spec9  s      	!2!2!> 	I02$s)%788
1$2BJOO'2'8'8':': 	1 	1#J+y11  + HH ;G+J7"44T5EFFFt 1,0j)r   c                     | j         d |j        d                  }|                    |d           | j                                         | j                                         |                                S rv  )r  r   rz  r  r   r   r   )r   r   pinneds      r   r  zGPUModelRunner._to_listY  so     23O5F5LQ5O3OP&T:::""$$$'')))}}r   c                     | j         5  d | j                                        D             }| j                                         |cddd           S # 1 swxY w Y   dS )z
        Get encoder timing stats for all requests and clear the registry.

        Returns:
            Dictionary mapping request_id to stats dict.
        c                 >    i | ]\  }}||                                 S r   )to_dict)r   r  	stats_objs      r   r  z;GPUModelRunner.get_encoder_timing_stats.<locals>.<dictcomp>p  s:       %FI 	))++  r   N)r|  ry  r  r   )r   statss     r   get_encoder_timing_statsz'GPUModelRunner.get_encoder_timing_statsh  s     & 	 	 )-)E)K)K)M)M  E (..000	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   >AAArW  group_lora_refsrb  rd  c              #     K   |sdV  dS ||||z            }d |D             }t           j                                         t          j                    }	 dV  t           j                                         t          j                    |z
  }|t          t          |          d          z  }	| j        5  |D ]N}
|
| j        vrt                      | j        |
<   | j        |
         }|xj
        |	z  c_
        |xj        dz  c_        O	 ddd           dS # 1 swxY w Y   dS # t           j                                         t          j                    |z
  }|t          t          |          d          z  }	| j        5  |D ]N}
|
| j        vrt                      | j        |
<   | j        |
         }|xj
        |	z  c_
        |xj        dz  c_        O	 ddd           w # 1 swxY w Y   w xY w)a@  
        Context manager to time encoder forward operations.

        Args:
            should_time: Whether timing is enabled
            group_lora_refs: Full list of (request_id, pos_info) tuples
            current_item_idx: Starting index for this group
            num_items: Number of items in this group
        Nc                     h | ]\  }}|S r   r   )r   r  rb  s      r   	<setcomp>z9GPUModelRunner.timed_encoder_operation.<locals>.<setcomp>  s    @@@	V@@@r   r   )r   r   r   r  r  rn  r>  r|  ry  EncoderTimingStatsencoder_forward_timenum_encoder_calls)r   rW  r1  rb  rd  
group_refsgroup_request_idsrb  elapsedper_request_timer  r/  s               r   rQ  z&GPUModelRunner.timed_encoder_operationw  s     "  	EEEF$%58H98T%TU
@@Z@@@
   &((
	1EEEJ""$$$'))J6G&S1B-C-CQ)G)GG* 1 1/ 1 1FT%AAA?Q?S?S4V< 8@E..2BB..++q0+++11 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 J""$$$'))J6G&S1B-C-CQ)G)GG* 1 1/ 1 1FT%AAA?Q?S?S4V< 8@E..2BB..++q0+++11 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1sE   D" 5ADDD"AG)>AGG)G!!G)$G!%G))r   N)r   r   r   Nr   )	NNNNFFNNN)r   r   r  r  )NNNN)TFNNr   )F)r  r   r   N)
NFFTFFFTFF)r   r   r   r   r   r
  r   r   r  r  inference_moder  r   r  SymIntr!  r   r   r}  r  r  rx  r  rD  r   rQ  r  r`   r   r
  r  r  rA   rd  rj  r  ndarrayrm  rr  r  r  ri   r  r   r  r   r   r   rZ   r  r  rX   r  r  r  r  r  rB   rC   r=  rk  r  nnModuler  rH   r  rI   r  rJ   r  rG   r  r  rq   rt   rn   r  r  r  rp   r  rv   r  rr   rs   r  r   r  r  staticmethodr  r   r(   r   r  r  r+  r[  rr  ro   rt  r`  r  rg  r  r}   r_  r   r  r  r  r  r  r  r  rh  rD  r3  ru   r<  rG  rT  ri  ra  rg   r  r  r  r  rU   rh   r  r  r   r  r  r  r   r  r  r  r   r  r  r  r  r  r  r'  r  rC  r0  rQ  r   r   r   r  r  C  s       y5y5 y5 y5 y5 y5vJ# J$ J J J J) ) ) ) U)5 )5 )5 )5V2 2 2 2 2 LP	
 	
 	
5<'	
05	
DH	
		
 	
 	
 	
     D   4D D D D! ! ! !o, o, o, o,b. %.@Q.	. . . .`  )7 	       D
/A 
 
 
 

0B 
 
 
 
"+" 
" " " "0B B1D B B B B )-% %J% ho% 
rz2:%	&	% % % %(w
+w
 %(w
 z	w

 
w
 w
 w
 w
r 6"38n 6 # 6 	 6
 
u|d"BJ$55	6 6  6  6  6DY
+Y
 !jY
 
T!	#
	Y
 Y
 Y
 Y
@ )-&*-1.2 %&+6:;?8<q? q?q? q? 	q?
 :q? tq? $d*q? t+q? q?  $q? #38nt3q? #'tCy/D"8q? C-.5q? 
#%<t%CC	Dq? q? q? q?f"F j"F  Z"F #'s)	"F
 
d3i4	"F "F "F "FH_7 j_7  Z_7 #&	_7
 #_7  8_7 
_7 _7 _7 _7B/5 /5 /5 /5b-6 -6 -6 -6^M
*M
 "$M
 
	M
 M
 M
 M
^%% 
% % % %2)2+)2 
S	!"U3(()*	,
)2 )2 )2 )2VZ 1Z	el	Z Z Z Z~ &'k& k&+k&  #k& 
tEL!5</	0	k& k& k& k&Z29    ^0D    T+->    U=#+=%>    

 2D8
 	

 

 
 
 
>
 
$ 
D 
T 
 
 
 
 :#|:# ":# "$	:#
 /5:# 
3	3:# :# :# :#x$# $# $ $ $ $	(	(	u|d"EL0	1	( 	( 	( 	( <@	r
 r
+r
 r
 2D8	r

 
ttd"S#XD 	"
r
 r
 r
 r
ht# 147 
	   <}
+}
 &}
 t#	}

 |}
 "}
 147}
 
S#XT#YS/D(()S	S#XS		
}
 }
 }
 }
~ / / ^/  *.)-;?-1
 
<$&
 <$&
 2D8	

 |d*
 S#X
 

 
 
 
@  -1
 
"%
"%
 
 	

 #Tk
 

 
 
 \
6 %)! -1&* !w
 w
w
 w
 "$	w

 #&w
 w
 "w
 w
 #Tkw
 tw
 w
 
t		
w
 w
 w
 w
r"< "< "< "<R 04H< H<H< H< !	H<
 -H< 
S%,$&S%,$tC,='>"??$F	H
H< H< H< H<T U <@r r+r 2D8r 
3	36I	ID	P	r r r rh	 u4u	3	36I	Iu u u un7md&: 7 7 7 7 GL0 0 10?C0	0 0 0 0B	J%T#Yc0J*K 	J 	J 	J 	JN#lNHMN	N N N N&
FS	 
F 
F 
F 
Fh+h !<$tCy/9h ,	h
 |h $lh  -4h 147h 6h C-.d3;L6M1NNQUUh 
d3i5<	'h h h hT	3tCH~ 	3$ 	3 	3 	3 	3j jt j j j j jXE#s(Od4J    0T T T T
-
 

 
 
 
g$|g$ #38ng$ 
c?T))	*	g$ g$ g$ g$Rt# 
c3h   * *#,*#=B\D=P*# *# *# ^*#X

 !
 
	
 
 
 
> U 8< %$$( #( ##(DB DBDB !. 4DB 	DB
 DB "DB DB DB !DB DB DB !DB 
u|U\)	*DB DB DB DBL
 UL|L 
L L L L\5|5 5 
	5 5 5 5n UD|D 
D D D D<? ? ? ?BDs D D D DLL60L6 !.L6 L6 L6 L6\OM} OM OM OM OM OMb1,1BFs)1	1 1 1 1.^C T*:%;!<=^C ./^C 
	^C ^C ^C ^C@W W W W( EP"EP15n1EEP	EP EP EP \EPN:,:BFs):	: : : :x$,$	c5<	 $ $ $ $@?h~&> ? ? ? ?#H^4L # # # #)"= )"TRUY )" )" )" )"Vf&f #3#45f !I	f
 
c5<	 f f f fPc5</0	   65,5BFs)5	c5<	 5 5 5 5n,	   810= 10T 10 10 10 10f
 
 
"   44[(8#9    @%, 4S	?    $sDeck9I4J/J*K     )1)1 eCHo.)1 	)1
 )1 )1 )1 ^)1 )1 )1r   r  c                   V    e Zd ZU dZdZeed<   	 dZeed<   	 de	e
eez  f         fdZdS )	r5  z7Per-request timing statistics for encoder forward pass.g        r6  r   r7  r   c                      | j         | j        dS )Nr6  r7  rD  r   s    r   r-  zEncoderTimingStats.to_dict  s    $($=!%!7
 
 	
r   N)r   r   r   r  r6  rC  r  r7  r   r  r  r-  r   r   r   r5  r5    sm         AA"%%%%%>s>
c53;./ 
 
 
 
 
 
r   r5  (   r  rM  r  rz  r  collectionsr   collections.abcr   r   
contextlibr   r   r   dataclassesr	   r
   typingr   r   r   r   r   r"  r  r   torch.distributedtorch.nnr?  r   	vllm.envsr  vllm.attention.layerr   r   vllm.compilation.counterr   vllm.compilation.cuda_graphr   r   vllm.compilation.monitorr   vllm.configr   r   r   r   r   vllm.distributed.ec_transferr   r    vllm.distributed.eplb.eplb_stater   vllm.distributed.kv_transferr   r    /vllm.distributed.kv_transfer.kv_connector.utilsr!   vllm.distributed.parallel_stater"   r#   r$   r%   r&   r'   vllm.forward_contextr(   r)   vllm.loggerr*   vllm.lora.layersr+   r,   /vllm.model_executor.layers.attention_layer_baser-   <vllm.model_executor.layers.fused_moe.routed_experts_capturerr.   +vllm.model_executor.layers.rotary_embeddingr/   r0    vllm.model_executor.model_loaderr1   r2   %vllm.model_executor.models.interfacesr3   r4   r5   r6   r7   r8   r9   r:   r;   r<   *vllm.model_executor.models.interfaces_baser=   r>   r?   vllm.multimodalr@   vllm.multimodal.inputsrA   rB   rC   vllm.multimodal.utilsrD   vllm.pooling_paramsrE   vllm.sampling_paramsrF   vllm.sequencerG   
vllm.tasksrH   rI   rJ   
vllm.utilsrK   vllm.utils.jsontreerL   vllm.utils.math_utilsrM   rN   vllm.utils.mem_utilsrO   rP   vllm.utils.nvtx_pytorch_hooksrQ   vllm.utils.platform_utilsrR   vllm.utils.torch_utilsrS   rT   vllm.v1.attention.backendrU   rV   rW   rX   rY   rZ   r[   #vllm.v1.attention.backends.gdn_attnr\    vllm.v1.attention.backends.utilsr]   r^   r_   vllm.v1.core.sched.outputr`   vllm.v1.cudagraph_dispatcherra   vllm.v1.kv_cache_interfacerb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   vllm.v1.outputsrm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   vllm.v1.pool.metadatarx   ry   vllm.v1.sample.logits_processorrz   r{   )vllm.v1.sample.logits_processor.interfacer|   vllm.v1.sample.metadatar}    vllm.v1.sample.rejection_samplerr~   vllm.v1.sample.samplerr   vllm.v1.spec_decode.draft_modelr   vllm.v1.spec_decode.eagler   vllm.v1.spec_decode.medusar   vllm.v1.spec_decode.metadatar   "vllm.v1.spec_decode.ngram_proposerr   #vllm.v1.spec_decode.suffix_decodingr   vllm.v1.structured_output.utilsr   vllm.v1.utilsr   r   vllm.v1.workerr   vllm.v1.worker.cp_utilsr   r   vllm.v1.worker.dp_utilsr   .vllm.v1.worker.ec_connector_model_runner_mixinr   vllm.v1.worker.gpu_input_batchr   r   !vllm.v1.worker.gpu_ubatch_wrapperr   .vllm.v1.worker.kv_connector_model_runner_mixinr   &vllm.v1.worker.lora_model_runner_mixinr   vllm.v1.worker.ubatch_utilsr   r   r   r   vllm.v1.worker.utilsr   vllm.v1.worker.workspacer   utilsr   r   r   r   r   +vllm.model_executor.model_loader.tensorizerr   r   r   r   rT  r  r  r   r  r   r   r   r   r   r  r5  r   r   r   <module>r     s
        				          # # # # # # . . . . . . . . % % % % % %         ! ! ! ! ! !       B B B B B B B B B B B B B B                            8 8 8 8 8 8 8 8 8 8 8 8 8 8 G G G G G G G G D D D D D D              J I I I I I I I 6 6 6 6 6 6 U U U U U U U U J J J J J J                       $ # # # # # 9 9 9 9 9 9 9 9 N N N N N N             P O O O O O O O                                
 0 / / / / /         
 > = = = = = - - - - - - - - - - - - - - - - - - A A A A A A A A A A = = = = = = / / / / / / 0 0 0 0 0 0 0 0 A A A A A A A A 2 2 2 2 2 2 = = = = = =                         L K K K K K         
 5 4 4 4 4 4 < < < < < <                                                   A @ @ @ @ @ @ @ O O O O O O O O E E E E E E 4 4 4 4 4 4 = = = = = = * * * * * * > > > > > > 3 3 3 3 3 3 5 5 5 5 5 5 ; ; ; ; ; ; < < < < < < F F F F F F A A A A A A F F F F F F F F & & & & & &        ? > > > > > V V V V V V I I I I I I I I ; ; ; ; ; ; V V V V V V G G G G G G            > = = = = = 3 3 3 3 3 3               ILLLLLLHHHHHHHH	X		"3(9#9: ) : : :"&'7"8;K"K i K K KA A A A A 6 A A AH') ') ') ') ')'= ') ') ')TR R R R R
 R R R ^Y1 ^Y1 ^Y1 ^Y1 ^Y157R^Y1 ^Y1 ^Y1Bs 
 
 
 
 
 
 
 
 
 
r   