
    .`i                     0   d dl Z d dlZd dlmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5 d dl6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z> d dl?m@Z@mAZAmBZB d dlCmDZD d dlEmFZF d dlGmHZH d dlImJZJ d dlKmLZL d dlMmNZN d dlOmPZP d dlQmRZR d d lSmTZT d d!lUmVZV d d"lWmXZX  eeY          ZZ G d# d$eX          Z[dS )%    N)deepcopy)Any)
VllmConfig)CUDAGraphMode)&prepare_communication_buffer_for_model)set_forward_context)init_logger)get_model_loader)MULTIMODAL_REGISTRY)DeviceMemoryProfiler
format_gib)STR_DTYPE_TO_TORCH_DTYPE)GrammarOutputSchedulerOutput)KVCacheConfig)ModelRunnerOutput)AsyncOutput)build_attn_metadatabuild_slot_mappings_by_layerget_kv_cache_specinit_attn_backendinit_kv_cache)BlockTables)UvaBufferPool)CudaGraphManager)get_cudagraph_and_dp_paddingmake_num_tokens_across_dp)
InputBatchInputBuffers combine_sampled_and_draft_tokensexpand_idx_mappingget_num_sampled_and_rejectedpost_updateprepare_pos_seq_lensprepare_prefill_inputs)NO_OP_KV_CONNECTORKVConnectorget_kv_connector)	LoraState)EncoderRunner)
MRopeState)SamplerOutput)PromptLogprobsWorker)Sampler)init_speculator)rejection_sample)RequestState)StructuredOutputsWorker)LoRAModelRunnerMixinc                   \   e Zd Zdedej        fdZdeddfdZde	e
         fdZd5d	Zdej        fd
Zd ZdeddfdZdeddfdZ ej                    dddedede	ej        ej        f         fd            Z ej                    dej        ddfd            Z ej                    d5d            Zd5dZdedefdZ ej                    defd            Zd5dZdeddfdZ deddfdZ!deddfd Z"deddfd!Z#ded"edefd#Z$ ej                    d$e%e
e&e         f         dede	e&ej                 ej        f         fd%            Z'dej        ded&e(dz  de	e)ej        ej        f         fd'Z*ded(ej        d)ej        d*ej        ddf
d+Z+ ej                    ded,ej        d-e&ej                 dz  d)ej        d*ej        dej        fd.            Z, ej                    	 	 	 d6ded0e-dz  d1ed2ede.dz  f
d3            Z/ ej                    d&e(dz  de0e.z  fd4            Z1dS )7GPUModelRunnervllm_configdevicec                  	   || _         |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j	        | _	        || _
        | j        j        | _        | j        | _        | j        j        dk    rt          | j        j                 | _        d| _        | j                                        | _        | j        j        | _        | j        j        | _        | j        j        | _        | j                                        | _        t2          | _        | j                            | j                  | _        | j        r,t;          | j        | j        | j        | j
                  | _        | j        j        | _        | j        r,tA          | j        | j        | j        | j
                  | _!        | j        j"        | _#        tH          j%        &                    | j
                  | _'        tH          j%        (                                | _)        | j#        rGtH          j%        (                                | _*        tH          j%        (                                | _+        nd | _*        d | _+        | j        8d| _,        | j        j-        | _.        t_          | j         | j
                  | _0        nd| _,        d| _.        d | _0        tc          | j        | j        | j        | j.        | j        | j
                  | _2        tg          | j        | j        | j
                  | _4        tk          | j        | j        | j
        | j        j6        	          | _7        tq          | j                  | _9        tu          | j         | j        | j
                  | _;        ty          | j        | j.        d
z   z  | j                  | _=        t}          | j                  | _?        t          | j        tH          jA                  | _B        t          | j        d
z   tH          jA                  | _C        t          | j        d
z   tH          jA                  | _D        t          | _F        d S )NautoF)max_num_tokenshidden_sizedtyper7   )max_num_reqsr:   max_model_lenr7   Tr   )r=   r>   max_num_batched_tokensnum_speculative_steps
vocab_sizer7   )r=   r:   r7   )r=   rA   r7   logprobs_mode   )max_num_logitsrA   )r=   )Gr6   model_configcache_configcompilation_configlora_configload_configparallel_configscheduler_configspeculative_configobservability_configr7   r<   kv_cache_dtypecache_dtyper   is_pooling_modelget_vocab_sizerA   r>   r?   r:   max_num_seqsr=   get_inputs_embeds_sizeinputs_embeds_sizer   mm_registrysupports_multimodal_inputssupports_mm_inputsr*   encoder_runner
uses_mroper+   mrope_statesasync_schedulinguse_async_schedulingtorchcudaStreamoutput_copy_streamEventoutput_copy_eventinput_prep_eventstructured_outputs_eventdo_spec_decodenum_speculative_tokensr@   r/   
speculatorr1   
req_statesr   input_buffersr.   rB   samplerr-   prompt_logprobs_workerr   cudagraph_managerr2   structured_outputs_workerr)   
lora_stater   int32tmp_idx_mappingtmp_cu_num_logitstmp_query_start_locr&   kv_connector)selfr6   r7   s      s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/worker/gpu/model_runner.py__init__zGPUModelRunner.__init__F   s   
 ''4'4"-"@&2&2*: + <"-"@$/$D!&,
"j(F22":!-#D !&+::<<!.<"3J 1>"&"3"J"J"L"L /"&"2"M"M#
 #
 " 	"/#2 3j{	# # #D +6? 	 *!.#2"0{	! ! !D %)$9$J!"'*"3"3DK"@"@!&!1!1!3!3$ 	1$)J$4$4$6$6D!,1J,<,<,>,>D))$(D!,0D)"."&D)-)@)WD&-d.>LLDOO"'D)*D&"DO&*,#'#6"&"<;
 
 
 **.;
 
 

 *;+9	
 
 
 ';4;L&M&M# "2dot{"
 "
 *A,0JQ0NO*
 *
 *
&
 $1BCCC  -T->LL!.t/@1/Dek!R!R#01BQ1F#T#T );    r>   returnNc                 ,    || _         || j        _         d S N)r>   rh   )rt   r>   s     ru   update_max_model_lenz#GPUModelRunner.update_max_model_len   s    *(5%%%rw   c                     dS )N)generate rt   s    ru   get_supported_tasksz"GPUModelRunner.get_supported_tasks   s    }rw   c                    t          j                    }t                      5 }t          | j        j                  }t                              d           |                    | j        | j        j	                  | _
        | j        r+|                     | j
        | j        | j                  | _
        | j        r| j                            | j
                   d d d            n# 1 swxY w Y   t          j                    }|j        | _        t                              dt%          |j                  ||z
             t'          | j
                   | j        r)t)          | j        dd           }|t'          |           d S d S d S )NzLoading model from scratch...)r6   rE   z*Model loading took %s GiB and %.6f secondsmodel)timeperf_counterr   r
   r6   rI   loggerinfo
load_modelrE   r   rH   load_lora_modelr7   re   rg   consumed_memorymodel_memory_usager   r   getattr)rt   argskwargstime_before_loadmmodel_loadertime_after_loadspeculator_models           ru   r   zGPUModelRunner.load_model   s   ,..!## 	7q+D,<,HIILKK7888%00 ,!-: 1  DJ  !11J$K 

 " 7**4:666	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7  +--"#"38q())..	
 	
 	
 	/tz::: 	I&tFF+67GHHHHH	I 	I++s   B7C%%C),C)c                     | j         S rz   )r   r   s    ru   	get_modelzGPUModelRunner.get_model   s
    zrw   c                 *    t          | j                  S rz   )r   r6   r   s    ru   r   z GPUModelRunner.get_kv_cache_spec   s     !1222rw   kv_cache_configc                    t          |          }|| _        d |j        D             }t          || j        | j        | j        | j                  | _        t          | j        | j
        | j                  \  | _        | _        | j        r+| j                            | j        | j        | j                   g | _        t#          | j        | j        j        | j        | j        | j                  }t)          | j
        |          | _        g | _        d S )Nc                 &    g | ]}|j         j        S r~   )kv_cache_spec
block_size).0kv_cache_groups     ru   
<listcomp>z6GPUModelRunner.initialize_kv_cache.<locals>.<listcomp>   s.     
 
 
 (3
 
 
rw   )block_sizesr=   r?   r>   r7   )r   r   kv_cache_groupsr   r=   r:   r>   r7   block_tablesr   r6   attn_backendsattn_metadata_buildersre   rg   set_attn	kv_cachesr   rG   static_forward_contextr(   rs   attn_groups)rt   r   r   kv_caches_dicts       ru   initialize_kv_cachez"GPUModelRunner.initialize_kv_cache   s0   "?33.
 
"1"A
 
 

 (#*#'#6,;
 
 
 ;L K;
 ;
7D7
  	O$$$+!   .0&N#: K
 
 -T-=~NN rw   input_batchc                 n   | j                             |j                  }| j                             |j                  }t          || j                  }t          | j        |j        |j        |j	        t          j        |j                  |j        | j        ||| j        
  
        }||_        ||_        d S )N
r   num_reqs
num_tokensquery_start_loc_gpuquery_start_loc_cpuseq_lensmax_seq_lenr   slot_mappingsr   )r   get_dummy_block_tablesr   get_dummy_slot_mappingsr   r   r   r   r   query_start_locr]   
from_numpyquery_start_loc_npr   r>   attn_metadatar   )rt   r   r   r   slot_mappings_by_layerr   s         ru   prepare_dummy_attn_metadataz*GPUModelRunner.prepare_dummy_attn_metadata  s    (??@TUU)AA"
 
 ">4/"
 "
 ,#'#> )"- + ; % 01O P P )*%' 0
 
 
 %2!$:!!!rw   T	skip_attnr   r   c                   t          || j                  }||z  g|z  }|dxx         ||z  z  cc<   t          |          |k    sJ d t          |          D             }t	          j                    }||_        ||_        | j        	                    d           | 
                    |d|           | j        	                    d           | j        J | j        \  }	}
}|	|
j                 }|	|fS )Nc                      i | ]\  }}d | |S )_dummy_req_r~   )r   ins      ru   
<dictcomp>z-GPUModelRunner._dummy_run.<locals>.<dictcomp>2  s4      
  
  
%)Q!q 
  
  
rw   T)	dummy_runskip_attn_for_dummy_runF)minr=   sum	enumerater   
make_emptytotal_num_scheduled_tokensnum_scheduled_tokensrs   set_disabledexecute_modelexecute_model_statelogits_indices)rt   r   r   r   r   r   num_tokens_per_requestr   dummy_scheduler_outputhidden_statesr   _sample_hidden_statess                ru   
_dummy_runzGPUModelRunner._dummy_run%  s?    z4#455","8!9H!Dr"""j8&;;""")**j8888 
  
-67M-N-N 
  
  
 "1!;!=!=<F96J3 	&&t,,, 	"dI 	 	
 	
 	
 	&&u---'333(,(@%{A,[-GH222rw   r   c                 \   |j         d         }| j                            |          }t          j        |t          j        | j                  }t          j        |t          j                  }t          j        |t          j	        | j                  }| 
                    ||||           d S )Nr   r<   r7   r<   )shaper   compute_logitsr]   arangero   r7   npzerosint64rj   )rt   r   r   logitsidx_mappingidx_mapping_npposs          ru   _dummy_sampler_runz!GPUModelRunner._dummy_sampler_runF  s    
 !&q)**=99l85;t{SSS828<<<k(%+dkJJJ 	V[.#>>>>>rw   c                 f   |                      | j        d          \  }}|                     |           | j        rBt	          | j        j        | j                  }| j                            | j        d d |           t          j
                                         ~~t          j                     d S )NTr   )r   r   num_tokens_across_dp)r   r:   r   re   r   rJ   data_parallel_sizerg   	run_modelr]   r^   synchronizegccollect)rt   r   r   r   s       ru   profile_runzGPUModelRunner.profile_runU  s    .2oo /> /
 /
++ 	 4555 		#<$79L$ $  O%%#""%9	 &    	
   /

rw   c                     d S rz   r~   r   s    ru   reset_mm_cachezGPUModelRunner.reset_mm_cachej  s    rw   r   c                     |S rz   r~   )rt   r   s     ru   _get_num_input_tokensz$GPUModelRunner._get_num_input_tokensm  s    ##rw   c           
      l   | j                                         st                              d           dS t	          j                    }t          j                     t          j	        
                                 t          j	                                        d         }|                     | j                  5  d }| j        r| j        j        }d }| j        r| j        j        }| j                             | j        | j        ||| j        | j        | j                   | j        r| j                                         d d d            n# 1 swxY w Y   t	          j                    }t          j	                                        d         }||z
  }||z
  }t                              d||dz             |S )NzrSkipping CUDA graph capture. To turn on CUDA graph capture, ensure `cudagraph_mode` was not manually set to `NONE`r   )r   ri   mrope_positionsinputs_embedsr   r   r   z4Graph capturing finished in %.0f secs, took %.2f GiBi   @)rl   needs_capturer   warningr   r   r   r   r]   r^   empty_cachemem_get_infomaybe_setup_dummy_lorasrH   rY   rZ   r   rW   rX   r   capturer   ri   r   r   r   re   rg   capture_modelr   )	rt   
start_timestart_free_gpu_memoryr   r   end_timeend_free_gpu_memoryelapsed_timecuda_graph_sizes	            ru   r   zGPUModelRunner.capture_modelq  s   %3355 	NNI   1&((



    %
 7 7 9 9! <))$*:;; 	0 	0"O D"&"3"C M& B $ 3 A"**j"0 /+!.'+'B $ 4 +    " 0--///#	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0& $&&#j5577:*,/2EEBw'	
 	
 	

 s   8BE		EEc                     t          d | j                                        D                       r<|                     | j        d           t
          j                                         d S d S )Nc              3   B   K   | ]}d |                                 v V  dS )
FLASHINFERN)get_name)r   bs     ru   	<genexpr>z4GPUModelRunner.warmup_for_prefill.<locals>.<genexpr>  s/      QQ|qzz||+QQQQQQrw   Fr   )allr   valuesr   r:   r]   r^   r   r   s    ru   warmup_for_prefillz!GPUModelRunner.warmup_for_prefill  so     QQT5G5N5N5P5PQQQQQ 	%OOD/5OAAAJ""$$$$$	% 	%rw   scheduler_outputc                 >   |j         }|j        r|                    |j                  }|D ]q}| j                            |           | j        r| j                            |           | j                            |           | j                            |           rd S rz   )	finished_req_idspreempted_req_idsunionrh   remove_requestrW   rX   rk   rn   )rt   r	  r  req_ids       ru   finish_requestszGPUModelRunner.finish_requests  s    +<- 	/55 2    ' 	3 	3FO**6222& ;#226:::'66v>>>O**62222	3 	3rw   c                 `    | j         r$|j        D ]}| j                            |           d S d S rz   )rW   free_encoder_mm_hashesrX   free_encoder_cache)rt   r	  mm_hashs      ru   free_stateszGPUModelRunner.free_states  sT    " 	@+B @ @#66w????	@ 	@@ @rw   c                    |j         D ]S}|j        J |j        J |j        J |j        }t          |j                  }| j                            |||j        |j                   | j        j	        |         }| j
        r | j                            ||j                   | j        r-| j                            || j        |j        |j                   | j                            ||j        d           | j                            |||j                   | j                            |||j                   | j                            |||j                   U|j         r| j                                         | j                            | j        j        j        | j        j        j        | j        j                   | j        r| j                                         d S d S d S )N)r  
prompt_lenprefill_token_idsnum_computed_tokens)mm_featuresT	overwrite)scheduled_new_reqsprompt_token_idsr  sampling_paramsr  lenrh   add_requestr  req_id_to_indexrW   rX   r  rY   rZ   init_prefill_mrope_positionsr   r   append_block_ids	block_idsrj   rk   rn   lora_requestapply_staged_writesgpuprefill_lenr   r  )rt   r	  new_req_datar  r  	req_indexs         ru   add_requestszGPUModelRunner.add_requests  s2   ,? #	V #	VL0<<<1===/;;;!(F\:;;JO''%"."@$0$D	 (    7?I& R#//8PQQQ  !>>J 2 , 8	 ?    ..<1T /    L$$:|'C   '33	<#?   O''	<;TUUUU. 	8O//111L,,15+.*  
  8!5577777	8 	88 8rw   c                     |j         }t          |j                  D ]C\  }}| j        j        |         }|j        |         }|| j                            ||d           Dd S )NFr  )scheduled_cached_reqsr   req_idsrh   r"  new_block_idsr   r$  )rt   r	  cached_reqsr   r  r+  req_new_block_idss          ru   update_requestszGPUModelRunner.update_requests  s~    &<";#677 	 	IAv7?I + 9! < ,!220E 3   		 	rw   num_tokens_after_paddingc                 	    j         }|dk    sJ t          j                  }t          j                                        fd          }t          j        fd|D             t
          j                  } fd|D             }t          j        |t
          j                  } j        	                    |          }	j
        sSd}
|}t          j        |dz   t
          j                  }t          j        |dz    j        t          j                  }|	}nىj
        t          j        fd	|D             t
          j                  }t          |                                          }
||
z   }|dz   }t          j        |dz   t
          j                  }d|d<   t          j        ||dd          
            j        	                    |          }t'          |	|| j        dz             } j                            |	          }t          j         j        dz   t
          j                  }d|d<   t          j        ||d|dz            
           |||dz   d <    j        	                    | j        j        
           |d |dz            }t          j        |          } j        j        d |dz            }t9           j        j         j        j        |	| j        j         j!         j        j"        j!         j        j#        j!                   tI          |	| j        j#        j!         j        j%         j        j&                    j        j&        d |         } j'        r; j(        )                    |	| j        j"        j!         j        j#        j!                   tU           j        j        |	 j        j+        || j        j"        j!         j        j,        ||	  	        } j        -                    |	| j        j%        d |                   }t]          | j/                  }ta           j1        |||| j        j&         j2        || j/        
  
        } j        j        d |         } j        j%        d |         }d } j'        r j(        j3        d d d |f         }ti          d"i d|d|d|	d|d|d|d|d|d|
d|d|d|d|d|d|dd d|d|d|d |d!|S )#Nr   c                     j         |          S rz   r   )kr	  s    ru   <lambda>z/GPUModelRunner.prepare_inputs.<locals>.<lambda>  s    *?B rw   )keyc                 *    g | ]}j         |         S r~   r7  )r   r   r	  s     ru   r   z1GPUModelRunner.prepare_inputs.<locals>.<listcomp>  s"    GGG!215GGGrw   r   c                 4    g | ]}j         j        |         S r~   )rh   r"  )r   r  rt   s     ru   r   z1GPUModelRunner.prepare_inputs.<locals>.<listcomp>  s0     
 
 
8>DO+F3
 
 
rw   rC   )r7   r<   c                 F    g | ]}|v rt          |                   nd S )r   )r   )r   r  draft_tokenss     ru   r   z1GPUModelRunner.prepare_inputs.<locals>.<listcomp>  sE        28<1G1GCV,---Q  rw   )out)max_expand_lenr   r/  r   r   r   expanded_idx_mappingr   r   r4  num_draft_tokensr   r   r   	input_ids	positionsr   r   r   r   r   cu_num_logitscu_num_logits_npr~   )5r   r   r   sortedkeysr   arrayro   rp   copy_to_gpuscheduled_spec_decode_tokensr   r]   r7   intr   emptycumsumrq   r!   r@   r   gather_block_tablesr=   rr   ri   r   r   r%   rC  rh   next_prefill_tokensr  r(  r)  r  r$   rD  r   rY   rZ   prepare_mrope_positionsr    last_sampled_tokensr>  compute_slot_mappingsr   r   r   r   r>   r   r   )rt   r	  r4  r   r   r/  r   idx_mapping_listr   r   total_num_draft_tokenstotal_num_logitsrF  rE  rA  rB  
num_logitsr   r   r   r   r   r   r   r   r   rC  rD  r   r>  s   ``                           @ru   prepare_inputszGPUModelRunner.prepare_inputs  sg   
 &@
A~~~~'<== 16688BBBB
 
 
  "xGGGGwGGGrx 
  
  

 
 
 
BI
 
 
 "2"(CCC*66~FF  <  	%&"'!yARXFFF!L1T[  M $/  +HL!x   ")   h      &))9)=)=)?)?%@%@"'*@@)A-J!x1BHEEE"#QIj&6qrr&:;;;; 2>>?OPPM#5 #9A=	$ $ $  (<<[II  Xd&7!&;28LLL !1
	&,>q8a<?O,PQQQQ .88a<>>* ,,"2 	- 	
 	
 	
 0(Q,?#./ABB,<^x!|^L 	(O/O-1O'+O/3	
 	
 	
 	O/3('	
 	
 	
 %.yy9 ? 	55+/37	   :(O/O'+O(

 

 )??(*5
 
 ">4/"
 "

 ,#'#>! / 3'0*%' 0
 
 
 &01J2J1JK	&01J2J1JK	? 	"/?,,,,O  
 
 
G
X
 $
 *>	

 "6!5
 "6!5
 "z
 &>%=
 43
 ,O
  21
 X
  i
  i
 ,O
  $!
" (-#
$ 10%
& *>'
( (-)
* .-+
 	
rw   scheduled_encoder_inputsc           	      P   | j                             |          \  }}| j                             | j        ||           | j                             |j        |j        |j        |j        | j	        j
        j        |j                 | j	        j        |j                           \  }}||fS rz   )rX   prepare_mm_inputsexecute_mm_encoderr   gather_mm_embeddingsr/  r   r   r   rh   r)  r   r   num_computed_prefill_tokens)rt   rY  r   	mm_hashes	mm_kwargs	mm_embedsis_mm_embeds          ru   get_mm_embeddingsz GPUModelRunner.get_mm_embeddings  s      $2DD$ 
  
	9 	..tz9iPPP!%!4!I!I",*O'*;+EFO78RS"
 "
	; +%%rw   grammar_outputc                 d   ||j                  }|j        |j                  }| j                            |          }|'| j                            |||j        |j                   |                     ||j	        |j
        |          }|j        dk    r,t          j        |j        t          j        | j                  }n=|j        |j                  }	t%          |j        |	|j        | j                  \  }
}|
|_        t-          ||j        |j        |j        | j        j        j                  \  }}|||fS )Nr   r   )r   rD  r   r   rm   apply_grammar_bitmaskstructured_output_request_idsgrammar_bitmaskrj   rA  r   rB  r]   onesr   ro   r7   rC  r0   sampled_token_idsrE  r@   r"   r   r   rh   r)  r(  )rt   r   r   rd  r   
sample_posr   sampler_outputnum_sampledrC  sampled_tokensnum_rejecteds               ru   samplezGPUModelRunner.sample  sS     -[-GH *;+EF
**+?@@%*@@<.	   ,&	
 
 '1,,*$EK  KK
 $-k.HII*:0)*	+ +'NK 0>N, %A %#O'+%
 %
!\ {L88rw   rn  rm  ro  c           
      2   t          |j        | j        j        j        | j        j        | j        j        j        ||||j	                   |j
        }| j        j        }t          j        ||         |j        z   | j        j        j        |                   ||<   d S rz   )r#   r   rh   r  r(  rR  rj   penalties_stateoutput_bin_countsr   r   r^  r   minimumr   r)  )rt   r   rn  rm  ro  r   computed_prefills          ru   postprocesszGPUModelRunner.postprocess  s     	#O/3O/L(:'		
 		
 		
 %3?F+-:^,{/OOO'*>:,
 ,
(((rw   last_hidden_statesaux_hidden_statesc                     | j         J | j                             |||||| j        j        | j        j        | j        j        j        j        | j        j        j	        j        	  	        }|S rz   )
rg   proposerh   rR  rP  rj   sampling_statestemperaturer(  seeds)rt   r   rw  rx  rm  ro  r>  s          ru   propose_draftzGPUModelRunner.propose_draft	  sl     ***..O/O/L(48L(.2

 

 rw   Fintermediate_tensorsr   r   c                    |J |s|                      |           |                     |           |                     |           |                     |           | j                                         |j        dk    r| j                            |          }|S | j	        
                    |j        |j                                                  }t          |j        || j        j        | j        j                  \  }}}	|dk    r| j                            |          }|S |s|                     ||          }
| j        r5| j                            |
j        |
j        |
j                  } | j        |  | j        rY|                     |j        |
          \  }}| j                            | j        |
j        ||          }|d |
j                 |
_         nrtC          || j"                  }tG          j$        ||| j%        | j&                  }
| j'        r| j(        j)        d d d |f         |
_)        |s| *                    |
           |r:| j        +                    |           | j	        ,                    |
j                  }n|
j-        }| j'        r|
j)        J |
j)        }t]          |
j/        | j0        |
j        tb          j2        |	|
j3                  5  | j        +                    |           |                     |
j        ||
j                   }d d d            n# 1 swxY w Y   | j        4                    |          }||
|f| _5        d S )Nr   )r   r   ri   r7   )r   cudagraph_runtime_moder   slot_mapping)rC  rD  r   )6r  r  r,  r3  r   r'  r   rs   
no_forwardrl   get_cudagraph_sizer   r  r   rJ   r   data_parallel_rankrX  rH   rn   make_lora_inputsr/  r   _set_active_lorasrW   rc  rY  rX   get_inputs_embedsr   rC  r4  r   r   r=   r   
make_dummyri   r7   rY   rZ   r   r   pre_forwardrunrD  r   r   r6   r   NONEr   post_forwardr   )rt   r	  r  r   r   empty_outputcudagraph_sizeuse_cudagraphr4  r   r   lora_inputsra  rb  r   r   r   rD  kv_connector_outputs                      ru   r   zGPUModelRunner.execute_model   s    $+++ 
	$  !1222-....///  !122211333:a??#0;;<LMM## /BB7188::
 

 ) ;$7$7	  	F/1E $q((,778HIIL )	> -- ( K  5"o>>'.4 
 '&44& 
)-)?)?$={* *&	; !% 3 E EJ 5y+! ! -::k::-)
 3T5FGGH$/!3"0{	  K  .2.?.OAA0000/+ + >00===  	 ))*:;;; 2664 MM
 $-I 8"2>>>'7	$) &?'4'9%9(6     !--.>??? $

)3'"-"; !+ ! !                #/<<=MNN#0+?R#R ts   =L  L$'L$c           
         | j         J | j         \  }}}d | _         |                     |||          \  }}}| j                            | j        j        ||| j        j        j        | j        j	        j        | j        j
        | j        j        j        | j        j                  }t          |j        d t!          |j                  D             d ||          }	t#          |	||| j        | j                  }
|                     ||j        ||           | j        r-|                     ||d ||          }|| j        j        |j        <   | j        r|
S |
                                S )Nc                     i | ]\  }}||	S r~   r~   )r   r   r  s      ru   r   z0GPUModelRunner.sample_tokens.<locals>.<dictcomp>  s    WWW91fVQWWWrw   )r/  r"  rj  prompt_logprobs_dictr  )model_runner_outputrl  num_sampled_tokenscopy_stream
copy_event)r   rp  rk   compute_prompt_logprobsr   r   rh   r  r(  r  r  r)  r   r^  r   r/  r   r   r`   rb   rv  rj  re   r~  r>  r   r\   
get_output)rt   rd  r   r   r  rl  rm  ro  r  r  async_outputr>  s               ru   sample_tokenszGPUModelRunner.sample_tokens  s   
 '333:>:R7{$7#' 48KK;5
 5
1\  $:RRJ%O-1O/3O&O'*O7	 
 	 
 0' XW	+BU8V8VWWW"!5 3
 
 
 # 3)*/-
 
 
 	9;	
 	
 	
  	Q-- L EQDO()@A$ 	 &&(((rw   )rx   N)NFF)2__name__
__module____qualname__r   r]   r7   rv   rL  r{   tuplestrr   r   nnModuler   r   r   r   r   r   inference_modeboolTensorr   r   r   r   r   r   r  r   r  r  r,  r3  rX  dictlistrc  r   r,   rp  rv  r~  r   r   r   r   r  r~   rw   ru   r5   r5   E   sl       n<n< n< n< n< n<`6# 6$ 6 6 6 6U3Z    I I I IB29    3 3 3(= (T ( ( ( (T;z ;d ; ; ; ;. U
 	3 3 33 	3 
u|U\)	*3 3 3 3@ U?|? 
? ? ? ? U   (   $# $# $ $ $ $ U*s * * * *X% % % %3 3D 3 3 3 3@O @ @ @ @ @
.8_ .8 .8 .8 .8 .8`	 	D 	 	 	 	p
)p
 #&p
 
	p
 p
 p
 p
d U&"&sDI~"6&  & 
tEL!5</	0	& & & &&39|39  39 &,	39
 
}elEL8	939 39 39 39j

 
 \	

 l
 

 
 
 
8 U "L  -4	
 \ l 
   , U ,0(-s s)s "Djs 	s
 "&s 
T	!s s s sj U<)%,<) 
(	(<) <) <) <) <) <)rw   r5   )\r   r   copyr   typingr   numpyr   r]   torch.nnr  vllm.configr   vllm.config.compilationr   vllm.distributed.parallel_stater   vllm.forward_contextr   vllm.loggerr	    vllm.model_executor.model_loaderr
   vllm.multimodalr   vllm.utils.mem_utilsr   r   vllm.utils.torch_utilsr   vllm.v1.core.sched.outputr   r   vllm.v1.kv_cache_interfacer   vllm.v1.outputsr   vllm.v1.worker.gpu.async_utilsr   vllm.v1.worker.gpu.attn_utilsr   r   r   r   r   vllm.v1.worker.gpu.block_tabler   vllm.v1.worker.gpu.buffer_utilsr   "vllm.v1.worker.gpu.cudagraph_utilsr   vllm.v1.worker.gpu.dp_utilsr   r   vllm.v1.worker.gpu.input_batchr   r   r    r!   r"   r#   r$   r%   vllm.v1.worker.gpu.kv_connectorr&   r'   r(   vllm.v1.worker.gpu.lora_utilsr)   $vllm.v1.worker.gpu.mm.encoder_runnerr*   !vllm.v1.worker.gpu.mm.mrope_utilsr+    vllm.v1.worker.gpu.sample.outputr,   (vllm.v1.worker.gpu.sample.prompt_logprobr-   !vllm.v1.worker.gpu.sample.samplerr.   vllm.v1.worker.gpu.spec_decoder/   /vllm.v1.worker.gpu.spec_decode.rejection_sampler0   vllm.v1.worker.gpu.statesr1   %vllm.v1.worker.gpu.structured_outputsr2   &vllm.v1.worker.lora_model_runner_mixinr3   r  r   r5   r~   rw   ru   <module>r     s   
			                         " " " " " " 1 1 1 1 1 1 R R R R R R 4 4 4 4 4 4 # # # # # # = = = = = = / / / / / / A A A A A A A A ; ; ; ; ; ; D D D D D D D D 4 4 4 4 4 4 - - - - - - 6 6 6 6 6 6              7 6 6 6 6 6 9 9 9 9 9 9 ? ? ? ? ? ?       	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	         
 4 3 3 3 3 3 > > > > > > 8 8 8 8 8 8 : : : : : : I I I I I I 5 5 5 5 5 5 : : : : : : L L L L L L 2 2 2 2 2 2 I I I I I I G G G G G G	X		N) N) N) N) N)) N) N) N) N) N)rw   