
    -`i                         d Z ddlZddlZddlZddlZddlZddlmZ ddlZ	ddl
m
Z
 ddlmZmZ ddlmZ ddlmZ ddlmZ d	ej        d
eeef         ddfdZdej        fdZd	ej        fdZdS )z?Benchmark the latency of processing a single batch of requests.    N)Any)tqdm)#convert_to_pytorch_benchmark_formatwrite_to_json)
EngineArgs)
PromptType)BeamSearchParamsargsresultsreturnc                     t          | dd         ifddD                       }|r?t          j                            | j                  d          d}t          ||           d S d S )Nlatency	latenciesc                 "    i | ]}||         S  r   ).0kr   s     k/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/benchmarks/latency.py
<dictcomp>z4save_to_pytorch_benchmark_format.<locals>.<dictcomp>   s    JJJaAwqzJJJ    )avg_latencypercentiles)r
   metrics
extra_infor   z.pytorch.json)r   ospathsplitextoutput_jsonr   )r
   r   
pt_recordspt_files    `  r    save_to_pytorch_benchmark_formatr!      s     5GK01JJJJ+IJJJ  J
  +W%%d&677:IIIgz*****+ +r   parserc                    |                      dt          d           |                      dt          d           |                      dt          d           |                      dt          d	d
           |                      dd           |                      dt          dd           |                      dt          dd           |                      ddd           |                      dt          d d           |                      ddd           t          j        |           } |                     d           d S )Nz--input-len    )typedefaultz--output-len   z--batch-size   z--n   z)Number of generated sequences per prompt.)r%   r&   helpz--use-beam-search
store_true)actionz--num-iters-warmup
   z'Number of iterations to run for warmup.z--num-iters   zNumber of iterations to run.z	--profilez0profile the generation process of a single batch)r,   r*   z--output-jsonz0Path to save the latency results in JSON format.z--disable-detokenizez`Do not detokenize responses (i.e. do not include detokenization time in the latency measurement)F)enable_prefix_caching)add_argumentintstrr   add_cli_argsset_defaults)r"   s    r   r3   r3   "   s   
C<<<
S#>>>
S!<<<
8	     +LAAA
6	     C2P     ?    
 ?	     >	     $V,,F e44444r   c           	           t          j                   }ddlm}m}  |d i t          j        |          j        j        j	         j
         j        z   k    s
J d             | j        ddd j         j                   t          j                            d j         j
        f          }d	 |                                D              fd
d!dt&          ffd}t)          d           t+          t-           j                  d          D ]} |d            j        rS|j        }|j        dk    rt)          d|j         d           n|j        dk    rt)          d            |d           d S g }t+          t-           j                  d          D ]!}|                     |d                     "t          j        |          }g d}	t          j        ||	          }
t)          dt          j         |           d           tC          |	|
          D ]\  }}t)          | d| d            j"        rt          j         |          |                                tG          tC          |	|
                                                    d}tI           j"        d          5 }tK          j&        ||d           d d d            n# 1 swxY w Y   tO           |           d S d S )"Nr   )LLMSamplingParamszUPlease ensure that max_model_len is greater than the sum of input_len and output_len.g      ?T)ntemperaturetop_p
ignore_eos
max_tokens
detokenizei'  )sizec                     g | ]}d |iS )prompt_token_idsr   )r   batchs     r   
<listcomp>zmain.<locals>.<listcomp>k   s+     ' ' '(-	U#' ' 'r   c                       j         s                    d           d S                     t           j         j        d                     d S )NF)sampling_paramsuse_tqdmT)
beam_widthr<   r;   )use_beam_searchgeneratebeam_searchr	   r8   
output_len)r
   dummy_promptsllmrD   s   r   llm_generatezmain.<locals>.llm_generateo   so    # 
	LLRWLXXXXXOO #v##      r   F
do_profilec                     | r4                                                                                 d S t          j                    }              t          j                    }||z
  }|S )N)start_profilestop_profiletimeperf_counter)rN   
start_timeend_timer   rL   rM   s       r   run_to_completionzmain.<locals>.run_to_completion|   sv     		LNNN*,,JLNNN(**H+GNr   zWarming up...zWarmup iterations)desc)rN   torchz8Profiling with torch profiler (results will be saved to z)...cudaz Profiling with cuda profiler ...zBench iterations)r-      2   K   Z   c   zAvg latency: z secondsz% percentile latency: )r   r   r   w   )indentr   )F)(r   from_cli_argsvllmr6   r7   dataclassesasdict
llm_enginemodel_configmax_model_len	input_lenrJ   r8   disable_detokenizenprandomrandint
batch_sizetolistboolprintr   rangenum_iters_warmupprofileprofiler_configprofilertorch_profiler_dir	num_itersappendarray
percentilemeanzipr   dictopenjsondumpr!   )r
   engine_argsr6   r7   dummy_prompt_token_idsrV   _ru   r   percentagesr   
percentager{   r   frK   rL   rM   rD   s   `              @@@@r   mainr   P   s    *400K )((((((( #
0
0";//
0
0C>&4(  	0   %n
&?..  O  Y..T_dn5 /  ' '1G1N1N1P1P' ' 'M       
 
d 
 
 
 
 
 
 
 
/%-..5HIII , ,U+++++| 
%5#w..=#6= = =    %//4555T**** I%''.@AAA > >**e<<<====##I***K-	;77K	
6"'),,
6
6
6777"%k;"?"? I I
JGG:GGGHHHH  879--"))++K1C1C1E1E F FGG
 

 $"C(( 	,AIgq++++	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	,(w777778 8s   =K!!K%(K%)__doc__argparserd   r   r   rR   typingr   numpyrk   r   vllm.benchmarks.lib.utilsr   r   vllm.engine.arg_utilsr   vllm.inputsr   vllm.sampling_paramsr	   	Namespacer~   r2   r!   ArgumentParserr3   r   r   r   r   <module>r      sA   F E       				                  X X X X X X X X , , , , , , " " " " " " 1 1 1 1 1 1
+


+'+CH~
+	
+ 
+ 
+ 
++50 +5 +5 +5 +5\\8x! \8 \8 \8 \8 \8 \8r   