
    .`i                        U d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZ ddlZddlmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddl m!Z!  ee"          Z# G d de$          Z%e
 G d d                      Z&e
 G d d                      Z'e
 G d d                      Z( G d d          Z) G d de          Z* G d d          Z+i Z,e-e.e/d         f         e0d<    G d dee          Z1 G d de*          Z2 G d  d!e*          Z3 G d" d#e*          Z4 G d$ d%e1          Z5 G d& d'e*          Z6 G d( d)e*          Z7 G d* d+e*          Z8 G d, d-e*          Z9 G d. d/e*          Z: G d0 d1e1          Z; G d2 d3e1          Z< G d4 d5          Z= G d6 d7          Z> G d8 d9          Z?d:e@d;e.fd<ZAd@d:e@d=eBe.         d>e@fd?ZCdS )Az
Analytic flops/memory estimation module for transformer components,
to help derive MFU (Model Flops Utilization) stats for a running model.
    N)ABCabstractmethod)Iterable)asdict	dataclass)AnyProtocol)	BaseModelFieldValidationErrormodel_validator)Self)
VllmConfig)init_logger)STR_DTYPE_TO_TORCH_DTYPEget_dtype_sizeget_kv_cache_torch_dtype)SchedulerOutputc                       e Zd ZdZdS )InvalidComponentzt
    Custom exception to indicate that a certain ComponentMetric is not
    applicable to the given VllmConfig.
    N)__name__
__module____qualname____doc__     h/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/metrics/perf.pyr   r   !   s         
 	Dr   r   c                       e Zd ZU dZeed<   dZeed<   dZeed<   dZ	e
eef         dz  ed<   dZe
eef         dz  ed<   dZe
eef         dz  ed	<   dZe
eef         dz  ed
<   dS )DebugPerfStats        calc_durationr   num_prefill_requestsnum_decode_requestsNcontext_breakdownnum_flops_per_gpu_breakdown num_read_bytes_per_gpu_breakdown!num_write_bytes_per_gpu_breakdown)r   r   r   r!   float__annotations__r"   intr#   r$   dictstrr%   r&   r'   r   r   r   r   r   -   s          M5 !#!!!    /3tCH~,3339=c3h$!6===>B$d38nt&;BBB?C%tCH~'<CCCCCr   r   c                   N    e Zd ZU dZeed<   dZeed<   dZeed<   dZe	dz  ed<   dS )	PerfStatsr   num_flops_per_gpunum_read_bytes_per_gpunum_write_bytes_per_gpuNdebug_stats)
r   r   r   r/   r*   r)   r0   r1   r2   r   r   r   r   r.   r.   9   sZ         s"#C####$S$$$)-K$&-----r   r.   c            	           e Zd ZU dZdZeed<   dZeed<   dZeed<   dZ	eed<   dZ
eed<   dZeed<   dZeed	<   dZeed
<   dedededdfdZdefdZdefdZededededd fd            ZdS )ExecutionContexta  
    Represents an execution context for a batch of requests.

    This class aggregates statistics across multiple requests in a batch,
    separately tracking prefill and decode phases.

    Example)
    - Batch with one full prefill (2048 tokens) and one decode (1 token, 8192 context):
      ctx = ExecutionContext()
      ctx.add(2048, 2048, is_prefill=True)
      ctx.add(1, 8192, is_prefill=False)
    r   r"   prefill_num_tokensprefill_context_lenprefill_token_context_productr#   decode_num_tokensdecode_context_lendecode_token_context_product
num_tokenscontext_len
is_prefillreturnNc                    |rE| xj         dz  c_         | xj        |z  c_        | xj        |z  c_        | xj        ||z  z  c_        dS | xj        dz  c_        | xj        |z  c_        | xj        |z  c_        | xj        ||z  z  c_        dS )z8Add a single request's statistics to this batch context.   N)r"   r5   r6   r7   r#   r8   r9   r:   )selfr;   r<   r=   s       r   addzExecutionContext.add\   s     		J%%*%%##z1##$$3$$..*{2JJ....$$)$$""j0""##{2##--k1II----r   c                      | j         | j        z   S )z8Total number of tokens across all requests in the batch.)r5   r8   rA   s    r   total_num_tokensz!ExecutionContext.total_num_tokensi   s    &)???r   c                      | j         | j        z   S )z<Total sum of (num_tokens * context_len) across all requests.)r7   r:   rD   s    r   total_token_context_productz,ExecutionContext.total_token_context_productm   s    1D4UUUr   c                 H     |             }|                     |||           |S )zwCreate an ExecutionContext from a single request.

        This is a convenience method primarily for testing.
        )rB   )clsr;   r<   r=   ctxs        r   from_single_requestz$ExecutionContext.from_single_requestq   s*     cee
K444
r   )r   r   r   r   r"   r*   r)   r5   r6   r7   r#   r8   r9   r:   boolrB   rE   rG   classmethodrK   r   r   r   r4   r4   A   sd          !"#!!!    )*!3***  !   s() #)))Jc J J J$ J J J J@# @ @ @ @VS V V V V 		+.	<@			 	 	 [	 	 	r   r4   c                   R    e Zd ZdZdedefdZdededdfdZdeeef         fdZ	dS )	
ParsedArgsz
    Syntactic sugar so that Parsers can use dot notations
    to access/update the parsed arguments.

    e.g.)
        args = ParsedArgs()
        args.x = 3
        args.y = args.x + 1
    namer>   c                 R    t          dt          |           j         d| d          )N'z' has no attribute ')AttributeErrortyper   )rA   rP   s     r   __getattr__zParsedArgs.__getattr__   s-    Qd!4QQ$QQQRRRr   valueNc                 >    t                               | ||           d S N)object__setattr__)rA   rP   rV   s      r   rZ   zParsedArgs.__setattr__   s     4u-----r   c                 D    t          |                                           S rX   )varscopyrD   s    r   
model_dumpzParsedArgs.model_dump   s    Dzz   r   )
r   r   r   r   r,   r   rU   rZ   r+   r^   r   r   r   rO   rO   ~   s         S S S S S S. .C .D . . . .!DcN ! ! ! ! ! !r   rO   c                   "    e Zd ZdededefdZdS )Parserargsvllm_configr>   c                     dS )z
        Parse the vllm config and update the current ParsedArgs and pass it on.
        If the parser isn't applicable to the vllm_config, it will do nothing.
        Nr   )rA   ra   rb   s      r   parsezParser.parse   s	    
 	r   N)r   r   r   rO   r   rd   r   r   r   r`   r`      s=        * : *      r   r`   c                   B    e Zd ZdZdeddfdZdeddfdZdedefd	Z	dS )
ParserChainz
    Applies chain of parser in a sequential order.
    Later parsers might overwrite results from previous parsers,
    so parsers should be chained in the appropriate order if they
    are not mutually exclusive.
    parsersr>   Nc                 .    t          |          | _        d S rX   )listrg   )rA   rg   s     r   __init__zParserChain.__init__   s    G}}r   parserc                 :    | j                             |           d S rX   )rg   append)rA   rk   s     r   
add_parserzParserChain.add_parser   s    F#####r   rb   c                 b    t                      }| j        D ]}|                    ||          }|S rX   )rO   rg   rd   )rA   rb   ra   rk   s       r   rd   zParserChain.parse   s6    ||l 	3 	3F<<k22DDr   )
r   r   r   r   r`   rj   rn   r   rO   rd   r   r   r   rf   rf      s         % %D % % % %$ $D $ $ $ $ 
      r   rf   ComponentMetrics_COMPONENT_METRICS_REGISTRYc            
          e Zd ZdZeedefd                        Zeedefd                        Z	d Z
ededefd            Zedeed                   fd            Ze	 dd
ededeeef         fd            Ze	 dd
ededeeef         fd            Ze	 dd
ededeeef         fd            Zdd
ededefdZdd
ededefdZdd
ededefdZdS )rp   a-  
    Each concrete ComponentMetrics class is associated with:
    - fields that are required for metric derivation
      (fields are specified/validated through pydantic model)
    - parser to parse VllmConfig into fields
    - metric methods that derive flops/bytes for a given execution context
    r>   c                     d S rX   r   rI   s    r   component_typezComponentMetrics.component_type   s    $'Cr   c                     dS )a  
        Return a ParserChain that provides values for all required fields.
        The returned parser chain must populate ParsedArgs with values for every
        field defined on this ComponentMetrics class. Missing fields will cause
        a ValidationError when from_vllm_config() is called.
        See individual Parser docstrings for which args they provide, and field
        comments on ComponentMetrics subclasses for which parser provides each field.
        Nr   rt   s    r   
get_parserzComponentMetrics.get_parser   s	     	r   c                 >    | t           |                                 <   d S rX   )rq   ru   rt   s    r   __init_subclass__z"ComponentMetrics.__init_subclass__   s    <?#C$6$6$8$8999r   rb   c                    |                                  }|                    |          }	 |                     |                                          S # t          $ r-}t          d|                                  d|           |d}~ww xY w)zj
        Instantiate this class from VllmConfig.
        Raises ValidationError if parsing fails.
        zInvalid z	 config: N)rw   rd   model_validater^   r   r   ru   )rI   rb   rk   parsed_argses        r   from_vllm_configz!ComponentMetrics.from_vllm_config   s     !!ll;//	Y%%k&<&<&>&>??? 	Y 	Y 	Y"#Pc.@.@.B.B#P#PQ#P#PQQWXX	Ys   &A 
B	(BB	c                 N    t          t                                                    S rX   )iterrq   valuesrt   s    r   registered_metricsz#ComponentMetrics.registered_metrics   s    /6688999r   TrJ   per_gpuc                     d S rX   r   rA   rJ   r   s      r   get_num_flops_breakdownz(ComponentMetrics.get_num_flops_breakdown   	     r   c                     d S rX   r   r   s      r   get_read_bytes_breakdownz)ComponentMetrics.get_read_bytes_breakdown   r   r   c                     d S rX   r   r   s      r   get_write_bytes_breakdownz*ComponentMetrics.get_write_bytes_breakdown   r   r   c                 l    t          |                     ||                                                    S rX   )sumr   r   r   s      r   get_num_flopszComponentMetrics.get_num_flops   s,    4//W==DDFFGGGr   c                 l    t          |                     ||                                                    S rX   )r   r   r   r   s      r   get_read_byteszComponentMetrics.get_read_bytes   s,    400g>>EEGGHHHr   c                 l    t          |                     ||                                                    S rX   )r   r   r   r   s      r   get_write_bytesz ComponentMetrics.get_write_bytes   s,    411#w??FFHHIIIr   NT)r   r   r   r   rM   r   r,   ru   rf   rw   ry   r   r   r~   r   rT   r   r4   rL   r+   r*   r   r   r   r   r   r   r   r   r   rp   rp      sk         's''' ^ ['	; 	 	 	 ^ [	@ @ @ Y: Y$ Y Y Y [Y :8D1C,D#E : : : [: 59 #.2	c3h   ^ 59 #.2	c3h   ^ 59 #.2	c3h   ^H H!1 HD HC H H H HI I"2 IT IS I I I IJ J#3 Jd Jc J J J J J Jr   c                   &    e Zd ZdZdededefdZdS )BaseConfigParserz
    Parses base model configuration.
    Provides: vocab_size, hidden_size, num_attention_heads, num_hidden_layers,
    weight_byte_size, activation_byte_size, dp_size, tp_size, pp_size, enable_ep
    ra   rb   r>   c                    |j         }|                                |_        |                                |_        t          |j        d          |_        t          |j        d          |_        |j         j	        }t          |t          j	                  r|}nSt          |t                    r|t          v rt          |         }n't                              d|           t          j        }t#          |          |_        d|_        |j        j        |_        |j        j        |_        |j        j        |_        |j        j        |_        |S )Nnum_attention_headsnum_hidden_layersz.Unknown model_dtype %s, defaulting to bfloat16   )model_configget_vocab_size
vocab_sizeget_hidden_sizehidden_sizeget_requiredhf_text_configr   r   dtype
isinstancetorchr,   r   loggerwarningbfloat16r   weight_byte_sizeactivation_byte_sizeparallel_configdata_parallel_sizedp_sizetensor_parallel_sizetp_sizepipeline_parallel_sizepp_sizeenable_expert_parallel	enable_ep)rA   ra   rb   r   model_dtypetorch_dtypes         r   rd   zBaseConfigParser.parse	  s6   "/&5577'7799 $0')>$
 $
  ".')<"
 "
 ".4k5;// 
	)%KKS)) 	)k=U.U.U2;?KK NN@    .K .{ ; ; %&!"2E"2G"2I$4Kr   Nr   r   r   r   rO   r   rd   r   r   r   r   r     sG         '* ': '* ' ' ' ' ' 'r   r   c                   &    e Zd ZdZdededefdZdS )BaseAttentionConfigParserzo
    Parses attention-specific configuration.
    Provides: num_key_value_heads, head_dim, cache_byte_size
    ra   rb   r>   c                     |j         }|                                |_        |                                |_        |j         j        }|j        j        }t          ||          }t          |          |_
        |S rX   )r   get_total_num_kv_headsnum_key_value_headsget_head_sizehead_dimr   cache_configcache_dtyper   r   cache_byte_size)rA   ra   rb   r   r   r   kv_cache_torch_dtypes          r   rd   zBaseAttentionConfigParser.parse<  sm    "/#/#F#F#H#H $2244!.4!.:7[QQ-.BCCr   Nr   r   r   r   r   r   6  sG         
* : *      r   r   c                   &    e Zd ZdZdededefdZdS )!AttentionQuantizationConfigParserza
    Parses quantization configuration for attention layers.
    Overrides: weight_byte_size
    ra   rb   r>   c                     |j         }||S |                                }|dv rd|_        n|dk    rd|_        nt          |S N)fp8
fbgemm_fp8r@   mxfp4g      ?quant_configget_namer   r   rA   ra   rb   cfgquant_methods        r   rd   z'AttentionQuantizationConfigParser.parseQ  s]    &;K||~~000 %&D!!W$$$'D!! #"r   Nr   r   r   r   r   r   K  sG         
* : *      r   r   c            	       .   e Zd ZU  edd          Zeed<    edd          Zeed<    edd          Zeed<    edd          Z	eed<    edd          Z
eed<    edd          Zeed	<    edd          Zeed
<    edd          Zeed<    edd          Zeed<    edd          Zeez  ed<   edefd            Zedefd            Z	 ddededeeef         fdZ	 ddededeeef         fdZ	 ddededeeef         fdZdS )AttentionMetrics.r   gtr   r   r   r   r   r   r   r   r   r   r>   c                     dS )Nattnr   rt   s    r   ru   zAttentionMetrics.component_type{  s    vr   c                 l    t          t                      t                      t                                S rX   )rf   r   r   r   rt   s    r   rw   zAttentionMetrics.get_parser  s/    %''-//
 
 	
r   TrJ   r   c                    | j         | j        | j        | j        | j        f\  }}}}}|                                }|                                }	|r:|| j        z  }t          d|| j	        z            }t          d|| j	        z            }d|z  |z  |d|z  z   z  |z  |z  d|z  |	z  |z  |z  d|z  |	z  |z  |z  d|z  |z  |z  |z  |z  dS )Nr@   r   )qkv_projattn_qkattn_avout_proj)
r   r   r   r   r   rE   rG   r   maxr   )
rA   rJ   r   LDqkvdTTCs
             r   r   z(AttentionMetrics.get_num_flops_breakdown  s
    "$$M
1aQ   "",,.. 	,$,AAqDL())AQdl*++B A	QRZ014q81urzA~)1urzA~)A	A)A-	
 
 	
r   c                    | j         | j        | j        | j        | j        f\  }}}}}|                                }|r:|| j        z  }t          d|| j        z            }t          d|| j        z            }i }	||z  | j	        z  |z  |	d<   t          ||d|z  z   z  |z  | j        z  |z            |	d<   |j        dk    r)|j        |z  d|j        z  |z  z   |z  | j	        z  |z  |	d<   |j        dk    rN|	                    dd          |j        |z  |z  | j	        z  |z  d|j        z  |z  |z  | j        z  |z  z   z   |	d<   ||z  |z  | j	        z  |z  |	d<   t          ||z  |z  | j        z  |z            |	d<   |	S )	Nr@   	qkv_inputr   
qkv_weightr   
attn_input	out_input
out_weight)r   r   r   r   r   rE   r   r   r   r   r*   r   r5   r6   r8   getr9   r   )
rA   rJ   r   r   r   r   r   r   r   
read_bytess
             r   r   z)AttentionMetrics.get_read_bytes_breakdown  s    "$$M
1aQ   "" 	,$,AAqDL())AQdl*++B
"#a%$*C"Ca"G
;#&qABJ'7!';d>S'SVW'W#X#X
<  !A%%'!+a#2I.IB.NN+,  |$  1$$'1~~lA'F'F%)A-0IIAMc,,r1A58LLqPQ(J|$
 #$a%!)d.G"G!"K
;#&q1uqy43H'H1'L#M#M
< r   c                 l   | j         | j        | j        | j        | j        f\  }}}}}|                                }|r:|| j        z  }t          d|| j        z            }t          d|| j        z            }||d|z  z   z  |z  | j	        z  |z  d|z  |z  |z  | j
        z  |z  ||z  | j	        z  |z  dS )z4Calculate write memory traffic for attention layers.r@   r   )
qkv_outputkv_cache
out_output)r   r   r   r   r   rE   r   r   r   r   r   )	rA   rJ   r   r   r   r   r   r   r   s	            r   r   z*AttentionMetrics.get_write_bytes_breakdown  s    
 "$$M
1aQ   "" 	,$,AAqDL())AQdl*++B q1r6z*Q.1JJQNA
Q)==Aa%$";;a?
 
 	
r   Nr   )r   r   r   r   r   r*   r)   r   r   r   r   r   r   r   r   r   r(   rM   r,   ru   rf   rw   r4   rL   r+   r   r   r   r   r   r   r   r   g  sb        "U31---s---uSQ'''K'''$uSQ////// %ca 0 0 0#0005###GS###5###GS###  %uSQ//////E#!$$$Hc$$$ 5+++OS+++ %*E#!$4$4$4cEk444
 s    [ 
; 
 
 
 [
 6:
 
#
.2
	c3h
 
 
 
6 6:+ +#+.2+	c3h+ + + +\ 6:
 
#
.2
	c3h
 
 
 
 
 
r   r   c                   &    e Zd ZdZdededefdZdS )BaseFfnConfigParserz
    Parses FFN and MoE configuration.
    Provides: intermediate_size, num_experts, num_experts_per_tok,
    moe_intermediate_size, num_shared_experts, num_moe_layers
    ra   rb   r>   c                    |j         j        }t          |d          r|j        |j        }t	          |d|j        dz            |_        |j                                         |_        t          |ddgd          |_
        t          |ddgd          |_        t          |dd	gd          |_        |j        dk    }|r|j        nd|_        |S )
Ntext_configintermediate_size   num_experts_per_tokmoe_topkr   moe_intermediate_sizen_shared_expertsnum_shared_experts)r   	hf_confighasattrr   getattrr   r   get_num_expertsnum_expertsgetattr_from_listr   r   r   r   num_moe_layers)rA   ra   rb   r   is_moes        r   rd   zBaseFfnConfigParser.parse  s    &03&& 	"3?+F/C!(.A4CSVWCW!X!X '3CCEE#4'4a$
 $
  &7)+>?&
 &
" #4$&:;Q#
 #
 !Q&8>Ed44Ar   Nr   r   r   r   r   r     sG         * : *      r   r   c                   &    e Zd ZdZdededefdZdS )FfnParallelParserzW
    Parses FFN parallelism configuration.

    Provides: ffn_tp_size, ffn_ep_size
    ra   rb   r>   c                 v    |j         rd|j        |j        z  }}n|j        |j        z  d}}||_        ||_        |S )Nr@   )r   r   r   ffn_tp_sizeffn_ep_size)rA   ra   rb   r  r  s        r   rd   zFfnParallelParser.parse  sJ     > 	F'($,*EKK'+|dl'BAK&&r   Nr   r   r   r   r  r    sG         * : *      r   r  c                   &    e Zd ZdZdededefdZdS )InterleaveMoeLayerStepParserzg
    Parses interleave_moe_layer_step field for models like Llama4.

    Overrides: num_moe_layers
    ra   rb   r>   c                     |j         j        t          d          rj        j        t          d          r=j        dk    r2t          fdt          |j                  D                       |_        |S )Nr   interleave_moe_layer_stepr   c                 6    g | ]}|d z   j         z  dk    |S )r@   r   )r  .0layerr   s     r   
<listcomp>z6InterleaveMoeLayerStepParser.parse.<locals>.<listcomp>2  s:       	S%BBaGG GGGr   )	r   r   r   r   r  lenranger   r   rA   ra   rb   r   s      @r   rd   z"InterleaveMoeLayerStepParser.parse(  s    &03&& 	"3?+F/C C455
	-11"%   !&t'=!>!>  # #D r   Nr   r   r   r   r  r  !  sG         * : *      r   r  c                   &    e Zd ZdZdededefdZdS )MoeLayerFreqParserzy
    Parses moe_layer_freq and first_k_dense_replace fields for models like Deepseek.

    Overrides: num_moe_layers
    ra   rb   r>   c                     |j         j        t          d          rj        j        t          d          rBt          d          r2t	          fdt          |j                  D                       |_        |S )Nr   moe_layer_freqfirst_k_dense_replacec                 F    g | ]}|j         k    r|j        z  d k    |S )r   )r  r  r
  s     r   r  z,MoeLayerFreqParser.parse.<locals>.<listcomp>J  sF        999 22a77  877r   )r   r   r   r   r  r  r   r   r  s      @r   rd   zMoeLayerFreqParser.parseC  s    &03&& 	"3?+F/C3()) 	gc;R.S.S 	"%   !&t'=!>!>  # #D r   Nr   r   r   r   r  r  <  sG         * : *      r   r  c                   &    e Zd ZdZdededefdZdS )FfnQuantizationConfigParserz\
    Parses quantization configuration for FFN layers.

    Overrides: weight_byte_size
    ra   rb   r>   c                     |j         }||S |                                }|dv rd|_        n|dk    rd|_        nt          |S r   r   r   s        r   rd   z!FfnQuantizationConfigParser.parse\  s]    &;K||~~000
 %&D!W$$$'D!! #"r   Nr   r   r   r   r  r  U  sG         * : *      r   r  c            	          e Zd ZU  edd          Zeed<    edd          Zeed<    edd          Zeed<    edd          Z	eed<    edd          Z
eed<    edd          Zeed	<    edd          Zeed
<    ed          Zeed<    ed          Zeed<    ed          Zeed<    ed          Zeed<    edd          Zeed<    edd          Zeez  ed<    ed          defd            Zedefd            Zedefd            Z	 d dededeeef         fdZ	 d dededeeef         fdZ 	 d dededeeef         fdZ!dS )!
FfnMetrics.r   r   r   r   r   r   r  r  r   r   r@   r   r   r   )ger   r   after)moder>   c                     | j         dk    rK| j        sJ d| j                    | j        sJ d| j                    | j        sJ d| j                    | S )zJValidate that MoE-related fields are properly set when num_moe_layers > 0.r   zself.num_experts=zself.num_experts_per_tok=zself.moe_intermediate_size=)r   r   r   r   rD   s    r   validate_moe_fieldszFfnMetrics.validate_moe_fields  s     ""#;;%;(8%;%;;;#+KK-K0H-K-KKK+-OO/O$2L/O/OOO-r   c                     dS )Nffnr   rt   s    r   ru   zFfnMetrics.component_type  s    ur   c           	          t          t                      t                      t                      t	                      t                      t                                S rX   )rf   r   r  r   r  r  r  rt   s    r   rw   zFfnMetrics.get_parser  sJ    !!(**  '))
 
 	
r   TrJ   r   c                    | j         | j        | j        }}}| j        | j        | j        | j        f\  }}}}	|                                }
||z
  }|r|
|z  nd}|r6|| j        z  }|| j        z  }|| j	        z  }|
|| j	        z  }|r
|| j
        z  }i }|rd|z  dz  |z  |
z  |z  |d<   |r|rd|z  dz  |z  |z  |z  |d<   |r|	rd|z  dz  |z  |	z  |
z  |z  |d<   |S )z)Calculate flops breakdown for FFN layers.r   Nr      	dense_ffn
routed_ffn
shared_ffn)r   r   r   r   r   r   r   rE   r   r  r  )rA   rJ   r   r   r   DILmEMISr   Ldnum_activated_tokensflopss                 r   r   z"FfnMetrics.get_num_flops_breakdown  sc    )4+;T=Sb1$&#	
Ar1   ""V()0q1uuq 	:4<B4<B4##B~t'' :$)99$  	9!"QR!!3b!8E+  	M! 	M"#a%!)b.3G"G""LE,  	>! 	>"#a%!)b.1"4q"82"=E,r   c                    | j         | j        | j        }}}| j        | j        | j        | j        f\  }}}}	|                                }
| j        }||z
  }|r|
|z  nd}|rB|| j	        z  }|| j	        z  }|| j
        z  }|
|| j
        z  }|r
|| j        z  }|
|| j        z  }i }|rt          |
|z  | j        z  |z            |d<   t          d|z  |z  | j        z  |z            |d<   t          d|
z  |z  | j        z  |z            |d<   t          |
|z  | j        z  |z            |d<   t          ||z  | j        z  |z            |d<   |ro|rt          ||          }t          ||z  | j        z  |z            |d	<   t          d|z  |z  |z  | j        z  |z            |d
<   t          d|z  |z  | j        z  |z            |d<   t          ||z  | j        z  |z            |d<   t          ||z  |z  | j        z  |z            |d<   |	rt          |
|z  | j        z  |z            |d<   t          d|z  |z  |	z  | j        z  |z            |d<   t          d|
z  |z  |	z  | j        z  |z            |d<   t          |
|z  | j        z  |z            |d<   t          ||z  |	z  | j        z  |z            |d<   |S )z-Calculate read memory traffic for FFN layers.r   Ndense_up_gate_inputr   dense_up_gate_weightsdense_silu_inputdense_down_inputdense_down_weightsrouted_up_gate_inputrouted_up_gate_weightsrouted_silu_inputrouted_down_inputrouted_down_weightsshared_up_gate_inputshared_up_gate_weightsshared_silu_inputshared_down_inputshared_down_weights)r   r   r   r   r   r   r   rE   r   r   r  r  r*   r   r   min)rA   rJ   r   r   r   r)  r*  r+  r,  r-  r   r   r.  r/  r   num_activated_expertss                   r   r   z#FfnMetrics.get_read_bytes_breakdown  s    )4+;T=Sb1$&#	
Ar1   ""&V()0q1uuq 
	14<B4<B4##B~t'' :$)99$& 00
  	X03A11B61 1J,- 36A
T22R73 3J./ .1A
T66;. .J)* .1B22R7. .J)* 031r6D<Q3QTV3V/W/WJ+, &	 (+,@+(N(N%58(1,t/HH2M6 6
12 8;EBJ!669NNQSS8 8
34 36,,r1D4MMPRR3 3
./ 36(2-0IIBN3 3
./ 58F22T5JJRO5 5
01
  58ED55:6 6
12 8;EBJNT%::R?8 8
34 36EBJNT%>>C3 3
./ 36FT66;3 3
./ 58FQJ!66;5 5
01 r   c                    | j         | j        | j        }}}| j        | j        | j        | j        f\  }}}}	|                                }
||z
  }|r|
|z  nd}|r6|| j        z  }|| j        z  }|| j	        z  }|
|| j	        z  }|r
|| j
        z  }i }|rct          d|
z  |z  | j        z  |z            |d<   t          |
|z  | j        z  |z            |d<   t          |
|z  | j        z  |z            |d<   |r|rct          d|z  |z  | j        z  |z            |d<   t          ||z  | j        z  |z            |d<   t          ||z  | j        z  |z            |d	<   |	rlt          d|
z  |	z  |z  | j        z  |z            |d
<   t          |
|	z  |z  | j        z  |z            |d<   t          |
|	z  |z  | j        z  |z            |d<   |S )z.Calculate write memory traffic for FFN layers.r   Nr   dense_up_gate_outputdense_silu_outputdense_down_outputrouted_up_gate_outputrouted_silu_outputrouted_down_outputshared_up_gate_outputshared_silu_outputshared_down_output)r   r   r   r   r   r   r   rE   r   r  r  r*   r   )rA   rJ   r   r   r   r)  r*  r+  r,  r-  r   r.  r/  write_bytess                 r   r   z$FfnMetrics.get_write_bytes_breakdown/  s    )4+;T=Sb1$&#	
Ar1   ""V()0q1uuq 	:4<B4<B4##B~t'' :$)99$  		25A
T66;3 3K./ 03B22R70 0K+, 03A11B60 0K+,
  	 	7:,,r1D4MMPRR8 834 58(2-0IIBN5 501 58(1,t/HH2M5 501  	7:EAINT%>>C8 834 58EBJ!::R?5 501 58EAI 99B>5 501 r   Nr   )"r   r   r   r   r   r*   r)   r   r   r   r  r  r   r   r   r   r   r   r   r(   r   r   r   rM   r,   ru   rf   rw   r4   rL   r+   r   r   r   r   r   r   r  r  t  s        "U31---s---uSQ'''K''' %ca 0 0 0#0005###GS### uSQ'''K'''uSQ'''K''' #U31---s---uQxxK$uQxx'''!&q3)))#eAhh&&&  %***NC*** %*E#!$4$4$4cEk444_'"""T    #" s    [ 
; 
 
 
 [
 6:) )#).2)	c3h) ) ) )X 6:X X#X.2X	c3hX X X Xv 6:@ @#@.2@	c3h@ @ @ @ @ @r   r  c            	       Z   e Zd ZU  edd          Zeed<    edd          Zeed<    edd          Zeed<    edd          Z	eed<   eed<   e
d	efd
            Ze
d	efd            Z	 ddeded	eeef         fdZ	 ddeded	eeef         fdZ	 ddeded	eeef         fdZdS )UnembedMetrics.r   r   r   r   r   r   r   r>   c                     dS )Nunembedr   rt   s    r   ru   zUnembedMetrics.component_type~  s    yr   c                 8    t          t                                S rX   )rf   r   rt   s    r   rw   zUnembedMetrics.get_parser  s    
 
 	
r   TrJ   r   c                 x    | j         | j        }}|                                }|r
|| j        z  }dd|z  |z  |z  iS )z0Calculate flops breakdown for unembedding layer.rQ  r   )r   r   rE   r   rA   rJ   r   r   Vr   s         r   r   z&UnembedMetrics.get_num_flops_breakdown  sT     1  "" 	$,A q1uqy1}
 	
r   c                     | j         | j        }}|                                }|r
|| j        z  }||z  | j        z  ||z  | j        z  dS )z4Calculate read memory traffic for unembedding layer.)inputweight)r   r   rE   r   r   r   rT  s         r   r   z'UnembedMetrics.get_read_bytes_breakdown  sf     1  "" 	$,A UT66!ed33
 
 	
r   c                 n    | j         }|                                }|r
|| j        z  }d||z  | j        z  iS )z5Calculate write memory traffic for unembedding layer.output)r   rE   r   r   )rA   rJ   r   rU  r   s        r   r   z(UnembedMetrics.get_write_bytes_breakdown  sM     O  "" 	$,A a!ed77
 	
r   Nr   )r   r   r   r   r   r*   r)   r   r   r   rM   r,   ru   rf   rw   r4   rL   r+   r   r   r   r   r   r   rO  rO  u  s        uSQ'''K'''eCA&&&J&&&!E#!,,,c,,, %ca 0 0 0#000LLLs    [ 
; 
 
 
 [
 6:
 
#
.2
	c3h
 
 
 
 6:
 
#
.2
	c3h
 
 
 
  6:
 
#
.2
	c3h
 
 
 
 
 
r   rO  c            	           e Zd ZdeddfdZdefdZddededefd	Z	ddededefd
Z
ddededefdZ	 ddededeeef         fdZ	 ddededeeef         fdZ	 ddededeeef         fdZdedefdZdS )ModelMetricsrb   r>   Nc           	         || _         g | _        t                                          D ]}	 |                    |          }| j                            |           t                              d|                                t          |                     n# t          $ rE}t                              d|                                t          |                     Y d}~d}~ww xY wdS )z
        Parse vllm_config to instantiate metrics for each component.
        is_enabled() will return False if no component metrics could be instantiated.
        z,Instantiated ComponentMetrics [%s] with (%s)z Failed to instantiate %s from %sN)rb   metricsrp   r   r~   rm   r   inforu   r,   r   debug)rA   rb   
metric_clsmetricr}   s        r   rj   zModelMetrics.__init__  s    '/1*==?? 	 	J#44[AA##F+++B))++KK   
 $   6--//FF       	 	s   A*B
C% ;C  C%c                 2    t          | j                  dk    S Nr   )r  r^  rD   s    r   
is_enabledzModelMetrics.is_enabled  s    4<  1$$r   TrJ   r   c                 H    t          fd| j        D                       S )Nc              3   D   K   | ]}|                               V  d S rX   )r   r  rb  rJ   r   s     r   	<genexpr>z-ModelMetrics.get_num_flops.<locals>.<genexpr>  s3      QQ&6''W55QQQQQQr   r   r^  r   s    ``r   r   zModelMetrics.get_num_flops  s,    QQQQQDLQQQQQQr   c                 H    t          fd| j        D                       S )Nc              3   D   K   | ]}|                               V  d S rX   )r   rh  s     r   ri  z.ModelMetrics.get_read_bytes.<locals>.<genexpr>  s3      RR66((g66RRRRRRr   rj  r   s    ``r   r   zModelMetrics.get_read_bytes  s,    RRRRRT\RRRRRRr   c                 H    t          fd| j        D                       S )Nc              3   D   K   | ]}|                               V  d S rX   )r   rh  s     r   ri  z/ModelMetrics.get_write_bytes.<locals>.<genexpr>  s3      SSF6))#w77SSSSSSr   rj  r   s    ``r   r   zModelMetrics.get_write_bytes  s,    SSSSSdlSSSSSSr   c                     i }| j         D ]a}|                    ||          }|                                fd|                                D             }|                    |           b|S )Nc                 &    i | ]\  }} d | |S .r   r  keyval	components      r   
<dictcomp>z8ModelMetrics.get_num_flops_breakdown.<locals>.<dictcomp>  -    TTThc39,,s,,cTTTr   )r^  r   ru   itemsupdaterA   rJ   r   totalrb  	breakdownprefixedrv  s          @r   r   z$ModelMetrics.get_num_flops_breakdown  s~     l 	# 	#F66sGDDI--//ITTTT)//BSBSTTTHLL""""r   c                     i }| j         D ]a}|                    ||          }|                                fd|                                D             }|                    |           b|S )Nc                 &    i | ]\  }} d | |S rq  r   rs  s      r   rw  z9ModelMetrics.get_read_bytes_breakdown.<locals>.<dictcomp>  rx  r   )r^  r   ru   ry  rz  r{  s          @r   r   z%ModelMetrics.get_read_bytes_breakdown  s~     l 	# 	#F77WEEI--//ITTTT)//BSBSTTTHLL""""r   c                     i }| j         D ]a}|                    ||          }|                                fd|                                D             }|                    |           b|S )Nc                 &    i | ]\  }} d | |S rq  r   rs  s      r   rw  z:ModelMetrics.get_write_bytes_breakdown.<locals>.<dictcomp>  rx  r   )r^  r   ru   ry  rz  r{  s          @r   r   z&ModelMetrics.get_write_bytes_breakdown  s~     l 	# 	#F88gFFI--//ITTTT)//BSBSTTTHLL""""r   scheduler_outputc           	         t          j                    }t                      }|j        D ]M}|j        }|j                            |d          }|dk    r+|j        |z   }|                    ||d           N|j	        }t          |j                  D ]V\  }	}|j                            |d          }|dk    r'|j        |	         }
|
|z   }|dk    }|                    |||           W|                     |d          }|                     |d          }|                     |d          }t          t!          |                                          t!          |                                          t!          |                                                    }t$          j        rEt)          t          j                    |z
  |j        |j        t/          |          |||          |_        |S )zV
        Calculate perf stats for the current step based on scheduled tokens.
        r   T)r=   r@   )time	monotonicr4   scheduled_new_reqsreq_idnum_scheduled_tokensr   num_computed_tokensrB   scheduled_cached_reqs	enumeratereq_idsr   r   r   r.   r   r   envsVLLM_DEBUG_MFU_METRICSr   r"   r#   r   r2   )rA   r  t0rJ   new_reqr  r;   r<   cached_reqsir  r=   num_flops_breakdownread_bytes_breakdownwrite_bytes_breakdown
perf_statss                   r   get_step_perf_stats_per_gpuz(ModelMetrics.get_step_perf_stats_per_gpu  s    ^    (: 		> 		>G^F)>BB61MMJQ "5
BKGGJG==== '<";#677 	9 	9IAv)>BB61MMJQ #."A!"D-
:K $aJGGJZ8888"::3EE#<<S$GG $ > >sD I I#**,,--$++--..%,,..//
 

 & 		%3  2%('s#$%& &J" r   r   )r   r   r   r   rj   rL   re  r4   r*   r   r   r   r+   r,   r   r   r   r   r.   r  r   r   r   r\  r\    s       J 4    2%D % % % %R R!1 RD RC R R R RS S"2 ST SS S S S ST T#3 Td Tc T T T T 6:	 	#	.2		c3h	 	 	 	 6:	 	#	.2		c3h	 	 	 	 6:	 	#	.2		c3h	 	 	 	< /<	< < < < < <r   r\  c                   :    e Zd Zd Zd ZdeddfdZdedefd	Z	dS )
PerfMetricsDebugLoggingc                 .    |                                   d S rX   )resetrD   s    r   rj   z PerfMetricsDebugLogging.__init__A  s    

r   c                 v    d| _         d| _        d| _        d| _        i | _        i | _        i | _        i | _        d S )Nr    r   )total_calc_durationtotal_num_prefill_requeststotal_num_decode_requeststotal_num_batchestotal_context_breakdown!total_num_flops_per_gpu_breakdown"total_read_bytes_per_gpu_breakdown#total_write_bytes_per_gpu_breakdownrD   s    r   r  zPerfMetricsDebugLogging.resetD  sH    *- /0'./&&'79$AC.BD/CE000r   r2   r>   Nc                    | xj         |j        z  c_         | xj        |j        z  c_        | xj        |j        z  c_        | xj        dz  c_        t          | j        | j	        | j
        | j        g|j        |j        |j        |j        g          D ]R\  }}t!          |t"                    sJ |                                D ]!\  }}|                    |d          |z   ||<   "Sd S )Nr@   r   )r  r!   r  r"   r  r#   r  zipr  r  r  r  r$   r%   r&   r'   r   r+   ry  r   )rA   r2   dstsrcrt  ru  s         r   observezPerfMetricsDebugLogging.observeN  s     K$==  '';+KK''&&+*II&&!#,678	 -7<=	
 
 	1 	1HC c4(((((IIKK 1 1S773??S0C1	1 	1r   
log_prefix
delta_timec                    d | j                                         D             }d | j                                        D             }d | j                                        D             }t                              d|t          j        | j        | j	        | j
        | j        ||||dd| j        |z  dd	d	
                     d S )Nc                 (    i | ]\  }}||d z  ddS )   mB.1fTFr   r  kvs      r   rw  z/PerfMetricsDebugLogging.log.<locals>.<dictcomp>h  s>     -
 -
 -
1 !d(""""-
 -
 -
r   c                 (    i | ]\  }}||d z  ddS     eAr  GBr   r  s      r   rw  z/PerfMetricsDebugLogging.log.<locals>.<dictcomp>l  s>     .
 .
 .
1 !c'!!!!.
 .
 .
r   c                 (    i | ]\  }}||d z  ddS r  r   r  s      r   rw  z/PerfMetricsDebugLogging.log.<locals>.<dictcomp>p  s>     /
 /
 /
1 !c'!!!!/
 /
 /
r   z%sMFU details: %sr  sz.1%)	prefill_reqsdecode_reqsnum_batchesr$   flops_breakdownnum_read_bytes_breakdownnum_write_bytes_breakdowndurationmfu_calc_overheadr   )indent)r  ry  r  r  r   r`  jsondumpsr  r  r  r  r  )rA   log_fnr  r  r  r  r  s          r   logzPerfMetricsDebugLogging.logf  s   -
 -
>DDFF-
 -
 -
).
 .
?EEGG.
 .
 .
*/
 /
@FFHH/
 /
 /
+
 	J$($C#'#A#'#9)-)E'H0R2U#- 4 4 4 43j@FF    	
 	
 	
 	
 	
r   )
r   r   r   rj   r  r   r  r,   r(   r  r   r   r   r  r  @  sy          F F F1> 1d 1 1 1 10"
c "
u "
 "
 "
 "
 "
 "
r   r  c                   P    e Zd ZdefdZd ZdeddfdZej	        dfd	e
ddfd
ZdS )PerfMetricsLoggingrb   c                     || _         |j        j        | _        d | _        t
          j        rt                      | _        |                                  d S rX   )	rb   r   r   r   debug_loggingr  r  r  r  )rA   rb   s     r   rj   zPerfMetricsLogging.__init__  sI    &"2I=A& 	;!8!:!:D

r   c                     t          j                    | _        d| _        d| _        d| _        | j        r| j                                         d S d S rd  )r  r  last_log_timetotal_num_flops_per_gputotal_read_bytes_per_gputotal_write_bytes_per_gpur  r  rD   s    r   r  zPerfMetricsLogging.reset  sZ    !^--,-$-.%./& 	'$$&&&&&	' 	'r   r  r>   Nc                     | xj         |j        z  c_         | xj        |j        z  c_        | xj        |j        z  c_        | j        r*|j        J | j                            |j                   d S d S rX   )	r  r/   r  r0   r  r1   r  r2   r  )rA   r  s     r   r  zPerfMetricsLogging.observe  s    $$
(DD$$%%)JJ%%&&**LL&& 	?)555&&z'=>>>>>	? 	?r    r  c                 R   | j         s| j        s	| j        sd S t          j                    }|| j        z
  }|dk    rd}d}n"| j         |z  dz  }| j        | j        z   |z  dz  } |d|||           | j        r| j                            |||           |                                  d S )Nr    r  r  z"%sMFU: %.1f TF/s/GPU %.1f GB/s/GPU)	r  r  r  r  r  r  r  r  r  )rA   r  r  nowr  avg_tflops_per_gpuavg_gbps_per_gpus          r   r  zPerfMetricsLogging.log  s    (	,	 -	
 Fn4--
!$"!%!=
!JT!Q.1OO  	0		
 	
 	
  	C""6:zBBB

r   )r   r   r   r   rj   r  r.   r  r   r_  r,   r  r   r   r   r  r    s        J    ' ' '?) ? ? ? ? ?      #  t            r   r  objattrc                 h    t          | |          st          d| d          t          | |          S )zMGet an attr from an object, or throw a InvalidComponentError if it's not set.zMissing required attr z
 in config)r   r   r   )r  r  s     r   r   r     s?    3 JHHHHIII3r   attrsdefaultc                 T    |D ]$}t          | |          rt          | |          c S %|S )zdTry to get the first attr that exists in the object
    from a list of attrs. Otherwise return None.)r   r   )r  r  r  r  s       r   r   r     sE      & &3 	&3%%%%%	&Nr   rX   )Dr   r  r  abcr   r   collections.abcr   dataclassesr   r   typingr   r	   r   pydanticr
   r   r   r   typing_extensionsr   	vllm.envsr  vllm.configr   vllm.loggerr   vllm.utils.torch_utilsr   r   r   vllm.v1.core.sched.outputr   r   r   	Exceptionr   r   r.   r4   rO   r`   rf   rq   r+   r,   rT   r)   rp   r   r   r   r   r   r  r  r  r  r  rO  r\  r  r  rY   r   ri   r   r   r   r   <module>r     s    
   # # # # # # # # $ $ $ $ $ $ ) ) ) ) ) ) ) )                  G G G G G G G G G G G G " " " " " "       " " " " " " # # # # # #         
 6 5 5 5 5 5	X			 	 	 	 	y 	 	 	 D D D D D D D D . . . . . . . . 9 9 9 9 9 9 9 9x! ! ! ! ! ! ! !0    X          * DF T#t,>'?"?@ E E EEJ EJ EJ EJ EJy# EJ EJ EJV. . . . .v . . .h       *       8~
 ~
 ~
 ~
 ~
' ~
 ~
 ~
H    &   B       *    6   6       2    &   >{ { { { {! { { {B<
 <
 <
 <
 <
% <
 <
 <
DC C C C C C C CRH
 H
 H
 H
 H
 H
 H
 H
V> > > > > > > >Hf C     6 $s) f      r   