
    -`iK?                        U d dl Z d dlmZ d dlmZ d dlmZmZ d dlm	Z	m
Z
 d dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ  ee          Zej        d k    Ze e!d<   d a"e#e!d<   d a$e#e!d<   ej        a%e#e!d<    ee&          Z'ee!d<    G d de
          Z(dej)        de*de&e*         fdZ+dej)        de*de*de*de&e*         f
dZ,e G d d                      Z-e G d d                      Z.da/e.dz  e!d<   de.fdZ0de fd Z1d dej2        ddddd!fd"e	d#ed$e*d%e-dz  d&ed'e(dz  d(edz  d)e3e4ej)        f         dz  d*e3e4e	f         dz  d+e fd,Z5ed-e.dz  fd.            Z6ed ddej2        dddd!fd"e	d#ed$e*d/e*dz  d0ej)        dz  d&ed'e(dz  d(edz  d)e3e4ej)        f         e&e3e4ej)        f                  z  dz  d+e fd1            Z7dS )2    N)defaultdict)contextmanager)	dataclassfield)Any
NamedTuple)CUDAGraphModeParallelConfig
VllmConfig)init_logger)current_platform)AttentionMetadata)coordinate_batch_across_dp)UBatchSlicestrack_batchsizelast_logging_timeforward_start_timebatchsize_logging_intervalbatchsize_forward_timec                   \    e Zd ZU dZeed<   dZedz  ed<   	 dZeed<   	 dZ	eed<   	 d
d	Z
dS )BatchDescriptorz
    Batch descriptor for cudagraph dispatching. We should keep the num of
    items as minimal as possible to properly and uniquely describe the padded
    batch for cudagraph.
    
num_tokensNnum_reqsFuniformhas_lorareturnc                 <    t          | j        dd| j                  S )z
        Return a relaxed version of current batch descriptor that is still compatible
        with PIECEWISE cudagraphs (or mixed prefill-decode FA cudagraphs).
        NF)r   r   r   )r   r   r   selfs    h/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/forward_context.py relax_for_mixed_batch_cudagraphsz0BatchDescriptor.relax_for_mixed_batch_cudagraphs3   s(    
 OdEDM
 
 
 	
    )r   r   )__name__
__module____qualname____doc__int__annotations__r   r   boolr   r!    r"   r    r   r      s           OOOHcDj GT Hd
 
 
 
 
 
r"   r   num_tokens_across_dp_cpusequence_parallel_sizer   c                 j    | |z   dz
  |z  }|                     |          }|                                S N   )repeat_interleavetolist)r+   r,   	sp_tokenss      r    _compute_sp_num_tokensr3   =   sD     	!#99A=	 I ++,BCCIr"   max_num_tokens	chunk_idxc                     t          | |          }t          |          }dg|z  }t          |          D ]2}t          |||         ||z  z
            ||<   ||         dk    rd||<   3|S )Nr   r/   )r3   lenrangemin)r+   r,   r4   r5   r2   sp_size
local_sizeis           r    !_compute_chunked_local_num_tokensr>   H   s     ''?AWXXI)nnGJ7^^  NIaLNY<V,WXX
1a=AJqMr"   c            	          e Zd ZU ej        ed<   ej        ed<   dZee         dz  ed<   e	de
dedej        dd fd            Zed	ed
edefd            Zed	efd            Zdee         dz  fdZdedej        fdZdS )
DPMetadatamax_tokens_across_dp_cpur+   Nlocal_sizesparallel_configr   r   c                     |J | j         dk    sJ | j        dusJ | j        }|}||         |k    sJ ||          d|             t          j        |          }t          ||          S )Nr/   F )data_parallel_sizeis_moe_modeldata_parallel_ranktorchmaxr@   )rC   r   r+   dp_rank	batchsizerA   s         r    makezDPMetadata.makeb   s     (3331A5555+58888!4	 (0I==='0>>9>> >== $)9-E#F#F 24LMMMr"   r,   max_chunk_size_per_rankr5   c              #   |   K   t          | j        |||          | _        	 | j        V  d| _        dS # d| _        w xY w)a  
        Context manager to compute and temporarily set the per-rank local token
        sizes for a specific chunk during chunked forward execution.

        This is necessary to ensure each DP (data parallel) rank processes its
        designated portion of tokens in lockstep with others, even when the
        token counts are uneven or some ranks have completed their input early.

        For chunked execution, we break up the total tokens on each rank into
        multiple chunks (of at most `max_chunk_size_per_rank`), and for a given
        `chunk_idx`, this context manager sets `self.local_sizes` to the number
        of tokens to process in that chunk on each rank.

        `self.local_sizes` is only valid inside the context.

        Args:
            sequence_parallel_size: When Attn is TP and MoE layers are EP,
                                    we use SP between the layers to avoid
                                    redundant ops. We need this value to
                                    compute the chunked sizes.
            max_chunk_size_per_rank: The max number of tokens each rank is
                                     allowed to process in this chunk.
            chunk_idx: The index of the chunk to compute sizes for.
        N)r>   r+   rB   )r   r,   rN   r5   s       r    chunked_sizeszDPMetadata.chunked_sizesv   s_      8 =)"#	
 
	$""""#DtD####s   	2 	;c              #   x   K   t          | j        |          | _        	 | j        V  d| _        dS # d| _        w xY w)z|
        Context manager for setting self.local_sizes. Same as self.chunked_sizes
        but without any chunking.
        N)r3   r+   rB   )r   r,   s     r    sp_local_sizeszDPMetadata.sp_local_sizes   sY       2)+A
 
	$""""#DtD####s   	0 	9c                 "    | j         J | j         S N)rB   r   s    r    get_chunk_sizes_across_dp_rankz)DPMetadata.get_chunk_sizes_across_dp_rank   s    +++r"   r;   c                 x    | j         dz
  |z   |z  }|                    |          }t          j        |d          S )Nr/   r   )dim)r+   r0   rI   cumsum)r   r;   num_tokens_across_sp_cpus      r    cu_tokens_across_spzDPMetadata.cu_tokens_across_sp   sI    )A-7$  $<#M#Mg#V#V |4!<<<<r"   )r#   r$   r%   rI   Tensorr(   rB   listr'   staticmethodr
   rM   r   rP   rR   rU   rZ   r*   r"   r    r@   r@   Z   sA        #l***#l*** %)KcT!(((N'NN #(,N 
	N N N \N& $$&)$$DG$$TW$$ $$ $$ ^$$L $S $ $ $ ^$ S	D0@        =3 =5< = = = = = =r"   r@   c                   |   e Zd ZU eeef         ed<   eeef         eeeef                  z  ed<   eee	j
        f         eeee	j
        f                  z  ed<   	 eed<   dZedz  ed<   ej        Zeed<   dZedz  ed<   dZedz  ed	<   d
Zeed<   dZee         dz  ed<    ee          Zeeef         ed<   d ZdS )ForwardContextno_compile_layersattn_metadataslot_mappingvirtual_engineNdp_metadatacudagraph_runtime_modebatch_descriptorubatch_slicesFskip_compiledremaining_moe_layers)default_factoryadditional_kwargsc                 \    | j                                         sJ d| j                      d S )Nz Invalid cudagraph runtime mode: )re   valid_runtime_modesr   s    r    __post_init__zForwardContext.__post_init__   sC    *>>@@ 	
 	
Lt/JLL	
 	
@ 	
 	
r"   )r#   r$   r%   dictstrr   r(   r   r\   rI   r[   r'   rd   r@   r	   NONEre   rf   r   rg   r   rh   r)   ri   r   rk   rn   r*   r"   r    r_   r_      sR         CH~%%%../$tCAR<R7S2TTTTTsEL()Dc5<6G1H,IIIII %)Kd"))) -:,>M>>>/3o,333)-M<$&---  M40 .2$s)d*111(-d(C(C(CtCH~CCC
 
 
 
 
r"   r_   _forward_contextc                  2    t           
J d            t           S )z Get the current forward context.NzXForward context is not set. Please use `set_forward_context` to set the forward context.rr   r*   r"   r    get_forward_contextru      s$    ''	G ('' r"   c                      t           d uS rT   rt   r*   r"   r    is_forward_context_availablerw     s    4''r"   Fra   vllm_configrc   rd   re   rf   rg   rb   rk   rh   c
                     |j         j        }
ddlm fd|
                                D             }|                                 t          |
||| |pi |||||	|pi           S )Nr   )FusedMoEc                 :    g | ]\  }}t          |          |S r*   )
isinstance).0namelayerrz   s      r    
<listcomp>z*create_forward_context.<locals>.<listcomp>  s<       uZx=X=X  r"   )r`   ri   rc   ra   rb   rd   re   rf   rg   rh   rk   )compilation_configstatic_forward_context*vllm.model_executor.layers.fused_moe.layerrz   itemsreverser_   )ra   rx   rc   rd   re   rf   rg   rb   rk   rh   r`   ri   rz   s               @r    create_forward_contextr     s     $6MCCCCCC    1 7 7 9 9     """+1%#!'R5)##+1r   r"   forward_contextc              #   8   K   t           }| a 	 dV  |a dS # |a w xY w)zA context manager that overrides the current forward context.
    This is used to override the forward context for a specific
    forward pass.
    Nrt   )r   prev_contexts     r    override_forward_contextr   )  s?       $L&('<''''s    r   num_tokens_across_dpc
              #   Z  K   t           o| du}
|
rt          j                    ad}|j        j        dk    rQ| |M|(|J |J t          ||j        dd          \  }}}|J t                              |j        |pd|          }|t          j
        k    r||pt          |          }t          j        | ||||||||	  	        }t          | |||||||||	
  
        }	 t          |          5  dV  ddd           n# 1 swxY w Y   |
rH|}t          j        }|
 |             t          j                    }t"          |                             |t          z
  dz             |t&          z
  t(          k    r|ag }t"                                          D ]\  }}t-          |          dk    rt/          j        t/          j        |          d	
                                          }t7          |d          }|                    |t-          |          |f           |                    d d           |r!t:                              d|           dS dS dS dS # |
rG|}t          j        }|
 |             t          j                    }t"          |                             |t          z
  dz             |t&          z
  t(          k    r|ag }t"                                          D ]\  }}t-          |          dk    rt/          j        t/          j        |          d	
                                          }t7          |d          }|                    |t-          |          |f           |                    d d           |rt:                              d|           w w w w xY w)zA context manager that stores the current forward context,
    can be attention metadata, etc.
    Here we can inject common logic for every model forward pass.
    Nr/   F)num_tokens_unpaddedrC   allow_microbatchingallow_dp_paddingr   )r   )	ra   rx   rc   rd   r   r   re   rf   rg   i  g      ?)q   c                     | d         S r.   r*   )xs    r    <lambda>z%set_forward_context.<locals>.<lambda>  s
    1 r"   T)keyr   zDBatchsize forward time stats (batchsize, count, median_time(ms)): %s)r   timeperf_counterr   rC   rF   r   r@   rM   r	   rq   r   r   set_additional_forward_contextr   r   synchronizer   appendr   r   r   r8   rI   quantiletensoritemroundsortloggerinfo)ra   rx   rc   r   r   re   rf   rg   rb   rh   need_to_track_batchsizerd   _rk   r   rL   r   nowforward_statsbstimesmediums                         r    set_forward_contextr   8  s     $ .K-t2K 1!.00%)K"599!Z%;
  ' ((())))C$. + ;$)!&	* * *&A#Q (333 oo'q:N
 
 !333
8N+U*/U/U/U(G#%15)#
 
 
 - O"%o66 	 	EEE	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 # 	"I +6K&#%%C"9-44c<N6NRV5VWWW&&)CCC$'! "!7!=!=!?!? C CIB5zzQ "^EL,?,?3GGGLLNNF"61--F!(("c%jj&)ABBBB""~~t"DDD  KKF &    /	 	 DC - # 	"I +6K&#%%C"9-44c<N6NRV5VWWW&&)CCC$'! "!7!=!=!?!? C CIB5zzQ "^EL,?,?3GGGLLNNF"61--F!(("c%jj&)ABBBB""~~t"DDD  KKF &   /	 Ds1    I /D 4I  DI DI EN*)8r   collectionsr   
contextlibr   dataclassesr   r   typingr   r   rI   	vllm.envsenvsvllm.configr	   r
   r   vllm.loggerr   vllm.platformsr   vllm.v1.attention.backendr   vllm.v1.worker.dp_utilsr   vllm.v1.worker.ubatch_utilsr   r#   r   VLLM_LOG_BATCHSIZE_INTERVALr   r)   r(   r   floatr   r   r\   r   r   r[   r'   r3   r>   r@   r_   rr   ru   rw   rq   ro   rp   r   r   r   r*   r"   r    <module>r      s    # # # # # # % % % % % % ( ( ( ( ( ( ( ( " " " " " " " "        A A A A A A A A A A # # # # # # + + + + + + 7 7 7 7 7 7 > > > > > > 4 4 4 4 4 4	X		8A= = = = 5    E   $($D E D D D&1k$&7&7  7 7 7
 
 
 
 
j 
 
 
@#lDG	#Y   #l  	
 
#Y   $ ]= ]= ]= ]= ]= ]= ]= ]=@ 7
 7
 7
 7
 7
 7
 7
 7
t +/ .4' . . .^    (d ( ( ( ( %),9,>/3)-37/3        d"	 
 *  &,   $&  sEL()D0  CH~,         F (nt.C ( ( ( (  !04,9,>/3)-SWk kkk k d
	k
  ,-k *k &,k  $&k sEL()Dc5<6G1H,IIDPk k k k k k kr"   