
    -`it                        d dl Z d dlmZ d dlmZ d dlmZmZmZ d dl	Z	d dl
mZmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" erd dl#m$Z$ d dl%m&Z& d dl'm(Z( neZ$eZ&eZ( ee)          Z*ed         Z+ed         Z,ed         Z-ed         Z.ed         Z/ee G d d                                  Z0ee G d d                                  Z1dS )    N)Callablereplace)TYPE_CHECKINGAnyLiteral)Fieldfield_validatormodel_validator)	dataclass)ProcessGroupReduceOp)Self)config)init_logger)vllm_is_batch_invariant)current_platform)get_open_ports_list)cuda_device_count_stateless)
RuntimeEnv)PlacementGroupExecutor)linearround_robin)raympuniexternal_launcher)r   r   default)naivepplxdeepep_high_throughputdeepep_low_latencymoriallgather_reducescatterflashinfer_all2allvc                       e Zd ZU dZdZeed<   	 dZeed<   	  edd          Z	eed<   	 d	Z
eed
<   	 dZeed<   	 d	Zeed<   	 dZeed<   	  ed          defd            ZdS )
EPLBConfigz6Configuration for Expert Parallel Load Balancing (EP).i  window_sizei  step_intervalr   r    genum_redundant_expertsFlog_balancedness   log_balancedness_interval	use_asyncr    policyaftermodereturnc                     | j         r| j        dk    rt          d          | j        r| j        dk    rt          d          | S )Nr    z5Async EPLB is only supported with the default policy.r   z1log_balancedness_interval must be greater than 0.)r2   r3   
ValueErrorr/   r1   selfs    h/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/config/parallel.py_validate_eplb_configz EPLBConfig._validate_eplb_configW   sU    > 	VdkY66TUUU  	RT%Cq%H%HPQQQ    N)__name__
__module____qualname____doc__r*   int__annotations__r+   r	   r.   r/   boolr1   r2   r3   EPLBPolicyOptionr   r   r=    r>   r<   r)   r)   4   s          A@K0M3 "'qQ!7!7!73777D"d""" &'s&&& It  )F(((D_'"""t    #"  r>   r)   c                      e Zd ZU dZdZeed<   	 dZeed<   	 dZeed<   	 dZ	eed<   	 dZ
eed<   	 dZeed	<   	 d
Zed
z  ed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 d
Zed
z  ed<   	 dZeed<   	 dZeed<   	  ee          Zeed<   	 dZeed<   	 dZeed<   	 d
Z ed
z  ed <   	 dZ!eed!<   	 dZ"eed"<   	 dZ#eed#<   	 d$Z$eed%<   	 d&Z%eed'<   	  ed
(          Z&eed)<   	 dZ'eed*<   	 d
Z(e)d
z  ed+<   	 d
Z*e+d
z  ed,<   	 d
Z,ee-z  e.e/         z  d
z  ed-<   	 d.Z0eed/<   	 d.Z1eed0<   	 d1Z2eed2<   	 dZ3eed3<   	 d4Z4eed5<   	 dZ5eed6<   	 dZ6eed7<   	  ed8          Z7eed9<   	 dZ8eed:<   	  ee9          Z:e9e         ed;<   	 dZ;eed<<   	 dZ<eed=<   	 dZ=eed><   	  ed8          Z>eed?<   	  edd@          Z?eedA<   	  eddBC          Z@eedD<   	  eAd)dEF          eBdGeCdHeDdIeCfdJ                        ZE eFdKF          dIeGfdL            ZHeIdIefdM            ZJeIdIefdN            ZKeIdIefdO            ZLeIdIefdP            ZMdIefdQZNdIeOfdRZPeIdIefdS            ZQeIdIefdT            ZReIdIefdU            ZSeIdIefdV            ZTeUdWeOdXedIefdY            ZVeUdWeOdZedIefd[            ZWd\ ZXdad]ZYeIdIefd^            ZZ eFdKF          dIeGfd_            Z[dIeGfd`Z\d
S )bParallelConfigz,Configuration for the distributed execution.r0   pipeline_parallel_sizetensor_parallel_sizeprefill_context_parallel_sizedata_parallel_sizedata_parallel_size_localr   data_parallel_rankNdata_parallel_rank_localz	127.0.0.1data_parallel_master_ipins  data_parallel_rpc_porti<s  data_parallel_master_portr   data_parallel_backendFdata_parallel_external_lbdata_parallel_hybrid_lbis_moe_modelenable_expert_parallelenable_eplb)default_factoryeplb_configr   expert_placement_strategyr&   all2all_backendmax_parallel_loading_workersdisable_custom_all_reduce
enable_dboubatch_size    dbo_decode_token_thresholdi   dbo_prefill_token_threshold)r    #disable_nccl_for_dp_synchronizationray_workers_use_nsightray_runtime_envplacement_groupdistributed_executor_backendauto
worker_clssd_worker_cls worker_extension_clsmaster_addri=s  master_port	node_ranknnodes)init
world_sizerank_data_parallel_master_port_listdecode_context_parallel_sizedcp_kv_cache_interleave_sizecp_kv_cache_interleave_sizedata_parallel_index)r    gt_api_process_countr,   _api_process_rankwrapr5   valuehandlerr7   c                      |dn
 ||          S )zFSkip validation if the value is `None` when initialisation is delayed.NrG   )clsr   r   s      r<   _skip_none_validationz$ParallelConfig._skip_none_validation.  s     }tt''%..8r>   r4   c                 F   | j         | j        k    rt          d| j         d| j                    | j        | j        k    r t          d| j         d| j         d          | j        dk    r| j        rt          d          | j        rlt          j                    st          d          | j	        st          d	          | j
        | j        z  dk    r t          d
| j
         d| j         d          n-| j        j        dk    rt          d| j        j         d          | S )NzBInvalid value of `_api_process_rank`. Expected to be `-1` or `[0, z)`, but found: zdata_parallel_size_local (z!) must be <= data_parallel_size ()r0   zEdata_parallel_external_lb can only be set when data_parallel_size > 1zXExpert parallelism load balancing is only supported on CUDA devices or ROCm devices now.z0enable_expert_parallel must be True to use EPLB.zZEPLB requires tensor_parallel_size or data_parallel_size to be greater than 1, but got TP=z,DP=.r   z num_redundant_experts is set to zL but EPLB is not enabled. Either enable EPLB or unset num_redundant_experts.)r~   r|   r9   rN   rM   rU   rY   r   is_cuda_alikerX   rK   r[   r.   r:   s    r<   _validate_parallel_configz(ParallelConfig._validate_parallel_config4  s   !T%<<<7/3/F7 7"47 7   (4+BBBMT-J M M262IM M M  
 "a''D,J'W    	#133  8   . U !STTT(4+BBaGG T3T T9=9PT T T   H 5:: -'=- - -   r>   c                      | j         | j        z  S )zaworld_size_across_dp is TPxPPxDP, it is the size of the world
        including data parallelism.)rt   rM   r:   s    r<   world_size_across_dpz#ParallelConfig.world_size_across_dpa  s     !888r>   c                 &    | j         p
| j        dk    S Nr0   r`   ra   r:   s    r<   use_ubatchingzParallelConfig.use_ubatchingg  s    6$"2Q"66r>   c                 "    | j         rdn| j        S )N   r   r:   s    r<   num_ubatcheszParallelConfig.num_ubatchesk  s    O9qq)99r>   c                     | j         p| j        S )z
        Client manages local+remote EngineCores in pure internal LB case.
        Client manages local EngineCores in hybrid and external LB case.
        )rU   rV   r:   s    r<   local_engines_onlyz!ParallelConfig.local_engines_onlyo  s     -M1MMr>   c                 v    | j         r| j                                         }n| j        }| xj        dz  c_        |S )a  
        We might need to initialize process groups in multiple
        processes that is related to data parallelism,
        e.g. both in the worker and in the engine, which
        can live in different processes. To avoid port conflicts, we
        pop a new port from the prepared port list each time we need to
        initialize a new process group related to data parallelism.
        r0   )rv   poprS   )r;   answers     r<   get_next_dp_init_portz$ParallelConfig.get_next_dp_init_portw  sF     / 	09==??FF3F**a/**r>   c                 N   ddl m} ddlm} d}d }t	          |          D ]}	  || j        |                                 | j        | j        t          j
                  c S # |$ r9}dt          |          v r!t                              d           |}Y d }~w|d }~ww xY w|J |)Nr   )DistNetworkError).stateless_init_torch_distributed_process_group   )backend
EADDRINUSEz1Address already in use. Retrying with a new port.)torch.distributedr   vllm.distributed.utilsr   rangerQ   r   rO   rM   r   dist_backendstrloggerwarning)r;   r   r   max_retrieslast_exc_es          r<   stateless_init_dp_groupz&ParallelConfig.stateless_init_dp_group  s    	766666	
 	
 	
 	
 	
 	
 %){## 	 	AEE0..00++,9      $   3q66))NN#VWWW HHHHH ###s   :A  B%-BBBc                 N    | j         dv o| j        o| j        dk    o
| j        dk    S )N)r&   r!   r#   r$   r%   r0   )r]   rX   rK   rM   r:   s    r<   use_sequence_parallel_moez(ParallelConfig.use_sequence_parallel_moe  sJ       
, +
, )A-
, '!+	
r>   c                      | j         | j        z  S N)rq   nnodes_within_dpr:   s    r<   node_rank_within_dpz"ParallelConfig.node_rank_within_dp  s    ~ 555r>   c                 N    | j         dk    rdS | j        | j        z  }| j         |z  S r   )rr   rM   rN   )r;   data_parallel_node_sizes     r<   r   zParallelConfig.nnodes_within_dp  s7    ;!1#t'DD 	  {555r>   c                      | j         | j        z  S r   )rt   r   r:   s    r<   local_world_sizezParallelConfig.local_world_size  s    $"777r>   dp_grouphas_unfinishedc                     t          j        |gt           j        d          }t           j                            |t
          j        |            t          |                                          }|S )Ncpudtypedeviceopgroup)	torchtensorint32distributed
all_reducer   MAXrE   item)r   r   r   aggregated_has_unfinisheds       r<   has_unfinished_dpz ParallelConfig.has_unfinished_dp  s[    ~.ek%PPP
 	$$VH$MMM$($7$7!((r>   kv_cache_memoryc                    |dk    r#t          j        t           j                  j        }t          j        |gt           j        d          }t           j                            |t          j        |            |	                                S )Nr}   r   r   r   )
r   iinfoint64maxr   r   r   r   MINr   )r   r   r   s      r<   sync_kv_cache_memory_sizez(ParallelConfig.sync_kv_cache_memory_size  sl    b  #k%+66:O/u{5QQQ 	$$VH$MMM{{}}r>   c                 H    h d}ddl m}m}  || |          } ||          S )a  
        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.

        This hash is also used for DP worker configuration validation
        to prevent hangs from mismatched collective communication patterns.
        >   ru   rr   rq   rk   ro   rp   rl   rh   rg   r~   r|   rO   rz   rn   rT   rR   rf   rV   rQ   rP   rN   rU   rS   r_   ri   r^   rv   r   )get_hash_factorshash_factors)vllm.config.utilsr   r   )r;   ignored_factorsr   r   factorss        r<   compute_hashzParallelConfig.compute_hash  sP    
 
 
> 	EDDDDDDD""499|G$$$r>   c                    t          j        d          r+t                              d           t           j        | _        | j        | j        z  | j        z  | _	        | j
        dk    r/t                              d           | xj	        | j        z  c_	        | j        dk    s| j        dk    r| j
        dk    rTt          t          j        d                   | j	        | j        z  z  | _        t                              d| j                   | j        st'          d	          | _        | j                                        | _        d| j        cxk    r| j        k     s"n t-          d
| j         d| j         d          nxt           j        | _        t           j        | _        t           j        | _        t           j        | _        t           j        | _        | j        dk    r| j        du rt-          d          | j        | _        | j
        dk    r)dt          j        d<   t                              d           | j
        6| j	        dk    r*ddl m!} d}|"                                }tG          j$                    rt           j%        rd}ntG          j&                    r| j'        dk    rd}ntG          j&                    r@tQ                      | j	        k     r)tQ                      }t-          d| j	         d| d          | j)        dk    rt                              d           d}n.|r,| j*        rd}n"ddl+m,}  |            rddl-m.}  |            rd}|| _
        t          /                    d|           | j
        | j	        dk    rd| _
        | j0        t          1                    d           d}| j
        |vr| j'        dk    rt-          d          d S d S ) NVLLM_ALL2ALL_BACKENDzVLLM_ALL2ALL_BACKEND environment variable is deprecated and will be removed in v0.15.0. Please use the --all2all-backend command-line argument instead.r   z2Using external launcher for distributed inference.r0   r   RANKz+Set data_parallel_rank to %d automatically.r   zdata_parallel_rank (z) must be in the range [0, r   FzDOffline data parallel mode is not supported/useful for dense models.0VLLM_ENABLE_V1_MULTIPROCESSINGz3Disabling V1 multiprocessing for external launcher.	ray_utilsr   r   zWorld size (z/) is larger than the number of available GPUs (z) in this node. If this is intentional and you are using:
- ray, set '--distributed-executor-backend ray'.
- multiprocessing, set '--nnodes' appropriately.r   zDUsing ray distributed inference because data_parallel_backend is ray)is_initialized)get_current_placement_groupz.Defaulting to use %s for distributed inferencezLmax_parallel_loading_workers is currently not supported and will be ignored.)r   r   r   z]nnodes > 1 can only be set when distributed executor backend is mp, uni or external_launcher.)2envsis_setr   warning_oncer   r]   rJ   rK   rL   rt   ri   inforM   rN   rC   osenvironrO   rv   r   r   rS   r9   VLLM_DP_SIZEVLLM_DP_RANKVLLM_DP_RANK_LOCALrP   VLLM_DP_MASTER_IPrQ   VLLM_DP_MASTER_PORTrW   rz   vllm.v1.executorr   ray_is_availabler   is_tpuVLLM_XLA_USE_SPMDis_cudarr   r   rT   rh   r   r   ray.utilr   debugr^   r   )r;   r   r   	ray_found	gpu_countray_is_initializedr   allowed_backendss           r<   __post_init__zParallelConfig.__post_init__  s   ;-.. 	=C  
 $(#<D  ''(01 	 ,0CCCKKLMMMOOt66OO"Q&&$*G1*L*L04GGG +.bj.@*A*AOt'>>+' A+   7 N7J17M7M4-1-Q-U-U-W-WD*0JJJJ43JJJJJ L4+B L L151HL L L   K '+&7D#&*&7D#,0,CD)+/+AD(-1-ED*&**t/@E/I/I )  
 $(#: ,0CCC;>BJ78KKMNNN,419L9L 32222226G!2244I&((  ,T-C  ,!)++ ,a (**,/11DOCC799	 G4? G G'0G G G   +u443     
,' 	,#GGHHHHHH))++ ,HHHHHH6688 ,&+G07D-LLI7SSS,4A9M9M05D-,8NN5   >-5EEEa;   FEr>   c                 x    | j         dk    p/t          | j         t                    ot          | j         dd          S )Nr   uses_rayF)ri   
isinstancetypegetattrr:   s    r<   use_rayzParallelConfig.use_ray  s?    0E9 
t8$?? N9:uMM	
r>   c                 H   ddl m} t                      rd| _        | j        at          | j        t                    sGt          | j        t                    rt          | j        |          st          d| j         d          | j
        rddl m} |                                 t          j                    s!d| _        t                              d           | j        dk    r!d| _        t                              d	           | j        r| j
        st          d
          | S )Nr   r   Tz*Unrecognized distributed executor backend zl. Supported values are 'ray', 'mp' 'uni', 'external_launcher',  custom Executor subclass or its import path.r   zVDisabled the custom all-reduce kernel because it is not supported on current platform.r0   zBDisabled the custom all-reduce since we are running on multi-node.z;Unable to use nsight profiling unless workers run with Ray.)r   r   r   r_   ri   r   r   r   
issubclassr9   r   r   assert_ray_availabler   use_custom_allreducer   r   rr   rf   )r;   r   r   s      r<   _verify_argszParallelConfig._verify_args  sr    	.----- #$$ 	2-1D* -9t@#FF : 4<dCC : t@(KK	 : @4@ @ @   < 	-222222**,,,466 	-1D*LL1   ;??-1D*LLT   & 	t| 	M   r>   c                     t          | fi |S r   r   )r;   kwargss     r<   r   zParallelConfig.replace  s    t&&v&&&r>   )r7   N)]r?   r@   rA   rB   rJ   rC   rD   rK   rL   rM   rN   rO   rP   rQ   r   rR   rS   rT   DataParallelBackendrU   rE   rV   rW   rX   rY   r	   r)   r[   r\   ExpertPlacementStrategyr]   All2AllBackendr^   r_   r`   ra   rc   rd   re   rf   rg   r   rh   r   ri   DistributedExecutorBackendr   r   rk   rl   rn   ro   rp   rq   rr   rt   ru   listrv   rw   rx   ry   rz   r|   r~   r
   classmethodr   r   r   r   r   r   propertyr   r   r   r   r   r   r   r   r   r   r   staticmethodr   r   r   r   r   r   r   rG   r>   r<   rI   rI   `   s         76"#C###- !#!!!+)*!3***4G$%c%%%/*+/cDj///#.S...)"'C'''+%*s***+15.555A&+t+++- %*T)))# !%L$+$$$7#(D(((NKB#eJ???K???+9A6AAA> '@O^???K 04 #*333" ',t+++EJ;K &(((( (+***$
 16d0C0C0C'CCC $)D((( `)-OZ$&---A-1O^d*1118 	 !((4>9D@  A J1M3M "#""" #K""":K:Is:FCOOO: e'''J'''JD#MMM+16t1L1L1L#T#YLLL )* #)))* )* #))) ()(((  %u%000000, $eA!444444 #U1444s444 _:HHH9# 9 9S 9 9 9 [ IH9 _'"""*4 * * * #"*X 9c 9 9 9 X9
 7t 7 7 7 X7 :c : : : X: ND N N N XNs    "$ $ $ $ $^ 
4 
 
 
 X
 6S 6 6 6 X6 6# 6 6 6 X6 8# 8 8 8 X8 )L )$ )4 ) ) ) \) L 3 SV    \-% -% -%^z z z zx 
 
 
 
 X
 _'"""+d + + + #"+Z'4 ' ' ' ' ' 'r>   rI   )2r   collections.abcr   dataclassesr   typingr   r   r   r   pydanticr	   r
   r   pydantic.dataclassesr   r   r   r   typing_extensionsr   	vllm.envsr   r   r   vllm.loggerr   *vllm.model_executor.layers.batch_invariantr   vllm.platformsr   vllm.utils.network_utilsr   vllm.utils.torch_utilsr   ray.runtime_envr   ray.util.placement_groupr   r   r   r?   r   r  r  r  rF   r  r)   rI   rG   r>   r<   <module>r     sn   
			 $ $ $ $ $ $       . . . . . . . . . .  < < < < < < < < < < * * * * * * 4 4 4 4 4 4 4 4 " " " " " "       $ $ $ $ $ $ # # # # # #      , + + + + + 8 8 8 8 8 8 > > > > > > ******777777)))))))JNH	X		!"9: $%LM k* 9%  
' ' ' ' ' ' '  'T 
l	' l	' l	' l	' l	' l	' l	'  l	' l	' l	'r>   