
    -`i?                     B    d Z ddlZddlZddlmZ  G d de          ZdS )aA  
Expert parallelism load balancer (EPLB) for vLLM.

This module implements the core rearrangement algorithm.

The rearrangement algorithm is adapted from
[DeepSeek EPLB](https://github.com/deepseek-ai/eplb).

Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example
on how the EPLB algorithm works.
    N   )AbstractEplbPolicyc                      e Zd Zedej        dedeej        ej        f         fd            Zedej        dedeej        ej        ej        f         fd            Z	edej        deded	ed
edeej        ej        ej        f         fd            Z
edej        dej        dedej        deej        ej        f         f
d            Ze	 ddej        deded	ededej        dz  deej        ej        ej        f         fd            ZdS )DefaultEplbPolicyweight	num_packsreturnc                    |j         \  }}||z  dk    sJ ||z  }|dk    rYt          j        t          j        |t          j                  |df          }t          j        |t          j                  }||fS t          j        | d          }t          j        ||fdt          j                  }t          j        ||fdt          j                  }t          j        ||ft          j	                  }	t          j        ||ft          j                  }
t          |          D ]}|	|         }|
|         }||         D ]z}t          t          j        |                    }||||f<   ||         |||f<   ||xx         |||f         z  cc<   ||xx         dz  cc<   ||         |k    rt          j        ||<   {||fS )a  
        Pack n weighted objects to m packs, such that each bin contains exactly
        n/m objects and the weights of all packs are as balanced as possible.

        Parameters:
            weight: [X, n], the weight of each item
            num_packs: number of packs

        Returns:
            pack_index: [X, n], the pack index of each item
            rank_in_pack: [X, n], the rank of the item in the pack
        r   r   dtypeaxis)shapenptilearangeint64
zeros_likeargsortfullzerosfloat64rangeintargmininf)clsr   r   
num_layers
num_groupsgroups_per_pack
pack_indexrank_in_packindicespack_weights
pack_items	layer_idxweights_row	items_rowgrouppacks                   x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/distributed/eplb/policy/default.pybalanced_packingz"DefaultEplbPolicy.balanced_packing   s     "(
JI%****$	1a:RX!F!F!FUVXXJ=28DDDL|++ *fW2...Wj*5rJJJ
w
J728LLLxY 7rzJJJXz95RXFFF
 z** 	/ 	/I&y1K"9-I + 
/ 
/29[1122/3
9e+,1:4Y-.D!!!VIu,<%==!!!$1$T?o55(*K%
/ <''    num_phyc                    |j         \  }}||z
  }|dk    sJ t          j        t          j        |t          j                  |df          }t          j        ||ft          j                  }t          j        ||ft          j                  }t          j        |t          j                  }	t          ||          D ]G}
t          j        ||z  d          }||dd|
f<   ||	|f         |dd|
f<   ||	|fxx         dz  cc<   H|||fS )a  
        Replicate `num_log` experts to `num_phy` replicas, such that the maximum
        load of all replicas is minimized.

        Parameters:
            weight: [X, num_log]
            num_phy: total number of experts after replication

        Returns:
            phy2log: [X, num_phy], logical expert id of each physical expert
            replica_idx: [X, num_phy], the index of the replica for each logical expert
            logcnt: [X, num_log], number of replicas for each logical expert
        r   r   r   r   r   N)	r   r   r   r   r   r   onesr   argmax)r   r   r/   nnum_lognum_redundantphy2logreplica_idxlogcntarangeniredundant_indicess               r,   replicate_expertsz#DefaultEplbPolicy.replicate_expertsK   s,   " \
7')!!!!'")G28<<<q!fEEh7|28<<<!WRX666)ARX...w(( 	4 	4A "	&6/ C C C-GAAAqDM &w0A'A BK17--...!3....V++r.   num_physical_expertsr    	num_nodesnum_gpusc                    |j         \  }}||z  dk    sJ ||z  }||z  dk    sJ ||z  }	||z  dk    sJ ||z  dk    sJ ||z  }
dt          j        dt          j        fd}|                    |||                              d          }|                     ||          \  }}||	z  |z   d         |z  t          j        |t          j                  z                       ||          } ||          }t          j        ||d	                              d||z            }| 	                    |||z            \  }}}t          j        ||z  |d	          }|                     |||z            \  }}||
z  |z   } ||          }t          j        ||d	          }|                    ||d          t          j        d|||z  t          j                  d
d
d
d
f         z                       |d          }t          j        ||d	          }t          j        ||d	                              |d          }t          j        |                    |d          |d	          }|||fS )a  
        Parameters:
            weight: [num_moe_layers, num_logical_experts]
            num_physical_experts: number of physical experts after replication
            num_groups: number of expert groups
            num_nodes: number of server nodes, where the intra-node network
                (e.g, NVLink) is faster
            num_gpus: number of GPUs, must be a multiple of `num_nodes`

        Returns:
            phy2log: [layers, num_replicas], the expert
                index of each replica
            pphy_replicas_idx: [layers, num_logical_experts, X],
                the replica indices for each expert
            logcnt: [layers, num_logical_experts], number of
                physical replicas for each logical expert
        r   permr	   c                     t          j        |           }t          j        | j        d                   d d d f         }t          j        | j        d         t           j                  }|||| f<   |S )Nr   r   r   )r   
empty_liker   r   r   )rA   invrow_idxcol_idxs       r,   inversezADefaultEplbPolicy.rebalance_experts_hierarchical.<locals>.inverse   sb    -%%Ci
1..qqq$w7Gi
1RX>>>G!(CJr.   r   r   ).Nr   r   N)
r   r   ndarrayreshapesumr-   r   r   take_along_axisr<   )r   r   r=   r    r>   r?   r   num_logical_experts
group_sizegroups_per_nodephy_experts_per_gpurG   tokens_per_groupgroup_pack_indexgroup_rank_in_packlog2mlogmlog2logtokens_per_mlogphy2mlogreplicas_idxmlogcnttokens_per_phyr"   r#   phy2pphypphy2phy	pphy2mlogpphy2logpphy_replicas_idxr8   s                                 r,   rebalance_experts_hierarchicalz0DefaultEplbPolicy.rebalance_experts_hierarchicalj   s#   4 +1,'
'"Z/14444(J6
I%****$	1)#q((((#h.!33332h>	"* 	 	 	 	 	 ">>*j*MMQQ R 
 
 03/C/Ci0
 0
,, "O36HH)T i
"(333	4
 '*1
2
2 	 78$$ ,VXAFFFNN#y0
 
 +.*?*?1Y>+
 +
', +Og,ExVWXXX#&#7#7H	1$
 $
 
L  33lB78$$ &xBBB	j)R88i##y0h	  
 AAAtm '*b
!
! 	 %h	BBB.|XANNNVV
 
 #GOOJ$C$CXTUVVV*F22r.   r6   phy_replicas_idx	num_ranksold_phy2logc                    |j         d         }|dk    s	||z  dk    r||fS ||z  }|j         d         }|                                }|                                }	t          |          D ]}
|
|z  }||z   }|dd||f         }|dd||f         }|dd||f         }t          j        ||ft
                    }t          j        ||ft
                    }t          |          D ]}||dd|f         dddf         k    | z  }|                    d          }t          j        |          rjt          j        |d          }t          j        |          d         }||         }|||f         ||||z   f<   |||f         |	|||z   f<   d|||f<   d|||f<   | }| }|                                rF|                                r1t          j	        t          j
        |          |df          }|dz   }t          j        |||          }t          j        |||          }t          j        |d          }t          j        |d          }|                    d          } |                    d          }!t          j        | |!          }"t          |          D ]Z}#t          |"|#                   }$|$dk    r||#d|$f         }%||#d|$f         }&||#|%f         ||#||&z   f<   ||#|%f         |	|#||&z   f<   [||	fS )ar  
        Reorder the new mapping per GPU so that experts that remain on the same GPU
        keep their previous slot positions when possible. Incoming experts to that GPU
        fill any remaining available slots. This is applied only when the number of GPUs
        is unchanged and the slots per GPU remain the same between
        the old and new mappings.
        r   r   Nr   r   T)r   copyr   r   r   boolanyr2   nonzeror   r   wherer   rJ   minimumr   )'r   r6   r`   ra   rb   num_phy_expertsslots_per_gpur   post_phy2logpost_phy_replicas_idxgpu_idxstartend	old_local	new_localnew_ridxused_new_indicespreserved_positionsslot_idxmatcheshas_any	first_idxlayer_indicesmatched_new_positionsremaining_mask	fill_maskidx_baselargeremaining_priorityfill_priorityremaining_indicesfill_indicesremaining_countsfill_countstake_countsr'   ksrc_posdst_poss'                                          r,   preserve_intragpu_slotsz)DefaultEplbPolicy.preserve_intragpu_slots   s    "-*>>_y8A==,,, (94]1%
||~~ 0 5 5 7 7Y'' >	 >	Gm+E-'C#AAAuSyL1I59-I'595H!x](C4PPP"$(J+Fd"S"S"S "-00 H H %	!!!X+(>qqq$w(GG%% "++1+--6'?? H "	' : : :I$&Jw$7$7$:M,5m,D)DM%'<<EL0@!@A NV%'<<N)-9I*IJ NR$]4I%IJCG'x(?@ /.N,,I!!## 	 729]#;#;j!_MM%) &(Xnh%N%N" "He D D$&J/A$J$J$J!!z-a@@@#1#5#51#5#=#= 'mmm33 j)9;GG!&z!2!2  IK	233AAvv /	2A2>G*9bqb=9G?H!7*@LEGO!;< IQ!7*I))UW_*DEE 222r.   Nnum_replicasold_global_expert_indicesc                    |j         }|j        \  }}	|                                                                                                }
|&|                                                                nd}||z  dk    r|                     |
||||          \  }}}n|                     |
|dd|          \  }}}||                     ||||          \  }}||	z
  }|dz   }t          j        ||	|fdt          j	                  }t          j
        |          dddf         }t          j        t          j
        |t          j	                  |df          }|||||f<   t          j        |                              |          }t          j        |                              |          }t          j        |                              |          }|||fS )a?  
        Entry point for expert-parallelism load balancer.

        Parameters:
            weight: [layers, num_logical_experts], the load statistics for all
                logical experts
            num_replicas: number of physical experts, must be a multiple of
                `num_gpus`
            num_groups: number of expert groups
            num_nodes: number of server nodes, where the intra-node network
                (e.g, NVLink) is faster
            num_ranks: number of ranks, must be a multiple of `num_nodes`
            old_global_expert_indices: [layers, num_logical_experts], the old global
                expert indices. Used to avoid unnecessary weight copying
                for experts moving within one rank.
        Returns:
            phy2log: [layers, num_replicas], the expert
                index of each replica
            log2phy: [layers, num_logical_experts, X],
                the replica indices for each expert
            logcnt: [layers, num_logical_experts], number of
                physical replicas for each logical expert
        Nr   r   r   r   )devicer   floatcpunumpyr_   r   r   r   r   r   r   torch
from_numpyto)r   r   r   r    r>   ra   r   r   r   rL   	weight_npold_phy2log_np
phy2log_npphy_replicas_idx_np	logcnt_npnum_redundant_experts	maxlogcnt
log2phy_nprz   replica_indicesr6   log2phyr8   s                          r,   rebalance_expertsz#DefaultEplbPolicy.rebalance_experts(  s   B *0,'
'LLNN&&((..00	 )4 &))++11333 	 	!Q&& 22|ZI  7J+YY 22|Q9  7J+Y %0.1.I.I/N/ /+J+ !-/B B)A-	W,i8"BH
 
 

 	*--aaag6'Il"(333j!_
 
 FU
=*.AAB":..11&99":..11&99!),,//77''r.   )N)__name__
__module____qualname__classmethodr   rH   r   tupler-   r<   r_   r   r   Tensorr    r.   r,   r   r      s%       2(Z2(,/2(	rz2:%	&2( 2( 2( [2(h ,Z,*-,	rz2:rz1	2, , , [,< _3
_3 "_3 	_3
 _3 _3 
rz2:rz1	2_3 _3 _3 [_3B Y3Y3 *Y3 	Y3
 ZY3 
rz2:%	&Y3 Y3 Y3 [Y3v  :>O( O(O( O( 	O(
 O( O( $)<$#6O( 
u|U\5<7	8O( O( O( [O( O( O(r.   r   )__doc__r   r   r   abstractr   r   r   r.   r,   <module>r      sw   
 
      ( ( ( ( ( (c( c( c( c( c(* c( c( c( c( c(r.   