
    -`i^9                        d dl Z d dlmZmZ d dlmZ d dlZd dlmc m	Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'  ee(          Z)dedeej*                 f         dedej*        f         fdZ+ G d d          Z, G d de,          Z- G d de,          Z. ej/                    Z0 G d de,          Z1 G d de,          Z2 G d d e'          Z3dS )!    N)CallableSequence)Any)PatternMatcherPass)
VllmConfig)Range)get_tp_group tensor_model_parallel_all_reduce)$get_tensor_model_parallel_world_size)init_logger)kFp8StaticTensorSym)current_platform   )enable_fake_mode)MatcherFusedAddRMSNormMatcherQuantFP8MatcherRMSNorm)NoOpEliminationPass)VllmInductorPassVllmPatternMatcherPassfn.returnc                 n     t          j                   dt          dt          j        f fd            }|S )Nargsr   c                       |  d         S )Nr    )r   r   s    y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/compilation/sequence_parallelism.pywrapperz&get_first_out_wrapper.<locals>.wrapper"   s    r4y|    )	functoolswrapsr   torchTensor)r   r   s   ` r   get_first_out_wrapperr$      sO     _Rs u|       Nr   c                       e Zd ZdZdedej        dedz  ddfdZdej	        dej	        fd	Z
dej	        dej	        fd
Zdej	        dej	        fdZdS )_SequenceParallelPatternHelperz)Helper for sequence parallelism patterns.epsilondtypedeviceNr   c                 |    || _         || _        || _        t                      | _        t                      | _        d S N)r'   r(   r)   r	   tp_groupr   tp_size)selfr'   r(   r)   s       r   __init__z'_SequenceParallelPatternHelper.__init__,   s5     
$;==r   xc                      t          |          S r+   )r
   r.   r0   s     r   _all_reducez*_SequenceParallelPatternHelper._all_reduce8   s    /222r   c                 z    t           j        j        j                            |d| j        | j        j                  S Nr   )dim
world_size
group_name)r"   opsvllmreduce_scatterdefaultr-   r,   unique_namer2   s     r   _reduce_scatterz._SequenceParallelPatternHelper._reduce_scatter;   s6    y~,441$-:S 5 
 
 	
r   c                 z    t           j        j        j                            |d| j        | j        j                  S r5   )r"   r9   r:   
all_gatherr<   r-   r,   r=   r2   s     r   _all_gatherz*_SequenceParallelPatternHelper._all_gather@   s6    y~(001$-:S 1 
 
 	
r   )__name__
__module____qualname____doc__floatr"   r(   strr/   r#   r3   r>   rA   r   r   r   r&   r&   )   s        33
>
> {
> d
	
>
 

> 
> 
> 
>3U\ 3el 3 3 3 3
 
%, 
 
 
 


U\ 
el 
 
 
 
 
 
r   r&   c                   r     e Zd Zdedej        dedz  ddf fdZdeej	                 fdZ
deddfd	Z xZS )
FirstAllReduceRMSNormPatternr'   r(   r)   Nr   c                 v    t                                          |||           t          |          | _        d S r+   )superr/   r   rmsnorm_matcherr.   r'   r(   r)   	__class__s       r   r/   z%FirstAllReduceRMSNormPattern.__init__G   s5    %000-g66r   c                     t          j        g d| j        | j                  }t          j        dg| j        | j                  }||gS )Nr         r)   r(   rR   r"   emptyr)   r(   )r.   inputarg3_1s      r   
get_inputsz'FirstAllReduceRMSNormPattern.get_inputsK   sF    IIIdkLLLaSDJGGGvr   pm_passc                 t    dt           j        dt           j        dt          t           j        t           j        f         f fd}dt           j        dt           j        dt          t           j        t           j        f         f fd}t          j        ||                                 t          j        |           d S )NrV   rW   r   c                 b                         |           }                    ||          }||fS r+   r3   rL   )rV   rW   
all_reducermsnormr.   s       r   patternz6FirstAllReduceRMSNormPattern.register.<locals>.patternR   s8     ))%00J**:v>>GJ&&r   c                                          |           }                    ||          }                    |          }||fS r+   )r>   rL   rA   )rV   rW   r;   r^   r@   r.   s        r   replacementz:FirstAllReduceRMSNormPattern.register.<locals>.replacement[   sK     "11%88N**>6BBG))'22J~--r   r"   r#   tuplepmregister_replacementrX   fwd_onlyr.   rY   r_   ra   s   `   r   registerz%FirstAllReduceRMSNormPattern.registerQ   s    	'<	'L	' 5<-.	' 	' 	' 	' 	' 	'	.<	.L	. 5<-.	. 	. 	. 	. 	. 	. 	[$//"3"3R['	
 	
 	
 	
 	
r   rB   rC   rD   rF   r"   r(   rG   r/   listr#   rX   r   rh   __classcell__rN   s   @r   rI   rI   F   s        7 7ek 73: 7RV 7 7 7 7 7 7D.    
 2 
t 
 
 
 
 
 
 
 
r   rI   c                   r     e Zd Zdedej        dedz  ddf fdZdeej	                 fdZ
deddfd	Z xZS )
MiddleAllReduceRMSNormPatternr'   r(   r)   Nr   c                 v    t                                          |||           t          |          | _        d S r+   )rK   r/   r   rL   rM   s       r   r/   z&MiddleAllReduceRMSNormPattern.__init__k   s5    %0005g>>r   c                     t          j        ddg| j        | j                  }t          j        ddg| j        | j                  }t          j        ddg| j        | j                  }|||gS )NrR   rS   rT   )r.   mm_1residualrms_norm_weightss       r   rX   z(MiddleAllReduceRMSNormPattern.get_inputso   sr    {Aq6$+TZHHH;1vdkLLL ;1vdkTTT 
 	
r   rY   c           
      @    dt           j        dt           j        dt           j        dt          t           j        t           j        f         f fd}dt           j        dt           j        dt           j        dt          t           j        t           j        f         f fd}t          j        ||                                 t          j        |           t          j        t          |          t          |                                           t          j        |           d S )Nrr   rq   rs   r   c                 |                         |          }                    |||           }|d         |d         fS )Nr   r   r\   )rr   rq   rs   r]   r^   r.   s        r   r_   z7MiddleAllReduceRMSNormPattern.register.<locals>.pattern|   sC    
 ))$//J**:7GRRG1:wqz))r   c                                          |          }| d|                    d          df         }                     |||           }                    |d                   }||d         fS )Nr   .r   )r>   sizerL   rA   )rr   rq   rs   r;   r^   r@   r.   s         r   ra   z;MiddleAllReduceRMSNormPattern.register.<locals>.replacement   sz     "11$77NN$7$7$:$: :C ?@H**>;KXVVG))'!*55J wqz))r   r"   r#   rc   rd   re   rX   rf   r$   rg   s   `   r   rh   z&MiddleAllReduceRMSNormPattern.register{   s)   	*l	*,	* $l	* 5<-.		* 	* 	* 	* 	* 	*	*l	*,	* $l	* 5<-.		* 	* 	* 	* 	* 	*  	[$//"3"3R['	
 	
 	
 	!'**!+..OOK	
 	
 	
 	
 	
r   ri   rl   s   @r   rn   rn   j   s        ? ?ek ?3: ?RV ? ? ? ? ? ?

D. 

 

 

 

#
 2 #
t #
 #
 #
 #
 #
 #
 #
 #
r   rn   c                   r     e Zd Zdedej        dedz  ddf fdZdeej	                 fdZ
deddfd	Z xZS )
%FirstAllReduceRMSNormStaticFP8Patternr'   r(   r)   Nr   c                     t                                          |||           t          |          | _        t	          t
                    | _        d S r+   )rK   r/   r   rL   r   r   quant_matcherrM   s       r   r/   z.FirstAllReduceRMSNormStaticFP8Pattern.__init__   sH     	%000-g66,-@AAr   c                     t          j        g d| j        | j                  }t          j        dg| j        | j                  }t          j        d| j        t           j                  }|||gS )NrP   rS   rR   g      ?)r"   zerosr)   r(   rU   tensorfloat32)r.   rV   weightscales       r   rX   z0FirstAllReduceRMSNormStaticFP8Pattern.get_inputs   sc    IIIdkLLLaSDJGGGSEMJJJvu%%r   rY   c           
          dt           j        dt           j        dt           j        dt          t           j        t           j        f         f fd}dt           j        dt           j        dt           j        dt          t           j        t           j        f         f fd}t          j        ||                                 t          j        |           d S )NrV   r   r   r   c                                          |           }                    ||          }                    ||          \  }}||fS r+   r3   rL   r|   )rV   r   r   r]   rmsquant_r.   s          r   r_   z?FirstAllReduceRMSNormStaticFP8Pattern.register.<locals>.pattern   sQ    
 ))%00J&&z6::C))#u55HE1*$$r   c                                          |           }                    ||          }                    ||          \  }}                    |          }||fS r+   )r>   rL   r|   rA   )	rV   r   r   r;   r   r   r   r@   r.   s	           r   ra   zCFirstAllReduceRMSNormStaticFP8Pattern.register.<locals>.replacement   sd    
 "11%88N&&~v>>C))#u55HE1))%00J~--r   rb   rg   s   `   r   rh   z.FirstAllReduceRMSNormStaticFP8Pattern.register   s    	%<	%L	% <	% 5<-.		% 	% 	% 	% 	% 	%
	.<
	.L
	. <
	. 5<-.	
	. 
	. 
	. 
	. 
	. 
	. 	[$//"3"3R['	
 	
 	
 	
 	
r   ri   rl   s   @r   rz   rz      s        BB {B d
	B
 
B B B B B B&D. & & & &
 2 
t 
 
 
 
 
 
 
 
r   rz   c                   r     e Zd Zdedej        dedz  ddf fdZdeej	                 fdZ
deddfd	Z xZS )
&MiddleAllReduceRMSNormStaticFP8Patternr'   r(   r)   Nr   c                     t                                          |||           t          |          | _        t	          t
                    | _        d S r+   )rK   r/   r   rL   r   r   r|   rM   s       r   r/   z/MiddleAllReduceRMSNormStaticFP8Pattern.__init__   sF    %0005g>>,-@AAr   c                 0   t          j        ddg| j        | j                  }t          j        ddg| j        | j                  }t          j        ddg| j        | j                  }t          j        ddg| j        t           j                  }||||gS )NrR   rS   r   )r"   rU   r)   r(   r   )r.   rq   rr   rs   r   s        r   rX   z1MiddleAllReduceRMSNormStaticFP8Pattern.get_inputs   s    {Aq6$+TZHHH;1vdkLLL ;1vdkTTTQF4;emLLL$ 0%88r   rY   c                 p    dt           j        dt           j        dt           j        dt           j        dt          t           j        t           j        f         f
 fd}dt           j        dt           j        dt           j        dt           j        dt          t           j        t           j        f         f
 fd}t          j        ||                                 t          j        |           t          j        t          |          t          |                                           t          j        |           d S )Nrr   rq   rs   r   r   c                     	                     |          }	                    |||           \  }}	                    ||          \  }}||fS r+   r   )
rr   rq   rs   r   r]   r   residual_outr   r   r.   s
            r   r_   z@MiddleAllReduceRMSNormStaticFP8Pattern.register.<locals>.pattern   s`     ))$//J $ 4 4,h! !C ))#u55HE1,&&r   c                    
                     |          }| d|                    d          df         } 
                    |||           \  }}
                    ||          \  }}
                    |          }	|	|fS )Nr   .)r>   rw   rL   r|   rA   )rr   rq   rs   r   r;   r   r   r   r   r@   r.   s             r   ra   zDMiddleAllReduceRMSNormStaticFP8Pattern.register.<locals>.replacement   s     "11$77NN$7$7$:$: :C ?@H $ 4 4 0(! !C ))#u55HE1))%00J |++r   rx   rg   s   `   r   rh   z/MiddleAllReduceRMSNormStaticFP8Pattern.register   sA   	'l	',	' $l	' <		'
 5<-.	' 	' 	' 	' 	' 	'	,l	,,	, $l	, <		,
 5<-.	, 	, 	, 	, 	, 	,* 	[$//"3"3R['	
 	
 	
 	!'**!+..OOK	
 	
 	
 	
 	
r   ri   rl   s   @r   r   r      s        B Bek B3: BRV B B B B B B
9D. 9 9 9 9-
 2 -
t -
 -
 -
 -
 -
 -
 -
 -
r   r   c                        e Zd ZdZededdf fd            ZdedefdZ	e
j        dej        ddfd	            Z xZS )
SequenceParallelismPassa  
    This pass enables sequence parallelism for models.
    It identifies patterns where an AllReduce operation is followed by
    an RMSNorm (or RMSNorm and then Quantization) operation.
    These patterns are replaced with a ReduceScatter operation, followed by
    a local RMSNorm/Quantization, and then an AllGather operation.

    The general transformation is:
    Input -> AllReduce -> RMSNorm -> Output
    becomes
    Input -> ReduceScatter -> RMSNorm -> AllGather -> Output

    While this pass itself does not directly yield performance improvements,
    it lays the groundwork for subsequent fusion passes, such as
    GEMM + ReduceScatter and AllGather + GEMM fusions. These fusions can
    significantly reduce communication overhead and improve overall model
    performance.


    This pass splits up the residual tensor across TP ranks and hence divides its size.
    Because the pattern matcher starts at the end of the graph, the replacement
    contains a slice that temporarily conforms the input residual to the correct size.
    After all patterns have been matched, we use a NoOpEliminationPass to clean up
    what have now become no-op slices.

    Note that an older version of the pass did not need this as it operated only on
    custom rms_norm and fused_rms_norm_add custom ops which did not complain about
    mismatched shapes during replacement. So this approach has the same assumption that
    correctness is only maintained if all rms_norm operations are split across ranks.

    Correctness-wise, this is approach strictly better than before - before,
    the graph was incorrect semantically and shape-wise during the pass.
    With this approach there's only semantic incorrectness during the pass.
    Both approaches restore a correct graph once all patterns are matched.
    configr   Nc                    t                                          |           t          |          | _        | j         d| j        j         | j        _        t          d          | _        dD ]}t          || j        | j	                  
                    | j                   t          || j        | j	                  
                    | j                   t          || j        | j	                  
                    | j                   t          || j        | j	                  
                    | j                   |                     || j                   d S )N.sequence_parallelism_pass)	pass_name)gh㈵>gư>)rK   r/   r   noop_cleanupr   r   patternsrz   model_dtyper)   rh   r   rI   rn   dump_patterns)r.   r   r'   rN   s      r   r/   z SequenceParallelismPass.__init__4  sG       077)-&W&W$:K:U&W&W#,>1-
 -
 -
 $ 	& 	&G1)4; ht}%%%2)4; ht}%%% ))4; ht}%%%))4; ht}%%%%64=11111r   compile_rangec                     | j         j        r| j         j        rdS t                      }|                                o|j        |z  dk    }|S )NTr   )compilation_configsplitting_opsuse_inductor_graph_partitionr   is_single_sizeend)r.   r   r-   results       r   is_applicable_for_rangez/SequenceParallelismPass.is_applicable_for_rangeU  sb     '5	&C	 4688%4466 
'1, 	 r   graphc                     | j                             |          | _        t                              d| j                   |                     |           d S )NzReplaced %s patterns)r   applymatched_countloggerdebugr   )r.   r   s     r   __call__z SequenceParallelismPass.__call__n  sL    !]0077+T-?@@@%     r   )rB   rC   rD   rE   r   r   r/   r   boolr   r   time_and_logfxGraphr   rk   rl   s   @r   r   r     s        " "H 2z 2d 2 2 2 2 2 2@U t    2 "!bh !4 ! ! ! #"! ! ! ! !r   r   )4r    collections.abcr   r   typingr   r"   torch._inductor.pattern_matcher	_inductorpattern_matcherrd   torch.fxr   r   vllm.configr   vllm.config.utilsr   vllm.distributedr	   r
   vllm.distributed.parallel_stater   vllm.loggerr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.platformsr   inductor_passr   matcher_utilsr   r   r   noop_eliminationr   vllm_inductor_passr   r   rB   r   r#   r$   r&   rI   rn   	fp8_dtype	FP8_DTYPErz   r   r   r   r   r   <module>r      s       . . . . . . . .        , , , , , , , , ,       > > > > > > " " " " " " # # # # # # K K K K K K K K P P P P P P # # # # # #      , + + + + + + + + + + + R R R R R R R R R R 1 1 1 1 1 1 H H H H H H H H	X		hu|,,-c5<    
 
 
 
 
 
 
 
:!
 !
 !
 !
 !
#A !
 !
 !
H4
 4
 4
 4
 4
$B 4
 4
 4
n '&((	*
 *
 *
 *
 *
,J *
 *
 *
Z;
 ;
 ;
 ;
 ;
-K ;
 ;
 ;
|d! d! d! d! d!4 d! d! d! d! d!r   