
    -`i              !          U d dl mZ d dlmZ d dlZd dlmc mZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZ d dlmZ d dlmZ d dl m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z)m*Z*  ej+                    Z,dZ-edz  e.d<    ed          r!	 d dl/m0Z1  e2e1d          re1Z-n# e3$ r Y nw xY w ee4          Z5 e2ej6        j7        d          rej6        j7        j8        j9        Z: G d d          Z; G d de;          Z< G d de;          Z= G d de;          Z> G d d e;          Z? G d! d"e;          Z@ G d# d$e;          ZA G d% d&e*          ZBd'd(d)d*d'd+dd*d,ZCeDeEeDeEeFf         f         e.d-<   d+d(d)d*d+d.dd*d,ZGeDeEeDeEeFf         f         e.d/<   e-daHd0ZI	 	 	 	 dVd1ejJ        d2ejJ        d3ejJ        d4eFd5eEd6eEd7eKd8eKd9eKd:eEd;eEd<ejJ        dz  d=ejJ        dz  d>ejJ        dz  d?ejJ        dz  d@df dAZL	 	 	 	 dVd1ejJ        d2ejJ        d3ejJ        d4eFd5eEd6eEd7eKd8eKd9eKd:eEd;eEd<ejJ        dz  d=ejJ        dz  d>ejJ        dz  d?ejJ        dz  d@df dBZM e!dCeLg dDeME           ej6        jN        jO        j9        ZO G dF dG          ZP G dH dIe;          ZQ G dJ dKe;          ZR G dL dMe;          ZS G dN dOe;          ZT G dP dQe;          ZU G dR dSe;          ZV G dT dUe*          ZWdS )W    )	find_spec)
ModuleTypeN)auto_functionalized)PatternMatcherPass)enable_symm_mem_for_group)
VllmConfig)Range)get_tp_group tensor_model_parallel_all_reduce)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)init_logger)kFp8StaticTensorSym)current_platform)direct_register_custom_op   )enable_fake_mode)MatcherFusedAddRMSNormMatcherQuantFP8MatcherRMSNorm)VllmInductorPassVllmPatternMatcherPassflashinfer_comm
flashinfertrtllm_allreduce_fusionscaled_fp4_quantc                   2    e Zd Zdej        dedz  ddfdZdS )BasePatterndtypedeviceNreturnc                 n    || _         || _        t                      | _        t	                      | _        d S N)r   r    r
   tpr   tp_size)selfr   r    s      v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/compilation/collective_fusion.py__init__zBasePattern.__init__2   s,    
..;==    )__name__
__module____qualname__torchr   strr(    r)   r'   r   r   1   sC        >ek >3: >$ > > > > > >r)   r   c                   @    e Zd Zdeej                 fdZdeddfdZdS )GEMMReduceScatterPatternr!   c                     t          j        ddg| j        | j                  }t          j        ddg| j        | j                  }||gS )N      r    r   r-   emptyr    r   )r&   mul	mm_weights      r'   
get_inputsz#GEMMReduceScatterPattern.get_inputs:   sI    k2q'$+TZHHHKAt{$*MMM	Yr)   pm_passNc                     dt           j        dt           j        dt           j        f fd}dt           j        dt           j        dt           j        f fd}t          j        ||                                 t          j        |           d S )Nr8   r9   r!   c                     t           j        j        j                            | |          }t           j        j        j                            |dj        j        j	                  }|S Nr   dim
world_size
group_name)
r-   opsatenmmdefaultvllmreduce_scatterr%   r$   unique_name)r8   r9   rE   rH   r&   s       r'   patternz2GEMMReduceScatterPattern.register.<locals>.pattern@   s\    "**3	::B"Y^:BB<7.	 C  N "!r)   c                 x    t           j        j                            | |ddj        j        j                  }|S )Navgr   )scatter_dimrB   )r-   rC   symm_memfused_matmul_reduce_scatterr$   device_grouprB   )r8   r9   gemm_rsr&   s      r'   replacementz6GEMMReduceScatterPattern.register.<locals>.replacementJ   sA    i(DD7/: E  G Nr)   r-   Tensorpmregister_replacementr:   fwd_onlyr&   r;   rJ   rR   s   `   r'   registerz!GEMMReduceScatterPattern.register?   s    	" 	"%, 	"5< 	" 	" 	" 	" 	" 	"		U\ 		el 		u| 		 		 		 		 		 		 	[$//"3"3R['	
 	
 	
 	
 	
r)   	r*   r+   r,   listr-   rT   r:   r   rY   r/   r)   r'   r1   r1   9   sY         D.        

 2 
t 
 
 
 
 
 
r)   r1   c                   @    e Zd Zdeej                 fdZdeddfdZdS )AllGatherGEMMPatternr!   c                     t          j        ddg| j        | j                  }t          j        ddg| j        | j                  }||gS )Nr4   r5   r6   )r&   xweights      r'   r:   zAllGatherGEMMPattern.get_inputs[   sH    KAt{$*EEEaVDKtzJJJ6{r)   r;   Nc                     dt           j        dt           j        dt           j        f fd}dt           j        dt           j        dt           j        f fd}t          j        ||                                 t          j        |           d S )Nr_   r`   r!   c                     t           j        j        j                            | dj        j        j                  }t           j        j        j	                            ||          S r>   )
r-   rC   rG   
all_gatherrF   r%   r$   rI   rD   rE   )r_   r`   rc   r&   s      r'   rJ   z.AllGatherGEMMPattern.register.<locals>.patternb   sX     2::<7.	 ;  J 9>$,,Z@@@r)   c                 ~    t           j        j                            | |gdj        j        j                  \  }}|S )Nr   )
gather_dimrB   )r-   rC   rN   fused_all_gather_matmulr$   rP   rB   )r_   r`   	ag_output
mm_outputsr&   s       r'   rR   z2AllGatherGEMMPattern.register.<locals>.replacemento   sF    $)I$6$N$N7/:	 %O % %!Iz r)   rS   rX   s   `   r'   rY   zAllGatherGEMMPattern.registera   s    	A|	AL	A \	A 	A 	A 	A 	A 	A	5< 	 	%, 	 	 	 	 	 	 	[$//"3"3R['	
 	
 	
 	
 	
r)   rZ   r/   r)   r'   r]   r]   Z   sY        D.    
 2 
t 
 
 
 
 
 
r)   r]   c                   @    e Zd Zdeej                 fdZdeddfdZdS )ScaledMMReduceScatterPatternr!   c                    t          j        ddg| j        t                    }t          j        ddg| j        t                                                                        dd          }t          j        ddg| j        t           j                  }t          j        ddg| j        t           j                  }||||gS Nr3   r5   r   r   )r-   r7   r    	FP8_DTYPE
contiguous	transposefloat32)r&   inputr9   scale_ascale_bs        r'   r:   z'ScaledMMReduceScatterPattern.get_inputs~   s    RHT[	JJJKRIFFFZ\\Yq!__ 	
 +r1gdkOOO+q"gdkOOOy'733r)   r;   Nc           
      x    dt           j        dt           j        dt           j        dt           j        dt           j        f
 fd}dt           j        dt           j        dt           j        dt           j        dt           j        f
 fd}t          j        ||                                 t          j        |           d S )Nrq   mat2rr   rs   r!   c           	          t           j        j        j                            | |||d d j                  }t           j        j        j                            |dj        j	        j
                  }|S )Nru   rr   rs   biasscale_result	out_dtyper   r?   )r-   rC   rD   
_scaled_mmrF   r   rG   rH   r%   r$   rI   )rq   ru   rr   rs   	scaled_mmrH   r&   s         r'   rJ   z6ScaledMMReduceScatterPattern.register.<locals>.pattern   s|     	199!* :  I #Y^:BB<7.	 C  N "!r)   c                     g | j         d d         |j         d         }d}t          j        j                            | |||d||j        j        j        |d d j        d          }|S Nr   r   rL   F	shaper-   rC   rG   *patched_fused_scaled_matmul_reduce_scatterr$   rP   rB   r   )rq   ru   rr   rs   output_shaperM   rQ   r&   s          r'   rR   z:ScaledMMReduceScatterPattern.register.<locals>.replacement   s{     >U["-=tz!}=LKinOO$/
 G  Nr)   rS   rX   s   `   r'   rY   z%ScaledMMReduceScatterPattern.register   s    	"<	",	" \	" \		"
 \	" 	" 	" 	" 	" 	".	<	,	 \	 \		
 \	 	 	 	 	 	6 	[$//"3"3R['	
 	
 	
 	
 	
r)   rZ   r/   r)   r'   rj   rj   }   sY        	4D. 	4 	4 	4 	45
 2 5
t 5
 5
 5
 5
 5
 5
r)   rj   c                   @    e Zd Zdeej                 fdZdeddfdZdS )AllGatherScaledMMPatternr!   c                    t          j        ddg| j        t                    }t          j        ddg| j        t                                                                        dd          }|j        d         | j        z  }t          j        |dg| j        t           j                  }t          j        ddg| j        t           j                  }||||gS N   r3   r5   r   r   )	r-   r7   r    rm   rn   ro   r   r%   rp   )r&   r_   r`   s1rr   rs   s         r'   r:   z#AllGatherScaledMMPattern.get_inputs   s    KB9EEEKRIFFFZ\\Yq!__ 	 WQZ$,&+r1gdkOOO+q"gdkOOO67G,,r)   r;   Nc           
      x    dt           j        dt           j        dt           j        dt           j        dt           j        f
 fd}dt           j        dt           j        dt           j        dt           j        dt           j        f
 fd}t          j        ||                                 t          j        |           d S )Nr_   r`   rr   rs   r!   c           	          t           j        j        j                            | dj        j        j                  }t           j        j        j	                            ||||d d j
                  S )Nr   r?   rw   )r-   rC   rG   rc   rF   r%   r$   rI   rD   r{   r   )r_   r`   rr   rs   rc   r&   s        r'   rJ   z2AllGatherScaledMMPattern.register.<locals>.pattern   ss     2::qT\dg>Q ;  J 9>,44!* 5   r)   c                     t           j        j                            | |g||gdd gd gj        gdgj        j        j        
  
        \  }}|S Nr   F)re   biasesresult_scales
out_dtypesuse_fast_accumrB   r-   rC   rN   fused_all_gather_scaled_matmulr   r$   rP   rB   )r_   r`   rr   rs   rg   rh   r&   s         r'   rR   z6AllGatherScaledMMPattern.register.<locals>.replacement   sf     %*I$6$U$U	v#f J< %w7/: %V % %!Iz r)   rS   rX   s   `   r'   rY   z!AllGatherScaledMMPattern.register   s    	|	L	 \	 \		
 \	 	 	 	 	 	(	|	L	 \	 \		
 \	 	 	 	 	 	( 	[$//"3"3R['	
 	
 	
 	
 	
r)   rZ   r/   r)   r'   r   r      sY        -D. - - - -+
 2 +
t +
 +
 +
 +
 +
 +
r)   r   c                   @    e Zd Zdeej                 fdZdeddfdZdS )#CutlassScaledMMReduceScatterPatternr!   c                    t          j        ddg| j        t                    }t          j        ddg| j        t                                                                        dd          }t          j        ddg| j        t           j                  }t          j        ddg| j        t           j                  }t          j        ddg| j        | j                  }|||||gS rl   )r-   r7   r    rm   rn   ro   rp   r   )r&   rq   r9   rr   rs   cutlass_mm_outputs         r'   r:   z.CutlassScaledMMReduceScatterPattern.get_inputs   s    RHT[	JJJKRIFFFZ\\Yq!__ 	
 +r1gdkOOO+q"gdkOOO!KRDJWWWy'74EFFr)   r;   Nc                     dt           j        dt           j        dt           j        dt           j        dt           j        dt           j        f fd}dt           j        dt           j        dt           j        dt           j        dt           j        dt           j        f fd	}t          j        ||                                 t          j        |           d S )
Nrq   r`   rr   rs   r   r!   c           	         t           j        j                            t           j        j        j        j        || |||d           }t           j        j        j                            |d         dj	        j
        j                  }|S )Noutaba_scalesb_scalesrx   r   r   r?   )r-   rC   higher_orderr   _Ccutlass_scaled_mmrF   rG   rH   r%   r$   rI   )rq   r`   rr   rs   r   r   rH   r&   s          r'   rJ   z=CutlassScaledMMReduceScatterPattern.register.<locals>.pattern  s     !&	 6 J J	.6%   !K ! ! #Y^:BB!!$<7.	 C  N "!r)   ru   c                     g | j         d d         |j         d         }d}t          j        j                            | |||d||j        j        j        |d d j        d          }|S r~   r   )	rq   ru   rr   rs   r   r   rM   rQ   r&   s	           r'   rR   zACutlassScaledMMReduceScatterPattern.register.<locals>.replacement'  s{     >U["-=tz!}=LKinOO$/
 G  Nr)   rS   rX   s   `   r'   rY   z,CutlassScaledMMReduceScatterPattern.register  s    	"<	"L	" \	" \		"
  %|	" \	" 	" 	" 	" 	" 	"2	<	,	 \	 \		
  %|	 \	 	 	 	 	 	8 	[$//"3"3R['	
 	
 	
 	
 	
r)   rZ   r/   r)   r'   r   r      s^        GD. G G G G8
 2 8
t 8
 8
 8
 8
 8
 8
r)   r   c                   @    e Zd Zdeej                 fdZdeddfdZdS )AllGatherCutlassScaledMMPatternr!   c                    t          j        ddg| j        t                    }t          j        ddg| j        t                                                                        dd          }|j        d         | j        z  }t          j        |dg| j        t           j                  }t          j        ddg| j        t           j                  }|j        d         }t          j        ||g| j        | j	                  }|||||gS r   )
r-   r7   r    rm   rn   ro   r   r%   rp   r   )r&   r_   r`   r   rr   rs   s2outputs           r'   r:   z*AllGatherCutlassScaledMMPattern.get_inputsI  s    KB9EEEKRIFFFZ\\Yq!__ 	 WQZ$,&+r1gdkOOO+q"gdkOOO\!_b"XdkLLL67GV44r)   r;   Nc                     dt           j        dt           j        dt           j        dt           j        dt           j        dt           j        f fd}dt           j        dt           j        dt           j        dt           j        dt           j        dt           j        f fd}t          j        ||                                 t          j        |           d S )	Nr_   r`   rr   rs   r   r!   c           	         t           j        j        j                            | dj        j        j                  }t           j        j        	                    t           j        j
        j        j        |||||d           }|d         S )Nr   r?   r   r   )r-   rC   rG   rc   rF   r%   r$   rI   r   r   r   r   )r_   r`   rr   rs   r   rc   r   r&   s          r'   rJ   z9AllGatherCutlassScaledMMPattern.register.<locals>.pattern\  s     2::qT\dg>Q ;  J !&	 6 J J	.6   !K ! ! %Q''r)   c                     t           j        j                            | |g||gdd gd gj        gdgj        j        j        
  
        \  }}|S r   r   )r_   r`   rr   rs   r   rg   rh   r&   s          r'   rR   z=AllGatherCutlassScaledMMPattern.register.<locals>.replacementr  sf     %*I$6$U$U	v#f J< %w7/: %V % %!Iz r)   rS   rX   s   `   r'   rY   z(AllGatherCutlassScaledMMPattern.register[  s    	(|	(L	( \	( \		(
 L	( \	( 	( 	( 	( 	( 	(,	|	L	 \	 \		
 L	 \	 	 	 	 	 	* 	[$//"3"3R['	
 	
 	
 	
 	
r)   rZ   r/   r)   r'   r   r   H  sY        5D. 5 5 5 5$.
 2 .
t .
 .
 .
 .
 .
 .
r)   r   c                   |     e Zd Zededdf fd            ZdedefdZe	j
        dej        ddfd            Z xZS )	AsyncTPPassconfigr!   Nc                 v   t                                          |           t          t                      j        j                   t          d          | _        t          | j	        | j
                                      | j                   t          | j	        | j
                                      | j                   | j	        t          j        k    rt          | j	        | j
                                      | j                   t!          | j	        | j
                                      | j                   t#          | j	        | j
                                      | j                   t%          | j	        | j
                                      | j                   |                     || j                   d S )Nasync_tp_pass	pass_name)superr(   r   r
   rP   rB   r   patternsr1   model_dtyper    rY   r]   r-   bfloat16rj   r   r   r   dump_patterns)r&   r   	__class__s     r'   r(   zAsyncTPPass.__init__  s       	",.."="HIII,>%-
 -
 -
 	!!14;??HHWWWT-t{;;DDT]SSS
 u~--()94;GGPP   %T%5t{CCLL   00@$+NNWW   ,D,<dkJJSS   	64=11111r)   compile_rangec                     | j         j        r| j         j        rdS t                      }t	          |                                o|j        |z  dk              S )NTr   )compilation_configsplitting_opsuse_inductor_graph_partitionr   boolis_single_sizeend)r&   r   r%   s      r'   is_applicable_for_rangez#AsyncTPPass.is_applicable_for_range  s_    
 '5	&C	 4688M0022W}7H77RVW7WXXXr)   graphc                     | j                             |          | _        t                              d| j                   d S )NReplaced %s patterns)r   applymatched_countloggerdebugr&   r   s     r'   __call__zAsyncTPPass.__call__  s8    !]0077+T-?@@@@@r)   )r*   r+   r,   r   r   r(   r	   r   r   r   time_and_logfxGraphr   __classcell__r   s   @r'   r   r     s        2z 2d 2 2 2 2 2 2@
YU 
Yt 
Y 
Y 
Y 
Y "Abh A4 A A A #"A A A A Ar)   r   @      g      ?)r   r4   r       )Z   d   FI_ALLREDUCE_FUSION_MAX_SIZE_MBr4   #_FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB   allreduce_inresidual	rms_gammarms_eps
world_rankrA   launch_with_pdltrigger_completion_at_endfp32_accmax_token_numpattern_codenorm_out	quant_out	scale_outscale_factorr!   c                 X   | j         \  }}|                                 }||z  |z  }|	|z  |z  }||k    sJ d| d|	 d| d|             t          j                    }||                                nd }t
                              |i                               |d           }|d u p||t          z  k    }t          
J d            || }|}n| }t          
                    | | j         d         |||||||| j         d         t          |||||
d ||t          j        j        |           d S )	NzCurrent tensor size z is larger than max token num z * hidden size z * element size z0Flashinfer must be enabled when using flashinferr   r   )r   	token_numresidual_inresidual_outr   r   r   r   rA   
hidden_dimworkspace_ptrsr   use_oneshotr   r   r   allreduce_outr   r   layout_coder   )r   element_sizer   get_device_capabilityto_intr   getMiB_FI_WORKSPACE_TENSORr   r   QuantizationSFLayoutSWIZZLED_128x4)r   r   r   r   r   rA   r   r   r   r   r   r   r   r   r   
num_tokenshidden_sizer   current_tensor_sizemax_tensor_sizecurr_devicedevice_capabilitymax_one_shot_sizer   r   s                            r'    call_trtllm_fused_allreduce_normr     s   " #/"4
K#0022(;6E'+5D"o555+#6 + +*+ +;F+ +(+ + 655
 '<>>4?4KK..000QU @CC
 
 #j$

 	 %W)<@QTW@W)W 	 $//> 0// #H#LL
 (L 	//%"(+ %!!#)"-/+#&?%'<K%- 	0 	
 	
 	
 	
 	
r)   c                     d S r#   r/   )r   r   r   r   r   rA   r   r   r   r   r   r   r   r   r   s                  r'   %call_trtllm_fused_allreduce_norm_faker   1  s	    " 	r)   &flashinfer_trtllm_fused_allreduce_norm)r   r   r   r   r   )op_nameop_funcmutates_args	fake_implc                   V    e Zd ZdZ	 	 ddededededd	f
d
Zdeeeez  f         fdZ	d	S )FlashInferFusedAllReduceParamsz5Parameters for FlashInfer fused allreduce operations.F   rankrA   use_fp32_lamportr   r!   Nc                 h    || _         || _        || _        d| _        d| _        d| _        || _        d S )NT)r	  rA   r
  r   r   r   r   )r&   r	  rA   r
  r   s        r'   r(   z'FlashInferFusedAllReduceParams.__init__X  s?     	$ 0)-&#*r)   c                 P    | j         | j        | j        | j        | j        | j        dS )N)r   rA   r   r   r   r   )r	  rA   r   r   r   r   r&   s    r'   !get_trtllm_fused_allreduce_kwargsz@FlashInferFusedAllReduceParams.get_trtllm_fused_allreduce_kwargsg  s2    )/#3)-)G!/
 
 	
r)   )Fr  )
r*   r+   r,   __doc__intr   r(   dictr.   r  r/   r)   r'   r  r  U  s        ?? "'!+ ++ + 	+
 + 
+ + + +
4TCZ3H 
 
 
 
 
 
r)   r  c            
       z     e Zd ZdZdedej        dedz  deddf
 fdZ	de
ej                 fd	Zd
eddfdZ xZS )AllReduceRMSNormPatternz
    This pattern replaces the allreduce + rms norm (without residual)
    with fused flashinfer implementation.
    Applies to allreduce + rmsnorm before attn in the first Transformer block.
    epsilonr   r    Nallreduce_paramsr!   c                     t                                          ||           || _        || _        t	          |          | _        d S r#   r   r(   r  r  r   rmsnorm_matcherr&   r  r   r    r  r   s        r'   r(   z AllReduceRMSNormPattern.__init__y  D     	''' 0-g66r)   c                 r    | j                                         \  }}|                    | j                  |gS r#   r  inputstor   )r&   rq   r`   s      r'   r:   z"AllReduceRMSNormPattern.get_inputs  s5    ,3355v $$f--r)   r;   c                 t    dt           j        dt           j        dt          t           j        t           j        f         f fd}dt           j        dt           j        dt          t           j        t           j        f         f fd}t          j        ||                                 t          j        |           d S )Nrq   r`   r!   c                 V    t          |           }                    ||          }||fS r#   r   r  )rq   r`   allreduce_outputrmsr&   s       r'   rJ   z1AllReduceRMSNormPattern.register.<locals>.pattern  s6      @FF&&'7@@C(((r)   c                 "   t          j        |           }t          j        |           }t          
J d            t	          t
          f| ||d d |j        t          j        j        dj	        
                                }|d         |d         fS )NFlashInfer must be enabledr   r   r   r   r   r   r   r      r   )r-   
zeros_like
empty_liker   r   r  r  AllReduceFusionPatternkARResidualRMSNormr  r  )rq   r`   r   
rms_result	allreducer&   s        r'   rR   z5AllReduceRMSNormPattern.register.<locals>.replacement  s     '..H)%00J"..0L...+6"!# ,CV  'IIKK I Q<1--r)   r-   rT   tuplerU   rV   r:   rW   rX   s   `   r'   rY   z AllReduceRMSNormPattern.register  s    	)<	)).	)5<-.	) 	) 	) 	) 	) 	)	.<	.).	.5<-.	. 	. 	. 	. 	. 	.* 	[$//"3"3R['	
 	
 	
 	
 	
r)   r*   r+   r,   r  floatr-   r   r.   r  r(   r[   rT   r:   r   rY   r   r   s   @r'   r  r  r  s         
7
7 {
7 d
	
7
 9
7 

7 
7 
7 
7 
7 
7.D. . . . . 
 2  
t  
  
  
  
  
  
  
  
r)   r  c            
       z     e Zd ZdZdedej        dedz  deddf
 fdZ	de
ej                 fd	Zd
eddfdZ xZS )AllReduceFusedAddRMSNormPatternz
    This pattern replaces the allreduce + rms norm (with residual)
    with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn and mlp + rmsnorm before attn.
    r  r   r    Nr  r!   c                     t                                          ||           || _        || _        t	          |          | _        d S r#   r   r(   r  r  r   r  r  s        r'   r(   z(AllReduceFusedAddRMSNormPattern.__init__  D     	''' 05g>>r)   c                 v    | j                                         \  }}}||                    | j                  |gS r#   r  )r&   rq   r   r`   s       r'   r:   z*AllReduceFusedAddRMSNormPattern.get_inputs  s:    "&"6"="="?"?x %((4:..77r)   r;   c           
      6    dt           j        dt           j        dt           j        dt          t           j        t           j        f         f fd}dt           j        dt           j        dt           j        dt          t           j        t           j        f         f fd}t          j        ||                                 t          j        |           d }t          j         ||           ||                                           t          j        |           d S )Nr   rq   r`   r!   c                 ^    t          |          }                    |||           \  }} || fS r#   r!  )r   rq   r`   r"  r#  r&   s        r'   rJ   z9AllReduceFusedAddRMSNormPattern.register.<locals>.pattern  s:      @FF 001A68TTMC= r)   c                     t           
J d            t          t          f|| d d d |j        t           j        j        dj                                        }|d         |d         fS )Nr%  r&  r   r   )r   r   r  r  r*  r+  r  r  )r   rq   r`   r-  r&   s       r'   rR   z=AllReduceFusedAddRMSNormPattern.register.<locals>.replacement  s     #..0L...+6"! ,CV  'IIKK I Q<1--r)   c                       fdS )Nc                 *     | ||          d         S )Nr   r/   )r   r   cfns      r'   <lambda>zLAllReduceFusedAddRMSNormPattern.register.<locals>.<lambda>.<locals>.<lambda>  s    rr!Q{{1~ r)   r/   )r>  s   `r'   r?  z:AllReduceFusedAddRMSNormPattern.register.<locals>.<lambda>  s    'E'E'E'E r)   r.  )r&   r;   rJ   rR   first_return_onlys   `    r'   rY   z(AllReduceFusedAddRMSNormPattern.register  s1   	!l	!+0<	!AF	!5<-.	! 	! 	! 	! 	! 	!	.l	.+0<	.AF	.5<-.	. 	. 	. 	. 	. 	.& 	[$//"3"3R['	
 	
 	
 FE
g&&k**OOK	
 	
 	
 	
 	
r)   r0  r   s   @r'   r3  r3    s         
?
? {
? d
	
?
 9
? 

? 
? 
? 
? 
? 
?8D. 8 8 8 8)
 2 )
t )
 )
 )
 )
 )
 )
 )
 )
r)   r3  c            
       z     e Zd ZdZdedej        dedz  deddf
 fdZ	de
ej                 fd	Zd
eddfdZ xZS )*AllReduceFusedRMSNormStaticQuantFP8Patternz
    This pattern replaces the allreduce + rms norm (without residual)
    + static fp8 quant with fused flashinfer implementation.
    Applies to allreduce + rmsnorm + quant before attn
    in the first Transformer block.
    r  r   r    Nr  r!   c                     t                                          ||           || _        || _        t          j        | _        t          |          | _        t          t                    | _        d S r#   )r   r(   r  r  r-   float8_e4m3fnquant_dtyper   r  r   r   quant_matcherr  s        r'   r(   z3AllReduceFusedRMSNormStaticQuantFP8Pattern.__init__  s`     	''' 0 .-g66,-@AAr)   c                     | j                                         \  }}| j                                        \  }}|                    | j                  ||gS r#   r  r  rF  r  r   )r&   rq   r`   _scales        r'   r:   z5AllReduceFusedRMSNormStaticQuantFP8Pattern.get_inputs	  sO    ,3355v%,,..5 $$fe44r)   r;   c           
          dt           j        dt           j        dt           j        dt          t           j        t           j        f         f fd}dt           j        dt           j        dt           j        dt          t           j        t           j        f         f fd}t          j        ||                                 t          j        |           d S )Nrq   r`   rJ  r!   c                     t          |           }                    ||          }                    ||          \  }}||fS r#   r   r  rF  )rq   r`   rJ  
all_reducer#  quantrI  r&   s          r'   rJ   zDAllReduceFusedRMSNormStaticQuantFP8Pattern.register.<locals>.pattern  sK    
 :%@@J&&z6::C))#u55HE1*$$r)   c                 Z   t          j        |           }t          j        |           }t          j        | j                  }t          
J d            t          t          f| |||d |j        t          j        j	        |d	j
                                        }|d         |d         fS )Nr   r%  	r   r   r   r   r   r   r   r   r   r4   r   )r-   r(  r)  rE  r   r   r  r  r*  kARResidualRMSNormFP8Quantr  r  )rq   r`   rJ  r   
result_rmsresult_quantr-  r&   s          r'   rR   zHAllReduceFusedRMSNormStaticQuantFP8Pattern.register.<locals>.replacement  s     '..H)%00J +E9IJJJL"..0L...+6"!#&  $:U"  'IIKK I$ Q<1--r)   r.  rX   s   `   r'   rY   z3AllReduceFusedRMSNormStaticQuantFP8Pattern.register  s    	%<	%L	% <	% 5<-.		% 	% 	% 	% 	% 	%	.<	.).	.>Cl	.5<-.	. 	. 	. 	. 	. 	.6 	[$//"3"3R['	
 	
 	
 	
 	
r)   r0  r   s   @r'   rB  rB    s         BB {B d
	B
 9B 
B B B B B B5D. 5 5 5 5(
 2 (
t (
 (
 (
 (
 (
 (
 (
 (
r)   rB  c            
       z     e Zd ZdZdedej        dedz  deddf
 fdZ	de
ej                 fd	Zd
eddfdZ xZS )-AllReduceFusedAddRMSNormStaticQuantFP8Patternz
    This pattern replaces the allreduce + rms norm (with residual)
    + static fp8 quant with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn + quant and
    mlp + rmsnorm + quant before attn.
    r  r   r    Nr  r!   c                     t                                          ||           || _        || _        t          j        | _        t          |          | _        t          t                    | _        d S r#   )r   r(   r  r  r-   rD  rE  r   r  r   r   rF  r  s        r'   r(   z6AllReduceFusedAddRMSNormStaticQuantFP8Pattern.__init__C  s`     	''' 0 .5g>>,-@AAr)   c                     | j                                         \  }}}| j                                        \  }}||                    | j                  ||gS r#   rH  )r&   rq   r   r`   rI  rJ  s         r'   r:   z8AllReduceFusedAddRMSNormStaticQuantFP8Pattern.get_inputsR  sT    "&"6"="="?"?x%,,..5 %((4:..>>r)   r;   c                     dt           j        dt           j        dt           j        dt           j        dt          t           j        t           j        f         f
 fd}dt           j        dt           j        dt           j        dt           j        dt          t           j        t           j        f         f
 fd}t          j        ||                                 t          j        |           d S )Nr   rq   r`   rJ  r!   c                     t          |          }	                    |||           \  }}	                    ||          \  }}||fS r#   rM  )
r   rq   r`   rJ  r"  r#  resrO  rI  r&   s
            r'   rJ   zGAllReduceFusedAddRMSNormStaticQuantFP8Pattern.register.<locals>.patternZ  sS      @FF++,<fhOOHC))#u55HE1#:r)   c                 
   t          j        |j                  }t          
J d            t	          t
          f|| d |d |j        t          j        j        |d	j	        
                                }|d         |d         fS )NrQ  r%  rR  r4   r   )r-   r)  rE  r   r   r  r  r*  rS  r  r  )r   rq   r`   rJ  rU  r-  r&   s         r'   rR   zKAllReduceFusedAddRMSNormStaticQuantFP8Pattern.register.<locals>.replacementf  s     !+E9IJJJL"..0L...+6"!&  $:U"  'IIKK I" Q<1--r)   r.  rX   s   `   r'   rY   z6AllReduceFusedAddRMSNormStaticQuantFP8Pattern.registerY  s    
	l
	<
	 L
	 <	
	
 5<-.
	 
	 
	 
	 
	 
		.l	.<	. L	. <		.
 5<-.	. 	. 	. 	. 	. 	.6 	[$//"3"3R['	
 	
 	
 	
 	
r)   r0  r   s   @r'   rW  rW  ;  s         BB {B d
	B
 9B 
B B B B B B?D. ? ? ? ?*
 2 *
t *
 *
 *
 *
 *
 *
 *
 *
r)   rW  c            
       z     e Zd ZdZdedej        dedz  deddf
 fdZ	de
ej                 fd	Zd
eddfdZ xZS ),AllReduceFusedRMSNormStaticQuantNVFP4Patternz
    This pattern replaces the allreduce + rms norm (without residual)
    + static nvfp4 quant with fused flashinfer implementation.
    Applies to allreduce + rmsnorm + quant before attn
    in the first Transformer block.
    r  r   r    Nr  r!   c                     t                                          ||           || _        || _        t	          |          | _        d S r#   r  r  s        r'   r(   z5AllReduceFusedRMSNormStaticQuantNVFP4Pattern.__init__  r  r)   c                    t          j        g d| j        | j                  }t          j        d| j        t           j                  }t          j        ddg| j        t           j                  }t          j        dg| j        | j                  }t          j        ddg| j        t           j                  }|||||gS )N)r   r3   r3   r5   r3   r   r   r3      r4   r-   r7   r    r   uint8rp   int32)r&   rq   quant_resultinput_global_scaler`   output_scales         r'   r:   z7AllReduceFusedRMSNormStaticQuantNVFP4Pattern.get_inputs  s    KKK4:NNN{74;ekRRR"[F4;em
 
 
 bT$+TZHHH{C8DKu{SSS|V-?NNr)   r;   c                 0    dt           j        dt           j        dt           j        dt           j        dt           j        dt          t           j        t           j        t           j        f         f fd}dt           j        dt           j        dt           j        dt           j        dt           j        dt          t           j        t           j        t           j        f         f fd}t          j        ||                                 t          j        |           d S )	Nrq   rg  r`   rh  ri  r!   c                     t          |           }                    ||          }t          t          ||||d          }|d         ||d         fS NT)r   rq   ri  input_scaleis_sf_swizzled_layoutr   r   r   r  r   STATIC_FP4_QUANT_OP)	rq   rg  r`   rh  ri  rN  r#  quant_out_tupler&   s	           r'   rJ   zFAllReduceFusedRMSNormStaticQuantNVFP4Pattern.register.<locals>.pattern  sf     :%@@J&&z6::C1##).&*  O #1%z?13EEEr)   c                 2   t          j        |           }t          j        |           }t          
J d            t	          t
          f| |||||j        t          j        j        |d	j	        
                                }|d         |d         |d         fS )Nr%  rR  r4   r      )r-   r(  r)  r   r   r  r  r*  kARResidualRMSNormFP4Quantr  r  )	rq   rg  r`   rh  ri  r   rT  r-  r&   s	           r'   rR   zJAllReduceFusedRMSNormStaticQuantNVFP4Pattern.register.<locals>.replacement  s     '..H)%00J"..0L...+6"!#&&  $:U/  'IIKK I$ Q<1y|;;r)   r.  rX   s   `   r'   rY   z5AllReduceFusedRMSNormStaticQuantNVFP4Pattern.register  s+   	F<	F,	F L	F !&		F
  ,	F 5<u|;<	F 	F 	F 	F 	F 	F*	<<	<,	< L	< !&		<
  ,	< 5<u|;<	< 	< 	< 	< 	< 	<< 	[$//"3"3R['	
 	
 	
 	
 	
r)   r0  r   s   @r'   r_  r_    s         
7
7 {
7 d
	
7
 9
7 

7 
7 
7 
7 
7 
7	OD. 	O 	O 	O 	O6
 2 6
t 6
 6
 6
 6
 6
 6
 6
 6
r)   r_  c            
       z     e Zd ZdZdedej        dedz  deddf
 fdZ	de
ej                 fd	Zd
eddfdZ xZS )/AllReduceFusedAddRMSNormStaticQuantNVFP4Patternz
    This pattern replaces the allreduce + rms norm (with residual)
    + static nvfp4 quant with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn + quant and
    mlp + rmsnorm + quant before attn.
    r  r   r    Nr  r!   c                     t                                          ||           || _        || _        t	          |          | _        d S r#   r5  r  s        r'   r(   z8AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.__init__  r6  r)   c                    t          j        ddg| j        | j                  }t          j        ddg| j        | j                  }t          j        ddg| j        | j                  }t          j        d| j        t           j                  }t          j        ddg| j        t           j                  }t          j        ddg| j        t           j                  }||||||gS )Nr3   r5   rb  r   rc  r4   rd  )r&   rq   r   r`   rg  rh  ri  s          r'   r:   z:AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.get_inputs  s    RHT[
KKK;Bx4:NNNb"XdkLLL{74;ekRRR"[F4;em
 
 
 {C8DKu{SSS 
 	
r)   r;   c                 `    dt           j        dt           j        dt           j        dt           j        dt           j        dt           j        dt          t           j        t           j        t           j        f         f fd}dt           j        dt           j        dt           j        dt           j        dt           j        dt           j        dt          t           j        t           j        t           j        f         f fd	}t          j        ||                                 t          j        |           d S )
Nrg  r   rq   ri  r`   rh  r!   c                     t          |          }	                    |||          \  }}t          t          | |||d          }|d         ||d         fS rl  ro  )
rg  r   rq   ri  r`   rh  r"  r#  rq  r&   s
            r'   rJ   zIAllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.register.<locals>.pattern  so      @FF 001A68TTMC1##).&*  O #1%x1CCCr)   c                     t           
J d            t          t          f||d | ||j        t           j        j        |d	j                                        }|d         |d         |d         fS )Nr%  rR  r4   r   rs  )r   r   r  r  r*  rt  r  r  )rg  r   rq   ri  r`   rh  r-  r&   s          r'   rR   zMAllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.register.<locals>.replacement  s     #..0L...+6"!&&  $:U/  'IIKK I" Q<1y|;;r)   r.  rX   s   `   r'   rY   z8AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.register  sD   	D,	Dl	D <	D  ,		D
 L	D !&	D 5<u|;<	D 	D 	D 	D 	D 	D,	<,	<l	< <	<  ,		<
 L	< !&	< 5<u|;<	< 	< 	< 	< 	< 	<8 	[$//"3"3R['	
 	
 	
 	
 	
r)   r0  r   s   @r'   rv  rv    s         
?
? {
? d
	
?
 9
? 

? 
? 
? 
? 
? 
?
D. 
 
 
 
(5
 2 5
t 5
 5
 5
 5
 5
 5
 5
 5
r)   rv  c                        e Zd Zdeddf fdZedd            ZdedefdZ	e
j        dej        ddfd	            Zdd
Z xZS )AllReduceFusionPassr   r!   Nc                    t                                          |           d| _        t                      | _        | j        dk    rt
                              d           d S t          d          | _        |j	        t
                              d           d S |j	        
                                | _        t                      j        | _        t                      }| j        t"          j        k    }t&          t
                              d           d S |j        j                            | j                  }|"t
                              d| j                   d S |rd	nd
}|| j        |z  z  | _        t3          | j        |j        j                  | _        t
                              d|dz   d| j         d           t&                              || j        | j        | j        | j        |          \  | _        }|atA          || j        || j                  | _!        | "                                 | #                    || j                   d S )NTr   z3AllReduce fusion pass is disabled for tp_size <= 1.all_reduce_fusion_passr   z;AllReduce fusion pass is disabled for missing model_config.zTFlashinfer is not installed or comm module not found, skipping allreduce fusion passzZFlashinfer allreduce fusion is not supported for world size %s or max size is not providedr4   r   zFlashinfer max size: r   zB MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: global)scope)tp_rankr%   r   r   groupr
  )r	  rA   r
  r   )$r   r(   disabledr   r%   r   warning_oncer   r   model_configget_hidden_sizer   r
   rP   r  r   r   r-   rp   r   warningr   pass_configflashinfer_max_sizer   minscheduler_configmax_num_batched_tokens
debug_once1trtllm_create_ipc_workspace_for_all_reduce_fusionipc_handlesr   r  r  register_patternsr   )r&   r   r	  r
  max_sizer   workspace_tensorr   s          r'   r(   zAllReduceFusionPass.__init__?  s      ;==<1 UVVVF,>.-
 -
 -
 &M   F -==??!^^0
-//+u}<"NN1   F,8LLL
 
 NN/  
 F,3qq!%$/L*HI ! 7 N
 
 	AH$= A A,0,>A A 	 	 	
 	
 	
 MM"0?j!1 N   	+*  0 >|-,	!
 !
 !
 	   64=11111r)   c                 F   dD ]}t          || j        | j        | j                                      | j                   t          || j        | j        | j                                      | j                   t          j        d          rrt          || j        | j        | j                                      | j                   t          || j        | j        | j                                      | j                   t          || j        | j        | j                                      | j                   t          || j        | j        | j                                      | j                   t          j        j        j                                         d| _        d S )N)gh㈵>gư>r   F)rB  r   r    r  rY   r   rW  r   has_device_capabilityr_  rv  r  r3  r-   	_inductorpattern_matcher_seen_patternsclearr  )r&   r  s     r'   r  z%AllReduceFusionPass.register_patterns  s   # )	C )	CG6 %	 
 ht}%%%9 %	 
 ht}%%%5c:: *<$K)	 
 (4=)))?$K)	 
 (4=)))# %	 
 ht}%%%+ %	 
 ht}%%% O+:@@BBBBr)   r   c                     | j         rt                              d           dS t          |j        | j        k              S )Nz"AllReduce fusion pass is disabled.F)r  r   r  r   r   r   )r&   r   s     r'   r   z+AllReduceFusionPass.is_applicable_for_range  s?    = 	 DEEE5M%);;<<<r)   r   c                     | j         rt                              d           d S | j                            |          | _        t                              d| j                   d S )NzAllReduceFusionPass disabledr   )r  r   r   r   r   r   r   s     r'   r   zAllReduceFusionPass.__call__  sZ    = 	LL7888F!]0077+T-?@@@@@r)   c                     t          | dd          rd S t          't                              | j        | j                   d S d S )Nr  T)getattrr   +trtllm_destroy_ipc_workspace_for_all_reducer  r  r  s    r'   __del__zAllReduceFusionPass.__del__  sU    4T** 	F&GG $*     '&r)   )r!   N)r*   r+   r,   r   r(   r   r  r	   r   r   r   r   r   r   r   r  r   r   s   @r'   r}  r}  >  s        G2z G2d G2 G2 G2 G2 G2 G2R , , , ,\=U =t = = = = "Abh A4 A A A #"A       r)   r}  )NNNN)Ximportlib.utilr   typesr   r-   torch._inductor.pattern_matcherr  r  rU   torch.fxr   *torch._higher_order_ops.auto_functionalizer   r   #torch.distributed._symmetric_memoryr   vllm.configr   vllm.config.utilsr	   vllm.distributedr
   r   vllm.distributed.parallel_stater   r   vllm.loggerr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.platformsr   vllm.utils.torch_utilsr   inductor_passr   matcher_utilsr   r   r   vllm_inductor_passr   r   	fp8_dtyperm   r   __annotations__flashinfer.commcomm_flashinfer_commhasattrImportErrorr*   r   rC   r   r   rF   rp  r   r1   r]   rj   r   r   r   r   r   r  r  r1  r   r   r   rT   r   r   r   rG   r  r  r  r3  rB  rW  r_  rv  r}  r/   r)   r'   <module>r     sb   % $ $ $ $ $ $        , , , , , , , , ,       J J J J J J > > > > > > I I I I I I " " " " " " # # # # # # K K K K K K K K        $ # # # # #      , + + + + + < < < < < < + + + + + + R R R R R R R R R R H H H H H H H H&&((	%)d" ) ) )9\ 2222227#%>?? 	/.O    
X		
759<+,, @),7?> > > > > > > >
 
 
 
 
{ 
 
 
B 
  
  
  
  
;  
  
  
FA
 A
 A
 A
 A
; A
 A
 A
H;
 ;
 ;
 ;
 ;
{ ;
 ;
 ;
|F
 F
 F
 F
 F
+ F
 F
 F
RA
 A
 A
 A
 A
k A
 A
 A
H1A 1A 1A 1A 1A( 1A 1A 1Ap 	 	 
 
@ @ c4U
+;&;!<   $ 	 	 
 
D D #T#tCJ/?*?%@    
C )-)-)-,0K
 K
lK
,K
 <K
 	K

 K
 K
 K
 $(K
 K
 K
 K
 ,%K
 <$&K
 <$&K
 lT)K
  
!K
 K
 K
 K
r )-)-)-,0 l, < 	
    $(    ,% <$& <$& lT)  
!   & 80
 
 
 8    		=E +

 
 
 
 
 
 
 
:9
 9
 9
 9
 9
k 9
 9
 9
xB
 B
 B
 B
 B
k B
 B
 B
JE
 E
 E
 E
 E
 E
 E
 E
PH
 H
 H
 H
 H
K H
 H
 H
VU
 U
 U
 U
 U
; U
 U
 U
p]
 ]
 ]
 ]
 ]
k ]
 ]
 ]
@N N N N N0 N N N N Ns   $B9 9C C