
    )`i!             A       |   d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZ ddlZddlmZ ddlmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl#m+Z+m,Z,  G d de          Z- G d de          Z. G d de          Z/de/de0fdZ1dej2        de
ej2                 de/fdZ3 G d de          Z4 G d de          Z5ej6        	 d}d e/d!e/d"e
e7         de0fd#            Z8	 d}d$ej2        d%e9d&ede9f         dej2        fd'Z:	 d}d(ej2        d%e9d&ede9f         dej2        fd)Z;dej2        fd*Z<d+ Z=d,ej2        d-e9dej2        fd.Z>ej6        d~d1e7d2e0fd3            Z?edddddddddddddd0d0d0d0d0d0d4de.j@        fd5ej2        d6ej2        d7ej2        d8ej2        d9ej2        d:ejA        d;e	ej2                 d<e
ej2                 d=e
ej2                 d>e
ej2                 d?e
ej2                 d@e
ej2                 dAe
ej2                 dBe9dCe9dDe9dEe9dFe9dGe9dHe
ej2                 dIe0dJe0dKe0dLe0dMe0dNe0dOe9dPe
e0         dQe.dej2        f<dR            ZBej6        dS             ZCedddTe4jD        dTd4fdUej2        dVe
ej2                 dWej2        dXej2        dYej2        dZe9d[e9d\e
e9         d]e
e9         d^e9d_e9d`e9dae
eE         dbe9dce0dde9dPe0dOe9dej2        f&de            ZFe	 	 	 ddUej2        dVe
ej2                 dWej2        dXej2        dfej2        dgej2        dYej2        dhej2        dZe9d[e9d\e
e9         d]e
e9         d^e9d_e9d`e9dae
eE         die0dbe9dPe
e0         dOe9dej2        f*dj            ZGe	 	 	 	 	 ddUej2        dVe
ej2                 dWej2        dkej2        dXej2        dlej2        dYej2        dmej2        dZe9d[e9d\e
e9         d]e
e9         d^e9d_e9d`e9dae
eE         dbe9dce0dde9dPe
e0         dOe9dej2        f,dn            ZHe	 	 	 	 	 	 ddUej2        dVe
ej2                 dWej2        dke
ej2                 dXej2        dlej2        doe
ej2                 dpe
ej2                 dqe
ej2                 dre
ej2                 dYej2        dmej2        dse
ej2                 dte
ej2                 due
ej2                 dve
ej2                 dZe9d[e9d\e
e9         d]e
e9         d^e9d_e9d`e9dae
eE         dbe9dwe0dPe
e0         dxe9dHe
ej2                 dOe9de	ej2                 f>dy            ZIe	 	 	 	 	 	 ddzej2        dVe
ej2                 dWej2        dke
ej2                 dXej2        dlej2        doe
ej2                 dpe
ej2                 dqe
ej2                 dre
ej2                 dYej2        dmej2        dse
ej2                 dte
ej2                 due
ej2                 dve
ej2                 dZe9d[e9d\e
e9         d]e
e9         d^e9d_e9d`e9dae
eE         dbe9dwe0dPe
e0         dxe9dHe
ej2                 dOe9de	ej2                 f>d{            ZJe	 	 	 	 ddUej2        dVe
ej2                 dWej2        dXej2        dlej2        dpe
ej2                 dqe
ej2                 dre
ej2                 dYej2        dmej2        dZe9d[e9d\e
e9         d]e
e9         d^e9d_e9d`e9dae
eE         dbe9dPe
e0         dHe
ej2                 dOe9de	ej2                 f.d|            ZKdS )a3  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)IntEnum)SimpleNamespace)AnyDictListOptionalTupleUnion   )flashinfer_api)	AutoTunerDynamicTensorSpecOptimizationProfileTunableRunnerTuningConfig)is_cuda_version_at_least)logger)setup_cubin_loader)"gen_cutlass_fused_moe_sm120_module"gen_cutlass_fused_moe_sm103_module"gen_cutlass_fused_moe_sm100_module!gen_cutlass_fused_moe_sm90_module!gen_cutlass_fused_moe_sm89_module%gen_trtllm_gen_fused_moe_sm100_module)check_shape_dtype_devicedevice_support_pdl get_shuffle_matrix_a_row_indices#get_shuffle_matrix_sf_a_row_indicesregister_custom_opregister_fake_opget_compute_capability   )&get_last_power_of_2_num_tokens_bucketslast_positive_power_of_2c                   *    e Zd ZdZdZdZdZdZdZdZ	dS )	RoutingMethodTyper   r"   )r   )   )   )      N)
__name__
__module____qualname__DefaultRenormalize
DeepSeekV3Llama4RenormalizeNaiveTopKUnspecified     m/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/fused_moe/core.pyr&   r&   =   s2        GKJFDKKKr8   r&   c                   2    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
S )ActivationTyper   r"   r   r)   r*   r+   r,         N)r-   r.   r/   GeluReluSiluSwigluGeglu
SwigluBiasRelu2IdentityInvalidTyper7   r8   r9   r;   r;   O   s;        DDDFEJEHKKKr8   r;   c                   l    e Zd Zd ZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdS )DtypeTrtllmGenc                 ~    |dz  |dz  z  |dz  z  |dz  z  |z  }t                               | |          }||_        |S )N         r=   )int__new___value_)clsblock_format_bit
signed_bitinteger_bitnum_bitsuidvalueobjs           r9   rN   zDtypeTrtllmGen.__new__\   sb    #R!b " 1} 	 	 kk#u%%
r8   )r   r"   r   rL   r   )r   r   r"   r"   r"   )r"   r"   r   r*   r   )r"   r"   r   r,   r)   )r"   r"   r   r,   r*   )r   r"   r   r=   r+   )r   r"   r   r=   r,   )r   r"   r   rL   r<   )r   r"   r       r=   )r   r"   r"   r=   	   )r   r"   r"   rX   
   )r   r"   r"   @      )r"   r"   r   r*      )r"   r"   r   r=      )r"   r"   r"   r*      )r   r   r   r=      )r   r   r"   r=   rL   )r   r   r"   rL      )r   r   r"   rX      )r   r   r"   r[      )r   r   r"      rK   )r   r"   r   r      N)r-   r.   r/   rN   Bfloat16BoolE2m1E2m3E3m2E4m3E5m2Fp16Fp32Int8Int32Int64MxE2m1MxE4m3MxInt4UE8m0UInt8UInt16UInt32UInt64UInt128Voidr7   r8   r9   rH   rH   [   s        
 
 
  HDDDDDDDDDEEFFFEEFFF GDDDr8   rH   dtypereturnc                     | t           j        t           j        t           j        t           j        t           j        fv rdS dS )NTF)rH   rs   rh   rr   rt   r|   s    r9   trtllm_gen_dtype_has_scaler      s>       tur8   xscalec                    | j         d         }| j        t          j        k    r|dz  }| j        t          j        k    rt
          j        }n| j        t          j        k    r|t
          j        nt
          j	        }n^| j        t          j        k    r:|
J d            |j         d         |dz  k    rt
          j
        }nt
          j        }nt          d          |S )Nr   z0Scale tensor must be provided for float4x2 inputrL   z$Unsupported trtllm-gen input tensor.)shaper|   torchuint8bfloat16rH   rf   float8_e4m3fnrk   rs   rh   rr   
ValueError)r   r   hidden_sizer|   s       r9   deduce_trtllm_gen_tensor_dtyper      s     '"+Kw%+qw%.  '	
E'	'	'',}##.:O	5;  "T   ;r?kR///"'EE")EE?@@@Lr8   c                       e Zd ZdZdZdZdS )WeightLayoutr   r"   r   N)r-   r.   r/   MajorKMajorMnBlockMajorKr7   r8   r9   r   r      s        FG KKKr8   r   c                       e Zd ZdZdZdS )GatedActTyper   r"   N)r-   r.   r/   SwiGluGeGlur7   r8   r9   r   r      s        FEEEr8   r   dtype_weights	dtype_actquant_methodc                    t          t          j                                                  }|d         dk     rdS | t          j        t          j        t          j        t          j        fvrdS | t          j        k    r|t          j        k    rdS | t          j        k    r|t          j        k    rdS | t          j        k    r|t          j        k    rdS | t          j        k    r'|t          j        t          j	        t          j        fvrdS dS )Nr   rZ   FT)
r!   r   cudacurrent_devicerH   rf   rk   rh   rr   rs   )r   r   r   archs       r9   is_trtllm_moe_supportedr      s    "%*";";"="=>>DAw||u	   u000000u+++	^=P0P0Pu+++	^=P0P0Pu---)D 3 3
 u4r8   dst_w3_w1_weightepilogue_tile_mnum_elts_per_sfc                     d|j         f}|| vrXt          |          }|t          ||          }nt          |||          }||                             |j                  | |<   | |         }|S )Nw3_w1)r   r   r   )r   /get_reorder_rows_for_gated_act_gemm_row_indicesr   r   todevice)_cache_permute_indicesr   r   r   	cache_keypermute0permute1permute_indicess           r9   '_maybe_get_cached_w3_w1_permute_indicesr      s     *01I...BCSTT"7 /  HH ;  / /  H -5X,>,A,A#-
 -
y) -Y7Or8   dst_w2_weightc                     d|j         f}|| vrZ|)t          ||                              |j                  }n*t	          |||                              |j                  }|| |<   | |         }|S )Nw2r   )r   r   r   r   r   )r   r   r   r   r   r   s         r9   !get_w2_permute_indices_with_cacher      s     }*+I..."> b%&& O B / /   b%&&	  -<y),Y7Or8   c                 |   |                                  dk    sJ d|                                               | j        \  }}|dz  dk    sJ d|             t          j        |t          j                  }|d|dz   dz           }||dz   dz  d         }t          j        |          }||ddd<   ||ddd<   |S )z
    Reorders rows in the gemm/MOE_gemm weight matrix for min-latency
    [r0, r1, r2, r3, ..., rN/2, r(N/2+1), .. r(N-1)]
    to
    [r0, rN/2, r1, rN/2+1, ..., r(N/2-1), r(N-1)]
    r   zx should be a 2D tensor, not r   zx.shape[0] must be even, not r   Nr"   )dimr   r   arangelong
empty_like)r   MKrow_indicestopbotpermuted_row_indicess          r9   r   r     s     5577a<<<BBB<<<7DAqq5A::::q:::::,q
333K nQ1n
%C
q1ulnn
%C !+K88 "%A!$Ar8   c                 B    t          |           fd} ||           S )zL
    PyTorch implementation of trt-llm gen `reorderRowsForGatedActGemm`
    c                     |          S Nr7   )r   r   s    r9   <lambda>z1reorder_rows_for_gated_act_gemm.<locals>.<lambda>3  s    + r8   )r   )r   permuter   s     @r9   reorder_rows_for_gated_act_gemmr   -  s/     B!DDK&&&&G71::r8   input_tensorblockKc                     | j         \  }}||z  dk    s
J d            |                     |||z  |                              ddd                                          S )Nr   zK must be divisible by blockKr"   r   )r   viewr   
contiguous)r   r   r   r   s       r9   convert_to_block_layoutr   8  s`    DAqv:???;???QVV44<<Q1EEPPRRRr8   100Fbackenduse_fast_buildc           ?         | dv r"t          |                                          n| dk    r"t          |                                          n| dv r"t          |                                          nb| dk    r"t	          |                                          n:| dk    r"t          |                                          nt          d|            ddlm} t          |j
        d	z  d
z            }                    |g            G fddt                    t          dd          d d d d ddddddddddddd t          j        dfdt           j        dt           j        dt           j        dt           j        dt           j        dt$          t           j                 dt           j        dt$          t           j                 dt           j        dt(          t           j                 dt$          t           j                 dt$          t           j                 d t$          t           j                 d!t$          t           j                 d"t*          d#t*          d$t*          d%t*          d&t*          d't*          d(t,          d)t,          d*t,          d+t,          d,t,          d-t*          d.t$          t,                   d/t          d0t,          d1t(          t           j                 f<fd2            }t/          d          	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d5dt           j        dt           j        dt           j        dt           j        dt           j        dt$          t           j                 dt           j        dt$          t           j                 dt           j        dt(          t           j                 dt$          t           j                 dt$          t           j                 d t$          t           j                 d!t$          t           j                 d"t*          d#t*          d$t*          d%t*          d&t*          d't*          d(t,          d)t,          d*t,          d+t,          d,t,          d-t*          d.t$          t,                   d0t,          f8d3            }t1          |4          S )6N)120121103)r   1109089zInvalid backend: r   )envnv_internaltensorrt_llmc            $          e Zd ZU  e            Zeeej        ej        ej        e	e	e	e	f         e
f         ed<    e edd ed          d           f          Zdej        dej        dej        d	ed
edededededede	de	de	de	de	de	dede	f$ fdZdeej                 dedee         fdZ	 	 d&deej                 ded e	fd!Ze ej        d"#          d$efd%                        Zd"S )'/get_cutlass_fused_moe_module.<locals>.MoERunnerrunner_dictr'       c                 <    t          t          |           d          S Nr   minr$   r   s    r9   r   z8get_cutlass_fused_moe_module.<locals>.MoERunner.<lambda>`      c":1"="=tDD r8   dynamic_tensor_specsx_dtypeweight_dtypeoutput_dtypetop_ktp_sizetp_rankep_sizeep_rankcluster_sizecluster_rankenable_alltoalluse_deepseek_fp8_block_scaleuse_w4_group_scalinguse_mxfp8_act_scalingmin_latency_mode
enable_pdlactivation_typeuse_packed_weightsc           	         || _         || _        || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        || _        || _        || _        || _        || _        || _        |||||||f}|| _        d | _        |j        vr#                    |||||||          j        |<   j        |         | _        d S r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   gemm_idx_for_tuningr   initfused_moe_runner)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   instance_key	MoERunnermodules                       r9   __init__z8get_cutlass_fused_moe_module.<locals>.MoERunner.__init__e  s
   * #DL ,D ,DDJ"DL"DL"DL"DL ,D ,D#2D 0LD-(<D%)>D&$4D!(DO&8D#,$%"L $3D 6:D$9#8886<kk  0()&7 7	%l3 %.$9,$GD!!!r8   inputsprofiler}   c                    	 | j                                         }| j                                         }||z   }nC# t          $ r6 t	          t          | j                                                             cY S w xY wt          | dd           }|dk    rt	          t          |                    S |dk    r t	          t          |||z                       S t	          t          |                    S )Nr   r"   r   )r   get_gemm1_tactic_countget_gemm2_tactic_count	Exceptionlistrangeget_tactic_numgetattr)r   r   r   gemm1_countgemm2_counttotalstages          r9   get_valid_tacticszAget_cutlass_fused_moe_module.<locals>.MoERunner.get_valid_tactics  s    K"3JJLL"3JJLL#k1 K K KE$"7"F"F"H"HIIJJJJJK D"7>>EzzE+..///zzE+{[/HIIJJJe%%%s   7: =A:9A:r   Ftacticdo_preparationc                     |\  }}}}}	| j                             |||||	| j        | j        | j        | j        | j        | j        | j        | j	        | j
        |d         ||| j        | j                   d S )Ngemm_idx)r   run_gemm_profiler   r   r   r   r   r   r   r   r   r   r   )
r   r   r  r  kwargsr   fc1_expert_weightsfc1_expert_biasesfc2_expert_weightsfc2_expert_biasess
             r9   forwardz7get_cutlass_fused_moe_module.<locals>.MoERunner.forward  s     "!"!!22"!"!
!!$%z"$'    r8   Nmaxsizetune_max_num_tokensc           	      t    t          t          ddt                    fd          f          | _        d S )Nr'   c                 >    t          t          |                     S r   r   r   r  s    r9   r   zVget_cutlass_fused_moe_module.<locals>.MoERunner.refine_tuning_config.<locals>.<lambda>      #&>q&A&ACV"W"W r8   r   )r   r   r#   tuning_configrP   r  s    `r9   refine_tuning_configzDget_cutlass_fused_moe_module.<locals>.MoERunner.refine_tuning_config  sR     !-%>?RSSWWWW	 &	! 	! 	!Cr8   r   F)r-   r.   r/   dictr   r   r	   r   r|   boolr   __annotations__r   r   r#   r  rM   r;   r   r   Tensorr   r
  r  classmethod	functools	lru_cacher  )r   r   s   r9   r   r   U  s^         DFF 	T%+u{EKtT4OPRUU
 	 	 	 %!!::4@@DD	 "	
 	
 	
>	H[>	H  +>	H  +	>	H
 >	H >	H >	H >	H >	H >	H >	H ">	H +/>	H #'>	H $(>	H  #!>	H" #>	H$ ,%>	H& !%'>	H >	H >	H >	H >	H >	H >	H@	&&	& )	& #Y		& 	& 	& 	&. #(	"	 "	&"	 "	 !	"	 "	 "	 "	H 
		T	*	*	*
	3 
	 
	 
	 
+	* 

	 
	 
	r8   r   zflashinfer::cutlass_fused_moe mutates_argsr"   r   Fr   outputinputtoken_selected_expertstoken_final_scalesr  r  r  r  r   quant_scalesinput_sfswiglu_alphaswiglu_betaswiglu_limitr   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r}   c                    |t          |j                  }t          j                    }'                    |            'di d|j        d|j        d|d|                    d          d|d|d|d	|d
|d|d|d|d|d|d|d|d|d|}d|_        |                    d|g'j	        |||||gd          \  }} d|_        |                    d|g'j	        |||||gd          \  }}!|r|j
        j        n|j
        j        }"t          j        dt          j        |j                  }#t          j        |j        d         |j        d         ft          j        |j                  }$t          j        |j        d         ft          j        |j                  }%|r|#|$|%gng }& |"| ||||||||	|
|||g|&||||||||| |!g||R   |r| n| |#|$|%gS )Nr   r   r   r   r"   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ztrtllm::fused_moe::gemm1)r  r   ztrtllm::fused_moe::gemm2r(   r|   r   r   r7   )r   r   r   getr  r|   sizer   
choose_oner  r   run_moe_min_latencyrun_moer   emptyint32r   float32)(r+  r,  r-  r.  r  r  r  r  r   r/  r0  r1  r2  r3  r   r   r   r   r   r   r   r   r   r   r   r  r   r   r   tuner
moe_runner_gemm_tactic_1gemm_tactic_2r:  num_active_experts_per_nodeexperts_to_token_scoreactive_expert_global_idsmin_latency_outputr   s(                                          r9   cutlass_fused_moez7get_cutlass_fused_moe_module.<locals>.cutlass_fused_moe  s   F +EL99J&&':;;; Y 
 
 
KK
+11
 &
 )--a000	

 G
 G
 G
 G
 &
 &
 ,O
 *F)E
 "6!5
 #8"7
 .-
  "z!
" ,O#
$  21%

, *+
& ++&L#"!"!  , 
 
= *+
& ++&L#"!"!  , 
 
=   5J';;,4 	
 ',kEL'
 '
 '
# "'%a(%+a.9-<"
 "
 "

 $);%a(*+<$
 $
 $
   +&(   	 	"	
  	
 	
  !	
" #	
$ %	
& '	
( )	
* +	
, -	
. M*/	
0 1	
2 3	
 	
 	
 	
<  FF +&(			
r8   c                    |j         d         }|j         d         }|r|j         d         }||z  |g}||g} |g}!|                    ||          |                    dgt          j                  |                    | t          j                  |                    |!t          j                  gS |                    ||g|          gS Nr   r"   r   )r   	new_emptyr   r<  r=  )"r+  r,  r-  r.  r  r  r  r  r   r/  r0  r1  r2  r3  r   r   r   r   r   r   r   r   r   r   r   r  r   r   seq_lenr   num_experts_on_rankoutput_shapeexperts_to_token_score_shapeactive_expert_global_ids_shapes"                                     r9   _fake_cutlass_fused_moez=get_cutlass_fused_moe_module.<locals>._fake_cutlass_fused_moe  s    > +a.(.q1 	Q"4":1"=#&99;GL,?+I(.A-B*LAA5;77 <EMRR >ekRR	  OOWk$:,OOOPPr8   )rG  )NNNNr"   r   r"   r   r"   r   FFFFFr   NF)r   build_and_loadr   r   r   r   r   jitr   strFLASHINFER_CSRC_DIRset_deepgemm_jit_include_dirsr   r   r;   rA   r   r$  r   r|   r   rM   r"  r    r   )r   r   jit_envdeepgemm_include_dirrG  rP  r   r   s         @@r9   get_cutlass_fused_moe_modulerX  >  s   .  3NCCRRTT	E		3NCCRRTT	N	"	"3NCCRRTT	D2>BBQQSS	D2>BBQQSS6W66777 %$$$$$#m3nD  ((*>)?@@@T T T T T T T TM T T Tl '   ,0/3.2/3 %-2%*&+!&#'%)*8*?#(;Z
 Z
Z
|Z
 !&Z
 "L	Z

 "LZ
 $EL1Z
 "LZ
 $EL1Z
 kZ
 5<(Z
 5<(Z
 u|,Z
 el+Z
 u|,Z
 Z
  !Z
" #Z
$ %Z
& 'Z
( )Z
* +Z
, '+-Z
. #/Z
0  $1Z
2 3Z
4 !5Z
6 TN7Z
8 (9Z
: !;Z
< 
el	=Z
 Z
 Z
 Z
 Z
	 Z
x 566 ,0/3.2/3 %-2%*&+!&#'%)#(9-Q -Q-Q|-Q !&-Q "L	-Q
 "L-Q $EL1-Q "L-Q $EL1-Q k-Q 5<(-Q 5<(-Q u|,-Q el+-Q u|,-Q -Q  !-Q" #-Q$ %-Q& '-Q( )-Q* +-Q, '+--Q. #/-Q0  $1-Q2 3-Q4 !5-Q6 TN7-Q8 !9-Q -Q -Q 76-Q` +   r8   r   r,  r-  r.  r  r  r   r/  r  r  r0  r1  r2  r3  r   r   r   r   r   r   r+  r   r   r   r   r   r   r  r   r   c                 H   t           j                                        \  }}|dz  |z    }|rt          d          |r3|dk    rt          d          t	          d          st          d          |t          | j                  }| j        d         } |r| |j        d         z  } |j        d	         }!| |!f}"|t          j        |"|| j        
          }nt          ||"|| j        d            t          |          j        || |||||||||	|
||||||||f|||||||||d	S )a  Compute a Mixture of Experts (MoE) layer using CUTLASS backend.

    This function implements a fused MoE layer that combines expert selection, expert computation,
    and output combination into a single operation. It uses CUTLASS for efficient matrix multiplication
    and supports various data types and parallelism strategies.

    Parameters
    ----------
    input : torch.Tensor
        Input tensor of shape [num_tokens, hidden_size].
        Support float, float16, bfloat16, float8_e4m3fn and nvfp4.
        For FP8, the input must be quantized.
        For NVFP4, both quantized and non-quantized inputs are supported.

    token_selected_experts : torch.Tensor
        Indices of selected experts for each token.

    token_final_scales : torch.Tensor
        Scaling factors for each token's expert outputs.

    fc1_expert_weights : torch.Tensor
        GEMM1 weights for each expert.

    fc2_expert_weights : torch.Tensor
        GEMM2 weights for each expert.

    output_dtype : torch.dtype
        Desired output data type.

    quant_scales : List[torch.Tensor]
        Quantization scales for the operation.

        NVFP4:
            - gemm1 activation global scale
            - gemm1 weights block scales
            - gemm1 dequant scale
            - gemm2 activation global scale
            - gemm2 weights block scales
            - gemm2 dequant scale

        FP8:
            - gemm1 dequant scale
            - gemm2 activation quant scale
            - gemm2 dequant scale
            - gemm1 input dequant scale

    fc1_expert_biases : Optional[torch.Tensor]
        GEMM1 biases for each expert.

    fc2_expert_biases : Optional[torch.Tensor]
        GEMM1 biases for each expert.

    input_sf : Optional[torch.Tensor]
        Input scaling factor for quantization.

    swiglu_alpha : Optional[torch.Tensor]
        Swiglu alpha for swiglu activation.

    swiglu_beta : Optional[torch.Tensor]
        Swiglu beta for swiglu activation.

    swiglu_limit : Optional[torch.Tensor]
        Swiglu limit for swiglu activation.

    tp_size : int = 1
        Tensor parallelism size. Defaults to 1.

    tp_rank : int = 0
        Tensor parallelism rank. Defaults to 0.

    ep_size : int = 1
        Expert parallelism size. Defaults to 1.

    ep_rank : int = 0
        Expert parallelism rank. Defaults to 0.

    cluster_size : int = 1
        Cluster size. Defaults to 1.

    cluster_rank : int = 0
        Cluster rank. Defaults to 0.

    output : Optional[torch.Tensor] = None
        The output tensor, if not provided, will be allocated internally.

    enable_alltoall : bool = False
        Whether to enable all-to-all communication for expert outputs. Defaults to False.

    use_deepseek_fp8_block_scale : bool = False
        Whether to use FP8 block scaling. Defaults to False.

    use_w4_group_scaling : bool = False
        Whether to use W4A8 group scaling. Defaults to False.

    use_mxfp8_act_scaling : bool = False
        Whether to use MXFP8 activation scaling. Defaults to False.

    min_latency_mode : bool = False
        Whether to use minimum latency mode. Defaults to False.

    use_packed_weights : bool = False
        Whether to use packed uint4x2 weights passed as packed uint8 values. Defaults to False.

    tune_max_num_tokens : int = 8192
        Maximum number of tokens for tuning. Defaults to 8192.

    activation_type: ActivationType = ActivationType.Swiglu
        Activation to apply on for GEMM1, note that Relu2 means non-gated GEMM1

    Returns
    -------
    out: torch.Tensor
        Output tensor of shape [seq_len, hidden_size].


    Raises
    ------
    NotImplementedError:
        If any of the following features are requested but not implemented:
            - Minimum Latency Mode

    Note
    ----
    - The function supports various data types including FP32, FP16, BF16, FP8, and NVFP4.
    - It implements both tensor parallelism and expert parallelism.
    - Currently, some advanced features like FP8 block scaling and minimum latency mode
        are not implemented for Blackwell architecture.
    rZ   z3min latency mode not yet implemented for Blackwell.r   z4FP8 block scaling not yet implemented for Blackwell.z12.8z9FP8 block scaling not implemented for CUDA 12.6 or lower.Nr   r"   r5  r+  )	r   r   r   r   r   r   r  r   r   )r   r   get_device_capabilityNotImplementedErrorr   r   r   r   r;  r   rX  rG  )#r,  r-  r.  r  r  r   r/  r  r  r0  r1  r2  r3  r   r   r   r   r   r   r+  r   r   r   r   r   r   r  r   r   majorminordevice_archnum_rowsr   rM  s#                                      r9   rG  rG    s   @ :3355LE5RZ%')K Y!"WXXX# $%F   *&11 	%K   '55
{1~H 0&,Q//$*1-Kk*L~\elSSS L,h	
 	
 	
 G'44F)* .'%A13)/';   r8   c            E      ,   t                      } |                                 t          t          |                                                       G fddt
                    t          dd          	 	 d@dt          j        dt          t          j                 d	t          j        d
t          j        dt          j        dt          dt          dt          t                   dt          t                   dt          dt          dt          dt          t                   dt          dt          dt          dt          t                   dt          dt          j        f&fd            }t          d          	 	 d@dt          j        dt          t          j                 d	t          j        d
t          j        dt          j        dt          dt          dt          t                   dt          t                   dt          dt          dt          dt          dt          dt          dt          t                   dt          f"d            }t          dd          	 	 	 dAdt          j        dt          t          j                 d	t          j        d
t          j        dt          j        dt          j        dt          j        d t          j        dt          dt          dt          t                   dt          t                   dt          dt          dt          dt          t                   d!t          dt          dt          t                   dt          dt          j        f*fd"            }t          d          	 	 dBdt          j        dt          t          j                 d	t          j        d
t          j        dt          j        dt          j        dt          j        d t          j        dt          dt          dt          t                   dt          t                   dt          dt          dt          dt          t                   d!t          dt          dt          t                   f&d#            }t          d$d          	 	 	 	 dCdt          j        dt          t          j                 d	t          j        d&t          j        d
t          j        d't          j        dt          j        d(t          j        d)t          j        dt          dt          dt          t                   dt          t                   dt          dt          dt          dt          t                   dt          dt          dt          dt          t                   dt          dt          j        f.fd*            }t          d$          	 	 	 	 dDdt          j        dt          t          j                 d	t          j        d&t          j        d
t          j        d't          j        dt          j        d(t          j        d)t          j        dt          dt          dt          t                   dt          t                   dt          dt          dt          dt          t                   dt          dt          dt          dt          t                   f*d+            }t          d,d          	 	 	 	 dEdt          t          j                 d-t          t          j                 d.t          t          j                 dt          t          j                 d	t          j        d&t          t          j                 d
t          j        d't          j        d/t          t          j                 d0t          t          j                 d1t          t          j                 d2t          t          j                 dt          j        d(t          j        d3t          t          j                 d4t          t          j                 d5t          t          j                 d6t          t          j                 dt          dt          dt          t                   dt          t                   dt          dt          d7t          dt          t                   dt          d8t          dt          t                   d9t          d)t          t          j                 dt          dt          t          j                 fBfd:            }t          d,          dt          j        d-t          t          j                 d.t          t          j                 dt          t          j                 d	t          j        d&t          j        d
t          j        d't          j        d/t          t          j                 d0t          t          j                 d1t          t          j                 d2t          t          j                 dt          j        d(t          j        d3t          t          j                 d4t          t          j                 d5t          t          j                 d6t          t          j                 dt          dt          dt          t                   dt          t                   dt          dt          dt          dt          t                   dt          d8t          dt          d9t          d)t          t          j                 dt          f@d;            }t          d<d          	 	 	 dFdt          j        dt          t          j                 d	t          j        d
t          j        d't          j        d0t          t          j                 d1t          t          j                 d2t          t          j                 dt          j        d(t          j        dt          dt          dt          t                   dt          t                   dt          dt          d7t          dt          t                   dt          dt          t                   d)t          t          j                 dt          dt          t          j                 f.fd=            }	t          d<          dt          j        dt          t          j                 d	t          j        d
t          j        d't          j        d0t          t          j                 d1t          t          j                 d2t          t          j                 dt          j        d(t          j        dt          dt          dt          t                   dt          t                   dt          dt          dt          dt          t                   dt          dt          d)t          t          j                 dt          f,d>            }
t          |||||	?          S )GNc                      e Zd Zd d d d d d gZ e edd ed	d
          d e          f          Z e edd ed	d
          d edd                   f          Z e	            Z
ej        dej        dfdededededededededededefdZdeej                 d ed!ee         f fd"Z	 	 d*deej                 d$ed%effd&Ze ej        d'          d(efd)                        ZdS )+.get_trtllm_moe_sm100_module.<locals>.MoERunnerc                 0    t          j        | ||          S N)r   r|   r   r;  shapesr|   r   s      r9   r   z7get_trtllm_moe_sm100_module.<locals>.MoERunner.<lambda>      %+vU+ + + r8   c                 0    t          j        | ||          S rd  )r   randrf  s      r9   r   z7get_trtllm_moe_sm100_module.<locals>.MoERunner.<lambda>  s    %*vU+ + + r8   c                 0    t          j        | ||          S rd  re  rf  s      r9   r   z7get_trtllm_moe_sm100_module.<locals>.MoERunner.<lambda>  rh  r8   c                 0    t          j        | ||          S rd  re  rf  s      r9   r   z7get_trtllm_moe_sm100_module.<locals>.MoERunner.<lambda>  rh  r8   c                 T    t          j        | |                              |          S N)r   )r   randnr   rf  s      r9   r   z7get_trtllm_moe_sm100_module.<locals>.MoERunner.<lambda>  s(    %+fV*L*L*L*O*O+ + r8   c                 T    t          j        | |                              |          S rn  )r   onesr   rf  s      r9   r   z7get_trtllm_moe_sm100_module.<locals>.MoERunner.<lambda>  s(    %*VF*K*K*K*N*N+ + r8   r   r"   r   r)   r*   r+   r   r   r   r   r   r   r   r"   c                 <    t          t          |           d          S r   r   r   s    r9   r   z7get_trtllm_moe_sm100_module.<locals>.MoERunner.<lambda>  r   r8   r   r   r"   r   r)   r*   r   r   r   r   r   c                 <    t          t          |           d          S r   r   r   s    r9   r   z7get_trtllm_moe_sm100_module.<locals>.MoERunner.<lambda>  r   r8   Nr+   Fr   num_local_expertsr   r   use_deepseek_fp8r   intermediate_sizegated_act_typeuse_shuffled_weightweight_layoutr   c                     || _         || _        || _        || _        || _        || _        || _        || _        t          |          | _        |	| _	        t          |
          | _        || _        d S r   )rx  r   r   r   ry  r   rz  r   r{  r|  r   r}  r   )r   r   rx  r   r   ry  r   rz  r{  r|  r}  r   s               r9   r   z7get_trtllm_moe_sm100_module.<locals>.MoERunner.__init__  sw     &7D"DJ&DN!.D$4D!DJ*D%6D"".~">">D':D$!-m!<!<D&8D###r8   r   r   r}   c                 j   |^}}}}}}|j         d         }	| j        | j        | j        | j        | j        | j        | j        | j        | j	        | j
        |	f}
|
j        vrI	  j        |
 }n3# t          $ r&}t          j        d|
 d|            g cY d }~S d }~ww xY w|j        |
<   j        |
         S )Nr   z-[Autotuner]: Failed to get valid tactics for z. Error occurred: )r   r   r   ry  r   r   rz  rx  r{  r|  r}  valid_tactics_dicttrtllm_get_valid_moe_configsr  r   debug)r   r   r   r+  routing_logitstopk_idsexpert_weightshidden_statesextra_inputs
num_tokensr   valid_tacticser   moe_ops                r9   r
  z@get_trtllm_moe_sm100_module.<locals>.MoERunner.get_valid_tactics  s    '-a0J "%
 &&#("L 9#???$GF$G$VMM    Lkkkhikk   IIIIII	
 >K	,\:/==s    
A+ +
B5BBBr   r  r  c                 	   |^}}}}}	}
|j         d         }d}t          | j                  r|
|         }|dz  }nd }|j         d         |k    s
J d            |j         d         |k    s
J d            |j         d         |k    s
J d            |	j         d         |k    s
J d            |3|                                dk    r|j         d         |k    s
J d            | j        t
          j        k    r                    ||d	         |	|d
         |d         |d         | j        |d         |d         | j	        |d         | j
        |d         |d         |d         |d         |d         |dk    rddgn|           d S | j        t
          j        k    r| j        t
          j        k    r~| j        r|	j         d         }|	j         d         }t          j        |dz  |fdt          j        |	j                  }                    ||d	         |	||d
         |d         |d         |d         ||d         | j        |d         |d         | j	        |d         | j
        |d         |d         |d         |d         |d         |dk    rddgn|           d S                     ||d	         |	|d
         |d         |d         |d         |d         ||d         | j        |d         |d         | j	        |d         | j
        |d         |d         |d         |d         |dk    rddgn|           d S | j        t
          j        k    r| j        t
          j        k    r                    ||d	         |	|d
         |d         |d         |d          |d!         |d         |d         |d         | j        |d         |d         | j	        |d         | j
        |d         |d         |d         ||dk    rddgn|           d S  j        g ||||d	         |	||d
         |d         |d"         |d         |d          |d!         |d         |d         |d#         |d$         |d%         |d&         |d         | j        |d         |d         | j	        |d         | j
        |d         |d         |d         |d'         | j        ||dk    rddgn|R   d S )(Nr   r"   z,output's first dimension must be batch size.z.topk_ids's first dimension must be batch size.z4expert_weights's first dimension must be batch size.z3hidden_states's first dimension must be batch size.r   z8hidden_states_scale's first dimension must be batch sizerouting_biasgemm1_weightsgemm2_weightsnum_expertsn_group
topk_grouplocal_expert_offsetrouted_scaling_factorrouting_method_typer|  r}  r   r   rd   g       @r5  gemm1_weights_scalegemm2_weights_scaleoutput1_scales_scalaroutput1_scales_gate_scalaroutput2_scales_scalaruse_routing_scales_on_inputgemm1_alpha
gemm1_betagemm1_clamp_limit
gemm1_bias
gemm2_biasoutput1_scale_scalaroutput1_scale_gate_scalaroutput2_scale_scalardo_finalize)r   r   r   r   r   rH   rf   trtllm_bf16_moer   rz  rx  rk   ry  r   fullfloatr   trtllm_fp8_block_scale_moetrtllm_fp8_per_tensor_scale_moert   trtllm_mxint4_block_scale_moetrtllm_fp4_block_scale_moer{  )r   r   r  r  r  r+  r  r  r  r  r  r  extra_input_idxhidden_states_scalecurrent_num_tokenscurrent_hidden_sizecurrent_hidden_states_scaler  s                    r9   r  z6get_trtllm_moe_sm100_module.<locals>.MoERunner.forward  s    '-a0JO)$.99 +&2?&C#1$&*#<?j000> 100 >!$
222@ 322 "'*j888F 988 !&q)Z777E 877 '.#''))Q..'-a0J>>>I ?>
 !^%<<<&&">*!?+?+=)J9%<(*01*230101?+<( &"RHH&%    * ."555&.*=== ( :)6)<Q)?&*7*=a*@'27*,35GH#k,3	3 3 3/ 55&~.%3/45/45}-
y)|,.45.674545/|,$*bLLRf-    4 ::&~.%/67;</67}-
y)|,.45.67<=45|,$*bLLRf+    0 ."999&.*???44">*!?+01=)<(./?+01=)J9%<(*01*2301<( &"RHH&-    2 21 !"!! #! >*	!
 "! (! ?+! 01! <(! =)! <(! ./! ?+! 01! <(!  12!!" 67#!$ 12%!& =)'!( J)!* 9%+!, <(-!. */!0 011!2 *3!4 235!6 017!8 <(9!: =);!< '=!> ?!@ !'"RHH&A! ! ! ! ! !r8   r  r  c                    t          t          ddt          d          fd| j                  f          | _        t          t          ddt          d          fd| j        d d	                   f          | _        d S )
Nrr  rs  r"   c                 >    t          t          |                     S r   r   r  s    r9   r   zUget_trtllm_moe_sm100_module.<locals>.MoERunner.refine_tuning_config.<locals>.<lambda>  r  r8   r   ru  rv  c                 >    t          t          |                     S r   r   r  s    r9   r   zUget_trtllm_moe_sm100_module.<locals>.MoERunner.refine_tuning_config.<locals>.<lambda>  r  r8   r+   )r   r   r#   dynamic_tensor_initializers'tuning_config_with_hidden_states_scales%tuning_config_no_hidden_states_scalesr  s    `r9   r  zCget_trtllm_moe_sm100_module.<locals>.MoERunner.refine_tuning_config  s     ;G%**>?RTUVVWWWW7 &
; 
; 
;C7 9E%''>?RTUVVWWWW7; &
9 
9 
9C555r8   r   )r-   r.   r/   r  r   r   r#   r  r  r!  r  r   r   r   r   rM   rH   r"  r   r   r   r$  r   r
  r  r%  r&  r'  r  )r   r  s   r9   r   rb    s             !'
#* 3?,!!&&::4CCDD/ "
3
 
3
 
3
/ 1=!!##::4CCDD/3 "
1
 
1
 
1
- "TVV #/"5(-!-!4',	9 	9	9  #	9 &		9
 *	9 #	9 	9  #	9  	9 "&	9 	9 !%	9 	9 	9 	96%	>&%	> )%	> #Y	%	> %	> %	> %	> %	> %	> %	>T #(	}	 }	&}	 }	 !	}	 }	 }	 }	 }	 }	~ 
		T	*	*	*	3 	 	 	 
+	* 
	 	 	r8   r   flashinfer::trtllm_bf16_moer(  r)  r   r  r  r  r  r  r  r   r  r  rz  r  local_num_expertsr  r  r|  r}  r   r  r}   c                    |t          |j                  }t          j                    }                    |           |j        d         }|j        d         }t          j        ||t          j        |j                  }t          j        ||t          j	        |j                  }t          j        ||| j
        |j                  }t          j        }t          j        } ||||d||	||t          j        
  
        }|| |||g}|                    d|gj        ||||||||
||||||          \  }}                     | |||||||||	|
|||||||dk    rddgn|          }|S )Nr   r   r5  F)
r   rx  r   r   ry  r   rz  r}  r|  r{  r  )r  r  r  r  r  r  r  r  r  r  r|  r}  r   )r   r   r   r6  r  r   r   r;  r   r<  r|   rH   rf   r   r   r8  r  r  )!r  r  r  r  r  r  r   r  r  rz  r  r  r  r  r|  r}  r   r  r>  r  r   r+  r  r  r   r   r?  r   r@  r  resultr   r  s!                                  r9   trtllm_bf16_moe_opz7get_trtllm_moe_sm100_module.<locals>.trtllm_bf16_moe_op  s   0 +M,@AAJ &&':;;;"(+
#)"- 5>-BV
 
 
 ;U[9M
 
 
 ^%9-BV
 
 
 #+	&/Y/'"#/' 3'.
 
 

 .(NMR$$)L;%''#! 3/"7 3 3'!# % 
 
	6* ''!"RHH&%
 
( r8   c                 ~    |j         d         }|j         d         }|                    ||gt          j                  gS rI  r   rJ  r   r   )r  r  r  r  r  r  r   r  r  rz  r  r  r  r|  r}  r   r  rK  r   s                      r9   _fake_trtllm_bf16_moez:get_trtllm_moe_sm100_module.<locals>._fake_trtllm_bf16_moea  sA    (  %a(#)!,''+(>en'UUVVr8   +flashinfer::trtllm_fp8_per_tensor_scale_moer   r  r  r  r  c                    |t          |j                  }t          j                    }!                    |           |j        d         }|j        d         }t          j        ||t          j        |j                  }t          j        ||	t          j	        |j                  }t          j        ||	| j
        |j                  }t          j        }t          j        } !|	|||d||t          j        d	  	        }|| |||g}|                    d|g!j        |||||||||
|||||||          \  }}"                    | ||||||||||	|
|||||||||dk    rddgn|          } | S )	Nr   r   r5  FT	r   rx  r   r   ry  r   rz  r}  r|  r  )r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   )r   r   r   r6  r  r   r   r;  r   r<  r|   rH   rk   r   r   r8  r  r  )#r  r  r  r  r  r  r  r  r  r   r  r  rz  r  r  r  r  r  r   r  r>  r  r   r+  r  r  r   r   r?  r   r@  r  r  r   r  s#                                    r9   "trtllm_fp8_per_tensor_scale_moe_opzGget_trtllm_moe_sm100_module.<locals>.trtllm_fp8_per_tensor_scale_moe_opz  s   4 +M,@AAJ&&':;;;"(+
#)"- 5>-BV
 
 
 ;U[9M
 
 
 ^%9-BV
 
 
 #'	&+Y/'"#/&- $

 

 


 .(NMR$$9L;%'"7'A'"7#! 3/"7(C 3!' % 
 
	6, 77!&!!'"RHH&+
 
0 r8   c                 ~    |j         d         }|j         d         }|                    ||gt          j                  gS rI  r  )r  r  r  r  r  r  r  r  r  r   r  r  rz  r  r  r  r  r  r   rK  r   s                        r9   %_fake_trtllm_fp8_per_tensor_scale_moezJget_trtllm_moe_sm100_module.<locals>._fake_trtllm_fp8_per_tensor_scale_moe  sA    ,  %a(#)!,''+(>en'UUVVr8   &flashinfer::trtllm_fp8_block_scale_moeFr  r  r  r+  c                    |t          |j                  }t          j                    }"                    |           |j        d         }|j        d         }t          j        ||t          j        |j                  }t          j        ||
t          j	        |j                  }t          j        ||
| j
        |j                  }t          j        }t          j        } "|
|||d||||	  	        }|| ||||g}|                    d|g"j        |||||||	|||||||||          \  }} #                    | |||||||||	|
||||||||||| dk    rddgn|           }!|!S )Nr   r   r5  Tr  r  )r  r  r  r  r  r  r  r  r  r  r  r  r|  r}  r   )r   r   r   r6  r  r   r   r;  r   r<  r|   rH   rk   r8  r  r  )$r  r  r  r  r  r  r  r  r+  r  r   r  r  rz  r  r  r  r  r|  r}  r   r  r>  r  r   r  r  r   r   r?  r   r@  r  r  r   r  s$                                     r9   trtllm_fp8_block_scale_moe_opzBget_trtllm_moe_sm100_module.<locals>.trtllm_fp8_block_scale_moe_op  s   8 +M,@AAJ &&':;;;"(+
#)"- 5>-BV
 
 
 ;U[9M
 
 
 ^%9-BV
 
 
 #'	&+Y/'!#/' 3

 

 


 
 $$4L=%' 3' 3#! 3/"7 3 3'!' % 
 
	6, 22!"RHH&-
 
2 r8   c                 ~    |j         d         }|j         d         }|                    ||gt          j                  gS rI  r  )r  r  r  r  r  r  r  r  r+  r  r   r  r  rz  r  r  r  r  r|  r}  r   rK  r   s                          r9    _fake_trtllm_fp8_block_scale_moezEget_trtllm_moe_sm100_module.<locals>._fake_trtllm_fp8_block_scale_moe~  sA    0  %a(#)!,''+(>en'UUVVr8   &flashinfer::trtllm_fp4_block_scale_moer  r  r  r  r  r  r  r  r  r  rx  r  r{  c                     | 8|
J d            |j         t          j        k    s
J d            t          j        } n| j         } |j        d         }!|j         t          j        k    r|!dz  }!|j        d         }"|'t          j        |"|t          j        |j                  }|t          j        |"|| |j                  }|t          |j                  }|(t          j        |"|!t          j        |j                  }nzt          |d t          j        |j        d           |j        d         |"k    sJ d|j        d          d	|"             |j        d
         |!k    sJ d|j        d
          d|!             t          j                    }#.                    |           t          ||          }$t          ||          }% .|||$|%d|!||t          j        d
  
        }&|.j        n.j        }'|| t          j        |"|| d          n| |||g}(||(                    |           |#                    d|&g|'|(||||||	|
|||||||||||||||          \  })}* /j        g | |||||||||	|
|||||||||||||||||||||*dk    rddgn|*R  }+|r|gS |+\  },}-t          j        |,          |t          j        |-          gS )Nz3either topk_ids or routing_logits must be provided.z!topk_ids must be an int32 tensor.r   r   r   r5  r+  zoutput.shape[0]=z must be equal to r"   zoutput.shape[1]=z must be less than or equal to FT
r   rx  r   r   ry  r   rz  r{  r}  r|  metar  )r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r{  )r|   r   r<  r   r   r   r;  r   r   r   r   r6  r  r   r   r   r  r  appendr8  r  from_dlpack)0r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  rz  r  rx  r  r  r  r   r{  r+  r  routing_dtyper   r  r>  r   r   r?  tunning_configr   r@  r  intermediate_outputgemm2_outputexpanded_idx_to_permuted_idxr   r  s0                                                 r9   trtllm_fp4_block_scale_moe_opzBget_trtllm_moe_sm100_module.<locals>.trtllm_fp4_block_scale_moe_op  s   L !''E ('' >U[0002U000!NMM*0M#)"-%+--%/K"(+
 {E]=Q  H !"[E}?S  N +M,@AAJ>[n$+	  FF %enm.BH   <?j000R6<?RRjRR 100 <?k111`6<?``S^`` 211 &&':;;;2=BUVV	6.
 
 Y/'"#/)&- $
 
 

 #* ;;B 	 % K
K}VTTTT
 *MM-...$$4L#%' 3!#!/' 3!!5&?!5! 3"7 3!#)5 % 
 
	6< @f? !
!
!
 !
 	!

 !
  !
 !
  !
 !
 !
 !
 !
 !
  !
 !
  !!!
" &#!
$ !%!
& '!
( )!
* +!
, -!
. /!
0  1!
2 3!
4 "5!
6  7!
8 9!
: ;!
< =!
> ?!
@ "RHH&A!
 !
 !
D  	8O9L6L6!,//!">?? r8   c                      |j         d         } ||j         d         n|j         d         }!|                    | |!gt          j                  gS rI  r  )"r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  rz  r  r  r  r  r  r   r{  r+  r  rK  r   s"                                     r9    _fake_trtllm_fp4_block_scale_moezEget_trtllm_moe_sm100_module.<locals>._fake_trtllm_fp4_block_scale_moeX  sR    F  %a(06m)!,,FLQRO''+(>en'UUVVr8   )flashinfer::trtllm_mxint4_block_scale_moec                 2   | j         }|j        d         }|j         t          j        k    r|dz  }|j        d         }t          j        ||t          j        |j                  }t          j        ||||j                  }|t          |j                  }|'t          j        ||t          j        |j                  }t          j
                    }#                    |           t          j        }t          j        } #||||d||t          j        t"          j        d
  
        }#j        }|| |||g} |                    d|g|| |
||||||||	||||||	          \  }!}"$                    | |||||||||	|
|||||||||||"dk    rddgn|"           |S )
Nr   r   r   r5  FTr  r  )r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   )r|   r   r   r   r;  r<  r   r   r   r   r6  r  rH   rf   rt   r   r   r   r   r  r8  r  )%r  r  r  r  r  r  r  r  r  r  r  r   r  r  rz  r  rx  r  r  r   r+  r  r  r   r  r  r  r>  r   r   r?  r  r   r@  r  r   r  s%                                      r9    trtllm_mxint4_block_scale_moe_opzEget_trtllm_moe_sm100_module.<locals>.trtllm_mxint4_block_scale_moe_op  s(   8 ',#)"-%+--%/K"(+
 ;U[9M
 
 
 ]=;O
 
 
 +M,@AAJ>[n$+	  F &&':;;;"+	&-Y/'"#/'.&2 $
 
 

 #H
 $$7L#%' 3#!/' 3! 3"7 3!' % 
 
	6. 	,,!"RHH&-	
 	
 	
0 r8   c                 ~    |j         d         }|j         d         }|                    ||gt          j                  gS rI  r  )r  r  r  r  r  r  r  r  r  r  r  r   r  r  rz  r  r  r  r  r   r+  r  rK  r   s                           r9   #_fake_trtllm_mxint4_block_scale_moezHget_trtllm_moe_sm100_module.<locals>._fake_trtllm_mxint4_block_scale_moe  sA    2  %a(#)!,''+(>en'UUVVr8   )r  r  r  r  r  r   r   Nr   )r   N)Fr   Nr   )r   Fr   N)Nr   Nr   )NNr   )r   rQ  r   rS  get_library_pathr   r   r   r$  r   rM   r  r"  r    r   r   )r   r  r  r  r  r  r  r  r  r  r  r   r  s              @@r9   get_trtllm_moe_sm100_moduler    s   244F""$$Fs6224455666I I I I I I I IM I I IV
 %  * &*#'%d ddu|,d |d |	d
 |d d d #d SMd d !d d  (d !d "d  !d" TN#d$ !%d& 
'd d d d d d	 dL 344" &*#'#W WWu|,W |W |	W
 |W W W #W SMW W !W W !W "W W  TN!W" !#W W W 54W0 5  , $%%)#')i iiu|,i |i |	i
  %|i %*Li |i  %|i i i #i SMi i !i i   (!i" &*#i$ !%i& TN'i( !)i* 
+i i i i i i	 iV CDD& $%%)'W WWu|,W |W |	W
  %|W %*LW |W  %|W W W #W SMW W !W W   (!W" &*#W$ !%W& TN'W W W EDW4 0  . %*%)#'-t ttu|,t |t #\	t
 |t #\t |t #\t t t t #t SMt t !t  !t"  (#t$ !%t& "'t( )t* TN+t, !-t. 
/t t t t t t	 tl >??& $%$)%)+W WWu|,W |W #\	W
 |W #\W |W #\W W W W #W SMW W !W  !W"  (#W$ !%W& "'W( )W* TN+W W W @?W8 0  B &*)-#'Aw w .w5<(w !.w u|,	w
 |w &el3w |w #\w U\*w el+w U\*w $EL1w |w #\w U\*w  'u|4!w" $,EL#9#w$ 'u|4%w& 'w( )w* #+w, SM-w. /w0 !1w2 3w4  (5w6 !7w8 9w: TN;w< =w> &?w@ !AwB 
el	Cw w w w w w	 wr >??%W%W5<(%W !.%W u|,	%W
 |%W #\%W |%W #\%W U\*%W el+%W U\*%W $EL1%W |%W #\%W U\*%W  'u|4!%W" $,EL#9#%W$ 'u|4%%W& '%W( )%W* #+%W, SM-%W. /%W0 !1%W2 3%W4  (5%W6 !7%W8 9%W: ;%W< =%W> &?%W@ !A%W %W %W @?%WN 3  0 &*)-#'-w wwu|,w |w |	w
 #\w el+w U\*w $EL1w |w #\w w w #w SMw w  !!w" #w$  (%w& !'w( TN)w* &+w, !-w. 
el	/w w w w w w	 wr ABBWWu|,W |W |	W
 #\W el+W U\*W $EL1W |W #\W W W #W SMW W  !!W" #W$  (%W& !'W( )W* &+W, !-W W W CBW: *(J#@#@&F   r8   Tr  r  r  r  r  r  r   r  r  rz  r  r  r  r  r|  r}  c                 f    t                                          | |||||||||	|
|||||||          S )aA	  BF16 MoE operation with autotuning support.

    This function implements a bfloat16 Mixture of Experts layer using the TensorRT-LLM backend
    with automatic performance tuning for optimal tile size selection.

    Args:
        routing_logits: [seq_len, num_experts] tensor of routing logits.
            Supports float32 or bfloat16.
        routing_bias: Optional [num_experts] tensor of routing bias.
            Must be bfloat16 if provided.
        hidden_states: [seq_len, hidden_size] tensor of input hidden states.
            Must be bfloat16.
        gemm1_weights: [num_experts, 2*intermediate_size, hidden_size] tensor of first layer weights.
            Must be bfloat16.
        gemm2_weights: [num_experts, hidden_size, intermediate_size] tensor of second layer weights.
            Must be bfloat16.
        num_experts: Total number of experts.
        top_k: Number of experts to route to per token.
        n_group: Number of expert groups.
        topk_group: Number of groups to consider for top-k routing.
        intermediate_size: Size of intermediate layer.
        local_expert_offset: Offset of local experts in global expert space.
        local_num_experts: Number of experts handled by this device.
        routed_scaling_factor (Optional[float]): Scaling factor for routing (can be None for some routing methods)
        routing_method_type: Type of routing method to use (default: 0).
            - 0: Default (Softmax -> TopK)
            - 1: Renormalize (TopK -> Softmax)
            - 2: DeepSeekV3 (Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups -> Top8 experts)
            - 3: Llama4 (Top1 -> Sigmoid)
            - 4: RenormalizeNaive (Softmax -> TopK -> Renormalize)
        use_shuffled_weight: Whether to use shuffled weight layout for optimization (default: True).
        weight_layout: Weight layout format (default: WeightLayout.BlockMajorK).
            - 0: MajorK - K-major layout [Mn, K]
            - 1: MajorMn - M-major for A and N-major for B [K, Mn]
            - 2: BlockMajorK - Blocked along K dimension [K/blockK, Mn, blockK]
        enable_pdl: Whether to enable Programmatic Dependent Launch. Auto-enabled for >= sm90.
        tune_max_num_tokens: Maximum number of tokens for autotuning (default: 8192).

    Returns:
        torch.Tensor: Output tensor of shape [seq_len, hidden_size].
    )r  r  )r  r  r  r  r  r  r   r  r  rz  r  r  r  r  r|  r}  r   r  s                     r9   r  r  $  sY    | '((88%  r8   r  r  r  r  c                 j    t                                          | |||||||||	|
|||||||||          S )aF  FP8 per tensor scale MoE operation.

    Args:
        routing_logits: [seq_len, num_experts] tensor of routing logits
        routing_bias: [num_experts] tensor of routing bias
        hidden_states: [seq_len, hidden_size] tensor of input hidden states
        gemm1_weights: [num_experts, 2*intermediate_size, hidden_size] tensor of first layer weights
        output1_scales_scalar: [local_num_experts] tensor of first layer output scales
        output1_scales_gate_scalar: [local_num_experts] tensor of first layer gate scales
        gemm2_weights: [num_experts, hidden_size, intermediate_size] tensor of second layer weights
        output2_scales_scalar: [local_num_experts] tensor of second layer output scales
        num_experts: Total number of experts
        top_k: Number of experts to route to per token
        n_group: Number of expert groups
        topk_group: Number of groups to consider for top-k routing
        intermediate_size: Size of intermediate layer
        local_expert_offset: Offset of local experts in global expert space
        local_num_experts: Number of experts handled by this device
        routed_scaling_factor: Scaling factor for routing
        use_routing_scales_on_input: Whether to use routing scales on input
        routing_method_type: Type of routing method to use (default: 0)
        enable_pdl: Whether to enable Programmatic Dependent Launch (PDL). Auto-enabled for >= sm90.
        tune_max_num_tokens(int): Maximum number of tokens for tuning. (default: 8192)

    Returns:
        torch.Tensor: Output tensor of shape [seq_len, hidden_size]
    )r  r  )r  r  r  r  r  r  r  r  r  r   r  r  rz  r  r  r  r  r  r   r  s                       r9   r  r  x  s_    d '((HH"#)  r8   r  r  r  c                     t          j        |j        t           j        |j                  }t                                          | ||||||||||	|
||||||||||          S )a<  FP8 block scale MoE operation.

    Args:
        routing_logits: [seq_len, num_experts] tensor of routing logits
        routing_bias: [num_experts] tensor of routing bias
        hidden_states: [seq_len, hidden_size] tensor of input hidden states
        hidden_states_scale: [hidden_size//128, seq_len] tensor of hidden states block scales
        gemm1_weights: [num_experts, 2*intermediate_size, hidden_size] tensor of first layer weights
        gemm1_weights_scale: [num_experts, 2*intermediate_size//128, hidden_size//128] tensor of first layer block scales
        gemm2_weights: [num_experts, hidden_size, intermediate_size] tensor of second layer weights
        gemm2_weights_scale: [num_experts, hidden_size//128, intermediate_size//128] tensor of second layer block scales
        num_experts: Total number of experts
        top_k: Number of experts to route to per token
        n_group: Number of expert groups
        topk_group: Number of groups to consider for top-k routing
        intermediate_size: Size of intermediate layer
        local_expert_offset: Offset of local experts in global expert space
        local_num_experts: Number of experts handled by this device
        routed_scaling_factor: Scaling factor for routing
        routing_method_type: Type of routing method to use (default: 0)
        enable_pdl: Whether to enable Programmatic Dependent Launch (PDL). Auto-enabled for >= sm90.
        tune_max_num_tokens(int): Maximum number of tokens for tuning. (default: 8192)
    Returns:
        torch.Tensor: Output tensor of shape [seq_len, hidden_size]
    r5  )r   r;  r   r   r   r  r  )r  r  r  r  r  r  r  r  r  r   r  r  rz  r  r  r  r  r|  r}  r   r  r+  s                         r9   r  r    s    b [5>-:N  F '((CC-  r8   r  r  r  r  r  r  r  r  r  r{  c                      t                      j        g | dd|||||||||	|
|||||||||||||||||||R  S )a
  FP4 block scale MoE operation.

    Args:
        routing_logits (torch.Tensor): shape [seq_len, num_experts]
            Input tensor of routing logits. Supports float32, bfloat16.
        routing_bias (Optional[torch.Tensor]): shape [num_experts]
            Tensor of routing bias. Can be None for some routing methods. Must be the same type as routing logits.
        hidden_states (torch.Tensor): shape [seq_len, hidden_size // 2 if nvfp4 else hidden_size]
            Tensor of input hidden states. Supports bfloat16, mxfp8, and nvfp4 (packed into uint8)
        hidden_states_scale (Optional[torch.Tensor]): shape [seq_len, hidden_size // (32 if mxfp8, 16 if mxfp4)]
            Scale tensor of mxfp8 / nvfp4 hidden states. Dtype must be float8.
        gemm1_weights (torch.Tensor): shape [num_experts, 2 * intermediate_size, hidden_size // 2]
            Tensor of FC1 weights. Dtype must be uint8 (packed fp4)
        gemm1_weights_scale (torch.Tensor): shape [num_experts, 2 * intermediate_size, hidden_size // (32 if mxfp4 else 16)]
            Scale tensor of FC1 weights. Dtype must be float8.
        gemm1_bias (Optional[torch.Tensor]): shape [num_experts, 2 * intermediate_size]
            Tensor of FC1 biases. Dtype is float32.
        gemm1_alpha (Optional[torch.Tensor]): shape [num_experts]
            Tensor of swiglu alpha. Dtype is float32.
        gemm1_beta (Optional[torch.Tensor]): shape [num_experts]
            Tensor of swiglu beta. Dtype is float32.
        gemm1_clamp_limit (Optional[torch.Tensor]): shape [num_experts]
            Tensor of swiglu clamp limit. Dtype is float32.
        gemm2_weights (torch.Tensor): shape [num_experts, hidden_size, intermediate_size]
            Tensor of FC2 weights. Dtype must be uint8 (packed fp4)
        gemm2_weights_scale (torch.Tensor): shape [num_experts, hidden_size, intermediate_size // (32 if mxfp4 else 16)]
            Scale tensor of FC2 weights. Dtype must be float8.
        gemm2_bias (Optional[torch.Tensor]): shape [num_experts, hidden_size]
            Tensor of FC2 biases. Dtype is float32.
        output1_scale_scalar (Optional[torch.Tensor]): shape [local_num_experts]
            Tensor of scaling factors for first layer activation output
        output1_scale_gate_scalar (Optional[torch.Tensor]): shape [local_num_experts]
            Tensor of scaling factors for first layer gate output
        output2_scale_scalar (Optional[torch.Tensor]): shape [local_num_experts]
            Tensor of scaling factors for second layer output
        num_experts (int): Total number of experts
        top_k (int): Number of experts to route to per token
        n_group (Optional[int]): Number of expert groups (can be None for some routing methods)
        topk_group (Optional[int]): Number of groups to consider for top-k routing (can be None for some routing methods)
        intermediate_size (int): Size of intermediate layer
        local_expert_offset (int): Offset of local experts in global expert space
        local_num_experts (int): Number of experts handled by this device
        routed_scaling_factor (Optional[float]): Scaling factor for routing (can be None for some routing methods)
        routing_method_type (int): Type of routing method to use (default: 0)
            - 0: Default (Softmax -> TopK)
            - 1: Renormalize (TopK -> Softmax)
            - 2: DeepSeekV3 (Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups -> Top8 experts)
            - 3: Llama4 (Top1 -> Sigmoid)
            - 4: RenormalizeNaive (Softmax -> TopK -> Renormalize)
        do_finalize (bool): Whether to finalize the output (default: False)
        enable_pdl (Optional[bool]): Whether to enable Programmatic Dependent Launch (PDL). Auto-enabled for >= sm90.
        gated_act_type (int): Type of gated activation function (default: 0)
            - 0: SwiGlu
            - 1: GeGlu
        tune_max_num_tokens(int): Maximum number of tokens for tuning. (default: 8192)
        output (Optional[torch.Tensor]): shape [seq_len, hidden_size]
            Optional inplace output tensor.
    Returns:
        List[torch.Tensor]: List of output tensors. If do_finalize=True, returns the final MoE output.
            Otherwise, returns intermediate results (gemm2_output, expert_weights, expanded_idx_to_permuted_idx) that need further processing.
    Nr  r  )r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  rz  r  r  r  r  r  r   r{  r+  r  s                                 r9   r  r  	  sg   | D&((C !!! 	! 		!
 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	!  	!!" 	"#!$ 	%!& 	'!( 	)!* 	+!, 	-!. 	/!0 	1!2 	3!4 	5!6 	7!8 	9!: 	;!< 	=!> 	?!@ 	A! ! ! !r8   r  c                      t                      j        g d| d|||||||||	|
|||||||||||||||||||R  S )aS  FP4 block scale MoE operation.

    Args:
        topk_ids (torch.Tensor): shape [seq_len, top_k]
            Tensor of top-k indices and expert weights. Dtype must be int32.
            It must represent a packed value. The most significant 16/32 bits represent the score and
            the least significant 16 bits represent the index of the chosen expert (unsigned).
        routing_bias (Optional[torch.Tensor]): shape [num_experts]
            Tensor of routing bias. Can be None for some routing methods. Must be the same type as routing logits.
        hidden_states (torch.Tensor): shape [seq_len, hidden_size // 2 if nvfp4 else hidden_size]
            Tensor of input hidden states. Supports bfloat16, mxfp8, and nvfp4 (packed into uint8)
        hidden_states_scale (Optional[torch.Tensor]): shape [seq_len, hidden_size // (32 if mxfp8, 16 if mxfp4)]
            Scale tensor of mxfp8 / nvfp4 hidden states. Dtype must be float8.
        gemm1_weights (torch.Tensor): shape [num_experts, 2 * intermediate_size, hidden_size // 2]
            Tensor of FC1 weights. Dtype must be uint8 (packed fp4)
        gemm1_weights_scale (torch.Tensor): shape [num_experts, 2 * intermediate_size, hidden_size // (32 if mxfp4 else 16)]
            Scale tensor of FC1 weights. Dtype must be float8.
        gemm1_bias (Optional[torch.Tensor]): shape [num_experts, 2 * intermediate_size]
            Tensor of FC1 biases. Dtype is float32.
        gemm1_alpha (Optional[torch.Tensor]): shape [num_experts]
            Tensor of swiglu alpha. Dtype is float32.
        gemm1_beta (Optional[torch.Tensor]): shape [num_experts]
            Tensor of swiglu beta. Dtype is float32.
        gemm1_clamp_limit (Optional[torch.Tensor]): shape [num_experts]
            Tensor of swiglu clamp limit. Dtype is float32.
        gemm2_weights (torch.Tensor): shape [num_experts, hidden_size, intermediate_size]
            Tensor of FC2 weights. Dtype must be uint8 (packed fp4)
        gemm2_weights_scale (torch.Tensor): shape [num_experts, hidden_size, intermediate_size // (32 if mxfp4 else 16)]
            Scale tensor of FC2 weights. Dtype must be float8.
        gemm2_bias (Optional[torch.Tensor]): shape [num_experts, hidden_size]
            Tensor of FC2 biases. Dtype is float32.
        output1_scale_scalar (Optional[torch.Tensor]): shape [local_num_experts]
            Tensor of scaling factors for first layer activation output
        output1_scale_gate_scalar (Optional[torch.Tensor]): shape [local_num_experts]
            Tensor of scaling factors for first layer gate output
        output2_scale_scalar (Optional[torch.Tensor]): shape [local_num_experts]
            Tensor of scaling factors for second layer output
        num_experts (int): Total number of experts
        top_k (int): Number of experts to route to per token
        n_group (Optional[int]): Number of expert groups (can be None for some routing methods)
        topk_group (Optional[int]): Number of groups to consider for top-k routing (can be None for some routing methods)
        intermediate_size (int): Size of intermediate layer
        local_expert_offset (int): Offset of local experts in global expert space
        local_num_experts (int): Number of experts handled by this device
        routed_scaling_factor (Optional[float]): Scaling factor for routing (can be None for some routing methods)
        routing_method_type (int): Type of routing method to use (default: 0)
            - 0: Default (Softmax -> TopK)
            - 1: Renormalize (TopK -> Softmax)
            - 2: DeepSeekV3 (Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups -> Top8 experts)
            - 3: Llama4 (Top1 -> Sigmoid)
            - 4: RenormalizeNaive (Softmax -> TopK -> Renormalize)
        do_finalize (bool): Whether to finalize the output (default: False)
        gated_act_type (int): Type of gated activation function (default: 0)
            - 0: SwiGlu
            - 1: GeGlu
        tune_max_num_tokens(int): Maximum number of tokens for tuning. (default: 8192)
        output (Optional[torch.Tensor]): shape [seq_len, hidden_size]
            Optional inplace output tensor.

    Returns:
        List[torch.Tensor]: List of output tensors. If do_finalize=True, returns the final MoE output.
            Otherwise, returns intermediate results (gemm2_output, expert_weights, expanded_idx_to_permuted_idx) that need further processing.
    Nr  )r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  rz  r  r  r  r  r  r   r{  r+  r  s                                 r9   !trtllm_fp4_block_scale_routed_moer  	  sg   @ D&((C !!! 	! 		!
 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	!  	!!" 	"#!$ 	%!& 	'!( 	)!* 	+!, 	-!. 	/!0 	1!2 	3!4 	5!6 	7!8 	9!: 	;!< 	=!> 	?!@ 	A! ! ! !r8   c                 n    t                                          | |||||||||	|
|||||||||||          S )aN  MxInt4 block scale MoE operation.

    Args:
        routing_logits (torch.Tensor): shape [seq_len, num_experts]
            Input tensor of routing logits. Supports float32, bfloat16.
        routing_bias: Optional [num_experts] tensor of routing bias.
            Must be bfloat16 if provided.
        hidden_states (torch.Tensor): shape [seq_len, hidden_size]
            Tensor of input hidden states. Supports bfloat16.
        gemm1_weights (torch.Tensor): shape [num_experts, 2 * intermediate_size, hidden_size // 2]
            Tensor of FC1 weights. Dtype must be uint8 (packed mxint4)
        gemm1_weights_scale (torch.Tensor): shape [num_experts, 2 * intermediate_size, hidden_size // 32]
            Scale tensor of FC1 weights. Dtype must be bfloat16.
        gemm1_alpha (Optional[torch.Tensor]): shape [num_experts]
            Tensor of swiglu alpha. Dtype is float32.
        gemm1_beta (Optional[torch.Tensor]): shape [num_experts]
            Tensor of swiglu beta. Dtype is float32.
        gemm1_clamp_limit (Optional[torch.Tensor]): shape [num_experts]
            Tensor of swiglu clamp limit. Dtype is float32.
        gemm2_weights (torch.Tensor): shape [num_experts, hidden_size, intermediate_size]
            Tensor of FC2 weights. Dtype must be uint8 (packed mxint4)
        gemm2_weights_scale (torch.Tensor): shape [num_experts, hidden_size, intermediate_size // 32]
            Scale tensor of FC2 weights. Dtype must be bfloat16.
        num_experts (int): Total number of experts
        top_k (int): Number of experts to route to per token
        n_group (Optional[int]): Number of expert groups (can be None for some routing methods)
        topk_group (Optional[int]): Number of groups to consider for top-k routing (can be None for some routing methods)
        intermediate_size (int): Size of intermediate layer
        local_expert_offset (int): Offset of local experts in global expert space
        local_num_experts (int): Number of experts handled by this device
        routed_scaling_factor (Optional[float]): Scaling factor for routing (can be None for some routing methods)
        routing_method_type (int): Type of routing method to use (default: 0)
            - 0: Default (Softmax -> TopK)
            - 1: Renormalize (TopK -> Softmax)
            - 2: DeepSeekV3 (Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups -> Top8 experts)
            - 3: Llama4 (Top1 -> Sigmoid)
            - 4: RenormalizeNaive (Softmax -> TopK -> Renormalize)
        enable_pdl (Optional[bool]): Whether to enable Programmatic Dependent Launch (PDL). Auto-enabled for >= sm90.
        tune_max_num_tokens(int): Maximum number of tokens for tuning. (default: 8192)
        output (Optional[torch.Tensor]): shape [seq_len, hidden_size]
            Optional inplace output tensor.
    Returns:
        torch.Tensor: returns the final MoE output.
    )r  r  )r  r  r  r  r  r  r  r  r  r  r  r   r  r  rz  r  r  r  r  r   r+  r  s                         r9   r  r  
  se    J '((FF-  r8   r   )r   Fr  )r   Fr   Nr   )r   TNr   Nr   )r   NNr   )L__doc__r&  enumr   typesr   typingr   r   r   r   r	   r
   r   api_loggingr   	autotunerr   r   r   r   r   jit.cpp_extr   jit.corer   rR  r   jit.fused_moer   r   r   r   r   r   utilsr   r   r   r   r   r    r!   r#   r$   r&   r;   rH   r"  r   r$  r   r   r   cacherS  r   rM   r   r   r   r   r   rX  rA   r|   rG  r  r   r  r  r  r  r  r  r  r7   r8   r9   <module>r     s
               ! ! ! ! ! ! : : : : : : : : : : : : : : : :  ( ( ( ( ( (              3 2 2 2 2 2                                                         $	 	 	 	 	W 	 	 	# # # # #W # # #L
n 
 
 
 
 
|$U\2   0    7       7     #' ! 3- 
	   J )-	 l  49%	
 \   B )-	 <  49%	
 \   2 %,        8  S%, S S S S S S  # t    F  1504'++/*.+/%)!).!&"'"$#!%&4&;;] ]<]!L] ] 	]
 ] +] u|$]  -]  -] u|$] 5<(] %,'] 5<(] ] ]  !]" #]$ %]& ']( U\")]* +], #'-]. /]0  1]2 3]4 5]6 7]8 9]: $;]< \=] ] ] ]F z z zz#  .2  $%1#%P PLP5<(P <P <	P
 <P P P c]P P P P P $E?P P P  !P" #P$ %P& \'P P P Pf &  !!%#)F FLF5<(F <F <	F
 !<F !&F <F !<F F F c]F F F F F  $E?!F" "&#F$ %F& 'F( )F* \+F F F FR $  ! %!%#+J JLJ5<(J <J 	J
 <J J <J J J J c]J J J J J  $E?!J" #J$ %J& 'J( )J* +J, \-J J J JZ 4  !!%%)#=~ ~L~5<(~ <~ "%,/	~
 <~ ~ &~ %,'~ &~  -~ <~ ~ &~ #5<0~  (5~  #5<0!~" #~$ %~& c]'~( )~* +~, -~. /~0 $E?1~2 3~4 5~6 7~8 9~: U\";~< =~> 
%,?~ ~ ~ ~B 4  !!%%)#=@ @l@5<(@ <@ "%,/	@
 <@ @ &@ %,'@ &@  -@ <@ @ &@ #5<0@  (5@  #5<0!@" #@$ %@& c]'@( )@* +@, -@. /@0 $E?1@2 3@4 5@6 7@8 9@: U\";@< =@> 
%,?@ @ @ @F (  !!%%)#-[ [L[5<([ <[ <	[
 [ %,'[ &[  -[ <[ [ [ [ c][ [ [  ![" #[$ $E?%[& '[( )[* U\"+[, -[. 
%,/[ [ [ [ [ [r8   