
    .`iq                     T   d dl mZ d dlZd dlZd dlZd dlZd dlmZmZ d dl	Z	d dl
mZ d dlmZmZ d dlmZmZmZ d dlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZm Z m!Z! ddl"m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z(  ed           G d d                      Z) ed           G d d                      Z* ed           G d d                      Z+ G d de          Z,e)Z- e.            a/e)0                                e)0                                fde)de)fdZ1dej%        fdZ2d Z3 ed           G d  d!                      Z4e G d" d#                      Z5d$ Z6e G d% d&                      Z7d' Z8d(e7fd)Z9d* Z:	 	 	 dAd,ej%        d-ej%        d.ej%        d/ej%        d0edz  d1edz  d2ej%        dz  d3e;d4e;fd5Z<d6 Z=	 	 	 	 	 	 	 	 	 	 dBd7edz  d8edz  d9edz  d:e5dz  d;ej%        dz  d<ej%        dz  d=e>dz  d>ej%        dz  de*dz  de+dz  fd?Z?	 	 	 	 	 	 	 dCd7ed8ed9ed:e5fd@Z@dS )D    )	dataclassN)Enumauto)target_info)
InFlexDataOutFlexData)
GatherIndxRoutingDataScatterIndx)is_cuda   )_matmul_ogs)_p_matmul_ogs"get_per_device_per_stream_alloc_fn)_reduce_grouped)MXFP_BLOCK_SIZE)make_opt_flagsupdate_opt_flags_constraintsInapplicableConstraint)
specialize)StorageTensorFP4bitwidthwrap_torch_tensorT)frozenc                   z    e Zd ZU eed<   ded<   ee         ed<    e            Zee         ed<   ed             ZdS )FnSpecsnameztriton.runtime.jit.JITFunctionfnfn_arg_namesfn_arg_do_not_specializec                  <    t          dd t                                S )Ndflt)r   tuple     ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/third_party/triton_kernels/matmul_ogs.pydefaultzFnSpecs.default    s    vtUWW---r'   N)	__name__
__module____qualname__str__annotations__r%   r"   staticmethodr)   r&   r'   r(   r   r      so         
III((((*+0577eCj222. . \. . .r'   r   c                   z    e Zd ZU e                                Zeed<    e            Zee	         ed<   dZ
eed<   dS )FusedActivationspecsfn_argsr   reduction_nN)r*   r+   r,   r   r)   r2   r.   r%   r3   objectr4   intr&   r'   r(   r1   r1   %   sV         __&&E7&&&"UWWGU6]$$$Kr'   r1   c                       e Zd ZU e                                Zeed<    e            Zee	         ed<    e            Z
ee	         ed<   dZeed<   dS )Epiloguer2   fn_arg_values_matmulfn_arg_values_finalizeNeffective_itemsize)r*   r+   r,   r   r)   r2   r.   r%   r9   r5   r:   r;   floatr&   r'   r(   r8   r8   ,   ss         __&&E7&&&*/%''%-111,1EGGE&M333 $$$$$$r'   r8   c                   "    e Zd Z e            ZdS )FnNameN)r*   r+   r,   r   QUANTIZE_MXFP8r&   r'   r(   r>   r>   3   s        TVVNNNr'   r>   epiloguefused_activationc                    |j         | j         f}|t          v rt          |         S |j        | j        d}|j        | j        d}|j        | j        z   }dd l}|                    dd                    |                     }|t          j	        |j
        <   t          t          ||||          |_        t          t          ||||          |_        t          t          ||||          |_        |t          |<   |S )N)ACTIVATION_FNEPILOGUE_FN)activation_fn_argsepilogue_fn_argsr   matmul_ogs__)do_not_specialize)r   _kernelsr    r!   r"   types
ModuleTypejoinsysmodulesr*   r   r   r   r   )r@   rA   keyspec_constantsspec_tuplesrI   rK   modules           r(   get_kernelsrT   <   s#    (-
0C
h}),{ N
 /;$1 K )AHDeeLLL;CHHSMM;;<<F#)CK #K6GI I IF%mV^[8IK K KF'Q\:KM M MFHSMMr'   tensorc                     d}d}t          | j                  D ]+}|| j        |         dz
  |                     |          z  z  },||k    S )Nir   r   )rangendimshapestride)rU   	max_int32offsetis       r(   can_overflow_int32r^   ]   sX    IF6; ; ;6<?Q&&--*:*:::Ir'   c                  4    t          d | D                       S )Nc              3   <   K   | ]}|d uot          |          V  d S N)r^   ).0rU   s     r(   	<genexpr>z(should_upcast_indices.<locals>.<genexpr>f   s5      TTVvT!@&8&@&@TTTTTTr'   )any)argss    r(   should_upcast_indicesrf   e   s    TTtTTTTTTr'   c                   j    e Zd ZU  e            Zeed<    e            Zeed<    e            Zeed<   dS )FlexCtxlhs_datarhs_dataout_dataN)	r*   r+   r,   r   ri   r.   rj   r   rk   r&   r'   r(   rh   rh   o   sT         %:<<Hj'''%:<<Hj''''KMMHk)))))r'   rh   c                       e Zd ZU dZeed<   dZeed<    e            Z	eed<   dZ
eed<   dZeed	<   dZeed
<   dZedz  ed<   dZedz  ed<   dZedz  ed<   dZej        ed<   dZeed<   dS )PrecisionConfigNmax_num_imprecise_accT
allow_tf32flex_ctxg      ?	acc_scaleFflexpoint_saturate_infreport_quantization_err_fn	act_scaleweight_scale	out_scale	out_dtypeenforce_bitwise_invariance)r*   r+   r,   rn   r6   r.   ro   boolrh   rp   rq   rr   rs   callablert   r   ru   rv   rw   torchdtyperx   r&   r'   r(   rm   rm   u   s         !%3%%%J		Hg!!!Is#(D(((+////#Iv}###!%L&$,%%%#Iv}###!Iu{!!!',,,,,,r'   rm   c                 f    t          j        dd          r| j        d uo|j        dk    o|j        S dS )N
   r   @   F)r   cuda_capability_geqru   block_mis_persistent)precision_config	opt_flagss     r(   get_swap_xwr      sB    &r1-- q,D8pY=NRT=TpYbYpp5r'   c                   d    e Zd ZU eed<   eee         ej        f         ed<   e	eef         ed<   dS )MatmulAllocationdeviceoutputscratchpadsN)
r*   r+   r,   r-   r.   r%   r6   r{   r|   dictr&   r'   r(   r   r      sL         KKK%*ek)****c5j!!!!!!r'   r   c                 d   |j         d         }| j         d         }	||j        j         d         }	|j        dk    s||	}
n|j        j         d         |j        z  }|}
| j        dk    r| j         d         nd}||
||j        z  f}|j        p| j        }||f}t                      }|j        dk    s	|0|j	        s)|j        dk    rt          j        n|}|j        d|	|f|f|d<   d|v r9|j        2|j        d|	t          j        |t                    ft          j        f|d<   t#          | j        ||          S )Nr   r      matmulmx_out_scale)rY   src_indxn_expts_actrX   r4   rw   r|   r   split_kfused_scatterr{   float32rv   tritoncdivr   uint8r   r   )xwr   rA   routing_datagather_indxscatter_indxr   NMy_rowsMc	batch_dim	out_shaperw   r   
scratchpadscratch_out_dtypes                     r(   init_allocationr      sX   	A	A &q)1$$(<"(+|/GGfkk

qIFA)9)E$EFI *5agI#FJ1!9)BY!9-6->-B-BEMM	!*!2Aq! <>OP
8:"2"<"H'0'8!QA@_@_&`bgbm%n
>"AHfj999r'   
allocationc                 2    t                      }|3t          j         j        d          j         j        d                   }n|j         j        d         k    sJ |d d d d d f         |d<    fd j                                        D             |d<   |S )Nr   r   r   r|   r   c           	      h    i | ].\  }}|t          j        |d          j        |d                   /S )r   r   r   )r{   emptyr   )rb   kvr   s      r(   
<dictcomp>z$apply_allocation.<locals>.<dictcomp>   sK       1 	
5;qtJ$5QqTBBB  r'   r   )r   r{   r   r   r   rY   r   items)r   r   rets   `  r(   apply_allocationr      s    
&&C~Z.q1*:KS]SdefSghhh|z0333334AAA:&CM   ".4466  C Jr'   c                    || j         j        k    sJ dg|| j         j        z
  z  t          | j         j                  z   }| j                             |          }|                    d          g|| j         j        z
  z  t          | j                                                   z   }| j                             ||          }||                    |          }t          || j	                  S )Nr   r   )
datarX   listrY   viewrZ   
as_stridedreinterpretr   layout)storageout_ndim	flex_datanew_storage_shapenew_storage_viewnew_storage_stridenew_storage_datas          r(   _canonicalize_storager      s    w|((((( x',*;;<tGLDV?W?WW|(():;;*11!445GLDU9UVY]^e^j^q^q^s^sYtYtt|../@BTUU$001ABB#W^444r'   Fr   indxoutout_mx_scalex_flexout_flex
x_mx_scalerw   rr   c                 .   |(| j         d         dk    r|                     d          dfS ||j         d         }n| j         d         }|t                      }|t                      }|dn|j         d         }|	| j        n|	}	| j         d         |j        z  dk    sJ d}|dn|j        }|dn|j        }|dn|j        }|dn|j	        }|dn|
                    d          }|dn|
                    d          }|dn|
                    d          }t          |j        |j                  } |j        |f         |                    |           | 
                    d          | 
                    d          | 
                    d          ||                    |          |
                    d          |
                    d          ||||| j         d         | j         d         |||||g|j        |j        |j        R |du|du|
||dd	 ||fS )
a~  
    In-place grouped row reduction.

    Arguments
    - x: Tensor[AnyFloat] of shape [(num_groups * K), N]
    - indx: Tensor[Int] of shape [num_groups, K]

    Description
    For each group g in [0, num_groups), this routine sums the K rows of `x`
    specified by `indx[g, :]` and overwrites the row corresponding to the first
    valid (non-negative) index with the per-group sum. Accumulation is performed
    in float32 for numerical stability, and the result is written back in the
    dtype of `x`.

    Behavior and edge cases
    - Invalid (-1) entries are skipped during accumulation and do not generate
      memory traffic. If a group has no valid entries, nothing is written for
      that group.
    - Reduction is performed tile-by-tile along the N dimension within a single
      kernel launch (persistent along N) to minimize launch overhead.

    Performance notes
    - Memory traffic per group is approximately (valid_rows_read + 1) * N * sizeof(x),
      plus index reads. With no invalid entries, this becomes (K + 1) reads/writes
      of length N per group.

    Returns
    - The input tensor `x` (modified in place).
    Nr   r   r   r   i      r   )HAS_IN_MX_SCALEHAS_OUT_MX_SCALEFLEXPOINT_SATURATE_INFBLOCK_NK	num_warps)rY   squeezer   r   r|   r4   scaleexpected_scaleactual_scalechecksum_scalerZ   rT   r2   r   r   r3   r:   )r   r   r   r   rA   r@   r   r   r   rw   rr   
num_groupsr   r   x_expected_scaleout_expected_scaleout_actual_scaleout_checksum_scale
stride_mxb
stride_mxsstride_omxskernelss                         r(   reduce_groupedr      sf   D |
ayy||T!!Z]

WR[
~==\tz!}A$,)I72;)55::::G%~tt6<!)!1x7N'/ttX5J!)!1x7N (j.?.?.B.BJ (j.?.?.B.BJ#+!!1D1DQ1G1GK(.*:*@AAG+GZN+1qxx{{AHHQKK!S!!3::a==#**Q--,.@$	
AGBKJ
k 
	! $4#? 
	(  #$.UYAY51    r'   c                 (    t          d| i           dS )z;
    persistent kernels will leave `num_idle_sms` idle
    idle_smsN)r   )num_idle_smss    r(   matmul_ogs_set_idle_smsr   %  s     !*l!;<<<<<r'   r   r   r   r   betasgammas	out_alphayc                 $   | j         dk    }|rM|
J d            |
J d            |
J d            |j         dk    r|j        d         | j        d         k    sJ |t                      }|4t          t                                          t                      d          }|At          t                                          t                      t                      d          }|+t          ddt          d|j        d                   d          }|j
        }|du}t                      o-t          j        d	d           ot          |j                  d
k    }|r#|                    d          dk    s
J d            t#          |t$                    s4|j        t&          j        k    rt*          n|j        }t-          ||          }|$t#          |t$                    st%          |          }|?|j                            t&          j                  |j        _        t&          j        |_        |j        }|du}|r#|                     d          dk    s
J d            |$t#          |t$                    st%          |          }t#          | t$                    st%          | | j                  } |du}|du}|j        du}|| j        d         n|j        j        d         }|j        |j         dk    r|j        d         nd}|j        dd         \  }}|| j        d         k    sJ | j         dk    r)|j         dk    r| j        d         |j        d         k    sJ |j        p| j        }|                                 dk    of| j                                        oM|                                dk    o5|j                                        o|du p|j                                        }|o?t&          j         !                                d         dk    pt          |j                  dk    }|o&|j"        j#        du o|j"        j#        du o
|j$        dk    }tK          || j        |j        ||||||||j&                  }|s|j'        rtQ          d          |)|j)        r"t          j*                    stW          d          |:|j        j,        j-        )|j)        s"t          j*                    rtW          d          |}t                      } |j.        dk    s	||j'        s| |} }t_          | |||||||          }!ta          |!|
          }"||z  |z  dk    r4|"d         1                    d          }#|s|#1                    d          }#|#S |j)        r&te          j3        ti          | j5                             d|"d         v }$|"d         6                    d|"d                   }%|%j        t&          j7        k    rtq                      n|j9        j:        }&|j;        }'|'>|'j                            t&          j                  }'|$rd|"d         v r|"d         d         }'|'duo|%<                                dk    }(|j9        })|dn|                    d          }*|dn|j        j        d         }+|j=        },|j>        }-|,dn|,j?        }.|,dn|,j@        |-         d         }/|,dn|,jA        }0|,dn|,jB        |-         }1te          jC        ||j>                  }2|1|D                    ||j>                  }2te          jC        ||jE                  }3||2z  |3z  |j.        z  }4|j)        r)t          t          jG                    |jH        z
  |4          n|4}5|ot          jI                    }6|j'        ot          jI                    }7t-          |j'        r@|%                    t          jK        |%j        dd                   |%j        d                   n9 |%j        t          jK        |%j        dd                   g|%j        dd         R            }
t          | j        |6rdnd|)jM                  }8t          |j        d|)jN                  }9t          |
j        |7rdnd|)j:                  }:|j)        o|6p| };|6r	d|jO        gnd|j>        |jO        g}<|;sdn|r|6sdnd}=|;r|8P                    |<|=          n|8j        }>|j)        o	|7p|j'         }?|jE        |jQ        z  |jR        z  }@|7rd|@gn	d|j>        |@g}A|?sdn|r|7sdnd}B|?r|:P                    |A|B          n|:j        }C|j)        }D|Dr#|9P                    d|jO        |jE        gd          n|9j        }E|}F|j)        o|du}G|Gr'|j        P                    |jE        |jO        gd          n|}Fdgd|8j        j         z
  z  t          |8j                                                  z   }H|r|                                nd}Iddt          |I          z
  z  |Iz   }I|r|Gs|                                nd}Jddt          |J          z
  z  |Jz   }J|(r|'                                nd}Kddt          |K          z
  z  |Kz   }Kt          |j"        |j"                  }L|9j                                        d         dk    }M |j)        r|LjV        n|LjW        |5f         g |C|:j        |%                                |(rd|'dfn|&|Kdd         |>|8j        |H|)jM        jX        |dn#|j                            t&          j                  |I|E|9j        |9j                                        |M|)jN        jX        |F|J||*| j        d         |j        | j        d         nd|||||dn|j        |dn|j        |+|j'        sdn|jY        |j'        sdn|jY        j        d         |.|0|/|1||2|3|	|jZ        |jR        |j[        |j\        |j$        |j]        |j^        |j_        |)jN        j`        |j>        |jE        |jO        |ja        R i d |jb        d!|j        j,        j-        d"|dn|j        j,        j-        d#|jQ        d$|j.        d%||jO        z  dk    d&|jc        d'|jd        d(|je        d)|jf        d*|jg        d+t          | ||%          d,|=d-|Bd.t          ||          d/|j"        j-        t          jk        j-        k    d0|j)        r|5nd|jl         ||j'        rdn|j                            d|j$                  }Nt          |%|N|"d         1                    d          |j;        | |t          |&j        |&jo        1          |j9        j:        |(r|'1                    d          nd|"d         j        |j_        2          \  }O}P|s|O1                    d          }O|P|P|_;        |OS )3zs
    Y[:, :] = 0.
    for e in num_experts:
        Y[idxs_y_m(e), :] += matmul(X[idxs_x_m(e), :], W[e, :, :])
    r   N$gather not supported in batched mode%scatter not supported in batched mode%routing not supported in batched moder   r   Fr~      r   zE`w` must be column-major when it has data-type FP8 on capability < 10r|   r   z0'x' must be row-major when it has data-type mxfp	      zFused scatter is not supportedz1Must use non-persistent kernel for simulated MXFPz?Must use persistent kernel and be TMA-compliant for native MXFPr   r   r   r   r   raggeddense)NNN)r   )NNNNXCD_SWIZZLESWIZZLE_MX_VALUESWIZZLE_MX_SCALEEPILOGUE_SUBTILESPLIT_KEVEN_KW_CACHE_MODIFIERTOKENS_PER_EXPT_FOR_ANNOTATIONr   
num_stagesarchUPCAST_INDICES
X_TMA_MODE
Y_TMA_MODESWAP_XWIS_EPILOGUE_QUANT_MXFP8NUM_SMS)r|   r   )r   r   r   rw   rr   )prX   rY   rm   r1   r   r)   r%   r8   r
   maxru   r   r   r   r   r|   rZ   
isinstancer   r{   r   r   r   r   r   r   rt   	expt_histr   rw   numelis_tma_compliantcudaget_device_capabilityr2   r    r   r   r;   r   r   r   has_native_mxfpNotImplementedErrorr   r   r   r   r   r   r   set_allocatorr   r   getr   r   rp   rk   rv   element_size	expt_datar   histtoken_offs_padtoken_offs_rawblock_pid_mapr   n_blocksblock_nminnum_smsr   has_tma_gathermathprodr   ri   rj   block_kmake_tmaepilogue_subtiler4   r   lenrT   r   r   r   dst_indxr3   r9   n_expts_totrn   ro   rr   is_per_batchgroup_mxcd_swizzlew_cache_modifierexpected_tokens_per_exptr   r   r   rf   r   r>   r?   target_kernel_kwargsr   r   r   )Qr   r   biasr   r   r   r   r   r   r   r   rA   r@   is_input_batchedw_scalew_has_mxis_hopper_fp8r|   x_scalex_has_mx
has_gatherhas_scatter	is_raggedr   
batch_sizer   r   rw   can_use_tmacan_use_fused_scatterr   matmul_fused_activationreduce_fused_activationr   memoryr   has_scratchpad
out_matmulout_matmul_flexout_matmul_scaleout_matmul_has_mxflexbias_stridenum_indxr  r   r  expt_hist_sumexpt_token_offs_rawexpt_block_pid_mapgrid_mgrid_nmax_gridgridhas_gather_tmahas_scatter_tma	x_storage	w_storage	y_storage	x_has_tmax_tma_block_size
x_tma_modex_tensor_or_tma	y_has_tmar  y_tma_block_size
y_tma_modey_tensor_or_tma	w_has_tmaw_tensor_or_tmaw_scale_tensor_or_tmaw_scale_has_tma	x_stridesx_scale_stridesw_scale_stridesout_matmul_scale_stridesr   w_transpose
group_indx	out_finalout_final_mx_scalesQ                                                                                    r(   
matmul_ogsr^  +  s   " v{ 8""$J"""##%L#####%L###v{{qwqzQWQZ7777*,,*7??+<+<eggqIIGOO--uwwGG"4s1agaj/A/A1EE+Gd"HIIgk&Eb!&L&L"LgQYZ[ZaQbQbfgQgMwQXXb\\Q...0w...a   .w%+--17au---:gv#>#>//&|00==(Gd"H])))+]))):gv#>#>//a   %1AG$$$D(Jd*K&d2I"*0D0J10MA+5=!&A++STJ7233<DAqv{{qv{{wqzQWQZ'''' *5agI''))a- IAI$>$>$@$@ I''))a-I$%I$>$>$@$@IT/GW_%E%E%G%G  g5:#C#C#E#Ea#H1#L#fPXYZY`PaPaefPfK'  T-=-C-F$-N  TU]UcUfjnUn  T  vB  vN  RS  vSy!'17<L	1a{,A8C^ I ! GY%< G$%EFFFy6{?Z?\?\!"UVVVw5:FyOfFkv  lG  lI  lIF!"cddd.-//1,":9CZ":;RTk!8 A'79Ik<< <Jj!,,FA~QX&&q)) 	!++a..C
 K?IIJJJ!55N%))(F84DEEJ'1'75='H'HkmmmN^NgNpO'1#+055ekBB 	Dn|0DDD%l3NC(4W9P9P9R9RVW9W$D,$$DKKNNK#+tt1F1LQ1OH&IG!)y~I%-DD93KG3TUW3XM"+"3$$9Q!*!2	8OPW8X[I-..F%&&q)*;<<[I-..FF"V+i.??HHQH_m3{"$$y'998DDDemD@K$>$@$@N-N+2L2N2NOene|  L*//$)J4DSbS4I*J*JJL\]_L`aaa  CR  CM  CR  SW  S\  ]g  ]m  nq  oq  nq  ]r  Ss  Ss  CL  v@  vF  GI  GJ  GJ  vK  CL  CL  CL  	M  	MA%ain1K!T][[I%aiDMBBI%aio1L1dm\\I'N^-M:~I1?n9,--aIZ\e\mEn&e	,e.,eHH^eJJSgi(()9:FFFYbYgO'\_-[ID[@[I9#==AXAddG'6[7||Q	@QSZ<[&f	,f/,fHH_fJJSgi(()9:FFFYbYgO'I`i}i((!Y->	@Q)RT[\\\oxo}O#-E'2EOjy  GW_55y7H)J[6\^efff  @GQ,,-Y^5J5J5L5L0M0MMI*2Jgnn&&&8JOq3#7#778?JO*2b?bgnn&&&PbOq3#7#778?JO<Mk/66888Sk$C0H,I,I(IJMee(.*A*GHHG
 .''))"-2KXi5NW7;NQUPWX 35"35$-N355?5F5F5H5H357H]d,d33o35 -RSS135 #	35 %.N	35 6?	35
 =&35 #?440A0A%+0N0N35 Ra35 #35 %.N35 6?^5J5J5L5L35 OZ35 =&35 )35 ,;35 35 %35 72;35 #/"8"@172;;d35 35 35 35 !35 '.44K4H35  (/44\5J!35" #35$  )6Q44L<Q%35&  )6Z44L<Q<WXY<Z'35( )35( 2)35( 4A)35( CU)35* +35*  &+35* (.+35, -35. ,3/35. 6M5X/350 11352  +3352 .:-E3354 $95356 $.7358 $:935: =-;35< $=35> $?35@ $A35B $C35 35 35 35D  )44E35F %&I$4$9$9G35H -4ODDAWA\I35J %.$>$>K35L %,,M35N i//144O35P %.$>$>Q35R 3?2W2WS35T '00U35V (22W35X "Y35Z #81j"I"I"I[35\ )j]35^ )j_35` ''7CCCa35b ,4>+>&BWB\+\+\c35d &/%<CTT!3g35 35 35j &-1H-lNcNhNhikmy  nF  OG  OGJ$2x  ##" 5_=[\\\!*32CM#++A..."(/F% % %!I!  )%%a((	%%7"r'   c                      j         dk    } j        j        dk    sJ |j        j        dk    sJ |rM|
J d            |
J d            |
J d            |j         dk    r|j        d          j        d         k    sJ |	d }	|
d }
||j         dk    r |j        dg|j        R  }|j         d	k    r |j        dg|j        R  } j         d	k    r  j        dg j        R   |t          d d |j        d         d          }|j        }|j        dk    rs|sq|j        }t          j
        |j        d         dz   t          j        
          }t          j        |d          |dd <   t          t          j        |                    }n& fdt!          |j        d                   D             }| j        d         n|j        j        d         }t          j
         j        d         ||j        d         f j         j                  }t'          |          D ]\  }\  }}|t          j        || j                  }n|j        ||         |z  }|r|nd}t          j         |	 ||d d f         t          j        ||d                                                    ||                                                   }|*||||d d f         n||d d f         |||d f         z  z  }|||||d f         z  } |
|          ||||d d f<   |s,|                    |j        d         |j        d	                   }||S |j        d         |z  }t          j
        ||j        d         ft          j         j                  }t'          |          D ]b\  }\  }}|j        ||         |z  }|dk    }|||         d d fxx         |||d d f         |d d f                                         z  cc<   c|S )Nr   r   r   r   r   r   c                     | S ra   r&   )r   idxs     r(   <lambda>z"matmul_ogs_torch.<locals>.<lambda>2  s     r'   c                     | S ra   r&   )r   s    r(   rb  z"matmul_ogs_torch.<locals>.<lambda>4  s    A r'   r   r   c                 .    g | ]}d j         d         gS )r   r   )rY   )rb   rH   r   s     r(   
<listcomp>z$matmul_ogs_torch.<locals>.<listcomp>E  s#    ;;;AAGAJ;;;r'   r   r   )r   r  )r|   r   )rX   r|   itemsizerY   r   r
   r   r  r  r{   zerosint32cumsumr   	itertoolspairwiserW   r  r   	enumeratearanger   r   r<   r   )r   r   r&  r   r   r   r   r   r   round_xround_yr'  r   sizesoffoffsn_rowsr   r]   lohira  batchr   dst_idxmsks   `                         r(   matmul_ogs_torchry     s    v{7a7a 8""$J"""##%L#####%L###v{{qwqzQWQZ7777""+DINNty(TZ(((v{{AF1qwv{{AF1qw"4qwqz1==*K!##,<#&k%+a.1,EK@@@,ua((ABBI&s++,,;;;;qwqz):):;;;&.QWQZZK4H4Nq4QFQWQZ5ahagVVVA  * *8B,r2ah777CC&r"u-<C%,1l771UC]#3U\"bQW5X5X5XYY__aaQ4::<<) )4111::DAAAJr"ud{AS4SSC6"R%+&&C$WS\\%B/ +FF171:qwqz**WQZ;&F
+vqwr{+5=
R
R
RC  < <8B'2.+=mGCL!!!O"R%(CF 3 9 9 ; ;;Jr'   )NNNNF)
NNNNNNNNNN)NNNNNNNN)Adataclassesr   rj  rN   r{   r   enumr   r   r  triton_kernelsr   triton_kernels.numericsr   r   triton_kernels.routingr	   r
   r   triton_kernels.target_infor   matmul_ogs_details._matmul_ogsr    matmul_ogs_details._p_matmul_ogsr   r   "matmul_ogs_details._reduce_groupedr   numerics_details.mxfpr   matmul_ogs_details.opt_flagsr   r   r   r   rU   r   r   r   r   r   r   r1   r8   r>   EpilogueSpecsr   rJ   r)   rT   r^   rf   rh   rm   r   r   r   r   r   ry   r   r   r<   r^  ry  r&   r'   r(   <module>r     s   " ! ! ! ! !     



            & & & & & & ; ; ; ; ; ; ; ; G G G G G G G G G G . . . . . . 7 7 7 7 7 7 _ _ _ _ _ _ _ _ ? ? ? ? ? ? 2 2 2 2 2 2 n n n n n n n n n n " " " " " " E E E E E E E E E E E E E E $. . . . . . . . $        $% % % % % % % %    T    466 %,OO$5$5SZSbSbSdSd  '     Bu|    U U U $* * * * * * * *
 - - - - - - - -   " " " " " " " "
: : :6!1    &5 5 5, 04Z^JO	I Iel I%, IU\ IY^Ye I%,I )4/I EJLSWDWI #	I DH	I I I I^= = = 370426:>,0-1)-(,:>+/o o(4/o&-o )4/o "14!7	o
 lT)o |d*o  $,o ,%o "14!7o "Do o o ol .2+/-159+/A A*A(A  +A $3	A A A A A Ar'   