
    )`i>                      b   U d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
 	 ddlmc mZ n# e$ rZ ed          edZ[ww xY wddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ  G d dej                  Z  G d dej                  Z! G d dej                  Z"dej#        fdZ$dej#        fdZ%dej#        fdZ&dej'        fdZ(d Z)de*de*de*fdZ+dej#        dej#        fdZ,	 	 ddej#        de*d e*d!e
e*e*f         d"e	e*         d#e-d$e	ej'                 dej#        fd%Z.	 	 ddej#        de*d e*d&e
e*e*e*f         d"e	e*         d'e-fd(Z/ ej0        d)          d*             Z1d+e2de2fd,Z3 ej0        d)          de-fd-            Z4 ej0        d)          d.ej'        d/ej'        de
e*e*e*f         fd0            Z5 G d1 d2          Z6 G d3 d4          Z7	 dd5e*d6e*d7e*d8e*d9e-de-fd:Z8d;e*d<e*de*fd=Z9d>e*d?e*d@ej'        fdAZ:d>e*d?e*d@ej;        fdBZ<d>e*d?e*dCe*dDe!dEe!dFe"d@ej'        dGej'        dHe*dIe6de7fdJZ= ej0        d)          dKe dLe*dMe*d e*d"e*dDe!dEe!dFe"d@ej'        dGej'        d8e*de
e*e*e*e*e*e6e7f         fdN            Z>ej?        ej@        jA        ejB        ej@        jC        ejD        ej@        jE        ejF        ej@        jG        ejH        ej@        jA        ejI        ej@        jC        ejJ        ej@        jK        ejL        ej@        jM        ejN        ej@        jO        ejP        ej@        jQ        ejR        ej@        jS        ejT        ej@        jA        ejU        ej@        jA        ejV        ej@        jA        ejW        ej@        jA        iZXeee2f         eYdO<   ejZ        j[        ejZ        j[        ejZ        j\        ejZ        j]        ejZ        j^        dPZ_dej#        dQe
ej`        dRf         dSe
ej`        dRf         dTe
eja        dRf         dUejZ        dejb        fdVZcdej#        dWe*dXe*dYe*dZe*d[e*d\e*dejb        fd]Zdd^e!dej#        d_e*d`e*d>e*dCe*dae*d"e*d\e*dejb        fdbZed^e!dej#        dce*d`e*d?e*dCe*dae*d"e*d\e*dejb        fddZfd^e"dej#        d_e*dce*d>e*d?e*dae*d"e*d\e*dejb        fdeZgd^e!dej#        dfe*d`e*dge*dCe*d"e*d\e*dejb        fdhZhdidjddkejR        dlej;        dmiZii Zj G dn do          Zkdp Zldqe2dre2dekfdsZmdLe*dMe*d e*dte*d"e*dDe!dEe!dFe"due2dvej'        fdwZndxej#        dyej#        dzej#        d{ej#        d|ej#        d}ej#        dDe!dEe!due2fd~Zodxej#        dyej#        dzej#        d{ej#        d|ej#        d}ej#        dDe!dEe!due2ddfdZpdLe*dMe*d e*de*dte*d"e*dDe!dEe!dFe"due2dvej'        fdZqdxej#        dyej#        dzej#        d{ej#        d|ej#        dej#        de*dDe!dEe!due2fdZrdxej#        dyej#        dzej#        d{ej#        d|ej#        dej#        de*dDe!dEe!due2ddfdZs eddg          	 	 dde
ej#        ej#        f         de
ej#        ej#        f         d|ej#        d}ej#        d&e	e
e*e*e*f                  due2de-fd            Zt ei et          	 	 dde
ej#        ej#        f         de
ej#        ej#        f         d|ej#        d}ej#        d&e	e
e*e*e*f                  due2ddfd            Zu eddg          	 	 dde
ej#        ej#        f         de
ej#        ej#        f         d|ej#        dej#        de*d&e	e
e*e*e*f                  due2de-fd            Zv ei ev          	 	 dde
ej#        ej#        f         de
ej#        ej#        f         d|ej#        dej#        de*d&e	e
e*e*e*f                  due2ddfd            Zw G d d          Zx ex            ZydS )a*  
MIT License

Copyright (c) 2025 DeepSeek

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
    N)AnyDictOptionalTuplez^Could not import the 'cuda' module. Please install cuda-python that matches your CUDA version.   )ArtifactPath)checkCudaErrors)	get_cubin)FLASHINFER_CUBIN_DIR)ceil_divround_upsupported_compute_capabilitybackend_requirementc                   &    e Zd ZdZdZdZdefdZdS )GemmTyper   r      returnc                 $    dddd| j                  S )NzGemmType::NormalzGemmType::GroupedContiguouszGemmType::GroupedMasked)r   r   r   valueselfs    h/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/deep_gemm.py__str__zGemmType.__str__=   s&    !,(
 
 *	 	    N)__name__
__module____qualname__NormalGroupedContiguousGroupedMaskedstrr    r   r   r   r   8   s?        FM      r   r   c                   .    e Zd ZdZdZd Zd ZdefdZdS )MajorTypeABr   r   c                      | j         dk    rdndS )Nr   r   r   r   s    r   shape_directionzMajorTypeAB.shape_directionI   s    J!OOqq+r   c                      | j         dk    rdndS Nr   r'   r   r   s    r   non_contiguous_dimzMajorTypeAB.non_contiguous_dimL       Z1__rr",r   r   c                 "    ddd| j                  S )Nzcute::UMMA::Major::Kzcute::UMMA::Major::MN)r   r   r   r   s    r   r   zMajorTypeAB.__str__O   s    ).EFFtzRRr   N)	r   r   r   KMajorMNMajorr(   r,   r"   r   r#   r   r   r%   r%   E   s^        FG, , ,- - -S S S S S S Sr   r%   c                       e Zd ZdZdZd ZdS )MajorTypeCDr   r   c                      | j         dk    rdndS r*   r   r   s    r   r,   zMajorTypeCD.non_contiguous_dimW   r-   r   N)r   r   r   NMajorMMajorr,   r#   r   r   r2   r2   S   s-        FF- - - - -r   r2   tc                 j   |                                  dv sJ |                                  dk    rL|                     d          |                     d          |                     d          z  k    s
J d            |                     d          dk    s|                     d          dk    sJ d S d S )Nr      r9   r   r+   r'   z-Grouped dimension cannot have abnormal strider   )dimstridesizer6   s    r   major_checkr>   [   s    5577fuuww!||xx{{affRjj166"::5555; 655 88B<<1 1 1 11 1 1r   c                     t          |            |                     d          dk    rt          j        nt          j        S Nr'   r   )r>   r;   r%   r/   r0   r=   s    r   get_major_type_abrA   d   s2    NNN!""!2!2;8KKr   c                     t          |            |                     d          dk    rt          j        nt          j        S r@   )r>   r;   r2   r4   r5   r=   s    r   get_major_type_cdrC   i   s2    NNN!""!2!2;8JJr   dtypec                 Z    t           j        dt           j        dt           j        di|          S )Nr   r      )torchfloat8_e4m3fnbfloat16floatrD   s    r   get_element_sizerL   n   s,    QQ 	 r   c                      dS N   r#   r#   r   r   %get_m_alignment_for_contiguous_layoutrP   v   s    3r   xelement_sizer   c                 F    d}||z  dk    sJ ||z  }t          | |          S )N   r   )r   )rQ   rR   tma_alignment_bytes	alignments       r   get_tma_aligned_sizerW   z   s8    -2222#|3IAy!!!r   c                    | j         t          j        k    r|                                 dv sJ |                     t          j                  dz	                      t          j                  }| j        d         | j        d         }}d}|                                 dk    r| 	                    d          d}} | j        d         }t          |d	          }t          |d	          }t          j        |||f| j        t          j        
          }||d d d |d |f<   |                    d                              t          j                                      |||d	z            }t          j        t          j        ||d	z  |f| j        t          j        
          dd          }	||	d d d d d d f<   |	d d d |d d f         }
|r|
                    d          n|
S )Nr8      r+   r'   Fr   r   TrF   )devicerD   rK   r   )rD   rG   rJ   r:   viewinttouint8shape	unsqueezerW   r   zerosrZ   	transposeemptysqueeze)rQ   ue8m0_tensormnk
remove_dimb
aligned_mn	aligned_kpadded
transposed	aligned_xs              r   'get_col_major_tma_aligned_packed_tensorro      s   7ek!!aeegg&7&7&77 FF59%%+//<<L GBKBJuuww!||A:	
A%b!,,JAI[!Z3AHEKXXXF%F111crc2A2:[[__!!	!2277:yTU~VVF Q	Q
3AHEIVVV		 J
 !Jqqq!!!QQQw111crc1119%I#-<9Q9<r   Fsfrf   rg   gran
num_groupstma_stride_check
type_checkc                 $   || j         |k    sJ | j         t          j        t          j        fv sJ |                                 t          |d u          dz   k    sJ ||                     d          |k    sJ |                     d          t          ||d                   k    sJ |                     d          t          ||d         | j         t          j        k    rdndz            k    sJ |r|D|                     d          |                     d          |                     d          z  k    sJ |                     d          dk    sJ |                     d          t          || 	                                          k    sJ | S )Nr   r+   r   r'   r   rF   )
rD   rG   rJ   r\   r:   r<   r   r;   rW   rR   )rp   rf   rg   rq   rr   rs   rt   s          r   check_sf_layoutrw      s    x:%%%% 8UY/////6688s:T122Q66666wwr{{j((((772;;(2tAw//////772;;(1d1gbh%+6M6MST&UVVVVVV  L!99R==BIIbMMBGGBKK$?????yy}}!!!!yy}} 4R9J9J K KKKKKIr   recipeis_sfac           	      4   ||rdnd         |d         f}| j         t          j        k    o|dk    ot                      dv p*| j         t          j        k    o|dk    ot                      dv }|st	          | ||||           | j         t          j        k    r|dk    rt                      dk    rt          | j         t          j        k    rE|dk    r?t                      dv r/t          |           } t	          | ||d|d	t          j        
          S | j         t          j        k    r|dk    rt                      dk    rt          | j         t          j        k    rw|dk    rqt                      dv ra|                     dt          j	        || j
                  dz            } t          |           } t	          | ||d|d	t          j        
          S |r t	          | ||d|d	t          j        
          S t          d| j         d|dt                                 )Nr   r   r   )r   rO   100a103a)rO   rO   )rf   rg   rq   rr   90aT)rf   rg   rq   rr   rs   rt   r+   rZ   rO   zUnknown cases: sf.dtype=z, gran=z, arch=)rD   rG   r\   get_device_archrw   rJ   NotImplementedErrorro   index_selectarangerZ   AssertionError)rp   rf   rg   rx   rr   ry   rq   should_skip_transforms           r   !transform_sf_into_required_layoutr      sx    %11A&q	2D 	EI 	2H	2!11
 	EI 	2J	2!11  ! JrQTjIIII 
x5;48#3#38I8IU8R8R!! 	EKH!1114R88!!y
 
 
 	
 
x5;4:#5#5/:K:Ku:T:T!! 	EKJ!111__Rb!C!C!Cs!JKK4R88!!y
 
 
 	
  

!!y
 
 
 	
 H"(HHtHH_5F5FHH  r   )maxsizec                  p    t           j                                        \  } }| dk    rdnd}| dz  |z    | S )N	   a 
   )rG   cudaget_device_capability)majorminorsuffixs      r   r   r     sD    :3355LE5QJJSSBFbj5 *&***r   sc                     t          j                    }|                    |                     d                     |                                dd         S )Nutf-8r      )hashlibmd5updateencode	hexdigest)r   r   s     r   hash_to_hexr     sB    
+--CJJqxx  !!!==??1R4  r   c                  2    ddddt                               S )NTF)r~   r|   r}   )r   r#   r   r   must_be_k_majorr     s,       	 r   	sfa_dtype	sfb_dtypec           
          | t           j        t           j        fv sJ dt           j        fddt           j        fddt           j        fddt           j        fddt           j        fdit                      |f         S )Nr~   )r   rO   rO   r|   )r   r   rO   r}   )rG   rJ   r\   r   )r   r   s     r   get_default_reciper   (  sy     ei00000	m	}	[	}	[ )$& &r   c                   6    e Zd ZdedefdZdefdZdefdZdS )	MulticastConfignum_multicastis_multicast_on_ac                 "    || _         || _        d S N)r   r   )r   r   r   s      r   __init__zMulticastConfig.__init__7  s    *!2r   block_mc                 P    t                      dk    sJ || j        r| j        ndz  S Nr~   r   r   r   r   )r   r   s     r   get_ab_load_block_mz#MulticastConfig.get_ab_load_block_m;  s4      E))))1GN4--QOOr   block_nc                 P    t                      dk    sJ || j        rdn| j        z  S r   r   )r   r   s     r   get_ab_load_block_nz#MulticastConfig.get_ab_load_block_n@  s3      E)))) 6N11D<NOOr   N)r   r   r   r\   boolr   r   r   r#   r   r   r   r   6  sx        3c 3d 3 3 3 3P3 P P P P
P3 P P P P P Pr   r   c                   &    e Zd ZdedededefdZdS )SharedMemoryConfig	smem_sizeswizzle_a_modeswizzle_b_modeswizzle_cd_modec                     || _         || _        || _        || _        d| _        | j        dk    sJ | j        dk    sJ | j        dk    sJ | j        dk    sJ d S )Nr   rT   )r   r   r   r   swizzle_sf_mode)r   r   r   r   r   s        r   r   zSharedMemoryConfig.__init__G  s     #,,. "a''''"a''''#b((((#q((((((r   N)r   r   r   r\   r   r#   r   r   r   r   F  sN        )) ) 	)
 ) ) ) ) ) )r   r   	shape_dim	block_dimr   num_smsrequire_divisiblec                 L    t          | |          |z  dk    p| }|o||z  dk    S Nr   r   )r   r   r   r   r   	divisibles         r   is_multicast_legalr   \  s@     	I&&6!;TCT?T  5=0A55r   
block_size	elem_sizec                 N    dD ]}| |z  |z  dk    r|c S t          d           dS )N)rO   @       rT   r   zInvalid mode)r   )r   r   
mode_bytess      r   get_swizzle_moder   i  sL     (  
"j0A55 6>"""1r   r   r   ab_dtypec                     d}| |z  dk    sJ t           j        dt           j        t          | |          t          ||          fi|         S )NrO   r   )r   r   )rG   rI   rH   r   )r   r   r   num_utccp_aligned_elemss       r   get_sf_aligned_block_sizesr   s  sa    !,,1111W566W566
  r   c                 T    t          | ||          \  }}d|z  |dz  z   |dz  z   dk    S )Nr   r      )r   )r   r   r   
sf_block_m
sf_block_ns        r   is_tmem_size_legalr     s;    7(SSJ
[Z2-.*2BCKKr   block_kmajor_amajor_bmajor_dcd_dtype
num_stagesmulticast_configc
                    |t           j        k    sJ t          |          }
t          |          }|	                    |           }|	                    |          }t          |t          j        k    r|n||
          }t          |t          j        k    r|n||
          }t          |t           j        k    r|n| |          }d}t          | |          |z  dz  }||z  |
z  }||z  |
z  }t          | ||          \  }}|dz  }|dz  }|dz  dz  dz   dz   }d}d}||z  }|||z  z  }|||z  z  }|||z  z  }|||z  z  }||z  }||z  }t          ||||          S )NrO   r   rF      r9   r   r   )r2   r4   rL   r   r   r   r%   r/   minr   r   )r   r   r   r   r   r   r   r   r   r   ab_elem_sizecd_elem_sizeload_block_mload_block_nr   r   r   layout_ad_msmem_dsmem_a_per_stagesmem_b_per_stager   r   smem_scales_a_per_stagesmem_scales_b_per_stagesmem_barriersmem_tmem_ptrr   s                               r   get_smem_configr     s    k(((((#H--L#H--L#77@@L#77@@L%k000lL N &k000lL N 'k000g| O K+&&81<F $g-<#g-< 8(SSJ
(1n(1n >A%	1A5LM II...I...I555I555III>>?  r   	gemm_typemnc                   
" |t           j        k    sJ |	t           j        t           j        fv sJ d }| t          j        k    rt                      f}n|t          j        k    rdnd}|t          j        k    rt          t          ddd                    nt          t          ddd                    }dt          |          z  }
fd"
fd}"
fd	}d
\  }}|D ]}|D ]}d} |||           |||          }}||||k     rd}nW||k    rQ |||          } |||          }||k    }||k    r-|||k    o||k     z  }|||k    o||k     z  }|||k    o||k    z  }|t          |||          z  }|r||fn||f\  }}||J t          dd          }dt          |d
d          o| t          j        k    d}||k    rdndD ]&}dk    r||         rt          d|dk              } n'd\  }}}t          t!          fdd                    }|D ])} t#          ||||||||	| |
  
        }|j        |k    r| } n*|J |J  |||          }t'          t'          |          t'          |          z  z  |          }!t'          |!|j                  |j        z  }!|!
k    sJ |!||||||fS )N)rO   )rO      rT   i  r   rO   c                     | dk    rn| S r   r#   )rQ   r   s    r   <lambda>z"get_best_configs.<locals>.<lambda>  s    Q!VV'' r   c                 p    | r2t          t          |           t          |          z  z            nd S r   r   )bmbnr   r   rr   r   s     r   r   z"get_best_configs.<locals>.<lambda>  s<    	!R8Ar??2Z?III r   c                 d     t          |           t          |          z  z  z            S r   r   )r   r   fix_wave_saturater   r   rr   r   s     r   r   z"get_best_configs.<locals>.<lambda>  s7    (9(9	!R8Ar??	*Z	77B) ) r   )NNFTr   r   )AB)r   r   r   r   )NNi  c                 2    | t          dz  d          k    S )NrO   r   )max)r   rg   s    r   r   z"get_best_configs.<locals>.<lambda>(  s    c!s(A... r   )r            rF   r9   r   r   )rG   rH   rI   rJ   r   r    rP   r%   r/   tuplerangerL   r   r   r   r   filterr   r   r   r   )#r   r   r   rg   rr   r   r   r   r   r   r   block_msblock_nsr   get_num_wavesget_last_wave_utilbest_block_mbest_block_nr   r   success	num_wavesbest_num_wavesutil	best_utilbest_multicast_configis_legalibest_num_stagesbest_smem_configsm100_capacitystage_candidatesr   num_min_smsr   s#    ````     `                       @r   get_best_configsr    s^    u*****44444 !%HH...9;;=$(:::66
 k((( 	eBR  !!!5S"%%&&  %h///G::::      M
        ",L,   	 	GGgw//lL99 &I
 $'~--n,,))'7;;..|\JJ	*9$$w,6Q7\;QQGw,6Q7\;QQGw,6Q7\;QQG)'7HEEEG&-O'""L,3O 'L,,3	8 #(@(@@ ,At44
 <GTBB )(	 H (,66ZZJ  888$3AqCx$@$@!E
 9K5O%~....0HII  '  
*!
 
 %77(OE 8 '''&&&
 lL99IL!!HQ$=$==
JI K 	3ABB

-	.  '!!!! 	 r   tmap_type_map)r   rT   r   r   rO   	gmem_dims.gmem_strides	smem_dimsswizzle_typec                    t          |          }t          |          |dz
  k    sJ t          |          |k    sJ t          | j                 }t          t	          j        |||                                 |||t	          j        d          f|z  t          j        j	        |t          j
        j        t          j        j                            }|S Nr   )lenr  rD   r	   cbdcuTensorMapEncodeTileddata_ptr
cuuint32_tCUtensorMapInterleaveCU_TENSOR_MAP_INTERLEAVE_NONECUtensorMapL2promotion"CU_TENSOR_MAP_L2_PROMOTION_L2_256BCUtensorMapFloatOOBfill!CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE)r6   r  r  r  r  num_dimstensor_dtype
tensor_maps           r   make_tma_xd_descr)  p  s     9~~H|1,,,,y>>X%%%% )L "JJLL^A 8+%C&I'I	
 	
 J r   gmem_inner_dimgmem_outer_dimsmem_inner_dimsmem_outer_dimgmem_outer_strideswizzle_modec                    |dk    r4||                                  z  dk    sJ ||                                  z  }t          j        |          t          j        |          f}t          j        ||                                  z            f}t          j        |          t          j        |          f}	t	          | |||	t
          |                   S r   )rR   r  
cuuint64_tr  r)  swizzle_type_map)
r6   r*  r+  r,  r-  r.  r/  r  r  r  s
             r   make_tma_2d_descr3    s     qann...!3333%)9)99//1O1OPIN#4q~~7G7G#GHHJL//1O1OPI	9lI/?/M  r   
major_typeshape_mshape_kouter_stridec	           	          |dk    r| t           j        k    sJ |||z  fd d |                                          \  }	}
||fd d |                                          \  }}t          ||	|
||||          S r  )r%   r/   r(   r3  )r4  r6   r5  r6  r   r   r7  rr   r/  r*  r+  r,  r-  s                r   make_tma_a_descr9    s     A~~[/////&-w/C%D'':%%'''&"NN '.w%78W8W:;U;U;W;W8W%X"NN	  r   shape_nc	           	          ||fd d |                                           }	|	d         |	d         |z  }}
||fd d |                                           \  }}t          ||
|||||          S )Nr   r   )r(   r3  )r4  r6   r:  r6  r   r   r7  rr   r/  	io_shapesr*  r+  r,  r-  s                 r   make_tma_b_descr=    s     '"#B#Bj&@&@&B&B#BCI&/lIaL:4MNN&-w%78W8W:;U;U;W;W8W%X"NN	  r   c	           
      v    | t           j        k    sJ d}	t          ||||z  |t          ||	          ||          S rN   )r2   r4   r3  r   )
r4  r6   r5  r:  r   r   r7  rr   r/  r   s
             r   make_tma_cd_descr?    sU     +++++ K	*G[!!  r   shape_mnblock_mnc           	          | t           j        k    sJ |dk    sJ t          ||                                          }t	          ||t          ||dz            |z  |d||          S )Nr   rF   r   )r%   r0   rW   rR   r3  r   )r4  r6   r@  r6  rA  r   rr   r/  s           r   make_tma_sf_descrC    s     ,,,,, 1 $Hann.>.>??H	'A+&&3	  r   Ttruefalsezcutlass::bfloat16_trJ   c                       e Zd ZdededdfdZdej        fdZddZe	de
eef         defd	            Ze	d
ej        de
eef         dej        fd            ZdS )SM100FP8GemmRuntimepathsymbolr   Nc                 `    || _         d | _        d | _        || _        t          j        | _        d S r   )rH  libkernelrI  r  cuLibraryUnload_cleanup_func)r   rH  rI  s      r   r   zSM100FP8GemmRuntime.__init__   s/    	 0r   c                 J   | j         t          | j        d          }t          t	          j        |g g dg g d                    | _        t          t	          j        | j        t          | j        d                              | _         | 	                    | j         |          S )Nr   )encodingr   )
rL  bytesrH  r	   r  cuLibraryLoadFromFilerK  cuLibraryGetKernelrI  launch)r   kwargsrH  s      r   __call__zSM100FP8GemmRuntime.__call__(  s    ;W555D&)$B2r1EE DH *&txt{W1U1U1UVV DK
 {{4;///r   c                     | j         ]t          | dd           }t          |          r?	  || j                    d S # t          $ r}t	          d|            Y d }~d S d }~ww xY wd S d S )NrN  z5Failed to delete SM100FP8GemmRuntime with exception: )rK  getattrcallable	Exceptionprint)r   cleanupes      r   __del__zSM100FP8GemmRuntime.__del__7  s    8dOT::G   WWGDH%%%%%  W W WURSUUVVVVVVVVVW  W Ws   ; 
A"AA"rU  c                    | d         t           j        t           j        fv sJ d                    g d| d          d| d          dd| d         v r| d	         nd
 dd| d         v r| d         nd
 dd| d         v r| d         nd
 d| d          d| d          d| d          d| d          d| d          d| d          d| d          d| d          d| d          d| d          d| d          d| d          dt          | d                   d| d          dt          | d                   dt          | d                   d          }|S )N
CD_DTYPE_Tr   a+  
#ifdef __CUDACC_RTC__
#include <deep_gemm/nvrtc_std.cuh>
#else
#include <cuda.h>
#include <string>
#endif

#include <deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh>

using namespace deep_gemm;

static void __instantiate_kernel() {
    auto ptr = reinterpret_cast<void*>(&sm100_fp8_gemm_1d1d_impl<
        MAJOR_Az
,
        MAJOR_Br   COMPILED_DIMSMr   r   Nrg   KBLOCK_MBLOCK_NBLOCK_K
NUM_GROUPSSWIZZLE_A_MODESWIZZLE_B_MODESWIZZLE_CD_MODE
NUM_STAGESNUM_LAST_STAGESNUM_NON_EPILOGUE_THREADSNUM_EPILOGUE_THREADSNUM_MULTICASTIS_MULTICAST_ON_A	GEMM_TYPEWITH_ACCUMULATIONz
      >);
};
)rG   rI   rJ   joinpytypes_to_ctypes)rU  codes     r   generatezSM100FP8GemmRuntime.generateA  s   l#'DDDDD% % % % % % 
		% % % % 
		% % % %  vo666A!% % % %" vo666A#% % % %$ vo666A%% % % %& 
		'% % % %( 
		)% % % %* 
		+% % % %, 
	-% % % %. 
 	!/% % % %0 
 	!1% % % %2 
!	"3% % % %4 
	5% % % %6 
!	"7% % % %8 
*	+9% % % %: 
&	';% % % %< 
	 =% % % %> 
6"56	7?% % % %@ 
	A% % % %B 
6"56	7C% % % %D 
6,/	0E% % % % % %L r   rL  c                    t          t          j        t          j        j        |d         | t          j        |d                                        t          j                    }|d         |j        _        d|j        _	        d|j        _
        t          j                    }t          j        j        |_        ||_        t          j                    }d|_        |g|_        |d         |_        d|_        d|_        |d         |d         z   |_        d|_        d|_        |d         |_        |d         |_        |d	                                         |d
         |d         |d         |d         |d         |d         |d         |d         |d         f
}t8          j        t8          j        t8          j        t8          j        d d d d d d f
}t          j        || ||fd          S )N	SMEM_SIZEDEVICE_INDEXrr  r   NUM_SMSrp  rq  STREAMGROUPED_LAYOUTrd  re  rf  TENSOR_MAP_ATENSOR_MAP_BTENSOR_MAP_SFATENSOR_MAP_SFBTENSOR_MAP_CTENSOR_MAP_Dr   ) r	   r  cuKernelSetAttributeCUfunction_attribute/CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTESCUdeviceCUlaunchAttributeValue
clusterDimrQ   yzCUlaunchAttributeCUlaunchAttributeID%CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSIONidr   CUlaunchConfignumAttrsattrsgridDimXgridDimYgridDimZ	blockDimX	blockDimY	blockDimZsharedMemByteshStreamr  ctypesc_void_pc_uint32cuLaunchKernelEx)rL  rU  attr_valattrconfig
arg_values	arg_typess          r   rT  zSM100FP8GemmRuntime.launchm  s   $(X{#VN344	 	
 	
 	
 -// & 7 ! !$&&)O
#%%v +-.8N1OO 	  &{ 3) #$--//3K3K3K>">"#$#$>">"

 OOOO
	 #FFZ4KQOOOr   )r   N)r   r   r   r"   r   r  CUresultrV  r^  staticmethodr   r   ry  CUkernelrT  r#   r   r   rG  rG    s        1S 1# 1$ 1 1 1 10CL 0 0 0 0W W W W (c3h (C ( ( ( \(V 8Ps| 8PT#s(^ 8P 8P 8P 8P \8P 8P 8Pr   rG  c                  <   t           D ]} | t          v rt           |          \  }}| dz   } t          t          j        dz   | z   |           t
          t          j        z  | z  }|                                sJ t          t          |          |          t          | <   d S )N.cubin/)	
KERNEL_MAPRUNTIME_CACHEr
   r   DEEPGEMMr   existsrG  r"   )
cubin_namerI  sha256rH  s       r   load_allr    s      K K
&&#J/(*
,'#-
:FCCC#l&;;jH{{}}}$7D		6$J$Jj!!K Kr   namerx  c                    |  d| }d|  dt          |           }|t          vrt          d|           |t          v rt          |         S t          |         \  }}|dz   }t	          t
          j        dz   |z   |           t          t
          j        z  |z  }|                                sJ t          t          |          |          t          |<   t          |         S )Nz$$zkernel..zcubin not registered: r  r  )r   r  
ValueErrorr  r
   r   r  r   r  rG  r"   )r  rx  	signaturer  rI  r  rH  s          r   loadr    s    !!4!!I:4::+i"8"8::J##>*>>???]""Z((
+NFFh&Jl#c)J6???,"77*DD;;=== 3CIIv F FM*$$r   rk   compiled_dimsoutput_dtypec
                    t           j                            d          j        }
t	          t
          j        | ||||||t           j        |	|
          \  }
}}}}}}i d|d| d|d|dt
          j        dd	d
d	d|d|d|d|d|d|d|dt          ||          |z  d|j	        d|j
        |j        |j        |j        d|	d}|
||||||f|fS )Nr   r   rc  rd  re  rf  rt  rp  rO   rq  ra  rb  rj  rg  rh  ri  rn  ro  rk  rl  Frm  rr  rs  ru  r`  )rG   r   get_device_propertiesmulti_processor_countr  r   r    rH   r   r   r   r   r   r   )r   r   rg   rk   rr   r   r   r   r  r  r   r   r   r   r   r   smem_configrU  s                     r   2m_grouped_fp8_gemm_nt_contiguous_static_kwargs_genr    s    j..f.==SG&	
 	
 RGWgw
4Dk 	Q 	Q	
 	Y 	X/ 	#C 	 	7 	7 	j 	7 	7 	7  	j!" 	8Aw//*<#$ 	+4%& 	+4'( '6)7-?""1  F6 	  r   r   sfari   sfbd	m_indicesc	                    | j         \  }	}
|j         \  }}}t          j        }t          |
d          }t	          |	||
|||||||j        
  
        \  \  }}}}}}}}t          || |	|
|                    |          ||                     |	                                          d|j
        	  	        }t          ||||
|                    |          ||                    |	                                          ||j        	  	        }t          |||	||||                    |	                                          d|j        	  	        }t!          t"          j        ||	|
||d|j                  }t!          t"          j        |||
||||j                  }i ||||j        ||||||t*          j                                        j        |j        j        d}||fS )NrO   r   )rr   r/  r  r}  r{  r  r  r  r  r  r  r~  r|  )r_   r2   r4   r   r  rD   r9  r   r;   r,   r   r=  r   r   r?  r   rC  r%   r0   r   r   rG   r   current_streamcuda_streamrZ   index)r   r  ri   r  r  r  r   r   r  r   rg   rr   r   _r   rk   r   r   r   r   r   r   r  static_kwargstensor_map_atensor_map_btensor_map_dtensor_map_sfatensor_map_sfb
all_kwargss                                 r   +m_grouped_fp8_gemm_nt_contiguous_kwargs_genr    sU    7DAqwJ1 G C  I 	;					 		
 #			,,W55	++--.. /
 
 
L #			,,W55	++--.. /
 
 
L $				++--.. 0
 
 
L &		 0	 	 	N &		 0	 	 	N
 $ *$$(($$*++--9  J *$$r   c	                     t          | ||||||||	  	        \  }	}
t                              |	          }t          d|          } |di |
 d S Nfp8_m_grouped_gemmr#   )r  rG  ry  r  )r   r  ri   r  r  r  r   r   r  r  r  rx  runtimes                r   &m_grouped_fp8_gemm_nt_contiguous_sm10xr  v  sm     !L	339gw! !M: ''66D'..GGjr   
expected_mc                    t           j                            d          j        }t	          t
          j        |||||||t           j        |
|          \  }}}}}}}|dk    r| |z  dk    sJ i d|	d| d|d|d	t
          j        d
dddd|d|d|d|d|d|d|dt          ||          |z  d|j	        d|j
        |j        |j        |j        d|
d}|||||||f|fS )Nr   r   r   r   rc  rd  re  rf  rt  rp  rO   rq  ra  rb  rj  rg  rh  ri  rn  ro  rk  rl  Fr  )rG   r   r  r  r  r   r!   rH   r   r   r   r   r   r   )r   r   rg   r  rk   rr   r   r   r   r  r  r   r   r   r   r   r   r  rU  s                      r   .m_grouped_fp8_gemm_nt_masked_static_kwargs_genr    s    j..f.==SG"	
 	
 RGWgw
4Dk A~~7{a 	Q 	Q	
 	Y 	X+ 	#C 	 	7 	7 	j 	7 	7 	7  	j!" 	8Aw//*<#$ 	+4%& 	+4'( '6)7-?""1  F6 	  r   masked_mc
                    | j         \  }
}}|j         \  }}}t          j        }t          |d          }t	          ||||||
||||	|j                  \  \  }}}}}}}}t          || |||                    |          ||                     |	                                          |
|j
        	  	        }t          |||||                    |          ||                    |	                                          |
|j        	  	        }t          |||||||                    |	                                          |
|j        	  	        }t!          t"          j        ||||||
|j                  }t!          t"          j        ||||||
|j                  }i ||||j        ||||||t*          j                                        j        |j        j        d}||fS )NrO   r  )r_   r2   r4   r   r  rD   r9  r   r;   r,   r   r=  r   r   r?  r   rC  r%   r0   r   r   rG   r   r  r  rZ   r  )r   r  ri   r  r  r  r  r   r   r  rr   r   rg   r  r   r   rk   r   r   r   r   r   r   r  r  r  r  r  r  r  r  s                                  r   'm_grouped_fp8_gemm_nt_masked_kwargs_genr    sA    wJ1gGAq! G C  I 	7					 		
 #			,,W55	++--.."
 
L #			,,W55	++--.."
 
L $				++--..#
 
L &		#	 	N &		#	 	N
 # *$$(($$*++--9  J *$$r   c
                     t          | |||||||||	
  
        \  }
}t                              |
          }t          d|          } |di | d S r  )r  rG  ry  r  )r   r  ri   r  r  r  r  r   r   r  r  r  rx  r  s                 r   "m_grouped_fp8_gemm_nt_masked_sm10xr  A  so     !H	338Z'=! !M: ''66D'..GGjr   d   g   nka_fp8b_fp8c                 8   t          | d                   }t          |d                   }|t          j        k    rt          d|           t	                      r"|t          j        k    rt          d|           |                                s$t          d|                                           | \  }}	|\  }
}|j        \  }}|
j        \  }}}|j        \  }}|                                }||k    s||k    s||k    s||k    r$t          d| d| d| d| d	| d
| d|           |j        t          j
        k    rt          d|j                   |
j        t          j
        k    rt          d|
j                   |j        t          j        k    rt          d|j                   |j        t          j        k    rt          d|j                   t          |          t          j        k    rt          dt          |                     dS )Nr    major_a must be KMajor, but got  major_b must be KMajor, but got z&m_indices must be contiguous, but got zShape mismatch. m = , m_ = , k = , k_ = , n = , n_ = z, m__ = !a must be float8_e4m3fn, but got !b must be float8_e4m3fn, but got d must be bfloat16, but got z!m_indices must be int32, but got d must be N-major, but got T)rA   r%   r/   r  r   is_contiguousr_   numelrD   rG   rH   rI   int32rC   r2   r4   )r  r  r  r  rx   r  r   r   r   r  ri   r  r   rg   rr   r   k_m_n_m__s                       r   4_check_group_deepgemm_fp8_nt_contiguous_problem_sizer  V  sU     a))Ga))G+$$$EGEEFFF Gg);;;EGEEFFF""$$ 
PY5L5L5N5NPP
 
 	
 FAsFAs7DAqJ2WFB
//

C 	Bww!r''Q"WWr		f1ffRffqffff1ffUWffadff
 
 	
 	w%%%%FQWFFGGGw%%%%FQWFFGGGw%.  AAABBB%+%%NY_NNOOO {111M7H7K7KMMNNN4r   )common_checkc                    |                                 }t          | d                   }t          |d                   }| \  }}	|\  }
}|j        \  }}|
j        \  }}}|dk    rd S |t          |	j        |j                  n|}t          |	|||d          }	t          |||||d          }t          j        t          |||          t          j        t          |||          dt                               } |||	|
|||           d S )Nr   T)rf   rg   rx   ry   Frf   rg   rx   rr   ry   r   r   r  r{   )
lowerrA   r_   r   rD   r   	functoolspartialr  r   )r  r  r  r  rx   r  r   r   r   r  ri   r  r   rg   rr   r   r  impls                     r    m_grouped_fp8_gemm_nt_contiguousr    sP    "''))Ma))Ga))GFAsFAs7DAqJ2 	Avv :@	39555VF
+CA6RV
W
W
WC
+Qv*U  C
 !2'	
 
 
 !2'	
 
 
  D 	DCCI&&&&&r   c                    t          | d                   }t          |d                   }|t          j        k    rt          d|           |t          j        k    rt          d|           |                                s$t          d|                                           | \  }	}
|\  }}|	j        \  }}}|j        \  }}}|j        \  }}}|                                }||k    s||k    s||k    rt          d| d| d| d|           ||k    s||k    s||k    r!t          d	| d
| d| d| d| d|           |dk    s|dk    s|dk    s|dk    s|dk    rt          d| d| d| d| d| 
          |	j        t          j	        k    rt          d|	j                   |j        t          j	        k    rt          d|j                   |j        t          j
        k    rt          d|j                   |j        t          j        k    rt          d|j                   t          |          t          j        k    rt          dt          |                     dS )Nr   r  r  z%masked_m must be contiguous, but got z"num_groups mismatch. num_groups = z, num_groups_ = z, num_groups__ = z, num_groups___ = zm, n, k mismatch. m = r  r  r  r  r  zMexpected_m, m, n, k, num_groups must be greater than 0, but got expected_m = z, m = z, num_groups = r  r  r  z masked_m must be int32, but got r  T)rA   r%   r/   r  r  r_   r  rD   rG   rH   rI   r  rC   r2   r4   )r  r  r  r  r  rx   r  r   r   r   r  ri   r  rr   r   rg   num_groups_r   r  num_groups__r  r  num_groups___s                          r   0_check_m_grouped_fp8_gemm_nt_masked_problem_sizer    sK     a))Ga))G+$$$EGEEFFF+$$$EGEEFFF!!## 
NH4J4J4L4LNN
 
 	
 FAsFAswJ1KB7L"bNN$$M 	k!!%%&& [  [  [[  [  [kw  [  [  LY  [  [
 
 	
 	Bww!r''Q"WW[Q[[r[[[[2[[Q[[WY[[
 
 	
 Q!q&&AFFa1ff
a _\f  _  _no  _  _wx  _  _  AB  _  _  S]  _  _
 
 	
 	w%%%%FQWFFGGGw%%%%FQWFFGGGw%.  AAABBB~$$LHNLLMMM {111M7H7K7KMMNNN4r   c           	      p   |                                 }t          | d                   }t          |d                   }||cxk    rt          j        k    sn J |                                sJ | \  }	}
|\  }}|	j        \  }}}|j        \  }}}|t          |
j        |j                  n|}t          |
||||d          }
t          |||||d          }t          j
        t          |||          t          j
        t          |||          dt                               } ||	|
|||||           d S )Nr   Tr  Fr  r{   )r  rA   r%   r/   r  r_   r   rD   r   r  r  r  r   )r  r  r  r  r  rx   r  r   r   r   r  ri   r  rr   r   rg   r  r   r  r   s                       r   m_grouped_fp8_gemm_nt_maskedr    s    "''))M  a))Ga))Gg3333!3333333!!#####FAsFAswJ1KB :@	39555VF
+Qv*T  C ,Qv*U  C
 !.'	
 
 
 !.'	
 
 
  D 	DCCHj11111r   c                   *    e Zd ZdZd Zd Zd Zd ZdS )	KernelMap@f161e031826adb8c4f0d31ddbd2ed77e4909e4e43cdfc9728918162a62fcccfbc                     d | _         d S r   )indicer   s    r   r   zKernelMap.__init__2  s    r   c                 ,   t           j        dz   dz   }t          || j                  s
J d            t          |z  }|                                sJ t          |d          5 }t          j        |          | _	        d d d            d S # 1 swxY w Y   d S )Nr  zkernel_map.jsonzCcubin kernel map file not found, nor downloaded with matched sha256r)
r   r  r
   KERNEL_MAP_HASHr   r  openjsonr  r  )r   indice_pathrH  fs       r   init_indiceszKernelMap.init_indices5  s    "+c14EEd&:;; 	
 	
Q	
 	
; $k1{{}}}$__ 	')A,,DK	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	's   "B		BBc              #   \   K   | j         |                                  | j         D ]}|V  d S r   r  r  )r   r  s     r   __iter__zKernelMap.__iter__?  sF      ;K 	 	DJJJJ	 	r   c                 R    | j         |                                  | j         |         S r   r  )r   keys     r   __getitem__zKernelMap.__getitem__E  s)    ;{3r   N)r   r   r   r  r   r  r  r  r#   r   r   r
  r
  .  sU        XO  ' ' '           r   r
  )FN)NF)F)Nr  )z__doc__r  enumr  r   r  typingr   r   r   r   cuda.bindings.driverbindingsdriverr  ImportErrorr]  rG   	artifactsr   
cuda_utilsr	   jit.cubin_loaderr
   jit.envr   utilsr   r   r   r   Enumr   r%   r2   Tensorr>   rA   rC   rD   rL   rP   r\   rW   ro   r   rw   r   	lru_cacher   r"   r   r   r   r   r   r   r   r   rJ   r   r   r  int8CUtensorMapDataTypeCU_TENSOR_MAP_DATA_TYPE_UINT8int16CU_TENSOR_MAP_DATA_TYPE_UINT16r  CU_TENSOR_MAP_DATA_TYPE_INT32int64CU_TENSOR_MAP_DATA_TYPE_INT64r^   uint16uint32CU_TENSOR_MAP_DATA_TYPE_UINT32uint64CU_TENSOR_MAP_DATA_TYPE_UINT64float32CU_TENSOR_MAP_DATA_TYPE_FLOAT32float16CU_TENSOR_MAP_DATA_TYPE_FLOAT16rI    CU_TENSOR_MAP_DATA_TYPE_BFLOAT16rH   float8_e4m3fnuzfloat8_e5m2float8_e5m2fnuzr  __annotations__CUtensorMapSwizzleCU_TENSOR_MAP_SWIZZLE_NONECU_TENSOR_MAP_SWIZZLE_32BCU_TENSOR_MAP_SWIZZLE_64BCU_TENSOR_MAP_SWIZZLE_128Br2  r1  r  CUtensorMapr)  r3  r9  r=  r?  rC  rw  r  rG  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r#   r   r   <module>rG     s    4         - - - - - - - - - - - -&&&&&&&&&&   
+	E    # # # # # # ' ' ' ' ' ' ' ' ' ' ' ' ) ) ) ) ) )           
 
 
 
 
ty 
 
 
S S S S S$) S S S- - - - -$) - - -25< 2 2 2 2L L L L L
K K K K K
EK      "C "s "s " " " "=u| = = = = =H #(,   S/	
   % \   H !%Q QQQ Q #sC- 	Q
 Q Q Q Q Qh T"""+ + #"+!3 !3 ! ! ! ! T"""    #" T"""
&{
&',{
&
3S=
& 
& 
& #"
&P P P P P P P P ) ) ) ) ) ) ) )6 $
6 
6
6
6 
6 	
6
 
6 

6 
6 
6 
6      	 	c 	U[ 	 	 	 	L Lc LU[ L L L L??? ? 	?
 ? ? k? k? ? &? ? ? ? ?D T"""JJ
J J 	J
 J J J J kJ kJ J 3S#sO5GGHJ J J #"J\ 
J'E	K(G	K(F	K(F	K(F	L#)H	L#)H	L#)H	M3*J	M3*J	NC+L	0N	32P	s.L	32P!tCH~   & 8988			:  |S^S() +, S^S()	
 ( 	_   <|  	
    	_   ,|  	
      	_   :|  	
      	_   6|  	
      	_   6|  	
     	_   < 	&	7	N)	K	  GP GP GP GP GP GP GP GPT	K 	K 	K%s %# %"5 % % % % >
>
> > 	>
 > > > > > +> > > >Bm%|m%	m% |m% 
	m%
 |m% |m% m% m% m% m% m% m%`|	 | 
	
 | |    
   (B
B
B B 	B
 B B B B B B +B B B BJo%|o%	o% |o% 
	o%
 |o% lo% o% o% o% o% o% o% o%d|	 | 
	
 | l     
   * sCj)) .2. .u|+,.u|+,. |. |	.
 U3S=)*. . 
. . . *).b E   .2,' ,'u|+,,'u|+,,' |,' |	,'
 U3S=)*,' ,' 
,' ,' ,'	 ,'^ sCj)) .2: :u|+,:u|+,: |: l	:
 : U3S=)*: : 
: : : *):z A   .2.2 .2u|+,.2u|+,.2 |.2 l	.2
 .2 U3S=)*.2 .2 
.2 .2 .2	 .2b               : Y[[


s   	/ AA  A