
    `i]                        d dl mZmZ ddlmZ ddlZddlmZm	Z	 ddl
mZ ddlZddlmZ ddlmZ ddlmZ dd	lmZmZmZ  G d
 de          ZddlZ ej        e          Zi Zdej        ej        ddddfdej        dej        dej        dej        dedej         dej         dee!e!f         dee!e!f         de!de	ej"                 deej        ej        f         fdZ#dS )   ))Sm100BlockScaledPersistentDenseGemmKernel1Sm100BlockScaledPersistentDenseGemmKernelNoDlpack    )driverN)TupleOptional)version)from_dlpack)_convert_to_cutlass_data_type)APIBaseis_power_of_2ceil_divc                   z    e Zd Zej        dddfdej        dej        dej        dej        dej        d	ej        d
ej        deeef         deeef         def fdZ	de
fdZddeej                 ddfdZ	 	 ddej        dej        dej        dej        dej        dej        deej                 de
ddfdZ xZS )GemmAmaxSm100   r   r   r       sample_asample_b
sample_sfa
sample_sfbsample_csample_amax	acc_dtypemma_tiler_mncluster_shape_mnsf_vec_sizec                 F   t                                                       | j                            d           | j                            d           || _        || _        || _        || _        || _	        | 
                    |dd          | _        || _        || _        |	| _        |
| _        d| _        d| _        d| _        | j                            d|j         d	|j         d
|j         d|j         d|j         d| j        j         d| d| d|	 d|
            d S )Nz$GemmAmaxSm100 is an experimental APIzEntering __init__   r   )r      r!   Tz'__init__ completed with args: sample_a z, sample_b z, sample_sfa z, sample_sfb z, sample_c z, sample_amax z, acc_dtype z, mma_tiler_mn z, cluster_shape_mn z, sf_vec_size )super__init___loggerwarningdebugr   r   r   r   r   _pad_tensor_to_ndimr   r   r   r   r   atom_matom_k_interpret_uint8_as_fp4x2shape)selfr   r   r   r   r   r   r   r   r   r   	__class__s              g/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/cudnn/gemm_amax/api.pyr#   zGemmAmaxSm100.__init__   s    	CDDD.///  $$ 33KMRR"( 0& )-& Uhn  U  UQYQ_  U  Unxn~  U  U  NX  N^  U  U  ks  ky  U  U  IM  IY  I_  U  U  mv  U  U  GS  U  U  hx  U  U  HS  U  U	
 	
 	
 	
 	
    returnc           
         | j                             d           | j                             d           |                     | j        t          j        t          j        t          j        t          j        gd          }|                     | j	        |dd           |t          j        k    r| j         
                    d           |                     | j        d	vd
| j         d           |                     | j        t          j        t          j        t          j        gd          }|                     | j        |dd           |t          j        k    r| j         
                    d           |                     |t          j        k    o
| j        dk    d           |                     |t          j        t          j        hv o
| j        dk    d           |                     | j        t          j        t          j        t          j        t          j        t          j        t          j        t          j        gd          }|                     |                     |          o|                     |           d|            |                     |                     |          o|                     |          d           |                     | j        t          j        dd           || _        || _        | j                             d           |                     | j        d          \  }}}|                     | j	        d          \  }}}|                     | j        d          \  }}}| j        j        \  }}}	}}
}| j        j        \  }}}}}
}|                     | j        |||fd           |                     | j	        |||fd           |                     | j        |||fd           |                     | j        | j        d         | j        d         |	| j        |
|fd           |                     | j        | j        d         | j        d         || j        |
|fd           |                     | j         d d!           tC          || j        d         | j        d         z            }tC          || j        d         | j        d         z            }|                     |	|k    d"| d#|	            |                     ||k    d$| d#|            | "                    | j        d|||z  f|d||z  fgd%          \  }| _#        | "                    | j	        d|||z  f|d||z  fgd%          \  }| _$        | "                    | j        d|||z  f|d||z  fgd%          \  }| _%        | j#        d&k    rd'nd(| _&        | j$        d&k    rd)nd(| _'        | j%        d&k    rd'nd)| _(        |                     |                     |          o| j&        d(k    o
| j'        d(k     d*| j&         d+| j'                    |                     |                     |          o
| j(        d'k    d,| j(                    | j                             d-           |                     | j)        d         d.vd/| j)        d                     |                     | j)        d         d.vd0| j)        d                     |                     | j)        d         d1k    d2           |                     |                     | j                  o| j)        d         d1k    o|d3k    d4|            |                     | j*        d         | j)        d         d1k    rd5ndz  dk     d6           |                     | j)        d.k    o/| j        dk    o$|t          j        t          j        t          j        hv d7           |                     | j*        d         d8k    of| j*        d         d8k    oU| j*        d         dk    oD| j*        d         dk    o3tW          | j*        d                   otW          | j*        d                    d9| j*                    | j                             d:           d; }|                      ||| j&        d'k    |||f          o1 ||| j'        d)k    |||f          o ||| j(        d'k    |||f           d<           | j                             d=           | ,                    t          j-        .                                 d>           t          j-        /                                }t          j-        0                    |          \  }}|d?z  |z   }| ,                    |d@k     dA| dB|            | ,                    |dCk    dD           |                     | j                  }|                     | j                  }|                     | j                  }tc          j2        t          j3                  }tc          j2        |j4                  tc          j2        dE          k    }|p|p|o| }|r'| j                             dF           tj          | _6        ntn          | _6        dG| _8        | j                             dH           dGS )INzEntering check_supportzChecking dtypes and sf_vec_sizeA)dtypenameBz A and B tensor dtypes must match)r3   r4   extra_error_msgzEUint8 ab_dtype will be interpreted as packed fp4, not as native uint8>      r   z"Unsupported sf_vec_size: received z, expected {16, 32}sfasfbz$sfa and sfb tensor dtypes must matchzGInt8 sf_dtype will be interpreted as float8_e8m0fnu, not as native int8r   zWUnsupported sf_dtype and sf_vec_size combination: float8_e4m3fn and 32 is not supportedr7   zfUnsupported ab_dtype and sf_vec_size combination: {float8_e5m2, float8_e4m3fn} and 16 is not supportedCzUUnsupported c_dtype and ab_dtype combination: fp4 c_dtype requires fp4 ab_dtype, got z\Unsupported c_dtype and ab_dtype combination: fp8 ab_dtype and fp8 c_dtype (fails to launch)AccumulatorzAccumulator must be float32zChecking tensor layoutr   )r4   r   r   r   r   r   r   r   amaxzHInput/Output shape mismatch: expected m_div_atom_m0_m1 (sfa.shape[2]) = z, got zHInput/Output shape mismatch: expected n_div_atom_m0_m1 (sfb.shape[2]) = )strider4   )r   r      mknzeUnsupported A or B tensor stride: Float4 tensors require k-major layout for hardware efficiency, got z and z`Unsupported C tensor stride: Float4 tensors require n-major layout for hardware efficiency, got z$Checking mma tiler and cluster shape)r      z6Unsupported mma_tiler_mn[0]: expected {128, 256}, got z6Unsupported mma_tiler_mn[1]: expected {128, 256}, got rC   z&mma_tiler_mn[0] == 256 currently hangsr   z8mma_tiler_mn (X, 256) requires k > 128 (packed x2), got r?   zIllegal cluster shapezomma_tiler_mn (128, 256), sf_vec_size 16, c_dtype {torch.float32, torch.float16, torch.bfloat16} fails to launchr!   zJInvalid cluster shape: expected cluster_shape_mn values in {1, 2, 4}, got zChecking tensor alignmentc                 ^    |rdnd}||         }dt          |           j        z  }||z  dk    S )Nr   r   r   )r   width)r3   is_mode0_majortensor_shapemajor_mode_idxnum_major_elementsnum_contiguous_elementss         r.   check_contigous_16B_alignmentzBGemmAmaxSm100.check_support.<locals>.check_contigous_16B_alignment   sC    "07QQaN!-n!=&,1Nu1U1U1[&\#%(??1DDr/   z9Unsupported tensor alignment: tensors must be 16B alignedzChecking environmentzCUDA is not available
   d   z9GemmAmax requires SM100+ compute capability, but found SMz on device g   z*cuteDSL GemmAmax is not supported on SM103z2.10.0z\Running no_dlpack kernel wrapper due to fp4 dtype or fp8 dtype on incompatible torch versionTz$check_support completed successfully)9r$   r&   _check_dtyper   torchfloat4_e2m1fn_x2uint8float8_e5m2float8_e4m3fnr   r%   _value_error_ifr   r   float8_e8m0fnuint8r   r   float32float16bfloat16	_is_fp4x2_not_implemented_error_if_is_fp8r   ab_dtypec_dtype_tensor_shaper+   _check_tensor_shaper(   r)   r   r   _check_tensor_stridea_stride_orderb_stride_orderc_stride_ordera_majorb_majorc_majorr   r   r   _runtime_error_ifcudais_availablecurrent_deviceget_device_capabilityr	   parse__version__base_versionr   _kernelr   _is_supported)r,   r^   sf_dtyper_   r@   rA   lrB   _m_div_atom_m0_m1sf_k_div_atom_kn_div_atom_m0_m1expected_m_div_atomexpected_n_div_atoma_strideb_stridec_striderK   devicemajorminorcompute_capability	is_ab_fp4is_c_fp4	is_ab_fp8torch_version_fp8_dlpack_supporteduse_no_dlpack_kernels                               r.   check_supportzGemmAmaxSm100.check_support:   sR   3444<===$$M)5;8I5K^_ % 
 

 	M>	 	 	
 	
 	
 u{""L  !hiiiH,X1AXXX	
 	
 	

 $$O')<ejI % 
 

 	OB	 	 	
 	
 	
 uz!!L  !jkkk++F0@B0Fe	
 	
 	
 	*E,?@@[TEUY[E[w	
 	
 	

 ##M=%-ARTYTgini  BG  BM  N $ 
 

 	NN7##DDNN8,D,D(Dndlnn	
 	
 	
 	&&LL!!<dll8&<&<j	
 	
 	
 	N-9	 	 	
 	
 	
 !3444$$T]$DD1a$$T]$DD1a$$T]$DD1a8<8M51?A8<8M51?A  Aq	3???  Aq	3???  Aq	3???  O[^T[^-=t{O]^_	
 	
 	

 	  O[^T[^-=t{O]^_	
 	
 	

 	  !19fEEE&q$+a.4;q>*IJJ&q$+a.4;q>*IJJ 33 EWj  E  E  sC  E  E	
 	
 	
 	 33 EWj  E  E  sC  E  E	
 	
 	
 )-(A(AM1q5MAq!a%=1 )B )
 )
%$%
 )-(A(AM1q5MAq!a%=1 )B )
 )
%$%
 )-(A(AM1q5MAq!a%=1 )B )
 )
%$% #1Y>>ssC"1Y>>ssC"1Y>>ssCNN8$$Zdlc.A.YdlVYFY)Z Vtx  uA  V  V  HL  HT  V  V	
 	
 	
 	NN7##;(;}oso{}}	
 	
 	

 	ABBBa 
2]tGXYZG[]]	
 	
 	
 	a 
2]tGXYZG[]]	
 	
 	
 	&&a C'4	
 	
 	
 	NN4=))Vd.?.Bc.IVaSVhJqJJ	
 	
 	
 	&q)$2CA2F#2M2MQQSTUYZZ[#	
 	
 	
 	&&+  E0@B0F  E7W\Wdfkfsuz  vD  WE  LE}	
 	
 	
 	%a(A- <)!,1<)!,q0< )!,q0< "$"7":;;	<
 "$"7":;; s[_[prr
	
 
	
 
	
 	6777	E 	E 	E 	--h8KaQRTUYWW [11(DLC<ORSUVXYQZ[[[11'4<3;NQRTUWXPYZZ
 H	
 	
 	
 	12225:#:#:#<#<<>UVVV**,,z77??u"RZ%/$oHZoogmoo	
 	
 	
 	#%8	
 	
 	

 NN4=11	>>$,//LL//	e&788 'm.H I IW][cMdMd d(aHa9`K`G` 	EL}~~~LDLLDDL!ABBBtr/   Ncurrent_streamc                 8   | j                             d           |                     |          }|                                  |                     | j        | j        | j                  }t          j	        
                                }|                    | j        d         | j        d         z            }| j        t          u r| j                             d           t          j        |t          | j        d          t          | j        d          t          | j        d          t          | j        d          t          | j        d          t          | j        d          ||	  	        | _        n| j        t.          u r_| j                             d	           |                     | j                  }|                     | j                  }|                     | j        |rd
ndd          \  }}}	|                     | j        |rd
ndd          \  }
}}|                     | j        |rd
ndd          \  }}}|                     | j        dd          \  }}}|                     | j        dd          \  }}}t          j        |fi d|d|d|	d|
d|d|d|d|d|d|d|d|d|d|d|d t          | j        d          d!|d"|| _        nt9          d#| j                   | j                             d$           d S )%NzEntering compiler   r   r   r   r   zCompiling gemm_amaxr7   assumed_aligna_tensorb_tensor
sfa_tensor
sfb_tensorc_tensoramax_tensormax_active_clustersstreamzCompiling gemm_amax (no dlpack)r   r2   r   r4   r5   r:   r8   r9   a_ptra_shapea_orderb_ptrb_shapeb_ordersfa_ptr	sfa_shape	sfa_ordersfb_ptr	sfb_shape	sfb_orderc_ptrc_shapec_order	amax_cuter   r   !Unreachable: invalid kernel type zKernel compiled successfully)r$   r&   _get_default_stream_ensure_support_checkedrq   r   r   r   cutlassutilsHardwareInfoget_max_active_clustersr   cutecompiler
   r   r   r   r   r   r   _compiled_kernelr   r[   r^   r_   _make_cute_tensor_descriptorNotImplementedError)r,   r   	gemm_amaxhardware_infor   r   r   r   r   rc   r   r   rd   r   r   re   r   r   sfa_stride_orderr   r   sfb_stride_orders                         r.   r   zGemmAmaxSm100.compile  s   -...11.AA$$&&&LL(*!2 ! 
 
	
  2244+CCDDYZ[D\_c_tuv_wDwxx<DDDL4555$(L$T]"EEE$T]"EEE&tbIII&tbIII$T]"EEE'(8KKK$7%
% 
% 
%D!! \NNN L@AAAt}55I~~dl33H-1-N-Nt}r{  mDlnln  BD  KN-N  .O  .O*E7N-1-N-Nt}r{  mDlnln  BD  KN-N  .O  .O*E7N-1-N-Nt}rz  mClnln  AC  JM-N  .N  .N*E7N373T3TUYUdtv  ~C3T  4D  4D0GY 0373T3TUYUdtv  ~C3T  4D  4D0GY 0$(L% % %e%  % '	%
 e%  % '%  % $)% +*%  % $)% +*% e%  %  '!%" &d&6bIIII#%$ %8$7%%& &~'%D!!, &&X$,&X&XYYY9:::::r/   Fr   r   r   r   r   r   skip_compilec	                    | j                             d           |                     |          }|                     |dd          }|                     | j                  }	|                     | j                  }
|s|                     | j        d u d           | j                             d           | j	        t          u rw|                     t          |d          t          |d          t          |d          t          |d          t          |d          t          |d          |           n| j	        t          u r|                     ||	rd	nd          }|                     ||	rd	nd          }|                     ||
rd	nd          }|                     |d          }|                     |d          }|                     |||||t          |d          |
           nt          d| j	                   | j                             d           n| j                             d           | 	                    | j        | j        | j                  }t$          j                                        }|                    | j        d         | j        d         z            }| j	        t          u ro |t          |d          t          |d          t          |d          t          |d          t          |d          t          |d          ||           n	| j	        t          u r|                     ||	rd	ndd          \  }}}|                     ||	rd	ndd          \  }}}|                     ||
rd	ndd          \  }}}|                     |dd          \  }}}|                     |dd          \  }}} |d+i d|d|d|d|d|d|d|d|d |d!|d"|d#|d$|d%|d&|d't          |d          d(|d)| nt          d| j	                   | j                             d*           d S ),NzEntering executer    r   zYGemmAmaxSm100 kernel not compiled; call compile() first or use execute(skip_compile=True)zExecuting with compiled kernelr7   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   z*Executed with compiled kernel successfullyz'Executing without compiled kernel (JIT)r   r   r   r   r2   r   r5   r:   r8   r9   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   zExecuted successfully )r$   r&   r   r'   r[   r^   r_   ri   r   rq   r   r
   r   _make_cute_pointerr   r   r   r   r   r   r   r   r   )r,   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rc   r   rd   r   re   r   r   r   r   s                                r.   executezGemmAmaxSm100.executeW  s    	-...11.AA..{A}MMNN4=11	>>$,// U	^""%-k   L?@@@|HHH%%(DDD(DDD*:RHHH*:RHHH(DDD +Kr J J J) &     !RRR//iH_]_/``//iH_]_/``//hH^\^/__11*B1OO11*B1OO%%##)+RHHH) &     **\dl*\*\]]]LKLLLLLHIII ,!.!%!6 %  I
 $M6688M"/"G"GH]^_H`cgcxyzc{H{"|"||HHH	(DDD(DDD*:RHHH*:RHHH(DDD +Kr J J J(;)	 	 	 	 	 !RRR151R1RS[qz  lCkmkm  AC  JM1R  2N  2N.w151R1RS[qz  lCkmkm  AC  JM1R  2N  2N.w151R1RS[qy  lBkmkm  @B  IL1R  2M  2M.w7;7X7XYcsu  }B7X  8C  8C4$47;7X7XYcsu  }B7X  8C  8C4$4	   %#G +N  %	
 $G +N $G (i /. $G (i /.  % $G +N  *+RHHHH!" )<(;#$ *>%  * **\dl*\*\]]]233333r/   )N)NF)__name__
__module____qualname__rP   rX   Tensorr3   r   intr#   boolr   r   rj   CUstreamr   r   __classcell__)r-   s   @r.   r   r      s        "'(2,2$
 $
,$
 ,$
 L	$

 L$
 ,$
 \$
 ;$
 CHo$
  S/$
 $
 $
 $
 $
 $
 $
L[t [ [ [ [z>; >;ht}&= >; >; >; >; >;P 37"i4 i4,i4 ,i4 L	i4
 Li4 ,i4 \i4 !/i4 i4 
i4 i4 i4 i4 i4 i4 i4 i4r/   r   rB   r   r   r   r   r   r   r   rh   r_   r   r   r   r   r   r0   c                 \   t                               d           | j        \  }}}|j        \  }}}d }|dk    r't          j        |||fd|||z  f|| j                  }n?|dk    r't          j        |||f|d||z  f|| j                  }nt          d|           t          j        dt          d           | j        t          j	        	          }| j        |j        |j        |j        | j
        |j
        |j
        |j
        |                                 |                                |                                |                                ||||||	f}|t          v rDt                               d
           t          |         }|                    | ||||||
           nt                               d           t          | |||||||||	
  
        }|                                sJ |                    |
           |                    | ||||||
           |t          |<   ||fS )NzAgemm_amax_wrapper_sm100: Creating empty output tensors c and amaxr@   r   )r3   r~   rB   z'c_major must be either 'm' or 'n', got r<   inf)r~   r3   zEgemm_amax_wrapper_sm100: Using previously cached GemmAmaxSm100 object)r   r   r   r   r   r   r   zkgemm_amax_wrapper_sm100: No previously cached GemmAmaxSm100 object found, creating new GemmAmaxSm100 object)
r   r   r   r   r   r   r   r   r   r   )r   )r$   r&   r+   rP   empty_stridedr~   
ValueErrorfullfloatrX   r3   r>   _cache_of_GemmAmaxSm100Objectsr   r   r   r   )r   r   r   r   rh   r_   r   r   r   r   r   r@   ru   rt   rB   r   r   	cache_keyr   s                      r.   gemm_amax_wrapper_sm100r     s    MMUVVVnGAq!nGAq!H#~~&1ay1aQ-wW_Wfggg	C&1ay1aQ-wW_WfgggL7LLMMM*YuhoUZUbcccK 	%I( 222]^^^29=	!!#! 	 	
 	
 	
 	
 	  D  	E  	E  	E!!!#%-#
 
 
	 &&(((((000!!#! 	 	
 	
 	
 5>&y1[  r/   )$&dense_blockscaled_gemm_persistent_amaxr   r   cuda.bindingsr   rj   rP   typingr   r   	packagingr	   r   cutlass.cuter   cutlass.cute.runtimer
   cudnn.datatypesr   cudnn.api_baser   r   r   r   logging	getLoggerr   r$   r   rX   r   strr3   r   r   r   r   r/   r.   <module>r      s         
 ) ( ( ( ( (  " " " " " " " "              , , , , , , 9 9 9 9 9 9 ; ; ; ; ; ; ; ; ; ;m4 m4 m4 m4 m4G m4 m4 m4` 
'
H
%
%!#   ="]$.(.&*V! V!lV!lV! V! 	V!
 V! [V! {V! S/V! CHoV! V! T]#V! 5<%&V! V! V! V! V! V!r/   