
    )`iE                     |   d Z ddlZddlZ	 ddlZ eed          redk    r ed          ddlT ddlT ddl	T n# e$ r ddl
T ddlT ddlT Y nw xY w	 ddlmZmZmZ n# e$ r ddlmZmZmZ Y nw xY we                    e          Zd	 e                                D             Zd
 ZddZddZd Zd Zd Zd ZdS )zN
Utilities for selecting CUTLASS library kernels based on problem description
    NCUTLASS_IGNORE_PACKAGETz+Disabling attempt to import cutlass_library)*   )get_valid_schedules)generate_data_types_from_math_instructionfix_alignmentsc                     i | ]\  }}||	S  r
   .0kvs      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/data/cutlass/python/cutlass_library/heuristics.py
<dictcomp>r   B   s    444daQ444    c                 f   |                                  }|D ]}|                                D ]P\  }}t          |t                    rt          |         ||<   +t          |t
                    rt          |         ||<   Q|d         }|D ]g}|                                D ]P\  }}t          |t                    rt          |         ||<   +t          |t
                    rt          |         ||<   Qht          |d          5 }t          j	        ||d           ddd           dS # 1 swxY w Y   dS )a  
  Utilitiy function to write heuristics results to a json file for debug

  args:
    problems_with_configs: List of problems provided to the heuristic, with a list of operations added to each problem dict
    outfile_path: Outfile path
      
  returns:
    None
  configsw   )indentN)
copyitems
isinstanceDataTypeDataTypeNames
LayoutTypeShortLayoutTypeNamesopenjsondump)	problems_with_configsoutfile_pathpc_copypr   r   r   cfs	            r   $serialize_heuristics_results_to_jsonr'   D   s    "&&((' ) )a		 ' '1	Ax	 	  'Q!a$$ '#A&!	lG ) )'')) ) )$!Qa"" 	)q!!A$$:&& 	)%a(!A$	)) L# $!Igq####$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $s   D&&D*-D*Fc                 b    |t                      }|                    | |||||||||	|
          S )a  
  Get heuristic-suggested GEMM kernel configurations for a single GEMM problem.

  args:
    m, n, k: GEMM dimensions
    batch_count: batch count
    layouts: tuple of layouts of type LayoutType
    use_fast_acc: Use fast accumulation for FP8. Ignored for other precisions
    count: Number of configs to return
    provider: Heuristics provider to use

  returns:
    A list of dictionaries containing the suggested kernel configurations and additional info from the input required to define a Cutlass GemmOperation, with the following keys:
      - 'cta_tile_m', 'cta_tile_m', 'cta_tile_k': CTA tile size
      - 'instr_tile_m', 'instr_tile_n', 'instr_tile_k': Instruction tile size
      - 'stages': kernel pipeline stage count
      - 'cluster_m', 'cluster_n', 'cluster_k': cluster size
      - 'layout_a', 'layout_b': input tensor layouts of type LayoutType
      - 'alignment_a', 'alignment_b': input tensor alignments, in count of elements
      - 'dtype_a', 'dtype_b', 'dtype_acc': dtypes of a, b, and accumulator, of type DataType
      - 'swizzle_size' : suggested threadblock swizzle 
      - 'split_k_slices': number of partitions of the k dimension for splitK
      - 'raster_order': raster order for CTAs over output tiles ('along_m' or 'along_n')
  N)voidCuse_fast_acccount)MatmulHeuristicsget_configs)mnr   batch_countlayoutsdtypesalignment_aalignment_br)   r*   r+   providers               r   get_single_gemm_configr6   `   sU    2 !!H			aA{FG[R]ej  zF  NS	  
T  
T  Tr   c                    g }| D ]}|                                 }	 |d         }|d         }|d         }|d         }|d         }	|d         }
|d         }n3# t          $ r&}t                              d| d	|             d
}~ww xY w|                    dd          }|                    dd          }|                    dd          }|                    dd
          }|                    dd          }|                    dd          }|                    dd          }|t
          t          j                 k    rt          d|           t          |          dk    rt          d |D                       st          d|           t          d |D                       }	 |                                |	                                |                                ||                                n|
                                |
                                g}t          d |D                       }n0# t          $ r#}t                              d|             d
}~ww xY w|                    dd t          |d!                  z            }|                    d"d t          |d                  z            }t          |||||||||dk    |||          }||d#<   |                    |           |S )$a  
  Get heuristic-suggested GEMM kernel configurations for a set of GEMM problems.

  args:
    problems: List of dictionaries describing GEMM problems with the following keys:
      - 'm', 'n', 'k': Matrix dimensions (required)
      - 'dtype_a': Data type of matrix A (required)
      - 'dtype_b': Data type of matrix B (required)
      - 'dtype_c': Data type of matrix C (default: None)
      - 'dtype_d': Data type of matrix D (required)
      - 'dtype_acc': Compute data type (default 'f32')
      - 'layout': Operation layout (e.g. 'tnt')
      - 'alignment_a': Memory access granularity of A, in units of elements (default: 16 bytes equivalent elements)
      - 'alignment_b': Memory access granularity of B, in units of elements (default: 16 bytes equivalent elements)
      - 'alpha': Scalar multiplier for A*B (default: 1.0)
      - 'beta': Scalar multiplier for C (default: 0.0)
      - 'batch_count': Number of GEMM operations in batch (default: 1)
      - 'use_fast_acc': Enable fast accumulation for FP8 on Hopper (default: True)
    provider: Heuristics provider to use
    count: Number of configurations to return per problem (defualt: 1)
      
  returns:
    A copy of the input dictionary, with key `configs` added containing the selected gemm configs
  r.   r/   r   dtype_adtype_bdtype_dlayoutzMissing required parameter z for problem N	operationgemmr0   r   	dtype_accf32dtype_calphag      ?betag        r*   TzUnsupported operation    c              3      K   | ]}|d v V  	dS )ntNr
   )r   r%   s     r   	<genexpr>z#get_gemm_configs.<locals>.<genexpr>   s&      $?$?1Q$Y$?$?$?$?$?$?r   zDlayout must be a 3-character string containing only 'n' or 't', got c              3   R   K   | ]"}|d k    rt           j        nt           j        V  #dS )tN)r   RowMajorColumnMajor)r   ls     r   rF   z#get_gemm_configs.<locals>.<genexpr>   s6      ``TU188J''9O``````r   c              3   0   K   | ]}t           |         V  d S )N)	dtype_map)r   dts     r   rF   z#get_gemm_configs.<locals>.<genexpr>   s&      88rYr]888888r   zUnsupported data type: r3      r   r4   r   )r   KeyError_LOGGERerrorgetOperationKindNamesOperationKindGemm
ValueErrorlenalltuplelowerDataTypeSizer6   append)problemsr5   r+   retproblemr.   r/   r   r8   r9   r:   r;   er<   r0   r>   r@   rA   rB   r*   r1   
dtype_listr2   rN   r3   r4   r   s                              r   get_gemm_configsrc   }   sI   2 	# * *gllnnG

#,a
#,a
#,a	"g	"g	"gx ff   mmK!KK'KKLLL K00I++mQ//KK//Ikk)T**GKK%%E;;vs##D;;~t44L&}'9:::;	;;<<<KK1$?$?$?$?$?!?!?f^dffggg``Y_`````GMMOOW]]__ioo6G6G\c\ou|  vC  vC  vE  vE  GN  GT  GT  GV  GV  Wj88Z88888ff   mm2b22333 ++mSL4K-KLLK++mSL4K-KLLK$Q1k7FKYdfjlofoq}  @E  GO  P  PG GIJJw	*s0   8A
B !BB(BH99
I&I!!I&c                    d}d}| t                      } g }g }|D ]>}|d         |d         g|d         |d         g|d         d	t          |d
                  z  gf}|d         |d         |d         |d         |d
         f\  }	}
}}}|d         dz  dk    }|rd|d         z  n|d         |d         |d         dz  g}t          ||	|
|t          j        t
          j                  }|j        |j        |d         rt          j
        n|j        ||j        |j        dg}|d         |rdndz  |d         |d         f}t          |d         |d         z  |d         |d         z  |d         dz  |d         z  gdg d||||d         |d         |d         f          }g }|r,|                    t          j        t           j        g           n+|                    t          j        t           j        g           t)          | |g|g||t*          j        t*          j        gt0          j                  D ],}|                    |           |                    |           -@||fS )+  
  Generate CUTLASS operations based on the list of configs provided by the heuristic provider

  args:
    manifest: manifest argument to which to add operations, or None to just return the operations without a manifest (for pruning an existing manifest)
    cuda_version: Cuda compiler version for generating cutlass operations
    kernel_configs: list of configs generated by the heuristic
      
  returns:
    (configs, operations): a list of heuristic-provided kernel configs along with a one-to-one corresponding list of the generated operations
  d   e   Nlayout_ar3   layout_br4   layout_drO   r:   r8   r9   r>   r@   	cluster_mr   r   
cta_tile_m
cta_tile_n
cta_tile_k   r)   )a_typeb_typec_typed_typeacc_typeepi_typer   	cluster_n	cluster_k)ro   r   r   cluster_shape)tile_schedulers	gemm_kind)Manifestr\   MathInstructionOpcodeClassTensorOpMathOperationmultiply_add	element_a	element_br   voidelement_accumulatorTileDescriptionr]   KernelScheduleTypeTmaWarpSpecialized2SmSm100EpilogueScheduleTypeTmaWarpSpecialized2SmTmaWarpSpecialized1SmSm100TmaWarpSpecialized1SmCreateGemmUniversal3xOperatorTileSchedulerTypeDefaultStreamKGemmKindUniversal3x)manifestcuda_versionkernel_configsmin_ccmax_ccr   
operationsconfigr;   r   r   r   	element_c	element_dis_2sminstruction_shapemath_instruction
data_typestile_multipliertile_description	schedulesos                         r   &generate_sm100_from_heuristics_configsr      so    &&zzH'* . .fj!6-#89F:<NPVWdPe;fiopzi{  ~A  EQ  RX  Yb  Rc  Ed  ~d  ie  fFFLYFWY_`iYjlrs~l  BH  IR  BS  U[  \e  Uf  GfCIy-y) K 1$)F7=W!f\2226,CWY_`lYmou  wC  pD  HI  pI  J&/ 	  &/%/&,Wo_X]];K;_%9%9 	J k*F/AqqBF;DWY_`kYlmO&oa00oa00a/!"446 ggK(&*=vk?RS
 
 
 I t*EG[Gqrssss*EG[Gqrsss*8fX@P?QS]_h  |M  |U  Wh  Wp  {q  }E  }Q  R  R  R  nnV
 
*	r   c                 p   d\  }}| t                      } g }g }|D ]}|d         t          |d                  z  dk    o|d         t          |d                  z  dk    }|d         |d         g|d	         |d         gt          j        d
gf}	|d         |d         |d         |d         |d         f\  }
}}}}g d}t	          ||
||t
          j        t          j                  }t          |||          }|rt          ||	d          }	g d}t          |d         |d         |d         gd|||||d         |d         |d         f          }t          ||||d|	t          j        |d                   \  }}t          |          rNt!          | |	g|g||t          j                  D ],}|                    |           |                    |           -t          |          rOt!          | |	g|g||t$          j        g          D ],}|                    |           |                    |           -||fS )re   )Z   r   Nr3   r8   rO   r4   r9   rh   ri   r   r>   r@   r:   )r   r   r   )element_sourceelement_dest)alignment_bitsrl   rm   rn   r   rk   rv   rw   rx   i(#  r*   )r   r   
is_alignedr   instantiation_levelr;   r{   enable_fp8_fast_acc)r{   )rz   )r|   r\   r   rJ   r}   r~   r   r   r   r   r   r   r   r   r   rX   r   r]   r   r   )r   r   r   r   r   r   r   r   r   r;   r   r   r   r   r   dummy_instr_shaper   r   dummy_warp_countr   r   stream_k_schedulesr   s                          r   %generate_sm90_from_heuristics_configsr     s3    .&&zzH'* 4 4f',vi7H*IISP  WW]^kWlo{  }C  DM  }N  pO  XO  SV  XVJj!6-#89F:<NPVWdPe;fisi  BC  iD  EFFLYFWY_`iYjlrs~l  BH  IR  BS  U[  \e  Uf  GfCIy-y) "		&/ 	  ;;K\et}~~~J Fj&EEEf !yy&lVL16,3GHK(&*=vk?RS   %8'$ 0	% 	% 	%!I! 9~~ ,XxBRASU_ajv~  wK  L  L  L  !v!
 ,XxBRASU_$65F5N4OQ Q Q  ! 	v! 
*	r   c                 P   g }t          |j        d          5 }t          j        |          }ddd           n# 1 swxY w Y   |j        dk    s|j        dk    rdn|j        }t          |          }t          d |j                            d          D                       r|	                    d           t          |||j        	          }g }g }|D ]}	t          d
 |j                            d          D                       r(t          |j        rdn| |j        |	d                   \  }
}t          d |j                            d          D                       r(t          |j        rdn| |j        |	d                   \  }
}||z  }d |	                                D             fdt#          |
|          D             }||z  }|D ]-}|                     d|                                 d           .|st)          d          t+          ||j                   |S )a  
  Prune a manifest according to heuristics suggestions from the problems file

  args:
    manifest: Cutlass manifest to prune
    args: generator.py args, requires:
      - args.heuristics_problems_file
      - args.heuristics_gpu
      - args.heuristics_testlist_file
      
  returns:
    A list of dictionaries, each of which has information about an operation and a problem from the input problems
  rNauto )gpuc              3      K   | ]}d |v V  	dS )100Nr
   r   archs     r   rF   z<filter_manifest_and_write_heuristics_file.<locals>.<genexpr>o  s&      CCT%4-CCCCCCr   ;@   )r5   r+   c              3      K   | ]}d |v V  	dS )90Nr
   r   s     r   rF   z<filter_manifest_and_write_heuristics_file.<locals>.<genexpr>v  s&      
B
BD44<
B
B
B
B
B
Br   r   c              3   &   K   | ]}d |v pd|v V  dS )r   101Nr
   r   s     r   rF   z<filter_manifest_and_write_heuristics_file.<locals>.<genexpr>x  s/      
X
X$ETM-u}
X
X
X
X
X
Xr   c                 &    i | ]\  }}|d k    ||S )r   r
   r   s      r   r   z=filter_manifest_and_write_heuristics_file.<locals>.<dictcomp>|  s#    RRR11	>>q!>>>r   c                 J    g | ]\  }}d |                                 i| S )operation_name)procedural_name)r   r%   r   problem_without_configss      r   
<listcomp>z=filter_manifest_and_write_heuristics_file.<locals>.<listcomp>}  sP      W  W  Weiefhi*A,=,=,?,?`CZ`^_`  W  W  Wr   ^$z!No valid configurations generated)r   heuristics_problems_filer   loadheuristics_gpur,   anyarchitecturessplitset_cta_div_nrc   heuristics_configs_per_problemr   heuristics_restrict_kernelsr   r   r   zipadd_kernel_filterr   	Exceptionwrite_profiler_testlist_to_csvheuristics_testlist_file)r   argsheuristics_problemsr&   r   mmhr!   all_configs_and_operationsr   r`   problem_configsproblem_operationswith_problem_sizer<   r   s                 @r   )filter_manifest_and_write_heuristics_filer   \  s+    D)3// '1)A,,' ' ' ' ' ' ' ' ' ' ' ' ' ' '$..$2E2K2KRVRe#S!!!#CCT%7%=%=c%B%BCCCCC b*+>TXTwxxx!*& 	4 	4g

B
BD$6$<$<S$A$A
B
B
BBB s.S\`\|  UKTXTX  CK  MQ  M^  `g  hq  `r  /s  /s++

X
X$:L:R:RSV:W:W
X
X
XXX t.T]a]}  VLUYUY  DL  NR  N_  ah  ir  as  /t  /t++$$JRRRRR W  W  W  Wmp  rA  CU  nV  nV  W  W  W"33 C CiA9#<#<#>#>AAABBBB	# 9
7
8
88 !;T=Z[[[	##s   :>>c                 
   |                                  }|D ]g}|                                D ]P\  }}t          |t                    rt          |         ||<   +t          |t
                    rt          |         ||<   Qht          |dd          5 }|d                                         }t          j
        ||          }|                                 |                    |           ddd           dS # 1 swxY w Y   dS )a(  
  Write a list of configs to a testlist to be consumed by cutlass_profiler

  args:
    configs_list: List of kernel configs along with runtime arguments and any other columns to include in the CSV, expressed as a list of dictionaries
    outfile_path: Outfile path
      
  returns:
    None
  r   r   )modenewliner   )
fieldnamesN)r   r   r   r   r   r   r   r   keyscsv
DictWriterwriteheader	writerows)	configs_listr"   profiler_testlistr%   r   r   ofilek_nameswriters	            r   r   r     sQ    #'')) ' 'a		 ' '1	Ax	 	  'Q!a$$ '#A&!	' LsB/// (5"''))G^Eg666F

&'''( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( (s   AC88C<?C<)FTr   N)Nr   )__doc__r   r   builtinshasattrr   ImportErrorcutlass_library.librarycutlass_library.generator#cutlass_library.heuristics_providerlibrary	generatorheuristics_provider
sm90_utilsr   r   r   logging	getLogger__name__rQ   r   r   rM   r'   r6   rc   r   r   r   r   r
   r   r   <module>r      s(  B   




$///WX/00 E5Kt5S5S
+C
D
DD''''))))33333 $ $ $######$
          
                

H
%
%44m1133444	$ $ $8T T T T:G G G GTE E EPK K KZ)$ )$ )$V( ( ( ( (s!   -: AA
A A/.A/