Improve benchmarking scripts.

2026-04-19 22:39:03 +00:00 · 2026-02-09 04:34:27 -05:00
parent 53201d2081
commit 60d1ec34a9
2 changed files with 127 additions and 5 deletions
--- a/benchmarking/demangle-names.py
+++ b/benchmarking/demangle-names.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+
+import subprocess
+import sys
+from bs4 import BeautifulSoup
+import re
+import json
+
+def get_mangled_kernel_names_from_html(html_file):
+    """Extract kernel names from the data array embedded in an HTML file.
+
+    The file is parsed with BeautifulSoup and every ``<script>`` tag is
+    inspected in document order. A tag is a candidate when its text contains
+    both 'tensor_operation' and 'device'. The first candidate holding a
+    ``var data = [...];`` assignment whose array parses as JSON, and whose
+    entries all carry a 'name' key, determines the result.
+
+    Args:
+        html_file: Path of the HTML file to read.
+
+    Returns:
+        The list of 'name' values of the array entries, or an empty list
+        when no script tag yields usable kernel data.
+    """
+    # Only the read needs the file handle; release it before parsing.
+    with open(html_file, 'r') as f:
+        html_content = f.read()
+
+    # Parse with BeautifulSoup to find the right script tag
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    for i, script in enumerate(soup.find_all('script')):
+        text = script.string
+        # Look for scripts containing our kernel signature strings
+        if not text or 'tensor_operation' not in text or 'device' not in text:
+            continue
+        print(f"Found kernel data in script tag {i}")
+
+        # Now extract the data array from this specific script. The match is
+        # non-greedy, so it ends at the first "];" after "var data =".
+        match = re.search(r'var\s+data\s*=\s*(\[.*?\]);', text, re.DOTALL)
+        if not match:
+            continue
+
+        try:
+            data = json.loads(match.group(1))
+            # Extract kernel names
+            kernel_names = [item['name'] for item in data]
+        except json.JSONDecodeError as e:
+            print(f"Error parsing JSON: {e}")
+            continue  # Try the next script tag
+        except (KeyError, TypeError) as e:
+            # The array parsed, but an entry is not an object with a 'name'
+            # key; treat it like unparsable data instead of crashing.
+            print(f"Unexpected kernel data layout: {e!r}")
+            continue  # Try the next script tag
+
+        print(f"Found {len(kernel_names)} kernel names:")
+        for name in kernel_names:
+            print(f"  {name}")
+        return kernel_names
+
+    print("Could not find kernel data in any script tag")
+    return []
+
+def de_mangle_name(mangled, demangler='/opt/rocm/llvm/bin/llvm-cxxfilt'):
+    """Demangle a symbol name by piping it through an external demangler.
+
+    The name is written to the demangler's standard input and the stripped
+    standard output is taken as the result.
+
+    Args:
+        mangled: The mangled symbol name.
+        demangler: Path of the executable to run. Defaults to the
+            llvm-cxxfilt location used by the original script.
+
+    Returns:
+        The demangled name, or None when the executable is not found, exits
+        with a non-zero status, or produces output that is empty or equal
+        to the input.
+    """
+    # Keep the try body limited to the call that can raise.
+    try:
+        result = subprocess.run(
+            [demangler],
+            input=mangled,
+            text=True,
+            capture_output=True,
+            check=True,
+        )
+    except (FileNotFoundError, subprocess.CalledProcessError) as e:
+        print(f"Error using {demangler}: {e}")
+        return None
+
+    demangled = result.stdout.strip()
+    if demangled and demangled != mangled:
+        return demangled
+    # Empty or unchanged output: nothing was demangled.
+    return None
+
+def extract_instance_name(demangled, prefix, prefix_instance):
+    """Return the bracket-balanced instance name that follows two prefixes.
+
+    ``demangled`` must start with ``prefix + prefix_instance``. Scanning
+    begins right after that combined prefix at nesting depth 1, i.e. one
+    '<' is assumed to be open already (the callers in this file pass a
+    ``prefix_instance`` ending in '<'). The scan stops at the '>' that
+    brings the depth back to zero.
+
+    Returns:
+        ``prefix_instance`` followed by the scanned text up to and including
+        that closing '>', or None when the prefix does not match or the
+        brackets never balance.
+    """
+    head = prefix + prefix_instance
+    if not demangled.startswith(head):
+        return None
+
+    begin = len(head)
+    open_brackets = 1
+    for pos in range(begin, len(demangled)):
+        ch = demangled[pos]
+        if ch == '<':
+            open_brackets += 1
+        elif ch == '>':
+            open_brackets -= 1
+            if open_brackets == 0:
+                # pos + 1 keeps the closing '>' in the result.
+                return prefix_instance + demangled[begin:pos + 1]
+
+    # Ran off the end with brackets still open.
+    return None
+
+def extract_GridwiseGemmMultiD_xdl_cshuffle_v3(demangled):
+    """Extract the GridwiseGemmMultiD_xdl_cshuffle_v3 instance name.
+
+    Delegates to extract_instance_name with the v3 kernel prefix and returns
+    its result: the instance name, or None when ``demangled`` does not start
+    with the expected prefixes or its brackets do not balance.
+    """
+    return extract_instance_name(
+        demangled,
+        prefix="void ck::tensor_operation::device::(anonymous namespace)::kernel_grouped_conv_fwd_xdl_cshuffle_v3<",
+        prefix_instance="ck::GridwiseGemmMultiD_xdl_cshuffle_v3<",
+    )
+
+def extract_GridwiseGemmMultiD_xdl_cshuffle(demangled):
+    """Extract the GridwiseGemmMultiD_xdl_cshuffle instance name.
+
+    Delegates to extract_instance_name with the non-v3 kernel prefix and
+    returns its result: the instance name, or None when ``demangled`` does
+    not start with the expected prefixes or its brackets do not balance.
+    """
+    return extract_instance_name(
+        demangled,
+        prefix="void ck::tensor_operation::device::(anonymous namespace)::kernel_grouped_conv_fwd_xdl_cshuffle<",
+        prefix_instance="ck::GridwiseGemmMultiD_xdl_cshuffle<",
+    )
+
+if __name__ == "__main__":
+    # Command-line entry point: demangle the name given as the first
+    # argument and print any GridwiseGemmMultiD instance name found in it.
+    # get_mangled_kernel_names_from_html(sys.argv[1])
+
+    # Mangled name is the first argument
+    if len(sys.argv) > 1:
+        mangled = sys.argv[1]
+        demangled = de_mangle_name(mangled)
+        print()
+        print("Demangled name:")
+        print(demangled)
+
+        # de_mangle_name returns None when nothing was demangled; the
+        # extractors call str methods on their argument, so skip them then.
+        if demangled is not None:
+            v3_instance_name = extract_GridwiseGemmMultiD_xdl_cshuffle_v3(demangled)
+            if v3_instance_name:
+                print()
+                print("Extracted GridwiseGemmMultiD_xdl_cshuffle_v3 instance name:")
+                print(v3_instance_name)
+            v1_instance_name = extract_GridwiseGemmMultiD_xdl_cshuffle(demangled)
+            if v1_instance_name:
+                print()
+                print("Extracted GridwiseGemmMultiD_xdl_cshuffle instance name:")
+                print(v1_instance_name)
+    else:
+        print("Please provide a mangled name as an argument.")
--- a/benchmarking/run-best-instances.py
+++ b/benchmarking/run-best-instances.py
@@ -5,12 +5,16 @@ import subprocess
 import sys
 import argparse

+# Run under rocprof-compute
+#    HIP_VISIBLE_DEVICES=7 rocprof-compute profile -n grouped_conv_fwd --roofline-data-type FP16 -- ./run-best-instances.py --profiler-path ../build-improved-convs/bin/ckProfiler
+
+# Disable verification and kernel timing. Multiple calls of the same kernel inside the script will confuse the profiler.
 profiler_commands = [
-  "1         1       1              0      1         0     1           2     32  32  4     4    3  3  200  200   1   1   1   1   1   1    1   1", 
-  "1         1       1              0      1         0     1           2     32  32  8     8    3  3  200  200   2   2   1   1   1   1    1   1", 
-  #"1         1       1              0      1         0     1           2     32  32  8     8    3  3  100  100   1   2   1   1   1   1    1   1", 
-  "1         1       1              0      1         0     1           2     1   32  2376  256  3  3  100  100   1   1   1   1   1   1    1   1", 
-  #"1         1       1              0      1         0     1           2     1   32  256   256  3  3  100  100   1   1   1   1   1   1    1   1"
+  "1         1       1              0      1         0     0           2     32  32  4     4    3  3  200  200   1   1   1   1   1   1    1   1", 
+  "1         1       1              0      1         0     0           2     32  32  8     8    3  3  200  200   2   2   1   1   1   1    1   1", 
+  #"1         1       1              0      1         0     0           2     32  32  8     8    3  3  100  100   1   2   1   1   1   1    1   1", 
+  "1         1       1              0      1         0     0           2     1   32  2376  256  3  3  100  100   1   1   1   1   1   1    1   1", 
+  #"1         1       1              0      1         0     0           2     1   32  256   256  3  3  100  100   1   1   1   1   1   1    1   1"
  ]

 baseline_instances = [