Adding README back into the gemm directory and integrate new preshuffle functions

2026-06-30 11:47:48 +00:00 · 2026-01-11 23:15:28 -05:00
parent 523ec7c863
commit 3791cfd71d
20 changed files with 732 additions and 380 deletions
--- a/6
+++ b/6
@@ -1640,7 +1640,7 @@ pipeline {
                                            -D GEMM_PRESHUFFLE_LAYOUT="rcr" \
                                            -D GEMM_PRESHUFFLE_CONFIG_FILE="default_ci_config.json" .. && \
                                           ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \
-                                           python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
+                                           python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
                                           python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
                                           python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
                    }
@@ -1681,7 +1681,7 @@ pipeline {
                                            -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
                                            -D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \
                                           ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all benchmark_gemm_streamk_all && \
-                                           python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
+                                           python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
                                           python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
                                           python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
                    }
@@ -1706,7 +1706,7 @@ pipeline {
                                            -D GEMM_UNIVERSAL_DATATYPE="fp16" \
                                            -D GEMM_UNIVERSAL_LAYOUT="rcr;rrr;crr;ccr" .. && \
                                           ninja -j${nthreads()} benchmark_gemm_universal_all && \
-                                           python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
+                                           python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
                    }
                    steps{
                        buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
--- a/test/ck_tile/gemm_tile_engine/CMakeLists.txt
+++ b/test/ck_tile/gemm_tile_engine/CMakeLists.txt
@@ -10,7 +10,7 @@
 # ============================================================================

 # Locate tile_engine GEMM scripts directory
-set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm")
+set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm/gemm_universal")

 if(NOT EXISTS ${TILE_ENGINE_GEMM_DIR})
    message(WARNING "Tile engine directory not found: ${TILE_ENGINE_GEMM_DIR}")
@@ -32,11 +32,11 @@ endif()
 #   config_json  - Full path to JSON configuration file
 # ============================================================================
 function(create_individual_gemm_test_target datatype layout config_name trait tile_config config_json)
-    set(target_name "test_gemm_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}")
+    set(target_name "test_gemm_universal_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}")
    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}/${config_name}")
    
    # Generated header path (already created during cmake configuration)
-    set(test_header "${working_path}/gemm_single_${datatype}_${layout}_${trait}_${tile_config}.hpp")
+    set(test_header "${working_path}/gemm_universal_single_${datatype}_${layout}_${trait}_${tile_config}.hpp")
    set(test_params_header "${working_path}/test_params.hpp")
    
    # Verify header exists (should have been generated during cmake configuration)
@@ -118,7 +118,7 @@ function(build_gemm_test_targets datatype layout config_name)

    # STEP 1: Discovery phase - list all valid kernel configurations
    execute_process(
-        COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py
+        COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py
                --working_path ${working_path}
                --datatype ${datatype}
                --layout ${layout}
@@ -178,7 +178,7 @@ function(build_gemm_test_targets datatype layout config_name)
                    
                    # Generate header using --gen_single
                    execute_process(
-                        COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py
+                        COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py
                                --working_path ${working_path}
                                --gpu_target "${GEMM_TEST_GPU_TARGETS}"
                                --datatype ${datatype}
--- a/tile_engine/ops/common/benchmark_utils.py
+++ b/tile_engine/ops/common/benchmark_utils.py
@@ -2,17 +2,16 @@
 # Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT

-import sys
 import json
 import subprocess
-import argparse
 import csv
-import time
 from pathlib import Path
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Optional


-def run_kernel(build_dir: Path, kernel_path: Path, params: Dict[str, str], verbose: bool = False) -> Optional[Dict]:
+def run_kernel(
+    build_dir: Path, kernel_path: Path, params: Dict[str, str], verbose: bool = False
+) -> Optional[Dict]:
    """Run a single kernel with given parameters and save output to individual JSON file"""
    # Create results directory
    results_dir = build_dir / "results"
@@ -59,6 +58,7 @@ def run_kernel(build_dir: Path, kernel_path: Path, params: Dict[str, str], verbo
        print(f"Error running {kernel_path.name}: {e}")
        return None

+
 def parse_json_file(json_file: Path, verbose: bool = False) -> Optional[Dict]:
    """Parse JSON data from individual kernel output file"""
    try:
@@ -88,9 +88,8 @@ def parse_json_file(json_file: Path, verbose: bool = False) -> Optional[Dict]:
            print(f"Error reading JSON file {json_file}: {e}")
        return None

-def find_best_kernel(
-     results: List[Dict], metric: str = "tflops"
-) -> Optional[Dict]:
+
+def find_best_kernel(results: List[Dict], metric: str = "tflops") -> Optional[Dict]:
    """Find the best performing kernel based on metric"""
    if not results:
        return None
@@ -126,7 +125,8 @@ def export_csv(results: List[Dict], filename: str, verbose: bool = False):

    print(f"Results exported to {filename}")

-def export_best_kernels( best_kernels: Dict, filename: str, verbose: bool = False):
+
+def export_best_kernels(best_kernels: Dict, filename: str, verbose: bool = False):
    """Export best kernel selections to file"""
    with open(filename, "w") as f:
        f.write("# Best kernel selections\n")
@@ -141,7 +141,10 @@ def export_best_kernels( best_kernels: Dict, filename: str, verbose: bool = Fals

    print(f"Best kernels exported to {filename}")

-def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, verbose: bool = False):
+
+def export_json(
+    results: List[Dict], filename: str, best_kernels: Dict = None, verbose: bool = False
+):
    """Export all results and best kernels to JSON with comprehensive metadata"""
    from datetime import datetime

@@ -223,9 +226,7 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v
        "benchmark_metadata": {
            "timestamp": datetime.now().isoformat(),
            "total_kernels_tested": len(results),
-            "unique_kernels": len(
-                set(r.get("name", "unknown") for r in results)
-            ),
+            "unique_kernels": len(set(r.get("name", "unknown") for r in results)),
            "successful_runs": len(successful_results),
            "failed_runs": len(results) - len(successful_results),
        },
@@ -265,9 +266,7 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v
                "by_scheduler": scheduler_stats,
                "by_data_type": data_type_stats,
            },
-            "total_problem_configurations": len(best_kernels)
-            if best_kernels
-            else 0,
+            "total_problem_configurations": len(best_kernels) if best_kernels else 0,
        },
        "kernel_results": results,
        "best_kernels_by_problem": best_kernels or {},
@@ -282,4 +281,3 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v
    print(f"  - Best TFLOPS: {max(tflops_values, default=0):.2f}")
    print(f"  - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s")
    print(f"  - Best latency: {min(latency_values, default=0):.2f}ms")
-
--- a/tile_engine/ops/common/utils.hpp
+++ b/tile_engine/ops/common/utils.hpp
@@ -20,89 +20,6 @@ constexpr auto is_row_major(Layout)
    return ck_tile::bool_constant<std::is_same_v<Layout, ck_tile::tensor_layout::gemm::RowMajor>>{};
 }

-// Structure to hold kernel traits for dispatcher
-struct KernelTraits
-{
-    std::string pipeline;  // compv3, compv4, mem
-    std::string scheduler; // intrawave, interwave
-    std::string epilogue;  // cshuffle, default
-    bool pad_m;
-    bool pad_n;
-    bool pad_k;
-    bool persistent;
-
-    // Constructor with defaults
-    KernelTraits()
-        : pipeline("compv3"),
-          scheduler("intrawave"),
-          epilogue("cshuffle"),
-          pad_m(false),
-          pad_n(false),
-          pad_k(false),
-          persistent(false)
-    {
-    }
-};
-
-
-// Create argument parser
-inline auto create_args(int argc, char* argv[])
-{
-    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
-        .insert("n", "4096", "The value for n dimension. Default is 4096.")
-        .insert("k", "2048", "The value for k dimension. Default is 2048.")
-        .insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
-        .insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
-	.insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.")
-        .insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
-        .insert("split_k", "1", "The split value for k dimension. Default is 1.")
-        .insert("verify",
-                "2",
-                "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
-                "for validation on GPU. Default is 2, GPU validation.")
-        .insert("log",
-                "false",
-                "Whether output kernel instance information or not. Possible values are true or "
-                "false. Default is false")
-        .insert(
-            "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
-        .insert(
-            "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
-        .insert("timer",
-                "true",
-                "Whether if the timer is gpu timer or not. Possible values are false or true. "
-                "Default is true.")
-        .insert("init",
-                "0",
-                "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
-                "for constant(1). Default is 0, random.")
-        .insert("flush_cache",
-                "true",
-                "To flush cache, possible values are true or false. "
-                "Default is false.")
-        .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.")
-        .insert("metric",
-                "0",
-                "Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
-                "tflops, or 2 for bandwidth. Default is 0, latency.")
-        .insert("csv_filename",
-                "",
-                "The filename of benchmark result. Default is empty (no CSV output).")
-        .insert("structured_sparsity",
-                "false",
-                "Whether use sparsity kernel or not. Possible values are true or false. Default is "
-                "false")
-        .insert("json_output",
-                "false",
-                "Whether to output results in JSON format only. Possible values are true or false. "
-                "Default is "
-                "false");
-
-    bool result = arg_parser.parse(argc, argv);
-    return std::make_tuple(result, arg_parser);
-}
-
 enum class Metric
 {
    LATENCY   = 0,
--- a/tile_engine/ops/gemm/README.md
+++ b/tile_engine/ops/gemm/README.md
@@ -0,0 +1,442 @@
+# CK Tile Engine GEMM Operations
+
+## Overview
+
+The CK Tile Engine GEMM module provides a comprehensive system for generating, building, and benchmarking GEMM (General Matrix Multiplication) kernels with various configurations. It supports multiple data types, layouts, and optimization strategies. The system has evolved from a monolithic build approach (where all kernels compile into a single executable) to a more flexible individual kernel compilation system, providing better build parallelism and targeted testing capabilities.
+
+## Table of Contents
+
+1. [Build System Architecture](#build-system-architecture)
+2. [Build Instructions](#build-instructions)
+3. [Running Benchmarks](#running-benchmarks)
+4. [Configuration System](#configuration-system)
+5. [Scripts and Tools](#scripts-and-tools)
+6. [Command Line Options](#command-line-options)
+7. [Understanding Kernel Names](#understanding-kernel-names)
+8. [Troubleshooting](#troubleshooting)
+9. [Performance Tips](#performance-tips)
+
+## Build System Architecture
+
+### Individual Kernel Compilation (New Approach)
+
+The new tile engine benchmark system compiles each kernel configuration into a separate executable. This provides:
+- Better build parallelism
+- Faster incremental builds
+- More targeted testing
+- Easier debugging of specific configurations
+
+Each benchmark executable follows the naming pattern:
+```
+benchmark_gemm_<dtype>_<layout>_<config>_<tile_sizes>
+```
+
+### Monolithic Build (Legacy Approach)
+
+The original system compiles all kernels into a single executable (`benchmark_gemm_[Datatype]_[Layout]`), which can then be filtered at runtime using command-line arguments.
+
+## Build Instructions
+
+### Prerequisites
+- ROCm installation
+- CMake 3.16 or higher
+- C++17 compatible compiler
+
+### Basic Build
+
+```bash
+# In the root of composable kernel, create build directory
+mkdir build && cd build
+
+# Configure with specific datatypes and layouts
+# Replace [Arch] with your GPU architecture (e.g., gfx90a, gfx942)
+# Replace [Datatype1;Datatype2;...] with datatypes (fp8, bf8, int8, fp16, bf16, fp32, fp64)
+# Replace [Layout1;Layout2;...] with layouts (rcr, rrr, crr, ccr)
+../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_DATATYPE="[Datatype1;Datatype2]" -DGEMM_LAYOUT="[Layout1;Layout2]"
+
+# Build specific benchmarks
+make benchmark_gemm_[Datatype1]_[Layout1] -j
+```
+
+### Configuration Options
+
+The build system supports several configuration options:
+
+#### Using Custom Config Files
+```bash
+# Method 1: CMake variable (config file must be in configs/ directory)
+cmake -DGEMM_CONFIG_FILE=my_custom_config.json ...
+
+# Method 2: Environment variable (takes precedence over CMake variable)
+export GEMM_CONFIG_FILE=my_custom_config.json
+cmake ...
+```
+
+#### Config File Priority Order
+1. **Environment variable** `GEMM_CONFIG_FILE` (highest priority)
+2. **CMake variable** `GEMM_CONFIG_FILE`
+3. **Default config** (default_config.json for all layouts)
+
+**Note**: All custom config files must be placed in the `tile_engine/ops/gemm/configs/` directory.
+
+### Example Build Commands
+
+```bash
+# Build for gfx942 with fp8 and fp16 datatypes, rcr layout
+mkdir build && cd build
+../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_DATATYPE="fp8;fp16" -DGEMM_LAYOUT="rcr;ccr;rrr;crr"
+make benchmark_gemm_universal_fp8_rcr -j
+make benchmark_gemm_universal_fp16_rcr -j
+```
+
+### Building Individual Kernels
+
+```bash
+# Build a specific kernel configuration
+make benchmark_gemm_universal_fp8_rcr_compv4_default_intrawave_False_False_False_False_256x256x32_1x4x1_32x32x32
+
+# Build all fp16 benchmarks in parallel
+make -j$(nproc) $(make help | grep benchmark_gemm_fp16 | awk '{print $2}')
+```
+
+### Rebuilding After Configuration Changes
+
+If you modify the configuration file, you must rebuild:
+```bash
+rm -rf tile_engine/ && make benchmark_gemm_universal_[Datatype]_[Layout] -j
+```
+
+## Running Benchmarks
+
+### Individual Kernel Execution
+
+```bash
+cd /path/to/build/directory
+./bin/benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 \
+    -m=512 -n=512 -k=512 -verify=1
+```
+
+### Monolithic Executable (Legacy)
+
+```bash
+# Run specific pipeline/scheduler/epilogue combination
+./bin/benchmark_gemm_universal_[Datatype]_[Layout] -pipeline=compv3 -scheduler=intrawave -epilogue=default
+```
+
+### Automated Testing
+
+Use the provided test script to run multiple benchmarks:
+```bash
+cd /path/to/composable_kernel/tile_engine/ops/gemm
+./test_benchmark.sh [build_directory]
+```
+
+## Configuration System
+
+### Configuration Files
+
+The system uses JSON configuration files to specify kernel parameters:
+
+- `configs/default_config.json` - Default configurations for various datatypes
+- `configs/user_provided_config.json` - User-customizable configurations
+
+### Configuration Structure
+
+```json
+{
+    "tile_config": {
+        "tile_m": {"values": [256, 128]},
+        "tile_n": {"values": [256, 128]},
+        "tile_k": {"values": [64, 32]},
+        "warp_m": {"values": [2, 4]},
+        "warp_n": {"values": [2, 1]},
+        "warp_k": {"values": [1]},
+        "warp_tile_m": {"values": [32, 16]},
+        "warp_tile_n": {"values": [32, 16]},
+        "warp_tile_k": {"values": [16, 32]}
+    },
+    "trait_config": {
+        "pipeline": {"values": ["compv3", "compv4", "mem"]},
+        "scheduler": {"values": ["intrawave", "interwave"]},
+        "epilogue": {"values": ["default", "cshuffle"]},
+        "pad_m": {"values": [false]},
+        "pad_n": {"values": [false]},
+        "pad_k": {"values": [false]},
+        "persistent": {"values": [false]}
+    }
+}
+```
+
+## Scripts and Tools
+
+### Python Scripts
+
+#### gemm_universal_instance_builder.py
+**Purpose**: Main kernel instance generation script that creates C++ kernel implementations based on configuration files.
+
+**Key Features**:
+- Generates individual kernel header files for separate compilation
+- Supports multiple data types (fp16, fp8, bf16, fp32, fp64)
+- Validates tile configurations for correctness
+- Creates CMake integration files
+
+**Usage**:
+```bash
+python gemm_universal_instance_builder.py \
+    --working_path ./generated \
+    --datatype fp16 \
+    --layout rcr \
+    --config_json configs/user_provided_config.json \
+    --gen_all_individual
+```
+
+#### gemm_instance_builder_parallel.py
+**Purpose**: Parallel version of the instance builder for faster generation of multiple kernel configurations.
+
+**Features**:
+- Multi-threaded kernel generation
+- Improved performance for large configuration spaces
+
+#### validation_utils.py
+**Purpose**: Provides comprehensive validation functions for kernel configurations.
+
+**Key Functions**:
+- `is_tile_config_valid()` - Validates tile dimensions and alignments
+- `is_trait_combination_valid()` - Checks if pipeline/epilogue/scheduler combinations are supported
+- `validate_warp_tile_combination()` - GPU-specific warp tile validation
+- `validate_lds_capacity()` - Ensures configurations fit in LDS memory
+
+**Validation Checks**:
+- Dimension alignment (tile dimensions must be divisible by warp dimensions)
+- LDS capacity constraints
+- GPU-specific warp tile support
+- Unsupported trait combinations
+
+#### test_validation.py
+**Purpose**: Test suite for the validation logic to ensure correctness.
+
+**Usage**:
+```bash
+python test_validation.py
+```
+
+**Tests**:
+- Warp tile combination validation
+- Trait combination validation
+- Full tile configuration validation
+
+#### gemm_universal_benchmark.py
+**Purpose**: Python script for running and analyzing GEMM benchmarks.
+
+**Features**:
+- Automated benchmark execution
+- Performance data collection
+- Result analysis and reporting
+
+#### json_config.py
+**Purpose**: Configuration file parsing and management.
+
+**Features**:
+- JSON configuration loading
+- Default configuration handling
+- Configuration validation
+
+#### codegen_utils.py
+**Purpose**: Utility functions for code generation.
+
+**Features**:
+- Template processing
+- Code formatting utilities
+- File generation helpers
+
+### Shell Scripts
+
+#### test_benchmark.sh
+**Purpose**: Automated benchmark testing script that finds and runs all built benchmark executables.
+
+**Features**:
+- Automatic build directory detection
+- Batch execution of multiple benchmarks
+- CSV result collection
+- Colored output for easy reading
+- Example command generation
+
+**Usage**:
+```bash
+# Auto-detect build directory
+./test_benchmark.sh
+
+# Specify build directory
+./test_benchmark.sh /path/to/build/directory
+```
+
+**What it does**:
+1. Finds all benchmark executables in the build directory
+2. Runs each with multiple problem sizes (512, 1024, 2048)
+3. Performs GPU verification
+4. Saves results to timestamped CSV file
+5. Provides summary statistics
+
+## Command Line Options
+
+All benchmark executables support the following options:
+
+### Matrix Dimensions
+- `-m=<value>` - M dimension (default: 3840)
+- `-n=<value>` - N dimension (default: 4096)
+- `-k=<value>` - K dimension (default: 2048)
+
+### Strides
+- `-stride_a=<value>` - Stride for matrix A (default: 0, auto-calculated)
+- `-stride_b=<value>` - Stride for matrix B (default: 0, auto-calculated)
+- `-stride_c=<value>` - Stride for matrix C (default: 0, auto-calculated)
+
+### Verification
+- `-verify=<0|1|2>` - Verification mode
+  - 0: No verification (default)
+  - 1: CPU verification
+  - 2: GPU verification
+
+### Performance Testing
+- `-warmup=<value>` - Warmup iterations (default: 50)
+- `-repeat=<value>` - Benchmark iterations (default: 100)
+- `-timer=<true|false>` - Use GPU timer (default: true)
+- `-flush_cache=<true|false>` - Flush cache between runs (default: true)
+- `-rotating_count=<value>` - Cache rotation count (default: 1000)
+
+### Initialization
+- `-init=<0|1|2>` - Tensor initialization method
+  - 0: Random values [-1, 1] (default)
+  - 1: Linear sequence (i % 17)
+  - 2: Constant value (1.0)
+
+### Output Options
+- `-log=<true|false>` - Enable verbose logging (default: false)
+- `-metric=<0|1|2>` - Performance metric
+  - 0: Latency in ms (default)
+  - 1: TFLOPS
+  - 2: Bandwidth in GB/s
+- `-json_output=<true|false>` - JSON format output (default: false)
+- `-csv_filename=<filename>` - Save results to CSV
+- `-csv_format=<simple|comprehensive>` - CSV format (default: comprehensive)
+
+### Advanced Options
+- `-split_k=<value>` - Split-K factor (default: 1)
+- `-structured_sparsity=<true|false>` - Enable structured sparsity (default: false)
+- `-pipeline=<compv3|compv4|mem>` - Pipeline type (default: compv3)
+- `-scheduler=<intrawave|interwave>` - Scheduler type (default: intrawave)
+- `-epilogue=<cshuffle|default>` - Epilogue type (default: cshuffle)
+- `-pad_m=<true|false>` - Pad M dimension (default: false)
+- `-pad_n=<true|false>` - Pad N dimension (default: false)
+- `-pad_k=<true|false>` - Pad K dimension (default: false)
+- `-persistent=<true|false>` - Use persistent kernel (default: false)
+
+## Understanding Kernel Names
+
+The kernel naming convention encodes the configuration:
+
+```
+benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16
+               		 ^^^^  ^^^ ^^^^^^ ^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^ ^^^^^^^ ^^^^^^^^^
+               		 |     |   |      |       |         |                       |         |       |
+               		 |     |   |      |       |         Padding & flags         |         |       Warp tile
+               		 |     |   |      |       Scheduler                         |         Thread tile
+               		 |     |   |      Epilogue                                  Block tile
+               		 |     |   Pipeline
+               		 |     Layout (Row-Column-Row)
+               		 Data type
+```
+
+### Components:
+- **Data type**: fp16, fp32, bf16, fp8, bf8, int8
+- **Layout**: rcr (Row-Column-Row), rrr, crr, ccr
+- **Pipeline**: mem, compv3, compv4
+- **Epilogue**: default, cshuffle
+- **Scheduler**: intrawave, interwave
+- **Flags**: pad_m, pad_n, pad_k, persistent (4 boolean flags)
+- **Tile sizes**: BlockTile x ThreadTile x WarpTile
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Kernel not found**
+   - Ensure the specific benchmark executable is built
+   - Check the build directory bin/ folder
+
+2. **Verification failures**
+   - Try GPU verification (-verify=2) which may be more accurate
+   - Check data type compatibility
+   - Verify stride calculations
+
+3. **Build failures**
+   - Check GPU architecture compatibility
+   - Ensure ROCm is properly installed
+   - Verify configuration file syntax
+
+4. **Performance variations**
+   - Increase warmup iterations
+   - Disable CPU frequency scaling
+   - Use GPU timer for accurate measurements
+
+### Debug Options
+
+Enable verbose logging:
+```bash
+./bin/benchmark_gemm_... -log=true -verify=1
+```
+
+Test validation logic:
+```bash
+python test_validation.py
+```
+
+## Performance Tips
+
+1. **Optimal Problem Sizes**: Use sizes that are multiples of tile dimensions
+2. **Warmup**: Use at least 50-100 warmup iterations
+3. **GPU Timer**: Always use `-timer=true` for accurate measurements
+4. **Cache Management**: Enable cache flushing for consistent results
+5. **Thread Affinity**: Set CPU affinity to reduce variation
+
+## Integration Examples
+
+### Python Integration
+
+```python
+import subprocess
+import json
+
+# Run benchmark with JSON output
+result = subprocess.run([
+    './bin/benchmark_gemm_universal_fp16_rcr_...', 
+    '-m=1024', '-n=1024', '-k=1024',
+    '-json_output=true'
+], capture_output=True, text=True)
+
+# Parse results
+data = json.loads(result.stdout)
+print(f"Performance: {data['tflops']} TFLOPS")
+```
+
+### Batch Testing Script
+
+```bash
+#!/bin/bash
+SIZES="512 1024 2048 4096"
+for size in $SIZES; do
+    echo "Testing ${size}x${size}x${size}"
+    ./bin/benchmark_gemm_... -m=$size -n=$size -k=$size \
+        -verify=2 -csv_filename=results.csv
+done
+```
+
+## Contributing
+
+When adding new features or configurations:
+1. Update validation logic in `validation_utils.py`
+2. Add tests to `test_validation.py`
+3. Update configuration examples
+4. Document new command-line options
+
+For more information about the Composable Kernel project, visit the main repository documentation.
--- a/tile_engine/ops/gemm/gemm_benchmark.hpp
+++ b/tile_engine/ops/gemm/gemm_benchmark.hpp
@@ -52,19 +52,27 @@ struct GemmProblem

 // Detect Problem::DsDataType, default to void when absent
 template <class T, class = void>
-struct get_DsDataType { using type = void; };
+struct get_DsDataType
+{
+    using type = void;
+};

 template <class T>
-struct get_DsDataType<T, std::void_t<typename T::DsDataType>> {
+struct get_DsDataType<T, std::void_t<typename T::DsDataType>>
+{
    using type = typename T::DsDataType;
 };

 // Detect Problem::D0DataType, default to void when absent
 template <class T, class = void>
-struct get_D0DataType { using type = void; };
+struct get_D0DataType
+{
+    using type = void;
+};

 template <class T>
-struct get_D0DataType<T, std::void_t<typename T::D0DataType>> {
+struct get_D0DataType<T, std::void_t<typename T::D0DataType>>
+{
    using type = typename T::D0DataType;
 };

@@ -79,10 +87,10 @@ bool compare(std::string instanceName,
    using DDataType = typename get_D0DataType<Problem>::type;
    const float max_accumulated_value =
        *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end());
-    //const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
-        //K, kbatch, max_accumulated_value);
+    // const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+    // K, kbatch, max_accumulated_value);
    auto rtol_atol = [&] {
-        if constexpr (std::is_void_v<DDataType>)
+        if constexpr(std::is_void_v<DDataType>)
        {
            return calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
                K, kbatch, max_accumulated_value);
--- a/tile_engine/ops/gemm/gemm_benchmark.py
+++ b/tile_engine/ops/gemm/gemm_benchmark.py
@@ -3,15 +3,10 @@
 # SPDX-License-Identifier: MIT

 import os
-import sys
-import json
-import subprocess
-import argparse
-import csv
-import time
 import importlib.util
 from pathlib import Path
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple
+

 # TODO: explore modularizing tile engine to avoid accessing imports like this
 def _import_benchmark_utils():
@@ -29,10 +24,14 @@ def _import_benchmark_utils():

    return benchmark_utils

+
 benchmark_utils = _import_benchmark_utils()

+
 class GemmBenchmark:
-    def __init__(self, build_dir: str, verbose: bool = False, name: str = "benchmark_gemm_"):
+    def __init__(
+        self, build_dir: str, verbose: bool = False, name: str = "benchmark_gemm_"
+    ):
        self.build_dir = Path(build_dir)
        self.verbose = verbose
        self.results = []
@@ -57,7 +56,7 @@ class GemmBenchmark:
        """Extract comprehensive kernel information from filename"""
        name = kernel_path.stem
        if name.startswith(self.name):
-            args = name[len(self.name):]
+            args = name[len(self.name) :]
        else:
            args = name

@@ -245,7 +244,9 @@ class GemmBenchmark:

        for kernel_path in kernels:
            kernel_info = self.extract_kernel_info(kernel_path)
-            result = benchmark_utils.run_kernel(self.build_dir, kernel_path, params, verbose=self.verbose)
+            result = benchmark_utils.run_kernel(
+                self.build_dir, kernel_path, params, verbose=self.verbose
+            )
            if result:
                # Create new structured result format
                structured_result = {
@@ -327,5 +328,3 @@ class GemmBenchmark:

        self.results = all_results
        return best_kernels
-
-
--- a/tile_engine/ops/gemm/gemm_common.hpp
+++ b/tile_engine/ops/gemm/gemm_common.hpp
@@ -0,0 +1,96 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+#include <iostream>
+#include <functional>
+#include <tuple>
+#include <exception>
+#include <sstream>
+#include <vector>
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+
+// Structure to hold kernel traits for dispatcher
+struct KernelTraits
+{
+    std::string pipeline;  // compv3, compv4, mem
+    std::string scheduler; // intrawave, interwave
+    std::string epilogue;  // cshuffle, default
+    bool pad_m;
+    bool pad_n;
+    bool pad_k;
+    bool persistent;
+
+    // Constructor with defaults
+    KernelTraits()
+        : pipeline("compv3"),
+          scheduler("intrawave"),
+          epilogue("cshuffle"),
+          pad_m(false),
+          pad_n(false),
+          pad_k(false),
+          persistent(false)
+    {
+    }
+};
+
+// Create argument parser
+inline auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
+        .insert("n", "4096", "The value for n dimension. Default is 4096.")
+        .insert("k", "2048", "The value for k dimension. Default is 2048.")
+        .insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
+        .insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
+        .insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.")
+        .insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
+        .insert("split_k", "1", "The split value for k dimension. Default is 1.")
+        .insert("verify",
+                "2",
+                "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
+                "for validation on GPU. Default is 2, GPU validation.")
+        .insert("log",
+                "false",
+                "Whether output kernel instance information or not. Possible values are true or "
+                "false. Default is false")
+        .insert(
+            "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
+        .insert(
+            "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
+        .insert("timer",
+                "true",
+                "Whether if the timer is gpu timer or not. Possible values are false or true. "
+                "Default is true.")
+        .insert("init",
+                "0",
+                "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
+                "for constant(1). Default is 0, random.")
+        .insert("flush_cache",
+                "true",
+                "To flush cache, possible values are true or false. "
+                "Default is false.")
+        .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.")
+        .insert("metric",
+                "0",
+                "Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
+                "tflops, or 2 for bandwidth. Default is 0, latency.")
+        .insert("csv_filename",
+                "",
+                "The filename of benchmark result. Default is empty (no CSV output).")
+        .insert("structured_sparsity",
+                "false",
+                "Whether use sparsity kernel or not. Possible values are true or false. Default is "
+                "false")
+        .insert("json_output",
+                "false",
+                "Whether to output results in JSON format only. Possible values are true or false. "
+                "Default is "
+                "false");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
--- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py
+++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py
@@ -3,14 +3,10 @@

 import os
 import sys
-import json
-import subprocess
 import argparse
-import csv
 import time
 import importlib.util
-from pathlib import Path
-from typing import List, Dict, Tuple, Optional
+

 def _import_gemm_benchmark():
    """Import validation utilities from commons directory."""
@@ -27,6 +23,7 @@ def _import_gemm_benchmark():

    return gemm_benchmark_module.GemmBenchmark

+
 def _import_benchmark_utils():
    """Import benchmark utilities from commons directory."""
    current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -42,13 +39,16 @@ def _import_benchmark_utils():

    return benchmark_utils

+
 GemmBenchmark = _import_gemm_benchmark()
 benchmark_utils = _import_benchmark_utils()

+
 class GemmMultiDBenchmark(GemmBenchmark):
    def __init__(self, build_dir: str, verbose: bool = False):
        super().__init__(build_dir, verbose, name="benchmark_gemm_multi_d_")

+
 def main():
    parser = argparse.ArgumentParser(
        description="GEMM Multi D Kernel Benchmarking Tool"
--- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp
+++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp
@@ -11,6 +11,7 @@

 #include "ck_tile/core.hpp"
 #include "ck_tile/host.hpp"
+#include "gemm/gemm_common.hpp"
 #include "gemm_multi_d_profiler.hpp"

 // The kernel header is included via the compile command line with -include flag
@@ -35,29 +36,27 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser)
    std::string layout_d1 = D1Layout::name;

    // Create GemmMultiDProblem struct
-    GemmMultiDProblem gemm_multi_d_problem{
-	    				GemmProblem{
-	    				   arg_parser.get_int("split_k"),
-                                           arg_parser.get_int("m"),
-                                           arg_parser.get_int("n"),
-                                           arg_parser.get_int("k"),
-                                           arg_parser.get_int("stride_a"),
-                                           arg_parser.get_int("stride_b"),
-                                           arg_parser.get_int("stride_c"),
-                                           dtype_a,
-                                           dtype_b,
-                                           dtype_acc,
-                                           dtype_c,
-                                           layout_a,
-                                           layout_b,
-                                           layout_c,
-					   arg_parser.get_bool("structured_sparsity")},
-					   arg_parser.get_int("stride_ds"),
-        				   arg_parser.get_int("stride_ds"),
-        				   dtype_d0,                 
-        				   dtype_d1,                      
-        				   layout_d0,                       
-        				   layout_d1};
+    GemmMultiDProblem gemm_multi_d_problem{GemmProblem{arg_parser.get_int("split_k"),
+                                                       arg_parser.get_int("m"),
+                                                       arg_parser.get_int("n"),
+                                                       arg_parser.get_int("k"),
+                                                       arg_parser.get_int("stride_a"),
+                                                       arg_parser.get_int("stride_b"),
+                                                       arg_parser.get_int("stride_c"),
+                                                       dtype_a,
+                                                       dtype_b,
+                                                       dtype_acc,
+                                                       dtype_c,
+                                                       layout_a,
+                                                       layout_b,
+                                                       layout_c,
+                                                       arg_parser.get_bool("structured_sparsity")},
+                                           arg_parser.get_int("stride_ds"),
+                                           arg_parser.get_int("stride_ds"),
+                                           dtype_d0,
+                                           dtype_d1,
+                                           layout_d0,
+                                           layout_d1};

    // Create Setting struct
    Setting setting{arg_parser.get_int("warmup"),
--- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp
+++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp
@@ -11,25 +11,28 @@
 #include <functional>
 #include <tuple>

-
 #include "ck_tile/host/device_prop.hpp"
 #include "ck_tile/ops/gemm.hpp"
 #include "gemm/gemm_profiler.hpp"
+#include "common/utils.hpp"
 #include "gemm_multi_d_benchmark.hpp"

-class GemmMultiDProfiler: public GemmProfiler<GemmMultiDProfiler,
-                          GemmMultiDProblem,
-                          ck_tile::GemmMultiDHostArgs<DsDataType::size()>>
+class GemmMultiDProfiler : public GemmProfiler<GemmMultiDProfiler,
+                                               GemmMultiDProblem,
+                                               ck_tile::GemmMultiDHostArgs<DsDataType::size()>>
 {
-public:
+    public:
    using BaseGemm = GemmProfiler<GemmMultiDProfiler,
-                              GemmMultiDProblem,
-                              ck_tile::GemmMultiDHostArgs<DsDataType::size()>>;
+                                  GemmMultiDProblem,
+                                  ck_tile::GemmMultiDHostArgs<DsDataType::size()>>;
    using BaseGemm::benchmark;

    GemmMultiDProfiler(Setting setting)
-        : GemmProfiler<GemmMultiDProfiler, GemmMultiDProblem, ck_tile::GemmMultiDHostArgs<DsDataType::size()>>(setting) {}
-
+        : GemmProfiler<GemmMultiDProfiler,
+                       GemmMultiDProblem,
+                       ck_tile::GemmMultiDHostArgs<DsDataType::size()>>(setting)
+    {
+    }

    void benchmark(
        GemmMultiDProblem& gemm_multi_d_problem,
@@ -157,5 +160,4 @@ public:
                           kernel_run_result);
        }
    }
-
 };
--- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp
+++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp
@@ -8,6 +8,23 @@
 #include "gemm_preshuffle_common.hpp"
 #include "gemm/gemm_benchmark.hpp"

+struct KernelConfig
+{
+    ck_tile::index_t M_Tile;
+    ck_tile::index_t N_Tile;
+    ck_tile::index_t K_Tile;
+
+    ck_tile::index_t M_Warp;
+    ck_tile::index_t N_Warp;
+    ck_tile::index_t K_Warp;
+
+    ck_tile::index_t M_Warp_Tile;
+    ck_tile::index_t N_Warp_Tile;
+    ck_tile::index_t K_Warp_Tile;
+
+    bool permuteN;
+};
+
 /// @brief Function to get the kernel output with reference implementation on CPU/GPU
 void gemm_host_reference(int verify,
                         ck_tile::HostTensor<ADataType>& a_m_k,
--- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py
+++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py
@@ -3,14 +3,9 @@

 import os
 import sys
-import json
-import subprocess
 import argparse
-import csv
 import time
 import importlib.util
-from pathlib import Path
-from typing import List, Dict, Tuple, Optional


 def _import_gemm_benchmark():
@@ -28,6 +23,7 @@ def _import_gemm_benchmark():

    return gemm_benchmark_module.GemmBenchmark

+
 def _import_benchmark_utils():
    """Import benchmark utilities from commons directory."""
    current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -43,13 +39,16 @@ def _import_benchmark_utils():

    return benchmark_utils

+
 GemmBenchmark = _import_gemm_benchmark()
 benchmark_utils = _import_benchmark_utils()

+
 class GemmPreshuffleBenchmark(GemmBenchmark):
    def __init__(self, build_dir: str, verbose: bool = False):
        super().__init__(build_dir, verbose, name="benchmark_gemm_preshuffle_")

+
 def main():
    parser = argparse.ArgumentParser(
        description="GEMM Preshuffle Kernel Benchmarking Tool"
--- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp
+++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp
@@ -11,78 +11,21 @@

 #include "ck_tile/core.hpp"
 #include "ck_tile/host.hpp"
+#include "gemm/gemm_common.hpp"
 #include "gemm_preshuffle_profiler.hpp"
 #include "gemm_preshuffle_common.hpp"

 // The kernel header is included via the compile command line with -include flag
 // It defines SelectedKernel struct and KERNEL_NAME
-// DataTypeTraits are now defined in gemm_common.hpp
-
-// Create argument parser
-inline auto create_args(int argc, char* argv[])
-{
-    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
-        .insert("n", "4096", "The value for n dimension. Default is 4096.")
-        .insert("k", "2048", "The value for k dimension. Default is 2048.")
-        .insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
-        .insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
-        .insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
-        .insert("split_k", "1", "The split value for k dimension. Default is 1.")
-        .insert("verify",
-                "2",
-                "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
-                "for validation on GPU. Default is 0, no validation.")
-        .insert("log",
-                "false",
-                "Whether output kernel instance information or not. Possible values are true or "
-                "false. Default is false")
-        .insert(
-            "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
-        .insert(
-            "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
-        .insert("timer",
-                "true",
-                "Whether if the timer is gpu timer or not. Possible values are false or true. "
-                "Default is true.")
-        .insert("init",
-                "0",
-                "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
-                "for constant(1). Default is 0, random.")
-        .insert("flush_cache",
-                "true",
-                "To flush cache, possible values are true or false. "
-                "Default is false.")
-        .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.")
-        .insert("metric",
-                "0",
-                "Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
-                "tflops, or 2 for bandwidth. Default is 0, latency.")
-        .insert("csv_filename",
-                "",
-                "The filename of benchmark result. Default is empty (no CSV output).")
-        .insert("structured_sparsity",
-                "false",
-                "Whether use sparsity kernel or not. Possible values are true or false. Default is "
-                "false")
-        .insert("json_output",
-                "false",
-                "Whether to output results in JSON format only. Possible values are true or false. "
-                "Default is "
-                "false");
-
-    bool result = arg_parser.parse(argc, argv);
-    return std::make_tuple(result, arg_parser);
-}

 void benchmark_single(const ck_tile::ArgParser& arg_parser)
 {
    // Use DataTypeTraits to get the actual type names from the generated header
    // The generated header defines ADataType, BDataType, AccDataType, CDataType
-    std::string dtype_a   = DataTypeTraits<ADataType>::name;
-    std::string dtype_b   = DataTypeTraits<BDataType>::name;
-    std::string dtype_acc = DataTypeTraits<AccDataType>::name;
-    std::string dtype_c   = DataTypeTraits<CDataType>::name;
+    std::string dtype_a   = ck_tile::DataTypeTraits<ADataType>::name;
+    std::string dtype_b   = ck_tile::DataTypeTraits<BDataType>::name;
+    std::string dtype_acc = ck_tile::DataTypeTraits<AccDataType>::name;
+    std::string dtype_c   = ck_tile::DataTypeTraits<CDataType>::name;

    // Layout names from the layout types
    std::string layout_a = ALayout::name;
@@ -119,29 +62,17 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser)
                    arg_parser.get_bool("json_output")};

    // Get the profiler instance
-    auto& profiler = GemmProfiler::instance(setting);
+    auto& profiler = GemmPreshuffleProfiler::instance(setting);

    try
    {
-        // Create a lambda that wraps the kernel launch
-        std::tuple<int, int, int> warp_tile_dims = std::make_tuple(
-            SelectedKernel::WarpTileM, SelectedKernel::WarpTileN, SelectedKernel::WarpTileK);
-        std::tuple<int, int, int> tile_dims =
-            std::make_tuple(SelectedKernel::TileM, SelectedKernel::TileN, SelectedKernel::TileK);
-        std::tuple<int, int, int> warp_dims = std::make_tuple(SelectedKernel::WarpPerBlock_M,
-                                                              SelectedKernel::WarpPerBlock_N,
-                                                              SelectedKernel::WarpPerBlock_K);
-        bool permuteN                       = SelectedKernel::PermuteN;
-
-        KernelConfig config{tile_dims, warp_dims, warp_tile_dims, permuteN};
-
        auto kernel_func = [](const ck_tile::GemmHostArgs& args,
                              const ck_tile::stream_config& stream) {
            return SelectedKernel::launch(args, stream);
        };

        // Benchmark the kernel
-        profiler.benchmark(gemm_problem, kernel_func, config);
+        profiler.benchmark(gemm_problem, kernel_func);

        // Select best instance based on metric
        profiler.select_best_instance(static_cast<Metric>(arg_parser.get_int("metric")));
--- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp
+++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp
@@ -8,35 +8,20 @@
 #include "ck_tile/host.hpp"
 #include "ck_tile/core/numeric/integer.hpp"
 #include "ck_tile/core/numeric/pk_int4.hpp"
+#include "gemm/gemm_common.hpp"

 // Structure to hold kernel traits for dispatcher
-struct KernelTraits
+struct PreshuffleKernelTraits : KernelTraits
 {
-    std::string pipeline;  // preshufflev2
-    std::string scheduler; // intrawave, interwave, default
-    std::string epilogue;  // cshuffle, default
-    bool pad_m;
-    bool pad_n;
-    bool pad_k;
-    bool persistent;

    // Constructor with defaults
-    KernelTraits()
-        : pipeline("preshufflev2"),
-          scheduler("default"),
-          epilogue("default"),
-          pad_m(false),
-          pad_n(false),
-          pad_k(false),
-          persistent(false)
-    {
-    }
+    PreshuffleKernelTraits() : KernelTraits() { this->pipeline = "preshufflev2"; }
 };

 // Helper to extract traits from kernel name
-inline KernelTraits extract_traits_from_name(const std::string& kernel_name)
+inline PreshuffleKernelTraits extract_traits_from_name(const std::string& kernel_name)
 {
-    KernelTraits traits;
+    PreshuffleKernelTraits traits;

    // Extract pipeline
    if(kernel_name.find("preshufflev2") != std::string::npos)
@@ -74,42 +59,3 @@ inline KernelTraits extract_traits_from_name(const std::string& kernel_name)

    return traits;
 }
-
-template <typename T>
-auto shuffle_b(const ck_tile::HostTensor<T>& t,
-               ck_tile::index_t N_Warp_Tile,
-               ck_tile::index_t K_Warp_Tile)
-{
-    assert(t.get_lengths().size() == 2);
-    int n_      = t.get_lengths()[1];
-    int k_      = t.get_lengths()[0];
-    int divisor = N_Warp_Tile == 32 ? 2 : 4;
-    ck_tile::HostTensor<T> t_view(
-        {n_ / N_Warp_Tile, N_Warp_Tile, k_ / K_Warp_Tile, divisor, K_Warp_Tile / divisor});
-    std::copy(t.begin(), t.end(), t_view.begin());
-    return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
-}
-
-template <typename T>
-auto shuffle_b_permuteN(const ck_tile::HostTensor<T>& t,
-                        ck_tile::index_t N_Warp_Tile,
-                        ck_tile::index_t K_Warp_Tile,
-                        ck_tile::index_t N_Tile,
-                        ck_tile::index_t N_Warp)
-{
-    assert(t.get_lengths().size() == 2);
-
-    int n_      = t.get_lengths()[1];
-    int k_      = t.get_lengths()[0];
-    int divisor = N_Warp_Tile == 32 ? 2 : 4;
-    int NRepeat = N_Tile / N_Warp_Tile / N_Warp;
-    ck_tile::HostTensor<T> t_view({n_ / N_Tile,
-                                   N_Warp,
-                                   N_Warp_Tile,
-                                   NRepeat,
-                                   k_ / K_Warp_Tile,
-                                   divisor,
-                                   K_Warp_Tile / divisor});
-    std::copy(t.begin(), t.end(), t_view.begin());
-    return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 5, 2, 6});
-}
--- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp
+++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp
@@ -4,28 +4,26 @@
 #pragma once

 #include "ck_tile/host/device_prop.hpp"
+#include "ck_tile/host/tensor_shuffle_utils.hpp"
 #include "ck_tile/ops/gemm.hpp"
 #include "gemm/gemm_profiler.hpp"
 #include "gemm_preshuffle_benchmark.hpp"

-class GemmPreshuffleProfiler : public GemmProfiler<GemmPreshuffleProfiler,
-                          GemmProblem,
-                          ck_tile::GemmHostArgs>
+class GemmPreshuffleProfiler
+    : public GemmProfiler<GemmPreshuffleProfiler, GemmProblem, ck_tile::GemmHostArgs>
 {
-public:
-    using BaseGemm = GemmProfiler<GemmPreshuffleProfiler,
-                              GemmProblem,
-                              ck_tile::GemmHostArgs>;
+    public:
+    using BaseGemm = GemmProfiler<GemmPreshuffleProfiler, GemmProblem, ck_tile::GemmHostArgs>;
    using BaseGemm::benchmark;

    GemmPreshuffleProfiler(Setting setting)
-        : GemmProfiler<GemmPreshuffleProfiler, GemmProblem, ck_tile::GemmHostArgs>(setting) {}
-
+        : GemmProfiler<GemmPreshuffleProfiler, GemmProblem, ck_tile::GemmHostArgs>(setting)
+    {
+    }

    void benchmark(GemmProblem& gemm_problem,
                   std::vector<std::function<std::tuple<std::string, float>(
-                       ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables,
-                   KernelConfig& config) override
+                       ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables) override
    {
        const ALayout layout_a = ALayout{};
        const BLayout layout_b = BLayout{};
@@ -97,21 +95,28 @@ public:
        c_m_n_dev_buf.SetZero();
        c_m_n_dev_result.SetZero();

+        // Create a lambda that wraps the kernel launch
+        KernelConfig config{SelectedKernel::WarpTileM,
+                            SelectedKernel::WarpTileN,
+                            SelectedKernel::WarpTileK,
+                            SelectedKernel::TileM,
+                            SelectedKernel::TileN,
+                            SelectedKernel::TileK,
+                            SelectedKernel::WarpPerBlock_M,
+                            SelectedKernel::WarpPerBlock_N,
+                            SelectedKernel::WarpPerBlock_K,
+                            SelectedKernel::PermuteN};
+
        for(const auto& callable : callables)
        {
-            ck_tile::index_t N_Warp_Tile = std::get<1>(config.warp_tile_dims);
-            ck_tile::index_t K_Warp_Tile = std::get<2>(config.warp_tile_dims);
-            ck_tile::index_t N_Tile      = std::get<1>(config.tile_dims);
-            ck_tile::index_t N_Warp      = std::get<1>(config.warp_dims);
-
            ck_tile::HostTensor<BDataType> b_shuffle_host = [&]() {
                if(config.permuteN)
                {
-                    return shuffle_b_permuteN(b_k_n, N_Warp_Tile, K_Warp_Tile, N_Tile, N_Warp);
+                    return ck_tile::shuffle_b_permuteN<KernelConfig>(b_k_n, config);
                }
                else
                {
-                    return shuffle_b(b_k_n, N_Warp_Tile, K_Warp_Tile);
+                    return ck_tile::shuffle_b<KernelConfig>(b_k_n, config);
                }
            }();

@@ -144,5 +149,4 @@ public:
                gemm_problem, c_m_n_dev_buf, c_m_n_ref, c_m_n_dev_result, kernel_run_result);
        }
    }
-
 };
--- a/tile_engine/ops/gemm/gemm_profiler.hpp
+++ b/tile_engine/ops/gemm/gemm_profiler.hpp
@@ -11,17 +11,14 @@
 #include <functional>
 #include <tuple>

-
 #include "ck_tile/host/device_prop.hpp"
 #include "ck_tile/ops/gemm.hpp"
 #include "gemm_benchmark.hpp"

-template <typename Gemm,
-          typename Problem,
-          typename GemmArgs>
+template <typename Gemm, typename Problem, typename GemmArgs>
 class GemmProfiler
 {
-public:
+    public:
    static Gemm& instance(Setting setting)
    {
        static Gemm instance{setting};
@@ -30,27 +27,25 @@ public:

    // Overload for single kernel benchmarking
    void benchmark(Problem& gemm_problem,
-                   std::function<float(const GemmArgs&, const ck_tile::stream_config&)>
-                       kernel_func)
+                   std::function<float(const GemmArgs&, const ck_tile::stream_config&)> kernel_func)
    {
        // Create a vector with a single callable that returns both name and time
-        std::vector<std::function<std::tuple<std::string, float>(GemmArgs&,
-                                                                 const ck_tile::stream_config&)>>
+        std::vector<
+            std::function<std::tuple<std::string, float>(GemmArgs&, const ck_tile::stream_config&)>>
            callables;

-        callables.push_back(
-            [kernel_func](GemmArgs& args, const ck_tile::stream_config& stream) {
-                float time = kernel_func(args, stream);
-                return std::make_tuple(std::string(KERNEL_NAME), time);
-            });
+        callables.push_back([kernel_func](GemmArgs& args, const ck_tile::stream_config& stream) {
+            float time = kernel_func(args, stream);
+            return std::make_tuple(std::string(KERNEL_NAME), time);
+        });

-        benchmark(gemm_problem, callables); // TODO: need to cast this?
+        benchmark(gemm_problem, callables);
    }

    virtual void benchmark(Problem& gemm_problem,
-                   std::vector<std::function<std::tuple<std::string, float>(
-                       GemmArgs&, const ck_tile::stream_config&)>>& callables) = 0;
-   
+                           std::vector<std::function<std::tuple<std::string, float>(
+                               GemmArgs&, const ck_tile::stream_config&)>>& callables) = 0;
+
    void process_result(const Problem& gemm_problem,
                        ck_tile::DeviceMem& c_m_n_dev_buf,
                        ck_tile::HostTensor<CDataType>& c_m_n_host_result,
@@ -58,7 +53,7 @@ public:
                        const std::tuple<std::string, float>& kernel_run_result)
    {
        auto [name, avg_time] = kernel_run_result;
-	using DDataType = typename get_DsDataType<Problem>::type;
+        using DDataType       = typename get_DsDataType<Problem>::type;

        KernelInstance<Problem> kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}};

@@ -68,16 +63,14 @@ public:
                               sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ +
                               sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_;

-
-	if constexpr (!std::is_void_v<DDataType>)
-	{
-		ck_tile::static_for<0, DDataType::size(), 1>{}([&](auto i) {
-				using DType = ck_tile::remove_cvref_t<std::tuple_element_t<i, DDataType>>;
-		    		num_byte += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_;
-		    		flop     += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_;
-				});
-	}
-
+        if constexpr(!std::is_void_v<DDataType>)
+        {
+            ck_tile::static_for<0, DDataType::size(), 1>{}([&](auto i) {
+                using DType = ck_tile::remove_cvref_t<std::tuple_element_t<i, DDataType>>;
+                num_byte += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_;
+                flop += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_;
+            });
+        }

        // update
        kernel_instance.perf_result_.latency_   = avg_time;
@@ -91,15 +84,14 @@ public:

        // verify result
        c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
-	int split_k = 1;
-    	if constexpr (std::is_same_v<Problem, GemmProblem>)
-    	{
-        	split_k = gemm_problem.split_k_;
-    	}
+        int split_k = 1;
+        if constexpr(std::is_same_v<Problem, GemmProblem>)
+        {
+            split_k = gemm_problem.split_k_;
+        }
        bool verified_correct =
            !setting_.verify_ ||
-            compare<Problem>(
-                name, gemm_problem.k_, split_k, c_m_n_dev_result, c_m_n_host_result);
+            compare<Problem>(name, gemm_problem.k_, split_k, c_m_n_dev_result, c_m_n_host_result);

        if(verified_correct)
        {
@@ -196,5 +188,3 @@ public:

    std::vector<KernelInstance<Problem>> kernel_instances_;
 };
-
-
--- a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py
+++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py
@@ -4,14 +4,10 @@

 import os
 import sys
-import json
-import subprocess
 import argparse
-import csv
 import time
 import importlib.util
-from pathlib import Path
-from typing import List, Dict, Tuple, Optional
+

 def _import_gemm_benchmark():
    """Import validation utilities from commons directory."""
@@ -28,6 +24,7 @@ def _import_gemm_benchmark():

    return gemm_benchmark_module.GemmBenchmark

+
 def _import_benchmark_utils():
    """Import benchmark utilities from commons directory."""
    current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -43,16 +40,20 @@ def _import_benchmark_utils():

    return benchmark_utils

+
 GemmBenchmark = _import_gemm_benchmark()
 benchmark_utils = _import_benchmark_utils()

+
 class GemmUniversalBenchmark(GemmBenchmark):
    def __init__(self, build_dir: str, verbose: bool = False):
-        super().__init__(build_dir, verbose, name="benchmark_gemm_")
+        super().__init__(build_dir, verbose, name="benchmark_gemm_universal_")


 def main():
-    parser = argparse.ArgumentParser(description="GEMM Kernel Benchmarking Tool")
+    parser = argparse.ArgumentParser(
+        description="Universal GEMM Kernel Benchmarking Tool"
+    )
    parser.add_argument(
        "build_dir", help="Build directory containing kernel executables"
    )
@@ -67,7 +68,9 @@ def main():
    )
    parser.add_argument("--verify", action="store_true", help="Enable verification")
    parser.add_argument(
-        "--csv", default="gemm_benchmark_results.csv", help="CSV output filename"
+        "--csv",
+        default="gemm_universal_benchmark_results.csv",
+        help="CSV output filename",
    )
    parser.add_argument(
        "--best", default="best_kernels.txt", help="Best kernels output filename"
@@ -115,7 +118,7 @@ def main():
    benchmark = GemmUniversalBenchmark(args.build_dir, verbose=args.verbose)

    # Run benchmark sweep
-    print("Starting GEMM kernel benchmark sweep...")
+    print("Starting Universal GEMM kernel benchmark sweep...")
    start_time = time.time()

    best_kernels = benchmark.benchmark_sweep(
--- a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp
+++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp
@@ -11,6 +11,7 @@

 #include "ck_tile/core.hpp"
 #include "ck_tile/host.hpp"
+#include "gemm/gemm_common.hpp"
 #include "gemm_universal_profiler.hpp"

 // The kernel header is included via the compile command line with -include flag
--- a/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp
+++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp
@@ -9,21 +9,21 @@

 #include "ck_tile/host/device_prop.hpp"
 #include "ck_tile/ops/gemm.hpp"
-#include "gemm_universal_benchmark.hpp"
+#include "gemm/gemm_benchmark.hpp"
 #include "gemm/gemm_profiler.hpp"
+#include "gemm_universal_benchmark.hpp"

-class UniversalGemmProfiler : public GemmProfiler<UniversalGemmProfiler,
-                          GemmProblem,
-                          ck_tile::GemmHostArgs>
+class UniversalGemmProfiler
+    : public GemmProfiler<UniversalGemmProfiler, GemmProblem, ck_tile::GemmHostArgs>
 {
-public:
-    using BaseGemm = GemmProfiler<UniversalGemmProfiler,
-                              GemmProblem,
-                              ck_tile::GemmHostArgs>;
+    public:
+    using BaseGemm = GemmProfiler<UniversalGemmProfiler, GemmProblem, ck_tile::GemmHostArgs>;
    using BaseGemm::benchmark;

    UniversalGemmProfiler(Setting setting)
-        : GemmProfiler<UniversalGemmProfiler, GemmProblem, ck_tile::GemmHostArgs>(setting) {}
+        : GemmProfiler<UniversalGemmProfiler, GemmProblem, ck_tile::GemmHostArgs>(setting)
+    {
+    }

    void benchmark(GemmProblem& gemm_problem,
                   std::vector<std::function<std::tuple<std::string, float>(