[CK TILE ENGINE] GEMM Multi D Restructure (#3121)

* Renaming old code * Adding GEMM code with new Architecture * Partial Progress : Errors * Partial Progress : Working code * Changes to element wise function * Removing Debugging statements * Working GEMM Multi D code * Removing Stale Code * Address Copilot review comments * Address Copilot review comments * Changes to validation file * Changes to common code snippets * Creating common folder * Removing duplicate files * Pointing to right common file * Pointing to right common file * Pointing to right common file * Changing to VERBOSE * Changing CMAKE messages to verbose * Updating Cmake with right layout datatype configs * Working code for GEMM Multi D
2026-04-19 22:39:03 +00:00 · 2025-10-31 14:02:46 -05:00
parent 04efd282cf
commit a33d98f8e2
22 changed files with 2443 additions and 2002 deletions
--- a/tile_engine/ops/gemm/commons/test_benchmark.sh
+++ b/tile_engine/ops/gemm/commons/test_benchmark.sh
--- a/tile_engine/ops/gemm/commons/test_validation.py
+++ b/tile_engine/ops/gemm/commons/test_validation.py
--- a/tile_engine/ops/gemm/commons/validation_utils.py
+++ b/tile_engine/ops/gemm/commons/validation_utils.py
@@ -125,38 +125,13 @@ WARP_TILE_SUPPORTED_COMBINATIONS = {
            [32, 32, 64],
        ],
    },
-    "gfx1201": {
+    "gfx1201": {  # TODO: gfx1201 is targeted by the GEMM build but not by GEMM Multi D; decide how this table should handle it
        "fp16_fp16_fp16": [
            [16, 16, 16],
        ],
    },
 }

-# Supported warp tile combinations for different GPU architectures and data types
-WARP_SUPPORTED_COMBINATIONS = {
-    "gfx90a": [
-        [1, 4, 1],
-        [2, 2, 1],
-        [4, 1, 1],
-    ],
-    "gfx942": [
-        [1, 4, 1],
-        [2, 2, 1],
-        [4, 1, 1],
-    ],
-    "gfx950": [
-        [1, 4, 1],
-        [2, 2, 1],
-        [4, 1, 1],
-    ],
-    "gfx1201": [
-        [2, 4, 1],
-        [1, 8, 1],
-        [8, 1, 1],
-        [4, 2, 1],
-    ],
-}
-
 # Unsupported trait combinations
 TRAIT_UNSUPPORTED_COMBINATIONS = {
    ("compv3", "cshuffle", "interwave"),
@@ -441,6 +416,20 @@ def get_abc_layouts(layout_code: str) -> Tuple[str, str, str]:
    return a_layout, b_layout, c_layout


+def get_abcd_layouts(layout_code: str) -> Tuple[str, str, str, List[str]]:
+    """
+    Return (ALayout, BLayout, CLayout, [D0Layout, D1Layout]) from a 4-letter code like 'rcrr', 'ccrr', 'crrr', 'rrrr'.
+    """
+    code = str(layout_code).strip().lower()
+
+    a_layout = LAYOUT_MAP[code[0]]
+    b_layout = LAYOUT_MAP[code[1]]
+    c_layout = LAYOUT_MAP[code[2]]
+    d0_layout = LAYOUT_MAP[code[3]]  # both D tensors take the layout named by the 4th letter
+    d1_layout = LAYOUT_MAP[code[3]]
+    return a_layout, b_layout, c_layout, [d0_layout, d1_layout]
+
+
 def validate_whole_wg_cover_configuration(
    tile_m,
    tile_n,
@@ -464,13 +453,13 @@ def validate_whole_wg_cover_configuration(

    # A matrix validation
    if layout[0] == "r":
-        XPerTile = tile_k
-        YPerTile = tile_m
-
        vector_load_size = get_global_vector_load_size(
            BlockSize, tile_k, a_datatype, tile_m, tile_k
        )

+        XPerTile = tile_k
+        YPerTile = tile_m
+
    elif layout[0] == "c":
        vector_load_size = get_global_vector_load_size(
            BlockSize, tile_k, a_datatype, tile_m, tile_m
@@ -485,7 +474,6 @@ def validate_whole_wg_cover_configuration(
        )

        if not wg_cover_core_valid:
-            print("I am here 1")
            logging.debug(
                f"whole workgroup cover failed for Matrix A distribution: {wg_cover_core_error}"
            )
@@ -521,7 +509,6 @@ def validate_whole_wg_cover_configuration(
        if not wg_cover_core_valid:
-            print("I am here 3")
            logging.debug(
-                f"whole workgroup cover failed for Matrix A distribution: {wg_cover_core_error}"
+                f"whole workgroup cover failed for Matrix B distribution: {wg_cover_core_error}"
            )
            return False, wg_cover_core_error

@@ -540,7 +527,6 @@ def validate_whole_wg_cover_configuration(
        XPerTile, YPerTile, BlockSize, vector_load_size, warp_size
    )
    if not wg_cover_core_valid:
-        print("I am here 4")
        logging.debug(
            f"whole workgroup cover failed for Matrix B: {wg_cover_core_error}"
        )
@@ -557,7 +543,7 @@ def wg_cover_core_validation(
    warp_size: int,
 ) -> Tuple[bool, str]:
    if XPerTile % vector_load_size != 0:
-        return False
+        return False, "XPerTile is not divisible by vector_load_size"

    num_warps = BlockSize / warp_size
    LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size)
@@ -567,7 +553,7 @@ def wg_cover_core_validation(
    Y1 = warp_size // X0

    if X0 * Y1 != warp_size:
-        return False, ""
+        return False, "X0 * Y1 != warp_size"

    return True, ""

@@ -583,9 +569,9 @@ def get_global_vector_load_size(
    PackedSize = 1

    if (
-        XPerTile % (PackedSize * 32 / element_size(DataType)) == 0
+        PackedSize == 2
+        and XPerTile % (PackedSize * 32 / element_size(DataType)) == 0
        and elements_per_thread % (PackedSize * 32 / element_size(DataType)) == 0
-        and PackedSize == 2
    ):
        return PackedSize * 32 / element_size(DataType)
    elif (
--- a/tile_engine/ops/gemm/CMakeLists.txt
+++ b/tile_engine/ops/gemm/CMakeLists.txt
@@ -122,15 +122,15 @@ function(build_individual_gemm_targets datatype layout)
    if(DEFINED ENV{GEMM_CONFIG_FILE} AND NOT "$ENV{GEMM_CONFIG_FILE}" STREQUAL "")
        set(config_filename "$ENV{GEMM_CONFIG_FILE}")
        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${config_filename}")
-        message(STATUS "  Using config from environment variable: ${config_filename}")
+        message(VERBOSE "  Using config from environment variable: ${config_filename}")
    elseif(NOT "${GEMM_CONFIG_FILE}" STREQUAL "")
        # Use CMake variable if set
        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${GEMM_CONFIG_FILE}")
-        message(STATUS "  Using custom config: ${GEMM_CONFIG_FILE}")
+        message(VERBOSE "  Using custom config: ${GEMM_CONFIG_FILE}")
    else()
        # Use default config for all layouts
        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json")
-        message(STATUS "  Using default config for layout ${layout}")
+        message(VERBOSE "  Using default config for layout ${layout}")
    endif()

    # Check if config file exists
@@ -151,16 +151,16 @@ function(build_individual_gemm_targets datatype layout)
    endif()

    # Generate individual kernel files using parallel version
-    message(STATUS "Generating individual kernels for ${datatype} ${layout} using ${num_workers} workers...")
-    message(STATUS "  Working path: ${working_path}")
-    message(STATUS "  Config file: ${json_blob}")
-    message(STATUS "  Python executable: ${Python3_EXECUTABLE}")
-    message(STATUS "  Script path: ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py")
+    message(VERBOSE "Generating individual kernels for ${datatype} ${layout} using ${num_workers} workers...")
+    message(VERBOSE "  Working path: ${working_path}")
+    message(VERBOSE "  Config file: ${json_blob}")
+    message(VERBOSE "  Python executable: ${Python3_EXECUTABLE}")
+    message(VERBOSE "  Script path: ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py")

    # Create working directory first
    file(MAKE_DIRECTORY ${working_path})

-    message(STATUS "COMMAND: ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
+    message(VERBOSE "COMMAND: ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
                --working_path ${working_path}
                --datatype ${datatype}
                --layout ${layout}
@@ -169,7 +169,7 @@ function(build_individual_gemm_targets datatype layout)
                --list_kernels ")

    # First, just list the kernels (fast operation)
-    message(STATUS "  Listing kernel configurations...")
+    message(VERBOSE "  Listing kernel configurations...")
    execute_process(
        COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
                --working_path ${working_path}
@@ -192,7 +192,7 @@ function(build_individual_gemm_targets datatype layout)
    if(EXISTS ${working_path}/gemm_kernel_count.txt)
        file(READ ${working_path}/gemm_kernel_count.txt kernel_count)
        string(STRIP "${kernel_count}" kernel_count)
-        message(STATUS "  Found ${kernel_count} kernel configurations")
+        message(VERBOSE "  Found ${kernel_count} kernel configurations")
    else()
        message(FATAL_ERROR "Kernel count file not found")
    endif()
@@ -216,10 +216,10 @@ function(build_individual_gemm_targets datatype layout)
 endfunction()

 # Main build logic - Only individual builds supported
-message(STATUS "=== Starting Tile Engine GEMM Configuration ===")
-message(STATUS "GEMM_DATATYPE: ${GEMM_DATATYPE}")
-message(STATUS "GEMM_LAYOUT: ${GEMM_LAYOUT}")
-message(STATUS "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
+message(VERBOSE "=== Starting Tile Engine GEMM Configuration ===")
+message(VERBOSE "GEMM_DATATYPE: ${GEMM_DATATYPE}")
+message(VERBOSE "GEMM_LAYOUT: ${GEMM_LAYOUT}")
+message(VERBOSE "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")

 # Filter GPU targets to only gfx90a, gfx942, gfx950, gfx1201
 set(GEMM_GPU_TARGETS_INDIVIDUAL "")
@@ -228,7 +228,7 @@ set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx1201")
 foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
    if(target IN_LIST DESIRED_TARGETS)
        list(APPEND GEMM_GPU_TARGETS_INDIVIDUAL ${target})
-        message(STATUS "  Adding GPU target: ${target}")
+        message(VERBOSE "  Adding GPU target: ${target}")
    endif()
 endforeach()

@@ -236,7 +236,7 @@ endforeach()
 if(NOT GEMM_GPU_TARGETS_INDIVIDUAL)
    message(WARNING "Skipping Tile Engine GEMM build: No supported GPU targets (gfx90a, gfx942, gfx950, gfx1201) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
 else()
-    message(STATUS "Building individual GEMM targets for GPU targets: ${GEMM_GPU_TARGETS_INDIVIDUAL}")
+    message(VERBOSE "Building individual GEMM targets for GPU targets: ${GEMM_GPU_TARGETS_INDIVIDUAL}")

    # Enable parallel compilation optimizations
    # Set up job pools for better parallel compilation control
@@ -251,12 +251,12 @@ else()
        find_program(CCACHE_PROGRAM ccache)
        if(CCACHE_PROGRAM)
            set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM})
-            message(STATUS "Using ccache for faster compilation")
+            message(VERBOSE "Using ccache for faster compilation")
        else()
            message(WARNING "ccache requested but not found")
        endif()
    else()
-        message(STATUS "ccache disabled for GEMM ops (use -DENABLE_CCACHE_GEMM=ON to enable)")
+        message(VERBOSE "ccache disabled for GEMM ops (use -DENABLE_CCACHE_GEMM=ON to enable)")
    endif()

    # Create master collection targets
--- a/tile_engine/ops/gemm/gemm_instance_builder.py
+++ b/tile_engine/ops/gemm/gemm_instance_builder.py
@@ -8,12 +8,30 @@ import multiprocessing
 import concurrent.futures
 from pathlib import Path
 import logging
-from commons.validation_utils import (
-    is_tile_config_valid,
-    is_trait_combination_valid,
-    get_dtype_string,
-    get_abc_layouts,
-)
+import importlib.util
+
+
+def _import_validation_utils():
+    """Load ../commons/validation_utils.py (relative to this file's directory) and return the module."""
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    parent_dir = os.path.dirname(current_dir)
+
+    # Load the module from its explicit file path rather than via a package import
+    spec = importlib.util.spec_from_file_location(
+        "validation_utils", os.path.join(parent_dir, "commons", "validation_utils.py")
+    )
+    validation_utils = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(validation_utils)
+
+    return validation_utils
+
+
+# Import validation functions
+_validation_utils = _import_validation_utils()
+is_tile_config_valid = _validation_utils.is_tile_config_valid
+is_trait_combination_valid = _validation_utils.is_trait_combination_valid
+get_dtype_string = _validation_utils.get_dtype_string
+get_abc_layouts = _validation_utils.get_abc_layouts

 logging.basicConfig(level=logging.INFO)

@@ -563,6 +581,8 @@ struct SelectedKernel {{
        tile_configs = self._get_tile_configs()
        trait_combos = self._generate_trait_combinations()
        k_block_per_cu = self.config.get("k_block_per_cu")
+        if k_block_per_cu is None:
+            k_block_per_cu = 1

        # Prepare work items for parallel processing
        work_items = []
@@ -574,11 +594,12 @@ struct SelectedKernel {{
                        trait_combo,
                        k_block_per_cu,
                        self.working_path,
+                        self.gpu_target,
                        self.datatype,
                        self.layout,
+                        self.config_json,
                    )
                )
-
        print(
            f"Generating {len(work_items)} individual kernel files using {num_workers} workers..."
        )
@@ -615,7 +636,6 @@ struct SelectedKernel {{
                    print(
                        f"  Progress: {completed}/{len(work_items)} kernels generated"
                    )
-
                try:
                    result = future.result()
                    if result:
@@ -662,10 +682,19 @@ struct SelectedKernel {{

 def _generate_single_kernel_individual(work_item):
    """Worker function to generate a single individual kernel file"""
-    tile_config, trait_combo, k_block_per_cu, working_path, datatype, layout = work_item
+    (
+        tile_config,
+        trait_combo,
+        k_block_per_cu,
+        working_path,
+        gpu_target,
+        datatype,
+        layout,
+        config_json,
+    ) = work_item

    # Create a temporary builder instance for this worker
-    builder = GemmKernelBuilder(working_path, datatype, layout)
+    builder = GemmKernelBuilder(working_path, gpu_target, datatype, layout, config_json)

    try:
        kernel_name, instance_code = builder._generate_kernel_instance(
@@ -798,6 +827,8 @@ def main():
        )

        k_block_per_cu = builder.config.get("k_block_per_cu")
+        if k_block_per_cu is None:
+            k_block_per_cu = 1

        # Generate the kernel
        kernel_name, instance_code = builder._generate_kernel_instance(
--- a/tile_engine/ops/gemm_multi_d/CMakeLists.txt
+++ b/tile_engine/ops/gemm_multi_d/CMakeLists.txt
@@ -1,175 +1,311 @@
-
 set(GEMM_MULTI_D_DATATYPE "fp16" CACHE STRING "List of datatypes for GEMM Multi D (semicolon-separated)")
-set(GEMM_MULTI_D_LAYOUT "rcrr" CACHE STRING "List of layout for GEMM Multi D(semicolon-separated)")
+set(GEMM_MULTI_D_LAYOUT "rcrr;rrrr;crrr;ccrr" CACHE STRING "List of layout for GEMM Multi D (semicolon-separated)")
+set(GEMM_MULTI_D_CONFIG_FILE "" CACHE STRING "Custom config file name (without path, must be in configs/ folder)")
 set(GEMM_MULTI_D_ELEMENTWISE_FUNCTION "mul"  CACHE STRING "Elementwise function")

-function(build_gemm_multi_d_for_datatype_layout datatype layout)
-    # Filter GPU targets to only gfx90a, gfx942, and gfx950
-    set(GEMM_GPU_TARGETS "")
-    set(DESIRED_TARGETS "gfx90a;gfx942;gfx950")
-    
-    foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
-        if(target IN_LIST DESIRED_TARGETS)
-            list(APPEND GEMM_GPU_TARGETS ${target})
-        endif()
-    endforeach()
-    
-    # Skip compilation if no matching targets found
-    if(NOT GEMM_GPU_TARGETS)
-        message(WARNING "Skipping Tile Engine GEMM Multi D compilation: No supported GPU targets (gfx90a, gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
+option(ENABLE_CCACHE_GEMM_MULTI_D "Enable ccache for GEMM Multi D ops compilation" OFF)
+
+# Store the directory path for use in functions
+set(GEMM_MULTI_D_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR})
+
+# Function to create individual GEMM Multi D targets
+function(create_individual_gemm_multi_d_target datatype layout trait tile_config config_json)
+    # Use the parent scope GEMM_MULTI_D_GPU_TARGETS_INDIVIDUAL variable
+    if(NOT GEMM_MULTI_D_GPU_TARGETS_INDIVIDUAL)
+        message(WARNING "Skipping individual GEMM Multi D target ${datatype}_${layout}_${trait}_${tile_config}: No supported GPU targets")
        return()
    endif()
-    
-    message(STATUS "Building GEMM Multi D for GPU targets: ${GEMM_GPU_TARGETS}")
-    
+
+    # Parse tile configuration: format is <tile_m>x<tile_n>x<tile_k>_<warp_m>x<warp_n>x<warp_k>_<warp_tile_m>x<warp_tile_n>x<warp_tile_k>
+    # First split by underscore to get three groups
+    string(REPLACE "_" ";" config_groups ${tile_config})
+    list(GET config_groups 0 tile_dims)      # e.g., 256x256x32
+    list(GET config_groups 1 warp_dims)      # e.g., 4x1x1
+    list(GET config_groups 2 warp_tile_dims) # e.g., 16x16x16
+
+    # Parse tile dimensions
+    string(REPLACE "x" ";" tile_parts ${tile_dims})
+    list(GET tile_parts 0 tile_m)
+    list(GET tile_parts 1 tile_n)
+    list(GET tile_parts 2 tile_k)
+
+    # Parse warp dimensions
+    string(REPLACE "x" ";" warp_parts ${warp_dims})
+    list(GET warp_parts 0 warp_m)
+    list(GET warp_parts 1 warp_n)
+    list(GET warp_parts 2 warp_k)
+
+    # Parse warp tile dimensions
+    string(REPLACE "x" ";" warp_tile_parts ${warp_tile_dims})
+    list(GET warp_tile_parts 0 warp_tile_m)
+    list(GET warp_tile_parts 1 warp_tile_n)
+    list(GET warp_tile_parts 2 warp_tile_k)
+
+    set(target_name "benchmark_gemm_multi_d_${datatype}_${layout}_${trait}_${tile_config}")
    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}")

-    # Comment this if-else block when using user_provided_config
-    if(layout STREQUAL "rcrr")
-        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json")
+    # Generate the single instance header for this kernel
+    set(instance_header "${working_path}/gemm_multi_d_single_${datatype}_${layout}_${trait}_${tile_config}.hpp")
+
+    # Add custom command to generate the header file at build time
+    add_custom_command(
+        OUTPUT ${instance_header}
+        COMMAND ${Python3_EXECUTABLE} ${GEMM_MULTI_D_SOURCE_DIR}/gemm_multi_d_instance_builder.py
+                --working_path ${working_path}
+                --datatype ${datatype}
+                --layout ${layout}
+                --elementwise_function ${GEMM_MULTI_D_ELEMENTWISE_FUNCTION}
+                --config_json ${config_json}
+                --gen_single
+                --kernel_name "gemm_multi_d_${datatype}_${layout}_${trait}_${tile_config}"
+                --tile_config "${tile_config}"
+                --trait_combo "${trait}"
+                --gpu_target "${GEMM_MULTI_D_GPU_TARGETS_INDIVIDUAL}"
+        DEPENDS ${GEMM_MULTI_D_SOURCE_DIR}/gemm_multi_d_instance_builder.py ${config_json}
+        COMMENT "Generating ${instance_header}"
+    )
+
+    # Create the executable
+    add_executable(${target_name}
+        EXCLUDE_FROM_ALL
+        ${GEMM_MULTI_D_SOURCE_DIR}/gemm_multi_d_benchmark_single.cpp
+        ${instance_header}
+    )
+
+    # Set GPU architectures
+    set_property(TARGET ${target_name} PROPERTY HIP_ARCHITECTURES ${GEMM_MULTI_D_GPU_TARGETS_INDIVIDUAL})
+
+    # Set compile definitions
+    target_compile_definitions(${target_name} PRIVATE
+        GEMM_MULTI_D_SINGLE_INSTANCE_HPP="${instance_header}"
+    )
+
+    # Include directories
+    target_include_directories(${target_name} PRIVATE
+        ${GEMM_MULTI_D_SOURCE_DIR}
+        ${working_path}
+    )
+
+    # Compile options
+    target_compile_options(${target_name} PRIVATE
+        -Wno-undefined-func-template
+        -Wno-float-equal
+        --offload-compress
+        -include ${instance_header}
+    )
+
+    # Add to collection targets
+    add_dependencies(benchmark_gemm_multi_d_all ${target_name})
+    add_dependencies(benchmark_gemm_multi_d_${datatype} ${target_name})
+    add_dependencies(benchmark_gemm_multi_d_${layout} ${target_name})
+    add_dependencies(benchmark_gemm_multi_d_${datatype}_${layout} ${target_name})
+
+    # Add to trait-specific targets
+    string(REPLACE "_" ";" trait_parts ${trait})
+    list(GET trait_parts 0 pipeline)
+    list(GET trait_parts 1 epilogue)
+    list(GET trait_parts 2 scheduler)
+
+    add_dependencies(benchmark_gemm_multi_d_${pipeline}_pipeline ${target_name})
+    add_dependencies(benchmark_gemm_multi_d_${epilogue}_epilogue ${target_name})
+    add_dependencies(benchmark_gemm_multi_d_${scheduler}_scheduler ${target_name})
+endfunction()
+
+# Function to build individual GEMM Multi D targets
+function(build_individual_gemm_multi_d_targets datatype layout)
+    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}")
+
+    # Choose config file
+    # Priority order:
+    # 1. Environment variable GEMM_MULTI_D_CONFIG_FILE
+    # 2. CMake variable GEMM_MULTI_D_CONFIG_FILE
+    # 3. Default config file (configs/default_config.json, the same for every layout)
+
+    # Check environment variable first
+    if(DEFINED ENV{GEMM_MULTI_D_CONFIG_FILE} AND NOT "$ENV{GEMM_MULTI_D_CONFIG_FILE}" STREQUAL "")
+        set(config_filename "$ENV{GEMM_MULTI_D_CONFIG_FILE}")
+        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${config_filename}")
+        message(VERBOSE "  Using config from environment variable: ${config_filename}")
+    elseif(NOT "${GEMM_MULTI_D_CONFIG_FILE}" STREQUAL "")
+        # Use CMake variable if set
+        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${GEMM_MULTI_D_CONFIG_FILE}")
+        message(VERBOSE "  Using custom config: ${GEMM_MULTI_D_CONFIG_FILE}")
    else()
-        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/custom_ci_config.json")
+        # Use default config for all layouts
+        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json")
+        message(VERBOSE "  Using default config for layout ${layout}")
    endif()

-    # uncomment this if you want to use user_provided_config.json
-    # set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/user_provided_config.json")
-    
-    # Generate kernel list
-    execute_process(
-        COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_multi_d_instance_builder.py
+    # Check if config file exists
+    if(NOT EXISTS ${json_blob})
+        message(FATAL_ERROR "Config file not found: ${json_blob}")
+    endif()
+
+    # Determine number of workers for parallel generation
+    if(DEFINED ENV{CMAKE_BUILD_PARALLEL_LEVEL})
+        set(num_workers $ENV{CMAKE_BUILD_PARALLEL_LEVEL})
+    else()
+        # Use processor count but limit to avoid memory issues
+        cmake_host_system_information(RESULT num_cores QUERY NUMBER_OF_LOGICAL_CORES)
+        math(EXPR num_workers "${num_cores}")
+        if(num_workers GREATER 8)
+            set(num_workers 8)
+        endif()
+    endif()
+
+    # Log the kernel-listing inputs (num_workers is only reported here; it is not passed to the builder script)
+    message(VERBOSE "Generating individual kernels for ${datatype} ${layout} using ${num_workers} workers...")
+    message(VERBOSE "  Working path: ${working_path}")
+    message(VERBOSE "  Config file: ${json_blob}")
+    message(VERBOSE "  Python executable: ${Python3_EXECUTABLE}")
+    message(VERBOSE "  Script path: ${CMAKE_CURRENT_LIST_DIR}/gemm_multi_d_instance_builder.py")
+
+    # Create working directory first
+    file(MAKE_DIRECTORY ${working_path})
+
+    message(VERBOSE "COMMAND: ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_multi_d_instance_builder.py
                --working_path ${working_path}
                --datatype ${datatype}
                --layout ${layout}
                --elementwise_function ${GEMM_MULTI_D_ELEMENTWISE_FUNCTION}
                --config_json ${json_blob}
-                --list_blobs
-                --gpu_target ${GEMM_GPU_TARGETS}
-        RESULT_VARIABLE ret
-    )
-    if(NOT ret EQUAL 0)
-        message(FATAL_ERROR "Failed to list kernels for ${datatype} ${layout}: ${ret}")
-    endif()
+                --gpu_target ${GEMM_MULTI_D_GPU_TARGETS_INDIVIDUAL}
+                --list_kernels ")

-    file(STRINGS "${working_path}/gemm_multi_d_instance_blobs.txt" codegen_blobs)
-    file(STRINGS "${working_path}/gemm_multi_d_instance_blobs_range.txt" codegen_blobs_range)
-    
-    # Generate the blobs
-    add_custom_command(
-        OUTPUT ${codegen_blobs}
-        COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_multi_d_instance_builder.py
-                --working_path "${working_path}"
+    # First, just list the kernels (fast operation)
+    message(VERBOSE "  Listing kernel configurations...")
+    execute_process(
+        COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_multi_d_instance_builder.py
+                --working_path ${working_path}
                --datatype ${datatype}
                --layout ${layout}
                --elementwise_function ${GEMM_MULTI_D_ELEMENTWISE_FUNCTION}
-                --config_json "${json_blob}"
-                --gen_blobs
-                --gpu_target ${GEMM_GPU_TARGETS}
-        COMMENT "Generating GEMM Multi D instance sources for ${datatype} ${layout}"
+                --config_json ${json_blob}
+                --gpu_target ${GEMM_MULTI_D_GPU_TARGETS_INDIVIDUAL}
+                --list_kernels  
+        WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
+        RESULT_VARIABLE ret
+        OUTPUT_VARIABLE list_output
+        ERROR_VARIABLE list_error
    )
-    add_custom_target(gemm_multi_d_gen_${datatype}_${layout} DEPENDS ${codegen_blobs})

-    set(intermediate_libs)
-    list(LENGTH codegen_blobs codegen_blobs_len)
+    if(NOT ret EQUAL 0)
+        message(FATAL_ERROR "Failed to list kernels for ${datatype} ${layout}: ${list_error}")
+    endif()

-    foreach(blob IN LISTS codegen_blobs_range)
-        string(STRIP "${blob}" stripped_blob)
-        separate_arguments(spilit_blob UNIX_COMMAND "${stripped_blob}")
-        # Each line is: <trait_name> <first_index_inclusive> <last_index_exclusive>   
-        list(GET spilit_blob 0 name)
-        list(GET spilit_blob 1 first)
-        list(GET spilit_blob 2 last)
-        math(EXPR total_files "${last} - ${first}")
-        if(total_files EQUAL 0)
-            continue()        # nothing for this trait
-        endif()
+    # Read kernel count
+    if(EXISTS ${working_path}/gemm_multi_d_kernel_count.txt)
+        file(READ ${working_path}/gemm_multi_d_kernel_count.txt kernel_count)
+        string(STRIP "${kernel_count}" kernel_count)
+        message(VERBOSE "  Found ${kernel_count} kernel configurations")
+    else()
+        message(FATAL_ERROR "Kernel count file not found")
+    endif()

-        # Object libraries (chunked) per trait
-        set(sub_intermediate_libs)
-        set(chunk_size 3)
-        math(EXPR num_chunks "( ${total_files} + ${chunk_size} - 1 ) / ${chunk_size}")
-        math(EXPR num_chunks_minus_1 "${num_chunks} - 1")
-        
-        foreach(i RANGE 0 ${num_chunks_minus_1})
-            math(EXPR start "${first} + ${i} * ${chunk_size} ")
-            math(EXPR end "${start} + ${chunk_size} - 1")
-
-            set(chunk_files)
-            foreach(j RANGE ${start} ${end})
-                if(j LESS ${last} AND j LESS ${codegen_blobs_len})
-                    list(GET codegen_blobs ${j} f)
-                    list(APPEND chunk_files "${f}")
-                endif()
-            endforeach()
-
-            #list(LENGTH chunk_files chunk_files_len)
-            #if(chunk_files_len AND chunk_files_len GREATER 1)
-            if(chunk_files)
-                set(sub_intermediate_lib_name "gemm_multi_d_objlib_${name}_${i}_${datatype}_${layout}")
-                add_library(${sub_intermediate_lib_name} OBJECT ${chunk_files})
-                set_property(TARGET ${sub_intermediate_lib_name} PROPERTY HIP_ARCHITECTURES ${GEMM_GPU_TARGETS})
-                list(APPEND sub_intermediate_libs ${sub_intermediate_lib_name})
-            endif()
+    # Read kernel list and create targets
+    if(EXISTS ${working_path}/gemm_multi_d_kernel_list.txt)
+        file(STRINGS ${working_path}/gemm_multi_d_kernel_list.txt kernel_lines)
+        foreach(line IN LISTS kernel_lines)
+            # Parse line: kernel_name|tile_config|trait_combo
+            string(REPLACE "|" ";" parts "${line}")
+            list(GET parts 0 kernel_name)
+            list(GET parts 1 tile_config)
+            list(GET parts 2 trait_combo)

+            # Create individual target
+            create_individual_gemm_multi_d_target("${datatype}" "${layout}" "${trait_combo}" "${tile_config}" "${json_blob}")
        endforeach()
-
-        # ------------------ Bundle the object libs into one static lib ---------
-        #list(LENGTH sub_intermediate_libs sub_intermediate_libs_len)
-        #if(sub_intermediate_libs AND sub_intermediate_libs_len GREATER 1)
-        if(sub_intermediate_libs)
-            set(intermediate_lib_name "gemm_multi_d_staticlib_${name}_${datatype}_${layout}")
-            # Collect the $<TARGET_OBJECTS:...> expressions
-            
-            set(obj_exprs)
-            foreach(objlib IN LISTS sub_intermediate_libs)
-                list(APPEND obj_exprs $<TARGET_OBJECTS:${objlib}>)
-            endforeach()
-            
-            add_library(${intermediate_lib_name} STATIC ${obj_exprs})
-            add_dependencies(${intermediate_lib_name} gemm_multi_d_gen_${datatype}_${layout})
-            set_property(TARGET ${intermediate_lib_name} PROPERTY HIP_ARCHITECTURES ${GEMM_GPU_TARGETS})
-            #foreach(objlib IN LISTS sub_intermediate_libs)
-            #    target_sources(${intermediate_lib_name} PRIVATE $<TARGET_OBJECTS:${objlib}>)
-            #endforeach()
-            list(APPEND intermediate_libs ${intermediate_lib_name})
-        endif()
-
-    endforeach()
-    
-    # Interface library for instances
-    add_library(gemm_multi_d_template_instances_${datatype}_${layout} INTERFACE)
-    add_dependencies(gemm_multi_d_template_instances_${datatype}_${layout} gemm_multi_d_gen_${datatype}_${layout})
-    target_link_libraries(gemm_multi_d_template_instances_${datatype}_${layout} INTERFACE ${intermediate_libs})
-    target_include_directories(gemm_multi_d_template_instances_${datatype}_${layout} INTERFACE
-        ${CMAKE_CURRENT_LIST_DIR}
-        "${working_path}"
-    )
-    set_target_properties(gemm_multi_d_template_instances_${datatype}_${layout} PROPERTIES LINKER_LANGUAGE CXX)
-    
-    # Host API interface library
-    add_library(gemm_multi_d_host_api_${datatype}_${layout} INTERFACE)
-    target_link_libraries(gemm_multi_d_host_api_${datatype}_${layout} INTERFACE gemm_multi_d_template_instances_${datatype}_${layout})
-    target_include_directories(gemm_multi_d_host_api_${datatype}_${layout} INTERFACE
-        ${CMAKE_CURRENT_LIST_DIR}
-        "${working_path}"
-    )
-
-    
-
-    # Executable per datatype
-    set(exec_name "benchmark_gemm_multi_d_${datatype}_${layout}")
-    add_executable(${exec_name} benchmark_gemm_multi_d.cpp)
-    set_property(TARGET ${exec_name} PROPERTY HIP_ARCHITECTURES ${GEMM_GPU_TARGETS})
-    target_link_libraries(${exec_name} PRIVATE gemm_multi_d_host_api_${datatype}_${layout})
-    target_compile_options(${exec_name} PRIVATE
-        -Wno-undefined-func-template
-        -Wno-float-equal
-        --offload-compress
-    )
+    else()
+        message(FATAL_ERROR "Kernel list file not found")
+    endif()
 endfunction()

-# Process each datatype in isolation
-foreach(dt IN LISTS GEMM_MULTI_D_DATATYPE)
-    foreach(l IN LISTS GEMM_MULTI_D_LAYOUT)
-        build_gemm_multi_d_for_datatype_layout(${dt} ${l})
-    endforeach()
+# Main build logic - Only individual builds supported
+message(VERBOSE "=== Starting Tile Engine GEMM Multi D Configuration ===")
+message(VERBOSE "GEMM_MULTI_D_DATATYPE: ${GEMM_MULTI_D_DATATYPE}")
+message(VERBOSE "GEMM_MULTI_D_LAYOUT: ${GEMM_MULTI_D_LAYOUT}")
+message(VERBOSE "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
+
+# Filter GPU targets to only gfx90a, gfx942, gfx950
+set(GEMM_MULTI_D_GPU_TARGETS_INDIVIDUAL "")
+set(DESIRED_TARGETS "gfx90a;gfx942;gfx950")
+
+foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
+    if(target IN_LIST DESIRED_TARGETS)
+        list(APPEND GEMM_MULTI_D_GPU_TARGETS_INDIVIDUAL ${target})
+        message(VERBOSE "  Adding GPU target: ${target}")
+    endif()
 endforeach()
+
+# Skip build if no matching targets found
+if(NOT GEMM_MULTI_D_GPU_TARGETS_INDIVIDUAL)
+    message(WARNING "Skipping Tile Engine GEMM Multi D build: No supported GPU targets (gfx90a, gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
+else()
+    message(VERBOSE "Building individual GEMM Multi D targets for GPU targets: ${GEMM_MULTI_D_GPU_TARGETS_INDIVIDUAL}")
+
+    # Enable parallel compilation optimizations
+    # Set up job pools for better parallel compilation control
+    set_property(GLOBAL PROPERTY JOB_POOLS
+        compile_heavy=4    # Limit heavy compilations to prevent OOM
+        compile_normal=16  # Allow more parallel normal compilations
+    )
+
+    # Enable compiler cache if available and explicitly requested
+    # Disabled by default due to permission issues in CI environments
+    if(ENABLE_CCACHE_GEMM_MULTI_D)
+        find_program(CCACHE_PROGRAM ccache)
+        if(CCACHE_PROGRAM)
+            set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM})
+            message(VERBOSE "Using ccache for faster compilation")
+        else()
+            message(WARNING "ccache requested but not found")
+        endif()
+    else()
+        message(VERBOSE "ccache disabled for GEMM Multi D ops (use -DENABLE_CCACHE_GEMM_MULTI_D=ON to enable)")
+    endif()
+
+    # Create master collection targets
+    add_custom_target(benchmark_gemm_multi_d_all)
+
+    # Create datatype collection targets
+    foreach(dt IN LISTS GEMM_MULTI_D_DATATYPE)
+        add_custom_target(benchmark_gemm_multi_d_${dt})
+    endforeach()
+
+    # Create layout collection targets
+    foreach(l IN LISTS GEMM_MULTI_D_LAYOUT)
+        add_custom_target(benchmark_gemm_multi_d_${l})
+    endforeach()
+
+    # Create combined collection targets
+    foreach(dt IN LISTS GEMM_MULTI_D_DATATYPE)
+        foreach(l IN LISTS GEMM_MULTI_D_LAYOUT)
+            add_custom_target(benchmark_gemm_multi_d_${dt}_${l})
+        endforeach()
+    endforeach()
+
+    # Create trait-based collection targets
+    # These are common trait components used across all GEMM Multi D kernels
+    set(GEMM_MULTI_D_PIPELINES "mem;compv3;compv4")
+    set(GEMM_MULTI_D_EPILOGUES "default;cshuffle")
+    set(GEMM_MULTI_D_SCHEDULERS "intrawave;interwave")
+
+    foreach(pipeline IN LISTS GEMM_MULTI_D_PIPELINES)
+        add_custom_target(benchmark_gemm_multi_d_${pipeline}_pipeline)
+    endforeach()
+
+    foreach(epilogue IN LISTS GEMM_MULTI_D_EPILOGUES)
+        add_custom_target(benchmark_gemm_multi_d_${epilogue}_epilogue)
+    endforeach()
+
+    foreach(scheduler IN LISTS GEMM_MULTI_D_SCHEDULERS)
+        add_custom_target(benchmark_gemm_multi_d_${scheduler}_scheduler)
+    endforeach()
+
+    # Build individual targets for each datatype/layout combination
+    foreach(dt IN LISTS GEMM_MULTI_D_DATATYPE)
+        foreach(l IN LISTS GEMM_MULTI_D_LAYOUT)
+            build_individual_gemm_multi_d_targets(${dt} ${l})
+        endforeach()
+    endforeach()
+endif()
--- a/tile_engine/ops/gemm_multi_d/README.md
+++ b/tile_engine/ops/gemm_multi_d/README.md
@@ -1,110 +0,0 @@
-
-CK Tile Engine for GEMM Multi D is used to generate and run GEMM kernels with different combinations of BlockTile sizes, WarpTile sizes, WarpTile mapping for all valid pipelines, schedulers and epilogues while able to give custom datatype and Layout selections
-
-# Kernel Configurations
-
-# User Specific
-Users can specify custom kernel configurations such as tile size, warp size, padding, pipeline, scheduler, and epilogue in the config file. This allows building only for selected configurations, significantly reducing build time.
-For reference please see `./configs/user_provided_config.json`.
-
-# Default
-The Tile engine also has a default kernel configuration for providing range of configuration parameter values, which helps users who lack kernel development experience to benchmark. For reference please see in `./configs/default_config.json`
-
-If user does not provide kernel configuration, the tile engine uses default kernel configuration to generate kernel instances and benchmark. 
-
-## Build Instructions
-``` bash
-# in the root of composable kernel create build directory
-mkdir build && cd build
-# build composable kernel
-# replace [Arch] with the appropriate architecture or leave blank and 
-# replace [Datatype] in comma separated datatypes string (possible datatypes are [fp16])
-# replace [Layout1;Layout2;...] in comma separated datatypes string (possible layouts are [rcr, rrr, crr, ccr])
-# replace "mul" with either of mul,add,passthrough for Elementwise function as Multiply, Add or Passthrough respectively. If this is not specified it is considered as mul by default.
-../script/cmake-ck-dev.sh  ../ [Arch] -DGEMM_MULTI_D_DATATYPE="[Datatype]" -DGEMM_MULTI_D_LAYOUT="[Layout1;Layout2]" -DGEMM_MULTI_D_ELEMENTWISE_FUNCTION="mul"
-# generate different executable for each passed datatype
-make benchmark_gemm_multi_d_[Datatype]_[Layout1] -j
-make benchmark_gemm_multi_d_[Datatype]_[Layout2] -j
-```
-`benchmark_gemm_multi_d_[Datatype]_[Layout]` will be located in the `./bin/` directory.
-
-`benchmark_gemm_multi_d_[Datatype]_[Layout]` must be rebuilt everytime if configuration file is modified.
-
-``` bash
-rm -rf tile_engine/ && make benchmark_gemm_multi_d_[Datatype]_[Layout] -j  # rebuild
-```
-
-## For eaxmple build for gfx942 for datatype with rcr layout
-``` bash
-mkdir build && cd build
-../script/cmake-ck-dev.sh  ../ gfx942 -DGEMM_MULTI_D_DATATYPE="fp16" -DGEMM_MULTI_D_LAYOUT="rcrr" 
-make benchmark_gemm_multi_d_fp16_rcrr -j
-
-## benchmark_gemm inputs
-```
-                      -m    The value for m dimension. Default is 3840.
-                      -n    The value for n dimension. Default is 4096.
-                      -k    The value for k dimension. Default is 2048.
-               -stride_a    The stride value for tensor A. Default is 0.
-               -stride_b    The stride value for tensor B. Default is 0.
-              -stride_ds    The stride value for tensor Ds. Default is 0.
-               -stride_e    The stride value for tensor E. Default is 0.
-                -split_k    The split value for k dimension. Default is 1.
-                  -verify    The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 for validation on GPU. Default is 1, validation on CPU, as validation on GPU is not supported.
-                    -log    Wether output kernel instance information or not. Possible values are true or false. Default is false.
-                 -warmup    The number of iterations before benchmark the kernel. Default is 50.
-                 -repeat    The number of iterations to benchmark the kernel. Default is 100.
-                  -timer    Whether if the timer is gpu timer or not. Possible values are false or true. Default is true.
-                   -init    The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random.
-            -flush_cache    To flush cache, possible values are true or false. Default is false.
-         -rotating_count    Number of iterations to rotate the cache. Default is 5.
-                 -metric    Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency.
-           -csv_filename    The filename of benchmark result. Default is gemm_multi_d_kernel.
-               -pipeline    The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.
-              -scheduler    The type of scheduler. Possible values are intrawave. Default is intrawave.
-               -epilogue    The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.
-                  -pad_m    Whether pad or not in m direction. Possible values are true or false. Default is false.
-                  -pad_n    Whether pad or not in n direction. Possible values are true or false. Default is false.
-                  -pad_k    Whether pad or not in k direction. Possible values are true or false. Default is false.
-
-Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in user_provided_config.json 
-```
-Note: In `./configs/user_provided_config.json` pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be from one of the values specified above.
-
-## Example
-
-The following JSON file specifies parameters used to generate and build GEMM kernels across all possible combinations of pipelines, schedulers, epilogues with different tile and warp sizes.
-
-```json
-{     
-    /// other parameters ///
-    
-    "tile_m": {
-      "values": [256]
-    },
-    "tile_n": {
-      "values": [256]
-    },
-    "tile_k": {
-      "values": [64, 32]
-    },
-
-    /// other parameters ///
-
-    "pipeline": {
-      "values": ["compv3", "compv4", "mem"]
-    },
-    "scheduler": {
-      "values": ["intrawave", "interwave"]
-    },
-    "epilogue": {
-      "values": ["cshuffle"]
-    }
-}
-```
-
-At runtime, a specific subset of the generated kernels can be selected using command-line arguments.
-``` bash
-./bin/benchmark_gemm_multi_d_[Datatype]_[Layout] -pipeline=compv3 -scheduler=intrawave -epilogue=cshuffle 
-```
-The above command runs kernels configured with the compv3 pipeline, intrawave scheduler, and cshuffle epilogue, while sweeping over different BlockTile sizes, WarpTile sizes, and WarpTile mappings.
--- a/tile_engine/ops/gemm_multi_d/benchmark_gemm_multi_d.cpp
+++ b/tile_engine/ops/gemm_multi_d/benchmark_gemm_multi_d.cpp
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <functional>
-#include <tuple>
-#include <exception>
-
-#include "benchmark_gemm_multi_d.hpp"
-#include "gemm_multi_d_profiler.hpp"
-
-void benchmark_gemm_multi_d(const ck_tile::ArgParser& arg_parser)
-{
-    GemmMultiDProblem gemm_multi_d_problem{arg_parser.get_int("split_k"),
-                                           arg_parser.get_int("m"),
-                                           arg_parser.get_int("n"),
-                                           arg_parser.get_int("k"),
-                                           arg_parser.get_int("stride_a"),
-                                           arg_parser.get_int("stride_b"),
-                                           arg_parser.get_int("stride_ds"),
-                                           arg_parser.get_int("stride_ds"),
-                                           arg_parser.get_int("stride_e"),
-                                           DataTypeTraits<ADataType>::name,
-                                           DataTypeTraits<BDataType>::name,
-                                           DataTypeTraits<D0DataType>::name,
-                                           DataTypeTraits<D1DataType>::name,
-                                           DataTypeTraits<AccDataType>::name,
-                                           DataTypeTraits<EDataType>::name,
-                                           ALayout::name,
-                                           BLayout::name,
-                                           D0Layout::name,
-                                           D1Layout::name,
-                                           ELayout::name};
-
-    Setting setting{arg_parser.get_int("warmup"),
-                    arg_parser.get_int("repeat"),
-                    arg_parser.get_bool("timer"),
-                    arg_parser.get_int("verify"),
-                    arg_parser.get_int("init"),
-                    arg_parser.get_bool("log"),
-                    arg_parser.get_str("csv_filename"),
-                    arg_parser.get_bool("flush_cache"),
-                    arg_parser.get_int("rotating_count")};
-
-    auto& profiler = GemmMultiDProfiler::instance(setting);
-
-    try
-    {
-        auto kernel_func = get_kernel_func_by_trait(arg_parser);
-        profiler.benchmark(gemm_multi_d_problem, kernel_func);
-        profiler.select_best_instance(static_cast<Metric>(arg_parser.get_int("metric")));
-    }
-    catch(const std::exception& e)
-    {
-        std::cerr << "Benchmark failed: " << e.what() << std::endl;
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    try
-    {
-        auto [result, parser] = create_args(argc, argv);
-        if(!result)
-            return EXIT_FAILURE;
-        benchmark_gemm_multi_d(parser);
-        return 0;
-    }
-    catch(const std::exception& e)
-    {
-        std::cerr << "Error: " << e.what() << "\n";
-        return EXIT_FAILURE;
-    }
-}
--- a/tile_engine/ops/gemm_multi_d/configs/custom_ci_config.json
+++ b/tile_engine/ops/gemm_multi_d/configs/custom_ci_config.json
@@ -1,80 +0,0 @@
-{
-  "tile_config": {
-    "tile_m": {
-      "values": [
-        256      ]
-    },
-    "tile_n": {
-      "values": [
-        128
-      ]
-    },
-    "tile_k": {
-      "values": [
-        32
-      ]
-    },
-    "warp_m": {
-      "values": [
-        2
-      ]
-    },
-    "warp_n": {
-      "values": [
-        2
-      ]
-    },
-    "warp_k": {
-      "values": [
-        1
-      ]
-    },
-    "warp_tile_m": {
-      "values": [
-        16
-      ]
-    },
-    "warp_tile_n": {
-      "values": [
-        16
-      ]
-    },
-    "warp_tile_k": {
-      "values": [
-        16
-      ]
-    }
-  },
-  "trait_config": {
-    "pipeline": {
-      "values": [
-        "compv3"
-      ]
-    },
-    "scheduler": {
-      "values": [
-        "intrawave"
-      ]
-    },
-    "epilogue": {
-      "values": [
-        "cshuffle"
-      ]
-    },
-    "pad_m": {
-      "values": [
-        false
-      ]
-    },
-    "pad_n": {
-      "values": [
-        false
-      ]
-    },
-    "pad_k": {
-      "values": [
-        false
-      ]
-    }
-  }
-}
--- a/tile_engine/ops/gemm_multi_d/configs/default_config.json
+++ b/tile_engine/ops/gemm_multi_d/configs/default_config.json
@@ -1,84 +1,104 @@
 {
-  "tile_config": {
-    "tile_m": {
-      "values": [
-        256
-      ]
+    "tile_config": {
+        "tile_m": {
+            "max": 256,
+            "min": 64,
+            "step": 64
+        },
+        "tile_n": {
+            "max": 256,
+            "min": 64,
+            "step": 64
+        },
+        "tile_k": {
+            "max": 256,
+            "min": 64,
+            "step": 64
+        },
+        "warp_m": {
+            "values": [
+                4,
+                2,
+                1
+            ]
+        },
+        "warp_n": {
+            "values": [
+                4,
+                2,
+                1
+            ]
+        },
+        "warp_k": {
+            "values": [
+                1
+            ]
+        },
+        "warp_tile_m": {
+            "values": [
+                4,
+                16,
+                32
+            ]
+        },
+        "warp_tile_n": {
+            "values": [
+                16,
+                32,
+                64
+            ]
+        },
+        "warp_tile_k": {
+            "values": [
+                8,
+                16,
+                32,
+                64,
+                128
+            ]
+        }
    },
-    "tile_n": {
-      "values": [
-        128
-      ]
+    "trait_config": {
+        "pipeline": {
+            "values": [
+                "compv3",
+                "compv4",
+                "mem"
+            ]
+        },
+        "scheduler": {
+            "values": [
+                "intrawave",
+                "interwave"
+            ]
+        },
+        "epilogue": {
+            "values": [
+                "cshuffle",
+                "default"
+            ]
+        },
+        "pad_m": {
+            "values": [
+                false
+            ]
+        },
+        "pad_n": {
+            "values": [
+                false
+            ]
+        },
+        "pad_k": {
+            "values": [
+                false
+            ]
+        },
+        "persistent": {
+            "values": [
+                false,
+                true
+            ]
+        }
    },
-    "tile_k": {
-      "values": [
-        32
-      ]
-    },
-    "warp_m": {
-      "values": [
-        2
-      ]
-    },
-    "warp_n": {
-      "values": [
-        2
-      ]
-    },
-    "warp_k": {
-      "values": [
-        1
-      ]
-    },
-    "warp_tile_m": {
-      "values": [
-        16
-      ]
-    },
-    "warp_tile_n": {
-      "values": [
-        16
-      ]
-    },
-    "warp_tile_k": {
-      "values": [
-        16
-      ]
-    }
-  },
-  "trait_config": {
-    "pipeline": {
-      "values": [
-        "compv3",
-        "compv4",
-        "mem"
-      ]
-    },
-    "scheduler": {
-      "values": [
-        "intrawave",
-        "interwave"
-      ]
-    },
-    "epilogue": {
-      "values": [
-        "cshuffle"
-      ]
-    },
-    "pad_m": {
-      "values": [
-        false
-      ]
-    },
-    "pad_n": {
-      "values": [
-        false
-      ]
-    },
-    "pad_k": {
-      "values": [
-        false
-      ]
-    }
-  }
-}
+    "k_block_per_cu": 1
+}
--- a/tile_engine/ops/gemm_multi_d/configs/user_provided_config.json
+++ b/tile_engine/ops/gemm_multi_d/configs/user_provided_config.json
@@ -2,12 +2,12 @@
  "tile_config": {
    "tile_m": {
      "values": [
-        256
+        64
      ]
    },
    "tile_n": {
      "values": [
-        256
+        192
      ]
    },
    "tile_k": {
@@ -42,24 +42,24 @@
    },
    "warp_tile_k": {
      "values": [
-        16
+        8
      ]
    }
  },
  "trait_config": {
    "pipeline": {
      "values": [
-        "compv3"
+        "compv4"
      ]
    },
    "scheduler": {
      "values": [
-        "intrawave"      
+        "intrawave"
      ]
    },
    "epilogue": {
      "values": [
-        "cshuffle"      
+        "cshuffle"
      ]
    },
    "pad_m": {
@@ -76,6 +76,12 @@
      "values": [
        false
      ]
+    },
+    "persistent": {
+        "values": [
+            true
+        ]
    }
-  }
+  },
+  "k_block_per_cu": 1
 }
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.hpp
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.hpp
@@ -7,80 +7,14 @@
 #include <string>
 #include <fstream>
 #include <stdexcept>
+#include <iomanip>

-#include "gemm_multi_d_host_api.hpp"
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "gemm_multi_d_common.hpp"

-struct GemmMultiDProblem
-{
-    int split_k_;
-    int m_, n_, k_;
-    int stride_a_, stride_b_, stride_d0_, stride_d1_, stride_e_;
-
-    std::string dtype_a_, dtype_b_, dtype_d0_, dtype_d1_, dtype_acc_, dtype_e_;
-    std::string layout_a_, layout_b_, layout_d0_, layout_d1_, layout_e_;
-
-    friend std::ostream& operator<<(std::ostream& os, const GemmMultiDProblem& problem)
-    {
-        os << "{\n"
-           << "   \"split_k\":" << problem.split_k_ << ",\n"
-           << "   \"m\":" << problem.m_ << ",\n"
-           << "   \"n\":" << problem.n_ << ",\n"
-           << "   \"k\":" << problem.k_ << ",\n"
-           << "   \"stride_a\":" << problem.stride_a_ << ",\n"
-           << "   \"stride_b\":" << problem.stride_b_ << ",\n"
-           << "   \"stride_d0\":" << problem.stride_d0_ << ",\n"
-           << "   \"stride_d1\":" << problem.stride_d1_ << ",\n"
-           << "   \"stride_e\":" << problem.stride_e_ << ",\n"
-           << "   \"dtype_a\":\"" << problem.dtype_a_ << "\",\n"
-           << "   \"dtype_b\":\"" << problem.dtype_b_ << "\",\n"
-           << "   \"dtype_d0\":\"" << problem.dtype_d0_ << "\",\n"
-           << "   \"dtype_d1\":\"" << problem.dtype_d1_ << "\",\n"
-           << "   \"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n"
-           << "   \"dtype_e\":\"" << problem.dtype_e_ << "\",\n"
-           << "   \"layout_a\":\"" << problem.layout_a_ << "\",\n"
-           << "   \"layout_b\":\"" << problem.layout_b_ << "\",\n"
-           << "   \"layout_d0\":\"" << problem.layout_d0_ << "\",\n"
-           << "   \"layout_d1\":\"" << problem.layout_d1_ << "\",\n"
-           << "   \"layout_e\":\"" << problem.layout_e_ << "\"\n"
-           << "}";
-        return os;
-    }
-};
-
-struct Setting
-{
-    int n_warmup_;
-    int n_repeat_;
-    bool is_gpu_timer_;
-    int verify_;
-    int init_method_;
-    bool log_;
-    std::string csv_filename_;
-    bool flush_cache_;
-    int rotating_count_;
-};
-
-// @brief Function to get the kernel output with reference implementation on CPU
-void gemm_multi_d_host_reference(int verify,
-                                 ck_tile::HostTensor<ADataType>& a_m_k,
-                                 ck_tile::HostTensor<BDataType>& b_k_n,
-                                 ck_tile::HostTensor<D0DataType>& d0_m_n,
-                                 ck_tile::HostTensor<D1DataType>& d1_m_n,
-                                 ck_tile::HostTensor<EDataType>& e_m_n_host_result)
-{
-    if(verify > 0)
-    {
-        // Currently supporting on CPU verification for Gemm Multi D
-        // e_m_n_host_result.SetZero();
-        ck_tile::reference_gemm_multiple_d<ADataType,
-                                           BDataType,
-                                           DsDataType,
-                                           AccDataType,
-                                           EDataType,
-                                           ElementWiseFn>(
-            a_m_k, b_k_n, {d0_m_n, d1_m_n}, e_m_n_host_result);
-    }
-}
+// Data types and Layouts are defined by the generated kernel headers
+// No hardcoded type definitions here to avoid conflicts

 enum class Metric
 {
@@ -100,6 +34,43 @@ inline constexpr auto get_metric_name(Metric m)
    }
 }

+struct GemmMultiDProblem // m/n/k, split_k, strides plus dtype and layout names for one problem
+{
+    int split_k_;
+    int m_, n_, k_;
+    int stride_a_, stride_b_, stride_d0_, stride_d1_, stride_c_;
+    // dtype_* and layout_* are stored as strings; operator<< prints them quoted.
+    std::string dtype_a_, dtype_b_, dtype_d0_, dtype_d1_, dtype_acc_, dtype_c_;
+    std::string layout_a_, layout_b_, layout_d0_, layout_d1_, layout_c_;
+    // Streams every field as one JSON-style object, one "key":value pair per line.
+    friend std::ostream& operator<<(std::ostream& os, const GemmMultiDProblem& problem)
+    {
+        os << "{\n"
+           << "   \"split_k\":" << problem.split_k_ << ",\n"
+           << "   \"m\":" << problem.m_ << ",\n"
+           << "   \"n\":" << problem.n_ << ",\n"
+           << "   \"k\":" << problem.k_ << ",\n"
+           << "   \"stride_a\":" << problem.stride_a_ << ",\n"
+           << "   \"stride_b\":" << problem.stride_b_ << ",\n"
+           << "   \"stride_d0\":" << problem.stride_d0_ << ",\n"
+           << "   \"stride_d1\":" << problem.stride_d1_ << ",\n"
+           << "   \"stride_c\":" << problem.stride_c_ << ",\n"
+           << "   \"dtype_a\":\"" << problem.dtype_a_ << "\",\n"
+           << "   \"dtype_b\":\"" << problem.dtype_b_ << "\",\n"
+           << "   \"dtype_d0\":\"" << problem.dtype_d0_ << "\",\n"
+           << "   \"dtype_d1\":\"" << problem.dtype_d1_ << "\",\n"
+           << "   \"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n"
+           << "   \"dtype_c\":\"" << problem.dtype_c_ << "\",\n"
+           << "   \"layout_a\":\"" << problem.layout_a_ << "\",\n"
+           << "   \"layout_b\":\"" << problem.layout_b_ << "\",\n"
+           << "   \"layout_d0\":\"" << problem.layout_d0_ << "\",\n"
+           << "   \"layout_d1\":\"" << problem.layout_d1_ << "\",\n"
+           << "   \"layout_c\":\"" << problem.layout_c_ << "\"" << "\n"
+           << "}";
+        return os;
+    }
+};
+
 struct PerformanceResult
 {
    double latency_;
@@ -143,15 +114,28 @@ struct KernelInstance
    friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj)
    {
        os << "{\n"
-           << " \"name\": \"" << "{\n"
-           << obj.name_ << "\n}" << "\",\n"
-           << " \"problem\": \"" << obj.problem_ << "\",\n"
+           << " \"name\": \"" << obj.name_ << "\",\n"
+           << " \"problem\": " << obj.problem_ << ",\n"
           << " \"perf_result\": " << obj.perf_result_ << "\n"
           << "}";
        return os;
    }
 };

+struct Setting
+{
+    int n_warmup_;
+    int n_repeat_;
+    bool is_gpu_timer_;
+    int verify_;
+    int init_method_;
+    bool log_;
+    std::string csv_filename_;
+    bool flush_cache_;
+    int rotating_count_;
+    bool json_output_;
+};
+
 inline std::string get_rocm_version()
 {
    std::ifstream version_file("/opt/rocm/.info/version");
@@ -164,6 +148,11 @@ inline std::string get_rocm_version()
    return "Unknown";
 }

+template <typename ADataType,
+          typename BDataType,
+          typename D0DataType,
+          typename AccDataType,
+          typename CDataType>
 auto calculate_rtol_atol(const ck_tile::index_t K,
                         const ck_tile::index_t kbatch,
                         const float max_accumulated_value)
@@ -175,17 +164,17 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
        std::conditional_t<sizeof(ComputeTypeAB) < sizeof(D0DataType), ComputeTypeAB, D0DataType>;

    // Calculate thresholds
-    const auto rtol = ck_tile::get_relative_threshold<ComputeType, EDataType, AccDataType>(
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
        ck_tile::integer_divide_ceil(K, kbatch));

-    const auto atol = ck_tile::get_absolute_threshold<ComputeType, EDataType, AccDataType>(
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));

    // Calculate error due to split_k accumulation
    const auto rtol_split_k =
-        ck_tile::get_relative_threshold<EDataType, EDataType, EDataType>(kbatch);
+        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);

-    const auto atol_split_k = ck_tile::get_absolute_threshold<EDataType, EDataType, EDataType>(
+    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
        max_accumulated_value, kbatch);

    // Use higher threshold
@@ -195,16 +184,19 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
 /// @brief Function to compare the results of the device and host computations
 bool compare(std::string instanceName,
             ck_tile::index_t K,
-             ck_tile::HostTensor<EDataType>& e_m_n_dev_result,
-             ck_tile::HostTensor<EDataType>& e_m_n_host_result)
+             ck_tile::index_t kbatch,
+             ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
+             ck_tile::HostTensor<CDataType>& c_m_n_host_result)
 {
    const float max_accumulated_value =
-        *std::max_element(e_m_n_host_result.mData.begin(), e_m_n_host_result.mData.end());
+        *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end());

-    const auto rtol_atol = calculate_rtol_atol(K, 1, max_accumulated_value);
+    const auto rtol_atol =
+        calculate_rtol_atol<ADataType, BDataType, D0DataType, AccDataType, CDataType>(
+            K, kbatch, max_accumulated_value);

-    bool pass = ck_tile::check_err(e_m_n_dev_result,
-                                   e_m_n_host_result,
+    bool pass = ck_tile::check_err(c_m_n_dev_result,
+                                   c_m_n_host_result,
                                   "Error: Incorrect results!",
                                   rtol_atol.at(ck_tile::number<0>{}),
                                   rtol_atol.at(ck_tile::number<1>{}));
@@ -216,3 +208,25 @@ bool compare(std::string instanceName,

    return pass;
 }
+
+/// @brief Runs the host reference GEMM Multi D into c_m_n_host_result when verify > 0.
+inline void gemm_multi_d_host_reference(int verify,
+                                        ck_tile::HostTensor<ADataType>& a_m_k,
+                                        ck_tile::HostTensor<BDataType>& b_k_n,
+                                        ck_tile::HostTensor<D0DataType>& d0_m_n,
+                                        ck_tile::HostTensor<D1DataType>& d1_m_n,
+                                        ck_tile::HostTensor<CDataType>& c_m_n_host_result)
+{
+    if(verify > 0)
+    {
+        // Every verify > 0 value takes this host-tensor path; there is no GPU branch here.
+        // d0_m_n and d1_m_n are passed together as the braced D-tensor list, in that order.
+        ck_tile::reference_gemm_multiple_d<ADataType,
+                                           BDataType,
+                                           DsDataType,
+                                           AccDataType,
+                                           CDataType,
+                                           ElementWiseFn>(
+            a_m_k, b_k_n, {d0_m_n, d1_m_n}, c_m_n_host_result);
+    }
+}
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py
@@ -0,0 +1,683 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+import sys
+import json
+import subprocess
+import argparse
+import csv
+import time
+from pathlib import Path
+from typing import List, Dict, Tuple, Optional
+
+
+class GemmMultiDBenchmark:
+    def __init__(self, build_dir: str, verbose: bool = False):
+        """Set up a benchmark runner rooted at ``build_dir``.
+
+        Args:
+            build_dir: Build directory; stored as a ``Path``.
+            verbose: When True, methods print extra progress information.
+        """
+        # Accumulated per-kernel results; filled in by benchmark_sweep().
+        self.results = []
+        self.verbose = verbose
+        self.build_dir = Path(build_dir)
+
+    def discover_kernels(self) -> List[Path]:
+        """Find all benchmark_gemm_multi_d_* executables in the build directory.
+
+        Looks in ``<build_dir>/bin``. Only regular files are returned, so a
+        directory that happens to match the pattern is never handed to
+        ``subprocess``. The list is sorted so that the benchmark order (and
+        therefore tie-breaking between equally fast kernels) is reproducible
+        from run to run.
+
+        Returns:
+            Sorted list of matching paths, or an empty list when the ``bin``
+            directory does not exist.
+        """
+        bin_dir = self.build_dir / "bin"
+        if not bin_dir.exists():
+            print(f"Error: Binary directory {bin_dir} does not exist")
+            return []
+
+        kernels = sorted(
+            path for path in bin_dir.glob("benchmark_gemm_multi_d_*") if path.is_file()
+        )
+        if self.verbose:
+            print(f"Found {len(kernels)} kernel executables")
+            for k in kernels:
+                print(f"  - {k.name}")
+        return kernels
+
+    def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]:
+        """Describe a kernel from the name of its executable.
+
+        The stem of ``kernel_path`` is split on "_" and the leading fields are
+        read by position, e.g. for
+        benchmark_gemm_multi_d_fp16_rcr_mem_default_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16
+        tokens 0-3 are the "benchmark_gemm_multi_d" prefix and tokens 4-8 are
+        data type, layout, pipeline, epilogue and scheduler. A field whose
+        token is missing stays "unknown".
+
+        Returns:
+            Dict with the executable path, the name, the positional fields,
+            everything returned by parse_detailed_config() and a "config_id".
+        """
+        name = kernel_path.stem
+        tokens = name.split("_")
+
+        # (field, token index) pairs for the fixed leading part of the name.
+        field_positions = (
+            ("data_type", 4),
+            ("layout", 5),
+            ("pipeline", 6),
+            ("epilogue", 7),
+            ("scheduler", 8),
+        )
+
+        info = {"executable": str(kernel_path), "name": name}
+        info.update(
+            dict.fromkeys(
+                ("data_type", "layout", "pipeline", "scheduler", "epilogue"), "unknown"
+            )
+        )
+        for field, position in field_positions:
+            if position < len(tokens):
+                info[field] = tokens[position]
+
+        # Tile / warp sizes and boolean flags are encoded further along the name.
+        info.update(self.parse_detailed_config(name))
+
+        # The compact identifier is derived from everything collected so far.
+        info["config_id"] = self.generate_config_id(info)
+
+        return info
+
+    def parse_detailed_config(self, kernel_name: str) -> Dict:
+        """Recover tile sizes, warp layout and boolean flags from a kernel name.
+
+        The name is split on "_". Two kinds of tokens are recognised:
+
+        * the first uninterrupted run of "True"/"False" tokens; when it holds at
+          least four values the first four are read as pad_m, pad_n, pad_k and
+          persistent (any further values are ignored);
+        * "AxBxC" tokens made of three positive integers, e.g.
+          256x256x32_2x2x1_4x64x16.
+
+        Dimension triples are ranked by their largest component: the biggest is
+        taken as the tile sizes, the next as the warp tile and the third as the
+        warp configuration. With only two triples the smaller one is the warp
+        configuration; with one triple it is the tile sizes.
+
+        Returns:
+            Dict with "tile_sizes", "warp_config", "warp_tile" and
+            "optimization_flags"; entries that could not be parsed keep their
+            0 / False defaults.
+        """
+        tokens = kernel_name.split("_")
+
+        # Collect the first consecutive run of boolean tokens and stop at the
+        # first non-boolean token that follows it.
+        flags = []
+        for token in tokens:
+            if token in ("True", "False"):
+                flags.append(token == "True")
+            elif flags:
+                break
+
+        flag_names = ("pad_m", "pad_n", "pad_k", "persistent")
+        optimization_flags = dict.fromkeys(flag_names, False)
+        if len(flags) >= len(flag_names):
+            optimization_flags.update(zip(flag_names, flags))
+
+        # Collect every token that is exactly three positive integers joined by "x".
+        triples = []
+        for token in tokens:
+            pieces = token.split("x")
+            if len(pieces) != 3:
+                continue
+            try:
+                dims = [int(piece) for piece in pieces]
+            except ValueError:
+                continue
+            if min(dims) > 0:
+                triples.append(dims)
+
+        tile_sizes = {"tile_m": 0, "tile_n": 0, "tile_k": 0}
+        warp_config = {"warp_m": 0, "warp_n": 0, "warp_k": 0}
+        warp_tile = {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0}
+
+        def fill(target, dims):
+            # Keys were inserted in m, n, k order, matching the triple's order.
+            for key, value in zip(list(target), dims):
+                target[key] = value
+
+        # Stable sort, largest component first.
+        ranked = sorted(triples, key=max, reverse=True)
+        if len(ranked) >= 3:
+            fill(tile_sizes, ranked[0])
+            fill(warp_tile, ranked[1])
+            fill(warp_config, ranked[2])
+        elif len(ranked) == 2:
+            fill(tile_sizes, ranked[0])
+            fill(warp_config, ranked[1])
+        elif len(ranked) == 1:
+            fill(tile_sizes, ranked[0])
+
+        return {
+            "tile_sizes": tile_sizes,
+            "warp_config": warp_config,
+            "warp_tile": warp_tile,
+            "optimization_flags": optimization_flags,
+        }
+
+    def generate_config_id(self, info: Dict) -> str:
+        """Build a compact, underscore-joined identifier for a kernel.
+
+        The identifier always starts with data type, layout, pipeline and
+        scheduler ("unk" when a key is absent). The tile sizes, the warp
+        configuration (prefixed "w") and the warp tile (prefixed "wt") are
+        appended as "MxNxK" only when their first component is positive.
+        """
+        segments = [
+            info.get(field, "unk")
+            for field in ("data_type", "layout", "pipeline", "scheduler")
+        ]
+
+        # (info key, prefix in the id, component keys in m/n/k order)
+        optional_groups = (
+            ("tile_sizes", "", ("tile_m", "tile_n", "tile_k")),
+            ("warp_config", "w", ("warp_m", "warp_n", "warp_k")),
+            ("warp_tile", "wt", ("warp_tile_m", "warp_tile_n", "warp_tile_k")),
+        )
+        for group_name, prefix, keys in optional_groups:
+            group = info.get(group_name, {})
+            # The first component doubles as the "was this parsed" marker.
+            if group.get(keys[0], 0) > 0:
+                segments.append(prefix + "x".join(str(group[key]) for key in keys))
+
+        return "_".join(segments)
+
+    def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]:
+        """Execute one kernel binary and return its parsed JSON output.
+
+        Every entry of ``params`` is passed as ``-key=value`` and
+        ``-json_output=true`` is appended. The captured stdout is stored in
+        ``<build_dir>/results/<kernel stem>.json`` and then parsed with
+        parse_json_file().
+
+        Returns:
+            The parsed result, or None when the process exits with a non-zero
+            code, prints nothing, exceeds the 60 second timeout, or any other
+            exception is raised while running it.
+        """
+        results_dir = self.build_dir / "results"
+        results_dir.mkdir(exist_ok=True)
+
+        # One output file per kernel, named after the executable.
+        json_file = results_dir / f"{kernel_path.stem}.json"
+
+        cmd = [
+            str(kernel_path),
+            *(f"-{key}={value}" for key, value in params.items()),
+            # Ask the executable for clean JSON on stdout.
+            "-json_output=true",
+        ]
+
+        if self.verbose:
+            print(f"Running: {' '.join(cmd)}")
+
+        try:
+            completed = subprocess.run(
+                cmd, capture_output=True, text=True, timeout=60
+            )
+
+            if completed.returncode != 0:
+                print(f"Error running {kernel_path.name}: {completed.stderr}")
+                return None
+
+            output = completed.stdout.strip()
+            if not output:
+                print(f"No output from {kernel_path.name}")
+                return None
+
+            # Keep the raw output on disk, then parse it from there.
+            json_file.write_text(output)
+            return self.parse_json_file(json_file)
+
+        except subprocess.TimeoutExpired:
+            print(f"Timeout running {kernel_path.name}")
+            return None
+        except Exception as e:
+            print(f"Error running {kernel_path.name}: {e}")
+            return None
+
+    def parse_json_file(self, json_file: Path) -> Optional[Dict]:
+        """Load one kernel's JSON output file.
+
+        The parsed document is copied and, when it has a "perf_result" entry,
+        the flat fields "time_ms", "tflops" and "bandwidth_gb_s" are added from
+        "latency(ms)", "tflops(TFlops)" and "bandwidth(GB/s)" (0 when absent).
+
+        Returns:
+            The augmented copy, or None when the file cannot be read or parsed.
+            Failures are only reported when ``self.verbose`` is set.
+        """
+        # (flat field added to the result, key inside "perf_result")
+        convenience_fields = (
+            ("time_ms", "latency(ms)"),
+            ("tflops", "tflops(TFlops)"),
+            ("bandwidth_gb_s", "bandwidth(GB/s)"),
+        )
+
+        try:
+            with open(json_file, "r") as f:
+                data = json.loads(f.read().strip())
+
+            result = data.copy()
+            if "perf_result" in data:
+                perf = data["perf_result"]
+                for field, source_key in convenience_fields:
+                    result[field] = perf.get(source_key, 0)
+        except json.JSONDecodeError as e:
+            message = f"Failed to parse JSON from {json_file}: {e}"
+        except Exception as e:
+            message = f"Error reading JSON file {json_file}: {e}"
+        else:
+            return result
+
+        if self.verbose:
+            print(message)
+        return None
+
+    def benchmark_problem_size(
+        self,
+        kernels: List[Path],
+        m: int,
+        n: int,
+        k: int,
+        split_k: int = 1,
+        verify: int = 0,
+        warmup: int = 50,
+        repeat: int = 100,
+        flush_cache: bool = True,
+        rotating_count: int = 1000,
+    ) -> List[Dict]:
+        """Run every kernel in ``kernels`` on a single M/N/K problem.
+
+        The arguments are forwarded to each executable through run_kernel();
+        ``flush_cache`` is sent as the lowercase string "true"/"false". Kernels
+        for which run_kernel() yields nothing are left out of the result.
+
+        Returns:
+            One structured dict per successful kernel holding its name, config
+            id, reported problem and performance data, the configuration parsed
+            from its file name, and flat time/tflops/bandwidth fields.
+        """
+        params = dict(
+            m=m,
+            n=n,
+            k=k,
+            split_k=split_k,
+            verify=verify,
+            warmup=warmup,
+            repeat=repeat,
+            flush_cache=str(flush_cache).lower(),
+            rotating_count=rotating_count,
+        )
+
+        print(f"\nBenchmarking M={m}, N={n}, K={k}, split_k={split_k}")
+
+        scalar_fields = ("data_type", "layout", "pipeline", "scheduler", "epilogue")
+        group_fields = ("tile_sizes", "warp_config", "warp_tile", "optimization_flags")
+
+        results = []
+        for kernel_path in kernels:
+            kernel_info = self.extract_kernel_info(kernel_path)
+            outcome = self.run_kernel(kernel_path, params)
+            if not outcome:
+                continue
+
+            # Configuration as parsed from the executable's file name.
+            config = {field: kernel_info[field] for field in scalar_fields}
+            config.update((field, kernel_info.get(field, {})) for field in group_fields)
+
+            entry = {
+                "name": kernel_info["name"],  # kept for compatibility
+                "config_id": kernel_info["config_id"],
+                "problem": outcome.get("problem", {}),
+                "perf_result": outcome.get("perf_result", {}),
+                "config": config,
+                "executable": kernel_info["executable"],
+                # Flat fields kept for backward compatibility
+                "time_ms": outcome.get("time_ms", 0),
+                "tflops": outcome.get("tflops", 0),
+                "bandwidth_gb_s": outcome.get("bandwidth_gb_s", 0),
+            }
+            results.append(entry)
+
+            if self.verbose:
+                print(
+                    f"  {kernel_info['config_id']}: "
+                    f"{entry['tflops']:.2f} TFLOPS, "
+                    f"{entry['bandwidth_gb_s']:.2f} GB/s, "
+                    f"{entry['time_ms']:.2f}ms"
+                )
+
+        return results
+
+    def find_best_kernel(
+        self, results: List[Dict], metric: str = "tflops"
+    ) -> Optional[Dict]:
+        """Pick the best entry of ``results`` for the given metric.
+
+        "tflops" and "bandwidth_gb_s" are maximised, "time_ms" is minimised; an
+        entry that lacks the metric counts as 0 (or infinity for "time_ms").
+
+        Returns:
+            The winning result dict, or None when ``results`` is empty.
+
+        Raises:
+            ValueError: ``metric`` is not one of the three supported names and
+                ``results`` is non-empty.
+        """
+        if not results:
+            return None
+
+        # metric -> (selection function, value used when the metric is missing)
+        selectors = {
+            "tflops": (max, 0),
+            "time_ms": (min, float("inf")),
+            "bandwidth_gb_s": (max, 0),
+        }
+        if metric not in selectors:
+            raise ValueError(f"Unknown metric: {metric}")
+
+        pick, missing = selectors[metric]
+        return pick(results, key=lambda entry: entry.get(metric, missing))
+
+    def benchmark_sweep(
+        self,
+        problem_sizes: List[Tuple[int, int, int]],
+        split_k_values: Optional[List[int]] = None,
+        verify: bool = False,
+        warmup: int = 50,
+        repeat: int = 100,
+        flush_cache: bool = True,
+        rotating_count: int = 1000,
+    ) -> Dict:
+        """Run comprehensive benchmark sweep.
+
+        Every discovered kernel is run for every (M, N, K) in ``problem_sizes``
+        combined with every value in ``split_k_values``. All individual results
+        are stored in ``self.results``.
+
+        Args:
+            problem_sizes: (M, N, K) tuples to benchmark.
+            split_k_values: Split-K factors to test; ``None`` means ``[1]``.
+            verify: When True the kernels are invoked with ``verify=2``,
+                otherwise with ``verify=0``.
+            warmup: Forwarded to the kernels as ``warmup``.
+            repeat: Forwarded to the kernels as ``repeat``.
+            flush_cache: Forwarded to the kernels as ``flush_cache``.
+            rotating_count: Forwarded to the kernels as ``rotating_count``.
+
+        Returns:
+            Mapping from "m{M}_n{N}_k{K}_splitk{S}" to the best result (highest
+            TFLOPS) for that configuration; empty when no kernels were found.
+        """
+        # A list literal as a default would be shared between calls, so the
+        # default is materialised here instead.
+        if split_k_values is None:
+            split_k_values = [1]
+
+        kernels = self.discover_kernels()
+        if not kernels:
+            print("No kernels found!")
+            return {}
+
+        all_results = []
+        best_kernels = {}
+
+        for m, n, k in problem_sizes:
+            for split_k in split_k_values:
+                results = self.benchmark_problem_size(
+                    kernels,
+                    m,
+                    n,
+                    k,
+                    split_k,
+                    verify=2 if verify else 0,
+                    warmup=warmup,
+                    repeat=repeat,
+                    flush_cache=flush_cache,
+                    rotating_count=rotating_count,
+                )
+
+                all_results.extend(results)
+
+                # Find best kernel for this configuration
+                best = self.find_best_kernel(results)
+                if best:
+                    key = f"m{m}_n{n}_k{k}_splitk{split_k}"
+                    best_kernels[key] = best
+                    print(
+                        f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)"
+                    )
+
+        self.results = all_results
+        return best_kernels
+
+    def export_csv(self, filename: str):
+        """Write ``self.results`` to ``filename`` as CSV.
+
+        The header is the sorted union of the keys of all results. Nothing is
+        written when there are no results.
+        """
+        if not self.results:
+            print("No results to export")
+            return
+
+        # Sorted union of keys so the column order is stable between runs.
+        fieldnames = sorted({key for row in self.results for key in row})
+
+        with open(filename, "w", newline="") as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            writer.writeheader()
+            for row in self.results:
+                writer.writerow(row)
+
+        print(f"Results exported to {filename}")
+
+    def export_best_kernels(self, best_kernels: Dict, filename: str):
+        """Write the best kernel per problem configuration to a text file.
+
+        After a two-line comment header, one line is written per entry of
+        ``best_kernels`` in sorted key order:
+        ``<key>: <name> (<tflops> TFLOPS, <bandwidth> GB/s, <latency>ms)``.
+        """
+        with open(filename, "w") as out:
+            out.write("# Best kernel selections\n")
+            out.write(
+                "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n"
+            )
+
+            for key in sorted(best_kernels):
+                kernel = best_kernels[key]
+                summary = (
+                    f"{kernel['tflops']:.2f} TFLOPS, "
+                    f"{kernel['bandwidth_gb_s']:.2f} GB/s, "
+                    f"{kernel['time_ms']:.2f}ms"
+                )
+                out.write(f"{key}: {kernel['name']} ({summary})\n")
+
+        print(f"Best kernels exported to {filename}")
+
+    def export_json(self, filename: str, best_kernels: Optional[Dict] = None):
+        """Export all results and best kernels to JSON with comprehensive metadata.
+
+        A result counts as successful when its "tflops" is greater than 0. The
+        written document contains:
+
+        * "benchmark_metadata": timestamp and run counts;
+        * "performance_summary": best/average/min-or-max/median for TFLOPS,
+          bandwidth and latency, plus count/average/best TFLOPS grouped by
+          pipeline, scheduler and data type;
+        * "kernel_results": ``self.results`` unchanged;
+        * "best_kernels_by_problem": ``best_kernels`` (``{}`` when None).
+
+        Medians are true medians: for an even number of samples the two middle
+        values are averaged.
+
+        Args:
+            filename: Path of the JSON file to write.
+            best_kernels: Optional mapping as returned by benchmark_sweep().
+        """
+        from datetime import datetime
+        from statistics import median
+
+        def average(values):
+            # Arithmetic mean, 0 for an empty sample.
+            return sum(values) / len(values) if values else 0
+
+        def middle(values):
+            # Median, 0 for an empty sample.
+            return median(values) if values else 0
+
+        # Calculate comprehensive summary statistics for all metrics
+        successful_results = [r for r in self.results if r.get("tflops", 0) > 0]
+
+        tflops_values = [r.get("tflops", 0) for r in successful_results]
+        bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results]
+        latency_values = [
+            r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0
+        ]
+
+        # Performance breakdown by kernel type, keyed by the config field that
+        # defines the grouping. Running TFLOPS sums are kept alongside so the
+        # averages need no second scan over the results.
+        breakdowns = {"pipeline": {}, "scheduler": {}, "data_type": {}}
+        tflops_sums = {field: {} for field in breakdowns}
+
+        for result in successful_results:
+            config = result.get("config", {})
+            tflops = result.get("tflops", 0)
+
+            for field, stats in breakdowns.items():
+                key = config.get(field, "unknown")
+                entry = stats.setdefault(
+                    key, {"count": 0, "avg_tflops": 0, "best_tflops": 0}
+                )
+                entry["count"] += 1
+                entry["best_tflops"] = max(entry["best_tflops"], tflops)
+                tflops_sums[field][key] = tflops_sums[field].get(key, 0) + tflops
+
+        for field, stats in breakdowns.items():
+            for key, entry in stats.items():
+                # count is at least 1 for every key that exists.
+                entry["avg_tflops"] = tflops_sums[field][key] / entry["count"]
+
+        output_data = {
+            "benchmark_metadata": {
+                "timestamp": datetime.now().isoformat(),
+                "total_kernels_tested": len(self.results),
+                "unique_kernels": len(
+                    set(r.get("name", "unknown") for r in self.results)
+                ),
+                "successful_runs": len(successful_results),
+                "failed_runs": len(self.results) - len(successful_results),
+            },
+            "performance_summary": {
+                "tflops_stats": {
+                    "best": max(tflops_values, default=0),
+                    "average": average(tflops_values),
+                    "min": min(tflops_values, default=0),
+                    "median": middle(tflops_values),
+                },
+                "bandwidth_stats": {
+                    "best_gb_s": max(bandwidth_values, default=0),
+                    "average_gb_s": average(bandwidth_values),
+                    "min_gb_s": min(bandwidth_values, default=0),
+                    "median_gb_s": middle(bandwidth_values),
+                },
+                "latency_stats": {
+                    "best_ms": min(latency_values, default=0),
+                    "average_ms": average(latency_values),
+                    "max_ms": max(latency_values, default=0),
+                    "median_ms": middle(latency_values),
+                },
+                "kernel_type_breakdown": {
+                    "by_pipeline": breakdowns["pipeline"],
+                    "by_scheduler": breakdowns["scheduler"],
+                    "by_data_type": breakdowns["data_type"],
+                },
+                "total_problem_configurations": len(best_kernels)
+                if best_kernels
+                else 0,
+            },
+            "kernel_results": self.results,
+            "best_kernels_by_problem": best_kernels or {},
+        }
+
+        with open(filename, "w") as f:
+            json.dump(output_data, f, indent=2)
+
+        print(f"JSON results exported to {filename}")
+        print(f"  - Total kernels: {len(self.results)}")
+        print(f"  - Successful runs: {len(successful_results)}")
+        print(f"  - Best TFLOPS: {max(tflops_values, default=0):.2f}")
+        print(f"  - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s")
+        print(f"  - Best latency: {min(latency_values, default=0):.2f}ms")
+
+
+def main():
+    """Command-line entry point for the GEMM Multi D benchmark sweep.
+
+    Parses the arguments, runs the sweep over every requested problem size and
+    split-K value, then writes the CSV and best-kernel reports (and the JSON
+    report when ``--json`` is given).
+
+    Returns:
+        0 on completion, 1 when a ``--problem-sizes`` entry is not three
+        comma-separated integers.
+    """
+    parser = argparse.ArgumentParser(
+        description="GEMM Multi D Kernel Benchmarking Tool"
+    )
+    parser.add_argument(
+        "build_dir", help="Build directory containing kernel executables"
+    )
+    parser.add_argument(
+        "--problem-sizes",
+        nargs="+",
+        default=["1024,1024,1024", "2048,2048,2048", "4096,4096,4096"],
+        help="Problem sizes as M,N,K tuples",
+    )
+    parser.add_argument(
+        "--split-k", nargs="+", type=int, default=[1], help="Split-K values to test"
+    )
+    parser.add_argument("--verify", action="store_true", help="Enable verification")
+    parser.add_argument(
+        "--csv",
+        default="gemm_multi_d_benchmark_results.csv",
+        help="CSV output filename",
+    )
+    parser.add_argument(
+        "--best", default="best_kernels.txt", help="Best kernels output filename"
+    )
+    parser.add_argument("--verbose", action="store_true", help="Verbose output")
+    parser.add_argument(
+        "--warmup",
+        type=int,
+        default=50,
+        help="Number of warmup iterations (default: 50)",
+    )
+    parser.add_argument(
+        "--repeat",
+        type=int,
+        default=100,
+        help="Number of benchmark iterations (default: 100)",
+    )
+    parser.add_argument(
+        "--flush-cache",
+        dest="flush_cache",
+        action="store_true",
+        default=True,
+        help="Enable cache flushing (default: True)",
+    )
+    # --flush-cache is on by default, so on its own it could never be switched
+    # off; this companion flag writes False to the same destination.
+    parser.add_argument(
+        "--no-flush-cache",
+        dest="flush_cache",
+        action="store_false",
+        help="Disable cache flushing",
+    )
+    parser.add_argument(
+        "--rotating-count",
+        type=int,
+        default=1000,
+        help="Number of iterations to rotate cache (default: 1000)",
+    )
+    parser.add_argument("--json", help="JSON output filename (optional)")
+
+    args = parser.parse_args()
+
+    # Parse problem sizes; a wrong element count or a non-integer both surface
+    # as ValueError.
+    problem_sizes = []
+    for size_str in args.problem_sizes:
+        try:
+            m, n, k = map(int, size_str.split(","))
+            problem_sizes.append((m, n, k))
+        except ValueError:
+            print(f"Invalid problem size: {size_str}")
+            return 1
+
+    # Create benchmark instance
+    benchmark = GemmMultiDBenchmark(args.build_dir, verbose=args.verbose)
+
+    # Run benchmark sweep
+    print("Starting GEMM Multi D kernel benchmark sweep...")
+    start_time = time.time()
+
+    best_kernels = benchmark.benchmark_sweep(
+        problem_sizes=problem_sizes,
+        split_k_values=args.split_k,
+        verify=args.verify,
+        warmup=args.warmup,
+        repeat=args.repeat,
+        flush_cache=args.flush_cache,
+        rotating_count=args.rotating_count,
+    )
+
+    elapsed_time = time.time() - start_time
+    print(f"\nBenchmark completed in {elapsed_time:.2f} seconds")
+
+    # Export results
+    benchmark.export_csv(args.csv)
+    benchmark.export_best_kernels(best_kernels, args.best)
+
+    # Export JSON if requested
+    if args.json:
+        benchmark.export_json(args.json, best_kernels)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark_single.cpp
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark_single.cpp
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <functional>
+#include <tuple>
+#include <exception>
+#include <sstream>
+#include <vector>
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "gemm_multi_d_profiler.hpp"
+#include "gemm_multi_d_common.hpp"
+
+// The kernel header is included via the compile command line with -include flag
+// It defines SelectedKernel struct and KERNEL_NAME
+// DataTypeTraits are now defined in gemm_multi_d_common.hpp
+
+/// @brief Register every supported command-line option and parse argv.
+///
+/// The help texts state the defaults that are actually registered below.
+///
+/// @return Tuple of (parse succeeded, populated ck_tile::ArgParser).
+inline auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
+        .insert("n", "4096", "The value for n dimension. Default is 4096.")
+        .insert("k", "2048", "The value for k dimension. Default is 2048.")
+        .insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
+        .insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
+        .insert("stride_ds", "0", "The stride value for the Ds tensors. Default is 0.")
+        .insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
+        .insert("split_k", "1", "The split value for k dimension. Default is 1.")
+        .insert("verify",
+                "1",
+                "The type of validation. Set to 0 for no validation or 1 for validation on "
+                "CPU (validation on GPU is not supported). Default is 1.")
+        .insert("log",
+                "false",
+                "Whether output kernel instance information or not. Possible values are true or "
+                "false. Default is false")
+        .insert("warmup",
+                "50",
+                "The number of iterations before benchmarking the kernel. Default is 50.")
+        .insert(
+            "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
+        .insert("timer",
+                "true",
+                "Whether the timer is a gpu timer or not. Possible values are false or true. "
+                "Default is true.")
+        .insert("init",
+                "0",
+                "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
+                "for constant(1). Default is 0, random.")
+        .insert("flush_cache",
+                "true",
+                "To flush cache, possible values are true or false. "
+                "Default is true.")
+        .insert("rotating_count",
+                "1000",
+                "Number of iterations to rotate the cache. Default is 1000.")
+        .insert("metric",
+                "0",
+                "Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
+                "tflops, or 2 for bandwidth. Default is 0, latency.")
+        .insert("csv_filename",
+                "",
+                "The filename of benchmark result. Default is empty (no CSV output).")
+        .insert("structured_sparsity",
+                "false",
+                "Whether use sparsity kernel or not. Possible values are true or false. Default is "
+                "false")
+        .insert("json_output",
+                "false",
+                "Whether to output results in JSON format only. Possible values are true or false. "
+                "Default is "
+                "false");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+/// @brief Benchmark the kernel selected at compile time with the parsed arguments.
+///
+/// Builds the GemmMultiDProblem and Setting structs from @p arg_parser, runs the
+/// profiler on SelectedKernel::launch and asks it to select the best instance
+/// for the requested metric. A std::exception thrown while benchmarking is
+/// reported on stderr and not propagated.
+///
+/// @param arg_parser Parser that has already consumed the command line.
+void benchmark_single(const ck_tile::ArgParser& arg_parser)
+{
+    // Element type names come from DataTypeTraits applied to the types that the
+    // generated kernel header defines.
+    const std::string dtype_a   = DataTypeTraits<ADataType>::name;
+    const std::string dtype_b   = DataTypeTraits<BDataType>::name;
+    const std::string dtype_acc = DataTypeTraits<AccDataType>::name;
+    const std::string dtype_c   = DataTypeTraits<CDataType>::name;
+    const std::string dtype_d0  = DataTypeTraits<D0DataType>::name;
+    const std::string dtype_d1  = DataTypeTraits<D1DataType>::name;
+
+    // Layout names are exposed by the layout types themselves.
+    const std::string layout_a  = ALayout::name;
+    const std::string layout_b  = BLayout::name;
+    const std::string layout_c  = CLayout::name;
+    const std::string layout_d0 = D0Layout::name;
+    const std::string layout_d1 = D1Layout::name;
+
+    // Problem description. The single "stride_ds" option is read once per D
+    // tensor, so both D tensors receive the same value.
+    GemmMultiDProblem problem{arg_parser.get_int("split_k"),
+                              arg_parser.get_int("m"),
+                              arg_parser.get_int("n"),
+                              arg_parser.get_int("k"),
+                              arg_parser.get_int("stride_a"),
+                              arg_parser.get_int("stride_b"),
+                              arg_parser.get_int("stride_ds"),
+                              arg_parser.get_int("stride_ds"),
+                              arg_parser.get_int("stride_c"),
+                              dtype_a,
+                              dtype_b,
+                              dtype_d0,
+                              dtype_d1,
+                              dtype_acc,
+                              dtype_c,
+                              layout_a,
+                              layout_b,
+                              layout_d0,
+                              layout_d1,
+                              layout_c};
+
+    // Run-time settings for the profiler.
+    Setting run_setting{arg_parser.get_int("warmup"),
+                        arg_parser.get_int("repeat"),
+                        arg_parser.get_bool("timer"),
+                        arg_parser.get_int("verify"),
+                        arg_parser.get_int("init"),
+                        arg_parser.get_bool("log"),
+                        arg_parser.get_str("csv_filename"),
+                        arg_parser.get_bool("flush_cache"),
+                        arg_parser.get_int("rotating_count"),
+                        arg_parser.get_bool("json_output")};
+
+    auto& profiler = GemmMultiDProfiler::instance(run_setting);
+
+    try
+    {
+        // Thin wrapper so the profiler can launch the compile-time selected kernel.
+        auto launch_selected_kernel =
+            [](const ck_tile::GemmMultiDHostArgs<DsDataType::size()>& host_args,
+               const ck_tile::stream_config& stream_cfg) {
+                return SelectedKernel::launch(host_args, stream_cfg);
+            };
+
+        profiler.benchmark(problem, launch_selected_kernel);
+
+        // The metric is read only after the benchmark has run.
+        const auto metric = static_cast<Metric>(arg_parser.get_int("metric"));
+        profiler.select_best_instance(metric);
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Benchmark failed: " << e.what() << std::endl;
+    }
+}
+
+/// @brief Program entry point: parse the command line and benchmark the kernel.
+///
+/// @return EXIT_FAILURE when argument parsing fails or a std::exception reaches
+///         this function, 0 otherwise.
+int main(int argc, char* argv[])
+{
+    try
+    {
+        const auto parsed = create_args(argc, argv);
+
+        // First tuple element: whether parsing succeeded.
+        if(!std::get<0>(parsed))
+        {
+            return EXIT_FAILURE;
+        }
+
+        // Second tuple element: the populated parser.
+        benchmark_single(std::get<1>(parsed));
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << "\n";
+        return EXIT_FAILURE;
+    }
+
+    return 0;
+}
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_codegen_utils.py
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_codegen_utils.py
@@ -1,196 +0,0 @@
-# SPDX-License-Identifier: MIT
-# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-# -*- coding: utf-8 -*-
-
-"""
-Mappings and utility functions for kernel code generation.
-"""
-
-DATA_TYPE_MAP = {
-    "fp32": "float",
-    "fp16": "ck_tile::half_t",
-    "bf16": "ck_tile::bf16_t",
-    "int8": "ck_tile::int8_t",
-    "fp8": "ck_tile::fp8_t",
-    "bf8": "ck_tile::bf8_t",
-    "int4": "ck_tile::pk_int4_t",
-    "int32": "ck_tile::int32_t",
-}
-
-LAYOUT_MAP = {
-    "r": "ck_tile::tensor_layout::gemm::RowMajor",
-    "c": "ck_tile::tensor_layout::gemm::ColumnMajor",
-}
-
-
-# TODO THIS IS NOT SUPPORTED FOR MULTI D AS OF NOW
-# DEFAULT_EPILOGUE = """
-#             using GemmEpilogue = ck_tile::DefaultGemm2DEpilogue<
-#                                 ck_tile::DefaultGemm2DEpilogueProblem<ADataType,
-#                                                                       BDataType,
-#                                                                       AccDataType,
-#                                                                       CDataType,
-#                                                                       CLayout,
-#                                                                       kPadM,
-#                                                                       kPadN,
-#                                                                       WarpTileM,
-#                                                                       WarpTileN,
-#                                                                       WarpTileK,
-#                                                                       UniversalGemmProblem::TransposeC,
-#                                                                       true,
-#                                                                       memory_operation>>;
-# """
-
-CSHUFFLE_EPILOGUE = """
-            using GemmEpilogue = ck_tile::CShuffleEpilogue<
-                            ck_tile::CShuffleEpilogueProblem<ADataType,
-                                                             BDataType,
-                                                             DsDataType,
-                                                             AccDataType,
-                                                             EDataType,
-                                                             DsLayout,
-                                                             ELayout,
-                                                             CDEElementWise,
-                                                             TilePartitioner::MPerBlock,
-                                                             TilePartitioner::NPerBlock,
-                                                             WarpM,
-                                                             WarpN,
-                                                             WarpTileM,
-                                                             WarpTileN,
-                                                             WarpTileK,
-                                                             UniversalGemmProblem::TransposeC,
-                                                             memory_operation>>;
-"""
-
-PIPELINE_MAP = {
-    "mem": ["ck_tile::BaseGemmPipelineAgBgCrMem", "ck_tile::GemmPipelineAgBgCrMem"],
-    "compv3": [
-        "ck_tile::BaseGemmPipelineAgBgCrCompV3",
-        "ck_tile::GemmPipelineAgBgCrCompV3",
-    ],
-    "compv4": [
-        "ck_tile::BaseGemmPipelineAgBgCrCompV4",
-        "ck_tile::GemmPipelineAgBgCrCompV4",
-    ],
-}
-
-SCHEDULER_MAP = {
-    "interwave": "ck_tile::GemmPipelineScheduler::Interwave",
-    "intrawave": "ck_tile::GemmPipelineScheduler::Intrawave",
-}
-
-# EPILOGUE_MAP = {"default": DEFAULT_EPILOGUE, "cshuffle": CSHUFFLE_EPILOGUE}
-
-EPILOGUE_MAP = {"cshuffle": CSHUFFLE_EPILOGUE}
-
-
-def BOOL_MAP(b_):
-    return {True: "true", False: "false"}[bool(b_)]
-
-
-# Can add some more supported combinations
-warp_tile_supported_combinations = {
-    "gfx90a": {
-        "fp16_fp16_fp16": [
-            [32, 32, 8],
-            [16, 16, 16],
-            [32, 32, 16],
-            [16, 16, 32],
-            [4, 64, 16],
-            [64, 4, 16],
-        ],
-        "bf16_bf16_bf16": [
-            [32, 32, 8],
-            [16, 16, 16],
-            [32, 32, 16],
-            [16, 16, 32],
-            [4, 64, 16],
-            [64, 4, 16],
-        ],
-        "fp8_fp8_fp16": [[32, 32, 16], [32, 32, 32]],
-        "bf8_bf8_fp16": [[32, 32, 16], [32, 32, 32]],
-    },
-    "gfx942": {
-        "fp16_fp16_fp16": [
-            [32, 32, 8],
-            [16, 16, 16],
-            [32, 32, 16],
-            [16, 16, 32],
-            [4, 64, 16],
-            [64, 4, 16],
-        ],
-        "bf16_bf16_bf16": [
-            [32, 32, 8],
-            [16, 16, 16],
-            [32, 32, 16],
-            [16, 16, 32],
-            [4, 64, 16],
-            [64, 4, 16],
-        ],
-        "fp8_fp8_fp16": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
-        "bf8_bf8_fp16": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32]],
-        "int8_int8_int32": [[16, 16, 32], [32, 32, 16]],
-    },
-    "gfx950": {
-        "fp16_fp16_fp16": [
-            [32, 32, 8],
-            [16, 16, 16],
-            [32, 32, 16],
-            [16, 16, 32],
-            [4, 64, 16],
-            [64, 4, 16],
-        ],
-        "bf16_bf16_bf16": [
-            [32, 32, 8],
-            [16, 16, 16],
-            [32, 32, 16],
-            [16, 16, 32],
-            [4, 64, 16],
-            [64, 4, 16],
-        ],
-        "fp8_fp8_fp16": [
-            [32, 32, 16],
-            [32, 32, 32],
-            [16, 16, 32],
-            [16, 16, 64],
-            [16, 16, 128],
-            [32, 32, 64],
-        ],
-        "bf8_bf8_fp16": [
-            [32, 32, 16],
-            [32, 32, 32],
-            [16, 16, 64],
-            [16, 16, 32],
-            [16, 16, 128],
-            [32, 32, 64],
-        ],
-    },
-}
-
-# Remove some unsupported combinations
-trait_unsupported_combinations = {
-    ("compv3", "cshuffle", "interwave"),
-    ("compv3", "default", "interwave"),
-    ("compv4", "cshuffle", "interwave"),
-    ("compv4", "default", "interwave"),
-}
-
-
-ELEMENT_SIZE_MAP = {
-    "fp16": 2,
-    "bf16": 2,
-    "int8": 1,
-    "fp8": 1,
-    "bf8": 1,
-    "int4": 0.5,
-    "int32": 4,
-}
-
-
-def element_size(data_type: str) -> float:
-    """Calculate the size (in bytes) of a single element for given data type."""
-    data_type = data_type.lower()
-    if data_type not in ELEMENT_SIZE_MAP:
-        raise ValueError(f"Unsupported data type: {data_type}")
-    return ELEMENT_SIZE_MAP[data_type]
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_common.hpp
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_common.hpp
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/core/numeric/integer.hpp"
+#include "ck_tile/core/numeric/pk_int4.hpp"
+
+// TODO: This can be moved to commons.
+// DataTypeTraits<T>::name is the short string tag associated with element type T.
+// The primary template is only declared, never defined, so using DataTypeTraits with a
+// type that has no specialization below is a compile-time error.
+template <typename T>
+struct DataTypeTraits;
+
+template <>
+struct DataTypeTraits<float>
+{
+    static constexpr const char* name = "fp32";
+};
+
+template <>
+struct DataTypeTraits<double>
+{
+    static constexpr const char* name = "fp64";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::half_t>
+{
+    static constexpr const char* name = "fp16";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::bf16_t>
+{
+    static constexpr const char* name = "bf16";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::fp8_t>
+{
+    static constexpr const char* name = "fp8";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::bf8_t>
+{
+    static constexpr const char* name = "bf8";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::int8_t>
+{
+    static constexpr const char* name = "int8";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::int32_t>
+{
+    static constexpr const char* name = "int32";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::pk_int4_t>
+{
+    // NOTE(review): the other tags are short names such as "int8"; confirm that the full
+    // type name "pk_int4_t" is the tag consumers of this string expect.
+    static constexpr const char* name = "pk_int4_t";
+};
+
+// Compile-time layout test: returns ck_tile::bool_constant<true> when Layout is
+// ck_tile::tensor_layout::gemm::RowMajor and ck_tile::bool_constant<false> otherwise.
+// The parameter is unnamed and unused; only its type takes part in the check.
+template <typename Layout>
+constexpr auto is_row_major(Layout)
+{
+    using RowMajorLayout = ck_tile::tensor_layout::gemm::RowMajor;
+    using IsRowMajor     = ck_tile::bool_constant<std::is_same_v<Layout, RowMajorLayout>>;
+    return IsRowMajor{};
+}
+
+// Traits that identify one kernel configuration for the dispatcher.
+//
+// Each member carries a default member initializer, so a default-constructed KernelTraits
+// holds the compv3 pipeline, the intrawave scheduler, the cshuffle epilogue, no padding in
+// m/n/k, and persistent == false. Keeping each default next to its member means a newly
+// added member cannot be left out of a separate constructor initializer list.
+struct KernelTraits
+{
+    std::string pipeline  = "compv3";    // compv3, compv4, mem
+    std::string scheduler = "intrawave"; // intrawave, interwave
+    std::string epilogue  = "cshuffle";  // cshuffle, default
+    bool pad_m            = false;
+    bool pad_n            = false;
+    bool pad_k            = false;
+    bool persistent       = false;
+};
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_config.py
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_config.py
@@ -1,250 +0,0 @@
-# SPDX-License-Identifier: MIT
-# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-# -*- coding: utf-8 -*-
-
-"""
-Handles loading, parsing, and validation of JSON and Argument configuration parameters.
-"""
-
-from pathlib import Path
-from dataclasses import dataclass
-from typing import List, Optional, Union, Type
-import json
-
-
-@dataclass
-class EnumConfigParam:
-    """Represents an enumeration-type configuration parameter"""
-
-    values: List[Union[int, str, bool]]
-
-
-@dataclass
-class RangeConfigParam:
-    """Represents a numeric range-type configuration parameter"""
-
-    min: int
-    max: int
-    step: int
-    exclude: Optional[List[int]]
-
-    def generate_candidates(self) -> List[int]:
-        """Generates valid candidates after applying range constraints"""
-
-        if self.min > self.max:
-            raise ValueError(f"Invalid range: min({self.min}) > max({self.max})")
-        if self.step <= 0:
-            raise ValueError(f"Step must be positive, got {self.step}")
-
-        candidates = list(range(self.min, self.max + 1, self.step))
-
-        if hasattr(self, "exclude") and self.exclude:
-            if not isinstance(self.exclude, list):
-                raise TypeError("exclude must be list type")
-            exclude_set = set(self.exclude)
-            candidates = [x for x in candidates if x not in exclude_set]
-
-        if not candidates:
-            raise ValueError(
-                f"No valid candidates for range [{self.min}-{self.max}] "
-                f"with step {self.step} and excludes {self.exclude}"
-            )
-
-        return candidates
-
-
-@dataclass
-class DataType:
-    """Configuration class for data type parameter."""
-
-    a_datatype: str
-    b_datatype: str
-    e_datatype: str
-    d0_datatype: str
-    d1_datatype: str
-    ds_datatype: List[str]
-
-
-@dataclass
-class Layout:
-    """Configuration class for Layout parameter."""
-
-    a_layout: str
-    b_layout: str
-    e_layout: str
-    d0_layout: str
-    d1_layout: str
-    ds_layout: List[str]
-
-
-@dataclass
-class ArgumentConfig:
-    """Configuration class for Argument parameter."""
-
-    datatypes: DataType
-    layouts: Layout
-    function_name: str
-
-    @classmethod
-    def from_args(
-        cls: Type["ArgumentConfig"],
-        datatype: str,
-        layout: str,
-        elementwise_function: str,
-    ) -> "ArgumentConfig":
-        """configuration loader with validation controls"""
-
-        datatypes = DataType(
-            a_datatype=datatype,
-            b_datatype=datatype,
-            e_datatype=datatype,
-            d0_datatype=datatype,
-            d1_datatype=datatype,
-            ds_datatype=[datatype, datatype],
-        )
-
-        layout_parts = layout.lower()
-        assert len(layout_parts) == 4, (
-            f"Invalid layout string: {layout} (must be 4 characters like 'rcrr' where r stands for row major and c stands for column major)"
-        )
-        assert layout_parts[0] in ("r", "c"), (
-            f"Invalid matrix_a layout: {layout_parts[0]} (must be 'r' for row major or or 'c' for column major)"
-        )
-        assert layout_parts[1] in ("r", "c"), (
-            f"Invalid matrix_b layout: {layout_parts[1]} (must be 'r' for row major or or 'c' for column major)"
-        )
-        assert layout_parts[2] == "r", (
-            f"Invalid matrix_e layout: {layout_parts[2]} (must be 'r' only as currently we are supporting only row major)"
-        )
-        assert layout_parts[3] == "r", (
-            f"Invalid D dimension layout: {layout_parts[3]} (must be 'r' only as currently we are supporting only row major)"
-        )
-
-        layouts = Layout(
-            a_layout=layout[0],
-            b_layout=layout[1],
-            e_layout=layout[2],
-            d0_layout=layout[3],
-            d1_layout=layout[3],
-            ds_layout=[layout[3], layout[3]],
-        )
-        # Elementwise function name validation
-        valid_functions = ["mul", "add", "passthrough"]
-        if elementwise_function not in valid_functions:
-            raise ValueError(
-                f"Invalid elementwise function: {elementwise_function}. "
-                f"Valid options are: {', '.join(valid_functions)}"
-            )
-
-        # Set the function name based on the elementwise function
-        if elementwise_function == "mul":
-            function_name = "MultiDMultiply"
-        elif elementwise_function == "add":
-            function_name = "MultiDAdd"
-        elif elementwise_function == "passthrough":
-            function_name = "PassThrough"  # TODO Change this
-
-        return cls(datatypes=datatypes, layouts=layouts, function_name=function_name)
-
-
-@dataclass
-class TileConfig:
-    """Configuration class for tile parameter."""
-
-    tile_m: Union[EnumConfigParam, RangeConfigParam]
-    tile_n: Union[EnumConfigParam, RangeConfigParam]
-    tile_k: Union[EnumConfigParam, RangeConfigParam]
-
-    warp_m: Union[EnumConfigParam, RangeConfigParam]
-    warp_n: Union[EnumConfigParam, RangeConfigParam]
-    warp_k: Union[EnumConfigParam, RangeConfigParam]
-
-    warp_tile_m: Union[EnumConfigParam, RangeConfigParam]
-    warp_tile_n: Union[EnumConfigParam, RangeConfigParam]
-    warp_tile_k: Union[EnumConfigParam, RangeConfigParam]
-
-
-@dataclass
-class TraitConfig:
-    """Configuration class for kernel traits."""
-
-    pipeline: EnumConfigParam
-    scheduler: EnumConfigParam
-    epilogue: EnumConfigParam
-    pad_m: EnumConfigParam
-    pad_n: EnumConfigParam
-    pad_k: EnumConfigParam
-
-
-@dataclass
-class JsonConfig:
-    """Configuration class for JSON parameter."""
-
-    tile_config: TileConfig
-    trait_config: TraitConfig
-
-    @classmethod
-    def from_json(cls: Type["JsonConfig"], filepath: str) -> "JsonConfig":
-        """JSON configuration loader with validation controls"""
-        config_path = Path(filepath)
-
-        try:
-            if not config_path.exists():
-                raise FileNotFoundError(f"Config file {filepath} not found")
-
-            with config_path.open("r") as f:
-                config_dict = json.load(f)
-
-            # Parse tile config
-            def create_param(param_dict):
-                if "values" in param_dict:
-                    return EnumConfigParam(values=param_dict["values"])
-                else:
-                    return RangeConfigParam(
-                        min=param_dict["min"],
-                        max=param_dict["max"],
-                        step=param_dict["step"],
-                        exclude=param_dict.get("exclude", []),
-                    )
-
-            tile_config = TileConfig(
-                tile_m=create_param(config_dict["tile_config"]["tile_m"]),
-                tile_n=create_param(config_dict["tile_config"]["tile_n"]),
-                tile_k=create_param(config_dict["tile_config"]["tile_k"]),
-                warp_m=create_param(config_dict["tile_config"]["warp_m"]),
-                warp_n=create_param(config_dict["tile_config"]["warp_n"]),
-                warp_k=create_param(config_dict["tile_config"]["warp_k"]),
-                warp_tile_m=create_param(config_dict["tile_config"]["warp_tile_m"]),
-                warp_tile_n=create_param(config_dict["tile_config"]["warp_tile_n"]),
-                warp_tile_k=create_param(config_dict["tile_config"]["warp_tile_k"]),
-            )
-
-            # Parse trait config
-            trait_config = TraitConfig(
-                pipeline=EnumConfigParam(
-                    values=config_dict["trait_config"]["pipeline"]["values"]
-                ),
-                scheduler=EnumConfigParam(
-                    values=config_dict["trait_config"]["scheduler"]["values"]
-                ),
-                epilogue=EnumConfigParam(
-                    values=config_dict["trait_config"]["epilogue"]["values"]
-                ),
-                pad_m=EnumConfigParam(
-                    values=config_dict["trait_config"]["pad_m"]["values"]
-                ),
-                pad_n=EnumConfigParam(
-                    values=config_dict["trait_config"]["pad_n"]["values"]
-                ),
-                pad_k=EnumConfigParam(
-                    values=config_dict["trait_config"]["pad_k"]["values"]
-                ),
-            )
-
-            return cls(tile_config=tile_config, trait_config=trait_config)
-
-        except json.JSONDecodeError as e:
-            raise ValueError(f"Invalid JSON format: {str(e)}")
-        except KeyError as e:
-            raise KeyError(f"Missing required configuration field: {str(e)}")
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_host_api.hpp
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_host_api.hpp
@@ -1,164 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <cstring>
-#include <string>
-#include <tuple>
-
-#include "ck_tile/host.hpp"
-#include "gemm_multi_d_dispatcher.hpp"
-#include "gemm_multi_d_common.hpp"
-
-template <typename T>
-struct DataTypeTraits;
-
-template <>
-struct DataTypeTraits<float>
-{
-    static constexpr const char* name = "fp32";
-};
-
-template <>
-struct DataTypeTraits<double>
-{
-    static constexpr const char* name = "fp64";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::half_t>
-{
-    static constexpr const char* name = "fp16";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::bf16_t>
-{
-    static constexpr const char* name = "bf16";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::fp8_t>
-{
-    static constexpr const char* name = "fp8";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::bf8_t>
-{
-    static constexpr const char* name = "bf8";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::int8_t>
-{
-    static constexpr const char* name = "int8";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::int32_t>
-{
-    static constexpr const char* name = "int32";
-};
-
-template <>
-struct DataTypeTraits<ck_tile::pk_int4_t>
-{
-    static constexpr const char* name = "pk_int4_t";
-};
-
-template <typename Layout>
-static constexpr inline auto is_row_major(Layout layout_)
-{
-    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
-                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
-}
-
-inline auto create_args(int argc, char* argv[])
-{
-    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
-        .insert("n", "4096", "The value for n dimension. Default is 4096.")
-        .insert("k", "2048", "The value for k dimension. Default is 2048.")
-        .insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
-        .insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
-        .insert("stride_ds", "0", "The stride value for tensor Ds  Default is 0.")
-        .insert("stride_e", "0", "The stride value for tensor E  Default is 0.")
-        .insert("split_k", "1", "The split value for k dimension. Default is 1.")
-        .insert("verify",
-                "1",
-                "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
-                "for validation on GPU. Default is 1, validation on CPU, as validation on GPU is "
-                "not supported.")
-        .insert("log",
-                "false",
-                "Wether output kernel instance information or not. Possible values are true or "
-                "false. Default is false")
-        .insert("warmup",
-                "50",
-                "The number of iterations before benchmarking the kernel. Default is 50.")
-        .insert("repeat",
-                "100",
-                "The number of iterations for benchmarking the kernel. Default is 100.")
-        .insert("timer",
-                "true",
-                "Indicates whether the timer is a GPU timer. Possible values are true or false. "
-                "Default is true.")
-        .insert("init",
-                "0",
-                "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
-                "for constant(1). Default is 0, random.")
-        .insert("flush_cache",
-                "false",
-                "To flush cache, possible values are true or false. "
-                "Default is false.")
-        .insert("rotating_count", "5", "number of iterations to rotate the cache. default is 5.")
-        .insert("metric",
-                "0",
-                "Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
-                "tflops, or 2 for bandwidth. Default is 0, latency.")
-        .insert("csv_filename",
-                "gemm_multi_d_kernel",
-                "The filename of benchmark result. Default is set to gemm_multi_d_kernel.")
-        .insert(
-            "pipeline",
-            "compv3",
-            "The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.")
-        .insert("scheduler",
-                "intrawave",
-                "The type of pipeline. Possible values are compv3, compv4 or mem. Default is "
-                "compv3.")
-        .insert(
-            "epilogue",
-            "cshuffle",
-            "The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.")
-        .insert("pad_m",
-                "false",
-                "Whether pad or not in m direction. Possible values are true or false. Default is "
-                "false.")
-        .insert("pad_n",
-                "false",
-                "Whether pad or not in n direction. Possible values are true or false. Default is "
-                "false.")
-        .insert("pad_k",
-                "false",
-                "Whether pad or not in k direction. Possible values are true or false. Default is "
-                "false.");
-
-    bool result = arg_parser.parse(argc, argv);
-    return std::make_tuple(result, arg_parser);
-}
-
-auto get_kernel_func_by_trait(const ck_tile::ArgParser& arg_parser)
-{
-    KernelTraits trait;
-    trait.pipeline  = arg_parser.get_str("pipeline");
-    trait.scheduler = arg_parser.get_str("scheduler");
-    trait.epilogue  = arg_parser.get_str("epilogue");
-    trait.pad_m     = arg_parser.get_bool("pad_m");
-    trait.pad_n     = arg_parser.get_bool("pad_n");
-    trait.pad_k     = arg_parser.get_bool("pad_k");
-
-    return GemmMultiDDispatcher::dispatch(trait);
-}
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_instance_builder.py
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_instance_builder.py
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_profiler.hpp
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_profiler.hpp
@@ -9,7 +9,7 @@

 #include "ck_tile/host/device_prop.hpp"
 #include "ck_tile/ops/gemm.hpp"
-#include "benchmark_gemm_multi_d.hpp"
+#include "gemm_multi_d_benchmark.hpp"

 class GemmMultiDProfiler
 {
@@ -20,6 +20,25 @@ class GemmMultiDProfiler
        return instance;
    }

+    // Single-kernel convenience overload. Adapts a callable that returns only the measured
+    // time into the (kernel name, time) callable form taken by the vector overload, then
+    // forwards to that overload with a one-element vector.
+    void benchmark(GemmMultiDProblem& gemm_multi_d_problem,
+                   std::function<float(const ck_tile::GemmMultiDHostArgs<DsDataType::size()>&,
+                                       const ck_tile::stream_config&)> kernel_func)
+    {
+        using HostArgs = ck_tile::GemmMultiDHostArgs<DsDataType::size()>;
+        using NamedRun =
+            std::function<std::tuple<std::string, float>(HostArgs&, const ck_tile::stream_config&)>;
+
+        std::vector<NamedRun> callables;
+
+        // Pair the timing with KERNEL_NAME (defined outside this block) so the result
+        // carries a name like every other entry handled by the vector overload.
+        callables.emplace_back(
+            [kernel_func](HostArgs& host_args, const ck_tile::stream_config& stream_cfg) {
+                const float elapsed = kernel_func(host_args, stream_cfg);
+                return std::make_tuple(std::string(KERNEL_NAME), elapsed);
+            });
+
+        benchmark(gemm_multi_d_problem, callables);
+    }
+
    void benchmark(
        GemmMultiDProblem& gemm_multi_d_problem,
        std::vector<std::function<std::tuple<std::string, float>(
@@ -30,7 +49,7 @@ class GemmMultiDProfiler
        const BLayout layout_b   = BLayout{};
        const D0Layout layout_d0 = D0Layout{};
        const D1Layout layout_d1 = D1Layout{};
-        const ELayout layout_e   = ELayout{};
+        const CLayout layout_c   = CLayout{};

        gemm_multi_d_problem.stride_a_ = ck_tile::get_default_stride(gemm_multi_d_problem.m_,
                                                                     gemm_multi_d_problem.k_,
@@ -50,10 +69,10 @@ class GemmMultiDProfiler
                                        gemm_multi_d_problem.n_,
                                        gemm_multi_d_problem.stride_d1_,
                                        is_row_major(layout_d1));
-        gemm_multi_d_problem.stride_e_ = ck_tile::get_default_stride(gemm_multi_d_problem.m_,
+        gemm_multi_d_problem.stride_c_ = ck_tile::get_default_stride(gemm_multi_d_problem.m_,
                                                                     gemm_multi_d_problem.n_,
-                                                                     gemm_multi_d_problem.stride_e_,
-                                                                     is_row_major(layout_e));
+                                                                     gemm_multi_d_problem.stride_c_,
+                                                                     is_row_major(layout_c));

        ck_tile::HostTensor<ADataType> a_m_k(
            ck_tile::host_tensor_descriptor(gemm_multi_d_problem.m_,
@@ -75,30 +94,30 @@ class GemmMultiDProfiler
                                            gemm_multi_d_problem.n_,
                                            gemm_multi_d_problem.stride_d1_,
                                            is_row_major(layout_d1)));
-        ck_tile::HostTensor<EDataType> e_m_n_device_result(
+        ck_tile::HostTensor<CDataType> c_m_n_dev_result(
            ck_tile::host_tensor_descriptor(gemm_multi_d_problem.m_,
                                            gemm_multi_d_problem.n_,
-                                            gemm_multi_d_problem.stride_e_,
-                                            is_row_major(layout_e)));
+                                            gemm_multi_d_problem.stride_c_,
+                                            is_row_major(layout_c)));

        ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
        ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
        ck_tile::FillUniformDistribution<D0DataType>{-1.f, 1.f}(d0_m_n);
-        ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(d1_m_n);
+        ck_tile::FillUniformDistribution<D1DataType>{-1.f, 1.f}(d1_m_n);

        ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
        ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
        ck_tile::DeviceMem d0_m_n_dev_buf(d0_m_n.get_element_space_size_in_bytes());
        ck_tile::DeviceMem d1_m_n_dev_buf(d1_m_n.get_element_space_size_in_bytes());
-        ck_tile::DeviceMem e_m_n_dev_buf(e_m_n_device_result.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());

        a_m_k_dev_buf.ToDevice(a_m_k.mData.data());
        b_k_n_dev_buf.ToDevice(b_k_n.mData.data());
        d0_m_n_dev_buf.ToDevice(d0_m_n.mData.data());
        d1_m_n_dev_buf.ToDevice(d1_m_n.mData.data());

-        e_m_n_dev_buf.SetZero();
-        e_m_n_device_result.SetZero();
+        c_m_n_dev_buf.SetZero();
+        c_m_n_dev_result.SetZero();

        std::array<const void*, DsDataType::size()> ds_ptr_buf = {d0_m_n_dev_buf.GetDeviceBuffer(),
                                                                  d1_m_n_dev_buf.GetDeviceBuffer()};
@@ -110,7 +129,7 @@ class GemmMultiDProfiler
            a_m_k_dev_buf.GetDeviceBuffer(),
            b_k_n_dev_buf.GetDeviceBuffer(),
            ds_ptr_buf,
-            e_m_n_dev_buf.GetDeviceBuffer(),
+            c_m_n_dev_buf.GetDeviceBuffer(),
            gemm_multi_d_problem.split_k_,
            gemm_multi_d_problem.m_,
            gemm_multi_d_problem.n_,
@@ -118,19 +137,19 @@ class GemmMultiDProfiler
            gemm_multi_d_problem.stride_a_,
            gemm_multi_d_problem.stride_b_,
            stridesDs,
-            gemm_multi_d_problem.stride_e_,
+            gemm_multi_d_problem.stride_c_,
        };

-        ck_tile::HostTensor<EDataType> e_m_n_host_result(
+        ck_tile::HostTensor<CDataType> c_m_n_host_result(
            ck_tile::host_tensor_descriptor(gemm_multi_d_problem.m_,
                                            gemm_multi_d_problem.n_,
-                                            gemm_multi_d_problem.stride_e_,
-                                            is_row_major(layout_e)));
+                                            gemm_multi_d_problem.stride_c_,
+                                            is_row_major(layout_c)));

        if(setting_.verify_)
        {
            gemm_multi_d_host_reference(
-                setting_.verify_, a_m_k, b_k_n, d0_m_n, d1_m_n, e_m_n_host_result);
+                setting_.verify_, a_m_k, b_k_n, d0_m_n, d1_m_n, c_m_n_host_result);
        }

        for(auto& callable : callables)
@@ -139,54 +158,58 @@ class GemmMultiDProfiler
                callable(gemm_multi_d_args,
                         ck_tile::stream_config{
                             nullptr, true, setting_.log_, setting_.n_warmup_, setting_.n_repeat_});
-
-            auto [kernel_name, execution_time] = kernel_run_result;
-
            process_result(gemm_multi_d_problem,
-                           e_m_n_dev_buf,
-                           e_m_n_host_result,
-                           e_m_n_device_result,
+                           c_m_n_dev_buf,
+                           c_m_n_host_result,
+                           c_m_n_dev_result,
                           kernel_run_result);
        }
    }

    void process_result(const GemmMultiDProblem& gemm_multi_d_problem,
-                        ck_tile::DeviceMem& e_m_n_dev_buf,
-                        ck_tile::HostTensor<EDataType>& e_m_n_host_result,
-                        ck_tile::HostTensor<EDataType>& e_m_n_dev_result,
+                        ck_tile::DeviceMem& c_m_n_dev_buf,
+                        ck_tile::HostTensor<CDataType>& c_m_n_host_result,
+                        ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
                        const std::tuple<std::string, float>& kernel_run_result)
    {
        auto [name, avg_time] = kernel_run_result;

        KernelInstance kernel_instance{name, gemm_multi_d_problem, {-1.0f, -1.0f, -1.0f}};

-        static constexpr ck_tile::index_t NumDTensor = DsDataType::size();
-        std::size_t flop = 0, num_byte = 0;
-        flop += std::size_t(2) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_ *
-                gemm_multi_d_problem.k_;
-        ck_tile::static_for<0, NumDTensor, 1>{}([&](auto i) {
+        // compute performance metric
+        std::size_t flop = std::size_t(2) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_ *
+                           gemm_multi_d_problem.k_;
+        std::size_t num_byte =
+            sizeof(ADataType) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.k_ +
+            sizeof(BDataType) * gemm_multi_d_problem.n_ * gemm_multi_d_problem.k_ +
+            sizeof(CDataType) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_;
+
+        // Per-D-tensor contribution: every type in DsDataType adds m_ * n_ elements.
+        ck_tile::static_for<0, DsDataType::size(), 1>{}([&](auto i) {
            // Bytes attributed to D tensor i: its element size times m_ * n_.
            num_byte += sizeof(ck_tile::remove_cvref_t<std::tuple_element_t<i, DsDataType>>) *
                        gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_;
            // NOTE(review): flop is scaled by sizeof(D_i) as well, so this term grows with the
            // element size in bytes rather than with an operation count -- confirm intended.
            flop += sizeof(ck_tile::remove_cvref_t<std::tuple_element_t<i, DsDataType>>) *
                    gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_;
        });
-        num_byte += sizeof(ADataType) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.k_ +
-                    sizeof(BDataType) * gemm_multi_d_problem.k_ * gemm_multi_d_problem.n_ +
-                    sizeof(EDataType) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_;

+        // update
        kernel_instance.perf_result_.latency_   = avg_time;
        kernel_instance.perf_result_.tflops_    = static_cast<float>(flop) / 1.E9 / avg_time;
        kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time;

-        if(setting_.log_ > 0)
+        if(setting_.log_ > 0 && !setting_.json_output_)
        {
            std::cout << kernel_instance << std::endl;
        }

-        e_m_n_dev_buf.FromDevice(e_m_n_dev_result.data());
+        // verify result
+        c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
        bool verified_correct =
-            !setting_.verify_ ||
-            compare(name, gemm_multi_d_problem.k_, e_m_n_dev_result, e_m_n_host_result);
+            !setting_.verify_ || compare(name,
+                                         gemm_multi_d_problem.k_,
+                                         1, // Multi d currently supports only k_batch  = 1
+                                         c_m_n_dev_result,
+                                         c_m_n_host_result);

        if(verified_correct)
        {
@@ -197,8 +220,9 @@ class GemmMultiDProfiler
            std::cout << "Verification failed, skip kernel: " << name << std::endl;
        }

-        e_m_n_dev_buf.SetZero();
-        e_m_n_dev_result.SetZero();
+        // clear tensor
+        c_m_n_dev_buf.SetZero();
+        c_m_n_dev_result.SetZero();
    }

    KernelInstance select_best_instance(Metric metric)
@@ -213,10 +237,18 @@ class GemmMultiDProfiler
                                                         b.perf_result_, a.perf_result_, metric);
                                                 });

-        std::cout << "**********************************" << std::endl;
-        std::cout << "According to given metrics: " << get_metric_name(metric) << "\n"
-                  << "The best kernel instance is: " << kernel_instance << std::endl;
-        std::cout << "**********************************" << std::endl;
+        if(setting_.json_output_)
+        {
+            // Output clean JSON only
+            std::cout << kernel_instance << std::endl;
+        }
+        else
+        {
+            std::cout << "**********************************" << std::endl;
+            std::cout << "According to given metrics: " << get_metric_name(metric) << "\n"
+                      << "Current kernel performance is: " << kernel_instance << std::endl;
+            std::cout << "**********************************" << std::endl;
+        }

        if(!setting_.csv_filename_.empty())
        {
@@ -244,16 +276,13 @@ class GemmMultiDProfiler
                file << get_rocm_version() << "," << ck_tile::get_device_name() << ","
                     << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << ","
                     << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << ","
-                     << problem.stride_d0_ << "," << problem.stride_d1_ << "," << problem.stride_e_
-                     << "," << problem.dtype_a_ << "," << problem.dtype_b_ << ","
-                     << problem.dtype_d0_ << "," << problem.dtype_d1_ << "," << problem.dtype_acc_
-                     << "," << problem.dtype_e_ << "," << problem.layout_a_ << ","
-                     << problem.layout_b_ << "," << problem.layout_d0_ << "," << problem.layout_d1_
-                     << "," << problem.layout_e_ << "," << "," << name << "," << std::fixed
-                     << std::setprecision(4) << perf.latency_ << "," << std::fixed
-                     << std::setprecision(4) << perf.tflops_ << "," << std::fixed
-                     << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric)
-                     << "\n";
+                     << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_
+                     << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << ","
+                     << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_
+                     << "," << name << "," << std::fixed << std::setprecision(4) << perf.latency_
+                     << "," << std::fixed << std::setprecision(4) << perf.tflops_ << ","
+                     << std::fixed << std::setprecision(4) << perf.bandwidth_ << ","
+                     << get_metric_name(metric) << "\n";

                if(!file)
                {
--- a/tile_engine/ops/gemm_preshuffle/CMakeLists.txt
+++ b/tile_engine/ops/gemm_preshuffle/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(GEMM_PRESHUFFLE_DATATYPE "fp16;fp8" CACHE STRING "List of datatypes for GEMM Preshuffle (semicolon-separated)")
+set(GEMM_PRESHUFFLE_DATATYPE "fp16;fp8;bf16;bf8" CACHE STRING "List of datatypes for GEMM Preshuffle (semicolon-separated)")
 set(GEMM_PRESHUFFLE_LAYOUT "rcr" CACHE STRING "List of layout for GEMM Preshuffle (semicolon-separated)")
 set(GEMM_PRESHUFFLE_CONFIG_FILE "" CACHE STRING "Custom config file name (without path, must be in configs/ folder)")
 option(ENABLE_CCACHE_GEMM_PRESHUFFLE "Enable ccache for GEMM Preshuffle ops compilation" OFF)
@@ -122,15 +122,15 @@ function(build_individual_gemm_preshuffle_targets datatype layout)
    if(DEFINED ENV{GEMM_PRESHUFFLE_CONFIG_FILE} AND NOT "$ENV{GEMM_PRESHUFFLE_CONFIG_FILE}" STREQUAL "")
        # Selection order for the JSON config: environment variable, then the
        # GEMM_PRESHUFFLE_CONFIG_FILE CMake variable, then default_config.json.
        # Every candidate is resolved inside the configs/ folder of this list file's directory.
        # A non-empty environment variable is checked first, so it wins over the CMake variable.
        set(config_filename "$ENV{GEMM_PRESHUFFLE_CONFIG_FILE}")
        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${config_filename}")
-        message(STATUS "  Using config from environment variable: ${config_filename}")
+        message(VERBOSE "  Using config from environment variable: ${config_filename}")
    elseif(NOT "${GEMM_PRESHUFFLE_CONFIG_FILE}" STREQUAL "")
        # Use CMake variable if set
        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${GEMM_PRESHUFFLE_CONFIG_FILE}")
-        message(STATUS "  Using custom config: ${GEMM_PRESHUFFLE_CONFIG_FILE}")
+        message(VERBOSE "  Using custom config: ${GEMM_PRESHUFFLE_CONFIG_FILE}")
    else()
        # Use default config for all layouts
        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json")
-        message(STATUS "  Using default config for layout ${layout}")
+        message(VERBOSE "  Using default config for layout ${layout}")
    endif()
    
    # Check if config file exists
@@ -151,18 +151,18 @@ function(build_individual_gemm_preshuffle_targets datatype layout)
    endif()
    
    # Generate individual kernel files using parallel version
-    message(STATUS "Generating individual kernels for ${datatype} ${layout} using ${num_workers} workers...")
-    message(STATUS "  Working path: ${working_path}")
-    message(STATUS "  Config file: ${json_blob}")
-    message(STATUS "  Python executable: ${Python3_EXECUTABLE}")
-    message(STATUS "  Script path: ${CMAKE_CURRENT_LIST_DIR}/gemm_preshuffle_instance_builder.py")
+    message(VERBOSE "Generating individual kernels for ${datatype} ${layout} using ${num_workers} workers...")
+    message(VERBOSE "  Working path: ${working_path}")
+    message(VERBOSE "  Config file: ${json_blob}")
+    message(VERBOSE "  Python executable: ${Python3_EXECUTABLE}")
+    message(VERBOSE "  Script path: ${CMAKE_CURRENT_LIST_DIR}/gemm_preshuffle_instance_builder.py")
    
    # Create working directory first
    file(MAKE_DIRECTORY ${working_path})
    
    # First, just list the kernels (fast operation)
-    message(STATUS "  Listing kernel configurations...")
-    message(STATUS "  GPU Targets: ${GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL}")
+    message(VERBOSE "  Listing kernel configurations...")
+    message(VERBOSE "  GPU Targets: ${GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL}")
    execute_process(
        COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_preshuffle_instance_builder.py
                --working_path ${working_path}
@@ -185,7 +185,7 @@ function(build_individual_gemm_preshuffle_targets datatype layout)
    if(EXISTS ${working_path}/gemm_preshuffle_kernel_count.txt)
        # Read the kernel count from the working directory
        # (presumably written by the listing step above -- verify against the builder script).
        file(READ ${working_path}/gemm_preshuffle_kernel_count.txt kernel_count)
        # Drop surrounding whitespace so the value prints cleanly in the message below.
        string(STRIP "${kernel_count}" kernel_count)
-        message(STATUS "  Found ${kernel_count} kernel configurations")
+        message(VERBOSE "  Found ${kernel_count} kernel configurations")
    else()
        # The count file is mandatory: configuration stops here when it is missing.
        message(FATAL_ERROR "Kernel count file not found")
    endif()
@@ -209,10 +209,10 @@ function(build_individual_gemm_preshuffle_targets datatype layout)
 endfunction()

 # Main build logic - Only individual builds supported
-message(STATUS "=== Starting Tile Engine GEMM Preshuffle Configuration ===")
-message(STATUS "GEMM_PRESHUFFLE_DATATYPE: ${GEMM_PRESHUFFLE_DATATYPE}")
-message(STATUS "GEMM_PRESHUFFLE_LAYOUT: ${GEMM_PRESHUFFLE_LAYOUT}")
-message(STATUS "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
+message(VERBOSE "=== Starting Tile Engine GEMM Preshuffle Configuration ===")
+message(VERBOSE "GEMM_PRESHUFFLE_DATATYPE: ${GEMM_PRESHUFFLE_DATATYPE}")
+message(VERBOSE "GEMM_PRESHUFFLE_LAYOUT: ${GEMM_PRESHUFFLE_LAYOUT}")
+message(VERBOSE "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")

 # Filter GPU targets to only gfx90a, gfx942, and gfx950
 set(GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL "")
@@ -221,7 +221,7 @@ set(DESIRED_TARGETS "gfx90a;gfx942;gfx950")
 foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
    # Keep a target only when it also appears in DESIRED_TARGETS; kept targets are
    # appended in the order they occur in SUPPORTED_GPU_TARGETS.
    if(target IN_LIST DESIRED_TARGETS)
        list(APPEND GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL ${target})
-        message(STATUS "  Adding GPU target: ${target}")
+        message(VERBOSE "  Adding GPU target: ${target}")
    endif()
 endforeach()

@@ -229,7 +229,7 @@ endforeach()
 if(NOT GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL)
    # The filtered target list is empty, so no GEMM Preshuffle targets are configured.
    # Name the Preshuffle component explicitly so the warning is not mistaken for the plain GEMM op.
    message(WARNING "Skipping Tile Engine GEMM Preshuffle build: No supported GPU targets (gfx90a, gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
 else()
-    message(STATUS "Building individual GEMM Preshuffle targets for GPU targets: ${GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL}")
+    message(VERBOSE "Building individual GEMM Preshuffle targets for GPU targets: ${GEMM_PRESHUFFLE_GPU_TARGETS_INDIVIDUAL}")

    # Enable parallel compilation optimizations
    # Set up job pools for better parallel compilation control
@@ -244,12 +244,12 @@ else()
        find_program(CCACHE_PROGRAM ccache)
        if(CCACHE_PROGRAM)
            set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM})
-            message(STATUS "Using ccache for faster compilation")
+            message(VERBOSE "Using ccache for faster compilation")
        else()
            message(WARNING "ccache requested but not found")
        endif()
    else()
-        message(STATUS "ccache disabled for GEMM Preshuffle ops (use -DENABLE_CCACHE_GEMM_PRESHUFFLE=ON to enable)")
+        message(VERBOSE "ccache disabled for GEMM Preshuffle ops (use -DENABLE_CCACHE_GEMM_PRESHUFFLE=ON to enable)")
    endif()

    # Create master collection targets