Adding dispatcher architecture (#3300)

* WIP POC of dispatcher * Dispatcher python workflow setup. * Dispatcher cleanup and updates. Further dispatcher cleanup and updates. Build fixes Improvements and python to CK example Improvements to readme * Fixes to python paths * Cleaning up code * Improving dispatcher support for different arch Fixing typos * Fix formatting errors * Cleaning up examples * Improving codegeneration * Improving and fixing C++ examples * Adding conv functionality (fwd,bwd,bwdw) and examples. * Fixes based on feedback. * Further fixes based on feedback. * Adding stress test for autogeneration and autocorrection, and fixing preshuffle bug. * Another round of improvements based on feedback. * Trimming out unnecessary code. * Fixing the multi-D implementation. * Using gpu verification for gemms and fixing convolutions tflops calculation. * Fix counter usage issue and arch filtering per ops. * Adding changelog and other fixes. * Improve examples and resolve critical bugs. * Reduce build time for python examples. * Fixing minor bug. * Fix compilation error. * Improve installation instructions for dispatcher. * Add docker based installation instructions for dispatcher. * Fixing arch-based filtering to match tile engine. * Remove dead code and fix arch filtering. * Minor bugfix. * Updates after rebase. * Trimming code. * Fix copyright headers. * Consolidate examples, cut down code. * Minor fixes. * Improving python examples. * Update readmes. * Remove conv functionality. * Cleanup following conv removable.
2026-04-20 14:59:17 +00:00 · 2026-01-22 09:34:33 -08:00
parent 44f481a45c
commit 9e049a32a1
97 changed files with 33472 additions and 0 deletions
--- a/dispatcher/include/ck_tile/dispatcher.hpp
+++ b/dispatcher/include/ck_tile/dispatcher.hpp
@@ -0,0 +1,19 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+/// Main dispatcher header - includes all core components
+/// Use this for convenient access to the full dispatcher API
+
+#include "ck_tile/dispatcher/kernel_key.hpp"
+#include "ck_tile/dispatcher/kernel_config.hpp"
+#include "ck_tile/dispatcher/kernel_decl.hpp"
+#include "ck_tile/dispatcher/problem.hpp"
+#include "ck_tile/dispatcher/kernel_instance.hpp"
+#include "ck_tile/dispatcher/registry.hpp"
+#include "ck_tile/dispatcher/dispatcher.hpp"
+#include "ck_tile/dispatcher/arch_filter.hpp"
+#include "ck_tile/dispatcher/backends/tile_backend.hpp"
+#include "ck_tile/dispatcher/backends/generated_tile_backend.hpp"
+#include "ck_tile/dispatcher/utils.hpp"
--- a/dispatcher/include/ck_tile/dispatcher/README.md
+++ b/dispatcher/include/ck_tile/dispatcher/README.md
@@ -0,0 +1,161 @@
+# CK Tile Dispatcher - C++ Headers
+
+C++ API for the CK Tile dispatcher.
+
+> **See also:** [Main Dispatcher README](../../../../README.md) for installation and core concepts.
+
+## File Organization
+
+```
+dispatcher/
+├── dispatcher.hpp          # Main dispatcher (kernel selection)
+├── registry.hpp            # Kernel registry (storage & lookup)
+├── problem.hpp             # Problem specification
+├── kernel_key.hpp          # Kernel configuration key
+├── kernel_instance.hpp     # Kernel instance interface
+├── utils.hpp               # Utilities (timers, GPU buffers)
+│
+└── backends/               # Backend implementations
+    ├── generated_tile_backend.hpp  # CK Tile kernels (production)
+    └── tile_backend.hpp            # Tile backend base
+```
+
+## Quick Start
+
+```cpp
+#include "ck_tile/dispatcher.hpp"
+
+using namespace ck_tile::dispatcher;
+using namespace ck_tile::dispatcher::utils;
+
+int main() {
+    // 1. Build kernel key
+    KernelKeyBuilder builder = KernelKeyBuilder::fp16_rcr();
+    builder.tile_m = 128;
+    builder.tile_n = 128;
+    builder.tile_k = 32;
+    KernelKey key = builder.build();
+
+    // 2. Register kernel
+    auto kernel = create_generated_tile_kernel<...>(key, "my_kernel");
+    Registry::instance().register_kernel(kernel, Priority::High);
+
+    // 3. Run GEMM
+    Dispatcher dispatcher;
+    Problem problem(1024, 1024, 1024);
+    float time_ms = dispatcher.run(a_ptr, b_ptr, c_ptr, problem, nullptr);
+}
+```
+
+## Core Classes
+
+### KernelKey (`kernel_key.hpp`)
+
+Uniquely identifies a kernel configuration:
+
+```cpp
+KernelKeyBuilder builder;
+builder.dtype_a = DataType::FP16;
+builder.layout_a = LayoutTag::Row;
+builder.tile_m = 256;
+builder.pipeline = Pipeline::CompV4;
+KernelKey key = builder.build();
+```
+
+### Registry (`registry.hpp`)
+
+Thread-safe kernel storage:
+
+```cpp
+auto& registry = Registry::instance();
+registry.register_kernel(kernel, Priority::High);
+registry.get_kernel_count();
+registry.export_json();
+```
+
+### Dispatcher (`dispatcher.hpp`)
+
+Kernel selection and execution:
+
+```cpp
+Dispatcher dispatcher;
+
+// Strategies
+dispatcher.set_strategy(SelectionStrategy::FirstFit);
+dispatcher.set_strategy(SelectionStrategy::Heuristic);
+
+// Run
+float time = dispatcher.run(a, b, c, problem, stream);
+```
+
+### Problem (`problem.hpp`)
+
+GEMM problem specification:
+
+```cpp
+Problem problem(M, N, K);
+problem.batch_size = 4;
+problem.alpha = 1.0f;
+problem.beta = 0.0f;
+
+// Auto-inference
+auto p = Problem::from_ab(a_rows, a_cols, b_rows, b_cols, trans_a, trans_b);
+```
+
+## Utilities (`utils.hpp`)
+
+### GPU Memory
+
+```cpp
+GpuBuffer<half_t> buffer(size);
+buffer.copy_from_host(host_ptr);
+buffer.copy_to_host(host_ptr);
+buffer.zero();
+```
+
+### Timing
+
+```cpp
+GpuTimer timer;
+timer.start();
+// kernel...
+timer.stop();
+float ms = timer.elapsed_ms();
+```
+
+### Quick Helpers
+
+```cpp
+// Create FP16 RCR key
+auto key = create_fp16_rcr_key(tile_m, tile_n, tile_k, ...);
+
+// Performance
+double tflops = calculate_tflops(M, N, K, time_ms);
+
+// Validation
+auto result = validate_result(gpu_ptr, cpu_ptr, size);
+```
+
+## Backend
+
+### Generated Tile Backend
+
+```cpp
+#include "ck_tile/dispatcher/backends/generated_tile_backend.hpp"
+
+auto kernel = create_generated_tile_kernel<
+    SelectedKernel, ADataType, BDataType, CDataType, AccDataType
+>(key, name);
+```
+
+## Best Practices
+
+1. Use `Release` build for performance
+2. Register kernels at startup
+3. Use `Priority::High` for hand-tuned kernels
+4. Reuse dispatcher instances
+5. Clear registry between test runs
+
+---
+
+> **More info:** See [../../../../README.md](../../../../README.md) for full documentation.
--- a/dispatcher/include/ck_tile/dispatcher/arch_filter.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/arch_filter.hpp
@@ -0,0 +1,393 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+/**
+ * Architecture-Specific Kernel Filtering for CK Tile Dispatcher
+ *
+ * Provides GPU architecture-aware validation of kernel configurations.
+ * Uses arch_specs_generated.hpp as single source of truth (generated from arch_specs.json).
+ *
+ * Usage:
+ *   ArchFilter filter("gfx942");
+ *
+ *   // Check if a kernel configuration is valid
+ *   if (filter.is_valid(kernel_key)) {
+ *       registry.register_kernel(kernel);
+ *   }
+ *
+ *   // Get validation result with error details
+ *   auto result = filter.validate(kernel_key);
+ *   if (!result.valid) {
+ *       for (const auto& error : result.errors) {
+ *           std::cerr << error << "\n";
+ *       }
+ *   }
+ *
+ * Adding New GPU Support:
+ *   1. Edit dispatcher/codegen/arch_specs.json
+ *   2. Run: python dispatcher/codegen/generate_arch_specs.py
+ *   3. Rebuild the dispatcher
+ */
+
+#pragma once
+
+#include "ck_tile/dispatcher/kernel_key.hpp"
+#include "ck_tile/dispatcher/arch_specs_generated.hpp"
+#include <array>
+#include <string>
+#include <vector>
+#include <cstdint>
+
+namespace ck_tile {
+namespace dispatcher {
+
+// =============================================================================
+// Re-export from generated header for convenience
+// =============================================================================
+
+// Use the generated types and functions from arch_specs namespace
+using GpuArch        = arch_specs::GpuArch;
+using WarpConfig     = arch_specs::WarpConfig;
+using WarpTileConfig = std::array<int, 3>;
+
+// Re-export string conversion functions
+using arch_specs::arch_to_string;
+using arch_specs::element_size;
+using arch_specs::get_lds_capacity;
+using arch_specs::get_supported_warp_configs;
+using arch_specs::is_trait_unsupported;
+using arch_specs::string_to_arch;
+
+// =============================================================================
+// Additional Helper Functions
+// =============================================================================
+
+/// Get supported warp tile configurations for arch and data types
+/// This function wraps the generated data with runtime logic
+inline std::vector<WarpTileConfig> get_supported_warp_tiles(GpuArch arch,
+                                                            DataType dtype_a,
+                                                            DataType dtype_b,
+                                                            [[maybe_unused]] DataType dtype_c)
+{
+    // Common FP16 configurations (from arch_specs.json)
+    std::vector<WarpTileConfig> fp16_configs = {
+        {32, 32, 8}, {16, 16, 16}, {32, 32, 16}, {16, 16, 32}, {4, 64, 16}, {64, 4, 16}};
+
+    // FP8 configurations
+    std::vector<WarpTileConfig> fp8_gfx942 = {
+        {32, 32, 16}, {32, 32, 32}, {16, 16, 32}, {16, 16, 64}};
+    std::vector<WarpTileConfig> fp8_gfx950 = {
+        {32, 32, 16}, {32, 32, 32}, {16, 16, 32}, {16, 16, 64}, {16, 16, 128}, {32, 32, 64}};
+
+    // INT8 configurations
+    std::vector<WarpTileConfig> int8_configs = {{16, 16, 32}, {32, 32, 16}};
+
+    // GFX1201 only supports limited FP16
+    std::vector<WarpTileConfig> rdna4_fp16 = {{16, 16, 16}};
+
+    // Match based on architecture and data types
+    if(dtype_a == DataType::FP16 && dtype_b == DataType::FP16)
+    {
+        if(arch == GpuArch::GFX_1201)
+            return rdna4_fp16;
+        return fp16_configs;
+    }
+    if(dtype_a == DataType::BF16 && dtype_b == DataType::BF16)
+    {
+        if(arch == GpuArch::GFX_1201)
+            return {};       // Not supported on RDNA4
+        return fp16_configs; // Same as FP16
+    }
+    if(dtype_a == DataType::FP8 || dtype_a == DataType::BF8)
+    {
+        if(arch == GpuArch::GFX_950)
+            return fp8_gfx950;
+        if(arch == GpuArch::GFX_942)
+            return fp8_gfx942;
+        if(arch == GpuArch::GFX_90A)
+            return {{32, 32, 16}, {32, 32, 32}};
+    }
+    if(dtype_a == DataType::INT8 && dtype_b == DataType::INT8)
+    {
+        if(arch == GpuArch::GFX_942)
+            return int8_configs;
+    }
+
+    return {}; // Unknown combination
+}
+
+// =============================================================================
+// Validation Result
+// =============================================================================
+
+/// Result of kernel validation
+struct ValidationResult
+{
+    bool valid = true;
+    std::vector<std::string> errors;
+    std::vector<std::string> warnings;
+
+    explicit operator bool() const { return valid; }
+
+    void add_error(const std::string& msg)
+    {
+        errors.push_back(msg);
+        valid = false;
+    }
+
+    void add_warning(const std::string& msg) { warnings.push_back(msg); }
+};
+
+// =============================================================================
+// Architecture Filter
+// =============================================================================
+
+/**
+ * Architecture-specific kernel filter.
+ *
+ * Validates kernel configurations against GPU architecture constraints
+ * including warp configurations, warp tiles, LDS capacity, and traits.
+ */
+class ArchFilter
+{
+    public:
+    /**
+     * Create architecture filter.
+     * @param arch Target GPU architecture
+     * @param strict_mode If true, unknown configurations are rejected
+     */
+    explicit ArchFilter(GpuArch arch, bool strict_mode = false)
+        : arch_(arch), strict_mode_(strict_mode)
+    {
+    }
+
+    /**
+     * Create architecture filter from string.
+     * @param arch_str GPU architecture string (e.g., "gfx942")
+     * @param strict_mode If true, unknown configurations are rejected
+     */
+    explicit ArchFilter(const std::string& arch_str, bool strict_mode = false)
+        : arch_(string_to_arch(arch_str)), strict_mode_(strict_mode)
+    {
+    }
+
+    /**
+     * Quick validation check.
+     * @param key Kernel configuration key
+     * @return true if configuration is valid for this architecture
+     */
+    [[nodiscard]] bool is_valid(const KernelKey& key) const { return validate(key).valid; }
+
+    /**
+     * Detailed validation with error messages.
+     * @param key Kernel configuration key
+     * @return ValidationResult with valid flag and error/warning messages
+     */
+    [[nodiscard]] ValidationResult validate(const KernelKey& key) const
+    {
+        ValidationResult result;
+
+        // Check architecture match
+        if(!key.gfx_arch.empty() && string_to_arch(key.gfx_arch) != arch_)
+        {
+            result.add_warning("Kernel compiled for different architecture: " + key.gfx_arch);
+        }
+
+        // Validate dimensions
+        validate_dimensions(key, result);
+
+        // Validate warp configuration
+        validate_warp_config(key, result);
+
+        // Validate warp tile configuration
+        validate_warp_tiles(key, result);
+
+        // Validate trait combination
+        validate_traits(key, result);
+
+        // Validate LDS capacity
+        validate_lds(key, result);
+
+        return result;
+    }
+
+    /// Get target architecture
+    [[nodiscard]] GpuArch get_arch() const { return arch_; }
+
+    /// Get target architecture as string
+    [[nodiscard]] std::string get_arch_string() const { return arch_to_string(arch_); }
+
+    private:
+    void validate_dimensions(const KernelKey& key, ValidationResult& result) const
+    {
+        const auto& alg = key.algorithm;
+
+        // Check positive dimensions
+        if(alg.tile_shape.m <= 0 || alg.tile_shape.n <= 0 || alg.tile_shape.k <= 0)
+        {
+            result.add_error("Tile dimensions must be positive");
+            return;
+        }
+
+        // Check warp tiles fit in block tiles
+        int warp_m_coverage = alg.wave_shape.m * alg.warp_tile_shape.m;
+        int warp_n_coverage = alg.wave_shape.n * alg.warp_tile_shape.n;
+        int warp_k_coverage = alg.wave_shape.k * alg.warp_tile_shape.k;
+
+        if(warp_m_coverage > alg.tile_shape.m)
+        {
+            result.add_error("warp_m * warp_tile_m > tile_m: " + std::to_string(warp_m_coverage) +
+                             " > " + std::to_string(alg.tile_shape.m));
+        }
+        if(warp_n_coverage > alg.tile_shape.n)
+        {
+            result.add_error("warp_n * warp_tile_n > tile_n: " + std::to_string(warp_n_coverage) +
+                             " > " + std::to_string(alg.tile_shape.n));
+        }
+        if(warp_k_coverage > alg.tile_shape.k)
+        {
+            result.add_error("warp_k * warp_tile_k > tile_k: " + std::to_string(warp_k_coverage) +
+                             " > " + std::to_string(alg.tile_shape.k));
+        }
+
+        // Check alignment
+        if(alg.tile_shape.m % warp_m_coverage != 0)
+        {
+            result.add_error("tile_m must be divisible by warp_m * warp_tile_m");
+        }
+        if(alg.tile_shape.n % warp_n_coverage != 0)
+        {
+            result.add_error("tile_n must be divisible by warp_n * warp_tile_n");
+        }
+        if(alg.tile_shape.k % warp_k_coverage != 0)
+        {
+            result.add_error("tile_k must be divisible by warp_k * warp_tile_k");
+        }
+    }
+
+    void validate_warp_config(const KernelKey& key, ValidationResult& result) const
+    {
+        auto supported = get_supported_warp_configs(arch_);
+        if(supported.empty())
+        {
+            if(strict_mode_)
+            {
+                result.add_error("No warp configurations defined for " + get_arch_string());
+            }
+            else
+            {
+                result.add_warning("No warp configurations defined for " + get_arch_string());
+            }
+            return;
+        }
+
+        WarpConfig current = {
+            key.algorithm.wave_shape.m, key.algorithm.wave_shape.n, key.algorithm.wave_shape.k};
+
+        bool found = false;
+        for(const auto& cfg : supported)
+        {
+            if(cfg == current)
+            {
+                found = true;
+                break;
+            }
+        }
+
+        if(!found)
+        {
+            result.add_error("Invalid warp configuration [" + std::to_string(current[0]) + ", " +
+                             std::to_string(current[1]) + ", " + std::to_string(current[2]) +
+                             "] for " + get_arch_string());
+        }
+    }
+
+    void validate_warp_tiles(const KernelKey& key, ValidationResult& result) const
+    {
+        auto supported = get_supported_warp_tiles(
+            arch_, key.signature.dtype_a, key.signature.dtype_b, key.signature.dtype_c);
+
+        if(supported.empty())
+        {
+            // Unknown data type combination - allow with warning
+            result.add_warning("No warp tile combinations defined for data types");
+            return;
+        }
+
+        WarpTileConfig current = {key.algorithm.warp_tile_shape.m,
+                                  key.algorithm.warp_tile_shape.n,
+                                  key.algorithm.warp_tile_shape.k};
+
+        bool found = false;
+        for(const auto& cfg : supported)
+        {
+            if(cfg == current)
+            {
+                found = true;
+                break;
+            }
+        }
+
+        if(!found)
+        {
+            result.add_error("Invalid warp tile [" + std::to_string(current[0]) + ", " +
+                             std::to_string(current[1]) + ", " + std::to_string(current[2]) +
+                             "] for " + get_arch_string());
+        }
+    }
+
+    void validate_traits(const KernelKey& key, ValidationResult& result) const
+    {
+        if(is_trait_unsupported(
+               key.algorithm.pipeline, key.algorithm.epilogue, key.algorithm.scheduler))
+        {
+            result.add_error("Unsupported trait combination");
+        }
+    }
+
+    void validate_lds(const KernelKey& key, ValidationResult& result) const
+    {
+        const auto& sig = key.signature;
+        const auto& alg = key.algorithm;
+
+        float elem_a = element_size(sig.dtype_a);
+        float elem_b = element_size(sig.dtype_b);
+
+        std::size_t matrix_a_size = alg.tile_shape.m * alg.tile_shape.k * elem_a;
+        std::size_t matrix_b_size = alg.tile_shape.n * alg.tile_shape.k * elem_b;
+        std::size_t total_lds     = matrix_a_size + matrix_b_size;
+
+        std::size_t max_lds = get_lds_capacity(alg.pipeline);
+
+        if(total_lds > max_lds)
+        {
+            result.add_error("LDS capacity exceeded: " + std::to_string(total_lds) + " bytes > " +
+                             std::to_string(max_lds) + " bytes limit");
+        }
+    }
+
+    GpuArch arch_;
+    bool strict_mode_;
+};
+
+// =============================================================================
+// Registry Integration Helper
+// =============================================================================
+
+/**
+ * Create a filter function for use with Registry::filter()
+ *
+ * @tparam KernelT Kernel instance type with get_key() method
+ * @param arch Target GPU architecture
+ * @return Predicate function that returns true for valid kernels
+ */
+template <typename KernelT>
+inline auto make_arch_filter_predicate(const std::string& arch)
+{
+    return [filter = ArchFilter(arch)](const KernelT& kernel) {
+        return filter.is_valid(kernel.get_key());
+    };
+}
+
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/arch_specs_generated.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/arch_specs_generated.hpp
@@ -0,0 +1,168 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+/**
+ * AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY!
+ *
+ * Generated from: arch_specs.json
+ * Generated at: 2026-01-05T19:34:01.229811
+ *
+ * To update this file:
+ * 1. Edit arch_specs.json
+ * 2. Run: python generate_arch_specs.py
+ */
+
+#pragma once
+
+#include "ck_tile/dispatcher/kernel_key.hpp"
+#include <array>
+#include <string>
+#include <vector>
+#include <cstdint>
+
+namespace ck_tile {
+namespace dispatcher {
+namespace arch_specs {
+
+// =============================================================================
+// GPU Architecture Enum (Generated)
+// =============================================================================
+
+enum class GpuArch : std::uint8_t
+{
+    GFX_908,  // AMD Instinct MI100
+    GFX_90A,  // AMD Instinct MI200 series
+    GFX_942,  // AMD Instinct MI300 series
+    GFX_950,  // AMD Instinct MI350 series
+    GFX_1100, // AMD Radeon RX 7900 series (RDNA3)
+    GFX_1200, // AMD Radeon RX 9000 series (RDNA4)
+    GFX_1201, // AMD Radeon RX 9000 series (RDNA4)
+    UNKNOWN
+};
+
+// =============================================================================
+// String Conversion Functions (Generated)
+// =============================================================================
+
+inline std::string arch_to_string(GpuArch arch)
+{
+    switch(arch)
+    {
+    case GpuArch::GFX_908: return "gfx908";
+    case GpuArch::GFX_90A: return "gfx90a";
+    case GpuArch::GFX_942: return "gfx942";
+    case GpuArch::GFX_950: return "gfx950";
+    case GpuArch::GFX_1100: return "gfx1100";
+    case GpuArch::GFX_1200: return "gfx1200";
+    case GpuArch::GFX_1201: return "gfx1201";
+    default: return "unknown";
+    }
+}
+
+inline GpuArch string_to_arch(const std::string& arch_str)
+{
+    if(arch_str == "gfx908")
+        return GpuArch::GFX_908;
+    if(arch_str == "gfx90a")
+        return GpuArch::GFX_90A;
+    if(arch_str == "gfx942")
+        return GpuArch::GFX_942;
+    if(arch_str == "gfx950")
+        return GpuArch::GFX_950;
+    if(arch_str == "gfx1100")
+        return GpuArch::GFX_1100;
+    if(arch_str == "gfx1200")
+        return GpuArch::GFX_1200;
+    if(arch_str == "gfx1201")
+        return GpuArch::GFX_1201;
+    return GpuArch::UNKNOWN;
+}
+
+// =============================================================================
+// Element Size (Generated)
+// =============================================================================
+
+inline float element_size(DataType dtype)
+{
+    switch(dtype)
+    {
+    case DataType::FP16: return 2.0f;
+    case DataType::BF16: return 2.0f;
+    case DataType::FP32: return 4.0f;
+    case DataType::FP64: return 8.0f;
+    case DataType::FP8: return 1.0f;
+    case DataType::BF8: return 1.0f;
+    case DataType::INT8: return 1.0f;
+    case DataType::INT4: return 0.5f;
+    case DataType::INT32: return 4.0f;
+    default: return 2.0f;
+    }
+}
+
+// =============================================================================
+// Warp Configurations (Generated)
+// =============================================================================
+
+using WarpConfig = std::array<int, 3>;
+
+inline std::vector<WarpConfig> get_supported_warp_configs(GpuArch arch)
+{
+    switch(arch)
+    {
+    case GpuArch::GFX_908: return {{1, 4, 1}, {2, 2, 1}, {4, 1, 1}};
+    case GpuArch::GFX_90A: return {{1, 4, 1}, {2, 2, 1}, {4, 1, 1}};
+    case GpuArch::GFX_942: return {{1, 4, 1}, {2, 2, 1}, {4, 1, 1}};
+    case GpuArch::GFX_950: return {{1, 4, 1}, {2, 2, 1}, {4, 1, 1}};
+    case GpuArch::GFX_1100: return {{2, 4, 1}, {1, 8, 1}, {8, 1, 1}, {4, 2, 1}};
+    case GpuArch::GFX_1200: return {{2, 4, 1}, {1, 8, 1}, {8, 1, 1}, {4, 2, 1}};
+    case GpuArch::GFX_1201: return {{2, 4, 1}, {1, 8, 1}, {8, 1, 1}, {4, 2, 1}};
+    default: return {};
+    }
+}
+
+// =============================================================================
+// LDS Capacity Limits (Generated)
+// =============================================================================
+
+inline std::size_t get_lds_capacity(Pipeline pipeline)
+{
+    if(pipeline == Pipeline::Mem)
+        return 65536;
+    if(pipeline == Pipeline::CompV1)
+        return 65536;
+    if(pipeline == Pipeline::CompV2)
+        return 65536;
+    if(pipeline == Pipeline::CompV3)
+        return 65536;
+    if(pipeline == Pipeline::CompV4)
+        return 32768;
+    if(pipeline == Pipeline::CompV5)
+        return 65536;
+    if(pipeline == Pipeline::PreShuffleV1)
+        return 32768;
+    if(pipeline == Pipeline::PreShuffleV2)
+        return 32768;
+    return 65536; // Default
+}
+
+// =============================================================================
+// Unsupported Trait Combinations (Generated)
+// =============================================================================
+
+inline bool
+is_trait_unsupported(Pipeline pipeline, [[maybe_unused]] Epilogue epilogue, Scheduler scheduler)
+{
+    // Generated from unsupported_trait_combos in arch_specs.json
+    if(scheduler == Scheduler::Interwave)
+    {
+        if(pipeline == Pipeline::CompV3 || pipeline == Pipeline::CompV4)
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+} // namespace arch_specs
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/backends/generated_kernel_backend.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/backends/generated_kernel_backend.hpp
@@ -0,0 +1,143 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+/**
+ * Generated Kernel Backend
+ *
+ * Backend for kernels generated by unified_gemm_codegen.py
+ * with unique namespace wrapping (Kernel_{name}).
+ *
+ * Status: Work in progress - use generated_tile_backend.hpp for now
+ *
+ * This backend handles the new codegen format with unique kernel structs.
+ */
+
+#pragma once
+
+#include "ck_tile/dispatcher/kernel_instance.hpp"
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include <hip/hip_runtime.h>
+
+namespace ck_tile {
+namespace dispatcher {
+namespace backends {
+
+/**
+ * Kernel instance wrapper for unified_gemm_codegen.py generated kernels
+ *
+ * These kernels have:
+ * - namespace {kernel_name}_ns { ... } (NEW format)
+ * - struct Kernel_{name} with static launch() method
+ * - struct SelectedKernel alias for compatibility
+ * - Type aliases: ADataType, BDataType, CDataType, AccDataType
+ *
+ * Note: Currently use generated_tile_backend.hpp for production
+ */
+template <typename SelectedKernelType>
+class GeneratedKernelInstance : public KernelInstance
+{
+    public:
+    using SelectedKernel = SelectedKernelType;
+    using ADataType      = typename SelectedKernel::ADataType;
+    using BDataType      = typename SelectedKernel::BDataType;
+    using CDataType      = typename SelectedKernel::CDataType;
+    using AccDataType    = typename SelectedKernel::AccDataType;
+
+    GeneratedKernelInstance(const KernelKey& key, const std::string& name) : key_(key), name_(name)
+    {
+    }
+
+    const KernelKey& get_key() const override { return key_; }
+
+    bool supports(const Problem& problem) const override
+    {
+        // Check dimension divisibility based on padding flags
+        constexpr bool pad_m = SelectedKernel::kPadM;
+        constexpr bool pad_n = SelectedKernel::kPadN;
+        constexpr bool pad_k = SelectedKernel::kPadK;
+
+        if(pad_m && pad_n && pad_k)
+        {
+            return true; // Padding enabled - supports any size
+        }
+
+        // Check divisibility for dimensions without padding
+        constexpr int tile_m = SelectedKernel::TileM;
+        constexpr int tile_n = SelectedKernel::TileN;
+        constexpr int tile_k = SelectedKernel::TileK;
+
+        if(!pad_m && problem.M % tile_m != 0)
+            return false;
+        if(!pad_n && problem.N % tile_n != 0)
+            return false;
+        if(!pad_k && problem.K % tile_k != 0)
+            return false;
+
+        return true;
+    }
+
+    std::string get_name() const override { return name_; }
+
+    float run(const void* a_ptr,
+              const void* b_ptr,
+              void* c_ptr,
+              const void** d_ptrs,
+              const Problem& problem,
+              void* stream) const override
+    {
+        (void)d_ptrs; // Not used in basic GEMM
+
+        // Create arguments using constructor
+        ck_tile::GemmHostArgs args(a_ptr,           // a_ptr
+                                   b_ptr,           // b_ptr
+                                   c_ptr,           // e_ptr/c_ptr
+                                   problem.k_batch, // k_batch
+                                   problem.M,       // M
+                                   problem.N,       // N
+                                   problem.K,       // K
+                                   problem.K,       // stride_A (row-major A: stride = K)
+                                   problem.K,       // stride_B (column-major B: stride = K)
+                                   problem.N        // stride_E/C (row-major C: stride = N)
+        );
+
+        // Create stream config for timing
+        ck_tile::stream_config stream_cfg;
+        stream_cfg.stream_id_      = reinterpret_cast<hipStream_t>(stream);
+        stream_cfg.time_kernel_    = true;
+        stream_cfg.log_level_      = 0;
+        stream_cfg.cold_niters_    = 5;  // Warmup iterations
+        stream_cfg.nrepeat_        = 10; // Measurement iterations
+        stream_cfg.is_gpu_timer_   = true;
+        stream_cfg.flush_cache_    = false;
+        stream_cfg.rotating_count_ = 1;
+
+        // Call the generated kernel's launch method
+        return SelectedKernel::launch(args, stream_cfg);
+    }
+
+    bool validate(const void* a_ptr,
+                  const void* b_ptr,
+                  const void* c_ptr,
+                  const void** d_ptrs,
+                  const Problem& problem,
+                  float tolerance) const override
+    {
+        (void)a_ptr;
+        (void)b_ptr;
+        (void)c_ptr;
+        (void)d_ptrs;
+        (void)problem;
+        (void)tolerance;
+        // Validation would require reference implementation
+        return true;
+    }
+
+    private:
+    KernelKey key_;
+    std::string name_;
+};
+
+} // namespace backends
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/backends/generated_tile_backend.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/backends/generated_tile_backend.hpp
@@ -0,0 +1,157 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck_tile/dispatcher/kernel_instance.hpp"
+#include "ck_tile/dispatcher/validation/reference_kernels.hpp"
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
+#include <hip/hip_runtime.h>
+#include <sstream>
+#include <vector>
+#include <cmath>
+
+namespace ck_tile {
+namespace dispatcher {
+namespace backends {
+
+/**
+ * Kernel instance wrapper for unified_gemm_codegen.py generated kernels
+ *
+ * These kernels have structure:
+ * - Types defined outside: using ADataType = ...; using BDataType = ...;
+ * - struct SelectedKernel with static constexpr config and launch() method
+ * - constexpr const char* KERNEL_NAME = "...";
+ *
+ * This is different from tile_engine style where everything is in SelectedKernel.
+ */
+template <typename SelectedKernelType,
+          typename ADataType_,
+          typename BDataType_,
+          typename CDataType_,
+          typename AccDataType_>
+class GeneratedTileKernelInstance : public KernelInstance
+{
+    public:
+    using ADataType      = ADataType_;
+    using BDataType      = BDataType_;
+    using CDataType      = CDataType_;
+    using AccDataType    = AccDataType_;
+    using SelectedKernel = SelectedKernelType;
+
+    GeneratedTileKernelInstance(const KernelKey& key, const std::string& name)
+        : key_(key), name_(name)
+    {
+    }
+
+    const KernelKey& get_key() const override { return key_; }
+
+    bool supports(const Problem& problem) const override
+    {
+        // Check dimension divisibility if padding not enabled
+        constexpr bool pad_m = SelectedKernel::kPadM;
+        constexpr bool pad_n = SelectedKernel::kPadN;
+        constexpr bool pad_k = SelectedKernel::kPadK;
+
+        if(pad_m && pad_n && pad_k)
+        {
+            return true; // Padding enabled - supports any size
+        }
+
+        // Check divisibility
+        constexpr int tile_m = SelectedKernel::TileM;
+        constexpr int tile_n = SelectedKernel::TileN;
+        constexpr int tile_k = SelectedKernel::TileK;
+
+        if(!pad_m && problem.M % tile_m != 0)
+            return false;
+        if(!pad_n && problem.N % tile_n != 0)
+            return false;
+        if(!pad_k && problem.K % tile_k != 0)
+            return false;
+
+        return true;
+    }
+
+    std::string get_name() const override { return name_; }
+
+    float run(const void* a_ptr,
+              const void* b_ptr,
+              void* c_ptr,
+              const void** d_ptrs,
+              const Problem& problem,
+              void* stream) const override
+    {
+        (void)d_ptrs; // Not used in basic GEMM
+
+        // Create arguments using constructor (correct order!)
+        // Order from GemmHostArgs constructor: a_ptr, b_ptr, e_ptr, k_batch, M, N, K, stride_A,
+        // stride_B, stride_E
+        ck_tile::GemmHostArgs args(a_ptr,           // a_ptr
+                                   b_ptr,           // b_ptr
+                                   c_ptr,           // e_ptr/c_ptr
+                                   problem.k_batch, // k_batch (4th argument!)
+                                   problem.M,       // M
+                                   problem.N,       // N
+                                   problem.K,       // K
+                                   problem.K,       // stride_A (row-major A: stride = K)
+                                   problem.K,       // stride_B (column-major B: stride = K)
+                                   problem.N        // stride_E/C (row-major C: stride = N)
+        );
+
+        // Create stream config for timing
+        ck_tile::stream_config stream_cfg;
+        stream_cfg.stream_id_      = reinterpret_cast<hipStream_t>(stream);
+        stream_cfg.time_kernel_    = true;
+        stream_cfg.log_level_      = 0;  // No logging for performance
+        stream_cfg.cold_niters_    = 5;  // Warmup iterations
+        stream_cfg.nrepeat_        = 10; // Measurement iterations
+        stream_cfg.is_gpu_timer_   = true;
+        stream_cfg.flush_cache_    = false;
+        stream_cfg.rotating_count_ = 1;
+
+        // Call the generated kernel's launch method
+        return SelectedKernel::launch(args, stream_cfg);
+    }
+
+    bool validate(const void* a_ptr,
+                  const void* b_ptr,
+                  const void* c_ptr,
+                  const void** d_ptrs,
+                  const Problem& problem,
+                  float tolerance) const override
+    {
+        (void)a_ptr;
+        (void)b_ptr;
+        (void)c_ptr;
+        (void)d_ptrs;
+        (void)problem;
+        (void)tolerance;
+        // Validation would require reference implementation
+        return true;
+    }
+
+    private:
+    KernelKey key_;
+    std::string name_;
+};
+
+/// Helper function to create a generated tile kernel instance wrapper
+template <typename SelectedKernel,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AccDataType>
+std::shared_ptr<KernelInstance> create_generated_tile_kernel(const KernelKey& key,
+                                                             const std::string& name)
+{
+    return std::make_shared<
+        GeneratedTileKernelInstance<SelectedKernel, ADataType, BDataType, CDataType, AccDataType>>(
+        key, name);
+}
+
+} // namespace backends
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/backends/kernel_registration.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/backends/kernel_registration.hpp
@@ -0,0 +1,109 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck_tile/dispatcher/backends/tile_backend.hpp"
+#include "ck_tile/dispatcher/registry.hpp"
+#include <type_traits>
+
+namespace ck_tile {
+namespace dispatcher {
+namespace backends {
+
+/// Helper to register a CK Tile generated kernel
+/// This should be called from generated code for each kernel
+template <typename SelectedKernel>
+void register_tile_kernel(Registry& registry, const std::string& kernel_name)
+{
+    // Extract metadata from SelectedKernel static members
+    KernelKey key;
+
+    // Signature
+    key.signature.dtype_a   = static_cast<DataType>(SelectedKernel::ADataType);
+    key.signature.dtype_b   = static_cast<DataType>(SelectedKernel::BDataType);
+    key.signature.dtype_c   = static_cast<DataType>(SelectedKernel::CDataType);
+    key.signature.dtype_acc = static_cast<DataType>(SelectedKernel::AccDataType);
+
+    key.signature.layout_a = static_cast<LayoutTag>(SelectedKernel::ALayout);
+    key.signature.layout_b = static_cast<LayoutTag>(SelectedKernel::BLayout);
+    key.signature.layout_c = static_cast<LayoutTag>(SelectedKernel::CLayout);
+
+    key.signature.transpose_a = false; // Extract from kernel if available
+    key.signature.transpose_b = false;
+    key.signature.grouped     = false;
+    key.signature.split_k     = 1;
+
+    key.signature.elementwise_op      = "PassThrough"; // Extract if available
+    key.signature.num_d_tensors       = 0;
+    key.signature.structured_sparsity = SelectedKernel::UseStructuredSparsity;
+
+    // Algorithm
+    key.algorithm.tile_shape.m = SelectedKernel::TileM;
+    key.algorithm.tile_shape.n = SelectedKernel::TileN;
+    key.algorithm.tile_shape.k = SelectedKernel::TileK;
+
+    key.algorithm.wave_shape.m = SelectedKernel::WarpPerBlock_M;
+    key.algorithm.wave_shape.n = SelectedKernel::WarpPerBlock_N;
+    key.algorithm.wave_shape.k = SelectedKernel::WarpPerBlock_K;
+
+    key.algorithm.warp_tile_shape.m = SelectedKernel::WarpTileM;
+    key.algorithm.warp_tile_shape.n = SelectedKernel::WarpTileN;
+    key.algorithm.warp_tile_shape.k = SelectedKernel::WarpTileK;
+
+    // Extract pipeline, epilogue, scheduler from traits
+    key.algorithm.pipeline  = Pipeline::CompV4;  // Extract from kernel
+    key.algorithm.epilogue  = Epilogue::Default; // Extract from kernel
+    key.algorithm.scheduler = Scheduler::Auto;   // Extract from kernel
+
+    key.algorithm.block_size      = SelectedKernel::BlockSize;
+    key.algorithm.double_buffer   = SelectedKernel::DoubleSmemBuffer;
+    key.algorithm.persistent      = SelectedKernel::UsePersistentKernel;
+    key.algorithm.preshuffle      = false; // Extract if available
+    key.algorithm.transpose_c     = SelectedKernel::TransposeC;
+    key.algorithm.num_wave_groups = 1; // Extract if available
+
+    key.gfx_arch = 942; // Extract from build configuration
+
+    // Create kernel instance
+    auto kernel_instance = std::make_shared<TileKernelInstance<SelectedKernel>>(key, kernel_name);
+
+    // Register with high priority (Tile kernels preferred)
+    registry.register_kernel(kernel_instance, Registry::Priority::High);
+}
+
+/// Macro to simplify kernel registration in generated code
+#define CK_TILE_REGISTER_KERNEL(SelectedKernel, KernelName, Registry) \
+    ::ck_tile::dispatcher::backends::register_tile_kernel<SelectedKernel>(Registry, KernelName)
+
+/// Helper to register multiple kernels from a list
+template <typename... Kernels>
+struct KernelRegistrar
+{
+    static void register_all(Registry& registry)
+    {
+        // This would be specialized for each kernel set
+        // For now, empty implementation
+    }
+};
+
+/// Auto-registration helper
+/// Place this in generated files to automatically register kernels
+template <typename SelectedKernel>
+struct AutoRegister
+{
+    AutoRegister(const std::string& kernel_name)
+    {
+        auto& registry = Registry::instance();
+        register_tile_kernel<SelectedKernel>(registry, kernel_name);
+    }
+};
+
+/// Macro for auto-registration
+#define CK_TILE_AUTO_REGISTER(SelectedKernel, KernelName)                \
+    static ::ck_tile::dispatcher::backends::AutoRegister<SelectedKernel> \
+        auto_register_##SelectedKernel{KernelName};
+
+} // namespace backends
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/backends/tile_backend.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/backends/tile_backend.hpp
@@ -0,0 +1,173 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck_tile/dispatcher/kernel_instance.hpp"
+#include "ck_tile/dispatcher/validation/reference_kernels.hpp"
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include <hip/hip_runtime.h>
+#include <chrono>
+#include <filesystem>
+#include <fstream>
+#include <regex>
+#include <sstream>
+
+namespace ck_tile {
+namespace dispatcher {
+namespace backends {
+
+/// Kernel instance for CK Tile generated kernels
+template <typename SelectedKernel>
+class TileKernelInstance : public KernelInstance
+{
+    public:
+    TileKernelInstance(const KernelKey& key, const std::string& name) : key_(key), name_(name) {}
+
+    const KernelKey& get_key() const override { return key_; }
+
+    bool supports(const Problem& problem) const override
+    {
+        // Check dimension divisibility if padding not enabled
+        constexpr bool pad_m = SelectedKernel::kPadM;
+        constexpr bool pad_n = SelectedKernel::kPadN;
+        constexpr bool pad_k = SelectedKernel::kPadK;
+
+        if(pad_m && pad_n && pad_k)
+        {
+            // Padding enabled - supports any size
+            return true;
+        }
+
+        // Check divisibility
+        constexpr int tile_m = SelectedKernel::TileM;
+        constexpr int tile_n = SelectedKernel::TileN;
+        constexpr int tile_k = SelectedKernel::TileK;
+
+        if(!pad_m && problem.M % tile_m != 0)
+            return false;
+        if(!pad_n && problem.N % tile_n != 0)
+            return false;
+        if(!pad_k && problem.K % tile_k != 0)
+            return false;
+
+        // Check shared memory budget if specified
+        if(problem.smem_budget > 0)
+        {
+            int64_t estimated_smem = estimate_smem_usage();
+            if(estimated_smem > problem.smem_budget)
+                return false;
+        }
+
+        return true;
+    }
+
+    std::string get_name() const override { return name_; }
+
+    float run(const void* a_ptr,
+              const void* b_ptr,
+              void* c_ptr,
+              const void** d_ptrs,
+              const Problem& problem,
+              void* stream) const override
+    {
+        // Convert void* stream to hipStream_t
+        hipStream_t hip_stream = reinterpret_cast<hipStream_t>(stream);
+
+        // Construct kernel arguments
+        using ADataType = typename SelectedKernel::ADataType;
+        using BDataType = typename SelectedKernel::BDataType;
+        using CDataType = typename SelectedKernel::CDataType;
+
+        // Note: d_ptrs not yet supported in basic CK Tile kernels
+        (void)d_ptrs; // Suppress unused parameter warning
+
+        auto kargs = SelectedKernel::MakeKernelArgs(static_cast<const ADataType*>(a_ptr),
+                                                    static_cast<const BDataType*>(b_ptr),
+                                                    static_cast<CDataType*>(c_ptr),
+                                                    problem.M,
+                                                    problem.N,
+                                                    problem.K,
+                                                    problem.k_batch);
+
+        // Validate arguments
+        if(!SelectedKernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Kernel does not support the given arguments");
+        }
+
+        // Calculate grid and block dimensions
+        dim3 grids       = SelectedKernel::GridSize(problem.M, problem.N, problem.K);
+        dim3 blocks      = SelectedKernel::BlockSize();
+        size_t lds_bytes = SelectedKernel::GetSmemSize();
+
+        // Time kernel execution
+        hipEvent_t start, stop;
+        (void)hipEventCreate(&start);
+        (void)hipEventCreate(&stop);
+
+        (void)hipEventRecord(start, hip_stream);
+
+        // Launch kernel
+        ck_tile::launch_kernel(SelectedKernel::Kernel, grids, blocks, lds_bytes, hip_stream, kargs);
+
+        (void)hipEventRecord(stop, hip_stream);
+        (void)hipEventSynchronize(stop);
+
+        float elapsed_ms = 0.0f;
+        (void)hipEventElapsedTime(&elapsed_ms, start, stop);
+
+        (void)hipEventDestroy(start);
+        (void)hipEventDestroy(stop);
+
+        return elapsed_ms;
+    }
+
+    bool validate(const void* a_ptr,
+                  const void* b_ptr,
+                  const void* c_ptr,
+                  const void** d_ptrs,
+                  const Problem& problem,
+                  float tolerance) const override
+    {
+        // Use validation helper
+        using ADataType   = typename SelectedKernel::ADataType;
+        using BDataType   = typename SelectedKernel::BDataType;
+        using CDataType   = typename SelectedKernel::CDataType;
+        using AccDataType = typename SelectedKernel::AccDataType;
+
+        // d_ptrs not yet supported
+        (void)d_ptrs;
+
+        // Convert tolerance to rtol and atol
+        float rtol = tolerance;
+        float atol = tolerance * 1e-2f; // atol is typically smaller
+
+        return validation::validate_gemm_kernel<ADataType, BDataType, CDataType, AccDataType>(
+            a_ptr, b_ptr, c_ptr, problem, rtol, atol);
+    }
+
+    private:
+    int64_t estimate_smem_usage() const
+    {
+        // Use kernel's reported shared memory size
+        return SelectedKernel::GetSmemSize();
+    }
+
+    KernelKey key_;
+    std::string name_;
+};
+
+/// Helper function to create a tile kernel instance wrapper
+/// This should be called from generated code that knows the SelectedKernel type
+template <typename SelectedKernel>
+std::shared_ptr<KernelInstance> create_tile_kernel_instance(const KernelKey& key,
+                                                            const std::string& name)
+{
+    return std::make_shared<TileKernelInstance<SelectedKernel>>(key, name);
+}
+
+} // namespace backends
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/dispatcher.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/dispatcher.hpp
@@ -0,0 +1,146 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+/**
+ * Dispatcher - Main Kernel Selection and Execution Engine
+ *
+ * The Dispatcher provides unified interface for selecting and executing
+ * CK Tile GEMM kernels based on problem specifications.
+ *
+ * Features:
+ * - Multiple selection strategies (FirstFit, Heuristic)
+ * - Custom heuristic functions
+ * - Thread-safe registry integration
+ * - Real GPU execution with timing
+ *
+ * Usage:
+ *   Dispatcher dispatcher;
+ *   Problem problem(M, N, K);
+ *   float time = dispatcher.run(a_dev, b_dev, c_dev, problem);
+ *
+ * Status: Production ready - 319 TFLOPS validated
+ */
+
+#pragma once
+
+#include "ck_tile/dispatcher/kernel_instance.hpp"
+#include "ck_tile/dispatcher/problem.hpp"
+#include "ck_tile/dispatcher/registry.hpp"
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace ck_tile {
+namespace dispatcher {
+
+/// Heuristic function type: maps Problem to ordered list of kernel identifiers
+/// Returns kernel identifiers ranked by expected performance (best first)
+using HeuristicFunction = std::function<std::vector<std::string>(const Problem&)>;
+
+/// Dispatcher: Top-level orchestration for kernel selection and execution
+/// Provides unified interface for kernel dispatch across different backends
+class Dispatcher
+{
+    public:
+    /// Selection strategy for kernel choice
+    enum class SelectionStrategy
+    {
+        FirstFit, // Use first kernel that supports the problem
+        Heuristic // Use heuristic function to guide selection
+    };
+
+    /// Constructor
+    /// @param registry Registry instance to use (default: global singleton)
+    explicit Dispatcher(Registry* registry = nullptr);
+
+    /// Register a heuristic function for kernel selection
+    /// @param heuristic Function that maps problems to ranked kernel identifiers
+    void set_heuristic(HeuristicFunction heuristic);
+
+    /// Set selection strategy
+    /// @param strategy Strategy to use for kernel selection
+    void set_strategy(SelectionStrategy strategy);
+
+    /// Select a kernel for the given problem
+    /// @param problem Problem configuration
+    /// @return Selected kernel instance, or nullptr if no suitable kernel found
+    [[nodiscard]] KernelInstancePtr select_kernel(const Problem& problem) const;
+
+    /// Execute GEMM operation with automatic kernel selection
+    /// @param a_ptr Pointer to matrix A (device memory)
+    /// @param b_ptr Pointer to matrix B (device memory)
+    /// @param c_ptr Pointer to matrix C (device memory, input/output)
+    /// @param problem Problem configuration
+    /// @param stream HIP stream for kernel launch (nullptr = default stream)
+    /// @return Kernel execution time in milliseconds
+    /// @throws std::runtime_error if no suitable kernel found
+    [[nodiscard]] float run(const void* a_ptr,
+                            const void* b_ptr,
+                            void* c_ptr,
+                            const Problem& problem,
+                            void* stream = nullptr) const;
+
+    /// Execute GEMM operation with fusion (multi-D)
+    /// @param a_ptr Pointer to matrix A (device memory)
+    /// @param b_ptr Pointer to matrix B (device memory)
+    /// @param c_ptr Pointer to matrix C (device memory, input/output)
+    /// @param d_ptrs Array of pointers to additional D tensors (device memory)
+    /// @param problem Problem configuration
+    /// @param stream HIP stream for kernel launch (nullptr = default stream)
+    /// @return Kernel execution time in milliseconds
+    /// @throws std::runtime_error if no suitable kernel found
+    [[nodiscard]] float run_fused(const void* a_ptr,
+                                  const void* b_ptr,
+                                  void* c_ptr,
+                                  const void** d_ptrs,
+                                  const Problem& problem,
+                                  void* stream = nullptr) const;
+
+    /// Execute with explicit kernel selection
+    /// @param kernel_id Kernel identifier string
+    /// @param a_ptr Pointer to matrix A (device memory)
+    /// @param b_ptr Pointer to matrix B (device memory)
+    /// @param c_ptr Pointer to matrix C (device memory, input/output)
+    /// @param d_ptrs Array of pointers to additional D tensors (device memory)
+    /// @param problem Problem configuration
+    /// @param stream HIP stream for kernel launch (nullptr = default stream)
+    /// @return Kernel execution time in milliseconds
+    /// @throws std::runtime_error if kernel not found or doesn't support problem
+    [[nodiscard]] float run_explicit(const std::string& kernel_id,
+                                     const void* a_ptr,
+                                     const void* b_ptr,
+                                     void* c_ptr,
+                                     const void** d_ptrs,
+                                     const Problem& problem,
+                                     void* stream = nullptr) const;
+
+    /// Validate kernel output
+    /// @param a_ptr Pointer to matrix A (device memory)
+    /// @param b_ptr Pointer to matrix B (device memory)
+    /// @param c_ptr Pointer to matrix C (device memory, kernel output)
+    /// @param d_ptrs Array of pointers to additional D tensors (device memory)
+    /// @param problem Problem configuration
+    /// @param tolerance Relative error tolerance
+    /// @return true if validation passes, false otherwise
+    [[nodiscard]] bool validate(const void* a_ptr,
+                                const void* b_ptr,
+                                const void* c_ptr,
+                                const void** d_ptrs,
+                                const Problem& problem,
+                                float tolerance = 1e-3f) const;
+
+    private:
+    Registry* registry_;
+    HeuristicFunction heuristic_;
+    SelectionStrategy strategy_;
+
+    /// Select kernel using first-fit strategy
+    [[nodiscard]] KernelInstancePtr select_first_fit(const Problem& problem) const;
+
+    /// Select kernel using heuristic strategy
+    [[nodiscard]] KernelInstancePtr select_heuristic(const Problem& problem) const;
+};
+
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/example_args.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/example_args.hpp
@@ -0,0 +1,230 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include <map>
+#include <sstream>
+#include <algorithm>
+
+namespace ck_tile {
+namespace dispatcher {
+namespace utils {
+
+/**
+ * Simple command-line argument parser for examples.
+ *
+ * Usage:
+ *   ExampleArgs args("Example 01: Basic GEMM", "Demonstrates basic GEMM usage");
+ *   args.add_flag("--list", "List all kernel sets");
+ *   args.add_option("--dtype", "fp16", "Data type (fp16, bf16, fp32)");
+ *   args.add_option("--size", "1024", "Problem size MxNxK");
+ *
+ *   if (!args.parse(argc, argv)) return 0;  // --help was printed
+ *
+ *   bool do_list = args.has("--list");
+ *   std::string dtype = args.get("--dtype");
+ *   int size = args.get_int("--size");
+ */
+class ExampleArgs
+{
+    public:
+    ExampleArgs(const std::string& name, const std::string& description = "")
+        : name_(name), description_(description)
+    {
+        // Always add --help
+        add_flag("--help", "Show this help message");
+        add_flag("-h", "Show this help message");
+    }
+
+    // Add a boolean flag (no value)
+    void add_flag(const std::string& name, const std::string& help)
+    {
+        flags_[name] = false;
+        help_[name]  = help;
+        order_.push_back(name);
+    }
+
+    // Add an option with a default value
+    void
+    add_option(const std::string& name, const std::string& default_val, const std::string& help)
+    {
+        options_[name]  = default_val;
+        defaults_[name] = default_val;
+        help_[name]     = help;
+        order_.push_back(name);
+    }
+
+    // Parse arguments. Returns false if --help was requested.
+    bool parse(int argc, char* argv[])
+    {
+        for(int i = 1; i < argc; ++i)
+        {
+            std::string arg = argv[i];
+
+            // Check for --help
+            if(arg == "--help" || arg == "-h")
+            {
+                print_help();
+                return false;
+            }
+
+            // Check for flags
+            if(flags_.find(arg) != flags_.end())
+            {
+                flags_[arg] = true;
+                continue;
+            }
+
+            // Check for options (--name=value or --name value)
+            std::string name, value;
+            size_t eq_pos = arg.find('=');
+            if(eq_pos != std::string::npos)
+            {
+                name  = arg.substr(0, eq_pos);
+                value = arg.substr(eq_pos + 1);
+            }
+            else if(options_.find(arg) != options_.end() && i + 1 < argc)
+            {
+                name  = arg;
+                value = argv[++i];
+            }
+            else
+            {
+                // Positional argument - store as _pos_N
+                std::string pos_name = "_pos_" + std::to_string(positional_.size());
+                positional_.push_back(arg);
+                continue;
+            }
+
+            if(options_.find(name) != options_.end())
+            {
+                options_[name] = value;
+            }
+        }
+        return true;
+    }
+
+    // Check if a flag is set
+    bool has(const std::string& name) const
+    {
+        auto it = flags_.find(name);
+        return it != flags_.end() && it->second;
+    }
+
+    // Get an option value as string
+    std::string get(const std::string& name) const
+    {
+        auto it = options_.find(name);
+        return it != options_.end() ? it->second : "";
+    }
+
+    // Get an option value as string with default
+    std::string get(const std::string& name, const std::string& default_val) const
+    {
+        auto it = options_.find(name);
+        return it != options_.end() ? it->second : default_val;
+    }
+
+    // Get an option value as int
+    int get_int(const std::string& name, int default_val = 0) const
+    {
+        std::string val = get(name);
+        if(val.empty())
+            return default_val;
+        try
+        {
+            return std::stoi(val);
+        }
+        catch(...)
+        {
+            return default_val;
+        }
+    }
+
+    // Get an option value as float
+    float get_float(const std::string& name, float default_val = 0.0f) const
+    {
+        std::string val = get(name);
+        if(val.empty())
+            return default_val;
+        try
+        {
+            return std::stof(val);
+        }
+        catch(...)
+        {
+            return default_val;
+        }
+    }
+
+    // Get positional arguments
+    const std::vector<std::string>& positional() const { return positional_; }
+
+    // Print help message
+    void print_help() const
+    {
+        std::cout << "\n";
+        std::cout << "  " << name_ << "\n";
+        if(!description_.empty())
+        {
+            std::cout << "  " << description_ << "\n";
+        }
+        std::cout << "\n";
+        std::cout << "Usage:\n";
+        std::cout << "  ./example [OPTIONS]\n";
+        std::cout << "\n";
+        std::cout << "Options:\n";
+
+        // Find max option name length for alignment
+        size_t max_len = 0;
+        for(const auto& name : order_)
+        {
+            if(name == "-h")
+                continue; // Skip -h, show --help only
+            max_len = std::max(max_len, name.length());
+        }
+
+        // Print options in order
+        for(const auto& name : order_)
+        {
+            if(name == "-h")
+                continue;
+
+            std::cout << "  " << std::left << std::setw(max_len + 2) << name;
+
+            auto help_it = help_.find(name);
+            if(help_it != help_.end())
+            {
+                std::cout << help_it->second;
+            }
+
+            // Show default value for options
+            auto def_it = defaults_.find(name);
+            if(def_it != defaults_.end() && !def_it->second.empty())
+            {
+                std::cout << " (default: " << def_it->second << ")";
+            }
+
+            std::cout << "\n";
+        }
+        std::cout << "\n";
+    }
+
+    private:
+    std::string name_;
+    std::string description_;
+    std::map<std::string, bool> flags_;
+    std::map<std::string, std::string> options_;
+    std::map<std::string, std::string> defaults_;
+    std::map<std::string, std::string> help_;
+    std::vector<std::string> order_;
+    std::vector<std::string> positional_;
+};
+
+} // namespace utils
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/json_export.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/json_export.hpp
@@ -0,0 +1,370 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+/**
+ * JSON Export Utilities for Dispatcher Registry
+ *
+ * Provides functionality to export kernel registry metadata to JSON format,
+ * similar to the tile engine benchmarking JSON export.
+ *
+ * Features:
+ * - Export all registered kernels with full metadata
+ * - Include kernel configuration (tile shapes, pipeline, scheduler, etc.)
+ * - Group kernels by various properties (data type, layout, pipeline, etc.)
+ * - Export to string or file
+ *
+ * Usage:
+ *   auto& registry = Registry::instance();
+ *   std::string json = export_registry_json(registry);
+ *   // or
+ *   export_registry_json_to_file(registry, "kernels.json");
+ */
+
+#pragma once
+
+#include "ck_tile/dispatcher/registry.hpp"
+#include "ck_tile/dispatcher/kernel_key.hpp"
+#include <string>
+#include <sstream>
+#include <fstream>
+#include <map>
+#include <vector>
+#include <iomanip>
+#include <ctime>
+#include <chrono>
+
+namespace ck_tile {
+namespace dispatcher {
+
+/// Convert DataType enum to string
+inline std::string datatype_to_string(DataType dtype)
+{
+    switch(dtype)
+    {
+    case DataType::FP16: return "fp16";
+    case DataType::BF16: return "bf16";
+    case DataType::FP32: return "fp32";
+    case DataType::FP8: return "fp8";
+    case DataType::BF8: return "bf8";
+    case DataType::INT8: return "int8";
+    case DataType::INT32: return "int32";
+    default: return "unknown";
+    }
+}
+
+/// Convert LayoutTag enum to string
+inline std::string layout_to_string(LayoutTag layout)
+{
+    switch(layout)
+    {
+    case LayoutTag::RowMajor: return "row_major";
+    case LayoutTag::ColMajor: return "col_major";
+    case LayoutTag::PackedExternal: return "packed_external";
+    default: return "unknown";
+    }
+}
+
+/// Convert Pipeline enum to string
+inline std::string pipeline_to_string(Pipeline pipeline)
+{
+    switch(pipeline)
+    {
+    case Pipeline::Mem: return "mem";
+    case Pipeline::CompV1: return "compv1";
+    case Pipeline::CompV2: return "compv2";
+    case Pipeline::CompV3: return "compv3";
+    case Pipeline::CompV4: return "compv4";
+    case Pipeline::CompV5: return "compv5";
+    default: return "unknown";
+    }
+}
+
+/// Convert Epilogue enum to string
+inline std::string epilogue_to_string(Epilogue epilogue)
+{
+    switch(epilogue)
+    {
+    case Epilogue::None: return "none";
+    case Epilogue::Bias: return "bias";
+    case Epilogue::Activation: return "activation";
+    case Epilogue::CShuffle: return "cshuffle";
+    case Epilogue::Default: return "default";
+    default: return "unknown";
+    }
+}
+
+/// Convert Scheduler enum to string
+inline std::string scheduler_to_string(Scheduler scheduler)
+{
+    switch(scheduler)
+    {
+    case Scheduler::Auto: return "auto";
+    case Scheduler::Intrawave: return "intrawave";
+    case Scheduler::Interwave: return "interwave";
+    default: return "unknown";
+    }
+}
+
+/// Escape string for JSON
+inline std::string json_escape(const std::string& str)
+{
+    std::ostringstream oss;
+    for(char c : str)
+    {
+        switch(c)
+        {
+        case '"': oss << "\\\""; break;
+        case '\\': oss << "\\\\"; break;
+        case '\b': oss << "\\b"; break;
+        case '\f': oss << "\\f"; break;
+        case '\n': oss << "\\n"; break;
+        case '\r': oss << "\\r"; break;
+        case '\t': oss << "\\t"; break;
+        default:
+            if(c < 0x20)
+            {
+                oss << "\\u" << std::hex << std::setw(4) << std::setfill('0') << (int)c;
+            }
+            else
+            {
+                oss << c;
+            }
+        }
+    }
+    return oss.str();
+}
+
+/// Get current timestamp in ISO 8601 format
+inline std::string get_iso_timestamp()
+{
+    auto now    = std::chrono::system_clock::now();
+    auto time_t = std::chrono::system_clock::to_time_t(now);
+    std::tm tm_buf;
+    localtime_r(&time_t, &tm_buf);
+
+    std::ostringstream oss;
+    oss << std::put_time(&tm_buf, "%Y-%m-%dT%H:%M:%S");
+    return oss.str();
+}
+
+/// Export a single kernel's metadata to JSON
+inline std::string export_kernel_json(const KernelInstance& kernel)
+{
+    std::ostringstream json;
+    const auto& key = kernel.get_key();
+
+    json << "    {\n";
+    json << "      \"name\": \"" << json_escape(kernel.get_name()) << "\",\n";
+    json << "      \"identifier\": \"" << json_escape(key.encode_identifier()) << "\",\n";
+
+    // Signature (what operation is computed)
+    json << "      \"signature\": {\n";
+    json << "        \"dtype_a\": \"" << datatype_to_string(key.signature.dtype_a) << "\",\n";
+    json << "        \"dtype_b\": \"" << datatype_to_string(key.signature.dtype_b) << "\",\n";
+    json << "        \"dtype_c\": \"" << datatype_to_string(key.signature.dtype_c) << "\",\n";
+    json << "        \"dtype_acc\": \"" << datatype_to_string(key.signature.dtype_acc) << "\",\n";
+    json << "        \"layout_a\": \"" << layout_to_string(key.signature.layout_a) << "\",\n";
+    json << "        \"layout_b\": \"" << layout_to_string(key.signature.layout_b) << "\",\n";
+    json << "        \"layout_c\": \"" << layout_to_string(key.signature.layout_c) << "\",\n";
+    json << "        \"transpose_a\": " << (key.signature.transpose_a ? "true" : "false") << ",\n";
+    json << "        \"transpose_b\": " << (key.signature.transpose_b ? "true" : "false") << ",\n";
+    json << "        \"grouped\": " << (key.signature.grouped ? "true" : "false") << ",\n";
+    json << "        \"split_k\": " << (int)key.signature.split_k << ",\n";
+    json << "        \"elementwise_op\": \"" << json_escape(key.signature.elementwise_op)
+         << "\",\n";
+    json << "        \"num_d_tensors\": " << (int)key.signature.num_d_tensors << ",\n";
+    json << "        \"structured_sparsity\": "
+         << (key.signature.structured_sparsity ? "true" : "false") << "\n";
+    json << "      },\n";
+
+    // Algorithm (how it's implemented)
+    json << "      \"algorithm\": {\n";
+    json << "        \"tile_shape\": {\n";
+    json << "          \"m\": " << key.algorithm.tile_shape.m << ",\n";
+    json << "          \"n\": " << key.algorithm.tile_shape.n << ",\n";
+    json << "          \"k\": " << key.algorithm.tile_shape.k << "\n";
+    json << "        },\n";
+    json << "        \"wave_shape\": {\n";
+    json << "          \"m\": " << (int)key.algorithm.wave_shape.m << ",\n";
+    json << "          \"n\": " << (int)key.algorithm.wave_shape.n << ",\n";
+    json << "          \"k\": " << (int)key.algorithm.wave_shape.k << "\n";
+    json << "        },\n";
+    json << "        \"warp_tile_shape\": {\n";
+    json << "          \"m\": " << (int)key.algorithm.warp_tile_shape.m << ",\n";
+    json << "          \"n\": " << (int)key.algorithm.warp_tile_shape.n << ",\n";
+    json << "          \"k\": " << (int)key.algorithm.warp_tile_shape.k << "\n";
+    json << "        },\n";
+    json << "        \"pipeline\": \"" << pipeline_to_string(key.algorithm.pipeline) << "\",\n";
+    json << "        \"scheduler\": \"" << scheduler_to_string(key.algorithm.scheduler) << "\",\n";
+    json << "        \"epilogue\": \"" << epilogue_to_string(key.algorithm.epilogue) << "\",\n";
+    json << "        \"block_size\": " << key.algorithm.block_size << ",\n";
+    json << "        \"double_buffer\": " << (key.algorithm.double_buffer ? "true" : "false")
+         << ",\n";
+    json << "        \"persistent\": " << (key.algorithm.persistent ? "true" : "false") << ",\n";
+    json << "        \"preshuffle\": " << (key.algorithm.preshuffle ? "true" : "false") << ",\n";
+    json << "        \"transpose_c\": " << (key.algorithm.transpose_c ? "true" : "false") << ",\n";
+    json << "        \"num_wave_groups\": " << (int)key.algorithm.num_wave_groups << "\n";
+    json << "      },\n";
+
+    json << "      \"gfx_arch\": \"" << json_escape(key.gfx_arch) << "\"\n";
+    json << "    }";
+
+    return json.str();
+}
+
+/// Export registry metadata and statistics to JSON
+inline std::string export_registry_json(const Registry& registry, bool include_statistics = true)
+{
+    std::ostringstream json;
+
+    auto all_kernels = registry.get_all();
+
+    json << "{\n";
+
+    // Metadata
+    json << "  \"metadata\": {\n";
+    json << "    \"timestamp\": \"" << get_iso_timestamp() << "\",\n";
+    json << "    \"registry_name\": \"" << json_escape(registry.get_name()) << "\",\n";
+    json << "    \"total_kernels\": " << all_kernels.size() << ",\n";
+    json << "    \"export_version\": \"1.0.0\"\n";
+    json << "  },\n";
+
+    // Statistics (if enabled)
+    if(include_statistics && !all_kernels.empty())
+    {
+        std::map<std::string, int> by_datatype;
+        std::map<std::string, int> by_pipeline;
+        std::map<std::string, int> by_scheduler;
+        std::map<std::string, int> by_layout;
+        std::map<std::string, int> by_gfx_arch;
+
+        for(const auto& kernel : all_kernels)
+        {
+            const auto& key = kernel->get_key();
+
+            // Count by data type
+            std::string dtype_key = datatype_to_string(key.signature.dtype_a) + "_" +
+                                    datatype_to_string(key.signature.dtype_b) + "_" +
+                                    datatype_to_string(key.signature.dtype_c);
+            by_datatype[dtype_key]++;
+
+            // Count by pipeline
+            by_pipeline[pipeline_to_string(key.algorithm.pipeline)]++;
+
+            // Count by scheduler
+            by_scheduler[scheduler_to_string(key.algorithm.scheduler)]++;
+
+            // Count by layout
+            std::string layout_key = layout_to_string(key.signature.layout_a) + "_" +
+                                     layout_to_string(key.signature.layout_b) + "_" +
+                                     layout_to_string(key.signature.layout_c);
+            by_layout[layout_key]++;
+
+            // Count by GFX architecture
+            by_gfx_arch[key.gfx_arch]++;
+        }
+
+        json << "  \"statistics\": {\n";
+
+        // Data type breakdown
+        json << "    \"by_datatype\": {\n";
+        bool first = true;
+        for(const auto& [dtype, count] : by_datatype)
+        {
+            if(!first)
+                json << ",\n";
+            json << "      \"" << dtype << "\": " << count;
+            first = false;
+        }
+        json << "\n    },\n";
+
+        // Pipeline breakdown
+        json << "    \"by_pipeline\": {\n";
+        first = true;
+        for(const auto& [pipeline, count] : by_pipeline)
+        {
+            if(!first)
+                json << ",\n";
+            json << "      \"" << pipeline << "\": " << count;
+            first = false;
+        }
+        json << "\n    },\n";
+
+        // Scheduler breakdown
+        json << "    \"by_scheduler\": {\n";
+        first = true;
+        for(const auto& [scheduler, count] : by_scheduler)
+        {
+            if(!first)
+                json << ",\n";
+            json << "      \"" << scheduler << "\": " << count;
+            first = false;
+        }
+        json << "\n    },\n";
+
+        // Layout breakdown
+        json << "    \"by_layout\": {\n";
+        first = true;
+        for(const auto& [layout, count] : by_layout)
+        {
+            if(!first)
+                json << ",\n";
+            json << "      \"" << layout << "\": " << count;
+            first = false;
+        }
+        json << "\n    },\n";
+
+        // GFX architecture breakdown
+        json << "    \"by_gfx_arch\": {\n";
+        first = true;
+        for(const auto& [arch, count] : by_gfx_arch)
+        {
+            if(!first)
+                json << ",\n";
+            json << "      \"" << arch << "\": " << count;
+            first = false;
+        }
+        json << "\n    }\n";
+
+        json << "  },\n";
+    }
+
+    // Kernels list
+    json << "  \"kernels\": [\n";
+    for(size_t i = 0; i < all_kernels.size(); ++i)
+    {
+        json << export_kernel_json(*all_kernels[i]);
+        if(i < all_kernels.size() - 1)
+        {
+            json << ",";
+        }
+        json << "\n";
+    }
+    json << "  ]\n";
+
+    json << "}\n";
+
+    return json.str();
+}
+
+/// Export registry to a JSON file
+inline bool export_registry_json_to_file(const Registry& registry,
+                                         const std::string& filename,
+                                         bool include_statistics = true)
+{
+    std::string json = export_registry_json(registry, include_statistics);
+
+    std::ofstream file(filename);
+    if(!file.is_open())
+    {
+        return false;
+    }
+
+    file << json;
+    file.close();
+
+    return true;
+}
+
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/kernel_config.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/kernel_config.hpp
@@ -0,0 +1,370 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+/**
+ * @file kernel_config.hpp
+ * @brief Explicit kernel configuration for CK Tile Dispatcher
+ *
+ * This header provides a KernelConfig struct that mirrors the Python API,
+ * allowing explicit, self-contained kernel configuration without relying
+ * on force-included generated headers.
+ *
+ * Usage:
+ *   #include "ck_tile/dispatcher/kernel_config.hpp"
+ *   using namespace ck_tile::dispatcher;
+ *
+ *   // Step 1: Define explicit config
+ *   auto config = KernelConfig::fp16_rcr()
+ *       .tile(128, 128, 32)
+ *       .wave(2, 2, 1)
+ *       .warp_tile(32, 32, 16)
+ *       .pipeline(Pipeline::CompV4)
+ *       .scheduler(Scheduler::Intrawave);
+ *
+ *   // Step 2: Create registry and register
+ *   Registry registry;
+ *   registry.register_kernel(config.build_key(), config.get_name());
+ *
+ *   // Step 3: Create dispatcher
+ *   Dispatcher dispatcher(&registry);
+ *
+ *   // Step 4: Run GEMM
+ *   dispatcher.run(a, b, c, Problem(M, N, K));
+ */
+
+#pragma once
+
+#include "ck_tile/dispatcher/kernel_key.hpp"
+#include <sstream>
+#include <string>
+#include <iostream>
+
+namespace ck_tile {
+namespace dispatcher {
+
+/**
+ * @brief Explicit kernel configuration matching Python's KernelConfig
+ *
+ * This provides a fluent builder API for creating kernel configurations
+ * with all parameters visible and explicit.
+ */
+class KernelConfig
+{
+    public:
+    // =========================================================================
+    // Data types
+    // =========================================================================
+    DataType dtype_a   = DataType::FP16;
+    DataType dtype_b   = DataType::FP16;
+    DataType dtype_c   = DataType::FP16;
+    DataType dtype_acc = DataType::FP32;
+
+    // =========================================================================
+    // Layouts
+    // =========================================================================
+    LayoutTag layout_a = LayoutTag::RowMajor;
+    LayoutTag layout_b = LayoutTag::ColMajor;
+    LayoutTag layout_c = LayoutTag::RowMajor;
+
+    // =========================================================================
+    // Tile shape
+    // =========================================================================
+    int tile_m = 128;
+    int tile_n = 128;
+    int tile_k = 32;
+
+    // =========================================================================
+    // Wave shape (warps per block)
+    // =========================================================================
+    int wave_m = 2;
+    int wave_n = 2;
+    int wave_k = 1;
+
+    // =========================================================================
+    // Warp tile shape
+    // =========================================================================
+    int warp_m = 32;
+    int warp_n = 32;
+    int warp_k = 16;
+
+    // =========================================================================
+    // Block and pipeline
+    // =========================================================================
+    int block_size           = 256;
+    Pipeline pipeline_type   = Pipeline::CompV4;
+    Scheduler scheduler_type = Scheduler::Intrawave;
+    Epilogue epilogue_type   = Epilogue::CShuffle;
+
+    // =========================================================================
+    // Padding and features
+    // =========================================================================
+    bool pad_m      = true;
+    bool pad_n      = true;
+    bool pad_k      = true;
+    bool preshuffle = false;
+
+    // =========================================================================
+    // Target architecture
+    // =========================================================================
+    std::string gfx_arch = "gfx942";
+
+    // =========================================================================
+    // Fluent builder methods
+    // =========================================================================
+
+    /// Set tile dimensions (M x N x K)
+    KernelConfig& tile(int m, int n, int k)
+    {
+        tile_m = m;
+        tile_n = n;
+        tile_k = k;
+        return *this;
+    }
+
+    /// Set wave dimensions (warps per block M x N x K)
+    KernelConfig& wave(int m, int n, int k)
+    {
+        wave_m = m;
+        wave_n = n;
+        wave_k = k;
+        return *this;
+    }
+
+    /// Set warp tile dimensions (M x N x K)
+    KernelConfig& warp_tile(int m, int n, int k)
+    {
+        warp_m = m;
+        warp_n = n;
+        warp_k = k;
+        return *this;
+    }
+
+    /// Set block size
+    KernelConfig& block(int size)
+    {
+        block_size = size;
+        return *this;
+    }
+
+    /// Set pipeline type
+    KernelConfig& pipeline(Pipeline p)
+    {
+        pipeline_type = p;
+        return *this;
+    }
+
+    /// Set scheduler type
+    KernelConfig& scheduler(Scheduler s)
+    {
+        scheduler_type = s;
+        return *this;
+    }
+
+    /// Set epilogue type
+    KernelConfig& epilogue(Epilogue e)
+    {
+        epilogue_type = e;
+        return *this;
+    }
+
+    /// Set data types for A, B, C
+    KernelConfig& dtypes(DataType a, DataType b, DataType c, DataType acc = DataType::FP32)
+    {
+        dtype_a   = a;
+        dtype_b   = b;
+        dtype_c   = c;
+        dtype_acc = acc;
+        return *this;
+    }
+
+    /// Set layouts for A, B, C
+    KernelConfig& layouts(LayoutTag a, LayoutTag b, LayoutTag c)
+    {
+        layout_a = a;
+        layout_b = b;
+        layout_c = c;
+        return *this;
+    }
+
+    /// Set padding flags
+    KernelConfig& padding(bool m, bool n, bool k)
+    {
+        pad_m = m;
+        pad_n = n;
+        pad_k = k;
+        return *this;
+    }
+
+    /// Set target GPU architecture
+    KernelConfig& arch(const std::string& gpu)
+    {
+        gfx_arch = gpu;
+        return *this;
+    }
+
+    // =========================================================================
+    // Preset configurations
+    // =========================================================================
+
+    /// FP16 Row-Column-Row layout (most common)
+    static KernelConfig fp16_rcr() { return KernelConfig{}; }
+
+    /// FP16 Row-Row-Row layout
+    static KernelConfig fp16_rrr()
+    {
+        KernelConfig cfg;
+        cfg.layout_b = LayoutTag::RowMajor;
+        return cfg;
+    }
+
+    /// BF16 Row-Column-Row layout
+    static KernelConfig bf16_rcr()
+    {
+        KernelConfig cfg;
+        cfg.dtype_a = DataType::BF16;
+        cfg.dtype_b = DataType::BF16;
+        cfg.dtype_c = DataType::BF16;
+        return cfg;
+    }
+
+    /// FP32 Row-Column-Row layout
+    static KernelConfig fp32_rcr()
+    {
+        KernelConfig cfg;
+        cfg.dtype_a   = DataType::FP32;
+        cfg.dtype_b   = DataType::FP32;
+        cfg.dtype_c   = DataType::FP32;
+        cfg.dtype_acc = DataType::FP32;
+        return cfg;
+    }
+
+    // =========================================================================
+    // Build KernelKey
+    // =========================================================================
+
+    /// Build a KernelKey from this configuration
+    [[nodiscard]] KernelKey build_key() const
+    {
+        KernelKey key;
+
+        // Signature
+        key.signature.dtype_a             = dtype_a;
+        key.signature.dtype_b             = dtype_b;
+        key.signature.dtype_c             = dtype_c;
+        key.signature.dtype_acc           = dtype_acc;
+        key.signature.layout_a            = layout_a;
+        key.signature.layout_b            = layout_b;
+        key.signature.layout_c            = layout_c;
+        key.signature.transpose_a         = false;
+        key.signature.transpose_b         = false;
+        key.signature.grouped             = false;
+        key.signature.split_k             = 1;
+        key.signature.elementwise_op      = "PassThrough";
+        key.signature.num_d_tensors       = 0;
+        key.signature.structured_sparsity = false;
+
+        // Algorithm
+        key.algorithm.tile_shape      = {static_cast<std::uint16_t>(tile_m),
+                                         static_cast<std::uint16_t>(tile_n),
+                                         static_cast<std::uint16_t>(tile_k)};
+        key.algorithm.wave_shape      = {static_cast<std::uint8_t>(wave_m),
+                                         static_cast<std::uint8_t>(wave_n),
+                                         static_cast<std::uint8_t>(wave_k)};
+        key.algorithm.warp_tile_shape = {static_cast<std::uint8_t>(warp_m),
+                                         static_cast<std::uint8_t>(warp_n),
+                                         static_cast<std::uint8_t>(warp_k)};
+        key.algorithm.pipeline        = pipeline_type;
+        key.algorithm.scheduler       = scheduler_type;
+        key.algorithm.epilogue        = epilogue_type;
+        key.algorithm.block_size      = block_size;
+        key.algorithm.double_buffer   = true;
+        key.algorithm.persistent      = false;
+        key.algorithm.preshuffle      = preshuffle;
+        key.algorithm.transpose_c     = false;
+        key.algorithm.num_wave_groups = 1;
+
+        key.gfx_arch = gfx_arch;
+
+        return key;
+    }
+
+    // =========================================================================
+    // String representations
+    // =========================================================================
+
+    /// Get tile string (e.g., "128x128x32")
+    [[nodiscard]] std::string tile_str() const
+    {
+        std::ostringstream oss;
+        oss << tile_m << "x" << tile_n << "x" << tile_k;
+        return oss.str();
+    }
+
+    /// Get wave string (e.g., "2x2x1")
+    [[nodiscard]] std::string wave_str() const
+    {
+        std::ostringstream oss;
+        oss << wave_m << "x" << wave_n << "x" << wave_k;
+        return oss.str();
+    }
+
+    /// Get warp tile string (e.g., "32x32x16")
+    [[nodiscard]] std::string warp_tile_str() const
+    {
+        std::ostringstream oss;
+        oss << warp_m << "x" << warp_n << "x" << warp_k;
+        return oss.str();
+    }
+
+    /// Get layout string (e.g., "rcr")
+    [[nodiscard]] std::string layout_str() const
+    {
+        std::ostringstream oss;
+        oss << to_string(layout_a) << to_string(layout_b) << to_string(layout_c);
+        return oss.str();
+    }
+
+    /// Get kernel name for generated code lookup
+    [[nodiscard]] std::string get_name() const
+    {
+        std::ostringstream oss;
+        oss << "gemm_" << to_string(dtype_a) << "_" << layout_str() << "_"
+            << to_string(pipeline_type) << "_" << to_string(epilogue_type) << "_"
+            << to_string(scheduler_type) << "_" << (pad_m ? "True" : "False") << "_"
+            << (pad_n ? "True" : "False") << "_" << (pad_k ? "True" : "False") << "_"
+            << "False" // preshuffle
+            << "_" << tile_str() << "_" << wave_str() << "_" << warp_tile_str();
+        return oss.str();
+    }
+
+    /// Print configuration to stdout
+    void print_config(std::ostream& os = std::cout) const
+    {
+        os << "  Data types:\n";
+        os << "    dtype_a   = " << to_string(dtype_a) << "\n";
+        os << "    dtype_b   = " << to_string(dtype_b) << "\n";
+        os << "    dtype_c   = " << to_string(dtype_c) << "\n";
+        os << "    dtype_acc = " << to_string(dtype_acc) << "\n";
+        os << "  Layouts:\n";
+        os << "    layout_a = " << to_string(layout_a) << "\n";
+        os << "    layout_b = " << to_string(layout_b) << "\n";
+        os << "    layout_c = " << to_string(layout_c) << "\n";
+        os << "  Tile shape:\n";
+        os << "    tile = " << tile_str() << "\n";
+        os << "    wave = " << wave_str() << "\n";
+        os << "    warp_tile = " << warp_tile_str() << "\n";
+        os << "  Pipeline:\n";
+        os << "    pipeline  = " << to_string(pipeline_type) << "\n";
+        os << "    scheduler = " << to_string(scheduler_type) << "\n";
+        os << "    epilogue  = " << to_string(epilogue_type) << "\n";
+        os << "  Padding:\n";
+        os << "    pad_m = " << (pad_m ? "true" : "false") << "\n";
+        os << "    pad_n = " << (pad_n ? "true" : "false") << "\n";
+        os << "    pad_k = " << (pad_k ? "true" : "false") << "\n";
+        os << "  Target:\n";
+        os << "    gfx_arch = " << gfx_arch << "\n";
+    }
+};
+
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/kernel_decl.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/kernel_decl.hpp
@@ -0,0 +1,509 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+/**
+ * @file kernel_decl.hpp
+ * @brief Declarative kernel specification with KernelSet
+ *
+ * USAGE:
+ * ======
+ *
+ * // Named kernel sets
+ * DECL_KERNEL_SET(compute_bound,
+ *     .add("fp16", "rcr", 256, 256, 64)
+ *     .add("fp16", "rcr", 128, 128, 32)
+ * );
+ *
+ * // Access at runtime
+ * auto& set = KernelSetRegistry::instance().get("compute_bound");
+ */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+namespace ck_tile {
+namespace dispatcher {
+namespace decl {
+
+// =============================================================================
+// Wildcard constants
+// =============================================================================
+
+constexpr const char* ANY = "*";
+constexpr int ANY_INT     = -1;
+
+// =============================================================================
+// Signature Builder
+// =============================================================================
+
+class Signature
+{
+    public:
+    std::string dtype_a_        = "fp16";
+    std::string dtype_b_        = "fp16";
+    std::string dtype_c_        = "fp16";
+    std::string dtype_acc_      = "fp32";
+    std::string layout_a_       = "row";
+    std::string layout_b_       = "col";
+    std::string layout_c_       = "row";
+    std::string elementwise_op_ = "PassThrough";
+    int num_d_tensors_          = 0;
+    bool structured_sparsity_   = false;
+
+    Signature& dtype(const std::string& a,
+                     const std::string& b,
+                     const std::string& c,
+                     const std::string& acc = "fp32")
+    {
+        dtype_a_   = a;
+        dtype_b_   = b;
+        dtype_c_   = c;
+        dtype_acc_ = acc;
+        return *this;
+    }
+
+    Signature& dtype(const std::string& all)
+    {
+        dtype_a_ = dtype_b_ = dtype_c_ = all;
+        dtype_acc_                     = "fp32";
+        return *this;
+    }
+
+    Signature& layout(const std::string& a, const std::string& b, const std::string& c)
+    {
+        layout_a_ = a;
+        layout_b_ = b;
+        layout_c_ = c;
+        return *this;
+    }
+
+    Signature& layout(const std::string& combined)
+    {
+        if(combined.size() >= 3)
+        {
+            layout_a_ = (combined[0] == 'r') ? "row" : "col";
+            layout_b_ = (combined[1] == 'r') ? "row" : "col";
+            layout_c_ = (combined[2] == 'r') ? "row" : "col";
+        }
+        return *this;
+    }
+
+    Signature& elementwise(const std::string& op, int num_d = 0)
+    {
+        elementwise_op_ = op;
+        num_d_tensors_  = num_d;
+        return *this;
+    }
+
+    std::string layout_str() const
+    {
+        std::string r;
+        r += (layout_a_ == "col") ? 'c' : 'r';
+        r += (layout_b_ == "col") ? 'c' : 'r';
+        r += (layout_c_ == "col") ? 'c' : 'r';
+        return r;
+    }
+};
+
+// =============================================================================
+// Algorithm Builder
+// =============================================================================
+
+class Algorithm
+{
+    public:
+    int tile_m_ = 128, tile_n_ = 128, tile_k_ = 32;
+    int wave_m_ = ANY_INT, wave_n_ = ANY_INT, wave_k_ = 1;
+    int warp_m_ = ANY_INT, warp_n_ = ANY_INT, warp_k_ = 16;
+    std::string pipeline_  = "compv4";
+    std::string scheduler_ = "intrawave";
+    std::string epilogue_  = "cshuffle";
+    int block_size_        = 256;
+    int pad_m_ = 1, pad_n_ = 1, pad_k_ = 1;
+    bool preshuffle_ = false;
+
+    Algorithm& tile(int m, int n, int k)
+    {
+        tile_m_ = m;
+        tile_n_ = n;
+        tile_k_ = k;
+        return *this;
+    }
+
+    Algorithm& wave(int m, int n, int k = 1)
+    {
+        wave_m_ = m;
+        wave_n_ = n;
+        wave_k_ = k;
+        return *this;
+    }
+
+    Algorithm& warp(int m, int n, int k = 16)
+    {
+        warp_m_ = m;
+        warp_n_ = n;
+        warp_k_ = k;
+        return *this;
+    }
+
+    Algorithm& pipeline(const std::string& p)
+    {
+        pipeline_ = p;
+        return *this;
+    }
+    Algorithm& scheduler(const std::string& s)
+    {
+        scheduler_ = s;
+        return *this;
+    }
+    Algorithm& epilogue(const std::string& e)
+    {
+        epilogue_ = e;
+        return *this;
+    }
+
+    Algorithm& pad(bool m, bool n, bool k)
+    {
+        pad_m_ = m ? 1 : 0;
+        pad_n_ = n ? 1 : 0;
+        pad_k_ = k ? 1 : 0;
+        return *this;
+    }
+
+    Algorithm& preshuffle(bool v)
+    {
+        preshuffle_ = v;
+        return *this;
+    }
+
+    bool needs_expansion() const
+    {
+        return wave_m_ == ANY_INT || warp_m_ == ANY_INT || pipeline_ == "*" || pad_m_ == ANY_INT;
+    }
+
+    void auto_fill()
+    {
+        if(wave_m_ == ANY_INT)
+            wave_m_ = 2;
+        if(wave_n_ == ANY_INT)
+            wave_n_ = 2;
+        if(wave_k_ == ANY_INT)
+            wave_k_ = 1;
+        if(warp_m_ == ANY_INT)
+            warp_m_ = 32;
+        if(warp_n_ == ANY_INT)
+            warp_n_ = 32;
+        if(warp_k_ == ANY_INT)
+            warp_k_ = 16;
+    }
+};
+
+// =============================================================================
+// Kernel Declaration
+// =============================================================================
+
+struct KernelDecl
+{
+    Signature signature;
+    Algorithm algorithm;
+    std::string arch = "gfx942";
+
+    KernelDecl() = default;
+
+    KernelDecl(const Signature& sig, const Algorithm& algo, const std::string& a = "gfx942")
+        : signature(sig), algorithm(algo), arch(a)
+    {
+    }
+
+    std::string name() const
+    {
+        std::ostringstream oss;
+        oss << signature.dtype_a_ << "_" << signature.layout_str();
+        if(algorithm.tile_m_ > 0)
+        {
+            oss << "_" << algorithm.tile_m_ << "x" << algorithm.tile_n_ << "x" << algorithm.tile_k_;
+        }
+        return oss.str();
+    }
+
+    bool has_wildcards() const { return algorithm.needs_expansion() || arch == "*"; }
+};
+
+// =============================================================================
+// KernelSet - Collection of declarations
+// =============================================================================
+
+class KernelSet
+{
+    public:
+    KernelSet() = default;
+
+    KernelSet& add(const Signature& sig, const Algorithm& algo, const std::string& arch = "gfx942")
+    {
+        decls_.emplace_back(sig, algo, arch);
+        return *this;
+    }
+
+    KernelSet& add(const std::string& dtype,
+                   const std::string& layout,
+                   int tm,
+                   int tn,
+                   int tk,
+                   const std::string& arch = "gfx942")
+    {
+        Signature sig;
+        sig.dtype(dtype).layout(layout);
+        Algorithm algo;
+        algo.tile(tm, tn, tk);
+        decls_.emplace_back(sig, algo, arch);
+        return *this;
+    }
+
+    KernelSet& add(const KernelDecl& decl)
+    {
+        decls_.push_back(decl);
+        return *this;
+    }
+
+    KernelSet& merge(const KernelSet& other)
+    {
+        decls_.insert(decls_.end(), other.decls_.begin(), other.decls_.end());
+        return *this;
+    }
+
+    const std::vector<KernelDecl>& declarations() const { return decls_; }
+    size_t size() const { return decls_.size(); }
+
+    bool needs_expansion() const
+    {
+        for(const auto& d : decls_)
+        {
+            if(d.algorithm.needs_expansion())
+                return true;
+        }
+        return false;
+    }
+
+    void print(std::ostream& os = std::cout) const
+    {
+        os << "KernelSet (" << size() << " declarations):\n";
+        for(const auto& d : decls_)
+        {
+            os << "  - " << d.name();
+            if(d.algorithm.needs_expansion())
+                os << " [expands]";
+            os << "\n";
+        }
+    }
+
+    KernelSet& tag(const std::string& t)
+    {
+        tag_ = t;
+        return *this;
+    }
+    std::string tag() const { return tag_; }
+
+    private:
+    std::vector<KernelDecl> decls_;
+    std::string tag_;
+};
+
+// =============================================================================
+// KernelSet Registry
+// =============================================================================
+
+class KernelSetRegistry
+{
+    public:
+    static KernelSetRegistry& instance()
+    {
+        static KernelSetRegistry reg;
+        return reg;
+    }
+
+    void add(const std::string& name, const KernelSet& set)
+    {
+        sets_[name] = set;
+        order_.push_back(name);
+    }
+
+    const KernelSet& get(const std::string& name) const
+    {
+        static KernelSet empty;
+        auto it = sets_.find(name);
+        return it != sets_.end() ? it->second : empty;
+    }
+
+    bool has(const std::string& name) const { return sets_.find(name) != sets_.end(); }
+
+    // Return const reference to avoid deep copy
+    const std::vector<std::string>& names() const { return order_; }
+    size_t size() const { return sets_.size(); }
+
+    void print() const
+    {
+        std::cout << "Named Kernel Sets (" << size() << "):\n";
+        for(const auto& name : order_)
+        {
+            const auto& set = sets_.at(name);
+            std::cout << "  " << name << ": " << set.size() << " declarations\n";
+        }
+    }
+
+    private:
+    KernelSetRegistry() = default;
+    std::unordered_map<std::string, KernelSet> sets_;
+    std::vector<std::string> order_;
+};
+
+// =============================================================================
+// Declaration Registry (for DECL_KERNEL)
+// =============================================================================
+
+class Registry
+{
+    public:
+    static Registry& instance()
+    {
+        static Registry reg;
+        return reg;
+    }
+
+    void add(const KernelDecl& decl)
+    {
+        std::string key    = decl.has_wildcards()
+                                 ? ("wildcard_" + std::to_string(declarations_.size()))
+                                 : decl.name();
+        declarations_[key] = decl;
+        order_.push_back(key);
+    }
+
+    std::vector<KernelDecl> all() const
+    {
+        std::vector<KernelDecl> result;
+        for(const auto& key : order_)
+        {
+            result.push_back(declarations_.at(key));
+        }
+        return result;
+    }
+
+    size_t size() const { return declarations_.size(); }
+
+    void print() const
+    {
+        std::cout << "Declared kernels (" << size() << "):\n";
+        for(const auto& key : order_)
+        {
+            const auto& d = declarations_.at(key);
+            std::cout << "  " << d.name();
+            if(d.has_wildcards())
+                std::cout << " [wildcards]";
+            std::cout << "\n";
+        }
+    }
+
+    private:
+    Registry() = default;
+    std::unordered_map<std::string, KernelDecl> declarations_;
+    std::vector<std::string> order_;
+};
+
+// =============================================================================
+// Static Registrars
+// =============================================================================
+
+struct Declarator
+{
+    Declarator(const Signature& sig, const Algorithm& algo, const std::string& arch = "gfx942")
+    {
+        Registry::instance().add(KernelDecl(sig, algo, arch));
+    }
+
+    Declarator(const std::string& dtype,
+               const std::string& layout,
+               int tm,
+               int tn,
+               int tk,
+               const std::string& arch = "gfx942")
+    {
+        Signature sig;
+        sig.dtype(dtype).layout(layout);
+        Algorithm algo;
+        algo.tile(tm, tn, tk);
+        Registry::instance().add(KernelDecl(sig, algo, arch));
+    }
+
+    Declarator(const std::string& dtype, const std::string& layout, const std::string& arch)
+    {
+        Signature sig;
+        sig.dtype(dtype).layout(layout);
+        Algorithm algo;
+        algo.tile(ANY_INT, ANY_INT, ANY_INT);
+        Registry::instance().add(KernelDecl(sig, algo, arch));
+    }
+};
+
+struct KernelSetRegistrar
+{
+    KernelSetRegistrar(const std::string& name, const KernelSet& set)
+    {
+        KernelSetRegistry::instance().add(name, set);
+    }
+};
+
+} // namespace decl
+
+// =============================================================================
+// Convenience Aliases
+// =============================================================================
+
+using KernelSignature    = decl::Signature;
+using KernelAlgorithm    = decl::Algorithm;
+using KernelDecl         = decl::KernelDecl;
+using KernelDeclRegistry = decl::Registry;
+using KernelSet          = decl::KernelSet;
+using KernelSetRegistry  = decl::KernelSetRegistry;
+
+constexpr const char* ANY = decl::ANY;
+constexpr int ANY_INT     = decl::ANY_INT;
+
+} // namespace dispatcher
+} // namespace ck_tile
+
+// =============================================================================
+// Declaration Macros
+// =============================================================================
+
+#define CK_DECL_CAT_(a, b) CK_DECL_CAT_IMPL_(a, b)
+#define CK_DECL_CAT_IMPL_(a, b) a##b
+
+// Note: __extension__ suppresses warnings about __COUNTER__ being a GCC/Clang extension
+#define DECL_KERNEL(sig, algo, ...)                                            \
+    __extension__ static ::ck_tile::dispatcher::decl::Declarator CK_DECL_CAT_( \
+        _kdecl_, __COUNTER__)(sig, algo, ##__VA_ARGS__)
+
+#define DECL_KERNEL_SIMPLE(dtype, layout, tm, tn, tk)                          \
+    __extension__ static ::ck_tile::dispatcher::decl::Declarator CK_DECL_CAT_( \
+        _kdecl_, __COUNTER__)(#dtype, #layout, tm, tn, tk)
+
+#define DECL_KERNEL_ALL(dtype, layout)                                         \
+    __extension__ static ::ck_tile::dispatcher::decl::Declarator CK_DECL_CAT_( \
+        _kdecl_, __COUNTER__)(#dtype, #layout, "*")
+
+#define DECL_KERNEL_SET(name, ...)                                                     \
+    __extension__ static ::ck_tile::dispatcher::decl::KernelSetRegistrar CK_DECL_CAT_( \
+        _kset_reg_, __COUNTER__)(#name,                                                \
+                                 ::ck_tile::dispatcher::decl::KernelSet() __VA_ARGS__.tag(#name))
+
+#define KERNEL_SET(name) ::ck_tile::dispatcher::decl::KernelSet name
+#define BEGIN_KERNEL_SET() ::ck_tile::dispatcher::decl::KernelSet()
+
+// Legacy compatibility
+// Legacy aliases removed - use DECL_KERNEL_SET instead
--- a/dispatcher/include/ck_tile/dispatcher/kernel_instance.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/kernel_instance.hpp
@@ -0,0 +1,68 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck_tile/dispatcher/kernel_key.hpp"
+#include "ck_tile/dispatcher/problem.hpp"
+#include <memory>
+#include <string>
+
+namespace ck_tile {
+namespace dispatcher {
+
+/// KernelInstance: Uniform interface for kernel execution
+/// Abstracts away implementation details (CK Library vs CK Tile vs future JIT)
+/// Enables type-erased storage in registry while backends perform type-safe casts
+class KernelInstance
+{
+    public:
+    virtual ~KernelInstance() = default;
+
+    /// Get the kernel's configuration metadata
+    [[nodiscard]] virtual const KernelKey& get_key() const = 0;
+
+    /// Check if this kernel supports the given problem
+    /// Returns false if problem dimensions don't meet kernel requirements
+    /// (e.g., divisibility constraints, resource limits)
+    [[nodiscard]] virtual bool supports(const Problem& problem) const = 0;
+
+    /// Get human-readable kernel name for logging and debugging
+    [[nodiscard]] virtual std::string get_name() const = 0;
+
+    /// Execute the kernel with given problem and data pointers
+    /// @param a_ptr Pointer to matrix A (device memory)
+    /// @param b_ptr Pointer to matrix B (device memory)
+    /// @param c_ptr Pointer to matrix C (device memory, input/output)
+    /// @param d_ptrs Array of pointers to additional D tensors for fusion (device memory)
+    /// @param problem Problem configuration
+    /// @param stream HIP stream for kernel launch (nullptr = default stream)
+    /// @return Kernel execution time in milliseconds (0 if timing not available)
+    [[nodiscard]] virtual float run(const void* a_ptr,
+                                    const void* b_ptr,
+                                    void* c_ptr,
+                                    const void** d_ptrs,
+                                    const Problem& problem,
+                                    void* stream = nullptr) const = 0;
+
+    /// Validate kernel output against reference implementation
+    /// @param a_ptr Pointer to matrix A (device memory)
+    /// @param b_ptr Pointer to matrix B (device memory)
+    /// @param c_ptr Pointer to matrix C (device memory, kernel output)
+    /// @param d_ptrs Array of pointers to additional D tensors (device memory)
+    /// @param problem Problem configuration
+    /// @param tolerance Relative error tolerance for validation
+    /// @return true if validation passes, false otherwise
+    [[nodiscard]] virtual bool validate(const void* a_ptr,
+                                        const void* b_ptr,
+                                        const void* c_ptr,
+                                        const void** d_ptrs,
+                                        const Problem& problem,
+                                        float tolerance = 1e-3f) const = 0;
+};
+
+/// Shared pointer type for kernel instances
+using KernelInstancePtr = std::shared_ptr<KernelInstance>;
+
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/kernel_key.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/kernel_key.hpp
@@ -0,0 +1,428 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <sstream>
+#include <string>
+#include <tuple>
+
+namespace ck_tile {
+namespace dispatcher {
+
+/// Data types supported by CK Tile GEMM kernels
+/// Matches tile_engine DATA_TYPE_MAP for full compatibility
+enum class DataType : std::uint8_t
+{
+    FP16,  // ck_tile::half_t
+    BF16,  // ck_tile::bf16_t
+    FP32,  // float
+    FP64,  // double
+    FP8,   // ck_tile::fp8_t (E4M3)
+    BF8,   // ck_tile::bf8_t (E5M2)
+    INT8,  // ck_tile::int8_t
+    INT4,  // ck_tile::pk_int4_t (packed int4)
+    INT32, // ck_tile::int32_t
+    UNKNOWN
+};
+
+/// Memory layout tags for tensors
+enum class LayoutTag : std::uint8_t
+{
+    RowMajor,
+    ColMajor,
+    PackedExternal
+};
+
+/// Pipeline variants for memory/compute optimization
+/// Matches tile_engine PIPELINE_MAP for full compatibility
+enum class Pipeline : std::uint8_t
+{
+    Mem,          // Memory-bound pipeline
+    CompV1,       // Compute pipeline v1
+    CompV2,       // Compute pipeline v2
+    CompV3,       // Compute pipeline v3
+    CompV4,       // Compute pipeline v4 (double buffering)
+    CompV5,       // Compute pipeline v5
+    PreShuffleV1, // Weight preshuffle pipeline v1
+    PreShuffleV2  // Weight preshuffle pipeline v2 (optimized)
+};
+
+/// Epilogue strategies for output processing
+/// Matches tile_engine epilogue options for full compatibility
+enum class Epilogue : std::uint8_t
+{
+    None,
+    Default,       // DefaultGemm2DEpilogue
+    CShuffle,      // CShuffleEpilogue (cross-shuffle)
+    Bias,          // Bias addition
+    Activation,    // Fused activation
+    BiasActivation // Fused bias + activation
+};
+
+/// Scheduler types for wave coordination
+enum class Scheduler : std::uint8_t
+{
+    Auto,
+    Intrawave,
+    Interwave
+};
+
+/// KernelKey: Compile-time kernel configuration metadata
+/// Organized into Signature (what operation) and Algorithm (how it's implemented)
+struct KernelKey
+{
+    /// Signature: Describes WHAT operation is computed (mathematical semantics)
+    /// Two kernels with different signatures compute different mathematical operations
+    struct Signature
+    {
+        DataType dtype_a;
+        DataType dtype_b;
+        DataType dtype_c;
+        DataType dtype_acc;
+        LayoutTag layout_a;
+        LayoutTag layout_b;
+        LayoutTag layout_c;
+        bool transpose_a;
+        bool transpose_b;
+        bool grouped;
+        std::uint8_t split_k;
+
+        // Element-wise fusion: Describes mathematical operation applied to GEMM output
+        // Examples: PassThrough (C = A*B), MultiDAdd (E = C + D0 + D1),
+        //           MultiDMultiply (E = C * D0 * D1), Clamp, Relu, Gelu, etc.
+        // This affects the mathematical result, so it belongs in Signature
+        std::string elementwise_op; // e.g., "PassThrough", "MultiDAdd", "Relu"
+        std::uint8_t
+            num_d_tensors; // Number of additional input tensors for fusion (0 for basic GEMM)
+
+        bool structured_sparsity; // 2:4 sparsity affects mathematical correctness
+    } signature;
+
+    /// Algorithm: Describes HOW it's implemented (performance tuning parameters)
+    /// Two kernels with same signature but different algorithms compute the same result
+    /// with different performance characteristics
+    struct Algorithm
+    {
+        // Hierarchical tiling configuration (primary tuning knobs)
+        struct TileShape
+        {
+            std::uint16_t m;
+            std::uint16_t n;
+            std::uint16_t k;
+        } tile_shape;
+
+        struct WaveShape
+        {
+            std::uint8_t m; // WarpPerBlock_M in generated kernels
+            std::uint8_t n; // WarpPerBlock_N
+            std::uint8_t k; // WarpPerBlock_K
+        } wave_shape;
+
+        struct WarpTileShape
+        {
+            std::uint8_t m; // WarpTileM in generated kernels
+            std::uint8_t n; // WarpTileN
+            std::uint8_t k; // WarpTileK
+        } warp_tile_shape;
+
+        // Pipeline and scheduling strategy
+        Pipeline pipeline;
+        Scheduler scheduler;
+        Epilogue epilogue;
+
+        // Block and memory configuration
+        std::uint16_t block_size;     // BlockSize in generated kernels (typically 256)
+        bool double_buffer;           // DoubleSmemBuffer (true for compv4)
+        bool persistent;              // UsePersistentKernel
+        bool preshuffle;              // Preshuffle (for weight preshuffle variants)
+        bool transpose_c;             // TransposeC
+        std::uint8_t num_wave_groups; // NumWaveGroups
+    } algorithm;
+
+    std::string gfx_arch; // e.g. "gfx942", "gfx90a", "gfx908"
+
+    /// Generate a unique string identifier for this kernel configuration
+    /// Format matches tile_engine naming convention for registry lookup
+    /// Note: Defined after to_string() functions to use them
+    [[nodiscard]] std::string encode_identifier() const;
+
+    /// Create a tuple of all fields for comparison operators
+    auto tie() const
+    {
+        return std::tie(signature.dtype_a,
+                        signature.dtype_b,
+                        signature.dtype_c,
+                        signature.dtype_acc,
+                        signature.layout_a,
+                        signature.layout_b,
+                        signature.layout_c,
+                        signature.transpose_a,
+                        signature.transpose_b,
+                        signature.grouped,
+                        signature.split_k,
+                        signature.elementwise_op,
+                        signature.num_d_tensors,
+                        signature.structured_sparsity,
+                        algorithm.tile_shape.m,
+                        algorithm.tile_shape.n,
+                        algorithm.tile_shape.k,
+                        algorithm.wave_shape.m,
+                        algorithm.wave_shape.n,
+                        algorithm.wave_shape.k,
+                        algorithm.warp_tile_shape.m,
+                        algorithm.warp_tile_shape.n,
+                        algorithm.warp_tile_shape.k,
+                        algorithm.pipeline,
+                        algorithm.epilogue,
+                        algorithm.scheduler,
+                        algorithm.block_size,
+                        gfx_arch,
+                        signature.structured_sparsity,
+                        algorithm.persistent,
+                        algorithm.double_buffer,
+                        algorithm.preshuffle,
+                        algorithm.transpose_c,
+                        algorithm.num_wave_groups);
+    }
+
+    /// Equality comparison
+    friend bool operator==(const KernelKey& lhs, const KernelKey& rhs)
+    {
+        return lhs.tie() == rhs.tie();
+    }
+
+    /// Inequality comparison
+    friend bool operator!=(const KernelKey& lhs, const KernelKey& rhs) { return !(lhs == rhs); }
+};
+
+// =============================================================================
+// String Conversion Helpers (for serialization and debugging)
+// =============================================================================
+
+/// Convert DataType to string
+inline std::string to_string(DataType dtype)
+{
+    switch(dtype)
+    {
+    case DataType::FP16: return "fp16";
+    case DataType::BF16: return "bf16";
+    case DataType::FP32: return "fp32";
+    case DataType::FP64: return "fp64";
+    case DataType::FP8: return "fp8";
+    case DataType::BF8: return "bf8";
+    case DataType::INT8: return "int8";
+    case DataType::INT4: return "int4";
+    case DataType::INT32: return "int32";
+    default: return "unknown";
+    }
+}
+
+/// Convert string to DataType
+inline DataType string_to_dtype(const std::string& str)
+{
+    if(str == "fp16")
+        return DataType::FP16;
+    if(str == "bf16")
+        return DataType::BF16;
+    if(str == "fp32")
+        return DataType::FP32;
+    if(str == "fp64")
+        return DataType::FP64;
+    if(str == "fp8")
+        return DataType::FP8;
+    if(str == "bf8")
+        return DataType::BF8;
+    if(str == "int8")
+        return DataType::INT8;
+    if(str == "int4")
+        return DataType::INT4;
+    if(str == "int32")
+        return DataType::INT32;
+    return DataType::UNKNOWN;
+}
+
+/// Convert LayoutTag to string
+inline std::string to_string(LayoutTag layout)
+{
+    switch(layout)
+    {
+    case LayoutTag::RowMajor: return "r";
+    case LayoutTag::ColMajor: return "c";
+    case LayoutTag::PackedExternal: return "p";
+    default: return "?";
+    }
+}
+
+/// Convert string to LayoutTag
+inline LayoutTag string_to_layout(const std::string& str)
+{
+    if(str == "r" || str == "row" || str == "RowMajor")
+        return LayoutTag::RowMajor;
+    if(str == "c" || str == "col" || str == "ColMajor")
+        return LayoutTag::ColMajor;
+    if(str == "p" || str == "packed")
+        return LayoutTag::PackedExternal;
+    return LayoutTag::RowMajor; // Default
+}
+
+/// Convert Pipeline to string
+inline std::string to_string(Pipeline pipeline)
+{
+    switch(pipeline)
+    {
+    case Pipeline::Mem: return "mem";
+    case Pipeline::CompV1: return "compv1";
+    case Pipeline::CompV2: return "compv2";
+    case Pipeline::CompV3: return "compv3";
+    case Pipeline::CompV4: return "compv4";
+    case Pipeline::CompV5: return "compv5";
+    case Pipeline::PreShuffleV1: return "preshufflev1";
+    case Pipeline::PreShuffleV2: return "preshufflev2";
+    default: return "unknown";
+    }
+}
+
+/// Convert string to Pipeline
+inline Pipeline string_to_pipeline(const std::string& str)
+{
+    if(str == "mem")
+        return Pipeline::Mem;
+    if(str == "compv1")
+        return Pipeline::CompV1;
+    if(str == "compv2")
+        return Pipeline::CompV2;
+    if(str == "compv3")
+        return Pipeline::CompV3;
+    if(str == "compv4")
+        return Pipeline::CompV4;
+    if(str == "compv5")
+        return Pipeline::CompV5;
+    if(str == "preshufflev1")
+        return Pipeline::PreShuffleV1;
+    if(str == "preshufflev2")
+        return Pipeline::PreShuffleV2;
+    return Pipeline::Mem; // Default
+}
+
+/// Convert Epilogue to string
+inline std::string to_string(Epilogue epilogue)
+{
+    switch(epilogue)
+    {
+    case Epilogue::None: return "none";
+    case Epilogue::Default: return "default";
+    case Epilogue::CShuffle: return "cshuffle";
+    case Epilogue::Bias: return "bias";
+    case Epilogue::Activation: return "activation";
+    case Epilogue::BiasActivation: return "bias_activation";
+    default: return "unknown";
+    }
+}
+
+/// Convert string to Epilogue
+inline Epilogue string_to_epilogue(const std::string& str)
+{
+    if(str == "none")
+        return Epilogue::None;
+    if(str == "default")
+        return Epilogue::Default;
+    if(str == "cshuffle")
+        return Epilogue::CShuffle;
+    if(str == "bias")
+        return Epilogue::Bias;
+    if(str == "activation")
+        return Epilogue::Activation;
+    if(str == "bias_activation")
+        return Epilogue::BiasActivation;
+    return Epilogue::Default; // Default
+}
+
+/// Convert Scheduler to string
+inline std::string to_string(Scheduler scheduler)
+{
+    switch(scheduler)
+    {
+    case Scheduler::Auto: return "auto";
+    case Scheduler::Intrawave: return "intrawave";
+    case Scheduler::Interwave: return "interwave";
+    default: return "unknown";
+    }
+}
+
+/// Convert string to Scheduler
+inline Scheduler string_to_scheduler(const std::string& str)
+{
+    if(str == "auto")
+        return Scheduler::Auto;
+    if(str == "intrawave")
+        return Scheduler::Intrawave;
+    if(str == "interwave")
+        return Scheduler::Interwave;
+    return Scheduler::Intrawave; // Default
+}
+
+/// Common elementwise operations (for reference in elementwise_op field)
+/// These match CK Tile's ck_tile::element_wise namespace
+namespace ElementwiseOps {
+constexpr const char* PassThrough    = "PassThrough";
+constexpr const char* Add            = "Add";
+constexpr const char* Multiply       = "Multiply";
+constexpr const char* MultiDAdd      = "MultiDAdd";
+constexpr const char* MultiDMultiply = "MultiDMultiply";
+constexpr const char* Relu           = "Relu";
+constexpr const char* Gelu           = "Gelu";
+constexpr const char* Clamp          = "Clamp";
+constexpr const char* Sigmoid        = "Sigmoid";
+constexpr const char* Tanh           = "Tanh";
+constexpr const char* Swish          = "Swish";
+constexpr const char* HardSwish      = "HardSwish";
+} // namespace ElementwiseOps
+
+// =============================================================================
+// KernelKey::encode_identifier() implementation
+// Defined after to_string() functions to use them
+// =============================================================================
+
+inline std::string KernelKey::encode_identifier() const
+{
+    std::ostringstream oss;
+
+    // Include data types and layout for uniqueness across different signatures
+    oss << to_string(signature.dtype_a) << "_";
+    oss << to_string(signature.layout_a) << to_string(signature.layout_b)
+        << to_string(signature.layout_c) << "_";
+
+    // Include pipeline, scheduler, epilogue for uniqueness
+    oss << to_string(algorithm.pipeline) << "_";
+    oss << to_string(algorithm.scheduler) << "_";
+    oss << to_string(algorithm.epilogue) << "_";
+
+    // Match tile_engine naming: tile_m x tile_n x tile_k _ warp_m x warp_n x warp_k _
+    // warp_tile_m x warp_tile_n x warp_tile_k
+    oss << algorithm.tile_shape.m << "x" << algorithm.tile_shape.n << "x" << algorithm.tile_shape.k
+        << "_" << unsigned(algorithm.wave_shape.m) << "x" << unsigned(algorithm.wave_shape.n) << "x"
+        << unsigned(algorithm.wave_shape.k) << "_" << unsigned(algorithm.warp_tile_shape.m) << "x"
+        << unsigned(algorithm.warp_tile_shape.n) << "x" << unsigned(algorithm.warp_tile_shape.k);
+
+    // Add trait flags
+    oss << "_" << (algorithm.persistent ? "persist" : "nopers");
+
+    if(signature.split_k > 1)
+        oss << "_splitk" << unsigned(signature.split_k);
+    if(!signature.elementwise_op.empty() && signature.elementwise_op != "PassThrough")
+        oss << "_" << signature.elementwise_op;
+    if(signature.num_d_tensors > 0)
+        oss << "_d" << unsigned(signature.num_d_tensors);
+    if(signature.structured_sparsity)
+        oss << "_sparse";
+    if(algorithm.preshuffle)
+        oss << "_preshuffle";
+
+    return oss.str();
+}
+
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/problem.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/problem.hpp
@@ -0,0 +1,311 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+
+namespace ck_tile {
+namespace dispatcher {
+
+// =============================================================================
+// Tensor Information for Automatic MNK Inference
+// =============================================================================
+
+/// TensorShape: Describes tensor dimensions for automatic MNK inference
+struct TensorShape
+{
+    std::int64_t rows;  // First dimension
+    std::int64_t cols;  // Second dimension
+    bool is_transposed; // Whether the tensor is transposed (column-major)
+
+    TensorShape() : rows(0), cols(0), is_transposed(false) {}
+    TensorShape(std::int64_t r, std::int64_t c, bool trans = false)
+        : rows(r), cols(c), is_transposed(trans)
+    {
+    }
+
+    /// Get logical M (rows when not transposed)
+    [[nodiscard]] std::int64_t logical_rows() const { return is_transposed ? cols : rows; }
+
+    /// Get logical N (cols when not transposed)
+    [[nodiscard]] std::int64_t logical_cols() const { return is_transposed ? rows : cols; }
+};
+
+// =============================================================================
+// Problem: Runtime Parameters
+// =============================================================================
+
+/// Problem: Runtime parameters for kernel invocation
+/// Captures problem dimensions and resource constraints that vary between invocations
+/// even when using the same kernel
+struct Problem
+{
+    // Problem dimensions
+    std::int64_t M; // Number of rows in A and C
+    std::int64_t N; // Number of columns in B and C
+    std::int64_t K; // Shared dimension (columns of A, rows of B)
+
+    // Batch configuration
+    std::int32_t k_batch; // Number of K-dimension splits for split-K GEMM
+
+    // Resource preferences
+    std::int32_t smem_budget; // Shared memory budget in bytes (0 = no constraint)
+    bool prefer_persistent;   // Prefer persistent kernel variants
+
+    // Validation control
+    bool enable_validation; // Enable output validation against reference
+
+    /// Default constructor with sensible defaults
+    Problem()
+        : M(0),
+          N(0),
+          K(0),
+          k_batch(1),
+          smem_budget(0),
+          prefer_persistent(false),
+          enable_validation(false)
+    {
+    }
+
+    /// Constructor with problem dimensions
+    Problem(std::int64_t m, std::int64_t n, std::int64_t k)
+        : M(m),
+          N(n),
+          K(k),
+          k_batch(1),
+          smem_budget(0),
+          prefer_persistent(false),
+          enable_validation(false)
+    {
+    }
+
+    /// Check if problem dimensions are valid
+    [[nodiscard]] bool is_valid() const { return M > 0 && N > 0 && K > 0 && k_batch > 0; }
+
+    /// Get total number of operations (for performance metrics)
+    [[nodiscard]] std::int64_t num_ops() const
+    {
+        return 2 * M * N * K; // Multiply-add counts as 2 ops
+    }
+
+    // =========================================================================
+    // Factory Methods for Automatic MNK Inference
+    // =========================================================================
+
+    /**
+     * Create Problem by inferring MNK from tensor shapes.
+     *
+     * For GEMM: C[M,N] = A[M,K] × B[K,N]
+     *
+     * @param a_shape Shape of matrix A (M x K, or K x M if transposed)
+     * @param b_shape Shape of matrix B (K x N, or N x K if transposed)
+     * @param c_shape Shape of matrix C (M x N) - used for validation
+     * @throws std::invalid_argument if dimensions are inconsistent
+     *
+     * Example:
+     *   // A is 512x256, B is 256x1024, C is 512x1024
+     *   auto problem = Problem::from_shapes({512, 256}, {256, 1024}, {512, 1024});
+     *   // Infers: M=512, N=1024, K=256
+     */
+    [[nodiscard]] static Problem
+    from_shapes(TensorShape a_shape, TensorShape b_shape, TensorShape c_shape)
+    {
+        // For C = A × B:
+        // A: [M, K] (or [K, M] if transposed)
+        // B: [K, N] (or [N, K] if transposed)
+        // C: [M, N]
+
+        std::int64_t M_from_A = a_shape.logical_rows();
+        std::int64_t K_from_A = a_shape.logical_cols();
+        std::int64_t K_from_B = b_shape.logical_rows();
+        std::int64_t N_from_B = b_shape.logical_cols();
+        std::int64_t M_from_C = c_shape.logical_rows();
+        std::int64_t N_from_C = c_shape.logical_cols();
+
+        // Validate K dimension matches between A and B
+        if(K_from_A != K_from_B)
+        {
+            throw std::invalid_argument(
+                "K dimension mismatch: A has K=" + std::to_string(K_from_A) +
+                ", B has K=" + std::to_string(K_from_B));
+        }
+
+        // Validate M dimension matches between A and C
+        if(M_from_A != M_from_C)
+        {
+            throw std::invalid_argument(
+                "M dimension mismatch: A has M=" + std::to_string(M_from_A) +
+                ", C has M=" + std::to_string(M_from_C));
+        }
+
+        // Validate N dimension matches between B and C
+        if(N_from_B != N_from_C)
+        {
+            throw std::invalid_argument(
+                "N dimension mismatch: B has N=" + std::to_string(N_from_B) +
+                ", C has N=" + std::to_string(N_from_C));
+        }
+
+        return Problem(M_from_A, N_from_B, K_from_A);
+    }
+
+    /**
+     * Create Problem from tensor dimensions (simple version without transpose).
+     *
+     * @param a_rows Rows of matrix A (= M)
+     * @param a_cols Columns of matrix A (= K)
+     * @param b_rows Rows of matrix B (= K)
+     * @param b_cols Columns of matrix B (= N)
+     * @param c_rows Rows of matrix C (= M) - for validation
+     * @param c_cols Columns of matrix C (= N) - for validation
+     * @throws std::invalid_argument if dimensions are inconsistent
+     *
+     * Example:
+     *   // A[512,256] × B[256,1024] = C[512,1024]
+     *   auto problem = Problem::from_dimensions(512, 256, 256, 1024, 512, 1024);
+     */
+    [[nodiscard]] static Problem from_dimensions(std::int64_t a_rows,
+                                                 std::int64_t a_cols,
+                                                 std::int64_t b_rows,
+                                                 std::int64_t b_cols,
+                                                 std::int64_t c_rows,
+                                                 std::int64_t c_cols)
+    {
+        return from_shapes(
+            TensorShape(a_rows, a_cols), TensorShape(b_rows, b_cols), TensorShape(c_rows, c_cols));
+    }
+
+    /**
+     * Create Problem from A and B dimensions only (C is inferred).
+     *
+     * @param a_rows Rows of matrix A (= M)
+     * @param a_cols Columns of matrix A (= K)
+     * @param b_rows Rows of matrix B (= K) - validated
+     * @param b_cols Columns of matrix B (= N)
+     * @throws std::invalid_argument if K dimensions don't match
+     *
+     * Example:
+     *   // A[512,256] × B[256,1024] = C[512,1024]
+     *   auto problem = Problem::from_ab(512, 256, 256, 1024);
+     */
+    [[nodiscard]] static Problem
+    from_ab(std::int64_t a_rows, std::int64_t a_cols, std::int64_t b_rows, std::int64_t b_cols)
+    {
+        if(a_cols != b_rows)
+        {
+            throw std::invalid_argument("K dimension mismatch: A.cols=" + std::to_string(a_cols) +
+                                        ", B.rows=" + std::to_string(b_rows));
+        }
+        return Problem(a_rows, b_cols, a_cols);
+    }
+
+    /**
+     * Validate that tensor pointers have consistent sizes.
+     * Call this before kernel execution to catch dimension errors early.
+     *
+     * @param a_size Total elements in A tensor
+     * @param b_size Total elements in B tensor
+     * @param c_size Total elements in C tensor
+     * @throws std::invalid_argument if sizes don't match expected dimensions
+     */
+    void validate_sizes(std::int64_t a_size, std::int64_t b_size, std::int64_t c_size) const
+    {
+        std::int64_t expected_a = M * K;
+        std::int64_t expected_b = K * N;
+        std::int64_t expected_c = M * N;
+
+        if(a_size != expected_a)
+        {
+            throw std::invalid_argument("A tensor size mismatch: got " + std::to_string(a_size) +
+                                        ", expected " + std::to_string(expected_a) + " (M*K = " +
+                                        std::to_string(M) + "*" + std::to_string(K) + ")");
+        }
+        if(b_size != expected_b)
+        {
+            throw std::invalid_argument("B tensor size mismatch: got " + std::to_string(b_size) +
+                                        ", expected " + std::to_string(expected_b) + " (K*N = " +
+                                        std::to_string(K) + "*" + std::to_string(N) + ")");
+        }
+        if(c_size != expected_c)
+        {
+            throw std::invalid_argument("C tensor size mismatch: got " + std::to_string(c_size) +
+                                        ", expected " + std::to_string(expected_c) + " (M*N = " +
+                                        std::to_string(M) + "*" + std::to_string(N) + ")");
+        }
+    }
+};
+
+// =============================================================================
+// Convenience Builders
+// =============================================================================
+
+/// Builder pattern for Problem configuration
+class ProblemBuilder
+{
+    public:
+    ProblemBuilder() = default;
+
+    /// Set dimensions from A and B shapes
+    ProblemBuilder&
+    from_ab(std::int64_t a_rows, std::int64_t a_cols, std::int64_t b_rows, std::int64_t b_cols)
+    {
+        problem_ = Problem::from_ab(a_rows, a_cols, b_rows, b_cols);
+        return *this;
+    }
+
+    /// Set MNK directly
+    ProblemBuilder& dimensions(std::int64_t m, std::int64_t n, std::int64_t k)
+    {
+        problem_.M = m;
+        problem_.N = n;
+        problem_.K = k;
+        return *this;
+    }
+
+    /// Set split-K batch count
+    ProblemBuilder& split_k(std::int32_t k_batch)
+    {
+        problem_.k_batch = k_batch;
+        return *this;
+    }
+
+    /// Set shared memory budget
+    ProblemBuilder& smem_budget(std::int32_t budget)
+    {
+        problem_.smem_budget = budget;
+        return *this;
+    }
+
+    /// Prefer persistent kernels
+    ProblemBuilder& persistent(bool prefer = true)
+    {
+        problem_.prefer_persistent = prefer;
+        return *this;
+    }
+
+    /// Enable validation
+    ProblemBuilder& validate(bool enable = true)
+    {
+        problem_.enable_validation = enable;
+        return *this;
+    }
+
+    /// Build the Problem
+    [[nodiscard]] Problem build() const
+    {
+        if(!problem_.is_valid())
+        {
+            throw std::invalid_argument("Invalid problem dimensions");
+        }
+        return problem_;
+    }
+
+    private:
+    Problem problem_;
+};
+
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/registry.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/registry.hpp
@@ -0,0 +1,197 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+/**
+ * Registry - Thread-Safe Kernel Storage
+ *
+ * Central registry for all available kernel instances with priority-based
+ * ordering and efficient lookup.
+ *
+ * Features:
+ * - Thread-safe registration and lookup
+ * - Priority-based ordering (High, Normal, Low)
+ * - Lookup by name or KernelKey
+ * - Filter by problem compatibility
+ * - Supports both singleton and multiple instance patterns
+ *
+ * Usage (Singleton - backward compatible):
+ *   auto& registry = Registry::instance();
+ *   registry.register_kernel(kernel, Priority::High);
+ *   auto kernel = registry.lookup("kernel_name");
+ *
+ * Usage (Multiple registries):
+ *   Registry fp16_registry;
+ *   Registry bf16_registry;
+ *   fp16_registry.register_kernel(fp16_kernel, Priority::High);
+ *   bf16_registry.register_kernel(bf16_kernel, Priority::High);
+ *
+ *   Dispatcher fp16_dispatcher(&fp16_registry);
+ *   Dispatcher bf16_dispatcher(&bf16_registry);
+ *
+ * Status: Production ready, thread-safe
+ */
+
+#pragma once
+
+#include "ck_tile/dispatcher/kernel_instance.hpp"
+#include "ck_tile/dispatcher/kernel_key.hpp"
+#include <functional>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include <memory>
+
+namespace ck_tile {
+namespace dispatcher {
+
+/// Registry: Central mapping from kernel configurations to executable instances
+/// Thread-safe kernel registration and lookup
+/// Supports both singleton pattern and multiple independent instances
+class Registry
+{
+    public:
+    /// Priority levels for conflict resolution when multiple kernels have same key
+    enum class Priority
+    {
+        Low    = 0,
+        Normal = 1,
+        High   = 2
+    };
+
+    /// Default constructor - creates an empty registry instance
+    /// Use this to create independent registries for different kernel sets
+    Registry();
+
+    /// Destructor - triggers auto-export if enabled
+    ~Registry();
+
+    /// Move constructor
+    Registry(Registry&& other) noexcept;
+
+    /// Move assignment
+    Registry& operator=(Registry&& other) noexcept;
+
+    // Prevent copying (registries contain shared_ptrs that shouldn't be duplicated)
+    Registry(const Registry&)            = delete;
+    Registry& operator=(const Registry&) = delete;
+
+    /// Register a kernel instance with the registry
+    /// @param instance Kernel instance to register
+    /// @param priority Priority level for conflict resolution (default: Normal)
+    /// @return true if registered successfully, false if duplicate with higher priority exists
+    bool register_kernel(KernelInstancePtr instance, Priority priority = Priority::Normal);
+
+    /// Lookup a kernel by its string identifier
+    /// @param identifier Kernel identifier string
+    /// @return Kernel instance if found, nullptr otherwise
+    [[nodiscard]] KernelInstancePtr lookup(const std::string& identifier) const;
+
+    /// Lookup a kernel by its KernelKey
+    /// @param key Kernel configuration key
+    /// @return Kernel instance if found, nullptr otherwise
+    [[nodiscard]] KernelInstancePtr lookup(const KernelKey& key) const;
+
+    /// Get all registered kernels
+    /// @return Vector of all kernel instances
+    [[nodiscard]] std::vector<KernelInstancePtr> get_all() const;
+
+    /// Get all kernels matching a predicate
+    /// @param predicate Function to filter kernels
+    /// @return Vector of matching kernel instances
+    [[nodiscard]] std::vector<KernelInstancePtr>
+    filter(std::function<bool(const KernelInstance&)> predicate) const;
+
+    /// Get number of registered kernels
+    [[nodiscard]] std::size_t size() const;
+
+    /// Check if registry is empty
+    [[nodiscard]] bool empty() const;
+
+    /// Clear all registered kernels
+    void clear();
+
+    /// Get registry name (for logging/debugging)
+    [[nodiscard]] const std::string& get_name() const;
+
+    /// Set registry name (for logging/debugging)
+    void set_name(const std::string& name);
+
+    /// Export registry to JSON string
+    /// @param include_statistics Whether to include kernel statistics breakdown
+    /// @return JSON string with all kernel metadata
+    [[nodiscard]] std::string export_json(bool include_statistics = true) const;
+
+    /// Export registry to JSON file
+    /// @param filename Output filename
+    /// @param include_statistics Whether to include kernel statistics breakdown
+    /// @return true if export succeeded, false otherwise
+    bool export_json_to_file(const std::string& filename, bool include_statistics = true) const;
+
+    /// Enable automatic JSON export on kernel registration
+    /// @param filename Output filename for auto-export
+    /// @param include_statistics Whether to include statistics in auto-export
+    /// @param export_on_every_registration If true, exports after every registration (default).
+    ///                                      If false, only exports on destruction.
+    void enable_auto_export(const std::string& filename,
+                            bool include_statistics           = true,
+                            bool export_on_every_registration = true);
+
+    /// Disable automatic JSON export
+    void disable_auto_export();
+
+    /// Check if auto-export is enabled
+    [[nodiscard]] bool is_auto_export_enabled() const;
+
+    /// Merge kernels from another registry into this one
+    /// @param other Registry to merge from
+    /// @param priority Priority for merged kernels (default: Normal)
+    /// @return Number of kernels successfully merged
+    std::size_t merge_from(const Registry& other, Priority priority = Priority::Normal);
+
+    /// Filter kernels in-place by architecture
+    /// @param gpu_arch Target GPU architecture string (e.g., "gfx942")
+    /// @return Number of kernels removed
+    std::size_t filter_by_arch(const std::string& gpu_arch);
+
+    /// Get singleton instance of the global registry (backward compatible)
+    /// This is the default registry used when no specific registry is provided
+    static Registry& instance();
+
+    private:
+    struct RegistryEntry
+    {
+        KernelInstancePtr instance;
+        Priority priority;
+    };
+
+    /// Perform auto-export if enabled
+    void perform_auto_export();
+
+    mutable std::mutex mutex_;
+    std::unordered_map<std::string, RegistryEntry> kernels_;
+    std::string name_;
+
+    // Auto-export configuration
+    bool auto_export_enabled_ = false;
+    std::string auto_export_filename_;
+    bool auto_export_include_statistics_    = true;
+    bool auto_export_on_every_registration_ = true;
+};
+
+/// Shared pointer type for registries (useful for managing lifetime)
+using RegistryPtr = std::shared_ptr<Registry>;
+
+/// Create a new registry instance (factory function)
+inline RegistryPtr make_registry(const std::string& name = "")
+{
+    auto reg = std::make_shared<Registry>();
+    if(!name.empty())
+    {
+        reg->set_name(name);
+    }
+    return reg;
+}
+
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/utils.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/utils.hpp
@@ -0,0 +1,724 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+/**
+ * @file utils.hpp
+ * @brief Common utilities for CK Tile Dispatcher
+ *
+ * This header provides reusable utilities for:
+ * - GPU memory management (GpuBuffer)
+ * - Performance measurement (Timer, GpuTimer, BenchmarkStats)
+ * - Validation (ValidationResult, validate_result)
+ * - Kernel registration helpers
+ * - Data generation (fill_random, etc.)
+ *
+ * Usage:
+ *   #include "ck_tile/dispatcher/utils.hpp"
+ *   using namespace ck_tile::dispatcher::utils;
+ *
+ *   // GPU memory
+ *   GpuBuffer<half_t> buffer(1024);
+ *
+ *   // Timing
+ *   GpuTimer timer;
+ *   timer.start();
+ *   // ... kernel ...
+ *   timer.stop();
+ *   float ms = timer.elapsed_ms();
+ *
+ *   // Validation
+ *   auto result = validate_result(gpu_data, ref_data, size);
+ */
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <algorithm>
+
+#include "ck_tile/dispatcher/dispatcher.hpp"
+#include "ck_tile/dispatcher/registry.hpp"
+#include "ck_tile/dispatcher/backends/generated_tile_backend.hpp"
+
+namespace ck_tile {
+namespace dispatcher {
+namespace utils {
+
+// =============================================================================
+// HIP Error Handling
+// =============================================================================
+
+#define CK_HIP_CHECK(call)                                                      \
+    do                                                                          \
+    {                                                                           \
+        hipError_t err = call;                                                  \
+        if(err != hipSuccess)                                                   \
+        {                                                                       \
+            std::cerr << "HIP error at " << __FILE__ << ":" << __LINE__ << ": " \
+                      << hipGetErrorString(err) << std::endl;                   \
+            return false;                                                       \
+        }                                                                       \
+    } while(0)
+
+#define CK_HIP_CHECK_THROW(call)                                                           \
+    do                                                                                     \
+    {                                                                                      \
+        hipError_t err = call;                                                             \
+        if(err != hipSuccess)                                                              \
+        {                                                                                  \
+            throw std::runtime_error(std::string("HIP error: ") + hipGetErrorString(err)); \
+        }                                                                                  \
+    } while(0)
+
+// =============================================================================
+// Timing Utilities
+// =============================================================================
+
+/**
+ * @brief High-resolution timer for CPU timing
+ */
+class Timer
+{
+    public:
+    void start() { start_ = std::chrono::high_resolution_clock::now(); }
+
+    double elapsed_ms() const
+    {
+        auto end = std::chrono::high_resolution_clock::now();
+        return std::chrono::duration<double, std::milli>(end - start_).count();
+    }
+
+    private:
+    std::chrono::high_resolution_clock::time_point start_;
+};
+
+/**
+ * @brief GPU timing using HIP events
+ *
+ * Times kernel execution on a specific HIP stream. Events are recorded
+ * on the provided stream to accurately measure kernel execution time.
+ *
+ * Usage:
+ *   hipStream_t stream;
+ *   hipStreamCreate(&stream);
+ *   GpuTimer timer(stream);  // or timer.set_stream(stream)
+ *   timer.start();
+ *   kernel<<<grid, block, 0, stream>>>(...);
+ *   timer.stop();
+ *   float ms = timer.elapsed_ms();
+ */
+class GpuTimer
+{
+    public:
+    /**
+     * @brief Construct timer with optional stream
+     * @param stream HIP stream to record events on (default: null stream)
+     */
+    explicit GpuTimer(hipStream_t stream = nullptr) : stream_(stream)
+    {
+        (void)hipEventCreate(&start_);
+        (void)hipEventCreate(&stop_);
+    }
+
+    ~GpuTimer()
+    {
+        (void)hipEventDestroy(start_);
+        (void)hipEventDestroy(stop_);
+    }
+
+    // Non-copyable
+    GpuTimer(const GpuTimer&)            = delete;
+    GpuTimer& operator=(const GpuTimer&) = delete;
+
+    // Movable
+    GpuTimer(GpuTimer&& other) noexcept
+        : start_(other.start_), stop_(other.stop_), stream_(other.stream_)
+    {
+        other.start_  = nullptr;
+        other.stop_   = nullptr;
+        other.stream_ = nullptr;
+    }
+
+    GpuTimer& operator=(GpuTimer&& other) noexcept
+    {
+        if(this != &other)
+        {
+            if(start_)
+                (void)hipEventDestroy(start_);
+            if(stop_)
+                (void)hipEventDestroy(stop_);
+            start_        = other.start_;
+            stop_         = other.stop_;
+            stream_       = other.stream_;
+            other.start_  = nullptr;
+            other.stop_   = nullptr;
+            other.stream_ = nullptr;
+        }
+        return *this;
+    }
+
+    /**
+     * @brief Set the stream to record events on
+     * @param stream HIP stream (pass nullptr for default stream)
+     */
+    void set_stream(hipStream_t stream) { stream_ = stream; }
+
+    /**
+     * @brief Get the current stream
+     */
+    hipStream_t get_stream() const { return stream_; }
+
+    /**
+     * @brief Record start event on the stream
+     */
+    void start() { (void)hipEventRecord(start_, stream_); }
+
+    /**
+     * @brief Record stop event on the stream
+     */
+    void stop() { (void)hipEventRecord(stop_, stream_); }
+
+    /**
+     * @brief Get elapsed time in milliseconds
+     *
+     * Synchronizes on the stop event before calculating time.
+     * @return Elapsed time between start and stop in milliseconds
+     */
+    float elapsed_ms()
+    {
+        (void)hipEventSynchronize(stop_);
+        float ms = 0;
+        (void)hipEventElapsedTime(&ms, start_, stop_);
+        return ms;
+    }
+
+    private:
+    hipEvent_t start_   = nullptr;
+    hipEvent_t stop_    = nullptr;
+    hipStream_t stream_ = nullptr;
+};
+
+// =============================================================================
+// Performance Metrics
+// =============================================================================
+
+/**
+ * @brief Calculate TFLOPS for GEMM
+ */
+inline double calculate_tflops(int64_t M, int64_t N, int64_t K, double time_ms)
+{
+    double flops = 2.0 * M * N * K;
+    return (flops / (time_ms * 1e-3)) / 1e12;
+}
+
+/**
+ * @brief Calculate memory bandwidth in GB/s
+ */
+template <typename AType, typename BType, typename CType>
+inline double calculate_bandwidth_gbs(int64_t M, int64_t N, int64_t K, double time_ms)
+{
+    double bytes = M * K * sizeof(AType) + K * N * sizeof(BType) + M * N * sizeof(CType);
+    return (bytes / (time_ms * 1e-3)) / 1e9;
+}
+
+/**
+ * @brief Benchmark statistics
+ */
+struct BenchmarkStats
+{
+    double min_ms        = 0;
+    double avg_ms        = 0;
+    double max_ms        = 0;
+    double median_ms     = 0;
+    double tflops        = 0;
+    double bandwidth_gbs = 0;
+    int iterations       = 0;
+
+    void print(std::ostream& os = std::cout) const
+    {
+        os << std::fixed << std::setprecision(4);
+        os << "  Min: " << min_ms << " ms\n";
+        os << "  Avg: " << avg_ms << " ms\n";
+        os << "  Max: " << max_ms << " ms\n";
+        os << "  Median: " << median_ms << " ms\n";
+        os << "  TFLOPS: " << std::setprecision(2) << tflops << "\n";
+        os << "  Bandwidth: " << bandwidth_gbs << " GB/s\n";
+    }
+};
+
+/**
+ * @brief Run benchmark and compute statistics
+ */
+template <typename Func>
+BenchmarkStats run_benchmark(Func&& func, int warmup = 2, int iterations = 10)
+{
+    std::vector<double> times;
+    times.reserve(iterations);
+
+    for(int i = 0; i < warmup; ++i)
+        func();
+
+    for(int i = 0; i < iterations; ++i)
+        times.push_back(func());
+
+    std::sort(times.begin(), times.end());
+
+    BenchmarkStats stats;
+    stats.iterations = iterations;
+    stats.min_ms     = times.front();
+    stats.max_ms     = times.back();
+    stats.median_ms  = times[iterations / 2];
+
+    double sum = 0;
+    for(double t : times)
+        sum += t;
+    stats.avg_ms = sum / iterations;
+
+    return stats;
+}
+
+// =============================================================================
+// Validation Utilities
+// =============================================================================
+
+/**
+ * @brief Validation result
+ */
+struct ValidationResult
+{
+    bool correct     = false;
+    double max_diff  = 0;
+    double mean_diff = 0;
+    double accuracy  = 0;
+    int64_t matches  = 0;
+    int64_t total    = 0;
+
+    void print(std::ostream& os = std::cout) const
+    {
+        os << "  Correct: " << (correct ? "YES" : "NO") << "\n";
+        os << "  Max diff: " << max_diff << "\n";
+        os << "  Mean diff: " << mean_diff << "\n";
+        os << "  Accuracy: " << accuracy << "%\n";
+        os << "  Matches: " << matches << "/" << total << "\n";
+    }
+};
+
+/**
+ * @brief Validate GEMM result against reference
+ */
+template <typename T>
+ValidationResult validate_result(
+    const T* result, const T* reference, int64_t size, double rtol = 1e-3, double atol = 1e-2)
+{
+    ValidationResult v;
+    v.total    = size;
+    v.max_diff = 0;
+    v.matches  = 0;
+
+    double sum_diff = 0;
+
+    for(int64_t i = 0; i < size; ++i)
+    {
+        double r    = static_cast<double>(result[i]);
+        double ref  = static_cast<double>(reference[i]);
+        double diff = std::abs(r - ref);
+
+        v.max_diff = std::max(v.max_diff, diff);
+        sum_diff += diff;
+
+        double threshold = atol + rtol * std::abs(ref);
+        if(diff <= threshold)
+            ++v.matches;
+    }
+
+    v.mean_diff = sum_diff / size;
+    v.accuracy  = 100.0 * v.matches / v.total;
+    v.correct   = (v.matches == v.total) || (v.accuracy >= 99.9);
+
+    return v;
+}
+
+/**
+ * @brief Compute reference GEMM on CPU
+ */
+template <typename AType, typename BType, typename CType>
+void compute_reference_gemm(
+    const AType* A, const BType* B, CType* C, int64_t M, int64_t N, int64_t K)
+{
+    for(int64_t m = 0; m < M; ++m)
+    {
+        for(int64_t n = 0; n < N; ++n)
+        {
+            double acc = 0;
+            for(int64_t k = 0; k < K; ++k)
+                acc += static_cast<double>(A[m * K + k]) * static_cast<double>(B[k * N + n]);
+            C[m * N + n] = static_cast<CType>(acc);
+        }
+    }
+}
+
+// =============================================================================
+// Data Generation
+// =============================================================================
+
+template <typename T>
+void fill_random(T* data, int64_t size, T min_val = T(-1), T max_val = T(1))
+{
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> dist(static_cast<float>(min_val),
+                                               static_cast<float>(max_val));
+    for(int64_t i = 0; i < size; ++i)
+        data[i] = static_cast<T>(dist(gen));
+}
+
+template <typename T>
+void fill_zeros(T* data, int64_t size)
+{
+    std::fill(data, data + size, T(0));
+}
+
+template <typename T>
+void fill_ones(T* data, int64_t size)
+{
+    std::fill(data, data + size, T(1));
+}
+
+template <typename T>
+void fill_identity(T* data, int64_t rows, int64_t cols)
+{
+    fill_zeros(data, rows * cols);
+    int64_t min_dim = std::min(rows, cols);
+    for(int64_t i = 0; i < min_dim; ++i)
+        data[i * cols + i] = T(1);
+}
+
+// =============================================================================
+// GPU Memory Management
+// =============================================================================
+
+/**
+ * @brief RAII wrapper for GPU memory
+ */
+template <typename T>
+class GpuBuffer
+{
+    public:
+    GpuBuffer() : data_(nullptr), size_(0) {}
+
+    explicit GpuBuffer(int64_t count) : size_(count * sizeof(T))
+    {
+        CK_HIP_CHECK_THROW(hipMalloc(&data_, size_));
+    }
+
+    ~GpuBuffer()
+    {
+        if(data_)
+            (void)hipFree(data_);
+    }
+
+    // Non-copyable
+    GpuBuffer(const GpuBuffer&)            = delete;
+    GpuBuffer& operator=(const GpuBuffer&) = delete;
+
+    // Movable
+    GpuBuffer(GpuBuffer&& other) noexcept : data_(other.data_), size_(other.size_)
+    {
+        other.data_ = nullptr;
+        other.size_ = 0;
+    }
+
+    GpuBuffer& operator=(GpuBuffer&& other) noexcept
+    {
+        if(this != &other)
+        {
+            if(data_)
+                (void)hipFree(data_);
+            data_       = other.data_;
+            size_       = other.size_;
+            other.data_ = nullptr;
+            other.size_ = 0;
+        }
+        return *this;
+    }
+
+    T* get() { return data_; }
+    const T* get() const { return data_; }
+    int64_t size_bytes() const { return size_; }
+    int64_t count() const { return size_ / sizeof(T); }
+
+    void copy_from_host(const T* host_data)
+    {
+        CK_HIP_CHECK_THROW(hipMemcpy(data_, host_data, size_, hipMemcpyHostToDevice));
+    }
+
+    void copy_to_host(T* host_data) const
+    {
+        CK_HIP_CHECK_THROW(hipMemcpy(host_data, data_, size_, hipMemcpyDeviceToHost));
+    }
+
+    void zero() { CK_HIP_CHECK_THROW(hipMemset(data_, 0, size_)); }
+
+    private:
+    T* data_;
+    int64_t size_;
+};
+
+// =============================================================================
+// Printing Utilities
+// =============================================================================
+
+inline void print_separator(char c = '=', int width = 70)
+{
+    std::cout << std::string(width, c) << "\n";
+}
+
+inline void print_header(const std::string& title)
+{
+    print_separator();
+    std::cout << title << "\n";
+    print_separator();
+}
+
+inline std::string format_size(int64_t M, int64_t N, int64_t K)
+{
+    std::ostringstream oss;
+    oss << M << "x" << N << "x" << K;
+    return oss.str();
+}
+
+inline std::string format_number(int64_t n)
+{
+    std::string s = std::to_string(n);
+    int pos       = static_cast<int>(s.length()) - 3;
+    while(pos > 0)
+    {
+        s.insert(pos, ",");
+        pos -= 3;
+    }
+    return s;
+}
+
+/**
+ * @brief Print all registered kernels in a registry
+ *
+ * @param registry The registry to list kernels from
+ * @param os Output stream (default: std::cout)
+ * @param verbose If true, show full kernel config details
+ */
+inline void print_registered_kernels(const Registry& registry,
+                                     std::ostream& os = std::cout,
+                                     bool verbose     = false)
+{
+    const auto& kernels = registry.get_all();
+    os << "Registered Kernels (" << kernels.size() << "):\n";
+    os << std::string(70, '-') << "\n";
+
+    int idx = 1;
+    for(const auto& kernel : kernels)
+    {
+        const auto& key = kernel->get_key();
+
+        os << "  " << idx++ << ". " << kernel->get_name() << "\n";
+
+        if(verbose)
+        {
+            os << "     Tile:      " << key.algorithm.tile_shape.m << "x"
+               << key.algorithm.tile_shape.n << "x" << key.algorithm.tile_shape.k << "\n";
+            os << "     Wave:      " << static_cast<int>(key.algorithm.wave_shape.m) << "x"
+               << static_cast<int>(key.algorithm.wave_shape.n) << "x"
+               << static_cast<int>(key.algorithm.wave_shape.k) << "\n";
+            os << "     WarpTile:  " << static_cast<int>(key.algorithm.warp_tile_shape.m) << "x"
+               << static_cast<int>(key.algorithm.warp_tile_shape.n) << "x"
+               << static_cast<int>(key.algorithm.warp_tile_shape.k) << "\n";
+            os << "     Pipeline:  " << to_string(key.algorithm.pipeline) << "\n";
+            os << "     Scheduler: " << to_string(key.algorithm.scheduler) << "\n";
+            os << "     Arch:      " << key.gfx_arch << "\n";
+            os << "\n";
+        }
+    }
+
+    if(!verbose && !kernels.empty())
+    {
+        os << "\n  Use --list-verbose for full details\n";
+    }
+    os << std::string(70, '-') << "\n";
+}
+
+/**
+ * @brief Print a single kernel's configuration
+ */
+inline void print_kernel_info(const KernelInstance& kernel, std::ostream& os = std::cout)
+{
+    const auto& key = kernel.get_key();
+
+    os << "Kernel: " << kernel.get_name() << "\n";
+    os << "  Signature:\n";
+    os << "    dtype:  " << to_string(key.signature.dtype_a) << "/"
+       << to_string(key.signature.dtype_b) << "/" << to_string(key.signature.dtype_c) << "\n";
+    os << "    layout: " << to_string(key.signature.layout_a) << to_string(key.signature.layout_b)
+       << to_string(key.signature.layout_c) << "\n";
+
+    os << "  Algorithm:\n";
+    os << "    tile:      " << key.algorithm.tile_shape.m << "x" << key.algorithm.tile_shape.n
+       << "x" << key.algorithm.tile_shape.k << "\n";
+    os << "    wave:      " << static_cast<int>(key.algorithm.wave_shape.m) << "x"
+       << static_cast<int>(key.algorithm.wave_shape.n) << "x"
+       << static_cast<int>(key.algorithm.wave_shape.k) << "\n";
+    os << "    warp_tile: " << static_cast<int>(key.algorithm.warp_tile_shape.m) << "x"
+       << static_cast<int>(key.algorithm.warp_tile_shape.n) << "x"
+       << static_cast<int>(key.algorithm.warp_tile_shape.k) << "\n";
+    os << "    pipeline:  " << to_string(key.algorithm.pipeline) << "\n";
+    os << "    scheduler: " << to_string(key.algorithm.scheduler) << "\n";
+    os << "    epilogue:  " << to_string(key.algorithm.epilogue) << "\n";
+
+    os << "  Target: " << key.gfx_arch << "\n";
+}
+
+// =============================================================================
+// Kernel Key Builders
+// =============================================================================
+
+/**
+ * @brief Build a KernelKey for FP16 Row-Col-Row layout GEMM
+ *
+ * This is the most common configuration. Customize parameters as needed.
+ */
+struct KernelKeyBuilder
+{
+    // Tile shape
+    int tile_m = 128;
+    int tile_n = 128;
+    int tile_k = 32;
+
+    // Wave shape (warps per block)
+    int wave_m = 2;
+    int wave_n = 2;
+    int wave_k = 1;
+
+    // Warp tile shape
+    int warp_m = 32;
+    int warp_n = 32;
+    int warp_k = 16;
+
+    // Block size
+    int block_size = 256;
+
+    // Data types
+    DataType dtype_a   = DataType::FP16;
+    DataType dtype_b   = DataType::FP16;
+    DataType dtype_c   = DataType::FP16;
+    DataType dtype_acc = DataType::FP32;
+
+    // Layouts
+    LayoutTag layout_a = LayoutTag::RowMajor;
+    LayoutTag layout_b = LayoutTag::ColMajor;
+    LayoutTag layout_c = LayoutTag::RowMajor;
+
+    // Pipeline/scheduler
+    Pipeline pipeline   = Pipeline::CompV4;
+    Scheduler scheduler = Scheduler::Intrawave;
+    Epilogue epilogue   = Epilogue::CShuffle;
+
+    // Features
+    bool preshuffle            = false;
+    int num_d_tensors          = 0; // Multi-D: number of additional input tensors
+    std::string elementwise_op = "PassThrough";
+
+    // Target GPU
+    std::string gfx_arch = "gfx942";
+
+    /**
+     * @brief Build the KernelKey
+     */
+    KernelKey build() const
+    {
+        KernelKey key;
+
+        // Signature
+        key.signature.dtype_a             = dtype_a;
+        key.signature.dtype_b             = dtype_b;
+        key.signature.dtype_c             = dtype_c;
+        key.signature.dtype_acc           = dtype_acc;
+        key.signature.layout_a            = layout_a;
+        key.signature.layout_b            = layout_b;
+        key.signature.layout_c            = layout_c;
+        key.signature.transpose_a         = false;
+        key.signature.transpose_b         = false;
+        key.signature.grouped             = false;
+        key.signature.split_k             = 1;
+        key.signature.elementwise_op      = elementwise_op;
+        key.signature.num_d_tensors       = num_d_tensors;
+        key.signature.structured_sparsity = false;
+
+        // Algorithm
+        key.algorithm.tile_shape      = {static_cast<std::uint16_t>(tile_m),
+                                         static_cast<std::uint16_t>(tile_n),
+                                         static_cast<std::uint16_t>(tile_k)};
+        key.algorithm.wave_shape      = {static_cast<std::uint8_t>(wave_m),
+                                         static_cast<std::uint8_t>(wave_n),
+                                         static_cast<std::uint8_t>(wave_k)};
+        key.algorithm.warp_tile_shape = {static_cast<std::uint8_t>(warp_m),
+                                         static_cast<std::uint8_t>(warp_n),
+                                         static_cast<std::uint8_t>(warp_k)};
+        key.algorithm.pipeline        = pipeline;
+        key.algorithm.scheduler       = scheduler;
+        key.algorithm.epilogue        = epilogue;
+        key.algorithm.block_size      = block_size;
+        key.algorithm.double_buffer   = true;
+        key.algorithm.persistent      = false;
+        key.algorithm.preshuffle      = preshuffle;
+        key.algorithm.transpose_c     = false;
+        key.algorithm.num_wave_groups = 1;
+
+        key.gfx_arch = gfx_arch;
+
+        return key;
+    }
+
+    // Convenience preset methods
+    static KernelKeyBuilder fp16_rcr() { return KernelKeyBuilder{}; }
+
+    static KernelKeyBuilder fp16_rrr()
+    {
+        auto b     = KernelKeyBuilder{};
+        b.layout_b = LayoutTag::RowMajor;
+        return b;
+    }
+
+    static KernelKeyBuilder preshuffle_v1()
+    {
+        auto b       = KernelKeyBuilder{};
+        b.pipeline   = Pipeline::PreShuffleV1;
+        b.preshuffle = true;
+        return b;
+    }
+
+    static KernelKeyBuilder preshuffle_v2()
+    {
+        auto b       = KernelKeyBuilder{};
+        b.pipeline   = Pipeline::PreShuffleV2;
+        b.preshuffle = true;
+        return b;
+    }
+
+    static KernelKeyBuilder multi_d(int num_d, const std::string& op = "MultiDAdd")
+    {
+        auto b           = KernelKeyBuilder{};
+        b.num_d_tensors  = num_d;
+        b.elementwise_op = op;
+        return b;
+    }
+};
+
+} // namespace utils
+} // namespace dispatcher
+} // namespace ck_tile
--- a/dispatcher/include/ck_tile/dispatcher/validation/reference_kernels.hpp
+++ b/dispatcher/include/ck_tile/dispatcher/validation/reference_kernels.hpp
@@ -0,0 +1,228 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck_tile/dispatcher/problem.hpp"
+#include <hip/hip_runtime.h>
+#include <cmath>
+#include <vector>
+
+namespace ck_tile {
+namespace dispatcher {
+namespace validation {
+
+/// Reference CPU GEMM implementation for validation
+template <typename ADataType, typename BDataType, typename CDataType, typename AccDataType>
+void reference_gemm_cpu(const ADataType* a,
+                        const BDataType* b,
+                        CDataType* c,
+                        int M,
+                        int N,
+                        int K,
+                        int stride_a,
+                        int stride_b,
+                        int stride_c,
+                        bool transpose_a = false,
+                        bool transpose_b = false)
+{
+    for(int m = 0; m < M; ++m)
+    {
+        for(int n = 0; n < N; ++n)
+        {
+            AccDataType acc = 0;
+
+            for(int k = 0; k < K; ++k)
+            {
+                // Get A element
+                int a_idx         = transpose_a ? (k * stride_a + m) : (m * stride_a + k);
+                AccDataType a_val = static_cast<AccDataType>(a[a_idx]);
+
+                // Get B element
+                int b_idx         = transpose_b ? (n * stride_b + k) : (k * stride_b + n);
+                AccDataType b_val = static_cast<AccDataType>(b[b_idx]);
+
+                acc += a_val * b_val;
+            }
+
+            // Write C element
+            int c_idx = m * stride_c + n;
+            c[c_idx]  = static_cast<CDataType>(acc);
+        }
+    }
+}
+
+/// Validate kernel output against reference
+template <typename CDataType>
+bool validate_output(const CDataType* result,
+                     const CDataType* reference,
+                     int size,
+                     float rtol = 1e-3f,
+                     float atol = 1e-5f)
+{
+    int errors                    = 0;
+    const int max_errors_to_print = 10;
+
+    for(int i = 0; i < size; ++i)
+    {
+        float res_val = static_cast<float>(result[i]);
+        float ref_val = static_cast<float>(reference[i]);
+
+        float abs_diff = std::abs(res_val - ref_val);
+        float abs_ref  = std::abs(ref_val);
+
+        bool is_valid = (abs_diff <= atol) || (abs_diff <= rtol * abs_ref);
+
+        if(!is_valid)
+        {
+            if(errors < max_errors_to_print)
+            {
+                printf("Mismatch at index %d: result=%.6f, reference=%.6f, diff=%.6e\n",
+                       i,
+                       res_val,
+                       ref_val,
+                       abs_diff);
+            }
+            errors++;
+        }
+    }
+
+    if(errors > 0)
+    {
+        printf("Validation failed: %d/%d elements mismatched (%.2f%%)\n",
+               errors,
+               size,
+               100.0f * errors / size);
+        return false;
+    }
+
+    return true;
+}
+
+/// Validate kernel with reference implementation
+template <typename ADataType, typename BDataType, typename CDataType, typename AccDataType>
+bool validate_gemm_kernel(const void* a_dev_ptr,
+                          const void* b_dev_ptr,
+                          const void* c_dev_ptr,
+                          const Problem& problem,
+                          float rtol = 1e-3f,
+                          float atol = 1e-5f)
+{
+    const int M = problem.M;
+    const int N = problem.N;
+    const int K = problem.K;
+
+    // Allocate host memory
+    std::vector<ADataType> a_host(M * K);
+    std::vector<BDataType> b_host(K * N);
+    std::vector<CDataType> c_host(M * N);
+    std::vector<CDataType> c_ref(M * N);
+
+    // Copy from device
+    hipMemcpy(a_host.data(), a_dev_ptr, M * K * sizeof(ADataType), hipMemcpyDeviceToHost);
+    hipMemcpy(b_host.data(), b_dev_ptr, K * N * sizeof(BDataType), hipMemcpyDeviceToHost);
+    hipMemcpy(c_host.data(), c_dev_ptr, M * N * sizeof(CDataType), hipMemcpyDeviceToHost);
+
+    // Compute reference
+    reference_gemm_cpu<ADataType, BDataType, CDataType, AccDataType>(a_host.data(),
+                                                                     b_host.data(),
+                                                                     c_ref.data(),
+                                                                     M,
+                                                                     N,
+                                                                     K,
+                                                                     K, // stride_a (row-major)
+                                                                     N, // stride_b (row-major)
+                                                                     N, // stride_c (row-major)
+                                                                     false,
+                                                                     false);
+
+    // Validate
+    return validate_output(c_host.data(), c_ref.data(), M * N, rtol, atol);
+}
+
+/// Validator class for kernel instances
+class KernelValidator
+{
+    public:
+    KernelValidator(float rtol = 1e-3f, float atol = 1e-5f) : rtol_(rtol), atol_(atol) {}
+
+    /// Validate a kernel instance
+    template <typename KernelInstance>
+    bool validate(KernelInstance& kernel,
+                  const void* a_ptr,
+                  const void* b_ptr,
+                  const void* c_ptr,
+                  const Problem& problem)
+    {
+        // Use kernel's validate method if available
+        return kernel.validate(a_ptr, b_ptr, c_ptr, problem, rtol_, atol_);
+    }
+
+    /// Set tolerances
+    void set_tolerances(float rtol, float atol)
+    {
+        rtol_ = rtol;
+        atol_ = atol;
+    }
+
+    /// Get tolerances
+    std::pair<float, float> get_tolerances() const { return {rtol_, atol_}; }
+
+    private:
+    float rtol_;
+    float atol_;
+};
+
+/// Helper to generate random test data
+template <typename T>
+void generate_random_data(T* data, int size, float min_val = -1.0f, float max_val = 1.0f)
+{
+    for(int i = 0; i < size; ++i)
+    {
+        float rand_val = min_val + (max_val - min_val) * (rand() / (float)RAND_MAX);
+        data[i]        = static_cast<T>(rand_val);
+    }
+}
+
+/// Helper to allocate and initialize test tensors
+template <typename T>
+struct TestTensor
+{
+    T* host_ptr;
+    T* device_ptr;
+    int size;
+
+    TestTensor(int size_) : size(size_)
+    {
+        host_ptr = new T[size];
+        hipMalloc(&device_ptr, size * sizeof(T));
+    }
+
+    ~TestTensor()
+    {
+        delete[] host_ptr;
+        hipFree(device_ptr);
+    }
+
+    void randomize(float min_val = -1.0f, float max_val = 1.0f)
+    {
+        generate_random_data(host_ptr, size, min_val, max_val);
+        hipMemcpy(device_ptr, host_ptr, size * sizeof(T), hipMemcpyHostToDevice);
+    }
+
+    void copy_to_device()
+    {
+        hipMemcpy(device_ptr, host_ptr, size * sizeof(T), hipMemcpyHostToDevice);
+    }
+
+    void copy_from_device()
+    {
+        hipMemcpy(host_ptr, device_ptr, size * sizeof(T), hipMemcpyDeviceToHost);
+    }
+
+    void zero() { hipMemset(device_ptr, 0, size * sizeof(T)); }
+};
+
+} // namespace validation
+} // namespace dispatcher
+} // namespace ck_tile