[CK Tile] gemm splitk two stage (#2697)

* Fix a typo * Use std::variant to call run_gemm_example_with_layouts with the available layout variant combinations * Use a unified run_gemm_example_prec_type for basic gemm and universal gemm * Factor out run_gemm_example_prec_type * Refactor argument parsing in gemm_splitk_two_stage_reduce.cpp * Parse arguments outside of create_args * Move the gemm operators to separate structs to facilitate their reuse * Move the invokers to separate files to facilitate their reuse * Rename the invoker files for consistency with the examples that use them * Add fp32 support to the elementwise examples, and produce an error message for unsupported types * Get rid of four unused variables * Make two variables const * Add support for different input-output type combinations in elementwise examples * Test support for different input and output types in elementwise examples * Add support for different operations in the elementwise unary tests * Add support for UnaryConvert in the elementwise unary tests * Add support for bf16 in elementwise examples, excluding unsupported type combinations * Make some operator parameters const in ElementWiseKernel * Remove some unnecessary include statements * Implement a two-stage GEMM that does a type conversion in the second stage using the elementwise kernel * Clear workspace instead of output when flushing the cache in SplitKTwoStageInvoker::gemm * Fix formatting issues reported by clang * Add back CK_TILE_USE_WMMA related changes * Use the right prec type for bf16 in the universal GEMM and two stage split K examples * Add some brackets * Add some brackets * Separate the clearing of the GEMM output memory from the cache flushing in the universal GEMM example * Separate the clearing of the GEMM output memory from the cache flushing in the split K two stage example * Fix formatting * No need to call SetZero on ws_m_n_dev_buf here, as clear_gemm_output now does this as part of the kernel preprocessing * Add fp16 data type to splitk two stage example * Add preprocessing with optional cache flushing and clearing of output for k_batch > 1 to the basic GEMM example [ROCm/composable_kernel commit: 1acd8e041c]
2026-07-19 02:01:01 +00:00 · 2025-09-04 14:33:44 +03:00
parent d8583d1abf
commit e4ac6bca96
21 changed files with 1245 additions and 782 deletions
--- a/example/ck_tile/21_elementwise/elementwise_common.hpp
+++ b/example/ck_tile/21_elementwise/elementwise_common.hpp
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck_tile/core/arch/arch.hpp"
+
+auto string_to_datatype(const std::string& datatype)
+{
+    using PrecVariant = std::variant<ck_tile::half_t, ck_tile::bf16_t, float>;
+
+    if(datatype == "fp16")
+    {
+        return PrecVariant{ck_tile::half_t{}};
+    }
+    else if(datatype == "bf16")
+    {
+        return PrecVariant{ck_tile::bf16_t{}};
+    }
+    else if(datatype == "fp32")
+    {
+        return PrecVariant{float{}};
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type: " + datatype);
+    }
+};
--- a/example/ck_tile/21_elementwise/elementwise_example.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example.cpp
@@ -1,11 +1,11 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

-#include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/host.hpp"
 #include "ck_tile/ops/elementwise.hpp"
 #include "ck_tile/host/reference/reference_elementwise.hpp"
 #include "json_dump.hpp"
+#include "elementwise_common.hpp"

 auto create_args(int argc, char* argv[])
 {
@@ -14,7 +14,8 @@ auto create_args(int argc, char* argv[])
        .insert("n", "1024", "n dimension")
        .insert("stride", "-1", "stride per row, if -1 then equal to n")
        .insert("v", "1", "cpu validation or not")
-        .insert("prec", "fp16", "precision")
+        .insert("x_prec", "fp16", "input precision, fp16/bf16/fp32")
+        .insert("y_prec", "fp16", "output precision, fp16/bf16/fp32")
        .insert("warmup", "10", "cold iter")
        .insert("repeat", "50", "hot iter")
        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
@@ -24,7 +25,10 @@ auto create_args(int argc, char* argv[])
    return std::make_tuple(result, arg_parser);
 }

-template <typename DataType>
+// XDataType: Data type of the input tensors.
+// ComputeDataType: Data type used for intermediate computations (often float for precision).
+// YDataType: Data type of the output tensor.
+template <typename XDataType, typename YDataType>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
    ck_tile::index_t M      = arg_parser.get_int("m");
@@ -34,25 +38,18 @@ bool run(const ck_tile::ArgParser& arg_parser)
    // If stride is negative (default -1), set it to N, assuming a dense row-major layout.
    if(stride < 0)
        stride = N;
-    std::string data_type = arg_parser.get_str("prec");
-    int do_validation     = arg_parser.get_int("v");
-    int warmup            = arg_parser.get_int("warmup");
-    int repeat            = arg_parser.get_int("repeat");
+    int do_validation = arg_parser.get_int("v");
+    int warmup        = arg_parser.get_int("warmup");
+    int repeat        = arg_parser.get_int("repeat");

    if(stride < N)
    {
        throw std::runtime_error("stride must be >= N");
    }

-    // Define type aliases for clarity.
-    // XDataType: Data type of the input tensors.
-    // ComputeDataType: Data type used for intermediate computations (often float for precision).
-    // YDataType: Data type of the output tensor.
    // XElementwiseOperation: The specific elementwise operation to perform (e.g., Add, Mul).
-    using XDataType = DataType;
    using ComputeDataType =
        float; // Using float for intermediate calculations can improve numerical stability.
-    using YDataType             = DataType;
    using XElementwiseOperation = ck_tile::element_wise::Add;

    // 1. Initialize the input data on the host (CPU).
@@ -219,11 +216,22 @@ int main(int argc, char* argv[])
    if(!result)
        return -1;

-    const std::string data_type = arg_parser.get_str("prec");
-    if(data_type == "fp16")
+    try
    {
-        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+        const auto x_prec_variant = string_to_datatype(arg_parser.get_str("x_prec"));
+        const auto y_prec_variant = string_to_datatype(arg_parser.get_str("y_prec"));
+        return std::visit(
+            [&](auto&& x_dt, auto&& y_dt) -> int {
+                using XDataType = std::decay_t<decltype(x_dt)>;
+                using YDataType = std::decay_t<decltype(y_dt)>;
+                return run<XDataType, YDataType>(arg_parser);
+            },
+            x_prec_variant,
+            y_prec_variant);
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return -3;
    }
-
-    return -3;
 }
--- a/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp
@@ -1,11 +1,11 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

-#include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/host.hpp"
 #include "ck_tile/ops/elementwise.hpp"
 #include "ck_tile/host/reference/reference_elementwise.hpp"
 #include "json_dump.hpp"
+#include "elementwise_common.hpp"

 auto create_args(int argc, char* argv[])
 {
@@ -15,7 +15,8 @@ auto create_args(int argc, char* argv[])
        .insert("dim2", "32", "dimension 2")
        .insert("dim3", "32", "dimension 3")
        .insert("v", "1", "cpu validation or not")
-        .insert("prec", "fp16", "precision")
+        .insert("x_prec", "fp16", "input precision")
+        .insert("y_prec", "fp16", "output precision")
        .insert("warmup", "10", "cold iter")
        .insert("repeat", "50", "hot iter")
        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
@@ -25,7 +26,7 @@ auto create_args(int argc, char* argv[])
    return std::make_tuple(result, arg_parser);
 }

-template <typename DataType>
+template <typename XDataType, typename YDataType>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
    ck_tile::index_t D0 = arg_parser.get_int("dim0");
@@ -33,15 +34,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::index_t D2 = arg_parser.get_int("dim2");
    ck_tile::index_t D3 = arg_parser.get_int("dim3");

-    std::string data_type = arg_parser.get_str("prec");
-    int do_validation     = arg_parser.get_int("v");
-    int warmup            = arg_parser.get_int("warmup");
-    int repeat            = arg_parser.get_int("repeat");
+    int do_validation = arg_parser.get_int("v");
+    int warmup        = arg_parser.get_int("warmup");
+    int repeat        = arg_parser.get_int("repeat");

-    using XDataType = DataType;
    using ComputeDataType =
        float; // Using float for intermediate calculations can improve numerical stability.
-    using YDataType             = DataType;
    using XElementwiseOperation = ck_tile::element_wise::Add;

    // Initialize the input data on the host (CPU).
@@ -164,11 +162,22 @@ int main(int argc, char* argv[])
    if(!result)
        return -1;

-    const std::string data_type = arg_parser.get_str("prec");
-    if(data_type == "fp16")
+    try
    {
-        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+        const auto x_prec_variant = string_to_datatype(arg_parser.get_str("x_prec"));
+        const auto y_prec_variant = string_to_datatype(arg_parser.get_str("y_prec"));
+        return std::visit(
+            [&](auto&& x_dt, auto&& y_dt) -> int {
+                using XDataType = std::decay_t<decltype(x_dt)>;
+                using YDataType = std::decay_t<decltype(y_dt)>;
+                return run<XDataType, YDataType>(arg_parser);
+            },
+            x_prec_variant,
+            y_prec_variant);
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return -3;
    }
-
-    return -3;
 }
--- a/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp
@@ -5,6 +5,7 @@
 #include "ck_tile/ops/elementwise.hpp"
 #include "ck_tile/host/reference/reference_transpose.hpp"
 #include "json_dump.hpp"
+#include "elementwise_common.hpp"

 auto create_args(int argc, char* argv[])
 {
@@ -32,10 +33,9 @@ bool run(const ck_tile::ArgParser& arg_parser)

    if(stride_in < 0)
        stride_in = N; // Dense input: stride for M dim is N
-    std::string data_type = arg_parser.get_str("prec");
-    int do_validation     = arg_parser.get_int("v");
-    int warmup            = arg_parser.get_int("warmup");
-    int repeat            = arg_parser.get_int("repeat");
+    int do_validation = arg_parser.get_int("v");
+    int warmup        = arg_parser.get_int("warmup");
+    int repeat        = arg_parser.get_int("repeat");

    if(stride_in < N)
    {
@@ -161,12 +161,19 @@ int main(int argc, char* argv[])
    if(!result)
        return -1;

-    const std::string data_type = arg_parser.get_str("prec");
-    if(data_type == "fp16")
+    try
    {
-        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+        const auto prec_variant = string_to_datatype(arg_parser.get_str("prec"));
+        return std::visit(
+            [&](auto&& dt) -> int {
+                using DataType = std::decay_t<decltype(dt)>;
+                return run<DataType>(arg_parser);
+            },
+            prec_variant);
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return -3;
    }
-
-    std::cerr << "Unsupported data type: " << data_type << std::endl;
-    return -3;
 }
--- a/example/ck_tile/21_elementwise/elementwise_example_unary.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example_unary.cpp
@@ -1,11 +1,11 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

-#include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/host.hpp"
 #include "ck_tile/ops/elementwise.hpp"
 #include "ck_tile/host/reference/reference_elementwise.hpp"
 #include "json_dump.hpp"
+#include "elementwise_common.hpp"

 auto create_args(int argc, char* argv[])
 {
@@ -14,7 +14,9 @@ auto create_args(int argc, char* argv[])
        .insert("n", "1024", "n dimension")
        .insert("stride", "-1", "stride per row, if -1 then equal to n")
        .insert("v", "1", "cpu validation or not")
-        .insert("prec", "fp16", "precision")
+        .insert("op", "1", "unary operation, 1: square, 2: convert")
+        .insert("x_prec", "fp16", "input precision")
+        .insert("y_prec", "fp16", "output precision")
        .insert("warmup", "10", "cold iter")
        .insert("repeat", "50", "hot iter")
        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
@@ -24,7 +26,7 @@ auto create_args(int argc, char* argv[])
    return std::make_tuple(result, arg_parser);
 }

-template <typename DataType>
+template <typename XElementwiseOperation, typename XDataType, typename YDataType>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
    ck_tile::index_t M      = arg_parser.get_int("m");
@@ -32,17 +34,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::index_t stride = arg_parser.get_int("stride");
    if(stride < 0)
        stride = N;
-    std::string data_type = arg_parser.get_str("prec");
-    int do_validation     = arg_parser.get_int("v");
-    int warmup            = arg_parser.get_int("warmup");
-    int repeat            = arg_parser.get_int("repeat");
+    int do_validation = arg_parser.get_int("v");
+    int warmup        = arg_parser.get_int("warmup");
+    int repeat        = arg_parser.get_int("repeat");

    assert(stride >= N);

-    using XDataType             = DataType;
-    using YDataType             = DataType;
-    using XElementwiseOperation = ck_tile::element_wise::UnarySquare;
-
    // 1. Initialize the input data on the host
    ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride, 1});
    ck_tile::HostTensor<YDataType> y_host({M, N}, {stride, 1});
@@ -122,12 +119,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
    {
        y_buf.FromDevice(y_validation.data());

-        auto op = [](const auto& v0) { return v0 * v0; };
+        auto op = [](const XDataType& v0) -> YDataType {
+            XElementwiseOperation element_op{};
+            YDataType result;
+            element_op(result, v0);
+            return result;
+        };

        ck_tile::reference_unary_elementwise<XDataType, YDataType, YDataType>(x_host_a, y_host, op);

        pass = ck_tile::check_err(
-            y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
+            y_validation, y_host, "Elementwise unary op: Incorrect results!", 0.01, 0.01);
    }

    if(arg_parser.get_int("json") == 1)
@@ -145,17 +147,69 @@ bool run(const ck_tile::ArgParser& arg_parser)
    return pass;
 }

+template <typename XElementwiseOperation, typename XDataType, typename YDataType>
+bool filter_then_run(const ck_tile::ArgParser& arg_parser)
+{
+    auto throw_unsupported = [&]() {
+        const auto x_prec = arg_parser.get_str("x_prec");
+        const auto op     = arg_parser.get_str("op");
+        throw std::runtime_error("Unsupported! x_prec: " + x_prec + ", op: " + op);
+    };
+    bool pass = true;
+
+    if constexpr(std::is_same_v<XElementwiseOperation, ck_tile::element_wise::UnarySquare> &&
+                 std::is_same_v<XDataType, ck_tile::bf16_t>)
+    {
+        throw_unsupported();
+    }
+    else
+    {
+        pass = run<XElementwiseOperation, XDataType, YDataType>(arg_parser);
+    }
+
+    return pass;
+}
+
+auto string_to_op(const std::string& op)
+{
+    using OpVariant =
+        std::variant<ck_tile::element_wise::UnarySquare, ck_tile::element_wise::UnaryConvert>;
+
+    if(op == "1")
+        return OpVariant{ck_tile::element_wise::UnarySquare{}};
+    else if(op == "2")
+        return OpVariant{ck_tile::element_wise::UnaryConvert{}};
+    else
+    {
+        throw std::runtime_error("Unsupported unary operation: " + op);
+    }
+};
+
 int main(int argc, char* argv[])
 {
    auto [result, arg_parser] = create_args(argc, argv);
    if(!result)
        return -1;

-    const std::string data_type = arg_parser.get_str("prec");
-    if(data_type == "fp16")
+    try
    {
-        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+        const auto x_prec_variant = string_to_datatype(arg_parser.get_str("x_prec"));
+        const auto y_prec_variant = string_to_datatype(arg_parser.get_str("y_prec"));
+        const auto op_variant     = string_to_op(arg_parser.get_str("op"));
+        return std::visit(
+            [&](auto&& op, auto&& x_dt, auto&& y_dt) -> int {
+                using XElementwiseOperation = std::decay_t<decltype(op)>;
+                using XDataType             = std::decay_t<decltype(x_dt)>;
+                using YDataType             = std::decay_t<decltype(y_dt)>;
+                return filter_then_run<XElementwiseOperation, XDataType, YDataType>(arg_parser);
+            },
+            op_variant,
+            x_prec_variant,
+            y_prec_variant);
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return -3;
    }
-
-    return -3;
 }