[CK Tile] gemm splitk two stage (#2697)

* Fix a typo
* Use std::variant to call run_gemm_example_with_layouts with the available layout variant combinations
* Use a unified run_gemm_example_prec_type for basic gemm and universal gemm
* Factor out run_gemm_example_prec_type
* Refactor argument parsing in gemm_splitk_two_stage_reduce.cpp
* Parse arguments outside of create_args
* Move the gemm operators to separate structs to facilitate their reuse
* Move the invokers to separate files to facilitate their reuse
* Rename the invoker files for consistency with the examples that use them
* Add fp32 support to the elementwise examples, and produce an error message for unsupported types
* Get rid of four unused variables
* Make two variables const
* Add support for different input-output type combinations in elementwise examples
* Test support for different input and output types in elementwise examples
* Add support for different operations in the elementwise unary tests
* Add support for UnaryConvert in the elementwise unary tests
* Add support for bf16 in elementwise examples, excluding unsupported type combinations
* Make some operator parameters const in ElementWiseKernel
* Remove some unnecessary include statements
* Implement a two-stage GEMM that does a type conversion in the second stage using the elementwise kernel
* Clear workspace instead of output when flushing the cache in SplitKTwoStageInvoker::gemm
* Fix formatting issues reported by clang
* Add back CK_TILE_USE_WMMA related changes
* Use the right prec type for bf16 in the universal GEMM and two stage split K examples
* Add some brackets
* Add some brackets
* Separate the clearing of the GEMM output memory from the cache flushing in the universal GEMM example
* Separate the clearing of the GEMM output memory from the cache flushing in the split K two stage example
* Fix formatting
* No need to call SetZero on ws_m_n_dev_buf here, as clear_gemm_output now does this as part of the kernel preprocessing
* Add fp16 data type to splitk two stage example
* Add preprocessing with optional cache flushing and clearing of output for k_batch > 1 to the basic GEMM example

[ROCm/composable_kernel commit: 1acd8e041c]
2026-07-18 09:38:17 +00:00 · 2025-09-04 14:33:44 +03:00
parent d8583d1abf
commit e4ac6bca96
21 changed files with 1245 additions and 782 deletions
--- a/include/ck_tile/ops/elementwise/binary_elementwise_operation.hpp
+++ b/include/ck_tile/ops/elementwise/binary_elementwise_operation.hpp
@@ -82,6 +82,14 @@ struct Add
        y                  = type_convert<bf16_t>(y_tmp);
    }

+    template <>
+    __host__ __device__ constexpr void
+    operator()<bf16_t>(bf16_t& y, const float& x0, const float& x1) const
+    {
+        const float y_tmp = x0 + x1;
+        y                 = type_convert<bf16_t>(y_tmp);
+    }
+
    template <>
    __host__ __device__ constexpr void
    operator()<int8_t>(int8_t& y, const int8_t& x0, const int8_t& x1) const
--- a/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp
+++ b/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp
@@ -23,9 +23,9 @@ struct ElementWiseKernel
    static constexpr index_t kBlockSize = Problem::BlockShape::kBlockSize;

    template <typename... XDataType, typename Dims>
-    CK_TILE_DEVICE void operator()(Dims lens,
-                                   Dims input_strides,
-                                   Dims output_strides,
+    CK_TILE_DEVICE void operator()(const Dims lens,
+                                   const Dims input_strides,
+                                   const Dims output_strides,
                                   const tuple<XDataType...>& input_tensors,
                                   YDataType* p_y) const
    {
--- a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
+++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
@@ -326,7 +326,6 @@ struct MultiDAdd
    }
 };

-#if 0
 struct UnaryConvert
 {
    template <typename Y, typename X>
@@ -336,6 +335,7 @@ struct UnaryConvert
    }
 };

+#if 0
 struct ConvertBF16RTN
 {
    // convert to bf16 using round to nearest (rtn)
@@ -472,14 +472,14 @@ struct UnaryDivide

 struct UnarySquare
 {
-    template <typename T>
-    CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
+    template <typename Y, typename X>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
    {
-        static_assert(std::is_same_v<T, float> || std::is_same_v<T, ck_tile::fp16_t> ||
-                          std::is_same_v<T, double> || std::is_same_v<T, int32_t> ||
-                          std::is_same_v<T, int8_t>
+        static_assert(std::is_same_v<X, float> || std::is_same_v<X, ck_tile::fp16_t> ||
+                          std::is_same_v<X, double> || std::is_same_v<X, int32_t> ||
+                          std::is_same_v<X, int8_t>
 #ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-                          || std::is_same_v<T, int4_t>
+                          || std::is_same_v<X, int4_t>
 #endif
                      ,
                      "Data type is not supported by this operation!");