Flatmm merge (#2168)

* sync with function interface of cshuffleepiloge,fix flatmm build fail * move code from solin/flatmm which add mfma16*16*32fp8 and optimize flatmm --------- Co-authored-by: solin <bingzhou@amd.com>
2026-05-04 21:51:28 +00:00 · 2025-05-08 12:59:57 +08:00
parent c7b8e86e34
commit 6a3960c1e1
11 changed files with 552 additions and 192 deletions
--- a/example/ck_tile/18_flatmm/run_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc
@@ -1,6 +1,20 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
+#include <type_traits>
+
+template <typename T>
+constexpr const char* DataTypeToString() {
+    if constexpr (std::is_same_v<T, ck_tile::half_t>) {
+        return "fp16";
+    } else if constexpr (std::is_same_v<T, ck_tile::fp8_t>) {
+        return "fp8";
+    } else if constexpr (std::is_same_v<T, ck_tile::bf8_t>) {
+        return "bf8";
+    } else {
+        return "unknown";
+    }
+}

 template <typename Layout>
 static constexpr inline auto is_row_major(Layout layout_)
@@ -11,7 +25,7 @@ static constexpr inline auto is_row_major(Layout layout_)

 // mfma_type, 0:32x32, 1:16x16
 template <typename T>
-auto shuffle_b(const ck_tile::HostTensor<T>& t, std::string mfma_dtype, int mfma_type = 0)
+auto shuffle_b(const ck_tile::HostTensor<T>& t, std::string mfma_dtype, int mfma_type)
 {
    assert(t.get_lengths().size() == 2);
    int n_ = t.get_lengths()[1];
@@ -29,13 +43,13 @@ auto shuffle_b(const ck_tile::HostTensor<T>& t, std::string mfma_dtype, int mfma
        std::copy(t.begin(), t.end(), t_view.begin());
        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
    }
-    else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 0)
+    else if((mfma_dtype == "int8" || mfma_dtype == "fp8" || mfma_dtype == "bf8") && mfma_type == 0)
    {
        ck_tile::HostTensor<T> t_view({n_ / 32, 32, k_ / 32, 2, 16});
        std::copy(t.begin(), t.end(), t_view.begin());
        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
    }
-    else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 1)
+    else if((mfma_dtype == "int8" || mfma_dtype == "fp8" || mfma_dtype == "bf8") && mfma_type == 1)
    {
        ck_tile::HostTensor<T> t_view({n_ / 16, 16, k_ / 64, 4, 16});
        std::copy(t.begin(), t.end(), t_view.begin());
@@ -44,6 +58,7 @@ auto shuffle_b(const ck_tile::HostTensor<T>& t, std::string mfma_dtype, int mfma
    return t;
 }

+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
 auto calculate_rtol_atol(const ck_tile::index_t K,
                         const ck_tile::index_t kbatch,
                         const float max_accumulated_value)
@@ -64,7 +79,13 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
 }

-template <typename ALayout, typename BLayout, typename CLayout>
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
 float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf,
                    ck_tile::DeviceMem& b_shuffle_dev_buf,
                    ck_tile::DeviceMem& c_dev_buf,
@@ -91,7 +112,7 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf,
    args.stride_B = stride_B;
    args.stride_C = stride_C;

-    float ave_time = flatmm_calc<ALayout, BLayout, CLayout>(
+    float ave_time = flatmm_calc<ADataType, BDataType, AccDataType, CDataType, ALayout, BLayout, CLayout>(
        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});

    std::size_t flop = std::size_t(2) * M * N * K;
@@ -100,7 +121,7 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf,
    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_byte / 1.E6 / ave_time;

-    std::cout << "Run Flatmm kernel with M =" << M << " N =" << N << " K =" << K
+    std::cout << "Run Flatmm kernel with DataType = " << DataTypeToString<ADataType>() << " M =" << M << " N =" << N << " K =" << K
              << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C
              << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << std::endl;
@@ -108,7 +129,10 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf,
    return ave_time;
 }

-template <typename ALayout, typename BLayout, typename CLayout>
+template <typename PrecType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
 int run_flatmm_example_with_layouts(int argc,
                                    char* argv[],
                                    const ALayout a_layout                  = ALayout{},
@@ -119,6 +143,11 @@ int run_flatmm_example_with_layouts(int argc,
    if(!result)
        return -1;

+    using ADataType   = typename GemmBasicTypeConfig<PrecType>::ADataType;
+    using BDataType   = typename GemmBasicTypeConfig<PrecType>::BDataType;
+    using CDataType   = typename GemmBasicTypeConfig<PrecType>::CDataType;
+    using AccDataType = typename GemmBasicTypeConfig<PrecType>::AccDataType;
+    
    ck_tile::index_t M = arg_parser.get_int("m");
    ck_tile::index_t N = arg_parser.get_int("n");
    ck_tile::index_t K = arg_parser.get_int("k");
@@ -154,11 +183,17 @@ int run_flatmm_example_with_layouts(int argc,

    // do pre-shuffle
    std::string mfma                              = arg_parser.get_str("prec");
-    ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b(b_origin_host, mfma, 0);
+#if defined(USING_MFMA_16x16x32) && defined(ENABLE_FP8)
+    ck_tile::index_t mfma_type = 1;
+#else
+    ck_tile::index_t mfma_type = 0;
+#endif
+    ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b(b_origin_host, mfma, mfma_type);
    ck_tile::DeviceMem b_shuffle_dev_buf(b_shuffle_host.get_element_space_size_in_bytes());
    b_shuffle_dev_buf.ToDevice(b_shuffle_host.data());

-    invoke_flatmm<ALayout, BLayout, CLayout>(a_dev_buf,
+    invoke_flatmm<ADataType, BDataType, AccDataType, CDataType, ALayout, BLayout, CLayout>(
+                                             a_dev_buf,
                                             b_shuffle_dev_buf,
                                             c_dev_buf,
                                             M,
@@ -184,7 +219,7 @@ int run_flatmm_example_with_layouts(int argc,
            a_host, b_origin_host, c_ref_host);
        const float max_accumulated_value =
            *std::max_element(c_ref_host.mData.begin(), c_ref_host.mData.end());
-        const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value);
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(K, kbatch, max_accumulated_value);
        pass                 = ck_tile::check_err(c_rslt_host,
                                  c_ref_host,
                                  "Error: Incorrect results!",
@@ -242,7 +277,7 @@ int run_flatmm_example_with_layouts(int argc,
        c_gpu_ref_dev_buf.FromDevice(c_gpu_ref_host.data());
        const float max_accumulated_value =
            *std::max_element(c_gpu_ref_host.mData.begin(), c_gpu_ref_host.mData.end());
-        const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value);
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(K, kbatch, max_accumulated_value);
        pass                 = ck_tile::check_err(c_rslt_host,
                                  c_gpu_ref_host,
                                  "Error: Incorrect results!",
@@ -257,25 +292,3 @@ int run_flatmm_example_with_layouts(int argc,

    return pass;
 }
-
-int run_flatmm_example(int argc, char* argv[])
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
-
-    std::string a_layout = arg_parser.get_str("a_layout");
-    std::string b_layout = arg_parser.get_str("b_layout");
-
-    if(a_layout == "R" && b_layout == "C")
-    {
-        return run_flatmm_example_with_layouts(argc, argv, Row{}, Col{}, Row{});
-    }
-    else
-    {
-        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
-    }
-}