move test_copy into test

2026-05-12 01:10:17 +00:00 · 2025-07-17 03:10:46 +00:00
parent 21627d7ca7
commit 804f77dce5
8 changed files with 137 additions and 129 deletions
--- a/example/ck_tile/36_copy/CMakeLists.txt
+++ b/example/ck_tile/36_copy/CMakeLists.txt
@@ -1,4 +0,0 @@
-add_executable(test_copy_kernel EXCLUDE_FROM_ALL test_copy.cpp)
-target_compile_options(test_copy_kernel PRIVATE
-  -mllvm -enable-noalias-to-md-conversion=0
-)
--- a/example/ck_tile/36_copy/test_copy.cpp
+++ b/example/ck_tile/36_copy/test_copy.cpp
@@ -1,119 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck_tile/host.hpp"
-#include <cstring>
-#include "test_copy.hpp"
-
-auto create_args(int argc, char* argv[])
-{
-    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("m", "64", "m dimension")
-        .insert("n", "8", "n dimension")
-        .insert("id", "0", "warp to use")
-        .insert("v", "1", "cpu validation or not")
-        .insert("prec", "fp16", "precision")
-        .insert("warmup", "50", "cold iter")
-        .insert("repeat", "100", "hot iter");
-
-    bool result = arg_parser.parse(argc, argv);
-    return std::make_tuple(result, arg_parser);
-}
-
-template <typename DataType>
-bool run(const ck_tile::ArgParser& arg_parser)
-{
-    using XDataType = DataType;
-    using YDataType = DataType;
-
-    ck_tile::index_t m       = arg_parser.get_int("m");
-    ck_tile::index_t n       = arg_parser.get_int("n");
-    ck_tile::index_t warp_id = arg_parser.get_int("id");
-    int do_validation        = arg_parser.get_int("v");
-    int warmup               = arg_parser.get_int("warmup");
-    int repeat               = arg_parser.get_int("repeat");
-
-    constexpr auto dword_bytes = 4;
-
-    if(n % (dword_bytes / sizeof(DataType)) != 0)
-    {
-        std::cerr << "n should be multiple of 2" << std::endl;
-        return false;
-    }
-
-    ck_tile::HostTensor<XDataType> x_host({m, n});
-    ck_tile::HostTensor<YDataType> y_host_ref({m, n});
-    ck_tile::HostTensor<YDataType> y_host_dev({m, n});
-
-    // ck_tile::FillConstant<XDataType>{1.f}(x_host);
-    ck_tile::half_t value = 1;
-    for(int i = 0; i < m; i++)
-    {
-        value = 1;
-        for(int j = 0; j < n; j++)
-        {
-            x_host(i, j) = value++;
-        }
-    }
-
-    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
-
-    x_buf.ToDevice(x_host.data());
-
-    using BlockWaves         = ck_tile::sequence<2, 1>;
-    using BlockTile          = ck_tile::sequence<64, 8>;
-    using WaveTile           = ck_tile::sequence<64, 8>;
-    using Vector             = ck_tile::sequence<1, 2>;
-    constexpr bool AsyncCopy = true;
-
-    ck_tile::index_t kGridSize =
-        ck_tile::integer_divide_ceil(m, BlockTile::at(ck_tile::number<0>{}));
-
-    using Shape   = ck_tile::TileCopyShape<BlockWaves, BlockTile, WaveTile, Vector>;
-    using Problem = ck_tile::TileCopyProblem<XDataType, Shape, AsyncCopy>;
-    using Kernel  = ck_tile::TileCopy<Problem>;
-
-    constexpr ck_tile::index_t kBlockSize  = 128;
-    constexpr ck_tile::index_t kBlockPerCu = 1;
-
-    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
-                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
-                                       Kernel{},
-                                       kGridSize,
-                                       kBlockSize,
-                                       0,
-                                       static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
-                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
-                                       m,
-                                       n,
-                                       warp_id));
-
-    std::size_t num_btype = sizeof(XDataType) * m * n + sizeof(YDataType) * m;
-
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
-
-    bool pass = true;
-
-    if(do_validation)
-    {
-        // reference
-        y_buf.FromDevice(y_host_dev.mData.data());
-        pass = ck_tile::check_err(y_host_dev, x_host);
-
-        std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl;
-    }
-
-    return pass;
-}
-
-int main(int argc, char* argv[])
-{
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
-    const std::string data_type = arg_parser.get_str("prec");
-    return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
-}
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -21,5 +21,4 @@ add_subdirectory(18_flatmm)
 add_subdirectory(19_gemm_multi_d)
 add_subdirectory(20_grouped_convolution)
 add_subdirectory(35_batched_transpose)
-add_subdirectory(36_copy)
 add_subdirectory(37_transpose)
--- a/test/ck_tile/CMakeLists.txt
+++ b/test/ck_tile/CMakeLists.txt
@@ -6,3 +6,4 @@ add_subdirectory(grouped_gemm)
 add_subdirectory(gemm_multi_d)
 add_subdirectory(data_type)
 add_subdirectory(slice_tile)
+add_subdirectory(memory_copy)
--- a/test/ck_tile/memory_copy/CMakeLists.txt
+++ b/test/ck_tile/memory_copy/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(GPU_TARGETS MATCHES "gfx950") # only built when GPU_TARGETS matches gfx950
+    add_gtest_executable(test_memory_copy test_copy.cpp) # NOTE(review): the removed example passed -mllvm -enable-noalias-to-md-conversion=0; confirm it is not needed here
+endif()
--- a/test/ck_tile/memory_copy/README.md
+++ b/test/ck_tile/memory_copy/README.md
--- a/test/ck_tile/memory_copy/test_copy.cpp
+++ b/test/ck_tile/memory_copy/test_copy.cpp
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <algorithm>
+#include <gtest/gtest.h>
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "test_copy.hpp"
+
+struct MemoryCopyParam
+{
+    MemoryCopyParam(ck_tile::index_t rows, ck_tile::index_t cols, ck_tile::index_t warp)
+        : m{rows}, n{cols}, warp_id{warp}
+    {
+    }
+    ck_tile::index_t m;       // first tensor extent (rows)
+    ck_tile::index_t n;       // second tensor extent (columns)
+    ck_tile::index_t warp_id; // forwarded to the kernel as its warp_id argument
+};
+
+/// Typed fixture: copies an m x n tensor with ck_tile::TileCopy and compares output to input.
+template <typename DataType>
+class TestCkTileMemoryCopy : public ::testing::Test
+{
+    protected:
+    /// Runs one copy for memcpy_params. An unsupported n aborts this Run with a fatal failure;
+    /// a data mismatch is reported as a non-fatal failure tagged with m, n and warp_id.
+    void Run(const MemoryCopyParam& memcpy_params)
+    {
+        using XDataType = DataType;
+        using YDataType = DataType;
+
+        ck_tile::index_t m       = memcpy_params.m;
+        ck_tile::index_t n       = memcpy_params.n;
+        ck_tile::index_t warp_id = memcpy_params.warp_id;
+
+        // A row must hold a whole number of dwords; fail the test instead of only logging.
+        constexpr ck_tile::index_t dword_bytes     = 4;
+        constexpr ck_tile::index_t elems_per_dword = dword_bytes / sizeof(DataType);
+        ASSERT_EQ(n % elems_per_dword, 0) << "n must be a multiple of " << elems_per_dword;
+
+        ck_tile::HostTensor<XDataType> x_host({m, n});
+        ck_tile::HostTensor<YDataType> y_host_dev({m, n});
+        std::cout << "input: " << x_host.mDesc << std::endl;
+        std::cout << "output: " << y_host_dev.mDesc << std::endl;
+
+        // Every row is filled with 1, 2, ..., n (generated as ck_tile::half_t).
+        for(int i = 0; i < m; i++)
+        {
+            ck_tile::half_t value = 1;
+            for(int j = 0; j < n; j++)
+                x_host(i, j) = value++;
+        }
+
+        ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
+
+        x_buf.ToDevice(x_host.data());
+        // Zero the output so stale device memory cannot satisfy the comparison below.
+        y_buf.SetZero();
+
+        using BlockWaves         = ck_tile::sequence<2, 1>;
+        using BlockTile          = ck_tile::sequence<64, 8>;
+        using WaveTile           = ck_tile::sequence<64, 8>;
+        using Vector             = ck_tile::sequence<1, 2>;
+        constexpr bool AsyncCopy = true;
+
+        ck_tile::index_t kGridSize =
+            ck_tile::integer_divide_ceil(m, BlockTile::at(ck_tile::number<0>{}));
+
+        using Shape   = ck_tile::TileCopyShape<BlockWaves, BlockTile, WaveTile, Vector>;
+        using Problem = ck_tile::TileCopyProblem<XDataType, Shape, AsyncCopy>;
+        using Kernel  = ck_tile::TileCopy<Problem>;
+
+        constexpr ck_tile::index_t kBlockSize  = 128;
+        constexpr ck_tile::index_t kBlockPerCu = 1;
+
+        launch_kernel(ck_tile::stream_config{},
+                      ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
+                          Kernel{},
+                          kGridSize,
+                          kBlockSize,
+                          0,
+                          static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
+                          static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
+                          m,
+                          n,
+                          warp_id));
+
+        // Copy the device result back and compare it with the host input.
+        y_buf.FromDevice(y_host_dev.mData.data());
+        EXPECT_TRUE(ck_tile::check_err(y_host_dev, x_host))
+            << "m=" << m << " n=" << n << " warp_id=" << warp_id;
+    }
+};
+
+// Named fixtures for TEST_F, one per element type under test.
+class TestCkTileMemoryCopyHalf : public TestCkTileMemoryCopy<ck_tile::half_t>
+{
+};
+class TestCkTileMemoryCopyBFloat : public TestCkTileMemoryCopy<ck_tile::bf16_t>
+{
+};
+
+TEST_F(TestCkTileMemoryCopyHalf, TestCorrectness)
+{
+    // Same four {m, n} shapes, first with warp_id 0 and then with warp_id 1.
+    for(const ck_tile::index_t warp_id : {0, 1})
+    {
+        this->Run({64, 8, warp_id});
+        this->Run({63, 8, warp_id});
+        this->Run({63, 2, warp_id});
+        this->Run({127, 30, warp_id});
+    }
+}
+
+TEST_F(TestCkTileMemoryCopyBFloat, TestCorrectness)
+{
+    // Same four {m, n} shapes, first with warp_id 0 and then with warp_id 1.
+    for(const ck_tile::index_t warp_id : {0, 1})
+    {
+        this->Run({64, 8, warp_id});
+        this->Run({63, 8, warp_id});
+        this->Run({63, 2, warp_id});
+        this->Run({127, 30, warp_id});
+    }
+}
--- a/test/ck_tile/memory_copy/test_copy.hpp
+++ b/test/ck_tile/memory_copy/test_copy.hpp
@@ -18,10 +18,10 @@ template <typename BlockWaves, // num warps along seq<M, N>
 struct TileCopyShape
 {
    // We split Workgroup waves into two specialized groups.
-    // One for reading data from global -> LDS, the other is doing reduction
+    // One group reads data from global -> LDS; the other group stays idle
    static constexpr index_t WaveGroups = 2;
    static constexpr index_t MWarps     = BlockWaves::at(number<0>{});
-    static constexpr index_t NWarps     = BlockWaves::at(number<0>{});
+    static constexpr index_t NWarps     = BlockWaves::at(number<1>{});

    static constexpr index_t Block_M = BlockTile::at(number<0>{});
    static constexpr index_t Block_N = BlockTile::at(number<1>{});
@@ -36,9 +36,8 @@ struct TileCopyShape
    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;

    // We splited the waves on M dimension
-    static constexpr index_t WarpPerBlock_M =
-        integer_divide_ceil(BlockWaves::at(number<0>{}), WaveGroups);
-    static constexpr index_t WarpPerBlock_N = BlockWaves::at(number<1>{});
+    static constexpr index_t WarpPerBlock_M = integer_divide_ceil(MWarps, WaveGroups);
+    static constexpr index_t WarpPerBlock_N = NWarps;

    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);