fix async copytest bug (#2509)

* fix async copytest bug

* Add block_sync_lds_direct_load utility

* fix the s_waitcnt_imm calculation

* Improve s_waitcnt_imm calculation

* fix vmcnt shift

* add input validation and bug fix

* remove unnecessary output

* move test_copy into test

* change bit width check

* refactor macros into constexpr functions

which still get inlined

* wrap s_waitcnt api

* parameterize test

* cleanup

* cleanup fp8 stub

* add fp8 test cases; todo which input parameters are valid?

* replace n for fp8 in test cases

* add large shapes; fp8 fails again

* change input init

* test sync/async

* time the test

* clang-format test

* use float instead of bfloat to cover a 4-byte type

* fix logic - arg sections should be 'or'd

* make block_sync_lds_direct_load interface similar to old ck

* fix a few comment typos

* name common shapes

* revert the example to original logic of not waiting lds

* clang-format

---------

Co-authored-by: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Co-authored-by: Thomas Ning <Thomas.Ning@amd.com>
This commit is contained in:
Haocong WANG
2025-07-23 15:14:02 +08:00
committed by GitHub
parent e62710e461
commit a5fdc663c8
9 changed files with 313 additions and 191 deletions

View File

@@ -0,0 +1,3 @@
# The async global->LDS direct-load path exercised by this test is only
# available on gfx950, so the test is built only when that target is requested.
if(GPU_TARGETS MATCHES "gfx950")
add_gtest_executable(test_memory_copy test_copy.cpp)
endif()

View File

@@ -0,0 +1,31 @@
# Copy Kernel
This folder contains basic setup code designed to provide a platform for novice
CK_Tile kernel developers to test basic functionality with minimal additional
code compared to the functional code. Sample code implementing a simple
tile distribution for a DRAM window and an LDS window is provided: data moves
from DRAM to registers, registers to LDS, LDS back to registers, and finally
to an output DRAM window, performing a simple copy operation.
## build
```
# in the root of ck_tile
mkdir build && cd build
# you can replace <arch> with the appropriate architecture or leave it blank
# (note: this test is only built when GPU_TARGETS includes gfx950)
sh ../script/cmake-ck-dev.sh ../ <arch>
# Make the copy kernel executable
make test_memory_copy -j
```
This will result in an executable `build/bin/test_memory_copy`
## example
```
args:
-m input matrix rows. (default 64)
-n input matrix cols. (default 8)
-id warp to use for computation. (default 0)
-v validation flag to check device results. (default 1)
-prec datatype precision to use. (default fp16)
-warmup no. of warmup iterations. (default 50)
-repeat no. of iterations for kernel execution time. (default 100)
```

View File

@@ -0,0 +1,193 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <gtest/gtest.h>
#include "ck_tile/host.hpp"
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "test_copy.hpp"
struct MemoryCopyParam
{
MemoryCopyParam(ck_tile::index_t m_, ck_tile::index_t n_, ck_tile::index_t warp_id_)
: m(m_), n(n_), warp_id(warp_id_)
{
}
ck_tile::index_t m;
ck_tile::index_t n;
ck_tile::index_t warp_id;
};
// Fixture that round-trips an (m x n) matrix through the TileCopy kernel
// (DRAM -> LDS -> DRAM) and validates the device output against the host
// input. DataType selects the element type; AsyncCopy selects the async
// global->LDS direct-load path (true) or the classic load/store path (false).
template <typename DataType, bool AsyncCopy = true>
class TestCkTileMemoryCopy : public ::testing::TestWithParam<std::tuple<int, int, int>>
{
    protected:
    // Runs one copy on the device and checks the output matches the input.
    void Run(const MemoryCopyParam& memcpy_params)
    {
        using XDataType = DataType;
        using YDataType = DataType;

        ck_tile::index_t m       = memcpy_params.m;
        ck_tile::index_t n       = memcpy_params.n;
        ck_tile::index_t warp_id = memcpy_params.warp_id;

        // The kernel moves whole dwords, so n must hold a whole number of
        // 4-byte vectors of DataType.
        constexpr auto dword_bytes = 4;
        if(n % (dword_bytes / sizeof(DataType)) != 0)
        {
            // Fix: previously only an error message was printed and the
            // kernel was launched anyway with a misaligned n; skip instead.
            GTEST_SKIP() << "n must be a multiple of " << (dword_bytes / sizeof(DataType))
                         << " elements (one dword) for this data type";
        }

        ck_tile::HostTensor<XDataType> x_host({m, n});
        ck_tile::HostTensor<YDataType> y_host_dev({m, n});
        std::cout << "input: " << x_host.mDesc << std::endl;
        std::cout << "output: " << y_host_dev.mDesc << std::endl;

        // Deterministic per-row pattern; values stay within [0, 126].
        ck_tile::index_t value = 1;
        for(int i = 0; i < m; i++)
        {
            value = 1;
            for(int j = 0; j < n; j++)
            {
                value        = (value + 1) % 127;
                x_host(i, j) = static_cast<DataType>(value);
            }
        }

        ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
        ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
        x_buf.ToDevice(x_host.data());

        using BlockWaves = ck_tile::sequence<2, 1>;
        using BlockTile  = ck_tile::sequence<64, 8>;
        using WaveTile   = ck_tile::sequence<64, 8>;
        using Vector     = ck_tile::sequence<1, dword_bytes / sizeof(DataType)>;

        // One workgroup per Block_M rows; each workgroup iterates over N.
        ck_tile::index_t kGridSize =
            ck_tile::integer_divide_ceil(m, BlockTile::at(ck_tile::number<0>{}));

        using Shape   = ck_tile::TileCopyShape<BlockWaves, BlockTile, WaveTile, Vector>;
        using Problem = ck_tile::TileCopyProblem<XDataType, Shape, AsyncCopy>;
        using Kernel  = ck_tile::TileCopy<Problem>;

        constexpr ck_tile::index_t kBlockSize  = 128;
        constexpr ck_tile::index_t kBlockPerCu = 1;
        auto ms = launch_kernel(ck_tile::stream_config{nullptr, true},
                                ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
                                    Kernel{},
                                    kGridSize,
                                    kBlockSize,
                                    0,
                                    static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
                                    static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
                                    m,
                                    n,
                                    warp_id));

        // Each element is read once from global memory and written once back,
        // hence the factor of 2. Use 64-bit math: 2*m*n would be computed in
        // 32-bit index_t first and risks overflow for large shapes.
        auto bytes = 2ull * m * n * sizeof(DataType);
        std::cout << "elapsed: " << ms << " (ms)" << std::endl;
        std::cout << (bytes * 1e-6 / ms) << " (GB/s)" << std::endl;

        // Validate: output must be bit-identical to the untouched input.
        y_buf.FromDevice(y_host_dev.mData.data());
        bool pass = ck_tile::check_err(y_host_dev, x_host);
        EXPECT_TRUE(pass);
    }
};
// Concrete fixture types — gtest requires a distinct named class per suite.
// fp16 (2-byte element) through the async global->LDS direct-load path.
class TestCkTileMemoryCopyHalfAsync : public TestCkTileMemoryCopy<ck_tile::half_t>
{
};
// fp16 through the synchronous load/store-via-registers path.
class TestCkTileMemoryCopyHalfSync : public TestCkTileMemoryCopy<ck_tile::half_t, false>
{
};
// fp32 (4-byte element) through the async path.
class TestCkTileMemoryCopyFloatAsync : public TestCkTileMemoryCopy<float>
{
};
// fp8 (1-byte element) through the async path.
class TestCkTileMemoryCopyFP8Async : public TestCkTileMemoryCopy<ck_tile::fp8_t>
{
};
// Each TEST_P unpacks the (M, N, warp_id) tuple and forwards it to Run().
TEST_P(TestCkTileMemoryCopyHalfAsync, TestCorrectness)
{
    const auto& [M, N, warp_id] = GetParam();
    Run(MemoryCopyParam{M, N, warp_id});
}
TEST_P(TestCkTileMemoryCopyHalfSync, TestCorrectness)
{
    const auto& [M, N, warp_id] = GetParam();
    Run(MemoryCopyParam{M, N, warp_id});
}
TEST_P(TestCkTileMemoryCopyFloatAsync, TestCorrectness)
{
    const auto& [M, N, warp_id] = GetParam();
    Run(MemoryCopyParam{M, N, warp_id});
}
TEST_P(TestCkTileMemoryCopyFP8Async, TestCorrectness)
{
    const auto& [M, N, warp_id] = GetParam();
    Run(MemoryCopyParam{M, N, warp_id});
}
INSTANTIATE_TEST_SUITE_P(TestCkTileMemCopySuite,
TestCkTileMemoryCopyHalfAsync,
::testing::Values(std::tuple{64, 8, 0},
std::tuple{63, 8, 0},
std::tuple{63, 2, 0},
std::tuple{127, 30, 0},
std::tuple{64, 8, 1},
std::tuple{63, 8, 1},
std::tuple{63, 2, 1},
std::tuple{127, 30, 1},
std::tuple{16384, 16384, 0},
std::tuple{16384, 16384, 1}));
INSTANTIATE_TEST_SUITE_P(TestCkTileMemCopySuite,
TestCkTileMemoryCopyHalfSync,
::testing::Values(std::tuple{64, 8, 0},
std::tuple{63, 8, 0},
std::tuple{63, 2, 0},
std::tuple{127, 30, 0},
std::tuple{64, 8, 1},
std::tuple{63, 8, 1},
std::tuple{63, 2, 1},
std::tuple{127, 30, 1},
std::tuple{16384, 16384, 0},
std::tuple{16384, 16384, 1}));
INSTANTIATE_TEST_SUITE_P(TestCkTileMemCopySuite,
TestCkTileMemoryCopyFloatAsync,
::testing::Values(std::tuple{64, 8, 0},
std::tuple{63, 8, 0},
std::tuple{63, 2, 0},
std::tuple{127, 30, 0},
std::tuple{64, 8, 1},
std::tuple{63, 8, 1},
std::tuple{63, 2, 1},
std::tuple{127, 30, 1},
std::tuple{16384, 16384, 0},
std::tuple{16384, 16384, 1}));
INSTANTIATE_TEST_SUITE_P(TestCkTileMemCopySuite,
TestCkTileMemoryCopyFP8Async,
::testing::Values(std::tuple{64, 8, 0},
std::tuple{63, 8, 0},
std::tuple{63, 4, 0},
std::tuple{127, 20, 0},
std::tuple{64, 8, 1},
std::tuple{63, 8, 1},
std::tuple{63, 4, 1},
std::tuple{127, 20, 1},
std::tuple{16384, 16384, 0},
std::tuple{16384, 16384, 1}));

View File

@@ -0,0 +1,173 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/host/kernel_launch.hpp"
namespace ck_tile {
template <typename BlockWaves, // num warps along seq<M, N>
typename BlockTile, // block size, seq<M, N>
typename WaveTile, // warp size, seq<M, N>
typename Vector> // contiguous elements (vector size) along seq<M, N>
struct TileCopyShape
{
// We split Workgroup waves into two specialized groups.
// One for reading data from global -> LDS, the other idling
static constexpr index_t WaveGroups = 2;
static constexpr index_t MWarps = BlockWaves::at(number<0>{});
static constexpr index_t NWarps = BlockWaves::at(number<1>{});
static constexpr index_t Block_M = BlockTile::at(number<0>{});
static constexpr index_t Block_N = BlockTile::at(number<1>{});
static constexpr index_t Warp_M = WaveTile::at(number<0>{});
static constexpr index_t Warp_N = WaveTile::at(number<1>{});
static constexpr index_t Vector_M = Vector::at(number<0>{});
static constexpr index_t Vector_N = Vector::at(number<1>{});
static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
// We splitted the waves on M dimension
static constexpr index_t WarpPerBlock_M = integer_divide_ceil(MWarps, WaveGroups);
static constexpr index_t WarpPerBlock_N = NWarps;
static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
static constexpr index_t WaveNum = reduce_on_sequence(BlockWaves{}, multiplies{}, number<1>{});
static constexpr index_t BlockSize = get_warp_size() * WaveNum;
static constexpr index_t WaveGroupSize = WaveNum / WaveGroups;
static_assert(WaveGroupSize == WarpPerBlock_M * WarpPerBlock_N,
"Inconsistent wave group size!");
};
// Aggregates the compile-time configuration consumed by TileCopy:
// the element type, the tile shape, and which copy path to use.
template <typename XDataType_, typename BlockShape_, bool AsyncCopy_>
struct TileCopyProblem
{
    using XDataType  = remove_cvref_t<XDataType_>;
    using BlockShape = remove_cvref_t<BlockShape_>;
    // true: async global -> LDS direct load; false: stage through registers.
    static constexpr bool AsyncCopy = AsyncCopy_;
};
// Block-scope copy kernel: streams an (M x N) row-major matrix from global
// memory (p_x) to global memory (p_y), staging each (Block_M x Block_N) tile
// through LDS. Only the wave whose hardware warp id matches the runtime
// `warp_id` argument performs the copy; the other wave group idles
// (see TileCopyShape::WaveGroups).
template <typename Problem_>
struct TileCopy
{
    using Problem   = ck_tile::remove_cvref_t<Problem_>;
    using XDataType = typename Problem::XDataType;
    // Selects the async (global -> LDS direct load) branch vs. the classic
    // load-to-register / store-to-LDS branch in operator().
    static constexpr bool AsyncCopy = Problem::AsyncCopy;

    // Builds the thread <-> data mapping shared by the global-memory read
    // window and the LDS read window.
    template <typename Problem>
    CK_TILE_DEVICE static constexpr auto MakeDRAMDistribution()
    {
        using S                     = typename Problem::BlockShape;
        constexpr index_t warp_size = get_warp_size();
        constexpr index_t X0 = S::ThreadPerWarp_N; // threads needed along N dimension, fastest
                                                   // changing with given vector size.
        constexpr index_t X1 =
            S::Vector_N; // no. of elements along the N dimension read by each thread.
        constexpr index_t Y0 =
            S::WaveNum / S::WaveGroups; // number of active warps working in this thread block.
        constexpr index_t Y2 =
            warp_size / X0; // number of threads in a warp needed along the M dimension.
        constexpr index_t Y1 =
            S::Warp_M /
            Y2; // number of iterations each warp needs to perform to cover the entire tile window.
        constexpr auto outer_encoding =
            tile_distribution_encoding<sequence<S::WaveGroups>,
                                       tuple<sequence<Y0, Y1, Y2>, sequence<X0, X1>>,
                                       tuple<sequence<0, 1>, sequence<1, 2>>,
                                       tuple<sequence<0, 0>, sequence<2, 0>>,
                                       sequence<1, 2>,
                                       sequence<1, 1>>{};
        return make_static_tile_distribution(outer_encoding);
    }

    // p_x / p_y: input / output matrices of shape (M x N), row-major.
    // warp_id: id of the wave that should perform the copy.
    CK_TILE_DEVICE void
    operator()(const XDataType* p_x, XDataType* p_y, index_t M, index_t N, index_t warp_id) const
    {
        using S = typename Problem::BlockShape;

        // LDS staging buffer holding one (Block_M x Block_N) tile.
        __shared__ XDataType x_lds[S::Block_M * S::Block_N];
        constexpr auto block_dims    = make_tuple(number<S::Block_M>{}, number<S::Block_N>{});
        constexpr auto block_strides = make_tuple(number<S::Block_N>{}, number<1>{});
        const auto x_lds_desc        = make_naive_tensor_descriptor(
            block_dims, block_strides, number<S::Vector_N>{}, number<1>{});
        auto x_lds_view = make_tensor_view<address_space_enum::lds>(x_lds, x_lds_desc);
        // Two windows over the same LDS buffer: one written from global
        // memory, one read back using the DRAM-style distribution.
        auto x_block_lds_write_window = make_tile_window(x_lds_view, block_dims, {0, 0});
        auto x_block_lds_read_window =
            make_tile_window(x_lds_view, block_dims, {0, 0}, MakeDRAMDistribution<Problem>());

        // Each workgroup owns Block_M consecutive rows.
        const index_t iM = __builtin_amdgcn_readfirstlane(get_block_id() * S::Block_M);

        // Input tensor
        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
        auto x_block_window =
            make_tile_window(x_m_n, block_dims, {iM, 0}, MakeDRAMDistribution<Problem>());

        // Output tensor
        const auto y_m = make_naive_tensor_view<address_space_enum::global>(
            p_y, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
        auto y_block_window = make_tile_window(y_m, block_dims, {iM, 0});

        // Number of Block_N-wide column tiles needed to cover N.
        const index_t num_n_tile_iteration =
            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
        const index_t my_id = __builtin_amdgcn_readfirstlane(get_warp_id());

        // 0 => wait until every outstanding async copy has completed.
        constexpr index_t async_copy_fence_cnt = 0;
        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
        {
            // Only the selected wave moves data; the other wave group idles.
            if(my_id == warp_id)
            {
                if constexpr(AsyncCopy)
                {
                    // Direct global -> LDS copy, no round trip through VGPRs.
                    async_load_tile(x_block_lds_write_window, x_block_window);
                    // We don't have prefetch here, wait for the data back
                    // immediately: wait for all async-load insts to complete,
                    // then sync the waves.
                    // NOTE(review): this sync is executed only by the selected
                    // wave while the rest of the block idles — confirm that
                    // s_waitcnt_barrier is safe under that divergence.
                    s_waitcnt_barrier<async_copy_fence_cnt>();
                    auto lds_tile = load_tile(x_block_lds_read_window);
                    // store from registers to DRAM
                    store_tile(y_block_window, lds_tile);
                }
                else
                {
                    // load from DRAM to registers
                    auto dram_tile = load_tile(x_block_window);
                    // store in lds
                    store_tile(x_block_lds_write_window, dram_tile);
                    // Wait for all LDS write insts to complete, then sync.
                    block_sync_lds();
                    // read from lds to registers
                    auto lds_tile = load_tile(x_block_lds_read_window);
                    // store from registers to DRAM
                    store_tile(y_block_window, lds_tile);
                }
            }
            // Advance both global windows one column tile to the right.
            move_tile_window(x_block_window, {0, S::Block_N});
            move_tile_window(y_block_window, {0, S::Block_N});
        }
    }
};
} // namespace ck_tile