fix async copytest bug (#2509)

* fix async copytest bug

* Add block_sync_lds_direct_load utility

* fix the s_waitcnt_imm calculation

* Improve s_waitcnt_imm calculation

* fix vmcnt shift

* add input validation and bug fix

* remove unnecessary output

* move test_copy into test

* change bit width check

* refactor macros into constexpr functions

which still get inlined

* wrap s_waitcnt api

* parameterize test

* cleanup

* cleanup fp8 stub

* add fp8 test cases; todo which input parameters are valid?

* replace n for fp8 in test cases

* add large shapes; fp8 fails again

* change input init

* test sync/async

* time the test

* clang-format test

* use float instead of bfloat to cover a 4-byte type

* fix logic - arg sections should be 'or'd

* make block_sync_lds_direct_load interface similar to old ck

* fix a few comment typos

* name common shapes

* revert the example to original logic of not waiting lds

* clang-format

---------

Co-authored-by: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Co-authored-by: Thomas Ning <Thomas.Ning@amd.com>
This commit is contained in:
Haocong WANG
2025-07-23 15:14:02 +08:00
committed by GitHub
parent e62710e461
commit a5fdc663c8
9 changed files with 313 additions and 191 deletions

View File

@@ -0,0 +1,3 @@
# The async global->LDS direct-load path exercised by this test is only
# available on gfx950, so the test is built only when that target is requested.
if(GPU_TARGETS MATCHES "gfx950")
add_gtest_executable(test_memory_copy test_copy.cpp)
endif()

View File

@@ -0,0 +1,31 @@
# Copy Kernel
This folder contains basic setup code designed to provide a platform for novice
CK_Tile kernel developers to test basic functionality with minimal additional
code compared to the functional code. Sample code implementing a simple
tile distribution for a DRAM window and an LDS window is provided: data moves
from DRAM to registers, registers to LDS, LDS back to registers, and finally
to an output DRAM window, performing a simple copy operation.
## build
```
# in the root of ck_tile
mkdir build && cd build
# you can replace <arch> with the appropriate architecture or leave it blank
# (note: this test is only built when GPU_TARGETS includes gfx950)
sh ../script/cmake-ck-dev.sh ../ <arch>
# Make the copy kernel executable
make test_memory_copy -j
```
This will result in an executable `build/bin/test_memory_copy`
## example
```
args:
-m input matrix rows. (default 64)
-n input matrix cols. (default 8)
-id warp to use for computation. (default 0)
-v validation flag to check device results. (default 1)
-prec datatype precision to use. (default fp16)
-warmup no. of warmup iterations. (default 50)
-repeat no. of iterations for kernel execution time. (default 100)
```

View File

@@ -0,0 +1,193 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <gtest/gtest.h>
#include "ck_tile/host.hpp"
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "test_copy.hpp"
struct MemoryCopyParam
{
MemoryCopyParam(ck_tile::index_t m_, ck_tile::index_t n_, ck_tile::index_t warp_id_)
: m(m_), n(n_), warp_id(warp_id_)
{
}
ck_tile::index_t m;
ck_tile::index_t n;
ck_tile::index_t warp_id;
};
// Fixture that round-trips an (m x n) matrix through the TileCopy kernel
// (DRAM -> LDS -> DRAM) and validates the device output against the host
// input. DataType selects the element type; AsyncCopy selects the async
// global->LDS direct-load path (true) or the classic load/store path (false).
template <typename DataType, bool AsyncCopy = true>
class TestCkTileMemoryCopy : public ::testing::TestWithParam<std::tuple<int, int, int>>
{
    protected:
    // Runs one copy on the device and checks the output matches the input.
    void Run(const MemoryCopyParam& memcpy_params)
    {
        using XDataType = DataType;
        using YDataType = DataType;

        ck_tile::index_t m       = memcpy_params.m;
        ck_tile::index_t n       = memcpy_params.n;
        ck_tile::index_t warp_id = memcpy_params.warp_id;

        // The kernel moves whole dwords, so n must hold a whole number of
        // 4-byte vectors of DataType.
        constexpr auto dword_bytes = 4;
        if(n % (dword_bytes / sizeof(DataType)) != 0)
        {
            // Fix: previously only an error message was printed and the
            // kernel was launched anyway with a misaligned n; skip instead.
            GTEST_SKIP() << "n must be a multiple of " << (dword_bytes / sizeof(DataType))
                         << " elements (one dword) for this data type";
        }

        ck_tile::HostTensor<XDataType> x_host({m, n});
        ck_tile::HostTensor<YDataType> y_host_dev({m, n});
        std::cout << "input: " << x_host.mDesc << std::endl;
        std::cout << "output: " << y_host_dev.mDesc << std::endl;

        // Deterministic per-row pattern; values stay within [0, 126].
        ck_tile::index_t value = 1;
        for(int i = 0; i < m; i++)
        {
            value = 1;
            for(int j = 0; j < n; j++)
            {
                value        = (value + 1) % 127;
                x_host(i, j) = static_cast<DataType>(value);
            }
        }

        ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
        ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
        x_buf.ToDevice(x_host.data());

        using BlockWaves = ck_tile::sequence<2, 1>;
        using BlockTile  = ck_tile::sequence<64, 8>;
        using WaveTile   = ck_tile::sequence<64, 8>;
        using Vector     = ck_tile::sequence<1, dword_bytes / sizeof(DataType)>;

        // One workgroup per Block_M rows; each workgroup iterates over N.
        ck_tile::index_t kGridSize =
            ck_tile::integer_divide_ceil(m, BlockTile::at(ck_tile::number<0>{}));

        using Shape   = ck_tile::TileCopyShape<BlockWaves, BlockTile, WaveTile, Vector>;
        using Problem = ck_tile::TileCopyProblem<XDataType, Shape, AsyncCopy>;
        using Kernel  = ck_tile::TileCopy<Problem>;

        constexpr ck_tile::index_t kBlockSize  = 128;
        constexpr ck_tile::index_t kBlockPerCu = 1;
        auto ms = launch_kernel(ck_tile::stream_config{nullptr, true},
                                ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
                                    Kernel{},
                                    kGridSize,
                                    kBlockSize,
                                    0,
                                    static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
                                    static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
                                    m,
                                    n,
                                    warp_id));

        // Each element is read once from global memory and written once back,
        // hence the factor of 2. Use 64-bit math: 2*m*n would be computed in
        // 32-bit index_t first and risks overflow for large shapes.
        auto bytes = 2ull * m * n * sizeof(DataType);
        std::cout << "elapsed: " << ms << " (ms)" << std::endl;
        std::cout << (bytes * 1e-6 / ms) << " (GB/s)" << std::endl;

        // Validate: output must be bit-identical to the untouched input.
        y_buf.FromDevice(y_host_dev.mData.data());
        bool pass = ck_tile::check_err(y_host_dev, x_host);
        EXPECT_TRUE(pass);
    }
};
// Concrete fixture types — gtest requires a distinct named class per suite.
// fp16 (2-byte element) through the async global->LDS direct-load path.
class TestCkTileMemoryCopyHalfAsync : public TestCkTileMemoryCopy<ck_tile::half_t>
{
};
// fp16 through the synchronous load/store-via-registers path.
class TestCkTileMemoryCopyHalfSync : public TestCkTileMemoryCopy<ck_tile::half_t, false>
{
};
// fp32 (4-byte element) through the async path.
class TestCkTileMemoryCopyFloatAsync : public TestCkTileMemoryCopy<float>
{
};
// fp8 (1-byte element) through the async path.
class TestCkTileMemoryCopyFP8Async : public TestCkTileMemoryCopy<ck_tile::fp8_t>
{
};
// Each TEST_P unpacks the (M, N, warp_id) tuple and forwards it to Run().
TEST_P(TestCkTileMemoryCopyHalfAsync, TestCorrectness)
{
    const auto& [M, N, warp_id] = GetParam();
    Run(MemoryCopyParam{M, N, warp_id});
}
TEST_P(TestCkTileMemoryCopyHalfSync, TestCorrectness)
{
    const auto& [M, N, warp_id] = GetParam();
    Run(MemoryCopyParam{M, N, warp_id});
}
TEST_P(TestCkTileMemoryCopyFloatAsync, TestCorrectness)
{
    const auto& [M, N, warp_id] = GetParam();
    Run(MemoryCopyParam{M, N, warp_id});
}
TEST_P(TestCkTileMemoryCopyFP8Async, TestCorrectness)
{
    const auto& [M, N, warp_id] = GetParam();
    Run(MemoryCopyParam{M, N, warp_id});
}
INSTANTIATE_TEST_SUITE_P(TestCkTileMemCopySuite,
TestCkTileMemoryCopyHalfAsync,
::testing::Values(std::tuple{64, 8, 0},
std::tuple{63, 8, 0},
std::tuple{63, 2, 0},
std::tuple{127, 30, 0},
std::tuple{64, 8, 1},
std::tuple{63, 8, 1},
std::tuple{63, 2, 1},
std::tuple{127, 30, 1},
std::tuple{16384, 16384, 0},
std::tuple{16384, 16384, 1}));
INSTANTIATE_TEST_SUITE_P(TestCkTileMemCopySuite,
TestCkTileMemoryCopyHalfSync,
::testing::Values(std::tuple{64, 8, 0},
std::tuple{63, 8, 0},
std::tuple{63, 2, 0},
std::tuple{127, 30, 0},
std::tuple{64, 8, 1},
std::tuple{63, 8, 1},
std::tuple{63, 2, 1},
std::tuple{127, 30, 1},
std::tuple{16384, 16384, 0},
std::tuple{16384, 16384, 1}));
INSTANTIATE_TEST_SUITE_P(TestCkTileMemCopySuite,
TestCkTileMemoryCopyFloatAsync,
::testing::Values(std::tuple{64, 8, 0},
std::tuple{63, 8, 0},
std::tuple{63, 2, 0},
std::tuple{127, 30, 0},
std::tuple{64, 8, 1},
std::tuple{63, 8, 1},
std::tuple{63, 2, 1},
std::tuple{127, 30, 1},
std::tuple{16384, 16384, 0},
std::tuple{16384, 16384, 1}));
INSTANTIATE_TEST_SUITE_P(TestCkTileMemCopySuite,
TestCkTileMemoryCopyFP8Async,
::testing::Values(std::tuple{64, 8, 0},
std::tuple{63, 8, 0},
std::tuple{63, 4, 0},
std::tuple{127, 20, 0},
std::tuple{64, 8, 1},
std::tuple{63, 8, 1},
std::tuple{63, 4, 1},
std::tuple{127, 20, 1},
std::tuple{16384, 16384, 0},
std::tuple{16384, 16384, 1}));

View File

@@ -0,0 +1,173 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/host/kernel_launch.hpp"
namespace ck_tile {
template <typename BlockWaves, // num warps along seq<M, N>
typename BlockTile, // block size, seq<M, N>
typename WaveTile, // warp size, seq<M, N>
typename Vector> // contiguous elements (vector size) along seq<M, N>
struct TileCopyShape
{
// We split Workgroup waves into two specialized groups.
// One for reading data from global -> LDS, the other idling
static constexpr index_t WaveGroups = 2;
static constexpr index_t MWarps = BlockWaves::at(number<0>{});
static constexpr index_t NWarps = BlockWaves::at(number<1>{});
static constexpr index_t Block_M = BlockTile::at(number<0>{});
static constexpr index_t Block_N = BlockTile::at(number<1>{});
static constexpr index_t Warp_M = WaveTile::at(number<0>{});
static constexpr index_t Warp_N = WaveTile::at(number<1>{});
static constexpr index_t Vector_M = Vector::at(number<0>{});
static constexpr index_t Vector_N = Vector::at(number<1>{});
static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
// We splitted the waves on M dimension
static constexpr index_t WarpPerBlock_M = integer_divide_ceil(MWarps, WaveGroups);
static constexpr index_t WarpPerBlock_N = NWarps;
static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
static constexpr index_t WaveNum = reduce_on_sequence(BlockWaves{}, multiplies{}, number<1>{});
static constexpr index_t BlockSize = get_warp_size() * WaveNum;
static constexpr index_t WaveGroupSize = WaveNum / WaveGroups;
static_assert(WaveGroupSize == WarpPerBlock_M * WarpPerBlock_N,
"Inconsistent wave group size!");
};
// Aggregates the compile-time configuration consumed by TileCopy:
// the element type, the tile shape, and which copy path to use.
template <typename XDataType_, typename BlockShape_, bool AsyncCopy_>
struct TileCopyProblem
{
    using XDataType  = remove_cvref_t<XDataType_>;
    using BlockShape = remove_cvref_t<BlockShape_>;
    // true: async global -> LDS direct load; false: stage through registers.
    static constexpr bool AsyncCopy = AsyncCopy_;
};
// Block-scope copy kernel: streams an (M x N) row-major matrix from global
// memory (p_x) to global memory (p_y), staging each (Block_M x Block_N) tile
// through LDS. Only the wave whose hardware warp id matches the runtime
// `warp_id` argument performs the copy; the other wave group idles
// (see TileCopyShape::WaveGroups).
template <typename Problem_>
struct TileCopy
{
    using Problem   = ck_tile::remove_cvref_t<Problem_>;
    using XDataType = typename Problem::XDataType;
    // Selects the async (global -> LDS direct load) branch vs. the classic
    // load-to-register / store-to-LDS branch in operator().
    static constexpr bool AsyncCopy = Problem::AsyncCopy;

    // Builds the thread <-> data mapping shared by the global-memory read
    // window and the LDS read window.
    template <typename Problem>
    CK_TILE_DEVICE static constexpr auto MakeDRAMDistribution()
    {
        using S                     = typename Problem::BlockShape;
        constexpr index_t warp_size = get_warp_size();
        constexpr index_t X0 = S::ThreadPerWarp_N; // threads needed along N dimension, fastest
                                                   // changing with given vector size.
        constexpr index_t X1 =
            S::Vector_N; // no. of elements along the N dimension read by each thread.
        constexpr index_t Y0 =
            S::WaveNum / S::WaveGroups; // number of active warps working in this thread block.
        constexpr index_t Y2 =
            warp_size / X0; // number of threads in a warp needed along the M dimension.
        constexpr index_t Y1 =
            S::Warp_M /
            Y2; // number of iterations each warp needs to perform to cover the entire tile window.
        constexpr auto outer_encoding =
            tile_distribution_encoding<sequence<S::WaveGroups>,
                                       tuple<sequence<Y0, Y1, Y2>, sequence<X0, X1>>,
                                       tuple<sequence<0, 1>, sequence<1, 2>>,
                                       tuple<sequence<0, 0>, sequence<2, 0>>,
                                       sequence<1, 2>,
                                       sequence<1, 1>>{};
        return make_static_tile_distribution(outer_encoding);
    }

    // p_x / p_y: input / output matrices of shape (M x N), row-major.
    // warp_id: id of the wave that should perform the copy.
    CK_TILE_DEVICE void
    operator()(const XDataType* p_x, XDataType* p_y, index_t M, index_t N, index_t warp_id) const
    {
        using S = typename Problem::BlockShape;

        // LDS staging buffer holding one (Block_M x Block_N) tile.
        __shared__ XDataType x_lds[S::Block_M * S::Block_N];
        constexpr auto block_dims    = make_tuple(number<S::Block_M>{}, number<S::Block_N>{});
        constexpr auto block_strides = make_tuple(number<S::Block_N>{}, number<1>{});
        const auto x_lds_desc        = make_naive_tensor_descriptor(
            block_dims, block_strides, number<S::Vector_N>{}, number<1>{});
        auto x_lds_view = make_tensor_view<address_space_enum::lds>(x_lds, x_lds_desc);
        // Two windows over the same LDS buffer: one written from global
        // memory, one read back using the DRAM-style distribution.
        auto x_block_lds_write_window = make_tile_window(x_lds_view, block_dims, {0, 0});
        auto x_block_lds_read_window =
            make_tile_window(x_lds_view, block_dims, {0, 0}, MakeDRAMDistribution<Problem>());

        // Each workgroup owns Block_M consecutive rows.
        const index_t iM = __builtin_amdgcn_readfirstlane(get_block_id() * S::Block_M);

        // Input tensor
        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
        auto x_block_window =
            make_tile_window(x_m_n, block_dims, {iM, 0}, MakeDRAMDistribution<Problem>());

        // Output tensor
        const auto y_m = make_naive_tensor_view<address_space_enum::global>(
            p_y, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
        auto y_block_window = make_tile_window(y_m, block_dims, {iM, 0});

        // Number of Block_N-wide column tiles needed to cover N.
        const index_t num_n_tile_iteration =
            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
        const index_t my_id = __builtin_amdgcn_readfirstlane(get_warp_id());

        // 0 => wait until every outstanding async copy has completed.
        constexpr index_t async_copy_fence_cnt = 0;
        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
        {
            // Only the selected wave moves data; the other wave group idles.
            if(my_id == warp_id)
            {
                if constexpr(AsyncCopy)
                {
                    // Direct global -> LDS copy, no round trip through VGPRs.
                    async_load_tile(x_block_lds_write_window, x_block_window);
                    // We don't have prefetch here, wait for the data back
                    // immediately: wait for all async-load insts to complete,
                    // then sync the waves.
                    // NOTE(review): this sync is executed only by the selected
                    // wave while the rest of the block idles — confirm that
                    // s_waitcnt_barrier is safe under that divergence.
                    s_waitcnt_barrier<async_copy_fence_cnt>();
                    auto lds_tile = load_tile(x_block_lds_read_window);
                    // store from registers to DRAM
                    store_tile(y_block_window, lds_tile);
                }
                else
                {
                    // load from DRAM to registers
                    auto dram_tile = load_tile(x_block_window);
                    // store in lds
                    store_tile(x_block_lds_write_window, dram_tile);
                    // Wait for all LDS write insts to complete, then sync.
                    block_sync_lds();
                    // read from lds to registers
                    auto lds_tile = load_tile(x_block_lds_read_window);
                    // store from registers to DRAM
                    store_tile(y_block_window, lds_tile);
                }
            }
            // Advance both global windows one column tile to the right.
            move_tile_window(x_block_window, {0, S::Block_N});
            move_tile_window(y_block_window, {0, S::Block_N});
        }
    }
};
} // namespace ck_tile