move test_copy into test

This commit is contained in:
aska-0096
2025-07-17 03:10:46 +00:00
parent 21627d7ca7
commit 804f77dce5
8 changed files with 137 additions and 129 deletions

View File

@@ -0,0 +1,3 @@
# Build the memory-copy gtest only when gfx950 is among the GPU targets.
# NOTE(review): presumably the kernel's async/direct-to-LDS load path needs
# gfx950 hardware support — confirm before widening this arch gate.
if(GPU_TARGETS MATCHES "gfx950")
add_gtest_executable(test_memory_copy test_copy.cpp)
endif()

View File

@@ -0,0 +1,31 @@
# Copy Kernel
This folder contains basic setup code that gives novice CK_Tile kernel
developers a platform for testing basic functionality with minimal additional
code. Sample code for a simple tile distribution over a DRAM window and an
LDS window is provided: data is moved from DRAM to registers, from registers
to LDS, from LDS back to registers, and finally to the output DRAM window,
implementing a simple copy operation.
## build
```
# in the root of ck_tile
mkdir build && cd build
# you can replace <arch> with the appropriate architecture
# (for example gfx90a or gfx942) or leave it blank
sh ../script/cmake-ck-dev.sh ../ <arch>
# Build the copy-kernel test executable (target name comes from CMake)
make test_memory_copy -j
```
This will result in an executable `build/bin/test_memory_copy`
## example
```
args:
-m input matrix rows. (default 64)
-n input matrix cols. (default 8)
-id warp to use for computation. (default 0)
-v validation flag to check device results. (default 1)
-prec datatype precision to use. (default fp16)
-warmup no. of warmup iterations. (default 50)
-repeat no. of iterations for kernel execution time. (default 100)
```

View File

@@ -0,0 +1,129 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <gtest/gtest.h>
#include "ck_tile/host.hpp"
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "test_copy.hpp"
struct MemoryCopyParam
{
MemoryCopyParam(ck_tile::index_t m_, ck_tile::index_t n_, ck_tile::index_t warp_id_)
: m(m_), n(n_), warp_id(warp_id_)
{
}
ck_tile::index_t m;
ck_tile::index_t n;
ck_tile::index_t warp_id;
};
// Typed gtest fixture: drives the ck_tile tile-copy kernel for one
// (m, n, warp_id) configuration and checks the device output against
// the host input element-wise.
template <typename DataType>
class TestCkTileMemoryCopy : public ::testing::Test
{
    protected:
    // Runs a single copy: initialize host data, launch the kernel,
    // read back the result and compare it with the input.
    void Run(const MemoryCopyParam& memcpy_params)
    {
        using XDataType = DataType;
        using YDataType = DataType;

        ck_tile::index_t m       = memcpy_params.m;
        ck_tile::index_t n       = memcpy_params.n;
        ck_tile::index_t warp_id = memcpy_params.warp_id;

        // The kernel moves whole dwords, so each row must contain a
        // dword-multiple number of elements for the given data type.
        constexpr auto dword_bytes = 4;
        if(n % (dword_bytes / sizeof(DataType)) != 0)
        {
            // Fix: previously this only printed the error and then launched
            // the kernel anyway; bail out instead of running an unsupported
            // shape.
            std::cerr << "n size should be multiple of dword_bytes" << std::endl;
            return;
        }

        ck_tile::HostTensor<XDataType> x_host({m, n});
        ck_tile::HostTensor<YDataType> y_host_dev({m, n});
        std::cout << "input: " << x_host.mDesc << std::endl;
        std::cout << "output: " << y_host_dev.mDesc << std::endl;

        // Fill each row with 1, 2, 3, ... converted to the tested data type.
        // Fix: the fill value was hard-coded to ck_tile::half_t, silently
        // converting for the bf16 instantiation of this fixture.
        for(int i = 0; i < m; i++)
        {
            float value = 1.f;
            for(int j = 0; j < n; j++)
            {
                x_host(i, j) = ck_tile::type_convert<XDataType>(value);
                value += 1.f;
            }
        }

        ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
        ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
        x_buf.ToDevice(x_host.data());

        // Tile configuration: 2x1 waves per block, 64x8 block/wave tiles,
        // 1x2 vector access.
        using BlockWaves = ck_tile::sequence<2, 1>;
        using BlockTile  = ck_tile::sequence<64, 8>;
        using WaveTile   = ck_tile::sequence<64, 8>;
        using Vector     = ck_tile::sequence<1, 2>;

        constexpr bool AsyncCopy = true;

        // One workgroup per Block_M rows (ceiling division covers ragged m).
        ck_tile::index_t kGridSize =
            ck_tile::integer_divide_ceil(m, BlockTile::at(ck_tile::number<0>{}));

        using Shape   = ck_tile::TileCopyShape<BlockWaves, BlockTile, WaveTile, Vector>;
        using Problem = ck_tile::TileCopyProblem<XDataType, Shape, AsyncCopy>;
        using Kernel  = ck_tile::TileCopy<Problem>;

        constexpr ck_tile::index_t kBlockSize  = 128;
        constexpr ck_tile::index_t kBlockPerCu = 1;

        launch_kernel(ck_tile::stream_config{},
                      ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
                          Kernel{},
                          kGridSize,
                          kBlockSize,
                          0,
                          static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
                          static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
                          m,
                          n,
                          warp_id));

        // The kernel is a pure copy, so the input itself is the reference.
        y_buf.FromDevice(y_host_dev.mData.data());
        bool pass = ck_tile::check_err(y_host_dev, x_host);
        EXPECT_TRUE(pass);
    }
};
// Concrete fixture instantiation for the fp16 (half_t) data type.
class TestCkTileMemoryCopyHalf : public TestCkTileMemoryCopy<ck_tile::half_t>
{
};
// Concrete fixture instantiation for the bf16 data type.
class TestCkTileMemoryCopyBFloat : public TestCkTileMemoryCopy<ck_tile::bf16_t>
{
};
// fp16 correctness sweep: exact-fit and ragged shapes, executed once with
// warp 0 and once with warp 1 performing the copy.
TEST_F(TestCkTileMemoryCopyHalf, TestCorrectness)
{
    for(const int warp : {0, 1})
    {
        this->Run({64, 8, warp});
        this->Run({63, 8, warp});
        this->Run({63, 2, warp});
        this->Run({127, 30, warp});
    }
}
// bf16 correctness sweep: exact-fit and ragged shapes, executed once with
// warp 0 and once with warp 1 performing the copy.
TEST_F(TestCkTileMemoryCopyBFloat, TestCorrectness)
{
    for(const int warp : {0, 1})
    {
        this->Run({64, 8, warp});
        this->Run({63, 8, warp});
        this->Run({63, 2, warp});
        this->Run({127, 30, warp});
    }
}

View File

@@ -0,0 +1,190 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/host/kernel_launch.hpp"
namespace ck_tile {
// Compile-time shape descriptor for the tile-copy kernel: derives per-warp
// and per-block thread/iteration counts from the block-wave layout, tile
// sizes and vector widths.
template <typename BlockWaves, // num warps along seq<M, N>
          typename BlockTile,  // block size, seq<M, N>
          typename WaveTile,   // warp size, seq<M, N>
          typename Vector>     // contiguous elements (vector size) along seq<M, N>
struct TileCopyShape
{
    // The workgroup's waves are split into two specialized groups:
    // one reads data from global -> LDS, the other is idled.
    static constexpr index_t WaveGroups = 2;

    static constexpr index_t MWarps = BlockWaves::at(number<0>{});
    static constexpr index_t NWarps = BlockWaves::at(number<1>{});

    static constexpr index_t Block_M = BlockTile::at(number<0>{});
    static constexpr index_t Block_N = BlockTile::at(number<1>{});
    static constexpr index_t Warp_M  = WaveTile::at(number<0>{});
    static constexpr index_t Warp_N  = WaveTile::at(number<1>{});

    static constexpr index_t Vector_M = Vector::at(number<0>{});
    static constexpr index_t Vector_N = Vector::at(number<1>{});

    // Threads needed per warp along each dimension, given the vector size.
    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;

    // The waves are split along the M dimension.
    static constexpr index_t WarpPerBlock_M = integer_divide_ceil(MWarps, WaveGroups);
    static constexpr index_t WarpPerBlock_N = NWarps;

    // Iterations each warp performs to cover the whole block tile.
    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);

    static constexpr index_t WaveNum = reduce_on_sequence(BlockWaves{}, multiplies{}, number<1>{});
    static constexpr index_t BlockSize = get_warp_size() * WaveNum;

    static constexpr index_t WaveGroupSize = WaveNum / WaveGroups;
    // Fix: assert message typo ("Inconsisten" -> "Inconsistent").
    static_assert(WaveGroupSize == WarpPerBlock_M * WarpPerBlock_N, "Inconsistent wave group size!");
};
// Bundles the compile-time parameters of the copy kernel: the element type,
// the tile shape (a TileCopyShape instantiation), and whether the async
// (direct global -> LDS) load path is selected.
template <typename XDataType_, typename BlockShape_, bool AsyncCopy_>
struct TileCopyProblem
{
    using XDataType  = remove_cvref_t<XDataType_>;
    using BlockShape = remove_cvref_t<BlockShape_>;
    static constexpr bool AsyncCopy = AsyncCopy_;
};
// Device-side tile-copy kernel: streams (Block_M x Block_N) tiles from the
// input tensor through LDS and back out to the output tensor, iterating over
// the N dimension. Only the wave whose id equals `warp_id` does the work;
// the remaining wave group idles (see TileCopyShape::WaveGroups).
template <typename Problem_>
struct TileCopy
{
    using Problem   = ck_tile::remove_cvref_t<Problem_>;
    using XDataType = typename Problem::XDataType;
    // Compile-time switch: async direct global->LDS loads vs. the
    // conventional global -> register -> LDS path.
    static constexpr bool AsyncCopy = Problem::AsyncCopy;

    // Builds the static tile distribution shared by the global-memory read
    // window and the LDS read window.
    // NOTE(review): this template parameter shadows the class's `Problem`
    // alias — it works, but renaming it would avoid confusion.
    template <typename Problem>
    CK_TILE_DEVICE static constexpr auto MakeDRAMDistribution()
    {
        using S = typename Problem::BlockShape;
        constexpr index_t warp_size = get_warp_size();
        constexpr index_t X0 = S::ThreadPerWarp_N; // threads needed along N dimension, fastest
                                                   // changing with given vector size.
        constexpr index_t X1 =
            S::Vector_N; // no. of elements along N dimensions to be read by each thread.
        constexpr index_t Y0 =
            S::WaveNum / S::WaveGroups; // no. of active warps working in this thread block.
        constexpr index_t Y2 = warp_size / X0; // no. of threads in a warp needed along M dimension.
        constexpr index_t Y1 =
            S::Warp_M /
            Y2; // no. of iterations each warp needs to perform to cover the entire tile window.
        constexpr auto outer_encoding =
            tile_distribution_encoding<sequence<S::WaveGroups>,
                                       tuple<sequence<Y0, Y1, Y2>, sequence<X0, X1>>,
                                       tuple<sequence<0, 1>, sequence<1, 2>>,
                                       tuple<sequence<0, 0>, sequence<2, 0>>,
                                       sequence<1, 2>,
                                       sequence<1, 1>>{};
        return make_static_tile_distribution(outer_encoding);
    }

    // p_x/p_y: input/output tensors of shape (M, N), row-major.
    // warp_id: id of the single warp that performs the copy.
    CK_TILE_DEVICE void
    operator()(const XDataType* p_x, XDataType* p_y, index_t M, index_t N, index_t warp_id) const
    {
        using S = typename Problem::BlockShape;

        // LDS staging buffer for one block tile.
        __shared__ XDataType x_lds[number<S::Block_M>{} * number<S::Block_N>{}];
        XDataType* __restrict__ p_x_lds = static_cast<XDataType*>(x_lds);
        const auto x_lds_desc =
            make_naive_tensor_descriptor(make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
                                         make_tuple(number<S::Block_N>{}, 1),
                                         number<S::Vector_N>{},
                                         number<1>{});
        auto x_lds_view = make_tensor_view<address_space_enum::lds>(p_x_lds, x_lds_desc);
        // Write window covers the whole LDS tile; the read window additionally
        // carries the thread distribution used for register loads.
        auto x_block_lds_write_window = make_tile_window(
            x_lds_view, make_tuple(number<S::Block_M>{}, number<S::Block_N>{}), {0, 0});
        auto x_block_lds_read_window =
            make_tile_window(x_lds_view,
                             make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
                             {0, 0},
                             MakeDRAMDistribution<Problem>());

        // Input tensor: each workgroup starts at its own Block_M row offset.
        const auto iM = get_block_id() * S::Block_M;
        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
        auto x_block_window =
            make_tile_window(x_m_n,
                             make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
                             {iM, 0},
                             MakeDRAMDistribution<Problem>());

        // We don't have prefetch here, wait the data back immediately
        // (fence count 0 == wait for all outstanding async loads).
        constexpr auto async_copy_fence_cnt = 0;

        // Output tensor, same shape/strides as the input.
        const auto y_m = make_naive_tensor_view<address_space_enum::global>(
            p_y, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
        auto y_block_window =
            make_tile_window(y_m, make_tuple(number<S::Block_M>{}, number<S::Block_N>{}), {iM, 0});

        // Programming logic: walk the N dimension one block tile at a time.
        index_t num_n_tile_iteration =
            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
        auto my_id = get_warp_id();
        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
        {
            // Only the selected warp copies; the other wave group idles.
            // NOTE(review): block_sync_lds* look like workgroup-wide barriers
            // but are executed only inside this warp-gated branch — confirm
            // the idle wave group need not participate on the target arch.
            if(my_id == warp_id)
            {
                if constexpr(AsyncCopy)
                {
                    // Direct global -> LDS load, no register round-trip.
                    async_load_tile(x_block_lds_write_window, x_block_window);
                    // Wait all asyncload insts complete.
                    // Wait all waves synced
                    block_sync_lds_direct_load<async_copy_fence_cnt>();
                    auto lds_tile = load_tile(x_block_lds_read_window);
                    // store from registers to DRAM
                    store_tile(y_block_window, lds_tile);
                }
                else
                {
                    // load from DRAM to registers
                    auto dram_tile = load_tile(x_block_window);
                    // store in lds
                    store_tile(x_block_lds_write_window, dram_tile);
                    // Wait all lds write insts complete
                    // Wait all waves synced
                    block_sync_lds();
                    // read from lds to registers
                    auto lds_tile = load_tile(x_block_lds_read_window);
                    // store from registers to DRAM
                    store_tile(y_block_window, lds_tile);
                }
            }
            // Advance both windows to the next N tile.
            move_tile_window(x_block_window, {0, S::Block_N});
            move_tile_window(y_block_window, {0, S::Block_N});
        }
    }
};
} // namespace ck_tile