Merge commit '956fe8f75118de688b1ee9ca8619b2c1dbe35ea1' into develop

This commit is contained in:
github-actions[bot]
2025-05-07 07:07:28 +00:00
parent b96328e63f
commit 73914f82b3
8 changed files with 461 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
# Standalone copy-kernel example; EXCLUDE_FROM_ALL keeps it out of the default
# build, so it must be built explicitly (e.g. `make test_copy_kernel`).
add_executable(test_copy_kernel EXCLUDE_FROM_ALL test_copy.cpp)
# Forward a flag to LLVM that disables conversion of noalias attributes to
# metadata. NOTE(review): presumably a workaround for aliasing-related codegen
# issues in the kernel — confirm whether it is still required.
target_compile_options(test_copy_kernel PRIVATE
-mllvm -enable-noalias-to-md-conversion=0
)

View File

@@ -0,0 +1,31 @@
# Copy Kernel
This folder contains basic setup code designed to provide a platform for novice
CK_Tile kernel developers to test basic functionality with minimal additional
code compared to the functional code. Sample functional code for a simple
tile distribution for DRAM window and LDS window is provided, and data is moved
from DRAM to registers, registers to LDS, LDS to registers, and finally data
is moved to the output DRAM window for a simple copy operation.
## build
```
# in the root of ck_tile
mkdir build && cd build
# you can replace <arch> with the appropriate architecture
# (for example gfx90a or gfx942) or leave it blank
sh ../script/cmake-ck-dev.sh ../ <arch>
# Make the copy kernel executable
make test_copy_kernel -j
```
This will result in an executable `build/bin/test_copy_kernel`
## example
```
args:
-m input matrix rows. (default 64)
-n input matrix cols. (default 8)
-id warp to use for computation. (default 0)
-v validation flag to check device results. (default 1)
-prec datatype precision to use. (default fp16)
-warmup no. of warmup iterations. (default 50)
-repeat no. of iterations for kernel execution time. (default 100)
```

View File

@@ -0,0 +1,117 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <cstring>
#include <iostream>
#include <string>
#include <tuple>

#include "ck_tile/host.hpp"

#include "test_copy.hpp"
// Build the command-line parser for the copy-kernel test and parse argv.
// Returns a (parse-succeeded, parser) tuple.
auto create_args(int argc, char* argv[])
{
    ck_tile::ArgParser arg_parser;
    arg_parser.insert("m", "64", "m dimension");
    arg_parser.insert("n", "8", "n dimension");
    arg_parser.insert("id", "0", "warp to use");
    arg_parser.insert("v", "1", "cpu validation or not");
    arg_parser.insert("prec", "fp16", "precision");
    arg_parser.insert("warmup", "50", "cold iter");
    arg_parser.insert("repeat", "100", "hot iter");

    const bool parse_ok = arg_parser.parse(argc, argv);
    return std::make_tuple(parse_ok, arg_parser);
}
/// Runs the copy kernel once with the parsed arguments and (optionally)
/// validates the device output against the host input.
///
/// @tparam DataType element type used for both input and output tensors
/// @param arg_parser parsed command-line options (m, n, id, v, warmup, repeat)
/// @return true when validation is disabled or the device result matches
template <typename DataType>
bool run(const ck_tile::ArgParser& arg_parser)
{
    using XDataType = DataType;
    using YDataType = DataType;

    ck_tile::index_t m       = arg_parser.get_int("m");
    ck_tile::index_t n       = arg_parser.get_int("n");
    ck_tile::index_t warp_id = arg_parser.get_int("id");
    int do_validation        = arg_parser.get_int("v");
    int warmup               = arg_parser.get_int("warmup");
    int repeat               = arg_parser.get_int("repeat");

    ck_tile::HostTensor<XDataType> x_host({m, n});
    ck_tile::HostTensor<YDataType> y_host_dev({m, n});

    // Fill each row with 1, 2, ..., n so a mismatch is easy to localize.
    // Fixed: the fill value was hard-coded to ck_tile::half_t, defeating the
    // DataType template parameter; use the tensor's own element type instead.
    for(int i = 0; i < m; i++)
    {
        for(int j = 0; j < n; j++)
        {
            x_host(i, j) = static_cast<XDataType>(j + 1);
        }
    }

    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
    x_buf.ToDevice(x_host.data());

    using BlockWaves = ck_tile::sequence<2, 1>;
    using BlockTile  = ck_tile::sequence<64, 8>;
    using WaveTile   = ck_tile::sequence<64, 8>;
    using Vector     = ck_tile::sequence<1, 4>;

    // One workgroup per M-tile; the kernel walks the N tiles internally.
    // NOTE(review): assumes m is a multiple of the block tile M (64) — confirm.
    ck_tile::index_t kGridSize = (m / BlockTile::at(ck_tile::number<0>{}));
    std::cout << "grid size " << kGridSize << std::endl;

    using Shape   = ck_tile::TileCopyShape<BlockWaves, BlockTile, WaveTile, Vector>;
    using Problem = ck_tile::TileCopyProblem<XDataType, Shape>;
    using Kernel  = ck_tile::TileCopy<Problem>;

    constexpr ck_tile::index_t kBlockSize  = 128;
    constexpr ck_tile::index_t kBlockPerCu = 1;
    std::cout << "block size " << kBlockSize << std::endl;
    std::cout << "warp size " << ck_tile::get_warp_size() << std::endl;
    std::cout << "warps per block _M " << Shape::WarpPerBlock_M << " " << Shape::WarpPerBlock_N
              << std::endl;
    std::cout << "Block waves: " << BlockWaves::at(ck_tile::number<0>{}) << " "
              << BlockWaves::at(ck_tile::number<1>{}) << std::endl;
    std::cout << " Wave Groups: " << Shape::WaveGroups << std::endl;

    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
                                       Kernel{},
                                       kGridSize,
                                       kBlockSize,
                                       0,
                                       static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
                                       m,
                                       n,
                                       warp_id));

    // Bytes moved: the kernel reads the full m x n input and writes the full
    // m x n output. Fixed: the output term previously counted only m elements,
    // under-reporting the achieved bandwidth.
    std::size_t num_btype = sizeof(XDataType) * m * n + sizeof(YDataType) * m * n;
    float gb_per_sec      = num_btype / 1.E6 / ave_time; // bytes/ms -> GB/s
    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;

    bool pass = true;
    if(do_validation)
    {
        // The kernel is a straight copy, so the input itself is the reference.
        y_buf.FromDevice(y_host_dev.mData.data());
        pass = ck_tile::check_err(y_host_dev, x_host);
        std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl;
    }
    return pass;
}
/// Entry point: parses arguments and dispatches on the requested precision.
/// Fixed: "prec" was parsed but silently ignored — any value ran fp16. An
/// unsupported precision is now reported instead of being treated as fp16.
///
/// @return 0 on success, -1 on argument error, -2 on validation failure,
///         -3 on unsupported precision
int main(int argc, char* argv[])
{
    auto [result, arg_parser] = create_args(argc, argv);
    if(!result)
        return -1;

    const std::string data_type = arg_parser.get_str("prec");
    if(data_type == "fp16")
    {
        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
    }

    std::cerr << "unsupported precision: " << data_type << std::endl;
    return -3;
}

View File

@@ -0,0 +1,178 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/host/kernel_launch.hpp"
namespace ck_tile {
/// Compile-time tiling description for the copy kernel.
///
/// @tparam BlockWaves num warps along seq<M, N>
/// @tparam BlockTile  block size, seq<M, N>
/// @tparam WaveTile   warp size, seq<M, N>
/// @tparam Vector     contiguous elements (vector size) along seq<M, N>
template <typename BlockWaves,
          typename BlockTile,
          typename WaveTile,
          typename Vector>
struct TileCopyShape
{
    // We split Workgroup waves into two specialized groups.
    // One for reading data from global -> LDS, the other is doing reduction
    static constexpr index_t WaveGroups = 2;

    static constexpr index_t MWarps = BlockWaves::at(number<0>{});
    // Fixed: the N warp count must come from index 1 of BlockWaves; it
    // previously duplicated the M entry (index 0).
    static constexpr index_t NWarps = BlockWaves::at(number<1>{});

    static constexpr index_t Block_M = BlockTile::at(number<0>{});
    static constexpr index_t Block_N = BlockTile::at(number<1>{});
    static constexpr index_t Warp_M  = WaveTile::at(number<0>{});
    static constexpr index_t Warp_N  = WaveTile::at(number<1>{});

    static constexpr index_t Vector_M = Vector::at(number<0>{});
    static constexpr index_t Vector_N = Vector::at(number<1>{});

    // Lanes per warp along each dimension, given the vector width.
    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;

    // Warps per block within a single wave group.
    static constexpr index_t WarpPerBlock_M =
        integer_divide_ceil(BlockWaves::at(number<0>{}), WaveGroups);
    static constexpr index_t WarpPerBlock_N =
        integer_divide_ceil(BlockWaves::at(number<1>{}), WaveGroups);

    // Iterations a wave group needs to cover the whole block tile.
    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);

    static constexpr index_t WaveNum = reduce_on_sequence(BlockWaves{}, multiplies{}, number<1>{});
    static constexpr index_t BlockSize     = get_warp_size() * WaveNum;
    static constexpr index_t WaveGroupSize = WaveNum / WaveGroups;

    static_assert(WaveGroupSize == WarpPerBlock_M * WarpPerBlock_N,
                  "Inconsistent wave group size!");
};
/// Bundles the element type and the tiling shape that together define one
/// TileCopy kernel instantiation.
///
/// @tparam XDataType_  element type of the tensors being copied
/// @tparam BlockShape_ a TileCopyShape describing block/wave/vector tiling
template <typename XDataType_, typename BlockShape_>
struct TileCopyProblem
{
    using XDataType = remove_cvref_t<XDataType_>;
    using BlockShape = remove_cvref_t<BlockShape_>;
};
/// Simple copy kernel: the single warp selected by warp_id stages one block
/// tile of X at a time through registers and LDS, then writes it out to Y
/// unchanged. Data path per N-tile: global -> registers -> LDS -> registers
/// -> global.
template <typename Problem_>
struct TileCopy
{
    using Problem = ck_tile::remove_cvref_t<Problem_>;
    using XDataType = typename Problem::XDataType;

    /// Builds the static tile distribution shared by the global-memory window
    /// and the distributed LDS read window. X0 lanes cover the N dimension
    /// (each lane reading Vector_N contiguous elements); the remaining lanes
    /// and Y2 iterations cover the M dimension.
    template <typename Problem>
    CK_TILE_DEVICE static constexpr auto MakeDRAMDistribution()
    {
        using S = typename Problem::BlockShape;
        constexpr index_t warp_size = get_warp_size();
        constexpr index_t X0 = S::ThreadPerWarp_N; // threads needed along N dimension, fastest
                                                   // changing with given vector size.
        constexpr index_t X1 =
            S::Vector_N; // no. of elements along N dimensions to be read by each thread.
        constexpr index_t Y0 =
            S::WaveNum / S::WaveGroups; // no. of active warps working in this thread block.
        constexpr index_t Y1 = warp_size / X0; // no. of threads in a warp needed along M dimension.
        constexpr index_t Y2 =
            S::Warp_M /
            (Y1 *
             Y0); // no. of iterations each warp needs to perform to cover the entire tile window.
        constexpr auto outer_encoding =
            tile_distribution_encoding<sequence<Y0>,
                                       tuple<sequence<Y1, Y2>, sequence<X0, X1>>,
                                       tuple<sequence<0>, sequence<1, 2>>,
                                       tuple<sequence<0>, sequence<0, 0>>,
                                       sequence<1, 2>,
                                       sequence<1, 1>>{};
        return make_static_tile_distribution(outer_encoding);
    }

    /// Performs the copy. Only the warp whose id equals @p warp_id moves
    /// data; all other warps just participate in the barrier.
    ///
    /// @param p_x     input tensor (M x N, row-major) in global memory
    /// @param p_y     output tensor (M x N, row-major) in global memory
    /// @param M       number of rows
    /// @param N       number of columns
    /// @param warp_id the single warp that performs the copy
    CK_TILE_DEVICE void
    operator()(const XDataType* p_x, XDataType* p_y, index_t M, index_t N, index_t warp_id) const
    {
        using S = typename Problem::BlockShape;
        // LDS Data.
        // One block tile of staging storage in LDS.
        __shared__ XDataType x_lds[number<S::Block_M>{} * number<S::Block_N>{}];
        XDataType* __restrict__ p_x_lds = static_cast<XDataType*>(x_lds);
        // Describe LDS as (M, N/Vector_N, Vector_N), then merge the last two
        // dimensions back into N, yielding a 2-D view whose innermost access
        // is guaranteed to be Vector_N-wide.
        const auto x_lds_desc = make_naive_tensor_descriptor(
            make_tuple(number<S::Block_M>{}, number<S::Block_N>{}, number<S::Vector_N>{}),
            make_tuple(number<S::Block_N>{}, number<S::Vector_N>{}, 1),
            number<S::Vector_N>{},
            number<1>{});
        auto x_lds_block_desc = transform_tensor_descriptor(
            x_lds_desc,
            make_tuple(make_pass_through_transform(number<S::Block_M>{}),
                       make_merge_transform(
                           make_tuple(number<S::Block_N>{} / S::Vector_N, number<S::Vector_N>{}))),
            make_tuple(sequence<1>{}, sequence<0, 2>{}),
            make_tuple(sequence<0>{}, sequence<1>{}));
        auto x_lds_view = make_tensor_view<address_space_enum::lds>(p_x_lds, x_lds_block_desc);
        // Distributed LDS window: used to read LDS back into registers.
        auto x_block_lds_window =
            make_tile_window(x_lds_view,
                             make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
                             {0, 0},
                             MakeDRAMDistribution<Problem>());
        // Undistributed LDS window: destination for the register -> LDS store.
        auto x_block_lds_window_no_dist = make_tile_window(
            x_lds_view, make_tuple(number<S::Block_M>{}, number<S::Block_N>{}), {0, 0});
        // Input tensor
        // Each workgroup owns one Block_M-tall slice of the input.
        const auto iM = get_block_id() * S::Block_M;
        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
        auto x_block_window =
            make_tile_window(x_m_n,
                             make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
                             {iM, 0},
                             MakeDRAMDistribution<Problem>());
        // Output tensor
        const auto y_m = make_naive_tensor_view<address_space_enum::global>(
            p_y, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
        auto y_block_window =
            make_tile_window(y_m, make_tuple(number<S::Block_M>{}, number<S::Block_N>{}), {iM, 0});
        // Programming logic
        // Number of N-tiles this block walks to cover the full row width.
        index_t num_n_tile_iteration =
            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
        auto my_id = get_warp_id();
        auto DramTileDist = x_block_window.get_tile_distribution();
        using dram_reg_tile = decltype(make_static_distributed_tensor<XDataType>(DramTileDist));
        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
        {
            dram_reg_tile dram_tile;
            if(my_id == warp_id)
            {
                // load from DRAM to registers
                load_tile(dram_tile, x_block_window);
                // store in lds
                store_tile(x_block_lds_window_no_dist, dram_tile);
                // read from lds to registers
                load_tile(dram_tile, x_block_lds_window);
                // store from registers to DRAM
                store_tile(y_block_window, dram_tile);
            }
            // All warps rendezvous before the windows advance.
            // NOTE(review): there is no barrier between the LDS store and the
            // LDS load above — presumably safe because a single warp performs
            // both; confirm if more warps are ever enabled.
            __syncthreads();
            move_tile_window(x_block_window, {0, S::Block_N});
            move_tile_window(y_block_window, {0, S::Block_N});
        }
    }
};
} // namespace ck_tile

View File

@@ -19,3 +19,4 @@ add_subdirectory(16_batched_gemm)
add_subdirectory(17_grouped_gemm)
add_subdirectory(18_flatmm)
add_subdirectory(35_batched_transpose)
add_subdirectory(36_copy)

View File

@@ -1164,4 +1164,82 @@ CK_TILE_DEVICE void move_tile_window(
window.move(step);
}
/**
 * @brief Trait that reports whether a type is a `tile_window_with_static_distribution`.
 *
 * The primary template yields `false`; the partial specialization below flips
 * it to `true` for the matching window type.
 *
 * @tparam T Candidate type.
 */
template <typename T>
struct is_tile_window_with_static_distribution : std::bool_constant<false>
{
};

/**
 * @brief `true` case: any instantiation of `tile_window_with_static_distribution`.
 *
 * @tparam BottomTensorView_       Underlying tensor view of the window.
 * @tparam WindowLengths_          Static lengths of the window.
 * @tparam StaticTileDistribution_ Distribution applied to the tile.
 * @tparam NumCoord                Number of coordinate dimensions.
 */
template <typename BottomTensorView_,
          typename WindowLengths_,
          typename StaticTileDistribution_,
          index_t NumCoord>
struct is_tile_window_with_static_distribution<
    tile_window_with_static_distribution<BottomTensorView_,
                                         WindowLengths_,
                                         StaticTileDistribution_,
                                         NumCoord>> : std::bool_constant<true>
{
};

/**
 * @brief Convenience shorthand for `is_tile_window_with_static_distribution<T>::value`.
 *
 * @tparam T Candidate type.
 */
template <typename T>
inline constexpr bool is_tile_window_with_static_distribution_v =
    is_tile_window_with_static_distribution<T>::value;
/**
 * @brief Trait that reports whether a type is a `tile_window_with_static_lengths`.
 *
 * The primary template yields `false`; the partial specialization below flips
 * it to `true` for the matching window type.
 *
 * @tparam T Candidate type.
 */
template <typename T>
struct is_tile_window_with_static_lengths : std::bool_constant<false>
{
};

/**
 * @brief `true` case: any instantiation of `tile_window_with_static_lengths`.
 *
 * @tparam BottomTensorView_ Underlying tensor view of the window.
 * @tparam WindowLengths_    Static lengths of the window.
 */
template <typename BottomTensorView_, typename WindowLengths_>
struct is_tile_window_with_static_lengths<
    tile_window_with_static_lengths<BottomTensorView_, WindowLengths_>> : std::bool_constant<true>
{
};

/**
 * @brief Convenience shorthand for `is_tile_window_with_static_lengths<T>::value`.
 *
 * @tparam T Candidate type.
 */
template <typename T>
inline constexpr bool is_tile_window_with_static_lengths_v =
    is_tile_window_with_static_lengths<T>::value;
} // namespace ck_tile

View File

@@ -44,6 +44,7 @@ template <typename BottomTensorView_,
typename LinearBottomDims_>
struct tile_window_linear
{
using BottomTensorView = remove_reference_t<BottomTensorView_>;
using WindowLengths = remove_cvref_t<WindowLengths_>;
using TileDstr = remove_cvref_t<StaticTileDistribution_>;
@@ -1215,4 +1216,49 @@ CK_TILE_DEVICE void move_tile_window(
window.move(step);
}
/**
 * @brief Trait that reports whether a type is a `tile_window_linear`.
 *
 * The primary template yields `false`; the partial specialization below flips
 * it to `true` for the matching window type.
 *
 * @tparam T Candidate type.
 */
template <typename T>
struct is_tile_window_linear : std::bool_constant<false>
{
};

/**
 * @brief `true` case: any instantiation of `tile_window_linear`.
 *
 * @tparam BottomTensorView_       Underlying tensor view of the window.
 * @tparam WindowLengths_          Static lengths of the window.
 * @tparam StaticTileDistribution_ Distribution applied to the tile.
 * @tparam LinearBottomDims_       Bottom-view dimensions participating in
 *                                 linearization.
 */
template <typename BottomTensorView_,
          typename WindowLengths_,
          typename StaticTileDistribution_,
          typename LinearBottomDims_>
struct is_tile_window_linear<tile_window_linear<BottomTensorView_,
                                                WindowLengths_,
                                                StaticTileDistribution_,
                                                LinearBottomDims_>> : std::bool_constant<true>
{
};

/**
 * @brief Convenience shorthand for `is_tile_window_linear<T>::value`.
 *
 * @tparam T Candidate type.
 */
template <typename T>
inline constexpr bool is_tile_window_linear_v = is_tile_window_linear<T>::value;
} // namespace ck_tile

View File

@@ -337,6 +337,12 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
{0, 0},
BLdsTileDistr);
static_assert(
!(is_tile_window_linear_v<decltype(a_lds_ld_window0)>)&&!(is_tile_window_linear_v<decltype(a_lds_ld_window1)>)&&!(
is_tile_window_linear_v<
decltype(b_lds_ld_window0)>)&&!(is_tile_window_linear_v<decltype(b_lds_ld_window1)>),
"LDS windows must not be linear");
Base::LocalPrefetch(a_block_tile0, a_lds_ld_window0);
Base::LocalPrefetch(b_block_tile0, b_lds_ld_window0);