General 2D Reduction Kernel

* Move the reduction kernel from the example
* Split the code and add the necessary policy, problem, shape files as
per ck_tile convention
* Add/modify the headers
* Modified the example to work with the 'new' kernel
* Added tests for the kernel
* N-D refernce reduce
* Added support for N-D input with transform to 2D
* Added padding to support various input sized tensors
* Bug fix in the thread buffer constructor
* Some comments to explain the reduce2d block kernel
This commit is contained in:
yashagar
2025-07-07 08:04:33 +00:00
committed by Yashvardhan Agarwal
parent c9886109b4
commit f515d29036
13 changed files with 521 additions and 84 deletions

View File

@@ -1,16 +1,17 @@
#include "ck_tile/host.hpp"
#include "reduce.hpp"
#include "ck_tile/ops/reduce.hpp"
#include <cstring>
auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
arg_parser.insert("m", "3328", "m dimension")
.insert("n", "4096", "n dimension")
arg_parser.insert("m", "2048", "m dimension")
.insert("n", "1024", "n dimension")
.insert("k", "2", "k dimension")
.insert("v", "1", "cpu validation or not")
.insert("prec", "fp16", "precision")
.insert("warmup", "5", "cold iter")
.insert("repeat", "20", "hot iter");
.insert("warmup", "0", "cold iter")
.insert("repeat", "1", "hot iter");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
@@ -25,13 +26,25 @@ bool run(const ck_tile::ArgParser& arg_parser)
ck_tile::index_t m = arg_parser.get_int("m");
ck_tile::index_t n = arg_parser.get_int("n");
ck_tile::index_t k = arg_parser.get_int("k");
int do_validation = arg_parser.get_int("v");
int warmup = arg_parser.get_int("warmup");
int repeat = arg_parser.get_int("repeat");
ck_tile::HostTensor<XDataType> x_host({m, n});
ck_tile::HostTensor<YDataType> y_host_ref({m});
ck_tile::HostTensor<YDataType> y_host_dev({m});
std::vector<ck_tile::index_t> problem_shape = {m, n, k};
std::vector<ck_tile::index_t> strides(3);
strides[0] = n * k;
strides[1] = k;
strides[2] = 1;
// Define reduction specification:
// dimension 0 is kept, dimensions 1,2 are reduced
constexpr auto kept_dim = ck_tile::sequence<0>{}; // Which dimension to keep (pass-through)
constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; // Which dimensions to reduce (merge)
ck_tile::HostTensor<XDataType> x_host(problem_shape, strides);
ck_tile::HostTensor<YDataType> y_host_ref({problem_shape[kept_dim.at(0)]}, {1});
ck_tile::HostTensor<YDataType> y_host_dev({problem_shape[kept_dim.at(0)]}, {1});
ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host);
@@ -54,7 +67,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
constexpr ck_tile::index_t kBlockSize = 256;
constexpr ck_tile::index_t kBlockPerCu = 1;
ck_tile::index_t kGridSize = (m / BlockTile::at(ck_tile::number<0>{}));
ck_tile::index_t kGridSize =
(problem_shape[kept_dim.at(0)] + BlockTile::at(ck_tile::number<0>{}) - 1) /
BlockTile::at(ck_tile::number<0>{});
std::cout << "grid size " << kGridSize << std::endl;
using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, Vector>;
@@ -63,6 +78,15 @@ bool run(const ck_tile::ArgParser& arg_parser)
using Kernel = ck_tile::Reduce<Porblem>;
// Create input tensor shape and strides
auto input_shape = ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2]);
auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2]);
if(!Kernel::IsSupportedArgument(arg_parser))
{
throw std::runtime_error("Wrong! Arguments not supported!\n");
}
float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
Kernel{},
@@ -71,10 +95,13 @@ bool run(const ck_tile::ArgParser& arg_parser)
0,
static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
m,
n));
input_shape,
input_strides,
kept_dim,
reduce_dims));
std::size_t num_btype = sizeof(XDataType) * m * n + sizeof(YDataType) * m;
std::size_t num_btype =
sizeof(XDataType) * m * n * k + sizeof(YDataType) * problem_shape[kept_dim.at(0)];
float gb_per_sec = num_btype / 1.E6 / ave_time;
@@ -86,7 +113,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
{
// reference
ck_tile::reference_reduce<XDataType, ComputeDataType, YDataType>(
x_host, y_host_ref, ReduceOp{});
x_host, y_host_ref, ReduceOp{}, kept_dim, reduce_dims);
y_buf.FromDevice(y_host_dev.mData.data());
pass = ck_tile::check_err(y_host_dev, y_host_ref);

View File

@@ -42,7 +42,11 @@ struct thread_buffer {
// TODO: this ctor can't ignore
CK_TILE_HOST_DEVICE constexpr thread_buffer() : data{} {}
CK_TILE_HOST_DEVICE constexpr thread_buffer(const value_type & o) : data{o} {}
CK_TILE_HOST_DEVICE constexpr thread_buffer(const value_type & o) : data{} {
static_for<0, N, 1>{}(
[&](auto i) { data[i] = o; }
);
}
CK_TILE_HOST_DEVICE static constexpr auto size() { return N; }
CK_TILE_HOST_DEVICE auto & get() {return data; }

View File

@@ -34,6 +34,8 @@ struct Add
return type_convert<T>(y_ + x_);
}
static constexpr bool requires_special_combine = false;
};
struct SquareAdd
@@ -51,6 +53,18 @@ struct SquareAdd
{
return y + (x * x);
}
// For combining partial results
template <typename T,
typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
CK_TILE_HOST_DEVICE constexpr T combine_partial_results(const T& partial1,
const T& partial2) const
{
return partial1 + partial2; // Just add the partial sums, don't square again
}
static constexpr bool requires_special_combine = true;
};
struct Max
@@ -70,6 +84,8 @@ struct Max
{
return max(y, x);
}
static constexpr bool requires_special_combine = false;
};
struct AbsMax
@@ -89,6 +105,8 @@ struct AbsMax
{
return max(y, abs(x));
}
static constexpr bool requires_special_combine = false;
};
} // namespace ReduceOp

View File

@@ -30,4 +30,60 @@ reference_reduce(const HostTensor<XDataType>& x_m_n, HostTensor<YDataType>& y_m,
make_ParallelTensorFunctor(f, y_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
}
// Generic reference reduce for arbitrary dimensions
template <typename XDataType,
typename ComputeDataType,
typename YDataType,
typename ReduceOp,
typename KeptDim,
typename ReduceDims>
CK_TILE_HOST void reference_reduce(const HostTensor<XDataType>& x_tensor,
HostTensor<YDataType>& y_tensor,
ReduceOp reduce_op,
KeptDim kept_dim,
ReduceDims reduce_dims)
{
const auto& x_lengths = x_tensor.mDesc.get_lengths();
const auto kept_len = x_lengths[kept_dim.at(0)];
// Calculate total reduce elements
index_t total_reduce_elements = 1;
static_for<0, reduce_dims.size(), 1>{}(
[&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });
auto f = [&](auto kept_idx) {
ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();
for(index_t reduce_idx = 0; reduce_idx < total_reduce_elements; ++reduce_idx)
{
// Convert linear index to multi-dimensional indices
std::vector<index_t> indices(x_lengths.size(), 0);
indices[kept_dim.at(0)] = kept_idx;
index_t temp = reduce_idx;
static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
constexpr auto dim = reduce_dims.at(reduce_dims.size() - 1 - i);
const auto len = x_lengths[dim];
indices[dim] = temp % len;
temp /= len;
});
// Flat tensor access
index_t flat_idx = 0;
const auto& strides = x_tensor.mDesc.get_strides();
for(size_t d = 0; d < indices.size(); ++d)
{
flat_idx += indices[d] * strides[d];
}
const auto v_a = type_convert<ComputeDataType>(x_tensor.mData[flat_idx]);
v_acc = reduce_op(v_acc, v_a);
}
y_tensor(kept_idx) = type_convert<YDataType>(v_acc);
};
make_ParallelTensorFunctor(f, kept_len)(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -5,8 +5,11 @@
#include "ck_tile/ops/reduce/block/block_reduce.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
#include "ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp"
#include "ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp"
#include "ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp"
#include "ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp"

View File

@@ -7,20 +7,55 @@
namespace ck_tile {
// BlockReduce2d implements a hierarchical 2D reduction operator that reduces data along the second
// dimension using a user-specified reduction function.
//
// The reduction is performed in a three-stage hierarchical approach:
//
// STAGE 1: Thread-level reduction (BlockReduce2d)
// ===============================================
// - Each thread processes multiple elements from the input tensor within its assigned data
// partition
// - Reduction is performed locally within each thread by iterating over assigned elements
// - ReducePacksPerXDim controls how many elements sweep_tile processes in one iteration per
// dimension
// (e.g., {1,1} = 1 element at a time from each dimension, {2,4} = 2 from dim0, 4 from dim1)
// - Results are accumulated into a thread-local output tensor stored in registers
// - The output tensor distribution is derived from the input tensor's distribution using
// make_reduce_tile_distribution_encoding() to handle dimension reduction
//
// STAGE 2: Warp-level reduction (BlockReduce2dSync)
// ================================================
// - Performs inter-thread reduction within each warp
// - Uses warp shuffle operations to exchange data between threads in the same warp
// - Implements a tree-reduction pattern with power-of-2 stages
// - Only reduces along dimensions that map to lane IDs within the warp
//
// STAGE 3: Cross-warp reduction (BlockReduce2dCrossWarpSync)
// ========================================================
// - Performs reduction across multiple warps within the same thread block
// - Uses shared memory (LDS) to facilitate data exchange between warps
// - Each warp's lane-0 thread stores its partial results to shared memory
// - All threads participate in loading and reducing data from shared memory
// - Implements block-level synchronization to ensure memory consistency
// BlockReduce2d: Thread-level reduction (Stage 1)
template <typename Problem_, typename Policy_ = void>
struct BlockReduce2d
{
// in-thread reduction
// Thread-level reduction implementation
using Problem = remove_cvref_t<Problem_>;
using XDataType = typename Problem::XDataType;
using ComputeDataType = typename Problem::ComputeDataType;
CK_TILE_DEVICE constexpr BlockReduce2d() {}
template <typename XDistributedTensor_,
typename YDistributedTensor_,
typename ReduceFunc,
typename ReducePacksPerXDim = uniform_sequence_gen_t<2, 1>>
template <
typename XDistributedTensor_,
typename YDistributedTensor_,
typename ReduceFunc,
typename ReducePacksPerXDim =
uniform_sequence_gen_t<2, 1>> // {1,1} = process 1 element at a time from each dimension
CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor,
YDistributedTensor_& y_tensor,
const ReduceFunc& reduce_func,
@@ -33,6 +68,7 @@ struct BlockReduce2d
y_tensor(idx_0), ck_tile::type_convert<ComputeDataType>(x_tensor[idx_])...);
},
ReducePacksPerXDim{});
#if 0
constexpr auto I0 = number<0>{};
constexpr auto I1 = number<1>{};
@@ -75,6 +111,8 @@ struct BlockReduce2d
return tensor;
}
// uniform_sequence_gen_t<NSize, Value> generates sequence of NSize elements filled with Value
// e.g., uniform_sequence_gen_t<2, 1> → {1, 1} and uniform_sequence_gen_t<3, 4> → {4, 4, 4}
template <typename XDistributedTensor_,
typename ReduceFunc,
typename ReducePacksPerXDim = uniform_sequence_gen_t<2, 1>>
@@ -91,6 +129,7 @@ struct BlockReduce2d
}
};
// BlockReduce2dSync: Warp-level reduction (Stage 2)
template <typename Problem_, typename Policy_ = void>
struct BlockReduce2dSync
{
@@ -145,8 +184,15 @@ struct BlockReduce2dSync
// pull data from remote lane
const auto v_remote = warp_shuffle(v_local, src_lane);
// reduce
v_local = reduce_func(v_local, v_remote);
// For reduce, use combine_partial_results for operations that require it
if constexpr(ReduceFunc::requires_special_combine)
{
v_local = reduce_func.combine_partial_results(v_local, v_remote);
}
else
{
v_local = reduce_func(v_local, v_remote);
}
});
}
});
@@ -157,6 +203,7 @@ struct BlockReduce2dSync
}
};
// BlockReduce2dCrossWarpSync: Cross-warp reduction (Stage 3)
template <typename Problem_, typename Policy_ = void>
struct BlockReduce2dCrossWarpSync
{
@@ -263,8 +310,15 @@ struct BlockReduce2dCrossWarpSync
constexpr auto i_1 = number<i_1_n1 + 1>{};
const DataType v_remote = all_scratch[i_0 * num_reduce_warps + i_1];
// reduce
v_local = reduce_func(v_local, v_remote);
// For reduce, use combine_partial_results for operations that require it
if constexpr(ReduceFunc::requires_special_combine)
{
v_local = reduce_func.combine_partial_results(v_local, v_remote);
}
else
{
v_local = reduce_func(v_local, v_remote);
}
});
y_tensor.get_thread_buffer()(i_0) = v_local;

View File

@@ -6,56 +6,16 @@
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/reduce/block/block_reduce.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp"
#include "ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp"
// Reduce2d Kernel:
// =======================================
// This kernel implements a 2D reduction operation that reduces data along the second dimension
// of a matrix. The reduction is performed in multiple hierarchical stages.
namespace ck_tile {
template <typename BlockWarps, // num warps along seq<M, N>
typename BlockTile, // block size, seq<M, N>
typename WarpTile, // warp size, seq<M, N>
typename Vector> // contiguous pixels(vector size) along seq<M, N>
struct Reduce2dShape
{
static constexpr index_t Block_M = BlockTile::at(number<0>{});
static constexpr index_t Block_N = BlockTile::at(number<1>{});
static constexpr index_t Warp_M = WarpTile::at(number<0>{});
static constexpr index_t Warp_N = WarpTile::at(number<1>{});
static constexpr index_t Vector_M = Vector::at(number<0>{});
static constexpr index_t Vector_N = Vector::at(number<1>{});
static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{});
static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{});
static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
static constexpr index_t BlockSize =
ck_tile::get_warp_size() * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
};
template <typename XDataType_,
typename ComputeDataType_,
typename YDataType_,
typename BlockShape_,
typename ReduceOp_>
struct Reduce2dProblem
{
using XDataType = remove_cvref_t<XDataType_>;
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
using YDataType = remove_cvref_t<YDataType_>;
using BlockShape = remove_cvref_t<BlockShape_>;
using ReduceOp = ReduceOp_;
static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
};
template <typename Problem_, typename Policy_ = BlockReduce2dDefaultPolicy>
template <typename Problem_, typename Policy_ = Reduce2dDefaultPolicy>
struct Reduce
{
using Problem = ck_tile::remove_cvref_t<Problem_>;
@@ -112,19 +72,56 @@ struct Reduce
store_tile(y_window, cast_tile<YDataType>(y_compute));
}
#else
CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) const
template <typename InputShape, typename InputStrides, typename KeptDim, typename ReduceDims>
CK_TILE_DEVICE void operator()(const XDataType* p_x,
YDataType* p_y,
InputShape input_shape,
InputStrides input_strides,
KeptDim kept_dim,
ReduceDims reduce_dims) const
{
using S = typename Problem::BlockShape;
const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
p_y, make_tuple(M), number<1>{});
using S = typename Problem::BlockShape;
const auto iM = get_block_id() * S::Block_M;
auto x_window = make_tile_window(x_m_n,
// Extract lengths based on kept and reduced dimensions
const auto kept_len = input_shape.at(number<kept_dim.at(0)>{});
const auto reduce_lens = [&]() {
return generate_tuple(
[&](auto I) { return input_shape.at(number<reduce_dims.at(I)>{}); },
number<reduce_dims.size()>{});
}();
// Create transforms
const auto pass_through_transform = make_pass_through_transform(kept_len);
const auto merge_transform = make_merge_transform(reduce_lens);
auto reduce_func = typename Problem::ReduceOp{};
const XDataType custom_padding_value =
type_convert<XDataType>(reduce_func.template GetIdentityValue<ComputeDataType>());
// Create input tensor view with custom padding value
// First create the descriptor
auto desc = make_naive_tensor_descriptor(
input_shape, input_strides, number<S::Vector_N>{}, number<1>{});
// Create buffer view with custom padding value
auto buffer_view = make_buffer_view<address_space_enum::global>(
p_x, desc.get_element_space_size(), custom_padding_value);
// Create tensor view with custom padding
const auto x_tensor = tensor_view<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
const auto transformed_x_tensor = pad_tensor_view(
transform_tensor_view(x_tensor,
ck_tile::make_tuple(pass_through_transform, merge_transform),
ck_tile::make_tuple(kept_dim, reduce_dims),
ck_tile::make_tuple(sequence<0>{}, sequence<1>{})),
make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
sequence<0, 1>{});
const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
p_y, make_tuple(kept_len), number<1>{});
auto x_window = make_tile_window(transformed_x_tensor,
make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
{iM, 0},
Policy::template MakeXBlockTileDistribution<Problem>());
@@ -133,10 +130,12 @@ struct Reduce
__shared__ char smem[Policy::template GetSmemSize<Problem>()];
// Get the merged dimension size from the transformed tensor
const auto merged_reduce_len =
transformed_x_tensor.get_tensor_descriptor().get_lengths().at(number<1>{});
index_t num_n_tile_iteration =
__builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
__builtin_amdgcn_readfirstlane(integer_divide_ceil(merged_reduce_len, S::Block_N));
auto reduce_func = typename Problem::ReduceOp{};
auto block_reduce2d = Policy::template GetBlockReduce2d<Problem>();
auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
auto block_reduce2d_cross_warp_sync =
@@ -158,6 +157,22 @@ struct Reduce
store_tile(y_window, cast_tile<YDataType>(y_compute));
}
template <typename ArgParser>
CK_TILE_HOST static bool IsSupportedArgument(const ArgParser& arg_parser)
{
using S = typename Problem::BlockShape;
if(arg_parser.get_int("n") % S::Vector_N != 0)
{
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
{
CK_TILE_ERROR("Size of n dimension should be a multiple of Vector_N !");
}
return false;
}
return true;
}
#endif
};

View File

@@ -9,7 +9,7 @@
namespace ck_tile {
struct BlockReduce2dDefaultPolicy
struct Reduce2dDefaultPolicy
{
template <typename Problem>
CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()

View File

@@ -0,0 +1,27 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
template <typename XDataType_,
typename ComputeDataType_,
typename YDataType_,
typename BlockShape_,
typename ReduceOp_>
struct Reduce2dProblem
{
using XDataType = remove_cvref_t<XDataType_>;
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
using YDataType = remove_cvref_t<YDataType_>;
using BlockShape = remove_cvref_t<BlockShape_>;
using ReduceOp = ReduceOp_;
static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
};
} // namespace ck_tile

View File

@@ -0,0 +1,37 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
template <typename BlockWarps, // num warps along seq<M, N>
typename BlockTile, // block size, seq<M, N>
typename WarpTile, // warp size, seq<M, N>
typename Vector> // contiguous pixels(vector size) along seq<M, N>
struct Reduce2dShape
{
static constexpr index_t Block_M = BlockTile::at(number<0>{});
static constexpr index_t Block_N = BlockTile::at(number<1>{});
static constexpr index_t Warp_M = WarpTile::at(number<0>{});
static constexpr index_t Warp_N = WarpTile::at(number<1>{});
static constexpr index_t Vector_M = Vector::at(number<0>{});
static constexpr index_t Vector_N = Vector::at(number<1>{});
static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{});
static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{});
static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
static constexpr index_t BlockSize =
ck_tile::get_warp_size() * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
};
} // namespace ck_tile

View File

@@ -13,3 +13,4 @@ add_subdirectory(moe_sorting)
add_subdirectory(slice_tile)
add_subdirectory(batched_transpose)
add_subdirectory(smoothquant)
add_subdirectory(reduce)

View File

@@ -0,0 +1,7 @@
if(GPU_TARGETS MATCHES "gfx9")
add_gtest_executable(test_ck_tile_reduce2d test_reduce2d.cpp)
if(result EQUAL 0)
target_link_libraries(test_ck_tile_reduce2d PRIVATE utility)
endif()
endif()

View File

@@ -0,0 +1,188 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <gtest/gtest.h>
#include <vector>
#include <cmath>
#include <tuple>
#include <iostream>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/reduce.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp"
#include "ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp"
#include "ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp"
#include "ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp"
#include "ck_tile/host/reference/reference_reduce.hpp"
template <typename Tuple>
class TestCkTileReduce2d : public ::testing::Test
{
protected:
using XDataType = std::tuple_element_t<0, Tuple>;
using ComputeDataType = std::tuple_element_t<1, Tuple>;
using YDataType = std::tuple_element_t<2, Tuple>;
using ReduceOpType = std::tuple_element_t<3, Tuple>;
using BlockWarps_ = std::tuple_element_t<4, Tuple>;
using BlockTile_ = std::tuple_element_t<5, Tuple>;
using WarpTile_ = std::tuple_element_t<6, Tuple>;
using Vector_ = std::tuple_element_t<7, Tuple>;
using TestReduce2dShape = ck_tile::Reduce2dShape<BlockWarps_, BlockTile_, WarpTile_, Vector_>;
void RunTest(ck_tile::index_t m, ck_tile::index_t n, ck_tile::index_t k)
{
// Problem shape: 3D tensor [M, N, K] -> reduce along [N, K] -> output [M]
std::vector<ck_tile::index_t> problem_shape = {m, n, k};
std::vector<ck_tile::index_t> strides(3);
strides[0] = n * k; // M stride
strides[1] = k; // N stride
strides[2] = 1; // K stride
constexpr auto kept_dim = ck_tile::sequence<0>{};
constexpr auto reduce_dims = ck_tile::sequence<1, 2>{};
ck_tile::HostTensor<XDataType> h_x(problem_shape, strides);
ck_tile::HostTensor<YDataType> h_y({problem_shape[kept_dim.at(0)]}, {1});
ck_tile::HostTensor<YDataType> h_y_ref({problem_shape[kept_dim.at(0)]}, {1});
ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(h_x);
h_y.SetZero();
h_y_ref.SetZero();
ck_tile::DeviceMem d_x_mem(h_x.get_element_space_size_in_bytes());
ck_tile::DeviceMem d_y_mem(h_y.get_element_space_size_in_bytes());
d_x_mem.ToDevice(h_x.data());
d_y_mem.ToDevice(h_y.data()); // Initialize device output buffer
// Problem and kernel setup
using Problem = ck_tile::
Reduce2dProblem<XDataType, ComputeDataType, YDataType, TestReduce2dShape, ReduceOpType>;
using Kernel = ck_tile::Reduce<Problem>;
// Launch configuration
constexpr ck_tile::index_t kBlockSize = 256;
constexpr ck_tile::index_t kBlockPerCu = 1;
ck_tile::index_t kGridSize =
(problem_shape[kept_dim.at(0)] + TestReduce2dShape::Block_M - 1) /
TestReduce2dShape::Block_M;
auto input_shape =
ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2]);
auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2]);
ck_tile::launch_kernel(ck_tile::stream_config{nullptr, false, 0},
ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
Kernel{},
kGridSize,
kBlockSize,
0,
static_cast<XDataType*>(d_x_mem.GetDeviceBuffer()),
static_cast<YDataType*>(d_y_mem.GetDeviceBuffer()),
input_shape,
input_strides,
kept_dim,
reduce_dims));
// Get results back
d_y_mem.FromDevice(h_y.data());
// Reference computation
ck_tile::reference_reduce<XDataType, ComputeDataType, YDataType>(
h_x, h_y_ref, ReduceOpType{}, kept_dim, reduce_dims);
// Calculate proper error thresholds based on data types and number of accumulations
const auto total_reduce_elements = n * k;
const auto rtol = ck_tile::get_relative_threshold<XDataType, YDataType, ComputeDataType>(
total_reduce_elements);
const auto atol = ck_tile::get_absolute_threshold<XDataType, YDataType, ComputeDataType>(
5.0f, total_reduce_elements);
bool result =
ck_tile::check_err(h_y, h_y_ref, "Error: Incorrect reduce results!", rtol, atol);
EXPECT_TRUE(result);
}
void RunTest2D(ck_tile::index_t m, ck_tile::index_t n)
{
// 2D case: [M, N] -> reduce along [N] -> output [M]
RunTest(m, n, 1);
}
};
// Shape parameters for different test configurations
using Shape1_BlockWarps = ck_tile::sequence<4, 1>;
using Shape1_BlockTile = ck_tile::sequence<128, 128>;
using Shape1_WarpTile = ck_tile::sequence<32, 128>;
using Shape1_Vector = ck_tile::sequence<8, 8>;
using Shape2_BlockWarps = ck_tile::sequence<2, 2>; // Cross-warp reduction test
using Shape2_BlockTile = ck_tile::sequence<2, 1024>;
using Shape2_WarpTile = ck_tile::sequence<1, 512>;
using Shape2_Vector = ck_tile::sequence<1, 8>;
// Test configurations for different data types and operations
using TestConfig_F32_Add = std::tuple<float,
float,
float,
ck_tile::ReduceOp::Add,
Shape1_BlockWarps,
Shape1_BlockTile,
Shape1_WarpTile,
Shape1_Vector>;
using TestConfig_F16_Add = std::tuple<ck_tile::half_t,
float,
ck_tile::half_t,
ck_tile::ReduceOp::Add,
Shape1_BlockWarps,
Shape1_BlockTile,
Shape1_WarpTile,
Shape1_Vector>;
using TestConfig_F32_CrossWarp = std::tuple<float,
float,
float,
ck_tile::ReduceOp::Add,
Shape2_BlockWarps,
Shape2_BlockTile,
Shape2_WarpTile,
Shape2_Vector>;
using TestConfig_F32_Max = std::tuple<float,
float,
float,
ck_tile::ReduceOp::Max,
Shape1_BlockWarps,
Shape1_BlockTile,
Shape1_WarpTile,
Shape1_Vector>;
using TestConfig_F32_SquareAdd = std::tuple<float,
float,
float,
ck_tile::ReduceOp::SquareAdd,
Shape1_BlockWarps,
Shape1_BlockTile,
Shape1_WarpTile,
Shape1_Vector>;
using TestTypes = ::testing::Types<TestConfig_F32_Add,
TestConfig_F16_Add,
TestConfig_F32_CrossWarp,
TestConfig_F32_Max,
TestConfig_F32_SquareAdd>;
TYPED_TEST_SUITE(TestCkTileReduce2d, TestTypes);
TYPED_TEST(TestCkTileReduce2d, test) { this->RunTest(128, 128, 1); }
TYPED_TEST(TestCkTileReduce2d, Reduce3D_512_1024_16) { this->RunTest(512, 1024, 16); }
TYPED_TEST(TestCkTileReduce2d, Reduce3D_150_170_6) { this->RunTest(150, 64, 3); }
TYPED_TEST(TestCkTileReduce2d, Reduce2D_128_128) { this->RunTest2D(128, 128); }