From 295cf136a09360ed7a979c4124e6bb2abd5a3a95 Mon Sep 17 00:00:00 2001 From: yashagar Date: Thu, 24 Jul 2025 17:59:43 +0000 Subject: [PATCH] comments resolution --- example/ck_tile/05_reduce/reduce.cpp | 50 ++-- .../ck_tile/core/utility/reduce_operator.hpp | 39 ++- .../host/reference/reference_reduce.hpp | 71 ++++-- .../ops/reduce/kernel/reduce2d_kernel.hpp | 110 ++++---- .../pipeline/reduce2d_default_policy.hpp | 6 +- .../ops/reduce/pipeline/reduce2d_problem.hpp | 2 +- .../ops/reduce/pipeline/reduce2d_shape.hpp | 12 +- test/ck_tile/reduce/test_reduce2d.cpp | 240 ++++++++++++++---- 8 files changed, 354 insertions(+), 176 deletions(-) diff --git a/example/ck_tile/05_reduce/reduce.cpp b/example/ck_tile/05_reduce/reduce.cpp index 070c295d33..5491d8f019 100644 --- a/example/ck_tile/05_reduce/reduce.cpp +++ b/example/ck_tile/05_reduce/reduce.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include "ck_tile/host.hpp" #include "ck_tile/ops/reduce.hpp" #include @@ -5,9 +8,10 @@ auto create_args(int argc, char* argv[]) { ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "2048", "m dimension") - .insert("n", "1024", "n dimension") - .insert("k", "2", "k dimension") + arg_parser.insert("n", "32", "n dimension") + .insert("h", "7", "h dimension") + .insert("w", "7", "w dimension") + .insert("c", "512", "c dimension") .insert("v", "1", "cpu validation or not") .insert("prec", "fp16", "precision") .insert("warmup", "0", "cold iter") @@ -24,27 +28,28 @@ bool run(const ck_tile::ArgParser& arg_parser) using ComputeDataType = float; using YDataType = DataType; - ck_tile::index_t m = arg_parser.get_int("m"); - ck_tile::index_t n = arg_parser.get_int("n"); - ck_tile::index_t k = arg_parser.get_int("k"); + ck_tile::index_t N = arg_parser.get_int("n"); + ck_tile::index_t H = arg_parser.get_int("h"); + ck_tile::index_t W = arg_parser.get_int("w"); + ck_tile::index_t C = arg_parser.get_int("c"); int do_validation = arg_parser.get_int("v"); int warmup = arg_parser.get_int("warmup"); int repeat = arg_parser.get_int("repeat"); - std::vector problem_shape = {m, n, k}; - std::vector strides(3); - strides[0] = n * k; - strides[1] = k; - strides[2] = 1; + std::vector problem_shape = {N, H, W, C}; + std::vector strides(4); + strides[0] = H * W * C; + strides[1] = W * C; + strides[2] = C; + strides[3] = 1; // Define reduction specification: - // dimension 0 is kept, dimensions 1,2 are reduced - constexpr auto kept_dim = ck_tile::sequence<0>{}; // Which dimension to keep (pass-through) - constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; // Which dimensions to reduce (merge) + constexpr auto kept_dim = ck_tile::sequence<0, 3>{}; // Which dimension to keep + constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; // Which dimensions to reduce ck_tile::HostTensor x_host(problem_shape, strides); - ck_tile::HostTensor y_host_ref({problem_shape[kept_dim.at(0)]}, {1}); - ck_tile::HostTensor y_host_dev({problem_shape[kept_dim.at(0)]}, {1}); + ck_tile::HostTensor y_host_ref({N,C}, {C,1}); + ck_tile::HostTensor y_host_dev({N,C}, {C,1}); ck_tile::FillUniformDistribution{-5.f, 5.f}(x_host); @@ -67,8 +72,9 @@ bool run(const ck_tile::ArgParser& arg_parser) constexpr ck_tile::index_t kBlockSize = 256; constexpr ck_tile::index_t kBlockPerCu = 1; + ck_tile::index_t kept_dim_len_prod = N*C; ck_tile::index_t kGridSize = - (problem_shape[kept_dim.at(0)] + BlockTile::at(ck_tile::number<0>{}) - 1) / + (kept_dim_len_prod + BlockTile::at(ck_tile::number<0>{}) - 1) / BlockTile::at(ck_tile::number<0>{}); std::cout << "grid size " << kGridSize << std::endl; @@ -79,10 +85,10 @@ bool run(const ck_tile::ArgParser& arg_parser) using Kernel = ck_tile::Reduce; // Create input tensor shape and strides - auto input_shape = ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2]); - auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2]); + auto input_shape = ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]); + auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2], strides[3]); - if(!Kernel::IsSupportedArgument(arg_parser)) + if(!Kernel::IsSupportedArgument(C)) //output tensor's continuous dimension { throw std::runtime_error("Wrong! Arguments not supported!\n"); } @@ -101,7 +107,7 @@ bool run(const ck_tile::ArgParser& arg_parser) reduce_dims)); std::size_t num_btype = - sizeof(XDataType) * m * n * k + sizeof(YDataType) * problem_shape[kept_dim.at(0)]; + sizeof(XDataType) * N * C * H * W + sizeof(YDataType) * N * C; float gb_per_sec = num_btype / 1.E6 / ave_time; @@ -114,7 +120,7 @@ bool run(const ck_tile::ArgParser& arg_parser) // reference ck_tile::reference_reduce( x_host, y_host_ref, ReduceOp{}, kept_dim, reduce_dims); - y_buf.FromDevice(y_host_dev.mData.data()); + y_buf.FromDevice(y_host_dev.mData.data()); pass = ck_tile::check_err(y_host_dev, y_host_ref); std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl; diff --git a/include/ck_tile/core/utility/reduce_operator.hpp b/include/ck_tile/core/utility/reduce_operator.hpp index e45e53fdbe..2d7ac78b06 100644 --- a/include/ck_tile/core/utility/reduce_operator.hpp +++ b/include/ck_tile/core/utility/reduce_operator.hpp @@ -26,7 +26,8 @@ struct Add } template || std::is_same_v>> + typename = std::enable_if_t || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE constexpr T operator()(T& y, T x) const { float y_ = type_convert(y); @@ -54,6 +55,16 @@ struct SquareAdd return y + (x * x); } + template || std::is_same_v || + std::is_same_v || std::is_same_v>> + CK_TILE_HOST_DEVICE constexpr T operator()(T& y, T x) const + { + float y_ = type_convert(y); + float x_ = type_convert(x); + return type_convert(y_ + (x_ * x_)); + } + // For combining partial results template || std::is_same_v || @@ -64,6 +75,16 @@ struct SquareAdd return partial1 + partial2; // Just add the partial sums, don't square again } + template || std::is_same_v || + std::is_same_v || std::is_same_v>> + CK_TILE_HOST_DEVICE constexpr T combine_partial_results(T& partial1, T& partial2) const + { + float partial1_ = type_convert(partial1); + float partial2_ = type_convert(partial2); + return type_convert(partial1_ + partial2_); + } + static constexpr bool requires_special_combine = true; }; @@ -71,7 +92,9 @@ struct Max { template || std::is_same_v || - std::is_same_v || std::is_same_v>> + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() { return numeric::min(); @@ -79,7 +102,9 @@ struct Max template || std::is_same_v || - std::is_same_v || std::is_same_v>> + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const { return max(y, x); @@ -92,7 +117,9 @@ struct AbsMax { template || std::is_same_v || - std::is_same_v || std::is_same_v>> + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() { return numeric::min(); @@ -100,7 +127,9 @@ struct AbsMax template || std::is_same_v || - std::is_same_v || std::is_same_v>> + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const { return max(y, abs(x)); diff --git a/include/ck_tile/host/reference/reference_reduce.hpp b/include/ck_tile/host/reference/reference_reduce.hpp index 0e04bf0177..348bcaa4fb 100644 --- a/include/ck_tile/host/reference/reference_reduce.hpp +++ b/include/ck_tile/host/reference/reference_reduce.hpp @@ -45,45 +45,76 @@ CK_TILE_HOST void reference_reduce(const HostTensor& x_tensor, ReduceDims reduce_dims) { const auto& x_lengths = x_tensor.mDesc.get_lengths(); - const auto kept_len = x_lengths[kept_dim.at(0)]; + const auto& x_strides = x_tensor.mDesc.get_strides(); + const auto& y_strides = y_tensor.mDesc.get_strides(); + + // Calculate total kept elements (product of all kept dimension lengths) + index_t total_kept_elements = 1; + static_for<0, kept_dim.size(), 1>{}( + [&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; }); - // Calculate total reduce elements + // Calculate total reduce elements (product of all reduce dimension lengths) index_t total_reduce_elements = 1; static_for<0, reduce_dims.size(), 1>{}( [&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; }); - auto f = [&](auto kept_idx) { + auto f = [&](auto linear_kept_idx) { ComputeDataType v_acc = reduce_op.template GetIdentityValue(); + // Convert linear kept index to multi-dimensional kept indices + std::vector kept_indices(kept_dim.size()); + index_t temp_kept = linear_kept_idx; + static_for<0, kept_dim.size(), 1>{}([&](auto i) { + constexpr auto dim_idx = kept_dim.size() - 1 - i; + constexpr auto dim = kept_dim.at(dim_idx); + const auto len = x_lengths[dim]; + kept_indices[dim_idx] = temp_kept % len; + temp_kept /= len; + }); + for(index_t reduce_idx = 0; reduce_idx < total_reduce_elements; ++reduce_idx) { - // Convert linear index to multi-dimensional indices - std::vector indices(x_lengths.size(), 0); - indices[kept_dim.at(0)] = kept_idx; - - index_t temp = reduce_idx; + // Convert linear reduce index to multi-dimensional reduce indices + std::vector reduce_indices(reduce_dims.size()); + index_t temp_reduce = reduce_idx; static_for<0, reduce_dims.size(), 1>{}([&](auto i) { - constexpr auto dim = reduce_dims.at(reduce_dims.size() - 1 - i); - const auto len = x_lengths[dim]; - indices[dim] = temp % len; - temp /= len; + constexpr auto dim_idx = reduce_dims.size() - 1 - i; + constexpr auto dim = reduce_dims.at(dim_idx); + const auto len = x_lengths[dim]; + reduce_indices[dim_idx] = temp_reduce % len; + temp_reduce /= len; }); - // Flat tensor access - index_t flat_idx = 0; - const auto& strides = x_tensor.mDesc.get_strides(); - for(size_t d = 0; d < indices.size(); ++d) + // Build full input tensor indices by combining kept and reduce indices + std::vector full_indices(x_lengths.size(), 0); + static_for<0, kept_dim.size(), 1>{}([&](auto i) { + full_indices[kept_dim.at(i)] = kept_indices[i]; + }); + static_for<0, reduce_dims.size(), 1>{}([&](auto i) { + full_indices[reduce_dims.at(i)] = reduce_indices[i]; + }); + + // Calculate flat input tensor index + index_t flat_x_idx = 0; + for(size_t d = 0; d < full_indices.size(); ++d) { - flat_idx += indices[d] * strides[d]; + flat_x_idx += full_indices[d] * x_strides[d]; } - const auto v_a = type_convert(x_tensor.mData[flat_idx]); + const auto v_a = type_convert(x_tensor.mData[flat_x_idx]); v_acc = reduce_op(v_acc, v_a); } - y_tensor(kept_idx) = type_convert(v_acc); + // Calculate output tensor index using kept indices and output strides + // The output tensor has the same structure as the kept dimensions + index_t flat_y_idx = 0; + static_for<0, kept_dim.size(), 1>{}([&](auto i) { + flat_y_idx += kept_indices[i] * y_strides[i]; + }); + + y_tensor.mData[flat_y_idx] = type_convert(v_acc); }; - make_ParallelTensorFunctor(f, kept_len)(std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency()); } } // namespace ck_tile diff --git a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp index 01523c4279..ed8d212a12 100644 --- a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp +++ b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -25,53 +25,6 @@ struct Reduce using ComputeDataType = ck_tile::remove_cvref_t; using YDataType = ck_tile::remove_cvref_t; -#if 0 - CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) - const - { - using S = typename Problem::BlockShape; - - const auto x_m_n = make_naive_tensor_view( - p_x, make_tuple(M, N), make_tuple(N, 1), number{}, number<1>{}); - - const auto y_m = make_naive_tensor_view_packed( - p_y, make_tuple(M), number<1>{}); - - const auto iM = get_block_id() * S::Block_M; - - auto x_window = make_tile_window(x_m_n, - make_tuple(number{}, number{}), - {iM, 0}, - Policy::template MakeXBlockTileDistribution()); - - auto y_window = make_tile_window(y_m, make_tuple(number{}), {iM}); - - const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; }; - - const XDataType reduce_init_value = 0; - - constexpr auto reduce_dims = sequence<1>{}; - - auto y_compute = decltype(block_tile_reduce( - load_tile(x_window), reduce_dims, f_reduce, reduce_init_value)){}; - - set_tile(y_compute, reduce_init_value); - - index_t num_n_tile_iteration = - __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N)); - - for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) - { - const auto x = load_tile(x_window); - block_tile_reduce(y_compute, x, reduce_dims, f_reduce); - move_tile_window(x_window, {0, S::Block_N}); - } - - block_tile_reduce_sync(y_compute, f_reduce); - - store_tile(y_window, cast_tile(y_compute)); - } -#else template CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, @@ -83,26 +36,31 @@ struct Reduce using S = typename Problem::BlockShape; const auto iM = get_block_id() * S::Block_M; + static_assert(kept_dim.size() + reduce_dims.size() == InputShape::size(), + "Size of kept dimensions + reduced dimensions must equal input tensor rank"); + // Extract lengths based on kept and reduced dimensions - const auto kept_len = input_shape.at(number{}); + const auto kept_lens = [&]() { + return generate_tuple( + [&](auto I) { return input_shape.at(number{}); }, + number{}); + }(); const auto reduce_lens = [&]() { return generate_tuple( [&](auto I) { return input_shape.at(number{}); }, number{}); }(); - // Create transforms - const auto pass_through_transform = make_pass_through_transform(kept_len); - const auto merge_transform = make_merge_transform(reduce_lens); + const auto kept_merge_transform = make_merge_transform(kept_lens); + const auto reduce_merge_transform = make_merge_transform(reduce_lens); auto reduce_func = typename Problem::ReduceOp{}; const XDataType custom_padding_value = type_convert(reduce_func.template GetIdentityValue()); // Create input tensor view with custom padding value - // First create the descriptor auto desc = make_naive_tensor_descriptor( - input_shape, input_strides, number{}, number<1>{}); + input_shape, input_strides, number{}, number<1>{}); // Create buffer view with custom padding value auto buffer_view = make_buffer_view( @@ -112,21 +70,43 @@ struct Reduce const auto x_tensor = tensor_view{buffer_view, desc}; const auto transformed_x_tensor = pad_tensor_view( transform_tensor_view(x_tensor, - ck_tile::make_tuple(pass_through_transform, merge_transform), - ck_tile::make_tuple(kept_dim, reduce_dims), - ck_tile::make_tuple(sequence<0>{}, sequence<1>{})), + make_tuple(kept_merge_transform, reduce_merge_transform), + make_tuple(kept_dim, reduce_dims), + make_tuple(sequence<0>{}, sequence<1>{})), make_tuple(number{}, number{}), sequence<0, 1>{}); - const auto y_m = make_naive_tensor_view_packed( - p_y, make_tuple(kept_len), number<1>{}); + // Calculate strides for output tensor based on its own dimensions + const auto kept_strides = [&]() { + return generate_tuple( + [&](auto I) { + // Calculate stride for dimension I as product of all following dimensions + index_t stride = 1; + static_for{}([&](auto J) { + stride *= kept_lens.at(number{}); + }); + return stride; + }, + number{}); + }(); + + const auto y_m = make_naive_tensor_view( + p_y, kept_lens, kept_strides, number<16 / sizeof(YDataType)>{}, number<1>{}); + + // Transform output tensor to 1D merged view + // This creates a view compatible with the 2D reduction pattern + const auto y_merged = transform_tensor_view( + y_m, + make_tuple(kept_merge_transform), + make_tuple(typename arithmetic_sequence_gen<0, kept_dim.size(), 1>::type{}), + make_tuple(sequence<0>{})); auto x_window = make_tile_window(transformed_x_tensor, make_tuple(number{}, number{}), {iM, 0}, Policy::template MakeXBlockTileDistribution()); - auto y_window = make_tile_window(y_m, make_tuple(number{}), {iM}); + auto y_window = make_tile_window(y_merged, make_tuple(number{}), {iM}); __shared__ char smem[Policy::template GetSmemSize()]; @@ -158,22 +138,22 @@ struct Reduce store_tile(y_window, cast_tile(y_compute)); } - template - CK_TILE_HOST static bool IsSupportedArgument(const ArgParser& arg_parser) + CK_TILE_HOST static bool IsSupportedArgument(index_t y_continous_dim) { using S = typename Problem::BlockShape; - if(arg_parser.get_int("n") % S::Vector_N != 0) + + if(y_continous_dim % S::ThreadTile_N != 0) { if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) { - CK_TILE_ERROR("Size of n dimension should be a multiple of Vector_N !"); + CK_TILE_ERROR("Total reduction size should be a multiple of ThreadTile_N!"); } return false; } + return true; } -#endif }; } // namespace ck_tile diff --git a/include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp b/include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp index 3e1f894fde..9d333232f7 100644 --- a/include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp +++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -18,8 +18,8 @@ struct Reduce2dDefaultPolicy return make_static_tile_distribution( tile_distribution_encoding< sequence<>, - tuple, - sequence>, + tuple, + sequence>, tuple, sequence<1, 2>>, tuple, sequence<2, 2>>, sequence<1, 1, 2, 2>, diff --git a/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp b/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp index 524a7f7663..67fdec9286 100644 --- a/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp +++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp b/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp index f791cc48bf..283db9326b 100644 --- a/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp +++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -10,7 +10,7 @@ namespace ck_tile { template typename BlockTile, // block size, seq typename WarpTile, // warp size, seq - typename Vector> // contiguous pixels(vector size) along seq + typename ThreadTile> // contiguous pixels(vector size) along seq struct Reduce2dShape { static constexpr index_t Block_M = BlockTile::at(number<0>{}); @@ -19,14 +19,14 @@ struct Reduce2dShape static constexpr index_t Warp_M = WarpTile::at(number<0>{}); static constexpr index_t Warp_N = WarpTile::at(number<1>{}); - static constexpr index_t Vector_M = Vector::at(number<0>{}); - static constexpr index_t Vector_N = Vector::at(number<1>{}); + static constexpr index_t ThreadTile_M = ThreadTile::at(number<0>{}); + static constexpr index_t ThreadTile_N = ThreadTile::at(number<1>{}); static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{}); static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{}); - static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M; - static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N; + static constexpr index_t ThreadPerWarp_M = Warp_M / ThreadTile_M; + static constexpr index_t ThreadPerWarp_N = Warp_N / ThreadTile_N; static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); diff --git a/test/ck_tile/reduce/test_reduce2d.cpp b/test/ck_tile/reduce/test_reduce2d.cpp index 8c6f31608f..87554d3478 100644 --- a/test/ck_tile/reduce/test_reduce2d.cpp +++ b/test/ck_tile/reduce/test_reduce2d.cpp @@ -11,14 +11,9 @@ #include "ck_tile/host.hpp" #include "ck_tile/ops/reduce.hpp" #include "ck_tile/host/kernel_launch.hpp" -#include "ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp" -#include "ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp" -#include "ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp" -#include "ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp" -#include "ck_tile/host/reference/reference_reduce.hpp" template -class TestCkTileReduce2d : public ::testing::Test +class TestCkTileReduce : public ::testing::Test { protected: using XDataType = std::tuple_element_t<0, Tuple>; @@ -28,25 +23,23 @@ class TestCkTileReduce2d : public ::testing::Test using BlockWarps_ = std::tuple_element_t<4, Tuple>; using BlockTile_ = std::tuple_element_t<5, Tuple>; using WarpTile_ = std::tuple_element_t<6, Tuple>; - using Vector_ = std::tuple_element_t<7, Tuple>; + using ThreadTile_ = std::tuple_element_t<7, Tuple>; - using TestReduce2dShape = ck_tile::Reduce2dShape; + using TestReduce2dShape = ck_tile::Reduce2dShape; - void RunTest(ck_tile::index_t m, ck_tile::index_t n, ck_tile::index_t k) + template + void RunGenericTest(const std::vector& input_shape, + const std::vector& input_strides, + const std::vector& output_shape, + const std::vector& output_strides, + ck_tile::index_t kept_dim_len_prod, + ck_tile::index_t total_reduce_elements, + KeptDimSeq kept_dims, + ReduceDimSeq reduce_dims) { - // Problem shape: 3D tensor [M, N, K] -> reduce along [N, K] -> output [M] - std::vector problem_shape = {m, n, k}; - std::vector strides(3); - strides[0] = n * k; // M stride - strides[1] = k; // N stride - strides[2] = 1; // K stride - - constexpr auto kept_dim = ck_tile::sequence<0>{}; - constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; - - ck_tile::HostTensor h_x(problem_shape, strides); - ck_tile::HostTensor h_y({problem_shape[kept_dim.at(0)]}, {1}); - ck_tile::HostTensor h_y_ref({problem_shape[kept_dim.at(0)]}, {1}); + ck_tile::HostTensor h_x(input_shape, input_strides); + ck_tile::HostTensor h_y(output_shape, output_strides); + ck_tile::HostTensor h_y_ref(output_shape, output_strides); ck_tile::FillUniformDistribution{-5.f, 5.f}(h_x); h_y.SetZero(); @@ -67,36 +60,47 @@ class TestCkTileReduce2d : public ::testing::Test // Launch configuration constexpr ck_tile::index_t kBlockSize = 256; constexpr ck_tile::index_t kBlockPerCu = 1; + ck_tile::index_t kGridSize = - (problem_shape[kept_dim.at(0)] + TestReduce2dShape::Block_M - 1) / + (kept_dim_len_prod + TestReduce2dShape::Block_M - 1) / TestReduce2dShape::Block_M; - auto input_shape = - ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2]); - auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2]); + // Generic helper to create tuple from vector based on compile-time size + auto make_shape_tuple = [](const std::vector& vec) { + return [&vec](std::index_sequence) { + return ck_tile::make_tuple(vec[I]...); + }(std::make_index_sequence{}); + }; + + auto input_shape_tuple = make_shape_tuple.template operator()(input_shape); + auto input_strides_tuple = make_shape_tuple.template operator()(input_strides); + + if(!Kernel::IsSupportedArgument(output_shape[output_shape.size()-1])) //output tensor's continuous dimension + { + throw std::runtime_error("Wrong! Arguments not supported!\n"); + } ck_tile::launch_kernel(ck_tile::stream_config{nullptr, false, 0}, - ck_tile::make_kernel( - Kernel{}, - kGridSize, - kBlockSize, - 0, - static_cast(d_x_mem.GetDeviceBuffer()), - static_cast(d_y_mem.GetDeviceBuffer()), - input_shape, - input_strides, - kept_dim, - reduce_dims)); + ck_tile::make_kernel( + Kernel{}, + kGridSize, + kBlockSize, + 0, + static_cast(d_x_mem.GetDeviceBuffer()), + static_cast(d_y_mem.GetDeviceBuffer()), + input_shape_tuple, + input_strides_tuple, + kept_dims, + reduce_dims)); // Get results back d_y_mem.FromDevice(h_y.data()); // Reference computation ck_tile::reference_reduce( - h_x, h_y_ref, ReduceOpType{}, kept_dim, reduce_dims); + h_x, h_y_ref, ReduceOpType{}, kept_dims, reduce_dims); // Calculate proper error thresholds based on data types and number of accumulations - const auto total_reduce_elements = n * k; const auto rtol = ck_tile::get_relative_threshold( total_reduce_elements); const auto atol = ck_tile::get_absolute_threshold( @@ -107,23 +111,126 @@ class TestCkTileReduce2d : public ::testing::Test EXPECT_TRUE(result); } - void RunTest2D(ck_tile::index_t m, ck_tile::index_t n) + // Convenience functions for specific dimensional patterns + void RunTest2D_KeepDim0_ReduceDim1(ck_tile::index_t dim0, ck_tile::index_t dim1) { - // 2D case: [M, N] -> reduce along [N] -> output [M] - RunTest(m, n, 1); + constexpr auto kept_dims = ck_tile::sequence<0>{}; + constexpr auto reduce_dims = ck_tile::sequence<1>{}; + + // Input shape and strides + std::vector input_shape = {dim0, dim1}; + std::vector input_strides = {dim1, 1}; + + // Output shape and strides (keep dim0) + std::vector output_shape = {dim0}; + std::vector output_strides = {1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = dim0; + ck_tile::index_t total_reduce_elements = dim1; + + RunGenericTest<2>(input_shape, input_strides, output_shape, output_strides, + kept_dim_len_prod, total_reduce_elements, kept_dims, reduce_dims); } + + void RunTest3D_KeepDim0_ReduceDim12(ck_tile::index_t dim0, ck_tile::index_t dim1, ck_tile::index_t dim2) + { + constexpr auto kept_dims = ck_tile::sequence<0>{}; + constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; + + // Input shape and strides + std::vector input_shape = {dim0, dim1, dim2}; + std::vector input_strides = {dim1 * dim2, dim2, 1}; + + // Output shape and strides (keep dim0) + std::vector output_shape = {dim0}; + std::vector output_strides = {1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = dim0; // product of kept dimensions + ck_tile::index_t total_reduce_elements = dim1 * dim2; // product of reduced dimensions + + RunGenericTest<3>(input_shape, input_strides, output_shape, output_strides, + kept_dim_len_prod, total_reduce_elements, kept_dims, reduce_dims); + } + + void RunTest3D_KeepDim01_ReduceDim2(ck_tile::index_t dim0, ck_tile::index_t dim1, ck_tile::index_t dim2) + { + constexpr auto kept_dims = ck_tile::sequence<0,1>{}; + constexpr auto reduce_dims = ck_tile::sequence<2>{}; + + // Input shape and strides + std::vector input_shape = {dim0, dim1, dim2}; + std::vector input_strides = {dim1 * dim2, dim2, 1}; + + // Output shape and strides (keep dim0) + std::vector output_shape = {dim0, dim1}; + std::vector output_strides = {dim1, 1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = dim0 * dim1; // product of kept dimensions + ck_tile::index_t total_reduce_elements = dim2; // product of reduced dimensions + + RunGenericTest<3>(input_shape, input_strides, output_shape, output_strides, + kept_dim_len_prod, total_reduce_elements, kept_dims, reduce_dims); + } + + + + void RunTest4D_KeepDim01_ReduceDim23(ck_tile::index_t N, ck_tile::index_t C, ck_tile::index_t H, ck_tile::index_t W) + { + constexpr auto kept_dims = ck_tile::sequence<0, 1>{}; + constexpr auto reduce_dims = ck_tile::sequence<2, 3>{}; + + // Input shape and strides + std::vector input_shape = {N, C, H, W}; + std::vector input_strides = {C * H * W, H * W, W, 1}; + + // Output shape and strides (keep dim0, dim1) + std::vector output_shape = {N, C}; + std::vector output_strides = {C, 1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = N * C; // product of kept dimensions + ck_tile::index_t total_reduce_elements = H * W; // product of reduced dimensions + + RunGenericTest<4>(input_shape, input_strides, output_shape, output_strides, + kept_dim_len_prod, total_reduce_elements, kept_dims, reduce_dims); + } + + void RunTest4D_KeepDim03_ReduceDim12(ck_tile::index_t N, ck_tile::index_t H, ck_tile::index_t W, ck_tile::index_t C) + { + constexpr auto kept_dims = ck_tile::sequence<0, 3>{}; + constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; + + // Input shape and strides + std::vector input_shape = {N, H, W, C}; + std::vector input_strides = {H * W * C, W * C, C, 1}; + + // Output shape and strides (keep dim0, dim1) + std::vector output_shape = {N, C}; + std::vector output_strides = {C, 1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = N * C; // product of kept dimensions + ck_tile::index_t total_reduce_elements = H * W; // product of reduced dimensions + + RunGenericTest<4>(input_shape, input_strides, output_shape, output_strides, + kept_dim_len_prod, total_reduce_elements, kept_dims, reduce_dims); + } + }; // Shape parameters for different test configurations using Shape1_BlockWarps = ck_tile::sequence<4, 1>; using Shape1_BlockTile = ck_tile::sequence<128, 128>; using Shape1_WarpTile = ck_tile::sequence<32, 128>; -using Shape1_Vector = ck_tile::sequence<8, 8>; +using Shape1_ThreadTile = ck_tile::sequence<8, 8>; using Shape2_BlockWarps = ck_tile::sequence<2, 2>; // Cross-warp reduction test using Shape2_BlockTile = ck_tile::sequence<2, 1024>; using Shape2_WarpTile = ck_tile::sequence<1, 512>; -using Shape2_Vector = ck_tile::sequence<1, 8>; +using Shape2_ThreadTile = ck_tile::sequence<1, 8>; // Test configurations for different data types and operations using TestConfig_F32_Add = std::tuple; + Shape1_ThreadTile>; using TestConfig_F16_Add = std::tuple; + Shape1_ThreadTile>; using TestConfig_F32_CrossWarp = std::tuple; + Shape2_ThreadTile>; using TestConfig_F32_Max = std::tuple; + Shape1_ThreadTile>; using TestConfig_F32_SquareAdd = std::tuple; + Shape1_ThreadTile>; using TestTypes = ::testing::Types; -TYPED_TEST_SUITE(TestCkTileReduce2d, TestTypes); +TYPED_TEST_SUITE(TestCkTileReduce, TestTypes); -TYPED_TEST(TestCkTileReduce2d, test) { this->RunTest(128, 128, 1); } +// 2D Tests - Keep dim0, reduce dim1 +TYPED_TEST(TestCkTileReduce, Test2D_KeepDim0_ReduceDim1_64x32) +{ + this->RunTest2D_KeepDim0_ReduceDim1(64, 32); +} -TYPED_TEST(TestCkTileReduce2d, Reduce3D_512_1024_16) { this->RunTest(512, 1024, 16); } +TYPED_TEST(TestCkTileReduce, Test2D_KeepDim0_ReduceDim1_1024x512) +{ + this->RunTest2D_KeepDim0_ReduceDim1(1024, 512); +} -TYPED_TEST(TestCkTileReduce2d, Reduce3D_150_170_6) { this->RunTest(150, 64, 3); } +// 3D Tests - Keep dim0, reduce dim1,2 +TYPED_TEST(TestCkTileReduce, Test3D_KeepDim0_ReduceDim12_128x128x1) +{ + this->RunTest3D_KeepDim0_ReduceDim12(128, 128, 8); +} +// 3D Tests - Keep dim0,1, reduce dim1 +TYPED_TEST(TestCkTileReduce, Test3D_KeepDim01_ReduceDim2_512x1024x16) +{ + this->RunTest3D_KeepDim01_ReduceDim2(512, 1024, 16); +} -TYPED_TEST(TestCkTileReduce2d, Reduce2D_128_128) { this->RunTest2D(128, 128); } +// 4D Tests - Keep dim0,1, reduce dim2,3 (NCHW -> NC) +TYPED_TEST(TestCkTileReduce, Test4D_KeepDim01_ReduceDim23_32x256x16x16) +{ + this->RunTest4D_KeepDim01_ReduceDim23(32, 256, 16, 16); +} +// 4D Tests - Keep dim0,3, reduce dim1,2 (NHWC -> NC) +TYPED_TEST(TestCkTileReduce, Test4D_KeepDim03_ReduceDim12_16x32x32x128) +{ + this->RunTest4D_KeepDim03_ReduceDim12(16, 32, 32, 128); +}