mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 11:30:02 +00:00
Dlejeune/ck tile 2d multiple reductions (#3147)
* WIP
* Add Unit tests for the Multi Reduction Kernel
* clang format
* Rename multiblock to threadwise
* Multiblock WIP
* Fix multi reduce multi block unit tests
* Multi Reduce Tile Engine: WIP
* refactoring + try addressing precision error
* Fix multiops examples
* Cleanup
* Clean up tile engine's reduce op
* Update changelog
* Fix remod/clang
* Fix dates
* Fix documentation & missing file
* Fix comments
* Use the update_tile api in the multi-block kernel
* Unify threadwise/multiblock into a single kernel + default multiblock output to float in tests
* Add TilePartitioner
* Cleanup
* Add warning when no data to process, in the example
* Refactoring Reduce kernel Tile Partitioner + cleanup
* Move the tile partitioner to its own file
* Add missing includes
* Fix copyright header with update_amd_copyright_headers.py
* Fix change of interface in Reduce2dProblem
---------
Co-authored-by: Damien Lejeune <damien.lejeune@amd.com>
Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
[ROCm/composable_kernel commit: 4216d43da8]
This commit is contained in:
@@ -3,8 +3,12 @@
|
||||
|
||||
# Register the ck_tile reduce2d gtest binaries only for supported GPU targets.
if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
    add_gtest_executable(test_ck_tile_reduce2d test_reduce2d.cpp)
    add_gtest_executable(test_ck_tile_multi_reduce2d_threadwise test_multi_reduce2d_threadwise.cpp)
    add_gtest_executable(test_ck_tile_multi_reduce2d_multiblock test_multi_reduce2d_multiblock.cpp)
    # NOTE(review): `result` is set by add_gtest_executable upstream — confirm.
    if(result EQUAL 0)
        target_link_libraries(test_ck_tile_reduce2d PRIVATE utility)
        target_link_libraries(test_ck_tile_multi_reduce2d_threadwise PRIVATE utility)
        target_link_libraries(test_ck_tile_multi_reduce2d_multiblock PRIVATE utility)
    endif()
endif()
|
||||
|
||||
|
||||
34
test/ck_tile/reduce/test_multi_reduce2d_common.hpp
Normal file
34
test/ck_tile/reduce/test_multi_reduce2d_common.hpp
Normal file
@@ -0,0 +1,34 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/ops/elementwise.hpp"
|
||||
// Overload methods required for the parametrize tests
|
||||
|
||||
// Overload for PassThrough (no parameter)
|
||||
inline ck_tile::element_wise::PassThrough make_elementwise_op(int32_t,
|
||||
ck_tile::element_wise::PassThrough)
|
||||
{
|
||||
return ck_tile::element_wise::PassThrough{};
|
||||
}
|
||||
|
||||
// Overload for UnaryDivide (needs parameter)
|
||||
inline ck_tile::element_wise::UnaryDivide make_elementwise_op(int32_t total_reduce_elements,
|
||||
ck_tile::element_wise::UnaryDivide)
|
||||
{
|
||||
return ck_tile::element_wise::UnaryDivide{total_reduce_elements};
|
||||
}
|
||||
|
||||
// Overload for UnarySquare (no parameter)
|
||||
inline ck_tile::element_wise::UnarySquare make_elementwise_op(int32_t,
|
||||
ck_tile::element_wise::UnarySquare)
|
||||
{
|
||||
return ck_tile::element_wise::UnarySquare{};
|
||||
}
|
||||
|
||||
template <typename... Ops>
|
||||
auto make_elementwise_ops_tuple(int32_t total_reduce_elements, ck_tile::tuple<Ops...>)
|
||||
{
|
||||
return ck_tile::make_tuple(make_elementwise_op(total_reduce_elements, Ops{})...);
|
||||
}
|
||||
91
test/ck_tile/reduce/test_multi_reduce2d_multiblock.cpp
Normal file
91
test/ck_tile/reduce/test_multi_reduce2d_multiblock.cpp
Normal file
@@ -0,0 +1,91 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <tuple>
|
||||
#include <iostream>
|
||||
#include <cstring>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "ck_tile/ops/reduce.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
#include "ck_tile/ops/elementwise.hpp"
|
||||
|
||||
#include "test_multi_reduce2d_multiblock_impl.hpp"
|
||||
|
||||
// Shape parameters for different test configurations
|
||||
using Shape1_BlockWarps = ck_tile::sequence<4, 1>;
|
||||
using Shape1_BlockTile = ck_tile::sequence<128, 128>;
|
||||
using Shape1_WarpTile = ck_tile::sequence<32, 128>;
|
||||
using Shape1_ThreadTile = ck_tile::sequence<8, 8>;
|
||||
|
||||
// Test configurations for different data types and operations
|
||||
using TestConfig_F16_Add = std::tuple<ck_tile::half_t,
|
||||
float,
|
||||
float, // Output and multiblock reducing buffer. Using float
|
||||
// to avoid too many accumulation errors
|
||||
ck_tile::tuple<ck_tile::ReduceOp::Add>,
|
||||
ck_tile::tuple<ck_tile::element_wise::PassThrough>,
|
||||
ck_tile::tuple<ck_tile::element_wise::PassThrough>,
|
||||
ck_tile::tuple<ck_tile::ReduceOp::Add>,
|
||||
Shape1_BlockWarps,
|
||||
Shape1_BlockTile,
|
||||
Shape1_WarpTile,
|
||||
Shape1_ThreadTile>;
|
||||
|
||||
using TestConfig_F16_Add_MeanSquare = std::tuple<
|
||||
ck_tile::half_t,
|
||||
float,
|
||||
float, // Output and multiblock reducing buffer. Using float to avoid too many accumulation
|
||||
// errors
|
||||
ck_tile::tuple<ck_tile::ReduceOp::Add, ck_tile::ReduceOp::Add>, // Intra block reductions
|
||||
ck_tile::tuple<ck_tile::element_wise::PassThrough,
|
||||
ck_tile::element_wise::UnarySquare>, // Elementwise
|
||||
// ops
|
||||
ck_tile::tuple<ck_tile::element_wise::PassThrough,
|
||||
ck_tile::element_wise::UnaryDivide>, // Accumulator Elementiwise ops, intra block
|
||||
ck_tile::tuple<ck_tile::ReduceOp::Add, ck_tile::ReduceOp::Add>, // Inter block reduction
|
||||
Shape1_BlockWarps,
|
||||
Shape1_BlockTile,
|
||||
Shape1_WarpTile,
|
||||
Shape1_ThreadTile>;
|
||||
|
||||
using TestTypes = ::testing::Types<TestConfig_F16_Add, TestConfig_F16_Add_MeanSquare>;
|
||||
|
||||
TYPED_TEST_SUITE(TestCkTileMultiReduceMultiblock, TestTypes);
|
||||
|
||||
// 2D Tests - Keep dim0, reduce dim1
|
||||
TYPED_TEST(TestCkTileMultiReduceMultiblock, Test2D_KeepDim0_ReduceDim1_64x32)
|
||||
{
|
||||
this->RunTest2D_KeepDim0_ReduceDim1(64, 32);
|
||||
}
|
||||
|
||||
TYPED_TEST(TestCkTileMultiReduceMultiblock, Test2D_KeepDim0_ReduceDim1_1024x512)
|
||||
{
|
||||
this->RunTest2D_KeepDim0_ReduceDim1(1024, 512);
|
||||
}
|
||||
|
||||
// 3D Tests - Keep dim0, reduce dim1,2
|
||||
TYPED_TEST(TestCkTileMultiReduceMultiblock, Test3D_KeepDim0_ReduceDim12_128x128x1)
|
||||
{
|
||||
this->RunTest3D_KeepDim0_ReduceDim12(128, 128, 8);
|
||||
}
|
||||
// 3D Tests - Keep dim0,1, reduce dim1
|
||||
TYPED_TEST(TestCkTileMultiReduceMultiblock, Test3D_KeepDim01_ReduceDim2_512x1024x16)
|
||||
{
|
||||
this->RunTest3D_KeepDim01_ReduceDim2(512, 1024, 16);
|
||||
}
|
||||
|
||||
// 4D Tests - Keep dim0,1, reduce dim2,3 (NCHW -> NC)
|
||||
TYPED_TEST(TestCkTileMultiReduceMultiblock, Test4D_KeepDim01_ReduceDim23_32x256x16x16)
|
||||
{
|
||||
this->RunTest4D_KeepDim01_ReduceDim23(32, 256, 16, 16);
|
||||
}
|
||||
// 4D Tests - Keep dim0,3, reduce dim1,2 (NHWC -> NC)
|
||||
TYPED_TEST(TestCkTileMultiReduceMultiblock, Test4D_KeepDim03_ReduceDim12_16x32x32x128)
|
||||
{
|
||||
this->RunTest4D_KeepDim03_ReduceDim12(16, 32, 32, 128);
|
||||
}
|
||||
355
test/ck_tile/reduce/test_multi_reduce2d_multiblock_impl.hpp
Normal file
355
test/ck_tile/reduce/test_multi_reduce2d_multiblock_impl.hpp
Normal file
@@ -0,0 +1,355 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <tuple>
|
||||
#include <iostream>
|
||||
#include <cstring>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "ck_tile/ops/reduce.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
|
||||
#include "test_multi_reduce2d_common.hpp"
|
||||
|
||||
template <typename Tuple>
|
||||
class TestCkTileMultiReduceMultiblock : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
using XDataType = std::tuple_element_t<0, Tuple>;
|
||||
using ComputeDataType = std::tuple_element_t<1, Tuple>;
|
||||
using YDataType = std::tuple_element_t<2, Tuple>;
|
||||
using ReduceOpsType = std::tuple_element_t<3, Tuple>;
|
||||
using ElementwiseOpsType = std::tuple_element_t<4, Tuple>;
|
||||
using AccumulatorOpsType = std::tuple_element_t<5, Tuple>;
|
||||
using InterBlockReduceOpsType = std::tuple_element_t<6, Tuple>;
|
||||
using BlockWarps_ = std::tuple_element_t<7, Tuple>;
|
||||
using BlockTile_ = std::tuple_element_t<8, Tuple>;
|
||||
using WarpTile_ = std::tuple_element_t<9, Tuple>;
|
||||
using ThreadTile_ = std::tuple_element_t<10, Tuple>;
|
||||
|
||||
using TestReduce2dShape =
|
||||
ck_tile::Reduce2dShape<BlockWarps_, BlockTile_, WarpTile_, ThreadTile_>;
|
||||
|
||||
template <std::size_t InputDim, typename KeptDimSeq, typename ReduceDimSeq>
|
||||
void RunGenericTest(const std::vector<ck_tile::index_t>& input_shape,
|
||||
const std::vector<ck_tile::index_t>& input_strides,
|
||||
const std::vector<ck_tile::index_t>& output_shape,
|
||||
const std::vector<ck_tile::index_t>& output_strides,
|
||||
ck_tile::index_t kept_dim_len_prod,
|
||||
ck_tile::index_t total_reduce_elements,
|
||||
KeptDimSeq kept_dims,
|
||||
ReduceDimSeq reduce_dims)
|
||||
{
|
||||
static_assert(
|
||||
ReduceOpsType::size() == ElementwiseOpsType::size() &&
|
||||
ReduceOpsType::size() == AccumulatorOpsType::size() &&
|
||||
ReduceOpsType::size() == InterBlockReduceOpsType::size(),
|
||||
"Error: All operations tuple size must match the number of reduction operations");
|
||||
|
||||
const auto number_operations = ReduceOpsType::size();
|
||||
|
||||
ck_tile::HostTensor<XDataType> h_x(input_shape, input_strides);
|
||||
|
||||
auto h_ys = ck_tile::generate_tuple(
|
||||
[&output_shape, &output_strides](auto /*i*/) {
|
||||
return ck_tile::HostTensor<YDataType>(output_shape, output_strides);
|
||||
},
|
||||
ck_tile::number<number_operations>{});
|
||||
|
||||
auto h_ys_ref = ck_tile::generate_tuple(
|
||||
[&output_shape, &output_strides](auto /*i*/) {
|
||||
return ck_tile::HostTensor<YDataType>(output_shape, output_strides);
|
||||
},
|
||||
ck_tile::number<number_operations>{});
|
||||
|
||||
ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(h_x);
|
||||
|
||||
ck_tile::static_for<0, number_operations, 1>{}([&](auto i) {
|
||||
h_ys.template at<i>().SetZero();
|
||||
h_ys_ref.template at<i>().SetZero();
|
||||
});
|
||||
|
||||
auto output_number_elements = [&output_shape]() {
|
||||
ck_tile::index_t prod = 1;
|
||||
for(auto len : output_shape)
|
||||
prod *= len;
|
||||
return prod;
|
||||
}();
|
||||
|
||||
auto output_buffer_size =
|
||||
number_operations * h_ys.get(ck_tile::number<0>{}).get_element_space_size_in_bytes();
|
||||
ck_tile::DeviceMem d_x_mem(h_x.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem d_y_mem(output_buffer_size);
|
||||
|
||||
std::vector<YDataType> h(number_operations * output_number_elements);
|
||||
|
||||
// Init the output data with identity values respective to each reduce op
|
||||
ck_tile::static_for<0, number_operations, 1>{}([&](auto i) {
|
||||
constexpr auto op = ReduceOpsType{}.at(i);
|
||||
const auto identity_val = op.template GetIdentityValue<YDataType>();
|
||||
std::fill(h.begin() + i * output_number_elements,
|
||||
h.begin() + (i + 1) * output_number_elements,
|
||||
identity_val);
|
||||
});
|
||||
|
||||
d_x_mem.ToDevice(h_x.data());
|
||||
d_y_mem.ToDevice(h.data());
|
||||
|
||||
using Problem = ck_tile::Reduce2dProblem<XDataType,
|
||||
ComputeDataType,
|
||||
YDataType,
|
||||
TestReduce2dShape,
|
||||
ReduceOpsType,
|
||||
KeptDimSeq,
|
||||
ReduceDimSeq,
|
||||
InputDim>;
|
||||
|
||||
using Kernel = ck_tile::MultiReduceMultiblock<Problem>;
|
||||
|
||||
// Launch configuration
|
||||
const ck_tile::index_t kBlockSize = Kernel::BlockSize();
|
||||
constexpr ck_tile::index_t kBlockPerCu = 1;
|
||||
|
||||
auto elementwise_ops =
|
||||
make_elementwise_ops_tuple(total_reduce_elements, ElementwiseOpsType{});
|
||||
auto accumulator_ops =
|
||||
make_elementwise_ops_tuple(total_reduce_elements, AccumulatorOpsType{});
|
||||
|
||||
auto [num_block_tile_iterations, block_group_size] =
|
||||
typename Kernel::TilePartitioner{total_reduce_elements}.GetBlockGroupParams();
|
||||
|
||||
std::cout << "Block group size: " << block_group_size
|
||||
<< ", Num block tile iterations: " << num_block_tile_iterations
|
||||
<< ", Reduce total length: " << total_reduce_elements << std::endl;
|
||||
|
||||
ck_tile::index_t kGridSize =
|
||||
((kept_dim_len_prod + TestReduce2dShape::Block_M - 1) / TestReduce2dShape::Block_M) *
|
||||
block_group_size;
|
||||
|
||||
// Generic helper to create tuple from vector based on compile-time size
|
||||
auto make_shape_tuple = []<std::size_t N>(const std::vector<ck_tile::index_t>& vec) {
|
||||
return [&vec]<std::size_t... I>(std::index_sequence<I...>) {
|
||||
return ck_tile::make_tuple(vec[I]...);
|
||||
}(std::make_index_sequence<N>{});
|
||||
};
|
||||
|
||||
auto input_shape_tuple = make_shape_tuple.template operator()<InputDim>(input_shape);
|
||||
auto input_strides_tuple = make_shape_tuple.template operator()<InputDim>(input_strides);
|
||||
|
||||
if(!Kernel::IsSupportedArgument(
|
||||
total_reduce_elements,
|
||||
input_strides_tuple)) // output tensor's continuous dimension
|
||||
{
|
||||
throw std::runtime_error("Wrong! Arguments not supported!\n");
|
||||
}
|
||||
|
||||
ck_tile::launch_kernel(
|
||||
ck_tile::stream_config{nullptr, false, 0},
|
||||
ck_tile::make_kernel<kBlockPerCu>(Kernel{},
|
||||
kGridSize,
|
||||
kBlockSize,
|
||||
0,
|
||||
static_cast<XDataType*>(d_x_mem.GetDeviceBuffer()),
|
||||
static_cast<YDataType*>(d_y_mem.GetDeviceBuffer()),
|
||||
input_shape_tuple,
|
||||
input_strides_tuple,
|
||||
kept_dims,
|
||||
reduce_dims,
|
||||
output_number_elements,
|
||||
elementwise_ops,
|
||||
accumulator_ops,
|
||||
InterBlockReduceOpsType{}));
|
||||
|
||||
// Reference computation
|
||||
ck_tile::reference_multiple_reduce_multiblock<XDataType, ComputeDataType, YDataType>(
|
||||
h_x,
|
||||
h_ys_ref,
|
||||
ReduceOpsType{},
|
||||
kept_dims,
|
||||
reduce_dims,
|
||||
elementwise_ops,
|
||||
accumulator_ops,
|
||||
InterBlockReduceOpsType{},
|
||||
block_group_size);
|
||||
|
||||
// Calculate proper error thresholds based on data types and number of accumulations
|
||||
// const auto rtol = ck_tile::get_relative_threshold<XDataType, YDataType, ComputeDataType>(
|
||||
// total_reduce_elements);
|
||||
// const auto atol = ck_tile::get_absolute_threshold<YDataType, YDataType, ComputeDataType>(
|
||||
// 5.0f, total_reduce_elements);
|
||||
|
||||
// Unfortunately due to the non-sequenciality, down-casting on the output buffer
|
||||
// and further operations on this buffer, the error is compounding at a faster
|
||||
// rate than what the host reference can support. A large tolerance is then required
|
||||
const auto rtol = 1e-2;
|
||||
const auto atol = 1e-1;
|
||||
|
||||
// Transfer data from device and check error for each operation
|
||||
std::vector<YDataType> h_y_tmp(output_number_elements * number_operations);
|
||||
d_y_mem.FromDevice(h_y_tmp.data());
|
||||
bool result = true;
|
||||
ck_tile::static_for<0, number_operations, 1>{}([&](auto i) {
|
||||
std::memcpy(h_ys.get(ck_tile::number<i>{}).data(),
|
||||
h_y_tmp.data() + i * output_number_elements,
|
||||
output_number_elements * sizeof(YDataType));
|
||||
std::cout << "Checking errors for operation: " << i << std::endl;
|
||||
result &= ck_tile::check_err(h_ys.get(ck_tile::number<i>{}),
|
||||
h_ys_ref.get(ck_tile::number<i>{}),
|
||||
"Error: Incorrect reduce results!",
|
||||
rtol,
|
||||
atol);
|
||||
});
|
||||
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
// Convenience functions for specific dimensional patterns
|
||||
void RunTest2D_KeepDim0_ReduceDim1(ck_tile::index_t dim0, ck_tile::index_t dim1)
|
||||
{
|
||||
constexpr auto kept_dims = ck_tile::sequence<0>{};
|
||||
constexpr auto reduce_dims = ck_tile::sequence<1>{};
|
||||
|
||||
// Input shape and strides
|
||||
std::vector<ck_tile::index_t> input_shape = {dim0, dim1};
|
||||
std::vector<ck_tile::index_t> input_strides = {dim1, 1};
|
||||
|
||||
// Output shape and strides (keep dim0)
|
||||
std::vector<ck_tile::index_t> output_shape = {dim0};
|
||||
std::vector<ck_tile::index_t> output_strides = {1};
|
||||
|
||||
// Calculate products
|
||||
ck_tile::index_t kept_dim_len_prod = dim0;
|
||||
ck_tile::index_t total_reduce_elements = dim1;
|
||||
|
||||
RunGenericTest<2>(input_shape,
|
||||
input_strides,
|
||||
output_shape,
|
||||
output_strides,
|
||||
kept_dim_len_prod,
|
||||
total_reduce_elements,
|
||||
kept_dims,
|
||||
reduce_dims);
|
||||
}
|
||||
|
||||
void RunTest3D_KeepDim0_ReduceDim12(ck_tile::index_t dim0,
|
||||
ck_tile::index_t dim1,
|
||||
ck_tile::index_t dim2)
|
||||
{
|
||||
constexpr auto kept_dims = ck_tile::sequence<0>{};
|
||||
constexpr auto reduce_dims = ck_tile::sequence<1, 2>{};
|
||||
|
||||
// Input shape and strides
|
||||
std::vector<ck_tile::index_t> input_shape = {dim0, dim1, dim2};
|
||||
std::vector<ck_tile::index_t> input_strides = {dim1 * dim2, dim2, 1};
|
||||
|
||||
// Output shape and strides (keep dim0)
|
||||
std::vector<ck_tile::index_t> output_shape = {dim0};
|
||||
std::vector<ck_tile::index_t> output_strides = {1};
|
||||
|
||||
// Calculate products
|
||||
ck_tile::index_t kept_dim_len_prod = dim0; // product of kept dimensions
|
||||
ck_tile::index_t total_reduce_elements = dim1 * dim2; // product of reduced dimensions
|
||||
|
||||
RunGenericTest<3>(input_shape,
|
||||
input_strides,
|
||||
output_shape,
|
||||
output_strides,
|
||||
kept_dim_len_prod,
|
||||
total_reduce_elements,
|
||||
kept_dims,
|
||||
reduce_dims);
|
||||
}
|
||||
|
||||
void RunTest3D_KeepDim01_ReduceDim2(ck_tile::index_t dim0,
|
||||
ck_tile::index_t dim1,
|
||||
ck_tile::index_t dim2)
|
||||
{
|
||||
constexpr auto kept_dims = ck_tile::sequence<0, 1>{};
|
||||
constexpr auto reduce_dims = ck_tile::sequence<2>{};
|
||||
|
||||
// Input shape and strides
|
||||
std::vector<ck_tile::index_t> input_shape = {dim0, dim1, dim2};
|
||||
std::vector<ck_tile::index_t> input_strides = {dim1 * dim2, dim2, 1};
|
||||
|
||||
// Output shape and strides (keep dim0, dim1)
|
||||
std::vector<ck_tile::index_t> output_shape = {dim0, dim1};
|
||||
std::vector<ck_tile::index_t> output_strides = {dim1, 1};
|
||||
|
||||
// Calculate products
|
||||
ck_tile::index_t kept_dim_len_prod = dim0 * dim1; // product of kept dimensions
|
||||
ck_tile::index_t total_reduce_elements = dim2; // product of reduced dimensions
|
||||
|
||||
RunGenericTest<3>(input_shape,
|
||||
input_strides,
|
||||
output_shape,
|
||||
output_strides,
|
||||
kept_dim_len_prod,
|
||||
total_reduce_elements,
|
||||
kept_dims,
|
||||
reduce_dims);
|
||||
}
|
||||
|
||||
void RunTest4D_KeepDim01_ReduceDim23(ck_tile::index_t N,
|
||||
ck_tile::index_t C,
|
||||
ck_tile::index_t H,
|
||||
ck_tile::index_t W)
|
||||
{
|
||||
constexpr auto kept_dims = ck_tile::sequence<0, 1>{};
|
||||
constexpr auto reduce_dims = ck_tile::sequence<2, 3>{};
|
||||
|
||||
// Input shape and strides
|
||||
std::vector<ck_tile::index_t> input_shape = {N, C, H, W};
|
||||
std::vector<ck_tile::index_t> input_strides = {C * H * W, H * W, W, 1};
|
||||
|
||||
// Output shape and strides (keep dim0, dim1)
|
||||
std::vector<ck_tile::index_t> output_shape = {N, C};
|
||||
std::vector<ck_tile::index_t> output_strides = {C, 1};
|
||||
|
||||
// Calculate products
|
||||
ck_tile::index_t kept_dim_len_prod = N * C; // product of kept dimensions
|
||||
ck_tile::index_t total_reduce_elements = H * W; // product of reduced dimensions
|
||||
|
||||
RunGenericTest<4>(input_shape,
|
||||
input_strides,
|
||||
output_shape,
|
||||
output_strides,
|
||||
kept_dim_len_prod,
|
||||
total_reduce_elements,
|
||||
kept_dims,
|
||||
reduce_dims);
|
||||
}
|
||||
|
||||
void RunTest4D_KeepDim03_ReduceDim12(ck_tile::index_t N,
|
||||
ck_tile::index_t H,
|
||||
ck_tile::index_t W,
|
||||
ck_tile::index_t C)
|
||||
{
|
||||
constexpr auto kept_dims = ck_tile::sequence<0, 3>{};
|
||||
constexpr auto reduce_dims = ck_tile::sequence<1, 2>{};
|
||||
|
||||
// Input shape and strides
|
||||
std::vector<ck_tile::index_t> input_shape = {N, H, W, C};
|
||||
std::vector<ck_tile::index_t> input_strides = {H * W * C, W * C, C, 1};
|
||||
|
||||
// Output shape and strides (keep dim0, dim3)
|
||||
std::vector<ck_tile::index_t> output_shape = {N, C};
|
||||
std::vector<ck_tile::index_t> output_strides = {C, 1};
|
||||
|
||||
// Calculate products
|
||||
ck_tile::index_t kept_dim_len_prod = N * C; // product of kept dimensions
|
||||
ck_tile::index_t total_reduce_elements = H * W; // product of reduced dimensions
|
||||
|
||||
RunGenericTest<4>(input_shape,
|
||||
input_strides,
|
||||
output_shape,
|
||||
output_strides,
|
||||
kept_dim_len_prod,
|
||||
total_reduce_elements,
|
||||
kept_dims,
|
||||
reduce_dims);
|
||||
}
|
||||
};
|
||||
96
test/ck_tile/reduce/test_multi_reduce2d_threadwise.cpp
Normal file
96
test/ck_tile/reduce/test_multi_reduce2d_threadwise.cpp
Normal file
@@ -0,0 +1,96 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <tuple>
|
||||
#include <iostream>
|
||||
#include <cstring>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "ck_tile/ops/reduce.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
|
||||
#include "test_multi_reduce2d_threadwise_impl.hpp"
|
||||
|
||||
// Shape parameters for different test configurations
|
||||
using Shape1_BlockWarps = ck_tile::sequence<4, 1>;
|
||||
using Shape1_BlockTile = ck_tile::sequence<128, 128>;
|
||||
using Shape1_WarpTile = ck_tile::sequence<32, 128>;
|
||||
using Shape1_ThreadTile = ck_tile::sequence<8, 8>;
|
||||
|
||||
using Shape2_BlockWarps = ck_tile::sequence<2, 2>; // Cross-warp reduction test
|
||||
using Shape2_BlockTile = ck_tile::sequence<2, 1024>;
|
||||
using Shape2_WarpTile = ck_tile::sequence<1, 512>;
|
||||
using Shape2_ThreadTile = ck_tile::sequence<1, 8>;
|
||||
|
||||
// Test configurations for different data types and operations
|
||||
using TestConfig_F16_Add = std::tuple<ck_tile::half_t,
|
||||
float,
|
||||
ck_tile::half_t,
|
||||
ck_tile::tuple<ck_tile::ReduceOp::Add>,
|
||||
ck_tile::tuple<ck_tile::element_wise::PassThrough>,
|
||||
ck_tile::tuple<ck_tile::element_wise::PassThrough>,
|
||||
ck_tile::tuple<ck_tile::element_wise::PassThrough>,
|
||||
Shape1_BlockWarps,
|
||||
Shape1_BlockTile,
|
||||
Shape1_WarpTile,
|
||||
Shape1_ThreadTile>;
|
||||
|
||||
using TestConfig_F16_Add_Max = std::tuple<
|
||||
ck_tile::half_t,
|
||||
float,
|
||||
ck_tile::half_t,
|
||||
ck_tile::tuple<ck_tile::ReduceOp::Add, ck_tile::ReduceOp::Max, ck_tile::ReduceOp::Add>,
|
||||
ck_tile::tuple<ck_tile::element_wise::PassThrough,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
ck_tile::element_wise::UnarySquare>,
|
||||
ck_tile::tuple<ck_tile::element_wise::PassThrough,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
ck_tile::element_wise::UnaryDivide>,
|
||||
ck_tile::tuple<ck_tile::element_wise::PassThrough,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
ck_tile::element_wise::PassThrough>,
|
||||
Shape1_BlockWarps,
|
||||
Shape1_BlockTile,
|
||||
Shape1_WarpTile,
|
||||
Shape1_ThreadTile>;
|
||||
|
||||
using TestTypes = ::testing::Types<TestConfig_F16_Add, TestConfig_F16_Add_Max>;
|
||||
|
||||
TYPED_TEST_SUITE(TestCkTileMultiReduceThreadwise, TestTypes);
|
||||
|
||||
// 2D Tests - Keep dim0, reduce dim1
|
||||
TYPED_TEST(TestCkTileMultiReduceThreadwise, Test2D_KeepDim0_ReduceDim1_64x32)
|
||||
{
|
||||
this->RunTest2D_KeepDim0_ReduceDim1(64, 32);
|
||||
}
|
||||
|
||||
TYPED_TEST(TestCkTileMultiReduceThreadwise, Test2D_KeepDim0_ReduceDim1_1024x512)
|
||||
{
|
||||
this->RunTest2D_KeepDim0_ReduceDim1(1024, 512);
|
||||
}
|
||||
|
||||
// 3D Tests - Keep dim0, reduce dim1,2
|
||||
TYPED_TEST(TestCkTileMultiReduceThreadwise, Test3D_KeepDim0_ReduceDim12_128x128x1)
|
||||
{
|
||||
this->RunTest3D_KeepDim0_ReduceDim12(128, 128, 8);
|
||||
}
|
||||
// 3D Tests - Keep dim0,1, reduce dim1
|
||||
TYPED_TEST(TestCkTileMultiReduceThreadwise, Test3D_KeepDim01_ReduceDim2_512x1024x16)
|
||||
{
|
||||
this->RunTest3D_KeepDim01_ReduceDim2(512, 512, 16);
|
||||
}
|
||||
|
||||
// 4D Tests - Keep dim0,1, reduce dim2,3 (NCHW -> NC)
|
||||
TYPED_TEST(TestCkTileMultiReduceThreadwise, Test4D_KeepDim01_ReduceDim23_32x256x16x16)
|
||||
{
|
||||
this->RunTest4D_KeepDim01_ReduceDim23(32, 256, 16, 16);
|
||||
}
|
||||
// 4D Tests - Keep dim0,3, reduce dim1,2 (NHWC -> NC)
|
||||
TYPED_TEST(TestCkTileMultiReduceThreadwise, Test4D_KeepDim03_ReduceDim12_16x32x32x128)
|
||||
{
|
||||
this->RunTest4D_KeepDim03_ReduceDim12(16, 32, 32, 128);
|
||||
}
|
||||
325
test/ck_tile/reduce/test_multi_reduce2d_threadwise_impl.hpp
Normal file
325
test/ck_tile/reduce/test_multi_reduce2d_threadwise_impl.hpp
Normal file
@@ -0,0 +1,325 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <tuple>
|
||||
#include <iostream>
|
||||
#include <cstring>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "ck_tile/ops/reduce.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
|
||||
#include "test_multi_reduce2d_common.hpp"
|
||||
|
||||
template <typename Tuple>
|
||||
class TestCkTileMultiReduceThreadwise : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
using XDataType = std::tuple_element_t<0, Tuple>;
|
||||
using ComputeDataType = std::tuple_element_t<1, Tuple>;
|
||||
using YDataType = std::tuple_element_t<2, Tuple>;
|
||||
using ReduceOpsType = std::tuple_element_t<3, Tuple>;
|
||||
using ElementwiseOpsType = std::tuple_element_t<4, Tuple>;
|
||||
using AccumulatorOpsType = std::tuple_element_t<5, Tuple>;
|
||||
using InterBlockReduceOpsType = std::tuple_element_t<6, Tuple>;
|
||||
using BlockWarps_ = std::tuple_element_t<7, Tuple>;
|
||||
using BlockTile_ = std::tuple_element_t<8, Tuple>;
|
||||
using WarpTile_ = std::tuple_element_t<9, Tuple>;
|
||||
using ThreadTile_ = std::tuple_element_t<10, Tuple>;
|
||||
|
||||
using TestReduce2dShape =
|
||||
ck_tile::Reduce2dShape<BlockWarps_, BlockTile_, WarpTile_, ThreadTile_>;
|
||||
|
||||
template <std::size_t InputDim, typename KeptDimSeq, typename ReduceDimSeq>
|
||||
void RunGenericTest(const std::vector<ck_tile::index_t>& input_shape,
|
||||
const std::vector<ck_tile::index_t>& input_strides,
|
||||
const std::vector<ck_tile::index_t>& output_shape,
|
||||
const std::vector<ck_tile::index_t>& output_strides,
|
||||
ck_tile::index_t kept_dim_len_prod,
|
||||
ck_tile::index_t total_reduce_elements,
|
||||
KeptDimSeq kept_dims,
|
||||
ReduceDimSeq reduce_dims)
|
||||
{
|
||||
const auto number_operations = ReduceOpsType::size();
|
||||
|
||||
ck_tile::HostTensor<XDataType> h_x(input_shape, input_strides);
|
||||
|
||||
auto h_ys = ck_tile::generate_tuple(
|
||||
[&output_shape, &output_strides](auto /*i*/) {
|
||||
return ck_tile::HostTensor<YDataType>(output_shape, output_strides);
|
||||
},
|
||||
ck_tile::number<number_operations>{});
|
||||
|
||||
auto h_ys_ref = ck_tile::generate_tuple(
|
||||
[&output_shape, &output_strides](auto /*i*/) {
|
||||
return ck_tile::HostTensor<YDataType>(output_shape, output_strides);
|
||||
},
|
||||
ck_tile::number<number_operations>{});
|
||||
|
||||
ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(h_x);
|
||||
|
||||
ck_tile::static_for<0, number_operations, 1>{}([&](auto i) {
|
||||
h_ys.template at<i>().SetZero();
|
||||
h_ys_ref.template at<i>().SetZero();
|
||||
});
|
||||
|
||||
auto output_number_elements = [&output_shape]() {
|
||||
ck_tile::index_t prod = 1;
|
||||
for(auto len : output_shape)
|
||||
prod *= len;
|
||||
return prod;
|
||||
}();
|
||||
|
||||
auto output_buffer_size =
|
||||
number_operations * h_ys.get(ck_tile::number<0>{}).get_element_space_size_in_bytes();
|
||||
ck_tile::DeviceMem d_x_mem(h_x.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem d_y_mem(output_buffer_size);
|
||||
|
||||
d_x_mem.ToDevice(h_x.data());
|
||||
|
||||
// Problem and kernel setup
|
||||
using Problem = ck_tile::Reduce2dProblem<XDataType,
|
||||
ComputeDataType,
|
||||
YDataType,
|
||||
TestReduce2dShape,
|
||||
ReduceOpsType,
|
||||
KeptDimSeq,
|
||||
ReduceDimSeq,
|
||||
InputDim>;
|
||||
|
||||
using Kernel = ck_tile::MultiReduceThreadWise<Problem>;
|
||||
|
||||
// Launch configuration
|
||||
const ck_tile::index_t kBlockSize = Kernel::BlockSize();
|
||||
constexpr ck_tile::index_t kBlockPerCu = 1;
|
||||
|
||||
ck_tile::index_t kGridSize =
|
||||
(kept_dim_len_prod + TestReduce2dShape::Block_M - 1) / TestReduce2dShape::Block_M;
|
||||
|
||||
// Generic helper to create tuple from vector based on compile-time size
|
||||
auto make_shape_tuple = []<std::size_t N>(const std::vector<ck_tile::index_t>& vec) {
|
||||
return [&vec]<std::size_t... I>(std::index_sequence<I...>) {
|
||||
return ck_tile::make_tuple(vec[I]...);
|
||||
}(std::make_index_sequence<N>{});
|
||||
};
|
||||
|
||||
auto input_shape_tuple = make_shape_tuple.template operator()<InputDim>(input_shape);
|
||||
auto input_strides_tuple = make_shape_tuple.template operator()<InputDim>(input_strides);
|
||||
|
||||
if(!Kernel::IsSupportedArgument(
|
||||
total_reduce_elements,
|
||||
input_strides_tuple)) // output tensor's continuous dimension
|
||||
{
|
||||
throw std::runtime_error("Wrong! Arguments not supported!\n");
|
||||
}
|
||||
|
||||
auto elementwise_ops =
|
||||
make_elementwise_ops_tuple(total_reduce_elements, ElementwiseOpsType{});
|
||||
auto accumulator_ops =
|
||||
make_elementwise_ops_tuple(total_reduce_elements, AccumulatorOpsType{});
|
||||
|
||||
ck_tile::launch_kernel(
|
||||
ck_tile::stream_config{nullptr, false, 0},
|
||||
ck_tile::make_kernel<kBlockPerCu>(Kernel{},
|
||||
kGridSize,
|
||||
kBlockSize,
|
||||
0,
|
||||
static_cast<XDataType*>(d_x_mem.GetDeviceBuffer()),
|
||||
static_cast<YDataType*>(d_y_mem.GetDeviceBuffer()),
|
||||
input_shape_tuple,
|
||||
input_strides_tuple,
|
||||
kept_dims,
|
||||
reduce_dims,
|
||||
output_number_elements,
|
||||
elementwise_ops,
|
||||
accumulator_ops));
|
||||
|
||||
// Reference computation
|
||||
ck_tile::reference_multiple_reduce<XDataType, ComputeDataType, YDataType>(h_x,
|
||||
h_ys_ref,
|
||||
ReduceOpsType{},
|
||||
kept_dims,
|
||||
reduce_dims,
|
||||
elementwise_ops,
|
||||
accumulator_ops);
|
||||
|
||||
// Calculate proper error thresholds based on data types and number of accumulations
|
||||
// const auto rtol = ck_tile::get_relative_threshold<XDataType, YDataType, ComputeDataType>(
|
||||
// total_reduce_elements);
|
||||
// const auto atol = ck_tile::get_absolute_threshold<XDataType, YDataType, ComputeDataType>(
|
||||
// 5.0f, total_reduce_elements);
|
||||
|
||||
// Unfortunately due to the non-sequenciality, down-casting on the output buffer
|
||||
// and further operations on this buffer, the error is compounding at a faster
|
||||
// rate than what the host reference can support. A large tolerance is then required
|
||||
const auto rtol = 1e-2;
|
||||
const auto atol = 1e-1;
|
||||
|
||||
// Transfer data from device and check error for each operation
|
||||
std::vector<YDataType> h_y_tmp(output_number_elements * number_operations);
|
||||
d_y_mem.FromDevice(h_y_tmp.data());
|
||||
bool result = true;
|
||||
ck_tile::static_for<0, number_operations, 1>{}([&](auto i) {
|
||||
std::memcpy(h_ys.get(ck_tile::number<i>{}).data(),
|
||||
h_y_tmp.data() + i * output_number_elements,
|
||||
output_number_elements * sizeof(YDataType));
|
||||
result &= ck_tile::check_err(h_ys.get(ck_tile::number<i>{}),
|
||||
h_ys_ref.get(ck_tile::number<i>{}),
|
||||
"Error: Incorrect reduce results!",
|
||||
rtol,
|
||||
atol);
|
||||
});
|
||||
|
||||
EXPECT_TRUE(result);
|
||||
}
|
||||
|
||||
// Convenience functions for specific dimensional patterns
|
||||
/// Reduce a packed row-major 2D tensor [dim0, dim1] along dim1, keeping dim0.
/// Delegates to RunGenericTest after building the tensor descriptors.
void RunTest2D_KeepDim0_ReduceDim1(ck_tile::index_t dim0, ck_tile::index_t dim1)
{
    constexpr auto kept_dims   = ck_tile::sequence<0>{};
    constexpr auto reduce_dims = ck_tile::sequence<1>{};

    // Packed row-major input layout: [dim0, dim1] with strides [dim1, 1].
    std::vector<ck_tile::index_t> in_lengths{dim0, dim1};
    std::vector<ck_tile::index_t> in_strides{dim1, 1};

    // Only dim0 survives the reduction; output is a contiguous 1D tensor.
    std::vector<ck_tile::index_t> out_lengths{dim0};
    std::vector<ck_tile::index_t> out_strides{1};

    // Element counts: product of kept dims and product of reduced dims.
    ck_tile::index_t kept_elements    = dim0;
    ck_tile::index_t reduced_elements = dim1;

    RunGenericTest<2>(in_lengths,
                      in_strides,
                      out_lengths,
                      out_strides,
                      kept_elements,
                      reduced_elements,
                      kept_dims,
                      reduce_dims);
}
|
||||
|
||||
/// Reduce a packed row-major 3D tensor [dim0, dim1, dim2] over its last two
/// dimensions, keeping dim0. Delegates to RunGenericTest.
void RunTest3D_KeepDim0_ReduceDim12(ck_tile::index_t dim0,
                                    ck_tile::index_t dim1,
                                    ck_tile::index_t dim2)
{
    constexpr auto kept_dims   = ck_tile::sequence<0>{};
    constexpr auto reduce_dims = ck_tile::sequence<1, 2>{};

    // Packed row-major input layout: [dim0, dim1, dim2].
    std::vector<ck_tile::index_t> in_lengths{dim0, dim1, dim2};
    std::vector<ck_tile::index_t> in_strides{dim1 * dim2, dim2, 1};

    // Only dim0 survives the reduction; output is a contiguous 1D tensor.
    std::vector<ck_tile::index_t> out_lengths{dim0};
    std::vector<ck_tile::index_t> out_strides{1};

    // Element counts: product of kept dims and product of reduced dims.
    ck_tile::index_t kept_elements    = dim0;
    ck_tile::index_t reduced_elements = dim1 * dim2;

    RunGenericTest<3>(in_lengths,
                      in_strides,
                      out_lengths,
                      out_strides,
                      kept_elements,
                      reduced_elements,
                      kept_dims,
                      reduce_dims);
}
|
||||
|
||||
/// Reduce a packed row-major 3D tensor [dim0, dim1, dim2] along its innermost
/// dimension, keeping dim0 and dim1. Delegates to RunGenericTest.
void RunTest3D_KeepDim01_ReduceDim2(ck_tile::index_t dim0,
                                    ck_tile::index_t dim1,
                                    ck_tile::index_t dim2)
{
    constexpr auto kept_dims   = ck_tile::sequence<0, 1>{};
    constexpr auto reduce_dims = ck_tile::sequence<2>{};

    // Packed row-major input layout: [dim0, dim1, dim2].
    std::vector<ck_tile::index_t> in_lengths{dim0, dim1, dim2};
    std::vector<ck_tile::index_t> in_strides{dim1 * dim2, dim2, 1};

    // dim0 and dim1 survive; output is a packed row-major 2D tensor.
    std::vector<ck_tile::index_t> out_lengths{dim0, dim1};
    std::vector<ck_tile::index_t> out_strides{dim1, 1};

    // Element counts: product of kept dims and product of reduced dims.
    ck_tile::index_t kept_elements    = dim0 * dim1;
    ck_tile::index_t reduced_elements = dim2;

    RunGenericTest<3>(in_lengths,
                      in_strides,
                      out_lengths,
                      out_strides,
                      kept_elements,
                      reduced_elements,
                      kept_dims,
                      reduce_dims);
}
|
||||
|
||||
/// NCHW-style reduction: reduce a packed row-major 4D tensor [N, C, H, W] over
/// its spatial dimensions (H, W), keeping N and C. Delegates to RunGenericTest.
void RunTest4D_KeepDim01_ReduceDim23(ck_tile::index_t N,
                                     ck_tile::index_t C,
                                     ck_tile::index_t H,
                                     ck_tile::index_t W)
{
    constexpr auto kept_dims   = ck_tile::sequence<0, 1>{};
    constexpr auto reduce_dims = ck_tile::sequence<2, 3>{};

    // Packed row-major NCHW input layout.
    std::vector<ck_tile::index_t> in_lengths{N, C, H, W};
    std::vector<ck_tile::index_t> in_strides{C * H * W, H * W, W, 1};

    // N and C survive; output is a packed row-major 2D tensor [N, C].
    std::vector<ck_tile::index_t> out_lengths{N, C};
    std::vector<ck_tile::index_t> out_strides{C, 1};

    // Element counts: product of kept dims and product of reduced dims.
    ck_tile::index_t kept_elements    = N * C;
    ck_tile::index_t reduced_elements = H * W;

    RunGenericTest<4>(in_lengths,
                      in_strides,
                      out_lengths,
                      out_strides,
                      kept_elements,
                      reduced_elements,
                      kept_dims,
                      reduce_dims);
}
|
||||
|
||||
/// NHWC-style reduction: reduce a packed row-major 4D tensor [N, H, W, C] over
/// its spatial dimensions (H, W), keeping N and C. Delegates to RunGenericTest.
void RunTest4D_KeepDim03_ReduceDim12(ck_tile::index_t N,
                                     ck_tile::index_t H,
                                     ck_tile::index_t W,
                                     ck_tile::index_t C)
{
    constexpr auto kept_dims   = ck_tile::sequence<0, 3>{};
    constexpr auto reduce_dims = ck_tile::sequence<1, 2>{};

    // Packed row-major NHWC input layout.
    std::vector<ck_tile::index_t> in_lengths{N, H, W, C};
    std::vector<ck_tile::index_t> in_strides{H * W * C, W * C, C, 1};

    // N and C survive; output is a packed row-major 2D tensor [N, C].
    std::vector<ck_tile::index_t> out_lengths{N, C};
    std::vector<ck_tile::index_t> out_strides{C, 1};

    // Element counts: product of kept dims and product of reduced dims.
    ck_tile::index_t kept_elements    = N * C;
    ck_tile::index_t reduced_elements = H * W;

    RunGenericTest<4>(in_lengths,
                      in_strides,
                      out_lengths,
                      out_strides,
                      kept_elements,
                      reduced_elements,
                      kept_dims,
                      reduce_dims);
}
|
||||
};
|
||||
Reference in New Issue
Block a user