From 7a4690b0779c4e8705a7e93d5839b90d39b43dfa Mon Sep 17 00:00:00 2001 From: Mateusz Ozga <110818320+mozga-amd@users.noreply.github.com> Date: Tue, 13 Aug 2024 16:15:47 +0200 Subject: [PATCH] Support large: 12d tensor size for reduction kenrel (#1465) [ROCm/composable_kernel commit: 0606e5498e7aa085a91c083d9c49794d30d371dc] --- example/12_reduce/reduce_blockwise.cpp | 29 ++++++++++++++++++- example/12_reduce/reduce_example_common.hpp | 5 ++-- .../gpu/device/impl/device_reduce_common.hpp | 6 ++-- .../device/impl/device_reduce_multiblock.hpp | 4 +-- .../device/impl/device_reduce_threadwise.hpp | 4 +-- .../impl/device_reduce_threadwise_multi_d.hpp | 2 +- 6 files changed, 39 insertions(+), 11 deletions(-) diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index 9a736d4cfa..309100cdde 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -255,34 +255,61 @@ int main(int argc, char* argv[]) else { // for testing half_t + pass = + pass && reduce_blockwise_test( + true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f); pass = pass && reduce_blockwise_test( true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); // for testing float + pass = + pass && reduce_blockwise_test( + true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f); + pass = pass && reduce_blockwise_test( true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); // for testing double + pass = + pass && reduce_blockwise_test( + true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f); + pass = pass && reduce_blockwise_test( true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); // for testing bhalf_t + pass = pass && + reduce_blockwise_test( + true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f); + pass = pass && reduce_blockwise_test( true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); // for testing int8_t + pass = + pass && reduce_blockwise_test( + true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f); + pass = pass && reduce_blockwise_test( true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 // for testing int4_t using AVG operation + pass = + pass && reduce_blockwise_test( + true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f); + pass = pass && reduce_blockwise_test( true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); // for testing int4_t using MAX operation + pass = + pass && reduce_blockwise_test( + true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f); + pass = pass && reduce_blockwise_test( true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); #endif diff --git a/example/12_reduce/reduce_example_common.hpp b/example/12_reduce/reduce_example_common.hpp index 5f9a48804a..08cd6e7ff9 100644 --- a/example/12_reduce/reduce_example_common.hpp +++ b/example/12_reduce/reduce_example_common.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -38,7 +38,8 @@ struct ReduceShape static constexpr ck::index_t NumReduceDim_ = NumReduceDim; }; -using reduce_shape_instances = std::tuple, +using reduce_shape_instances = std::tuple, + ReduceShape<3, 1>, ReduceShape<3, 2>, ReduceShape<4, 1>, ReduceShape<4, 2>, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp index 2481c5c769..67956d9f3f 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -19,7 +19,7 @@ namespace device { template std::pair get_2d_lengths(const std::vector& inLengths) { - static_assert(Rank <= 6, "bigger Rank size not supported!"); + static_assert(Rank <= 12, "bigger Rank size not supported!"); long_index_t invariant_total_length = 1; long_index_t reduce_total_length = 1; @@ -38,7 +38,7 @@ std::pair get_2d_lengths(const std::vector& template std::pair get_2d_lengths(const std::array& inLengths) { - static_assert(Rank <= 6, "bigger Rank size not supported!"); + static_assert(Rank <= 12, "bigger Rank size not supported!"); long_index_t invariant_total_length = 1; long_index_t reduce_total_length = 1; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp index bf3deeb57a..b4873e3403 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -51,7 +51,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce { - static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(Rank <= 12, "Bigger Rank size is not supported!"); static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, "Invalid thread cluster size assignments!"); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp index 609c4c2f5b..8291575fb8 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -47,7 +47,7 @@ struct DeviceReduceThreadWise : public DeviceReduce { - static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(Rank <= 12, "Bigger Rank size is not supported!"); static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp index 75abb4d2e4..764b9312f3 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp @@ -45,7 +45,7 @@ struct DeviceReduceThreadWiseMultiD : public DeviceReduceMultiD { - static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(Rank <= 12, "Bigger Rank size is not supported!"); static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&