diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index 9a736d4cfa..309100cdde 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -255,34 +255,61 @@ int main(int argc, char* argv[]) else { // for testing half_t + pass = + pass && reduce_blockwise_test( + true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f); pass = pass && reduce_blockwise_test( true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); // for testing float + pass = + pass && reduce_blockwise_test( + true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f); + pass = pass && reduce_blockwise_test( true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); // for testing double + pass = + pass && reduce_blockwise_test( + true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f); + pass = pass && reduce_blockwise_test( true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); // for testing bhalf_t + pass = pass && + reduce_blockwise_test( + true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f); + pass = pass && reduce_blockwise_test( true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); // for testing int8_t + pass = + pass && reduce_blockwise_test( + true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f); + pass = pass && reduce_blockwise_test( true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 // for testing int4_t using AVG operation + pass = + pass && reduce_blockwise_test( + true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f); + pass = pass && reduce_blockwise_test( true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); // for testing int4_t using MAX operation + pass = + pass && reduce_blockwise_test( + true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f); + pass = pass && reduce_blockwise_test( true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); #endif diff --git a/example/12_reduce/reduce_example_common.hpp b/example/12_reduce/reduce_example_common.hpp index 5f9a48804a..08cd6e7ff9 100644 --- a/example/12_reduce/reduce_example_common.hpp +++ b/example/12_reduce/reduce_example_common.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -38,7 +38,8 @@ struct ReduceShape static constexpr ck::index_t NumReduceDim_ = NumReduceDim; }; -using reduce_shape_instances = std::tuple, +using reduce_shape_instances = std::tuple, + ReduceShape<3, 1>, ReduceShape<3, 2>, ReduceShape<4, 1>, ReduceShape<4, 2>, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp index 2481c5c769..67956d9f3f 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -19,7 +19,7 @@ namespace device { template std::pair get_2d_lengths(const std::vector& inLengths) { - static_assert(Rank <= 6, "bigger Rank size not supported!"); + static_assert(Rank <= 12, "bigger Rank size not supported!"); long_index_t invariant_total_length = 1; long_index_t reduce_total_length = 1; @@ -38,7 +38,7 @@ std::pair get_2d_lengths(const std::vector& template std::pair get_2d_lengths(const std::array& inLengths) { - static_assert(Rank <= 6, "bigger Rank size not supported!"); + static_assert(Rank <= 12, "bigger Rank size not supported!"); long_index_t invariant_total_length = 1; long_index_t reduce_total_length = 1; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp index bf3deeb57a..b4873e3403 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -51,7 +51,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce { - static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(Rank <= 12, "Bigger Rank size is not supported!"); static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, "Invalid thread cluster size assignments!"); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp index 609c4c2f5b..8291575fb8 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -47,7 +47,7 @@ struct DeviceReduceThreadWise : public DeviceReduce { - static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(Rank <= 12, "Bigger Rank size is not supported!"); static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp index 75abb4d2e4..764b9312f3 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp @@ -45,7 +45,7 @@ struct DeviceReduceThreadWiseMultiD : public DeviceReduceMultiD { - static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(Rank <= 12, "Bigger Rank size is not supported!"); static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&