From 7a4690b0779c4e8705a7e93d5839b90d39b43dfa Mon Sep 17 00:00:00 2001
From: Mateusz Ozga <110818320+mozga-amd@users.noreply.github.com>
Date: Tue, 13 Aug 2024 16:15:47 +0200
Subject: [PATCH] Support large: 12d tensor size for reduction kenrel (#1465)

[ROCm/composable_kernel commit: 0606e5498e7aa085a91c083d9c49794d30d371dc]
---
 example/12_reduce/reduce_blockwise.cpp        | 29 ++++++++++++++++++-
 example/12_reduce/reduce_example_common.hpp   |  5 ++--
 .../gpu/device/impl/device_reduce_common.hpp  |  6 ++--
 .../device/impl/device_reduce_multiblock.hpp  |  4 +--
 .../device/impl/device_reduce_threadwise.hpp  |  4 +--
 .../impl/device_reduce_threadwise_multi_d.hpp |  2 +-
 6 files changed, 39 insertions(+), 11 deletions(-)
diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp
index 9a736d4cfa..309100cdde 100644
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <initializer_list>
@@ -255,34 +255,61 @@ int main(int argc, char* argv[])
     else
     {
         // for testing half_t
+        pass =
+            pass && reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
         pass =
             pass && reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
                         true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
 
         // for testing float
+        pass =
+            pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
+
         pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
                            true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
 
         // for testing double
+        pass =
+            pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
+
         pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
                            true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
 
         // for testing bhalf_t
+        pass = pass &&
+               reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+                   true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
+
         pass = pass &&
                reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
                    true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
 
         // for testing int8_t
+        pass =
+            pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
+
         pass =
             pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
                         true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
 
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
         // for testing int4_t using AVG operation
+        pass =
+            pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
+
         pass = pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
                            true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
 
         // for testing int4_t using MAX operation
+        pass =
+            pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
+
         pass = pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
                            true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
 #endif
diff --git a/example/12_reduce/reduce_example_common.hpp b/example/12_reduce/reduce_example_common.hpp
index 5f9a48804a..08cd6e7ff9 100644
--- a/example/12_reduce/reduce_example_common.hpp
+++ b/example/12_reduce/reduce_example_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -38,7 +38,8 @@ struct ReduceShape
     static constexpr ck::index_t NumReduceDim_ = NumReduceDim;
 };
 
-using reduce_shape_instances = std::tuple<ReduceShape<3, 1>,
+using reduce_shape_instances = std::tuple<ReduceShape<12, 3>,
+                                          ReduceShape<3, 1>,
                                           ReduceShape<3, 2>,
                                           ReduceShape<4, 1>,
                                           ReduceShape<4, 2>,
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp
index 2481c5c769..67956d9f3f 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -19,7 +19,7 @@ namespace device {
 template <index_t Rank, int NumReduceDim>
 std::pair<long_index_t, long_index_t> get_2d_lengths(const std::vector<index_t>& inLengths)
 {
-    static_assert(Rank <= 6, "bigger Rank size not supported!");
+    static_assert(Rank <= 12, "bigger Rank size not supported!");
 
     long_index_t invariant_total_length = 1;
     long_index_t reduce_total_length    = 1;
@@ -38,7 +38,7 @@ std::pair<long_index_t, long_index_t> get_2d_lengths(const std::vector<index_t>&
 template <index_t Rank, int NumReduceDim>
 std::pair<long_index_t, long_index_t> get_2d_lengths(const std::array<index_t, Rank>& inLengths)
 {
-    static_assert(Rank <= 6, "bigger Rank size not supported!");
+    static_assert(Rank <= 12, "bigger Rank size not supported!");
 
     long_index_t invariant_total_length = 1;
     long_index_t reduce_total_length    = 1;
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
index bf3deeb57a..b4873e3403 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -51,7 +51,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InDataType,
                                                     PropagateNan,
                                                     OutputIndex>
 {
-    static_assert(Rank <= 6, "Bigger Rank size is not supported!");
+    static_assert(Rank <= 12, "Bigger Rank size is not supported!");
     static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                   "Invalid thread cluster size assignments!");
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
index 609c4c2f5b..8291575fb8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -47,7 +47,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InDataType,
                                                     OutputIndex>
 
 {
-    static_assert(Rank <= 6, "Bigger Rank size is not supported!");
+    static_assert(Rank <= 12, "Bigger Rank size is not supported!");
 
     static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
                    (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp
index 75abb4d2e4..764b9312f3 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp
@@ -45,7 +45,7 @@ struct DeviceReduceThreadWiseMultiD : public DeviceReduceMultiD<InDataType,
                                                                 OutElementwiseOperation>
 
 {
-    static_assert(Rank <= 6, "Bigger Rank size is not supported!");
+    static_assert(Rank <= 12, "Bigger Rank size is not supported!");
 
     static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
                    (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&