From 033ea8069a58f2fe3feac5bd77dc0d3d75aed2ce Mon Sep 17 00:00:00 2001
From: root <jizhan@amd.com>
Date: Fri, 12 Apr 2024 04:36:34 +0000
Subject: [PATCH] add reduce_multi_d

---
 example/12_reduce/CMakeLists.txt              |   2 +-
 ...wise.cpp => reduce_threadwise_multi_d.cpp} | 143 ++++++------------
 ...hpp => reduce_threadwise_multi_d_impl.hpp} |  41 +++--
 3 files changed, 77 insertions(+), 109 deletions(-)
 rename example/12_reduce/{reduce_threadwise.cpp => reduce_threadwise_multi_d.cpp} (59%)
 rename example/12_reduce/{reduce_threadwise_impl.hpp => reduce_threadwise_multi_d_impl.hpp} (89%)
diff --git a/example/12_reduce/CMakeLists.txt b/example/12_reduce/CMakeLists.txt
index 968ef4c6eb..03381a449f 100644
--- a/example/12_reduce/CMakeLists.txt
+++ b/example/12_reduce/CMakeLists.txt
@@ -1,4 +1,4 @@
 add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
-add_example_executable(example_reduce_threadwise reduce_threadwise.cpp)
+add_example_executable(example_reduce_threadwise_multi_d reduce_threadwise_multi_d.cpp)
 add_example_executable(example_reduce_multiblock_atomic_add reduce_multiblock_atomic_add.cpp)
 add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp)
diff --git a/example/12_reduce/reduce_threadwise.cpp b/example/12_reduce/reduce_threadwise_multi_d.cpp
similarity index 59%
rename from example/12_reduce/reduce_threadwise.cpp
rename to example/12_reduce/reduce_threadwise_multi_d.cpp
index 6cd7ad6c1a..3d031f298e 100644
--- a/example/12_reduce/reduce_threadwise.cpp
+++ b/example/12_reduce/reduce_threadwise_multi_d.cpp
@@ -7,7 +7,7 @@
 #include <getopt.h>
 
 #include "ck/utility/reduction_enums.hpp"
-#include "reduce_threadwise_impl.hpp"
+#include "reduce_threadwise_multi_d_impl.hpp"
 #include "reduce_example_common.hpp"
 
 using namespace ck;
@@ -25,7 +25,7 @@ class SimpleAppArgs
 
     public:
     std::vector<size_t> inLengths = {16, 64, 32, 16};
-    std::vector<int> reduceDims   = {0, 1, 2};
+    std::vector<int> reduceDims   = {0};
     std::vector<float> scales     = {1.0f, 0.0f};
 
     bool do_verification = true;
@@ -118,13 +118,13 @@ template <typename InOutDataType,
           ReduceTensorOp ReduceOpId,
           index_t PropagateNan,
           index_t OutputIndex>
-bool reduce_threadwise_test(bool do_verification,
-                            int init_method,
-                            bool time_kernel,
-                            const std::vector<size_t>& inLengths,
-                            const std::vector<int>& reduceDims,
-                            float alpha,
-                            float beta)
+bool reduce_threadwise_multi_d_test(bool do_verification,
+                                    int init_method,
+                                    bool time_kernel,
+                                    const std::vector<size_t>& inLengths,
+                                    const std::vector<int>& reduceDims,
+                                    float alpha,
+                                    float beta)
 {
     bool matched = false;
     int result   = 0;
@@ -144,13 +144,13 @@ bool reduce_threadwise_test(bool do_verification,
 
         ck::ranges::copy(reduceDims, arrReduceDims.begin());
 
-        result = reduce_threadwise_impl<InOutDataType,
-                                        AccDataType,
-                                        ReduceOpId,
-                                        ShapeType::Rank_,
-                                        ShapeType::NumReduceDim_,
-                                        PropagateNan,
-                                        OutputIndex>(
+        result = reduce_threadwise_multi_d_impl<InOutDataType,
+                                                AccDataType,
+                                                ReduceOpId,
+                                                ShapeType::Rank_,
+                                                ShapeType::NumReduceDim_,
+                                                PropagateNan,
+                                                OutputIndex>(
             do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta);
 
         matched = true;
@@ -176,96 +176,53 @@ int main(int argc, char* argv[])
 
         if(arg.data_type == 0)
         {
-            pass = reduce_threadwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
-                arg.do_verification,
-                arg.init_method,
-                arg.time_kernel,
-                arg.inLengths,
-                arg.reduceDims,
-                arg.scales[0],
-                arg.scales[1]);
+            pass = reduce_threadwise_multi_d_test<ck::half_t,
+                                                  float,
+                                                  ReduceOpId,
+                                                  PropagateNan,
+                                                  OutputIndex>(arg.do_verification,
+                                                               arg.init_method,
+                                                               arg.time_kernel,
+                                                               arg.inLengths,
+                                                               arg.reduceDims,
+                                                               arg.scales[0],
+                                                               arg.scales[1]);
         }
         else if(arg.data_type == 1)
         {
-            pass = reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
-                arg.do_verification,
-                arg.init_method,
-                arg.time_kernel,
-                arg.inLengths,
-                arg.reduceDims,
-                arg.scales[0],
-                arg.scales[1]);
+            pass =
+                reduce_threadwise_multi_d_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+                    arg.do_verification,
+                    arg.init_method,
+                    arg.time_kernel,
+                    arg.inLengths,
+                    arg.reduceDims,
+                    arg.scales[0],
+                    arg.scales[1]);
         }
-#if 0
-        else if(arg.data_type == 3)
-        {
-            pass = reduce_threadwise_test<int8_t, float, ReduceOpId, PropagateNan, OutputIndex>(
-                arg.do_verification,
-                arg.init_method,
-                arg.time_kernel,
-                arg.inLengths,
-                arg.reduceDims,
-                arg.scales[0],
-                arg.scales[1]);
-        }
-        else if(arg.data_type == 5)
-        {
-            pass = reduce_threadwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
-                arg.do_verification,
-                arg.init_method,
-                arg.time_kernel,
-                arg.inLengths,
-                arg.reduceDims,
-                arg.scales[0],
-                arg.scales[1]);
-        }
-        else if(arg.data_type == 6)
-        {
-            pass = reduce_threadwise_test<double, double, ReduceOpId, PropagateNan, OutputIndex>(
-                arg.do_verification,
-                arg.init_method,
-                arg.time_kernel,
-                arg.inLengths,
-                arg.reduceDims,
-                arg.scales[0],
-                arg.scales[1]);
-        }
-#endif
     }
     else
     {
         // for testing half_t
-        pass = pass &&
-               reduce_threadwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
-                   true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
+        pass = pass && reduce_threadwise_multi_d_test<ck::half_t,
+                                                      float,
+                                                      ReduceOpId,
+                                                      PropagateNan,
+                                                      OutputIndex>(
+                           true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
 
         // for testing float
-        pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
-                           true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
-
-        // for testing double
-        pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
-                           true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
-
-        // for testing bhalf_t
         pass = pass &&
-               reduce_threadwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+               reduce_threadwise_multi_d_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
                    true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
 
-#if 0
-        // for testing int8_t
-        pass =
-            pass && reduce_threadwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
-                        true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
-
-        // for testing 3D input
-        pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
-                           true, 2, true, {16, 64, 960}, {0}, 1.0f, 0.0f);
-
-        // for testing 5D input
-        pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
-                           true, 2, true, {16, 64, 32, 2, 960}, {0}, 1.0f, 0.0f);
-#endif
+        // for testing bhalf_t
+        pass = pass && reduce_threadwise_multi_d_test<ck::bhalf_t,
+                                                      float,
+                                                      ReduceOpId,
+                                                      PropagateNan,
+                                                      OutputIndex>(
+                           true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
     }
 
     return (pass ? 0 : 1);
diff --git a/example/12_reduce/reduce_threadwise_impl.hpp b/example/12_reduce/reduce_threadwise_multi_d_impl.hpp
similarity index 89%
rename from example/12_reduce/reduce_threadwise_impl.hpp
rename to example/12_reduce/reduce_threadwise_multi_d_impl.hpp
index 631d0535a4..457a2a637d 100644
--- a/example/12_reduce/reduce_threadwise_impl.hpp
+++ b/example/12_reduce/reduce_threadwise_multi_d_impl.hpp
@@ -8,7 +8,6 @@
 #include "ck/ck.hpp"
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
-//#include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
 
@@ -28,13 +27,13 @@ template <typename InOutDataType,
           ck::index_t NumReduceDim,
           bool PropagateNan,
           bool OutputIndex>
-int reduce_threadwise_impl(bool do_verification,
-                           int init_method,
-                           bool time_kernel,
-                           const std::vector<size_t>& inLengths,
-                           const std::array<int, NumReduceDim>& reduceDims,
-                           float alpha,
-                           float beta)
+int reduce_threadwise_multi_d_impl(bool do_verification,
+                                   int init_method,
+                                   bool time_kernel,
+                                   const std::vector<size_t>& inLengths,
+                                   const std::array<int, NumReduceDim>& reduceDims,
+                                   float alpha,
+                                   float beta)
 
 {
     using namespace ck;
@@ -90,17 +89,17 @@ int reduce_threadwise_impl(bool do_verification,
     };
 
     using PassThrough = tensor_operation::element_wise::PassThrough;
-    // using Add         = tensor_operation::element_wise::Add;
+    using Add         = tensor_operation::element_wise::Add;
 
     using ReduceOperation         = typename reduce_binary_operator<ReduceOpId>::opType;
     using InElementwiseOperation  = PassThrough;
-    using OutElementwiseOperation = PassThrough;
+    using OutElementwiseOperation = Add;
 
     using InOutDataTypeInDevice = InOutDataType;
 
     using DeviceReduceInstance =
         ck::tensor_operation::device::DeviceReduceThreadWiseMultiD<InOutDataTypeInDevice,
-                                                                   ck::Tuple<>,
+                                                                   ck::Tuple<InOutDataTypeInDevice>,
                                                                    AccDataType,
                                                                    InOutDataTypeInDevice,
                                                                    Rank,
@@ -129,6 +128,9 @@ int reduce_threadwise_impl(bool do_verification,
 
     Tensor<InOutDataType> out_ref(outLengths);
     Tensor<InOutDataType> out(outLengths);
+
+    Tensor<InOutDataType> d0(outLengths);
+
     Tensor<int> out_indices_ref(outLengths);
     Tensor<int> out_indices(outLengths);
 
@@ -147,16 +149,19 @@ int reduce_threadwise_impl(bool do_verification,
         case 0: break;
         case 1:
             in.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
+            d0.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
             if(beta != 0.0f)
                 out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
             break;
         case 2:
             in.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
+            d0.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
             if(beta != 0.0f)
                 out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
             break;
         default:
             in.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
+            d0.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
             if(beta != 0.0f)
                 out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0},
                                             num_thread);
@@ -169,13 +174,14 @@ int reduce_threadwise_impl(bool do_verification,
 
     // these buffers are usually provided by the user application
     DeviceMem in_dev(sizeof(InOutDataTypeInDevice) * in.mDesc.GetElementSpaceSize());
+    DeviceMem d0_dev(sizeof(InOutDataTypeInDevice) * d0.mDesc.GetElementSpaceSize());
     DeviceMem out_dev(sizeof(InOutDataTypeInDevice) * out.mDesc.GetElementSpaceSize());
 
     in_dev.ToDevice(in.mData.data());
+    d0_dev.ToDevice(d0.mData.data());
 
     if(beta != 0.0f)
     {
-
         out_dev.ToDevice(out.mData.data());
     };
 
@@ -188,11 +194,13 @@ int reduce_threadwise_impl(bool do_verification,
 
     std::array<index_t, Rank> arrInLengths;
     std::array<index_t, Rank> arrInStrides;
+
     std::array<index_t, NumOutDim> arrOutLengths;
     std::array<index_t, NumOutDim> arrOutStrides;
 
     ck::ranges::copy(inLengths, arrInLengths.begin());
     ck::ranges::copy(inStrides, arrInStrides.begin());
+
     ck::ranges::copy(outLengths, arrOutLengths.begin());
     ck::ranges::copy(outStrides, arrOutStrides.begin());
 
@@ -236,19 +244,22 @@ int reduce_threadwise_impl(bool do_verification,
         auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
 
         invoker_ptr_ref->Run(argument_ptr_ref.get());
+
+        for(std::size_t i = 0; i < out_ref.GetElementSize(); i++)
+            out_elementwise_op(out_ref.mData[i], out_ref.mData[i], d0.mData[i]);
     };
 
     auto reduce = DeviceReduceInstance{};
 
     auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths,
                                                    arrInStrides,
-                                                   {},
-                                                   {},
+                                                   {arrOutLengths},
+                                                   {arrOutStrides},
                                                    arrOutLengths,
                                                    arrOutStrides,
                                                    reduceDims,
                                                    in_dev.GetDeviceBuffer(),
-                                                   {},
+                                                   {d0_dev.GetDeviceBuffer()},
                                                    out_dev.GetDeviceBuffer(),
                                                    in_elementwise_op,
                                                    out_elementwise_op);