From 2e7ce3bb6bf0f5f2b9e76bbb7bcb5aa93feb2f96 Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Wed, 25 May 2022 01:19:12 +0800 Subject: [PATCH] Overhaul of Reduction and its dependents (#237) * Tiny fix in dynamic_buffer.hpp to support vectorized AtomicAdd for double type * Update to host layer and host reduction * Merge and remove reduction kernels * Merge and remove reduction device interfaces and update pooling device interface * Merge and remove useless reduction device instances * Update to reduction profiler and reduction ctests * Update to reduction and pooling examples and add one reduction example * Change reduction examples to make them testable by ctest * Add explicit pass checking for reduction and pooling examples * Explicit assignment of tensor shapes in example reduce_blockwise_two_call * Use atomic_add to replace atomicAdd and add atomic_add for double type * Add reduce ctest support for double data type * Replace to_int_vector() by using C++ std::vector::assign() * Keep DeviceReduceThreadWise separated from DeviceReduceBlockWise * Merge DeviceReduceBlockWise and DeviceReduceMultiBlockAtomicAdd into DeviceReduceMultiBlock * Add GetAtomicOperationZeroValue() support for AtomicMax * Tiny change to reduce example README.md * Fix some tiny issues due to branch merging * Revert previous change in dynamic_buffer.hpp and add atomic_add for double2_t * Add reduce multiblock_atomic_add instances for fp64 to verify vectorized atomic_add on fp64 * Renaming * Clean up the header includes in device_reduce instance header files [ROCm/composable_kernel commit: 63eee2d9991b08ca286f6895dd8f90da12a62da3] --- example/12_reduce/CMakeLists.txt | 3 +- example/12_reduce/README.md | 41 +- example/12_reduce/reduce_blockwise.cpp | 188 ++-- .../12_reduce/reduce_blockwise_two_call.cpp | 290 ++++++ example/13_pool2d_fwd/README.md | 10 +- example/13_pool2d_fwd/pool2d_fwd.cpp | 114 ++- .../device/device_pool2d_fwd_nhwc_nhwc.hpp | 45 +- .../gpu/device/device_reduce.hpp | 29 +- .../gpu/device/device_reduce_blockwise.hpp | 374 -------- .../device_reduce_blockwise_second_call.hpp | 328 ------- .../gpu/device/device_reduce_common.hpp | 18 +- ...c_add.hpp => device_reduce_multiblock.hpp} | 311 +++--- ...evice_reduce_multiblock_partial_reduce.hpp | 440 --------- .../gpu/device/device_reduce_threadwise.hpp | 145 ++- .../grid/gridwise_2d_reduction_blockwise.hpp | 886 ------------------ .../grid/gridwise_2d_reduction_multiblock.hpp | 638 +++++++++++++ ...ise_2d_reduction_multiblock_atomic_add.hpp | 269 ------ ...2d_reduction_multiblock_partial_reduce.hpp | 487 ---------- .../grid/gridwise_2d_reduction_threadwise.hpp | 371 ++++---- include/ck/utility/dynamic_buffer.hpp | 2 +- .../utility/generic_memory_space_atomic.hpp | 23 + include/ck/utility/reduction_operator.hpp | 58 +- .../library/host_tensor/host_common_util.hpp | 102 ++ .../library/host_tensor/host_reduce_util.hpp | 26 +- .../ck/library/host_tensor/host_reduction.hpp | 18 +- .../gpu/reduce/device_reduce_instance.hpp | 17 +- .../device_reduce_instance_blockwise.hpp | 148 +-- ..._reduce_instance_blockwise_b16_f32_b16.hpp | 3 +- ..._reduce_instance_blockwise_f16_f16_f16.hpp | 3 +- ..._reduce_instance_blockwise_f16_f32_f16.hpp | 3 +- ..._reduce_instance_blockwise_f32_f32_f32.hpp | 2 - ..._reduce_instance_blockwise_f32_f64_f32.hpp | 2 - ..._reduce_instance_blockwise_f64_f64_f64.hpp | 2 - ...ce_reduce_instance_blockwise_i8_i32_i8.hpp | 2 - ...ice_reduce_instance_blockwise_i8_i8_i8.hpp | 2 - ..._reduce_instance_blockwise_second_call.hpp | 165 ----
...ance_blockwise_second_call_f16_f16_f16.hpp | 47 - ...ance_blockwise_second_call_f32_f32_b16.hpp | 60 -- ...ance_blockwise_second_call_f32_f32_f16.hpp | 35 - ...ance_blockwise_second_call_f32_f32_f32.hpp | 59 -- ...ance_blockwise_second_call_f64_f64_f32.hpp | 35 - ...ance_blockwise_second_call_f64_f64_f64.hpp | 59 -- ...tance_blockwise_second_call_i32_i32_i8.hpp | 31 - ...nstance_blockwise_second_call_i8_i8_i8.hpp | 47 - .../device_reduce_instance_impl_common.hpp | 14 - ..._reduce_instance_multiblock_atomic_add.hpp | 123 +-- ...ance_multiblock_atomic_add_b16_f32_f32.hpp | 3 +- ...ance_multiblock_atomic_add_f16_f32_f32.hpp | 3 +- ...ance_multiblock_atomic_add_f32_f32_f32.hpp | 2 - ...ance_multiblock_atomic_add_f32_f64_f32.hpp | 2 - ...ance_multiblock_atomic_add_f64_f64_f64.hpp | 29 + ...uce_instance_multiblock_partial_reduce.hpp | 174 ---- ..._multiblock_partial_reduce_b16_f32_b16.hpp | 60 -- ..._multiblock_partial_reduce_f16_f16_f16.hpp | 47 - ..._multiblock_partial_reduce_f16_f32_f16.hpp | 35 - ..._multiblock_partial_reduce_f32_f32_f32.hpp | 52 - ..._multiblock_partial_reduce_f32_f64_f32.hpp | 27 - ..._multiblock_partial_reduce_f64_f64_f64.hpp | 62 -- ...ce_multiblock_partial_reduce_i8_i32_i8.hpp | 31 - ...nce_multiblock_partial_reduce_i8_i8_i8.hpp | 47 - .../device_reduce_instance_threadwise.hpp | 75 +- ...reduce_instance_threadwise_b16_f32_b16.hpp | 3 +- ...reduce_instance_threadwise_f16_f16_f16.hpp | 3 +- ...reduce_instance_threadwise_f16_f32_f16.hpp | 3 +- ...reduce_instance_threadwise_f32_f32_f32.hpp | 2 - ...reduce_instance_threadwise_f32_f64_f32.hpp | 2 - ...reduce_instance_threadwise_f64_f64_f64.hpp | 2 - ...e_reduce_instance_threadwise_i8_i32_i8.hpp | 2 - ...ce_reduce_instance_threadwise_i8_i8_i8.hpp | 2 - .../gpu/reduce/CMakeLists.txt | 17 +- ...ance_blockwise_second_call_f16_f16_f16.cpp | 40 - ...ance_blockwise_second_call_f32_f32_b16.cpp | 53 -- ...ance_blockwise_second_call_f32_f32_f16.cpp | 28 - ...ance_blockwise_second_call_f32_f32_f32.cpp | 52 - ...ance_blockwise_second_call_f64_f64_f32.cpp | 28 - ...ance_blockwise_second_call_f64_f64_f64.cpp | 52 - ...tance_blockwise_second_call_i32_i32_i8.cpp | 24 - ...nstance_blockwise_second_call_i8_i8_i8.cpp | 40 - ...ance_multiblock_atomic_add_f64_f64_f64.cpp | 24 + ..._multiblock_partial_reduce_b16_f32_b16.cpp | 53 -- ..._multiblock_partial_reduce_f16_f16_f16.cpp | 40 - ..._multiblock_partial_reduce_f16_f32_f16.cpp | 28 - ..._multiblock_partial_reduce_f32_f32_f32.cpp | 45 - ..._multiblock_partial_reduce_f32_f64_f32.cpp | 20 - ..._multiblock_partial_reduce_f64_f64_f64.cpp | 55 -- ...ce_multiblock_partial_reduce_i8_i32_i8.cpp | 24 - ...nce_multiblock_partial_reduce_i8_i8_i8.cpp | 40 - profiler/include/profile_reduce_impl.hpp | 422 +++------ profiler/src/profile_reduce.cpp | 218 ++--- script/test_reduce_no_index.sh | 11 + script/test_reduce_with_index.sh | 11 + test/reduce/reduce_no_index.cpp | 561 ++--------- test/reduce/reduce_util.hpp | 19 - test/reduce/reduce_with_index.cpp | 566 ++--------- 94 files changed, 2429 insertions(+), 6785 deletions(-) create mode 100644 example/12_reduce/reduce_blockwise_two_call.cpp delete mode 100644 include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp delete mode 100644 include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp rename include/ck/tensor_operation/gpu/device/{device_reduce_multiblock_atomic_add.hpp => device_reduce_multiblock.hpp} (58%) delete mode 100644 include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp delete mode 
100644 include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp create mode 100644 library/include/ck/library/host_tensor/host_common_util.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp delete mode 100644 
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp delete mode 100644 test/reduce/reduce_util.hpp diff --git a/example/12_reduce/CMakeLists.txt b/example/12_reduce/CMakeLists.txt index d6866abeb8..9045a78a85 100644 --- a/example/12_reduce/CMakeLists.txt +++ b/example/12_reduce/CMakeLists.txt @@ -1 +1,2 @@ -add_example_executable(example_reduce_blockwise reduce_blockwise.cpp -D 16,64,32,960 -v 1 1 10) +add_example_executable(example_reduce_blockwise reduce_blockwise.cpp) +add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp) diff --git a/example/12_reduce/README.md b/example/12_reduce/README.md index 6fd3b3dcf3..a6442984e7 100644 --- a/example/12_reduce/README.md +++ b/example/12_reduce/README.md @@ -5,23 +5,38 @@ # -D : input 4-d tensor lengths # -v : verification (0=no, 1=yes) #arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) -#arg2: run kernel # of times (>1) -./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10 +#arg2: time kernel (0=no, 1=yes) +./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1 ``` Result ``` +./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1 launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} -Warm up -Start running 3 times... -Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1> -error: 0 -max_diff: 0, 529, 529 -root@dc-smc-18:/data/composable_kernel/Build3# bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10 -launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} -Warm up +Warm up 1 time Start running 10 times... 
-Perf: 0.23392 ms, 268.966 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1> -error: 0 -max_diff: 0, 528, 528 +Perf: 0.282592 ms, 222.641 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1> ``` + +# Instructions for ```example_reduce_blockwise_two_call``` + +## Run ```example_reduce_blockwise_two_call``` +```bash +#arg1: verification (0=no, 1=yes) +#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg3: time kernel (0=no, 1=yes) +./bin/example_reduce_blockwise_two_call 1 2 1 +``` + +Result +``` +./bin/example_reduce_blockwise_two_call 1 2 1 +launch_and_time_kernel: grid_dim {204800, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +launch_and_time_kernel: grid_dim {6400, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 2.1791 ms, 771.42 GB/s, DeviceReduceBlockWise<256,M_C32_S1,K_C8_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> => DeviceReduceBlockWise<256,M_C256_S1,K_C1_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> +``` + diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index b2d312ae8c..e1e3afc58a 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -12,8 +12,8 @@ #include "host_tensor_generator.hpp" #include "device_tensor.hpp" #include "device_base.hpp" -#include "device_reduce_blockwise.hpp" -#include "host_reduce_util.hpp" +#include "device_reduce_multiblock.hpp" +#include "host_common_util.hpp" #include "host_reduction.hpp" #include "reduction_enums.hpp" @@ -30,9 +30,8 @@ constexpr int Rank = 4; constexpr int NumReduceDim = 3; constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2; -constexpr NanPropagation NanOpt = NanPropagation::PROPAGATE_NAN; -constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ?
false : true; -constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES; +constexpr bool PropagateNan = true; +constexpr bool OutputIndex = false; using ReduceOperation = typename reduce_binary_operator::opType; using InElementwiseOperation = @@ -40,85 +39,44 @@ using InElementwiseOperation = using AccElementwiseOperation = typename reduce_unary_operator::AccElementwiseOperation; -using DeviceReduceInstance = DeviceReduceBlockWise; +using DeviceReduceInstance = DeviceReduceMultiBlock; static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, - {"scales", required_argument, nullptr, 'S'}, {"verify", required_argument, nullptr, 'v'}, {"help", no_argument, nullptr, '?'}, {nullptr, 0, nullptr, 0}}; class SimpleAppArgs { - template - static T getSingleValueFromString(const std::string& valueStr) - { - std::istringstream iss(valueStr); - - T ret; - - iss >> ret; - - return (ret); - }; - - template - static std::vector getTypeValuesFromString(const char* cstr_values) - { - std::string valuesStr(cstr_values); - - std::vector values; - std::size_t pos = 0; - std::size_t new_pos; - - new_pos = valuesStr.find(',', pos); - while(new_pos != std::string::npos) - { - const std::string sliceStr = valuesStr.substr(pos, new_pos - pos); - - T val = getSingleValueFromString(sliceStr); - - values.push_back(val); - - pos = new_pos + 1; - new_pos = valuesStr.find(',', pos); - }; - - std::string sliceStr = valuesStr.substr(pos); - T val = getSingleValueFromString(sliceStr); - - values.push_back(val); - - return (values); - }; - private: int option_index = 0; public: - std::vector inLengths; - std::vector scales; + std::vector inLengths = {16, 64, 32, 960}; + std::vector scales = {1.0f, 0.0f}; bool do_verification = true; int init_method = 1; - bool time_kernel = false; + bool time_kernel = true; public: void show_usage(const char* cmd) @@ -126,24 +84,24 @@ class SimpleAppArgs std::cout << "Usage of " << cmd << std::endl; std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths" << std::endl; - std::cout << "--scales or -S, comma separated two float values for alpha and beta" - << std::endl; std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by " "comparing with the host-based reduction" << std::endl; std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer " "value, 3=decimal value)" << std::endl; - std::cout << "Arg2 -- time kernel (0=n0, 1=yes)" << std::endl; + std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl; }; int processArgs(int argc, char* argv[]) { + using ck::host_common::getTypeValuesFromString; + int ch; while(1) { - ch = getopt_long(argc, argv, "D:S:v:l:", long_options, &option_index); + ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index); if(ch == -1) break; switch(ch) @@ -154,12 +112,6 @@ class SimpleAppArgs inLengths = getTypeValuesFromString(optarg); break; - case 'S': - if(!optarg) - throw std::runtime_error("Invalid option format!"); - - scales = getTypeValuesFromString(optarg); - break; case 'v': if(!optarg) throw std::runtime_error("Invalid option format!"); @@ -181,7 +133,7 @@ class SimpleAppArgs throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); init_method = std::atoi(argv[optind++]); - time_kernel = std::atoi(argv[optind]); + time_kernel = static_cast(std::atoi(argv[optind])); if(scales.empty()) { @@ -202,16 +154,16 @@ int main(int argc, char* argv[]) SimpleAppArgs args; - 
if(args.processArgs(argc, argv) < 0) - return (-1); + if(argc > 1) + { + if(args.processArgs(argc, argv) < 0) + return (-1); + }; constexpr bool op_support_indices = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool NeedIndices = - (op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES)); - // if input is half type, no reason to use float for indiced reduction operation and must use // float for non-indiced reduction operation for accuracy constexpr bool invalid_reduce_1 = @@ -225,8 +177,7 @@ int main(int argc, char* argv[]) (op_support_indices && !std::is_same::value); // indices option can only be used when it is really needed - constexpr bool invalid_reduce_3 = - (!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES); + constexpr bool invalid_reduce_3 = (!op_support_indices && OutputIndex); constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3); @@ -294,9 +245,9 @@ int main(int argc, char* argv[]) if(beta != 0.0f) out_dev.ToDevice(out.mData.data()); - size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0; + size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0; - DeviceMem out_indices_dev(indicesSizeInBytes); + DeviceMem out_index_dev(indicesSizeInBytes); if(args.do_verification) { @@ -307,38 +258,39 @@ int main(int argc, char* argv[]) Rank, NumReduceDim, PropagateNan, - NeedIndices> + OutputIndex> hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); hostReduce.Run( alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data()); }; - const auto i_inLengths = to_int_vector(args.inLengths); - const auto i_inStrides = to_int_vector(inStrides); - const auto i_outLengths = to_int_vector(outLengths); - const auto i_outStrides = to_int_vector(outStrides); + std::vector i_inLengths; + std::vector i_inStrides; + std::vector i_outLengths; + std::vector i_outStrides; + + i_inLengths.assign(args.inLengths.begin(), args.inLengths.end()); + i_inStrides.assign(inStrides.begin(), inStrides.end()); + i_outLengths.assign(outLengths.begin(), outLengths.end()); + i_outStrides.assign(outStrides.begin(), outStrides.end()); auto reduce = DeviceReduceInstance{}; - auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths, reduceDims); - - DeviceMem ws_dev(wsSizeInBytes); - - auto argument_ptr = - reduce.MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - out_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), - InElementwiseOperation{static_cast(reduce_total_length)}, - AccElementwiseOperation{static_cast(reduce_total_length)}); + auto argument_ptr = reduce.MakeArgumentPointer( + i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + nullptr, + out_dev.GetDeviceBuffer(), + out_index_dev.GetDeviceBuffer(), + InElementwiseOperation{static_cast(reduce_total_length)}, + AccElementwiseOperation{static_cast(reduce_total_length)}); if(!reduce.IsSupportedArgument(argument_ptr.get())) { @@ -362,16 +314,18 @@ int main(int argc, char* argv[]) << std::endl; bool pass = true; + if(args.do_verification) { out_dev.FromDevice(out.mData.data()); - pass &= ck::utils::check_err(out.mData, out_ref.mData); + pass = pass && ck::utils::check_err(out.mData, out_ref.mData); - if(NeedIndices) + 
if(OutputIndex) { - out_indices_dev.FromDevice(out_indices.mData.data()); - pass &= ck::utils::check_err(out_indices.mData, out_indices_ref.mData); + out_index_dev.FromDevice(out_indices.mData.data()); + pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData); }; }; - return pass ? 0 : 1; + + return (pass ? 0 : 1); } diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp new file mode 100644 index 0000000000..cd166c40fe --- /dev/null +++ b/example/12_reduce/reduce_blockwise_two_call.cpp @@ -0,0 +1,290 @@ +#include +#include +#include +#include +#include +#include + +#include "check_err.hpp" +#include "config.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "device_tensor.hpp" +#include "device_base.hpp" +#include "device_reduce_multiblock.hpp" +#include "host_common_util.hpp" +#include "host_reduction.hpp" + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" + +using namespace ck; +using namespace ck::tensor_operation::device; + +using InOutDataType = ck::half_t; +using AccDataType = float; + +constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2; +constexpr bool PropagateNan = true; +constexpr bool OutputIndex = false; + +using ReduceOperation = typename reduce_binary_operator::opType; +using InElementwiseOperation = + typename reduce_unary_operator::InElementwiseOperation; +using AccElementwiseOperation = + typename reduce_unary_operator::AccElementwiseOperation; + +using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; + +using DeviceReduceInstance_1 = DeviceReduceMultiBlock; + +using DeviceReduceInstance_2 = DeviceReduceMultiBlock; + +static bool do_verify; +static int init_method; +static float alpha; +static float beta; +static bool time_kernel; + +int main(int argc, char* argv[]) +{ + // used by the device reduction + const std::vector reduceDims_1 = {4}; + const std::vector invariantDims_1 = {0, 1, 2, 3}; + + const std::vector reduceDims_2 = {3}; + const std::vector invariantDims_2 = {0, 1, 2}; + + // used by the host reduction + const std::vector reduceDims = {3, 4}; + const std::vector invariantDims = {0, 1, 2}; + + const std::vector inLengths_1 = {64, 320, 80, 4, 128}; + + // input lengths of the second reduction, which are also the output lengths of the first + // reduction + const std::vector inLengths_2 = {64, 320, 80, 4}; + + const std::vector outLengths = {64, 320, 80}; + + using namespace ck::host_reduce; + + if(argc == 1) + { + do_verify = true; + init_method = 2; + time_kernel = true; + } + else if(argc == 4) + { + do_verify = static_cast<bool>(atoi(argv[1])); + init_method = atoi(argv[2]); + time_kernel = static_cast<bool>(atoi(argv[3])); + } + else + { + std::ostringstream ostr; + + ostr << "Wrong parameter! 
" << std::endl + << "Usage: " << argv[0] << "[verify 0/1] init_method time_kernel" << std::endl; + + throw std::runtime_error(ostr.str()); + }; + + alpha = 1.0f; + beta = 0.0f; + + Tensor in_1(inLengths_1); + + Tensor out_ref(outLengths); + Tensor in_2(inLengths_2); // also the output tensor of the first reduction + Tensor out(outLengths); + + auto inStrides_1 = in_1.mDesc.GetStrides(); + auto inStrides_2 = in_2.mDesc.GetStrides(); + auto outStrides = out.mDesc.GetStrides(); + + size_t invariant_total_length = out.mDesc.GetElementSize(); + size_t reduce_total_length = in_1.mDesc.GetElementSize() / invariant_total_length; + + std::size_t num_thread = 1; + + if(do_verify) + { + switch(init_method) + { + case 0: break; + case 1: + in_1.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + in_1.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in_1.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, + num_thread); + } + + if(beta != 0.0f) + for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++) + out.mData[i] = out_ref.mData[i]; + }; + + DeviceMem in_1_dev(sizeof(InOutDataType) * in_1.mDesc.GetElementSpace()); + DeviceMem in_2_dev(sizeof(InOutDataType) * in_2.mDesc.GetElementSpace()); + DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpace()); + + in_1_dev.ToDevice(in_1.mData.data()); + + if(beta != 0.0f) + out_dev.ToDevice(out.mData.data()); + + if(do_verify) + { + ReductionHost + hostReduce(in_1.mDesc, out_ref.mDesc, invariantDims, reduceDims); + + hostReduce.Run(alpha, in_1.mData.data(), beta, out_ref.mData.data(), nullptr); + }; + + std::vector i_inLengths_1; + std::vector i_inStrides_1; + std::vector i_inLengths_2; + std::vector i_inStrides_2; + std::vector i_outLengths; + std::vector i_outStrides; + + i_inLengths_1.assign(inLengths_1.begin(), inLengths_1.end()); + i_inStrides_1.assign(inStrides_1.begin(), inStrides_1.end()); + i_inLengths_2.assign(inLengths_2.begin(), inLengths_2.end()); + i_inStrides_2.assign(inStrides_2.begin(), inStrides_2.end()); + i_outLengths.assign(outLengths.begin(), outLengths.end()); + i_outStrides.assign(outStrides.begin(), outStrides.end()); + + auto reduce_1 = DeviceReduceInstance_1{}; + + auto argument_ptr_1 = reduce_1.MakeArgumentPointer( + i_inLengths_1, + i_inStrides_1, + i_inLengths_2, + i_inStrides_2, + reduceDims_1, + 1.0f, + 0.0f, + in_1_dev.GetDeviceBuffer(), + nullptr, + in_2_dev.GetDeviceBuffer(), + nullptr, + InElementwiseOperation{static_cast(reduce_total_length)}, + PassThroughOp{}); + + if(!reduce_1.IsSupportedArgument(argument_ptr_1.get())) + { + std::cout + << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" 
+ << std::endl; + }; + + auto invoker_ptr_1 = reduce_1.MakeInvokerPointer(); + + auto reduce_2 = DeviceReduceInstance_2{}; + + auto argument_ptr_2 = reduce_2.MakeArgumentPointer( + i_inLengths_2, + i_inStrides_2, + i_outLengths, + i_outStrides, + reduceDims_2, + alpha, + beta, + in_2_dev.GetDeviceBuffer(), + nullptr, + out_dev.GetDeviceBuffer(), + nullptr, + PassThroughOp{}, + AccElementwiseOperation{static_cast(reduce_total_length)}); + + if(!reduce_2.IsSupportedArgument(argument_ptr_2.get())) + { + std::cout + << "The runtime parameters seem not to be supported by the DeviceReduce instance, exiting!" + << std::endl; + }; + + auto invoker_ptr_2 = reduce_2.MakeInvokerPointer(); + + float avg_time_1 = invoker_ptr_1->Run(argument_ptr_1.get(), StreamConfig{nullptr, time_kernel}); + float avg_time_2 = invoker_ptr_2->Run(argument_ptr_2.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) + + invariant_total_length * sizeof(InOutDataType); + + float gb_per_sec = num_bytes / 1.E6 / (avg_time_1 + avg_time_2); + + std::cout << "Perf: " << avg_time_1 + avg_time_2 << " ms, " << gb_per_sec << " GB/s, " + << reduce_1.GetTypeString() << " => " << reduce_2.GetTypeString() << std::endl; + + bool pass = true; + + if(do_verify) + { + out_dev.FromDevice(out.mData.data()); + pass = pass && ck::utils::check_err(out.mData, out_ref.mData); + }; + + return (pass ? 0 : 1); +} diff --git a/example/13_pool2d_fwd/README.md b/example/13_pool2d_fwd/README.md index d9c829fb98..2314cfd670 100644 --- a/example/13_pool2d_fwd/README.md +++ b/example/13_pool2d_fwd/README.md @@ -4,9 +4,9 @@ ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) -#arg3: run kernel # of times (>1) +#arg3: time kernel (0=no, 1=yes) #arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx -./bin/example_pool2d_fwd 1 1 10 +./bin/example_pool2d_fwd 1 1 1 ``` Result @@ -14,9 +14,7 @@ Result in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192} launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1} -Warm up +Warm up 1 time Start running 10 times... 
-Perf: 0.415453 ms, 1.37996 TFlops, 749.726 GB/s -error: 0 -max_diff: 0, 1, 1 +Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s ``` diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd.cpp index e6749bf8d7..662a48500f 100644 --- a/example/13_pool2d_fwd/pool2d_fwd.cpp +++ b/example/13_pool2d_fwd/pool2d_fwd.cpp @@ -20,6 +20,8 @@ using InDataType = ck::half_t; using OutDataType = ck::half_t; using AccDataType = float; +using IndexDataType = int32_t; + using InLayout = ck::tensor_layout::convolution::NHWC; using OutLayout = ck::tensor_layout::convolution::NHWC; @@ -29,7 +31,7 @@ static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; #endif -static constexpr bool NeedIndices = false; +static constexpr bool OutputIndex = false; static constexpr bool PropagateNan = false; using DevicePoolFwdInstance = @@ -38,7 +40,7 @@ using DevicePoolFwdInstance = OutDataType, // OutDataType AccDataType, // AccDataType ReduceOpId, - NeedIndices, + OutputIndex, 64, // BlockSize 64, // ReduceMThreadClusterSize 1, // ReduceKThreadClusterSize @@ -51,10 +53,10 @@ template + bool OutputIndex> static void pool_host_verify(const Tensor& in, Tensor& out, - Tensor& out_indices, + Tensor& out_indices, const std::array& window_spatial_lengths, const std::array& window_strides, const std::array& in_left_pads, @@ -62,26 +64,26 @@ static void pool_host_verify(const Tensor& in, { using namespace ck::host_reduce; - const int divider = window_spatial_lengths[0] * window_spatial_lengths[1]; + const int32_t divider = window_spatial_lengths[0] * window_spatial_lengths[1]; const auto PreUnaryOp = PreUnaryOpFn(divider); const auto PosUnaryOp = PosUnaryOpFn(divider); - if constexpr(!NeedIndices) + if constexpr(!OutputIndex) { auto opReduce = ReduceOpFn(); auto f_nchw = [&](auto n, auto c, auto ho, auto wo) { auto accuVal = ReduceOpZeroVal(); - for(int y = 0; y < window_spatial_lengths[0]; ++y) + for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y) { - int hi = ho * window_strides[0] + y - in_left_pads[0]; - for(int x = 0; x < window_spatial_lengths[1]; ++x) + ck::index_t hi = ho * window_strides[0] + y - in_left_pads[0]; + for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x) { - int wi = wo * window_strides[1] + x - in_left_pads[1]; - if(hi >= 0 && hi < ck::type_convert(in.mDesc.GetLengths()[2]) && wi >= 0 && - wi < ck::type_convert(in.mDesc.GetLengths()[3])) + ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1]; + if(hi >= 0 && hi < static_cast(in.mDesc.GetLengths()[2]) && + wi >= 0 && wi < static_cast(in.mDesc.GetLengths()[3])) { AccDataType currVal = static_cast(in(n, c, hi, wi)); @@ -108,24 +110,24 @@ static void pool_host_verify(const Tensor& in, auto opReduce = ReduceOpFn2(); auto f_nchw = [&](auto n, auto c, auto ho, auto wo) { - auto accuVal = ReduceOpZeroVal(); - int accuIndex = 0; + auto accuVal = ReduceOpZeroVal(); + IndexDataType accuIndex = 0; - for(int y = 0; y < window_spatial_lengths[0]; ++y) + for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y) { - int hi = ho * window_strides[0] + y - in_left_pads[0]; - for(int x = 0; x < window_spatial_lengths[1]; ++x) + ck::index_t hi = ho * window_strides[0] + y - in_left_pads[0]; + for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x) { - int wi = wo * window_strides[1] + x - in_left_pads[1]; + ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1]; if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 && wi < in.mDesc.GetLengths()[3]) { - 
AccDataType currVal = static_cast(in(n, c, hi, wi)); - int currIndex = y * window_spatial_lengths[1] + x; + AccDataType currVal = static_cast(in(n, c, hi, wi)); + IndexDataType currIndex = y * window_spatial_lengths[1] + x; PreUnaryOp(currVal); - binop_with_nan_check2( + binop_with_index_and_nan_check( opReduce, accuVal, currVal, accuIndex, currIndex); } } @@ -149,9 +151,9 @@ int main(int argc, char* argv[]) { using namespace ck::host_reduce; - bool do_verification = true; - int init_method = 1; - bool time_kernel = false; + bool do_verification; + int init_method; + bool time_kernel; // Pool shape ck::index_t N = 128; @@ -167,17 +169,23 @@ int main(int argc, char* argv[]) ck::index_t in_right_pad_h = 1; ck::index_t in_right_pad_w = 1; - if(argc == 4) + if(argc == 1) + { + do_verification = true; + init_method = 1; + time_kernel = true; + } + else if(argc == 4) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); + time_kernel = static_cast(std::stoi(argv[3])); } else if(argc == 16) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); + time_kernel = static_cast(std::stoi(argv[3])); N = std::stoi(argv[4]); C = std::stoi(argv[5]); @@ -196,7 +204,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: time kernel (0=n0, 1=yes)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(0); @@ -228,9 +236,11 @@ int main(int argc, char* argv[]) Tensor in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{})); Tensor out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); - Tensor out_indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); + Tensor out_indices_n_c_ho_wo_host( + f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); Tensor out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); - Tensor out_indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); + Tensor out_indices_n_c_ho_wo_device( + f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl; std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl; @@ -245,25 +255,25 @@ int main(int argc, char* argv[]) DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace()); DeviceMem out_device_buf(sizeof(OutDataType) * out_n_c_ho_wo_device.mDesc.GetElementSpace()); - DeviceMem out_indices_device_buf(sizeof(int) * + DeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_indices_n_c_ho_wo_device.mDesc.GetElementSpace()); in_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); - auto pool = DevicePoolFwdInstance{}; - auto invoker_ptr = pool.MakeInvokerPointer(); - auto argument_ptr = - pool.MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer()), - static_cast(out_indices_device_buf.GetDeviceBuffer()), - N, - C, - std::array{{Hi, Wi}}, - std::array{{Y, X}}, - std::array{{Ho, Wo}}, - window_strides, - input_left_pads, - input_right_pads); + auto pool = DevicePoolFwdInstance{}; + auto invoker_ptr = pool.MakeInvokerPointer(); + auto argument_ptr = pool.MakeArgumentPointer( + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + 
static_cast(out_indices_device_buf.GetDeviceBuffer()), + N, + C, + std::array{{Hi, Wi}}, + std::array{{Y, X}}, + std::array{{Ho, Wo}}, + window_strides, + input_left_pads, + input_right_pads); if(!pool.IsSupportedArgument(argument_ptr.get())) { @@ -286,6 +296,7 @@ int main(int argc, char* argv[]) << std::endl; bool pass = true; + if(do_verification) { pool_host_verify(in_n_c_hi_wi, + OutputIndex>(in_n_c_hi_wi, out_n_c_ho_wo_host, out_indices_n_c_ho_wo_host, window_spatial_lengths, @@ -303,15 +314,16 @@ int main(int argc, char* argv[]) out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data()); - pass &= ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData); + pass = pass && ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData); - if constexpr(NeedIndices) + if constexpr(OutputIndex) { out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data()); - pass &= ck::utils::check_err(out_indices_n_c_ho_wo_device.mData, - out_indices_n_c_ho_wo_host.mData); + pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device.mData, + out_indices_n_c_ho_wo_host.mData); }; } - return pass ? 0 : 1; + + return (pass ? 0 : 1); } diff --git a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp index f665378e08..c7e18d98dc 100644 --- a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp +++ b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp @@ -17,7 +17,7 @@ template :: AccElementwiseOperation; - static constexpr bool BetaIsZero = true; - static constexpr index_t InSrcOutDstVectorDim = 0; // for NHWC, the dim C is the vector Dim for both input and output in memory, which is // not reduced. 
@@ -206,28 +204,28 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd { float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - using gridwise_reduce = GridwiseReduction_mk_to_m_threadwise; + using gridwise_reduce = + GridwiseReduction_mk_to_m_threadwise; const auto kernel = kernel_reduce_threadwise struct DeviceReduce : public BaseOperator { - virtual long_index_t GetWorkspaceSizeInBytes(const std::vector inLengths, - const std::vector reduceDims) - { - (void)inLengths; - (void)reduceDims; - - return (0); - }; - - virtual bool HasFurtherCall() { return (false); }; - - virtual std::vector GetWorkspace2dLengths(const BaseArgument* argPtr) - { - (void)argPtr; - return (std::vector{0, 0}); - }; - virtual std::unique_ptr - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, + MakeArgumentPointer(const std::vector inLengths, + const std::vector inStrides, + const std::vector outLengths, + const std::vector outStrides, const std::vector reduceDims, float alpha, float beta, const void* in_dev, + const void* in_index_dev, void* out_dev, - void* out_indices_dev, - void* workspace_dev, + void* out_index_dev, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op) = 0; diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp deleted file mode 100644 index 860f53d8c5..0000000000 --- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp +++ /dev/null @@ -1,374 +0,0 @@ -#ifndef DEVICE_REDUCE_BLOCKWISE_HPP -#define DEVICE_REDUCE_BLOCKWISE_HPP - -#include -#include -#include "device.hpp" -#include "device_reduce.hpp" -#include "device_reduce_common.hpp" -#include "gridwise_2d_reduction_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { - -template -struct DeviceReduceBlockWise : public DeviceReduce -{ - static_assert(Rank <= 6, "Bigger Rank size is not supported!"); - static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, - "Invalid thread cluster size assignments!"); - - static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || - (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && - (MThreadSliceSize % OutDstVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - using IndexDataType = int32_t; - - static constexpr bool BetaIsZero = NeedIndices; - - static constexpr index_t NumInvariantDim = Rank - NumReduceDim; - - static constexpr index_t numSrcDim = Rank; - static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 
1 : NumInvariantDim; - static constexpr bool reduceAllDim = (NumInvariantDim == 0); - - static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; - - static auto MakeSrc2dDescriptor(const std::vector& inLengths, - const std::vector& inStrides) - { - const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); - - const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - - const auto in_grid_desc_m_k = [&]() { - if constexpr(reduceAllDim) - { - const auto one_dim_inDesc = transform_tensor_descriptor( - inDesc, - make_tuple(make_merge_transform(tupleSrcLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}), - make_tuple(Sequence<0>{})); - - return transform_tensor_descriptor(one_dim_inDesc, - make_tuple(make_unmerge_transform(make_tuple( - 1, one_dim_inDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - } - else - { - using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; - using ReduceDims = typename arithmetic_sequence_gen::type; - - const auto reduceDimLengths = - make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); - const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); - - return transform_tensor_descriptor( - inDesc, - make_tuple(make_merge_transform(invariantDimLengths), - make_merge_transform(reduceDimLengths)), - make_tuple(InvariantDims{}, ReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - }(); - - const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); - const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - - const auto inPad_M = - math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; - const auto inPad_K = - math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength; - - auto in_grid_desc_m_k_padded = transform_tensor_descriptor( - in_grid_desc_m_k, - make_tuple(make_right_pad_transform(invariantLength, inPad_M), - make_right_pad_transform(reduceLength, inPad_K)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return (in_grid_desc_m_k_padded); - }; - - static auto MakeDst1dDescriptor(const std::vector& outLengths, - const std::vector& outStrides) - { - const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); - - auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - auto out_grid_desc_m = transform_tensor_descriptor( - outDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{}); - - const auto inPad = - math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; - - auto out_grid_desc_m_padded = transform_tensor_descriptor( - out_grid_desc_m, - make_tuple(make_right_pad_transform(invariantLength, inPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - return (out_grid_desc_m_padded); - }; - - struct Argument : public BaseArgument - { - Argument(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector 
outStrides, - const std::vector reduceDims, - float alpha, - float beta, - const InDataType* in_dev, - OutDataType* out_dev, - IndexDataType* out_indices_dev, - AccDataType* workspace_dev, - const InElementwiseOperation in_elementwise_op, - const AccElementwiseOperation acc_elementwise_op) - : outLengths_{outLengths}, - outStrides_{outStrides}, - in_dev_{in_dev}, - out_dev_{out_dev}, - out_indices_dev_{out_indices_dev}, - in_elementwise_op_{in_elementwise_op}, - acc_elementwise_op_{acc_elementwise_op} - { - (void)workspace_dev; - - inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); - inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); - - alpha_ = type_convert(alpha); - beta_ = type_convert(beta); - - std::tie(invariant_total_length, reduce_total_length) = - get_2d_lengths(inLengths_); - - if constexpr(NumInvariantDim == 0) - invariant_lowest_length = 1; - else - invariant_lowest_length = inLengths_[NumInvariantDim - 1]; - - reduce_lowest_length = inLengths_[Rank - 1]; - - gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / - M_BlockTileSize; - } - - std::vector inLengths_; - std::vector inStrides_; - std::vector outLengths_; - std::vector outStrides_; - - AccDataType alpha_; - AccDataType beta_; - - const InDataType* in_dev_; - OutDataType* out_dev_; - IndexDataType* out_indices_dev_; - - InElementwiseOperation in_elementwise_op_; - AccElementwiseOperation acc_elementwise_op_; - - int invariant_lowest_length; - int reduce_lowest_length; - size_t invariant_total_length; - size_t reduce_total_length; - - size_t gridSize; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - const auto in_grid_desc_m_k = - DeviceReduceBlockWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_); - const auto out_grid_desc_m = - DeviceReduceBlockWise::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_); - using InGridDesc_M_K = decltype(in_grid_desc_m_k); - using OutGridDesc_M = decltype(out_grid_desc_m); - - using GridwiseReduce = GridwiseReduction_mk_to_m_blockwise; - - float avg_time = 0; - - const auto kernel = kernel_reduce_blockwise; - - avg_time = launch_and_time_kernel(stream_config, - kernel, - dim3(arg.gridSize), - dim3(BlockSize), - 0, - in_grid_desc_m_k, - out_grid_desc_m, - arg.in_elementwise_op_, - arg.acc_elementwise_op_, - arg.alpha_, - arg.in_dev_, - arg.beta_, - arg.out_dev_, - nullptr, - arg.out_indices_dev_); - - return (avg_time); - }; - - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - }; - }; - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - const Argument* pArg = dynamic_cast(p_arg); - - if constexpr(InSrcVectorDim == 0) - { - if constexpr(NumInvariantDim == 0) - { - return (false); - } - else - { - if(pArg->inStrides_[NumInvariantDim - 1] != 1) - return (false); - - if(pArg->invariant_lowest_length % InSrcVectorSize != 0) - return (false); - }; - } - else - { - if(pArg->inStrides_[Rank - 1] != 1) - return (false); - - if(pArg->reduce_lowest_length % InSrcVectorSize != 0) - return (false); - }; - - // To improve - if(pArg->invariant_lowest_length % OutDstVectorSize != 0) - return (false); - - // cases with very small reduce_total_length should be handled by the ThreadWise method - if(pArg->reduce_total_length / KThreadSliceSize < 2) - return (false); - - return (true); - }; - - std::unique_ptr - 
MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, - float alpha, - float beta, - const void* in_dev, - void* out_dev, - void* out_indices_dev, - void* workspace_dev, - const InElementwiseOperation in_elementwise_op, - const AccElementwiseOperation acc_elementwise_op) override - { - return std::make_unique(inLengths, - inStrides, - outLengths, - outStrides, - reduceDims, - alpha, - beta, - static_cast(in_dev), - static_cast(out_dev), - static_cast(out_indices_dev), - static_cast(workspace_dev), - in_elementwise_op, - acc_elementwise_op); - }; - - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - }; - - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "DeviceReduceBlockWise<" << BlockSize << ","; - str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; - str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; - str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">"; - // clang-format on - - return str.str(); - } -}; - -} // namespace device -} // namespace tensor_operation -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp deleted file mode 100644 index 43ac48cecc..0000000000 --- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp +++ /dev/null @@ -1,328 +0,0 @@ -#ifndef DEVICE_REDUCE_BLOCKWISE_SECOND_CALL_HPP -#define DEVICE_REDUCE_BLOCKWISE_SECOND_CALL_HPP - -#include -#include -#include "device.hpp" -#include "device_reduce.hpp" -#include "device_reduce_common.hpp" -#include "gridwise_2d_reduction_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { - -template -struct DeviceReduceBlockWiseSecondCall - : public DeviceReduce -{ - static_assert(Rank <= 6, "Bigger Rank size is not supported!"); - static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, - "Invalid thread cluster size assignments!"); - - static_assert((InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0) && - (MThreadSliceSize % OutDstVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - using IndexDataType = int32_t; - - static constexpr bool BetaIsZero = NeedIndices; - - static_assert( - std::is_same::value, - "InDataType and AccDataType should be the same to use DEviceReduceBlockWiseSecondCall!"); - - static constexpr index_t NumInvariantDim = Rank - NumReduceDim; - - static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 
1 : NumInvariantDim; - - static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; - - static auto MakeSrc2dDescriptor(const std::vector& inLengths, - const std::vector& inStrides) - { - const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<2>{}); - const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<2>{}); - - const auto in_grid_desc_m_k = - make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - - const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); - const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - - const auto inPad_M = - math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; - const auto inPad_K = - math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength; - - auto in_grid_desc_m_k_padded = transform_tensor_descriptor( - in_grid_desc_m_k, - make_tuple(make_right_pad_transform(invariantLength, inPad_M), - make_right_pad_transform(reduceLength, inPad_K)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return (in_grid_desc_m_k_padded); - }; - - static auto MakeDst1dDescriptor(const std::vector& outLengths, - const std::vector& outStrides) - { - const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); - - auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - auto out_grid_desc_m = transform_tensor_descriptor( - outDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{}); - - const auto outPad = - math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; - - auto out_grid_desc_m_padded = transform_tensor_descriptor( - out_grid_desc_m, - make_tuple(make_right_pad_transform(invariantLength, outPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - return (out_grid_desc_m_padded); - }; - - struct Argument : public BaseArgument - { - Argument(const std::vector& inLengths, - const std::vector& inStrides, - const std::vector& outLengths, - const std::vector& outStrides, - float alpha, - float beta, - const InDataType* in_dev, - OutDataType* out_dev, - IndexDataType* out_indices_dev, - AccDataType* workspace_dev, - const InElementwiseOperation& in_elementwise_op, - const AccElementwiseOperation& acc_elementwise_op) - : inLengths_(inLengths), - inStrides_(inStrides), - outLengths_(outLengths), - outStrides_(outStrides), - in_dev_{in_dev}, - out_dev_{out_dev}, - out_indices_dev_{out_indices_dev}, - in_elementwise_op_(in_elementwise_op), - acc_elementwise_op_(acc_elementwise_op) - { - alpha_ = type_convert(alpha); - beta_ = type_convert(beta); - - invariant_total_length = inLengths[0]; - reduce_total_length = inLengths[1]; - - invariant_lowest_length = inLengths[0]; - reduce_lowest_length = inLengths[1]; - - gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / - M_BlockTileSize; - - size_t ws_buf2_bytes_offset = math::integer_least_multiple( - invariant_total_length * reduce_total_length * sizeof(AccDataType), 64); - - if constexpr(NeedIndices) - workspace_indices_dev_ = reinterpret_cast( - reinterpret_cast(workspace_dev) + ws_buf2_bytes_offset); - else - 
workspace_indices_dev_ = nullptr; - } - - std::vector inLengths_; - std::vector inStrides_; - std::vector outLengths_; - std::vector outStrides_; - - AccDataType alpha_; - AccDataType beta_; - - const InDataType* in_dev_; - OutDataType* out_dev_; - IndexDataType* out_indices_dev_; - IndexDataType* workspace_indices_dev_; - - InElementwiseOperation in_elementwise_op_; - AccElementwiseOperation acc_elementwise_op_; - - int invariant_lowest_length; - int reduce_lowest_length; - size_t invariant_total_length; - size_t reduce_total_length; - - size_t gridSize; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - const auto in_grid_desc_m_k = DeviceReduceBlockWiseSecondCall::MakeSrc2dDescriptor( - arg.inLengths_, arg.inStrides_); - const auto out_grid_desc_m = DeviceReduceBlockWiseSecondCall::MakeDst1dDescriptor( - arg.outLengths_, arg.outStrides_); - using InGridDesc_M_K = decltype(in_grid_desc_m_k); - using OutGridDesc_M = decltype(out_grid_desc_m); - - using GridwiseReduce = GridwiseReduction_mk_to_m_blockwise; - - float avg_time = 0; - - const auto kernel = kernel_reduce_blockwise_second_call; - - avg_time = launch_and_time_kernel(stream_config, - kernel, - dim3(arg.gridSize), - dim3(BlockSize), - 0, - in_grid_desc_m_k, - out_grid_desc_m, - arg.in_elementwise_op_, - arg.acc_elementwise_op_, - arg.alpha_, - arg.in_dev_, - arg.beta_, - arg.out_dev_, - arg.workspace_indices_dev_, - arg.out_indices_dev_); - - return (avg_time); - }; - - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - } - }; - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - const Argument* pArg = dynamic_cast(p_arg); - - if constexpr(InSrcVectorDim == 0) - return (false); - - if(pArg->reduce_lowest_length % InSrcVectorSize != 0) - return (false); - - // To improve - if(pArg->invariant_lowest_length % OutDstVectorSize != 0) - return (false); - - // cases with very small reduce_total_length should be handled by the ThreadWise method - if(pArg->reduce_total_length / KThreadSliceSize < 2) - return (false); - - return (true); - }; - - std::unique_ptr - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, - float alpha, - float beta, - const void* in_dev, - void* out_dev, - void* out_indices_dev, - void* workspace_dev, - const InElementwiseOperation in_elementwise_op, - const AccElementwiseOperation acc_elementwise_op) override - { - (void)reduceDims; - - return std::make_unique(inLengths, - inStrides, - outLengths, - outStrides, - alpha, - beta, - static_cast(in_dev), - static_cast(out_dev), - static_cast(out_indices_dev), - static_cast(workspace_dev), - in_elementwise_op, - acc_elementwise_op); - }; - - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - }; - - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "DeviceReduceBlockWiseSecondCall<" << BlockSize << ","; - str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; - str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; - str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">"; - // clang-format on - - return str.str(); - } -}; - -} // 
namespace device -} // namespace tensor_operation -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp index 038c754722..f68a392821 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp @@ -14,13 +14,13 @@ namespace device { // here, inLengths[] is already shuffled so that lengths of invariant dims are included before those // of reduce dims -template -std::pair get_2d_lengths(const std::vector& inLengths) +template +std::pair get_2d_lengths(const std::vector& inLengths) { static_assert(Rank <= 6, "bigger Rank size not supported!"); - size_t invariant_total_length = 1; - size_t reduce_total_length = 1; + long_index_t invariant_total_length = 1; + long_index_t reduce_total_length = 1; constexpr int NumInvariantDim = Rank - NumReduceDim; @@ -35,13 +35,13 @@ std::pair get_2d_lengths(const std::vector& inLengths) // helper functions using variadic template arguments template -auto make_tuple_from_array_and_index_seq(const std::vector& lengths, Sequence) +auto make_tuple_from_array_and_index_seq(const std::vector& lengths, Sequence) { return make_tuple(static_cast(lengths[Ns])...); }; template -static auto make_tuple_from_array(const std::vector& lengths, Number) +auto make_tuple_from_array(const std::vector& lengths, Number) { static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); @@ -51,10 +51,10 @@ static auto make_tuple_from_array(const std::vector& lengths, Number -std::vector shuffle_tensor_dimensions(const std::vector& origLengthsStrides, - const std::vector& reduceDims) +std::vector shuffle_tensor_dimensions(const std::vector& origLengthsStrides, + const std::vector& reduceDims) { - std::vector newLengthsStrides; + std::vector newLengthsStrides; assert(Rank == origLengthsStrides.size() && NumReduceDim == reduceDims.size()); diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp similarity index 58% rename from include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp rename to include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp index f93c65fe18..2f447c0979 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp @@ -1,5 +1,5 @@ -#ifndef DEVICE_REDUCE_MULTIBLOCK_ATOMIC_ADD_HPP -#define DEVICE_REDUCE_MULTIBLOCK_ATOMIC_ADD_HPP +#ifndef DEVICE_REDUCE_MULTIBLOCK_HPP +#define DEVICE_REDUCE_MULTIBLOCK_HPP #include #include @@ -7,8 +7,9 @@ #include "device_base.hpp" #include "device_reduce.hpp" #include "device_reduce_common.hpp" -#include "gridwise_2d_reduction_multiblock_atomic_add.hpp" +#include "gridwise_2d_reduction_multiblock.hpp" #include "gridwise_set_buffer_value.hpp" +#include "reduction_operator.hpp" namespace ck { namespace tensor_operation { @@ -22,8 +23,10 @@ template -struct DeviceReduceMultiBlockAtomicAdd - : public DeviceReduce +struct DeviceReduceMultiBlock : public DeviceReduce { static_assert(Rank <= 6, "Bigger Rank size is not supported!"); static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, @@ -46,26 +48,40 @@ struct DeviceReduceMultiBlockAtomicAdd using IndexDataType = int32_t; + static constexpr bool HaveIndexInput = OutputIndex && HaveIndexInputIfOutputIndex; + 
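The int -> index_t / size_t -> long_index_t widening above is easiest to see with numbers; a minimal worked example of get_2d_lengths (shape and dim counts are illustrative, not from the patch):

    // Rank = 4, NumReduceDim = 2, lengths already shuffled so invariant dims
    // come first, e.g. {N, C, H, W} = {2, 16, 32, 32} reducing over H and W:
    //   invariant_total_length = 2 * 16  = 32
    //   reduce_total_length    = 32 * 32 = 1024
    // Both totals are now long_index_t, so very large reduce extents cannot
    // overflow the narrower types used before, and they stay consistent with
    // the index_t-based length/stride vectors.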
static constexpr index_t NumInvariantDim = Rank - NumReduceDim; static constexpr index_t numSrcDim = Rank; static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim; static constexpr bool reduceAllDim = (NumInvariantDim == 0); - static constexpr bool support_AtomicAdd = + // So far, only AtomicAdd is considered; other atomic operations like AtomicMax can be added + // later + static constexpr bool use_multiblock = + (OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd); + + static constexpr bool out_type_compatible_with_atomic_op = std::is_same::value || std::is_same::value; - static_assert(!NeedIndices && support_AtomicAdd, - "MultiBlockAtomicAdd method can only be used with non-indiced operation and when " - "having float/double output type!"); + static_assert( + !use_multiblock || (use_multiblock && out_type_compatible_with_atomic_op), + "The OutDataType must support the atomic operation for using MultiBlock reduction"); - static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + static_assert(!use_multiblock || (use_multiblock && !OutputIndex), + "MultiBlock reduction can only be used when outputting index is not required"); - static auto MakeSrc2dDescriptor(const std::vector& inLengths, - const std::vector& inStrides, + static_assert( + ReduceOperation::IsCompatibleInMemoryDataOperation(OutMemoryDataOperation), + "The reduction accumulation operation must be compatible with the OutMemoryDataOperation!"); + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& inStrides, int blkGroupSize, - int kBlockTileIterations) + int numBlockTileIteration) { const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); @@ -109,7 +125,7 @@ struct DeviceReduceMultiBlockAtomicAdd const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations; + const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; const auto inPad_M = math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; @@ -124,8 +140,8 @@ struct DeviceReduceMultiBlockAtomicAdd return (in_grid_desc_m_k_padded); }; - static auto MakeDst1dDescriptor(const std::vector& outLengths, - const std::vector& outStrides) + static auto MakeDst1dDescriptor(const std::vector& outLengths, + const std::vector& outStrides) { const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); @@ -151,31 +167,56 @@ struct DeviceReduceMultiBlockAtomicAdd return (out_grid_desc_m_padded); }; + static auto MakeDst1dDescriptorForBufferSet(const std::vector& outLengths, + const std::vector& outStrides) + { + const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); + const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); + + auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, +
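The round-up-and-pad pattern in these descriptor builders recurs throughout the patch; one worked instance of the arithmetic (numbers illustrative):

    // math::integer_least_multiple(x, m) rounds x up to the next multiple of m:
    //   invariantLength = 1000, M_BlockTileSize = 128 -> padded to 1024, inPad_M = 24
    //   length = 500, BlockSize = 256 (buffer-set descriptor) -> padded to 512, pad = 12
    // Padded positions are out-of-range in the underlying buffer; reads of
    // them return the invalid-element value (the reduction identity passed to
    // make_dynamic_buffer in the gridwise kernels), so they do not perturb the
    // result.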
make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto length = out_grid_desc_m.GetLength(Number<0>{}); + + const auto pad = math::integer_least_multiple(length, BlockSize) - length; + + auto out_grid_desc_m_padded = + transform_tensor_descriptor(out_grid_desc_m, + make_tuple(make_right_pad_transform(length, pad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + struct Argument : public BaseArgument { - Argument(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, + Argument(const std::vector inLengths, + const std::vector inStrides, + const std::vector outLengths, + const std::vector outStrides, const std::vector reduceDims, float alpha, float beta, const InDataType* in_dev, + const IndexDataType* in_index_dev, OutDataType* out_dev, - IndexDataType* out_indices_dev, - AccDataType* workspace_dev, + IndexDataType* out_index_dev, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op) : outLengths_{outLengths}, outStrides_{outStrides}, in_dev_{in_dev}, + in_index_dev_{in_index_dev}, out_dev_{out_dev}, + out_index_dev_{out_index_dev}, in_elementwise_op_{in_elementwise_op}, acc_elementwise_op_{acc_elementwise_op} { - (void)out_indices_dev; - (void)workspace_dev; - inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); @@ -192,24 +233,35 @@ struct DeviceReduceMultiBlockAtomicAdd reduce_lowest_length = inLengths_[Rank - 1]; - int iterations = 1; - while(true) + if constexpr(use_multiblock) { - int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / - (K_BlockTileSize * iterations); - // we want the blkGroupSize be not more than 128 - if(testBlkGroupSize <= 128) - break; + int iterations = 1; + while(true) + { + int testBlkGroupSize = + (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); - iterations++; + // we want the blkGroupSize to be no more than 128 + if(testBlkGroupSize <= 128) + break; + + iterations++; + }; + + blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + numBlockTileIteration = iterations; + } + else + { + blkGroupSize = 1; + numBlockTileIteration = + (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize; }; - blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / - (K_BlockTileSize * iterations); - - kBlockTileIterations = iterations; - gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / M_BlockTileSize * blkGroupSize; @@ -217,27 +269,29 @@ struct DeviceReduceMultiBlockAtomicAdd math::integer_least_multiple(invariant_total_length, BlockSize) / BlockSize; } - std::vector inLengths_; - std::vector inStrides_; - std::vector outLengths_; - std::vector outStrides_; + std::vector inLengths_; + std::vector inStrides_; + std::vector outLengths_; + std::vector outStrides_; AccDataType alpha_; AccDataType beta_; const InDataType* in_dev_; + const IndexDataType* in_index_dev_; OutDataType* out_dev_; + IndexDataType* out_index_dev_; InElementwiseOperation in_elementwise_op_; AccElementwiseOperation acc_elementwise_op_; - int invariant_lowest_length; - int reduce_lowest_length; - size_t invariant_total_length; - size_t reduce_total_length; + index_t
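The iteration search above simply grows each block's share of K until the block-group count fits the cap; a worked example (numbers illustrative):

    // reduce_total_length = 1'000'000, K_BlockTileSize = 256:
    //   iterations = 1  -> testBlkGroupSize = ceil(1e6 / 256)  = 3907 (> 128)
    //   iterations = 31 -> testBlkGroupSize = ceil(1e6 / 7936) = 127  (<= 128, stop)
    // so blkGroupSize = 127 and numBlockTileIteration = 31. In the
    // non-multiblock (Set) branch, blkGroupSize stays 1 and a single block
    // iterates ceil(reduce_total_length / K_BlockTileSize) tiles instead.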
invariant_lowest_length; + index_t reduce_lowest_length; + long_index_t invariant_total_length; + long_index_t reduce_total_length; - index_t blkGroupSize; - index_t kBlockTileIterations; + int blkGroupSize; + int numBlockTileIteration; size_t gridSize; size_t gridSize_pre; @@ -247,52 +301,69 @@ struct DeviceReduceMultiBlockAtomicAdd { float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - const auto in_grid_desc_m_k = DeviceReduceMultiBlockAtomicAdd::MakeSrc2dDescriptor( - arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations); - const auto out_grid_desc_m = DeviceReduceMultiBlockAtomicAdd::MakeDst1dDescriptor( + const auto in_grid_desc_m_k = DeviceReduceMultiBlock::MakeSrc2dDescriptor( + arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration); + const auto out_grid_desc_m = + DeviceReduceMultiBlock::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_); + const auto out_grid_desc_m_2 = DeviceReduceMultiBlock::MakeDst1dDescriptorForBufferSet( arg.outLengths_, arg.outStrides_); - using InGridDesc_M_K = decltype(in_grid_desc_m_k); - using OutGridDesc_M = decltype(out_grid_desc_m); - using GridwiseReduce = - GridwiseReduction_mk_to_m_multiblock_atomic_add; + using InGridDesc_M_K = decltype(in_grid_desc_m_k); + using OutGridDesc_M = decltype(out_grid_desc_m); + using OutGridDesc_M_2 = decltype(out_grid_desc_m_2); + + using GridwiseReduce = GridwiseReduction_mk_to_m_multiblock; + + const auto kernel_main = kernel_reduce_multiblock; float avg_time = 0; - const auto kernel_pre = kernel_buffer_set_value; - const auto kernel_main = kernel_reduce_multiblock_atocmi_add; + if constexpr(use_multiblock) + { + const auto zeroVal = + ck::reduce::GetReductionZeroValueForInMemoryDataOperation( + OutMemoryDataOperation); - avg_time += launch_and_time_kernel(stream_config, - kernel_pre, - dim3(arg.gridSize_pre), - dim3(BlockSize), - 0, - out_grid_desc_m, - arg.out_dev_, - static_cast(0.0f)); + const auto kernel_pre = + kernel_buffer_set_value; + + avg_time += launch_and_time_kernel(stream_config, + kernel_pre, + dim3(arg.gridSize_pre), + dim3(BlockSize), + 0, + out_grid_desc_m_2, + arg.out_dev_, + zeroVal); + }; avg_time += launch_and_time_kernel(stream_config, kernel_main, @@ -304,25 +375,34 @@ struct DeviceReduceMultiBlockAtomicAdd arg.in_elementwise_op_, arg.acc_elementwise_op_, arg.blkGroupSize, - arg.kBlockTileIterations, + arg.numBlockTileIteration, arg.alpha_, arg.in_dev_, - arg.out_dev_); + arg.in_index_dev_, + arg.beta_, + arg.out_dev_, + arg.out_index_dev_); - return avg_time; - } + return (avg_time); + }; float Run(const BaseArgument* p_arg, const StreamConfig& stream_config = StreamConfig{}) override { return Run(*dynamic_cast(p_arg), stream_config); - } + }; }; bool IsSupportedArgument(const BaseArgument* p_arg) override { const Argument* pArg = dynamic_cast(p_arg); + if constexpr(use_multiblock) + { + if(static_cast(pArg->beta_) != 0.0f) + return (false); + }; + if constexpr(InSrcVectorDim == 0) { if constexpr(NumInvariantDim == 0) @@ -347,36 +427,43 @@ struct DeviceReduceMultiBlockAtomicAdd return (false); }; - if(static_cast(pArg->beta_) != 0.0f) - return (false); - // To improve if(pArg->invariant_lowest_length % OutDstVectorSize != 0) return (false); - // cases with small reduce_total_length should be handled by the BlockWise method - if(pArg->reduce_total_length <= BlockSize * KThreadSliceSize) - return (false); + if constexpr(use_multiblock) + { + // blkGroupSize of 1 should be handled by Blockwise path using + // 
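Because every block group adds its partial result directly into the destination, the multiblock invoker must first seed the output with the reduction identity; a note on the sequencing above (the identity value for a sum is an assumption consistent with the zeroVal usage elsewhere in the patch):

    // kernel_buffer_set_value pre-fills out_dev_ with
    // GetReductionZeroValueForInMemoryDataOperation(AtomicAdd) -- 0 for a sum
    // reduction -- so that the AtomicAdd contributions from all block groups
    // accumulate from the identity. This is also why IsSupportedArgument
    // rejects beta != 0 on the multiblock path: any prior output scaled by
    // beta would be overwritten by this pre-fill kernel.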
InMemoryDataOperationEnum::Set + if(pArg->blkGroupSize == 1) + return (false); - // This is very strong restriction, but needed to avoid some failure - if(pArg->invariant_lowest_length % M_BlockTileSize != 0) - return (false); + // This is very strong restriction, but needed to avoid some failure + if(pArg->invariant_lowest_length % M_BlockTileSize != 0) + return (false); + } + else + { + // cases with very small reduce_total_length should be handled by ThreadWise kernel + if(pArg->reduce_total_length / KThreadSliceSize < 2) + return (false); + }; return (true); }; std::unique_ptr - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, + MakeArgumentPointer(const std::vector inLengths, + const std::vector inStrides, + const std::vector outLengths, + const std::vector outStrides, const std::vector reduceDims, float alpha, float beta, const void* in_dev, + const void* in_index_dev, void* out_dev, - void* out_indices_dev, - void* workspace_dev, + void* out_index_dev, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op) override { @@ -388,9 +475,9 @@ struct DeviceReduceMultiBlockAtomicAdd alpha, beta, static_cast(in_dev), + static_cast(in_index_dev), static_cast(out_dev), - static_cast(out_indices_dev), - static_cast(workspace_dev), + static_cast(out_index_dev), in_elementwise_op, acc_elementwise_op); }; diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp deleted file mode 100644 index b4eb8116c2..0000000000 --- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp +++ /dev/null @@ -1,440 +0,0 @@ -#ifndef DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP -#define DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP - -#include -#include -#include "device.hpp" -#include "device_reduce.hpp" -#include "device_reduce_common.hpp" -#include "gridwise_2d_reduction_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { - -template -struct DeviceReduceMultiBlockPartialReduce - : public DeviceReduce -{ - static_assert(Rank <= 6, "Bigger Rank size is not supported!"); - static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, - "Invalid thread cluster size assignments!"); - - static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || - (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!"); - - using IndexDataType = int32_t; - - static constexpr index_t NumInvariantDim = Rank - NumReduceDim; - - static constexpr index_t numSrcDim = Rank; - static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 
1 : NumInvariantDim; - static constexpr bool reduceAllDim = (NumInvariantDim == 0); - - static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; - - static constexpr int MaxBlockGroupSize = 256; - - long_index_t GetWorkspaceSizeInBytes(const std::vector inLengths, - const std::vector reduceDims) override - { - size_t invariant_total_length; - size_t reduce_total_length; - - auto inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); - - std::tie(invariant_total_length, reduce_total_length) = - get_2d_lengths(inLengths_); - - int iterations = 1; - while(true) - { - int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / - (K_BlockTileSize * iterations); - - if(testBlkGroupSize <= MaxBlockGroupSize) - break; - - iterations++; - }; - - int blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / - (K_BlockTileSize * iterations); - - long_index_t workspace_size = invariant_total_length * blkGroupSize; - - long_index_t wsSizeInBytes = - !NeedIndices - ? workspace_size * sizeof(AccDataType) - : workspace_size * (sizeof(AccDataType) + sizeof(int32_t)) + 64 + sizeof(int); - - return (wsSizeInBytes); - }; - - bool HasFurtherCall() override { return (true); }; - - static auto MakeSrc2dDescriptor(const std::vector& inLengths, - const std::vector& inStrides, - int blkGroupSize, - int kBlockTileIterations) - { - const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); - - const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - - const auto in_grid_desc_m_k = [&]() { - if constexpr(reduceAllDim) - { - const auto one_dim_inDesc = transform_tensor_descriptor( - inDesc, - make_tuple(make_merge_transform(tupleSrcLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}), - make_tuple(Sequence<0>{})); - - return transform_tensor_descriptor(one_dim_inDesc, - make_tuple(make_unmerge_transform(make_tuple( - 1, one_dim_inDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - } - else - { - using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; - using ReduceDims = typename arithmetic_sequence_gen::type; - - const auto reduceDimLengths = - make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); - const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); - - return transform_tensor_descriptor( - inDesc, - make_tuple(make_merge_transform(invariantDimLengths), - make_merge_transform(reduceDimLengths)), - make_tuple(InvariantDims{}, ReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - }(); - - const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); - const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - - const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations; - const auto inPad_M = - math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; - const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; - - auto in_grid_desc_m_k_padded = transform_tensor_descriptor( - in_grid_desc_m_k, - make_tuple(make_right_pad_transform(invariantLength, inPad_M), - make_right_pad_transform(reduceLength, inPad_K)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return 
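A worked example of the workspace sizing that this deletion retires (numbers illustrative):

    // invariant_total_length = 1024, reduce_total_length = 1'000'000,
    // K_BlockTileSize = 256, MaxBlockGroupSize = 256:
    //   iterations = 16 -> blkGroupSize = ceil(1e6 / 4096) = 245 (<= 256)
    //   workspace  = 1024 * 245 AccDataType partials, plus, when NeedIndices,
    //                a 64-byte-aligned int32 index buffer of the same count.
    // The merged atomic-add path needs no workspace at all, which is the
    // point of removing this file.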
(in_grid_desc_m_k_padded); - }; - - static auto MakeWorkspace2dDescriptor(int invariantLength, int blkGroupSize) - { - auto ws_desc_m_k = - make_naive_tensor_descriptor_packed(make_tuple(invariantLength, blkGroupSize)); - - const auto wsPad = - math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; - - auto ws_desc_m_k_padded = - transform_tensor_descriptor(ws_desc_m_k, - make_tuple(make_right_pad_transform(invariantLength, wsPad), - make_pass_through_transform(blkGroupSize)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return (ws_desc_m_k_padded); - }; - - struct Argument : public BaseArgument - { - Argument(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, - float alpha, - float beta, - const InDataType* in_dev, - OutDataType* out_dev, - IndexDataType* out_indices_dev, - AccDataType* workspace_dev, - const InElementwiseOperation in_elementwise_op, - const AccElementwiseOperation acc_elementwise_op) - : outLengths_{outLengths}, - outStrides_{outStrides}, - in_dev_{in_dev}, - out_dev_{out_dev}, - out_indices_dev_{out_indices_dev}, - workspace_dev_{workspace_dev}, - in_elementwise_op_{in_elementwise_op}, - acc_elementwise_op_{acc_elementwise_op} - { - inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); - inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); - - alpha_ = type_convert(alpha); - beta_ = type_convert(beta); - - std::tie(invariant_total_length, reduce_total_length) = - get_2d_lengths(inLengths_); - - if constexpr(NumInvariantDim == 0) - invariant_lowest_length = 1; - else - invariant_lowest_length = inLengths_[NumInvariantDim - 1]; - - reduce_lowest_length = inLengths_[Rank - 1]; - - int iterations = 1; - while(true) - { - int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / - (K_BlockTileSize * iterations); - - if(testBlkGroupSize <= MaxBlockGroupSize) - break; - - iterations++; - }; - - blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / - (K_BlockTileSize * iterations); - - kBlockTileIterations = iterations; - - gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / - M_BlockTileSize * blkGroupSize; - - size_t ws_buf2_bytes_offset = math::integer_least_multiple( - invariant_total_length * blkGroupSize * sizeof(AccDataType), 64); - - if constexpr(NeedIndices) - workspace_indices_dev_ = reinterpret_cast( - reinterpret_cast(workspace_dev_) + ws_buf2_bytes_offset); - else - workspace_indices_dev_ = nullptr; - } - - std::vector inLengths_; - std::vector inStrides_; - std::vector outLengths_; - std::vector outStrides_; - - AccDataType alpha_; - AccDataType beta_; - - const InDataType* in_dev_; - OutDataType* out_dev_; - IndexDataType* out_indices_dev_; - AccDataType* workspace_dev_; - IndexDataType* workspace_indices_dev_; - - InElementwiseOperation in_elementwise_op_; - AccElementwiseOperation acc_elementwise_op_; - - int invariant_lowest_length; - int reduce_lowest_length; - size_t invariant_total_length; - size_t reduce_total_length; - - index_t blkGroupSize; - index_t kBlockTileIterations; - size_t gridSize; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor( - arg.inLengths_, arg.inStrides_, arg.blkGroupSize, 
arg.kBlockTileIterations); - const auto ws_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeWorkspace2dDescriptor( - arg.invariant_total_length, arg.blkGroupSize); - using InGridDesc_M_K = decltype(in_grid_desc_m_k); - using WorkspaceDesc_M_K = decltype(ws_desc_m_k); - - using GridwiseReduce = - GridwiseReduction_mk_to_mk_multiblock_partial_reduce; - - float avg_time = 0; - - const auto kernel = kernel_partial_reduce_multiblock; - - avg_time = launch_and_time_kernel(stream_config, - kernel, - dim3(arg.gridSize), - dim3(BlockSize), - 0, - in_grid_desc_m_k, - ws_desc_m_k, - arg.in_elementwise_op_, - arg.acc_elementwise_op_, - arg.blkGroupSize, - arg.kBlockTileIterations, - arg.in_dev_, - arg.workspace_dev_, - arg.workspace_indices_dev_); - - return (avg_time); - }; - - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - } - }; - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - const Argument* pArg = dynamic_cast(p_arg); - - if constexpr(OutDstVectorSize != 1) - return (false); - - if constexpr(InSrcVectorDim == 0) - { - if constexpr(NumInvariantDim == 0) - { - return (false); - } - else - { - if(pArg->inStrides_[NumInvariantDim - 1] != 1) - return (false); - - if(pArg->invariant_lowest_length % InSrcVectorSize != 0) - return (false); - }; - } - else - { - if(pArg->inStrides_[Rank - 1] != 1) - return (false); - - if(pArg->reduce_lowest_length % InSrcVectorSize != 0) - return (false); - }; - - // cases with small reduce_total_length should be handled by the BlockWise method - if(pArg->reduce_total_length <= BlockSize * KThreadSliceSize) - return (false); - - return (true); - }; - - std::vector GetWorkspace2dLengths(const BaseArgument* p_arg) override - { - const Argument* pArg = dynamic_cast(p_arg); - - return ( - std::vector{static_cast(pArg->invariant_total_length), pArg->blkGroupSize}); - }; - - std::unique_ptr - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, - float alpha, - float beta, - const void* in_dev, - void* out_dev, - void* out_indices_dev, - void* workspace_dev, - const InElementwiseOperation in_elementwise_op, - const AccElementwiseOperation acc_elementwise_op) override - { - return std::make_unique(inLengths, - inStrides, - outLengths, - outStrides, - reduceDims, - alpha, - beta, - static_cast(in_dev), - static_cast(out_dev), - static_cast(out_indices_dev), - static_cast(workspace_dev), - in_elementwise_op, - acc_elementwise_op); - }; - - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - }; - - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "DeviceReduceMultiBlockPartialReduce<" << BlockSize << ","; - str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; - str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; - str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">"; - // clang-format on - - return str.str(); - } -}; - -} // namespace device -} // namespace tensor_operation -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp index dacb175043..9549bf65d2 100644 --- 
a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp @@ -6,6 +6,7 @@ #include "device.hpp" #include "device_reduce.hpp" #include "device_reduce_common.hpp" +#include "gridwise_2d_reduction_multiblock.hpp" #include "gridwise_2d_reduction_threadwise.hpp" namespace ck { @@ -19,22 +20,19 @@ template -struct DeviceReduceThreadWise : public DeviceReduce +struct DeviceReduceThreadWise : public DeviceReduce { static_assert(Rank <= 6, "Bigger Rank size is not supported!"); - static_assert((BlockSize == MThreadClusterSize) && (KThreadClusterSize == 1), - "Threadwise can only be called with KThreadClusterSize be 1 !"); static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && @@ -43,7 +41,7 @@ struct DeviceReduceThreadWise : public DeviceReduce& inLengths, - const std::vector& inStrides) + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& inStrides) { const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); @@ -114,8 +112,8 @@ struct DeviceReduceThreadWise : public DeviceReduce& outLengths, - const std::vector& outStrides) + static auto MakeDst1dDescriptor(const std::vector& outLengths, + const std::vector& outStrides) { const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); @@ -143,30 +141,26 @@ struct DeviceReduceThreadWise : public DeviceReduce inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, + Argument(const std::vector inLengths, + const std::vector inStrides, + const std::vector outLengths, + const std::vector outStrides, const std::vector reduceDims, float alpha, float beta, const InDataType* in_dev, OutDataType* out_dev, - IndexDataType* out_indices_dev, - AccDataType* workspace_dev, + IndexDataType* out_index_dev, const InElementwiseOperation in_elementwise_op, - const OutElementwiseOperation acc_elementwise_op) + const AccElementwiseOperation acc_elementwise_op) : outLengths_{outLengths}, outStrides_{outStrides}, in_dev_{in_dev}, out_dev_{out_dev}, - out_indices_dev_{out_indices_dev}, + out_index_dev_{out_index_dev}, in_elementwise_op_{in_elementwise_op}, acc_elementwise_op_{acc_elementwise_op} - { - (void)workspace_dev; - inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); @@ -183,30 +177,33 @@ struct DeviceReduceThreadWise : public DeviceReduce inLengths_; - std::vector inStrides_; - std::vector outLengths_; - std::vector outStrides_; + std::vector inLengths_; + std::vector inStrides_; + std::vector outLengths_; + std::vector outStrides_; AccDataType alpha_; AccDataType beta_; const InDataType* in_dev_; OutDataType* out_dev_; - IndexDataType* out_indices_dev_; + IndexDataType* out_index_dev_; InElementwiseOperation in_elementwise_op_; - OutElementwiseOperation acc_elementwise_op_; + AccElementwiseOperation acc_elementwise_op_; - int invariant_lowest_length; - int reduce_lowest_length; - size_t invariant_total_length; - size_t reduce_total_length; + index_t invariant_lowest_length; + index_t reduce_lowest_length; + long_index_t invariant_total_length; + long_index_t reduce_total_length; + int numBlockTileIteration; size_t gridSize; }; @@ -221,30 +218,30 @@ struct 
DeviceReduceThreadWise : public DeviceReduce; - float avg_time = 0; + using GridwiseReduce = + GridwiseReduction_mk_to_m_threadwise; + const auto kernel = kernel_reduce_threadwise; + AccElementwiseOperation>; avg_time = launch_and_time_kernel(stream_config, kernel, @@ -265,9 +262,10 @@ struct DeviceReduceThreadWise : public DeviceReduce(p_arg), stream_config); - } + }; }; bool IsSupportedArgument(const BaseArgument* p_arg) override @@ -311,9 +309,7 @@ struct DeviceReduceThreadWise : public DeviceReduceinvariant_lowest_length % OutDstVectorSize != 0) return (false); - // TODO: remove this. Should return true, as long as this DeviceOP instance support this - // case for bigger reduce_total_length size, we are supposed to use BlockWise method for - // better performance + // cases with big reduce_total_length should be handled by Blockwise kernel if(pArg->reduce_total_length / KThreadSliceSize >= 32) return (false); @@ -321,20 +317,22 @@ struct DeviceReduceThreadWise : public DeviceReduce - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, + MakeArgumentPointer(const std::vector inLengths, + const std::vector inStrides, + const std::vector outLengths, + const std::vector outStrides, const std::vector reduceDims, float alpha, float beta, const void* in_dev, + const void* in_index_dev, void* out_dev, - void* out_indices_dev, - void* workspace_dev, + void* out_index_dev, const InElementwiseOperation in_elementwise_op, - const OutElementwiseOperation acc_elementwise_op) override + const AccElementwiseOperation acc_elementwise_op) override { + (void)in_index_dev; + return std::make_unique(inLengths, inStrides, outLengths, @@ -344,8 +342,7 @@ struct DeviceReduceThreadWise : public DeviceReduce(in_dev), static_cast(out_dev), - static_cast(out_indices_dev), - static_cast(workspace_dev), + static_cast(out_index_dev), in_elementwise_op, acc_elementwise_op); }; @@ -360,9 +357,9 @@ struct DeviceReduceThreadWise : public DeviceReduce"; // clang-format on diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp deleted file mode 100644 index 6826d5211c..0000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp +++ /dev/null @@ -1,886 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
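Read together, the updated IsSupportedArgument checks partition the problem space roughly as follows (an interpretation of this patch, not a documented contract):

    // reduce_total_length / KThreadSliceSize <  2      -> DeviceReduceThreadWise only
    // reduce_total_length / KThreadSliceSize in [2,32) -> ThreadWise or
    //     DeviceReduceMultiBlock with InMemoryDataOperationEnum::Set (blockwise)
    // reduce_total_length / KThreadSliceSize >= 32     -> MultiBlock only; the
    //     AtomicAdd flavor additionally requires blkGroupSize > 1, beta == 0,
    //     and invariant_lowest_length % M_BlockTileSize == 0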
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_GRIDWISE_2D_REDUCTION_BLOCKWISE_HPP -#define CK_GRIDWISE_2D_REDUCTION_BLOCKWISE_HPP - -#include "data_type.hpp" -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_accumulate.hpp" -#include "reduction_functions_blockwise.hpp" -#include "reduction_functions_threadwise.hpp" -#include "threadwise_tensor_slice_transfer.hpp" -#include "cluster_descriptor.hpp" -#include "element_wise_operation.hpp" - -namespace ck { - -template -__global__ void kernel_reduce_blockwise(const InGridDesc_M_K in_grid_desc_m_k, - const OutGridDesc_M out_grid_desc_m, - const InElementwiseOperation in_elementwise_op, - const OutElementwiseOperation acc_elementwise_op, - AccDataType alpha, - const InDataType* const __restrict__ p_in_global, - AccDataType beta, - OutDataType* const __restrict__ p_out_global, - const IndexDataType* const __restrict__ p_ws_indices_global, - IndexDataType* const __restrict__ p_indices_global) -{ - if constexpr(!NeedIndices) - { - constexpr bool IsSecondCall = false; - - GridwiseReduction::template Run(in_grid_desc_m_k, - out_grid_desc_m, - in_elementwise_op, - acc_elementwise_op, - alpha, - p_in_global, - beta, - p_out_global, - p_ws_indices_global, - p_indices_global); - } - else - { - GridwiseReduction::RunWithIndex(in_grid_desc_m_k, - out_grid_desc_m, - in_elementwise_op, - acc_elementwise_op, - alpha, - p_in_global, - beta, - p_out_global, - p_ws_indices_global, - p_indices_global); - }; -}; - -template -__global__ void -kernel_reduce_blockwise_second_call(const InGridDesc_M_K in_grid_desc_m_k, - const OutGridDesc_M out_grid_desc_m, - const InElementwiseOperation in_elementwise_op, - const OutElementwiseOperation acc_elementwise_op, - AccDataType alpha, - const InDataType* const __restrict__ p_in_global, - AccDataType beta, - OutDataType* const __restrict__ p_out_global, - const IndexDataType* const __restrict__ p_ws_indices_global, - IndexDataType* const __restrict__ p_indices_global) -{ - if constexpr(!NeedIndices) - { - constexpr bool IsSecondCall = true; - - GridwiseReduction::template Run(in_grid_desc_m_k, - out_grid_desc_m, - in_elementwise_op, - acc_elementwise_op, - alpha, - p_in_global, - beta, - p_out_global, - p_ws_indices_global, - p_indices_global); - } - else - { - GridwiseReduction::RunSecondCallWithIndex(in_grid_desc_m_k, - out_grid_desc_m, - in_elementwise_op, - acc_elementwise_op, - alpha, - p_in_global, - beta, - p_out_global, - p_ws_indices_global, - p_indices_global); - }; -}; - -template -struct GridwiseReduction_mk_to_m_blockwise -{ - static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || - (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && - (MThreadSliceSize % OutDstVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); - - using ThreadClusterLengths_M_K = Sequence; - - using ThreadBufferDimAccessOrder = - typename conditional, Sequence<0, 1>>::type; - - using ThreadClusterArrangeOrder = - typename conditional, Sequence<0, 1>>::type; - - static constexpr auto thread_cluster_desc = - 
make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); - - using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}))); - using ThreadReduceDstDesc_M = - decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); - - using PassThroughOp = tensor_operation::element_wise::PassThrough; - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; - - template - __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, - const OutGridDesc_M& out_grid_desc_m, - const InElementwiseOperation& in_elementwise_op, - const OutElementwiseOperation& acc_elementwise_op, - AccDataType alpha, - const InDataType* const __restrict__ p_in_global, - AccDataType beta, - OutDataType* const __restrict__ p_out_global, - const IndexDataType* const __restrict__ p_ws_indices_global, - IndexDataType* const __restrict__ p_indices_global) - { - if constexpr(IsSecondCall) - { - static_assert(InSrcVectorDim == 1, - "InSrcVectorDim must be 1 for BlockwiseSecondCall, please check!"); - }; - - using BlockwiseReduce = PartitionedBlockwiseReduction; - - using ThreadwiseReduce = ThreadwiseReduction; - - (void)p_ws_indices_global; - (void)p_indices_global; - - // LDS - __shared__ AccDataType p_reduce_work_buffer[BlockSize]; - - const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - - const auto in_global_buf = make_dynamic_buffer( - p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); - auto out_global_buf = make_dynamic_buffer( - p_out_global, out_grid_desc_m.GetElementSpaceSize()); - - auto reduce_work_buf = - make_dynamic_buffer(p_reduce_work_buffer, BlockSize); - - StaticBuffer - in_thread_buf; - - StaticBuffer accu_value_buf; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); - - const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_1d_id = get_block_1d_id(); - - const auto thread_cluster_idx = - thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); - - const auto thread_m_cluster_id = thread_cluster_idx[I0]; - const auto thread_k_cluster_id = thread_cluster_idx[I1]; - - using ThreadBufferLengths = Sequence; - constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * KThreadSliceSize)); - - constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); - - const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize; - - index_t reducedTiles = 0; - do - { - threadwise_src_load.Run(in_grid_desc_m_k, - in_global_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_buf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - // do element-wise pre-reduction operation - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - in_elementwise_op(in_thread_buf(Number{}), - in_thread_buf(Number{})); - }); - }); - - ThreadwiseReduce::Reduce(in_thread_buf, 
accu_value_buf); - - threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - - reducedTiles++; - } while(reducedTiles < toReduceTiles); - - constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; - - static_for<0, MThreadSliceSize, 1>{}( - [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); }); - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - if(thread_k_cluster_id == 0) - { - acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); - - accu_value_buf(I) *= alpha; - } - }); - - if(thread_k_cluster_id == 0) - { - if constexpr(!BetaIsZero) - { - if(!float_equal_zero{}(beta)) - { - StaticBuffer - priorDstValueBuf; - - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - OutDstVectorSize, - 1, - false>( - out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize)); - - threadwise_dst_load.Run(out_grid_desc_m, - out_global_buf, - reduced_data_desc, - make_tuple(I0), - priorDstValueBuf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; - }); - }; - }; - - auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - OutDstVectorSize, - InMemoryDataOperationEnum::Set, - 1, - true>( - out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); - - threadwise_dst_store.Run( - reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf); - } - }; - - __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, - const OutGridDesc_M& out_grid_desc_m, - const InElementwiseOperation& in_elementwise_op, - const OutElementwiseOperation& acc_elementwise_op, - AccDataType alpha, - const InDataType* const __restrict__ p_in_global, - AccDataType beta, - OutDataType* const __restrict__ p_out_global, - const IndexDataType* const __restrict__ p_ws_indices_global, - IndexDataType* const __restrict__ p_indices_global) - { - using BlockwiseReduceWithIndex = - PartitionedBlockwiseReductionWithIndex; - - using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck; - - (void)p_ws_indices_global; - - // LDS - __shared__ AccDataType p_reduce_work_val_buffer[BlockSize]; - __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize]; - - const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - - const auto in_global_buf = make_dynamic_buffer( - p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); - auto out_global_val_buf = make_dynamic_buffer( - p_out_global, out_grid_desc_m.GetElementSpaceSize()); - auto out_global_idx_buf = make_dynamic_buffer( - p_indices_global, out_grid_desc_m.GetElementSpaceSize()); - - auto reduce_work_val_buf = - make_dynamic_buffer(p_reduce_work_val_buffer, BlockSize); - auto reduce_work_idx_buf = - make_dynamic_buffer(p_reduce_work_idx_buffer, BlockSize); - - StaticBuffer - in_thread_val_buf; - - StaticBuffer - in_thread_idx_buf; - - StaticBuffer accu_value_buf; - StaticBuffer accu_index_buf; - - const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_1d_id = get_block_1d_id(); - - const auto thread_cluster_idx = - thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); - - const auto thread_m_cluster_id = thread_cluster_idx[I0]; - const auto thread_k_cluster_id = 
thread_cluster_idx[I1]; - - using ThreadBufferLengths = Sequence; - constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * KThreadSliceSize)); - - index_t indexOffset = 0; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) = zeroVal; - accu_index_buf(I) = 0; - }); - - constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); - - const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize; - - index_t reducedTiles = 0; - do - { - // load the thread slice - threadwise_src_load.Run(in_grid_desc_m_k, - in_global_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_val_buf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - - // initialize the indices for the per-thread to-reduce values - in_thread_idx_buf(Number{}) = - indexOffset + thread_k_cluster_id * KThreadSliceSize + iK(); - - // do element-wise pre-reduction operation - in_elementwise_op(in_thread_val_buf(Number{}), - in_thread_val_buf(Number{})); - }); - - AccDataType tmpValue = zeroVal; - IndexDataType tmpIndex = 0; - - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - - AccumulationWithIndex::Calculate(tmpValue, - in_thread_val_buf[Number{}], - tmpIndex, - in_thread_idx_buf[Number{}]); - }); - - BlockwiseReduceWithIndex::Reduce( - reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); - - AccumulationWithIndex::Calculate( - accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); - }); - - threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - - indexOffset += K_BlockTileSize; - reducedTiles++; - } while(reducedTiles < toReduceTiles); - - constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - if(thread_k_cluster_id == 0) - { - // for indiced operation, acc_elementwise_op shoud do nothing - acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); - - accu_value_buf(I) *= alpha; - } - }); - - if(thread_k_cluster_id == 0) - { - if constexpr(!BetaIsZero) - { - if(!float_equal_zero{}(beta)) - { - StaticBuffer - priorDstValueBuf; - - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - OutDstVectorSize, - 1, - false>( - out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize)); - - threadwise_dst_load.Run(out_grid_desc_m, - out_global_val_buf, - reduced_data_desc, - make_tuple(I0), - priorDstValueBuf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; - }); - }; - }; - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - OutDstVectorSize, - InMemoryDataOperationEnum::Set, - 1, - false>( - out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - OutDstVectorSize, - InMemoryDataOperationEnum::Set, - 1, - false>( 
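The index-tracking accumulation used in this (removed) path, and again in the new multiblock code, pairs each candidate value with its flattened K position; a hedged sketch of what AccumulateWithIndexAndNanCheck::Calculate does, reconstructed from its call sites rather than quoted:

    // Calculate(accu, val, accuIdx, idx) behaves, in spirit, like:
    //   if(val wins against accu under ReduceOperation (e.g. '>' for max),
    //      or PropagateNan is set and val is NaN)
    //   { accu = val; accuIdx = idx; }
    // After the threadwise pass and the LDS blockwise pass, accu_index_buf
    // holds the K offset of the winning element for each M row.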
- out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); - - threadwise_dst_val_store.Run(reduced_data_desc, - make_tuple(I0), - accu_value_buf, - out_grid_desc_m, - out_global_val_buf); - threadwise_dst_idx_store.Run(reduced_data_desc, - make_tuple(I0), - accu_index_buf, - out_grid_desc_m, - out_global_idx_buf); - } - }; - - __device__ static void - RunSecondCallWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, - const OutGridDesc_M& out_grid_desc_m, - const InElementwiseOperation in_elementwise_op, - const OutElementwiseOperation acc_elementwise_op, - AccDataType alpha, - const InDataType* const __restrict__ p_ws_values_global, - AccDataType beta, - OutDataType* const __restrict__ p_out_global, - const IndexDataType* const __restrict__ p_ws_indices_global, - IndexDataType* const __restrict__ p_indices_global) - { - static_assert(InSrcVectorDim == 1, - "InSrcVectorDim must be 1 for BlockwiseSecondCall, please check!"); - - using BlockwiseReduceWithIndex = - PartitionedBlockwiseReductionWithIndex, - ThreadClusterArrangeOrder, - ReduceOperation, - PropagateNan>; - - using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck; - - (void)in_elementwise_op; - - // LDS - __shared__ AccDataType p_reduce_work_val_buffer[BlockSize]; - __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize]; - - const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - - const auto src_global_val_buf = - make_dynamic_buffer(p_ws_values_global, - in_grid_desc_m_k.GetElementSpaceSize(), - type_convert(zeroVal)); - const auto src_global_idx_buf = make_dynamic_buffer( - p_ws_indices_global, in_grid_desc_m_k.GetElementSpaceSize()); - auto out_global_val_buf = make_dynamic_buffer( - p_out_global, out_grid_desc_m.GetElementSpaceSize()); - auto out_global_idx_buf = make_dynamic_buffer( - p_indices_global, out_grid_desc_m.GetElementSpaceSize()); - - auto reduce_work_val_buf = - make_dynamic_buffer(p_reduce_work_val_buffer, BlockSize); - auto reduce_work_idx_buf = - make_dynamic_buffer(p_reduce_work_idx_buffer, BlockSize); - - StaticBuffer - in_thread_val_buf; - - StaticBuffer - in_thread_idx_buf; - - StaticBuffer accu_value_buf; - StaticBuffer accu_index_buf; - - const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_1d_id = get_block_1d_id(); - - const auto thread_cluster_idx = - thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); - - const auto thread_m_cluster_id = thread_cluster_idx[I0]; - const auto thread_k_cluster_id = thread_cluster_idx[I1]; - - using ThreadBufferLengths = Sequence; - constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - auto threadwise_src_val_load = - ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * KThreadSliceSize)); - - auto threadwise_src_idx_load = - ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * KThreadSliceSize)); - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) = zeroVal; - accu_index_buf(I) = 0; - }); - - constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); - - const index_t toReduceTiles = 
(toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize; - - index_t reducedTiles = 0; - do - { - // load the thread slice - threadwise_src_val_load.Run(in_grid_desc_m_k, - src_global_val_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_val_buf); - threadwise_src_idx_load.Run(in_grid_desc_m_k, - src_global_idx_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_idx_buf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - AccDataType tmpValue = zeroVal; - IndexDataType tmpIndex = 0; - - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - - AccumulationWithIndex::Calculate(tmpValue, - in_thread_val_buf[Number{}], - tmpIndex, - in_thread_idx_buf[Number{}]); - }); - - BlockwiseReduceWithIndex::Reduce( - reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); - - AccumulationWithIndex::Calculate( - accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); - }); - - threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - threadwise_src_idx_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - - reducedTiles++; - } while(reducedTiles < toReduceTiles); - - constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - if(thread_k_cluster_id == 0) - { - // for indiced operation, acc_elementwise_op shoud do nothing - acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); - - accu_value_buf(I) *= alpha; - } - }); - - if(thread_k_cluster_id == 0) - { - if constexpr(!BetaIsZero) - { - if(!float_equal_zero{}(beta)) - { - StaticBuffer - priorDstValueBuf; - - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - OutDstVectorSize, - 1, - true>( - out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize)); - - threadwise_dst_load.Run(out_grid_desc_m, - out_global_val_buf, - reduced_data_desc, - make_tuple(I0), - priorDstValueBuf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; - }); - }; - }; - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - OutDstVectorSize, - InMemoryDataOperationEnum::Set, - 1, - true>( - out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - OutDstVectorSize, - InMemoryDataOperationEnum::Set, - 1, - true>( - out_grid_desc_m, - make_multi_index(block_global_1d_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); - - threadwise_dst_val_store.Run(reduced_data_desc, - make_tuple(I0), - accu_value_buf, - out_grid_desc_m, - out_global_val_buf); - threadwise_dst_idx_store.Run(reduced_data_desc, - make_tuple(I0), - accu_index_buf, - out_grid_desc_m, - out_global_idx_buf); - } - }; -}; - -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp new file mode 100644 index 0000000000..f3e9836d4f --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp @@ -0,0 +1,638 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_HPP +#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_HPP + +#include "reduction_common.hpp" +#include "reduction_operator.hpp" +#include "reduction_functions_accumulate.hpp" +#include "reduction_functions_blockwise.hpp" +#include "reduction_functions_threadwise.hpp" + +#include "threadwise_tensor_slice_transfer.hpp" +#include "element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_reduce_multiblock(const InGridDesc_M_K in_grid_desc_m_k, + const OutGridDesc_M out_grid_desc_m, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op, + index_t block_group_size, + index_t num_k_block_tile_iteration, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + const IndexDataType* const __restrict__ p_in_index_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global, + IndexDataType* const __restrict__ p_out_index_global) +{ + if constexpr(!OutputIndex) + { + (void)p_in_index_global; + (void)p_out_index_global; + + GridwiseReduction::Run(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + block_group_size, + num_k_block_tile_iteration, + alpha, + p_in_value_global, + beta, + p_out_value_global); + } + else + { + GridwiseReduction::template RunWithIndex(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + num_k_block_tile_iteration, + alpha, + p_in_value_global, + p_in_index_global, + beta, + p_out_value_global, + p_out_index_global); + }; +}; + +template +struct GridwiseReduction_mk_to_m_multiblock +{ + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && + (MThreadSliceSize % OutDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + 
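An example tile geometry for the cluster/slice parameters handled above (values illustrative; any choice with BlockSize == MThreadClusterSize * KThreadClusterSize works):

    // BlockSize = 256, MThreadClusterSize = 32, KThreadClusterSize = 8,
    // MThreadSliceSize = 4, KThreadSliceSize = 8
    //   -> M_BlockTileSize = 32 * 4 = 128, K_BlockTileSize = 8 * 8 = 64
    // Each thread privately reduces a 4 x 8 slice per tile iteration; when
    // InSrcVectorDim == 0, reorder_thread_cluster flips the cluster arrange
    // order so neighboring threads touch neighboring M elements, keeping the
    // M-dimension loads vectorizable.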
+ using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using BlockwiseReduce = PartitionedBlockwiseReduction; + + using ThreadwiseReduce = ThreadwiseReduction; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + using Accumulation = detail::AccumulateWithNanCheck; + + __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op, + index_t block_group_size, + index_t num_k_block_tile_iteration, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global) + { + const auto zeroVal = ReduceOperation::GetReductionZeroVal(); + + // LDS + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; + + const auto in_global_val_buf = + make_dynamic_buffer(p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + type_convert(zeroVal)); + auto out_global_val_buf = make_dynamic_buffer( + p_out_value_global, out_grid_desc_m.GetElementSpaceSize()); + + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); + + StaticBuffer + in_thread_buf; + + StaticBuffer accu_value_buf; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / block_group_size; + const index_t block_local_id = block_global_id % block_group_size; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + block_local_id * reduceSizePerBlock + + thread_k_cluster_id * KThreadSliceSize)); + + constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); + + index_t reducedTiles = 0; + do + { + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + in_elementwise_op(in_thread_buf(Number{}), + in_thread_buf(Number{})); + }); + }); + + ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + + 
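At this point each block has privately reduced its share of the K axis. As a reference for the index arithmetic above: `block_group_size` consecutive blocks cooperate on one M tile, and each owns a disjoint K range of `reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration` elements. A standalone sketch with hypothetical sizes:

```cpp
// How blkgroup_id / block_local_id partition the reduction grid.
#include <cstdio>

int main()
{
    const int M_BlockTileSize = 128, K_BlockTileSize = 512;
    const int block_group_size = 4, num_k_block_tile_iteration = 2;
    const int reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;

    for(int block_global_id = 0; block_global_id < 8; ++block_global_id)
    {
        const int blkgroup_id    = block_global_id / block_group_size; // which M tile
        const int block_local_id = block_global_id % block_group_size; // which K slice
        std::printf("block %d: rows [%d, %d), k offset %d, k length %d\n",
                    block_global_id,
                    blkgroup_id * M_BlockTileSize,
                    (blkgroup_id + 1) * M_BlockTileSize,
                    block_local_id * reduceSizePerBlock,
                    reduceSizePerBlock);
    }
    return 0;
}
```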
constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; + + static_for<0, MThreadSliceSize, 1>{}( + [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if(thread_k_cluster_id == 0) + { + acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); + + accu_value_buf(I) *= alpha; + } + }); + + if(thread_k_cluster_id == 0) + { + if(block_group_size == 0 && !float_equal_zero{}(beta)) + { + StaticBuffer + priorDstValueBuf; + + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + OutDstVectorSize, + 1, + false>( + out_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_dst_load.Run(out_grid_desc_m, + out_global_val_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValueBuf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; + }); + }; + + auto threadwise_dst_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSize, + OutMemoryDataOperation, + 1, + true>( + out_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_store.Run(reduced_data_desc, + make_tuple(I0), + accu_value_buf, + out_grid_desc_m, + out_global_val_buf); + } + }; + + template + __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op, + index_t num_k_block_tile_iteration, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + const IndexDataType* const __restrict__ p_in_index_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global, + IndexDataType* const __restrict__ p_out_index_global) + { + using BlockwiseReduceWithIndex = + PartitionedBlockwiseReductionWithIndex, + ThreadClusterArrangeOrder, + ReduceOperation, + PropagateNan>; + + using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck; + + (void)in_elementwise_op; + + // LDS + __shared__ AccDataType p_reduce_work_val_buffer[BlockSize]; + __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize]; + + const auto zeroVal = ReduceOperation::GetReductionZeroVal(); + + const auto in_global_val_buf = + make_dynamic_buffer(p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + type_convert(zeroVal)); + const auto in_global_idx_buf = make_dynamic_buffer( + p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize()); + auto out_global_val_buf = make_dynamic_buffer( + p_out_value_global, out_grid_desc_m.GetElementSpaceSize()); + auto out_global_idx_buf = make_dynamic_buffer( + p_out_index_global, out_grid_desc_m.GetElementSpaceSize()); + + auto reduce_work_val_buf = + make_dynamic_buffer(p_reduce_work_val_buffer, BlockSize); + auto reduce_work_idx_buf = + make_dynamic_buffer(p_reduce_work_idx_buffer, BlockSize); + + StaticBuffer + in_thread_val_buf; + + StaticBuffer + in_thread_idx_buf; + + StaticBuffer accu_value_buf; + StaticBuffer accu_index_buf; + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_1d_id = get_block_1d_id(); + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = 
thread_cluster_idx[I1]; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_val_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) = zeroVal; + accu_index_buf(I) = 0; + }); + + constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); + + index_t reducedTiles = 0; + + if constexpr(HaveIndexInput) + { + auto threadwise_src_idx_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + do + { + // load the thread slice + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); + threadwise_src_idx_load.Run(in_grid_desc_m_k, + in_global_idx_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_idx_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + AccDataType tmpValue = zeroVal; + IndexDataType tmpIndex = 0; + + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + AccumulationWithIndex::Calculate(tmpValue, + in_thread_val_buf[Number{}], + tmpIndex, + in_thread_idx_buf[Number{}]); + }); + + BlockwiseReduceWithIndex::Reduce( + reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); + + AccumulationWithIndex::Calculate( + accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); + }); + + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + threadwise_src_idx_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + } + else + { + index_t indexOffset = 0; + + do + { + // load the thread slice + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + // initialize the indices for the per-thread to-reduce values + in_thread_idx_buf(Number{}) = + indexOffset + thread_k_cluster_id * KThreadSliceSize + iK(); + + // do element-wise pre-reduction operation + in_elementwise_op(in_thread_val_buf(Number{}), + in_thread_val_buf(Number{})); + }); + + AccDataType tmpValue = zeroVal; + IndexDataType tmpIndex = 0; + + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + AccumulationWithIndex::Calculate(tmpValue, + in_thread_val_buf[Number{}], + tmpIndex, + in_thread_idx_buf[Number{}]); + }); + + BlockwiseReduceWithIndex::Reduce( + reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); + + AccumulationWithIndex::Calculate( + accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); + }); + + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + indexOffset += K_BlockTileSize; + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + }; + + constexpr auto reduced_data_desc = 
ThreadReduceDstDesc_M{}; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if(thread_k_cluster_id == 0) + { + // for indexed operation, acc_elementwise_op should do nothing + acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); + + accu_value_buf(I) *= alpha; + } + }); + + if(thread_k_cluster_id == 0) + { + if(!float_equal_zero{}(beta)) + { + StaticBuffer + priorDstValueBuf; + + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + OutDstVectorSize, + 1, + true>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_dst_load.Run(out_grid_desc_m, + out_global_val_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValueBuf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; + }); + }; + + auto threadwise_dst_val_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + auto threadwise_dst_idx_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_val_store.Run(reduced_data_desc, + make_tuple(I0), + accu_value_buf, + out_grid_desc_m, + out_global_val_buf); + threadwise_dst_idx_store.Run(reduced_data_desc, + make_tuple(I0), + accu_index_buf, + out_grid_desc_m, + out_global_idx_buf); + } + }; +}; + +} // namespace ck +#endif diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp deleted file mode 100644 index 4e325f3573..0000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp +++ /dev/null @@ -1,269 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- * - *******************************************************************************/ -#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_ATOMIC_ADD_HPP -#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_ATOMIC_ADD_HPP - -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_accumulate.hpp" -#include "reduction_functions_blockwise.hpp" -#include "reduction_functions_threadwise.hpp" - -#include "threadwise_tensor_slice_transfer.hpp" -#include "element_wise_operation.hpp" - -namespace ck { - -template -__global__ void -kernel_reduce_multiblock_atocmi_add(const InGridDesc_M_K in_grid_desc_m_k, - const OutGridDesc_M out_grid_desc_m, - const InElementwiseOperation in_elementwise_op, - const AccElementwiseOperation acc_elementwise_op, - index_t block_group_size, - index_t num_k_block_tile_iteration, - AccDataType alpha, - const InDataType* const __restrict__ p_in_global, - OutDataType* const __restrict__ p_out_global) -{ - GridwiseReduction::Run(in_grid_desc_m_k, - out_grid_desc_m, - in_elementwise_op, - acc_elementwise_op, - block_group_size, - num_k_block_tile_iteration, - alpha, - p_in_global, - p_out_global); -}; - -template -struct GridwiseReduction_mk_to_m_multiblock_atomic_add -{ - static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || - (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && - (MThreadSliceSize % OutDstVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); - - using ThreadClusterLengths_M_K = Sequence; - - using ThreadBufferDimAccessOrder = - typename conditional, Sequence<0, 1>>::type; - - using ThreadClusterArrangeOrder = - typename conditional, Sequence<0, 1>>::type; - - static constexpr auto thread_cluster_desc = - make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); - - using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}))); - using ThreadReduceDstDesc_M = - decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); - - using BlockwiseReduce = PartitionedBlockwiseReduction; - - using ThreadwiseReduce = ThreadwiseReduction; - - using PassThroughOp = tensor_operation::element_wise::PassThrough; - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; - - using Accumulation = detail::AccumulateWithNanCheck; - - __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, - const OutGridDesc_M& out_grid_desc_m, - const InElementwiseOperation& in_elementwise_op, - const AccElementwiseOperation& acc_elementwise_op, - index_t block_group_size, - index_t num_k_block_tile_iteration, - AccDataType alpha, - const InDataType* const __restrict__ p_in_global, - OutDataType* const __restrict__ p_out_global) - { - const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - - // LDS - __shared__ AccDataType p_reduce_work_buffer[BlockSize]; - - const auto in_global_buf = make_dynamic_buffer( - p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); - auto out_global_buf = make_dynamic_buffer( - p_out_global, out_grid_desc_m.GetElementSpaceSize()); - - auto reduce_work_buf = - make_dynamic_buffer(p_reduce_work_buffer, BlockSize); - - StaticBuffer - in_thread_buf; - - 
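// Note: the input dynamic buffer above is created with an invalid-element value of
// type_convert<InDataType>(zeroVal), so out-of-range reads return the reduction
// identity and padded vector lanes cannot perturb the accumulated result.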
StaticBuffer accu_value_buf; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_id = get_block_1d_id(); - const index_t blkgroup_id = block_global_id / block_group_size; - const index_t block_local_id = block_global_id % block_group_size; - - const auto thread_cluster_idx = - thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); - - const auto thread_m_cluster_id = thread_cluster_idx[I0]; - const auto thread_k_cluster_id = thread_cluster_idx[I1]; - - const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; - - using ThreadBufferLengths = Sequence; - constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, - make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, - block_local_id * reduceSizePerBlock + - thread_k_cluster_id * KThreadSliceSize)); - - constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); - - index_t reducedTiles = 0; - do - { - threadwise_src_load.Run(in_grid_desc_m_k, - in_global_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_buf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - // do element-wise pre-reduction operation - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - in_elementwise_op(in_thread_buf(Number{}), - in_thread_buf(Number{})); - }); - }); - - ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf); - - threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - - reducedTiles++; - } while(reducedTiles < num_k_block_tile_iteration); - - constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; - - // Each block executes multiple parallel reductions on the LDS, and by atomic-adding its - // reduced output to the global location corresponding to each invariant dimension to get a - // consistent reduced result for that invariant dimension. due to the using of vector_load, - // each block/thread is involved into multiple invarirant dimensions. 
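The comment above is the whole contract of the atomic-add finalization: each of the `block_group_size` partial results lands in the same output slot, so the slot must start at the operation's identity (zero for Add) before the grid launches, and the final value is independent of the order in which blocks commit. A minimal sketch of that contract (plain C++; the per-block partials are hypothetical):

```cpp
// Order-independent finalization with atomic adds: all partials of one block
// group accumulate into a single output slot that starts at the identity.
#include <cstdio>

int main()
{
    const int block_group_size = 4;
    const double partial[4] = {1.0, 2.5, -0.5, 3.0}; // hypothetical per-block sums

    double out = 0.0; // output must be pre-initialized to the identity of Add
    for(int b = 0; b < block_group_size; ++b)
        out += partial[b]; // done with atomic_add on the GPU, in any order

    std::printf("final reduced value: %f\n", out); // 6.0 regardless of commit order
    return 0;
}
```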
- static_for<0, MThreadSliceSize, 1>{}( - [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); }); - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - if(thread_k_cluster_id == 0) - { - acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); - - accu_value_buf(I) *= alpha; - } - }); - - if(thread_k_cluster_id == 0) - { - auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - OutDstVectorSize, - InMemoryDataOperationEnum::AtomicAdd, - 1, - true>( - out_grid_desc_m, - make_multi_index(blkgroup_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); - - threadwise_dst_store.Run( - reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf); - } - }; -}; - -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp deleted file mode 100644 index d1be1f5275..0000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp +++ /dev/null @@ -1,487 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_PARTIAL_REDUCE_HPP -#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_PARTIAL_REDUCE_HPP - -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_accumulate.hpp" -#include "reduction_functions_blockwise.hpp" -#include "reduction_functions_threadwise.hpp" -#include "threadwise_tensor_slice_transfer.hpp" -#include "cluster_descriptor.hpp" -#include "element_wise_operation.hpp" - -namespace ck { - -template -__global__ void -kernel_partial_reduce_multiblock(const InGridDesc_M_K in_grid_desc_m_k, - const WorkspaceDesc_M_K workspace_desc_m_k, - const InElementwiseOperation in_elementwise_op, - const AccElementwiseOperation acc_elementwise_op, - index_t block_group_size, - index_t num_k_block_tile_iteration, - const InDataType* const __restrict__ p_src_global, - AccDataType* const __restrict__ p_ws_values_global, - IndexDataType* const __restrict__ p_ws_indices_global) - -{ - if constexpr(!NeedIndices) - { - GridwiseReduction::Run(in_grid_desc_m_k, - workspace_desc_m_k, - in_elementwise_op, - acc_elementwise_op, - block_group_size, - num_k_block_tile_iteration, - p_src_global, - p_ws_values_global, - p_ws_indices_global); - } - else - { - GridwiseReduction::RunWithIndex(in_grid_desc_m_k, - workspace_desc_m_k, - in_elementwise_op, - acc_elementwise_op, - block_group_size, - num_k_block_tile_iteration, - p_src_global, - p_ws_values_global, - p_ws_indices_global); - }; -}; - -template -struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce -{ - static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || - (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!"); - - static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); - - using ThreadClusterLengths_M_K = Sequence; - - using ThreadBufferDimAccessOrder = - typename conditional, Sequence<0, 1>>::type; - - using ThreadClusterArrangeOrder = - typename conditional, Sequence<0, 1>>::type; - - static constexpr auto thread_cluster_desc = - make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); - - using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}))); - using ThreadReduceDstDesc_M = - decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); - - using PassThroughOp = tensor_operation::element_wise::PassThrough; - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; - - __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, - const WorkspaceDesc_M_K& workspace_desc_m_k, - const InElementwiseOperation& in_elementwise_op, - const AccElementwiseOperation& acc_elementwise_op, - index_t block_group_size, - index_t num_k_block_tile_iteration, - const InDataType* const __restrict__ p_src_global, - AccDataType* const __restrict__ p_ws_values_global, - IndexDataType* const __restrict__ p_ws_indices_global) - { - using BlockwiseReduce = PartitionedBlockwiseReduction; - - using ThreadwiseReduce = ThreadwiseReduction; - - (void)p_ws_indices_global; - 
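// both parameters are intentionally unused here: the value-only pass writes no
// index workspace, and acc_elementwise_op is only applied when the workspace is
// finally reduced to the output by the follow-up kernel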
(void)acc_elementwise_op; - - const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - - // LDS - __shared__ AccDataType p_reduce_work_buffer[BlockSize]; - - const auto in_global_buf = - make_dynamic_buffer(p_src_global, - in_grid_desc_m_k.GetElementSpaceSize(), - type_convert(zeroVal)); - auto workspace_global_buf = make_dynamic_buffer( - p_ws_values_global, workspace_desc_m_k.GetElementSpaceSize()); - - auto reduce_work_buf = - make_dynamic_buffer(p_reduce_work_buffer, BlockSize); - - StaticBuffer - in_thread_buf; - - StaticBuffer accu_value_buf; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_id = get_block_1d_id(); - const index_t blkgroup_id = block_global_id / block_group_size; - const index_t block_local_id = block_global_id % block_group_size; - - const auto thread_cluster_idx = - thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); - - const auto thread_m_cluster_id = thread_cluster_idx[I0]; - const auto thread_k_cluster_id = thread_cluster_idx[I1]; - - const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; - - using ThreadBufferLengths = Sequence; - constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, - make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, - block_local_id * reduceSizePerBlock + - thread_k_cluster_id * KThreadSliceSize)); - - constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); - - index_t reducedTiles = 0; - do - { - threadwise_src_load.Run(in_grid_desc_m_k, - in_global_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_buf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - // do element-wise pre-reduction operation - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - in_elementwise_op(in_thread_buf(Number{}), - in_thread_buf(Number{})); - }); - }); - - ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf); - - threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - - reducedTiles++; - } while(reducedTiles < num_k_block_tile_iteration); - - // Each block executes multiple parallel reductions on the LDS, and due to the using of - // vector_load, each block/thread is involved into multiple invarirant dimensions. 
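For this removed partial-reduce path, the workspace written below is an M x `block_group_size` matrix: pass 1 stores one partial value per (m, block_local_id), and a follow-up kernel reduces along the second dimension. A host-side sketch of that layout with hypothetical values, using Max as the example operation:

```cpp
// Two-pass layout of the partial-reduce path: pass 1 fills an
// M x block_group_size workspace, pass 2 reduces along the second dimension.
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    const int M = 3, block_group_size = 4;
    std::vector<float> workspace(M * block_group_size);

    // stand-in for pass 1: block b stores its partial Max for row m
    for(int m = 0; m < M; ++m)
        for(int b = 0; b < block_group_size; ++b)
            workspace[m * block_group_size + b] = m + 0.1f * b; // hypothetical values

    // stand-in for pass 2: reduce each row of the workspace
    for(int m = 0; m < M; ++m)
    {
        float acc = workspace[m * block_group_size];
        for(int b = 1; b < block_group_size; ++b)
            acc = std::max(acc, workspace[m * block_group_size + b]);
        std::printf("out[%d] = %.1f\n", m, acc);
    }
    return 0;
}
```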
- static_for<0, MThreadSliceSize, 1>{}( - [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); }); - - constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number<1>{})); - - if(thread_k_cluster_id == 0) - { - auto threadwise_workspace_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0, 1>, - 1, - 1, - InMemoryDataOperationEnum::Set, - 1, - true>( - workspace_desc_m_k, - make_multi_index(blkgroup_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - block_local_id), - PassThroughOp{}); - - threadwise_workspace_store.Run(reduced_data_desc, - make_tuple(I0, I0), - accu_value_buf, - workspace_desc_m_k, - workspace_global_buf); - } - }; - - __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, - const WorkspaceDesc_M_K& workspace_desc_m_k, - const InElementwiseOperation& in_elementwise_op, - const AccElementwiseOperation& acc_elementwise_op, - index_t block_group_size, - index_t num_k_block_tile_iteration, - const InDataType* const __restrict__ p_src_global, - AccDataType* const __restrict__ p_ws_values_global, - IndexDataType* const __restrict__ p_ws_indices_global) - { - using BlockwiseReduceWithIndex = - PartitionedBlockwiseReductionWithIndex; - - using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck; - - (void)acc_elementwise_op; - - const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - - // LDS - __shared__ AccDataType p_reduce_work_val_buffer[BlockSize]; - __shared__ index_t p_reduce_work_idx_buffer[BlockSize]; - - const auto in_global_buf = - make_dynamic_buffer(p_src_global, - in_grid_desc_m_k.GetElementSpaceSize(), - type_convert(zeroVal)); - auto workspace_global_val_buf = make_dynamic_buffer( - p_ws_values_global, workspace_desc_m_k.GetElementSpaceSize()); - auto workspace_global_idx_buf = make_dynamic_buffer( - p_ws_indices_global, workspace_desc_m_k.GetElementSpaceSize()); - - auto reduce_work_val_buf = - make_dynamic_buffer(p_reduce_work_val_buffer, BlockSize); - auto reduce_work_idx_buf = - make_dynamic_buffer(p_reduce_work_idx_buffer, BlockSize); - - StaticBuffer - in_thread_val_buf; - StaticBuffer - in_thread_idx_buf; - - StaticBuffer accu_value_buf; - StaticBuffer accu_index_buf; - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_id = get_block_1d_id(); - const index_t blkgroup_id = block_global_id / block_group_size; - const index_t block_local_id = block_global_id % block_group_size; - - const auto thread_cluster_idx = - thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); - - const auto thread_m_cluster_id = thread_cluster_idx[I0]; - const auto thread_k_cluster_id = thread_cluster_idx[I1]; - - const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; - - using ThreadBufferLengths = Sequence; - constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, - make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, - block_local_id * reduceSizePerBlock + - thread_k_cluster_id * KThreadSliceSize)); - - constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); - - index_t indexOffset = block_local_id * reduceSizePerBlock; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) = zeroVal; - accu_index_buf(I) = 0; - }); - - index_t reducedTiles = 0; - do - { - // load 
the thread slice - threadwise_src_load.Run(in_grid_desc_m_k, - in_global_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_val_buf); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - - // initialize the indices for the per-thread to-reduce values - in_thread_idx_buf(Number{}) = - indexOffset + thread_k_cluster_id * KThreadSliceSize + iK(); - - // do element-wise pre-reduction operation - in_elementwise_op(in_thread_val_buf(Number{}), - in_thread_val_buf(Number{})); - }); - - AccDataType tmpValue = zeroVal; - IndexDataType tmpIndex = 0; - - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); - - AccumulationWithIndex::Calculate(tmpValue, - in_thread_val_buf[Number{}], - tmpIndex, - in_thread_idx_buf[Number{}]); - }); - - BlockwiseReduceWithIndex::Reduce( - reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); - - AccumulationWithIndex::Calculate( - accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); - }); - - threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - - indexOffset += K_BlockTileSize; - - reducedTiles++; - } while(reducedTiles < num_k_block_tile_iteration); - - constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number<1>{})); - - if(thread_k_cluster_id == 0) - { - auto threadwise_workspace_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0, 1>, - 1, - 1, - InMemoryDataOperationEnum::Set, - 1, - true>( - workspace_desc_m_k, - make_multi_index(blkgroup_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - block_local_id), - PassThroughOp{}); - - auto threadwise_workspace_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0, 1>, - 1, - 1, - InMemoryDataOperationEnum::Set, - 1, - true>( - workspace_desc_m_k, - make_multi_index(blkgroup_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - block_local_id), - PassThroughOp{}); - - threadwise_workspace_val_store.Run(reduced_data_desc, - make_tuple(I0, I0), - accu_value_buf, - workspace_desc_m_k, - workspace_global_val_buf); - threadwise_workspace_idx_store.Run(reduced_data_desc, - make_tuple(I0, I0), - accu_index_buf, - workspace_desc_m_k, - workspace_global_idx_buf); - } - }; -}; - -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp index c047f7e375..ff01b88146 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp @@ -37,7 +37,8 @@ namespace ck { template (in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + alpha, + p_in_value_global, + p_in_index_global, + beta, + p_out_value_global, + p_out_index_global); }; }; @@ -91,11 +93,9 @@ template ; - (void)p_indices_global; - const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - const auto in_global_buf = make_dynamic_buffer( - p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); + const auto in_global_val_buf = + make_dynamic_buffer(p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + type_convert(zeroVal)); auto dst_global_buf = make_dynamic_buffer( - p_out_global, out_grid_desc_m.GetElementSpaceSize()); 
+ p_out_value_global, out_grid_desc_m.GetElementSpaceSize()); StaticBuffer in_thread_buf; @@ -160,28 +159,29 @@ struct GridwiseReduction_mk_to_m_threadwise index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); + auto threadwise_src_val_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize); index_t reducedLength = 0; do { - threadwise_src_load.Run(in_grid_desc_m_k, - in_global_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_buf); + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { // do element-wise pre-reduction operation @@ -194,7 +194,7 @@ struct GridwiseReduction_mk_to_m_threadwise ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf); - threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); reducedLength += KThreadSliceSize; } while(reducedLength < toReduceLength); @@ -207,68 +207,65 @@ struct GridwiseReduction_mk_to_m_threadwise constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; - if constexpr(!BetaIsZero) + if(!float_equal_zero{}(beta)) { - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - true>( - out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize)); + auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + true>( + out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize)); - StaticBuffer - priorDstValue_buf; + StaticBuffer + priorDstValue_buf; - threadwise_dst_load.Run(out_grid_desc_m, - dst_global_buf, - reduced_data_desc, - make_tuple(I0), - priorDstValue_buf); + threadwise_dst_load.Run(out_grid_desc_m, + dst_global_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValue_buf); - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta; - }); - }; + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta; + }); }; - auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - OutDstVectorSize, - InMemoryDataOperationEnum::Set, - 1, - false>( - out_grid_desc_m, - make_multi_index(thread_global_1d_id * MThreadSliceSize), - PassThroughOp{}); + auto threadwise_dst_store = ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSize, + OutMemoryDataOperation, + 1, + false>( + out_grid_desc_m, + make_multi_index(thread_global_1d_id * MThreadSliceSize), + PassThroughOp{}); threadwise_dst_store.Run( reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, dst_global_buf); }; - __device__ static void RunWithIndices(const InGridDesc_M_K& in_grid_desc_m_k, - const OutGridDesc_M& out_grid_desc_m, - const InElementwiseOperation& in_elementwise_op, - const AccElementwiseOperation& acc_elementwise_op, - AccDataType alpha, - const InDataType* const __restrict__ p_in_global, - AccDataType beta, - OutDataType* const __restrict__ p_out_global, - IndexDataType* const __restrict__ p_indices_global) 
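The replacement signature below threads index buffers through both ends of the kernel. The reason `RunWithIndex` grows a `HaveIndexInput` path at all: when a reduction runs as the second call of a two-pass scheme, it must propagate the global k recorded by the first pass instead of re-synthesizing local offsets. A small argmax sketch of that two-pass index flow (plain C++, made-up data):

```cpp
// Why the second pass must load indices rather than synthesize them: pass 1
// records the winning global k per chunk, pass 2 carries those k values through.
#include <cstdio>

int main()
{
    const float in[8] = {1, 9, 4, 2, 7, 3, 8, 5}; // hypothetical input row
    float chunk_val[2];
    int chunk_idx[2];

    for(int c = 0; c < 2; ++c) // pass 1: per-chunk argmax over 4 elements
    {
        chunk_val[c] = in[4 * c];
        chunk_idx[c] = 4 * c;
        for(int k = 1; k < 4; ++k)
            if(in[4 * c + k] > chunk_val[c])
            {
                chunk_val[c] = in[4 * c + k];
                chunk_idx[c] = 4 * c + k;
            }
    }

    // pass 2 (HaveIndexInput == true): reduce the winners, keep their original k
    const int best = (chunk_val[1] > chunk_val[0]) ? 1 : 0;
    std::printf("max %.0f at k = %d\n", chunk_val[best], chunk_idx[best]); // max 9 at k = 1
    return 0;
}
```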
+ template + __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + const IndexDataType* const __restrict__ p_in_index_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global, + IndexDataType* const __restrict__ p_out_index_global) { using ThreadwiseReduceWithIndex = ThreadwiseReductionWithIndex( - p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); + const auto in_global_val_buf = + make_dynamic_buffer(p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + type_convert(zeroVal)); + const auto in_global_idx_buf = make_dynamic_buffer( + p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize()); + auto out_global_val_buf = make_dynamic_buffer( - p_out_global, out_grid_desc_m.GetElementSpaceSize()); + p_out_value_global, out_grid_desc_m.GetElementSpaceSize()); auto out_global_idx_buf = make_dynamic_buffer( - p_indices_global, out_grid_desc_m.GetElementSpaceSize()); + p_out_index_global, out_grid_desc_m.GetElementSpaceSize()); StaticBuffer in_thread_val_buf; @@ -313,50 +315,105 @@ struct GridwiseReduction_mk_to_m_threadwise index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( - in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); + auto threadwise_src_val_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize); index_t indexStart = 0; index_t reducedLength = 0; - do + if constexpr(HaveIndexInput) { - threadwise_src_load.Run(in_grid_desc_m_k, - in_global_buf, - thread_buffer_desc, - make_tuple(I0, I0), - in_thread_val_buf); + auto threadwise_src_idx_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - // do element-wise pre-reduction operation - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + do + { + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); - in_thread_idx_buf(Number{}) = indexStart + iK(); + threadwise_src_idx_load.Run(in_grid_desc_m_k, + in_global_idx_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_idx_buf); - in_elementwise_op(in_thread_val_buf(Number{}), - in_thread_val_buf(Number{})); + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + in_elementwise_op(in_thread_val_buf(Number{}), + in_thread_val_buf(Number{})); + }); }); - }); - ThreadwiseReduceWithIndex::Reduce( - in_thread_val_buf, in_thread_idx_buf, accu_value_buf, accu_index_buf); + ThreadwiseReduceWithIndex::Reduce( + in_thread_val_buf, in_thread_idx_buf, accu_value_buf, accu_index_buf); - threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + 
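// advance the index window by the same step as the value window above, keeping
// each loaded (value, index) pair aligned across K tile iterations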
threadwise_src_idx_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); - indexStart += KThreadSliceSize; - reducedLength += KThreadSliceSize; - } while(reducedLength < toReduceLength); + indexStart += KThreadSliceSize; + reducedLength += KThreadSliceSize; + } while(reducedLength < toReduceLength); + } + else + { + do + { + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + in_thread_idx_buf(Number{}) = indexStart + iK(); + + in_elementwise_op(in_thread_val_buf(Number{}), + in_thread_val_buf(Number{})); + }); + }); + + ThreadwiseReduceWithIndex::Reduce( + in_thread_val_buf, in_thread_idx_buf, accu_value_buf, accu_index_buf); + + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + indexStart += KThreadSliceSize; + reducedLength += KThreadSliceSize; + } while(reducedLength < toReduceLength); + }; // for indiced operation, acc_elementwise_op shoud do nothing static_for<0, MThreadSliceSize, 1>{}([&](auto I) { @@ -367,36 +424,32 @@ struct GridwiseReduction_mk_to_m_threadwise constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; - if constexpr(!BetaIsZero) + if(!float_equal_zero{}(beta)) { - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - false>( - out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize)); + auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + false>( + out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize)); - StaticBuffer - priorDstValue_buf; + StaticBuffer + priorDstValue_buf; - threadwise_dst_load.Run(out_grid_desc_m, - out_global_val_buf, - reduced_data_desc, - make_tuple(I0), - priorDstValue_buf); + threadwise_dst_load.Run(out_grid_desc_m, + out_global_val_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValue_buf); - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta; - }); - }; + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta; + }); }; auto threadwise_dst_val_store = @@ -409,7 +462,7 @@ struct GridwiseReduction_mk_to_m_threadwise Sequence<0>, 0, OutDstVectorSize, - InMemoryDataOperationEnum::Set, + OutMemoryDataOperation, 1, false>( out_grid_desc_m, @@ -426,7 +479,7 @@ struct GridwiseReduction_mk_to_m_threadwise Sequence<0>, 0, OutDstVectorSize, - InMemoryDataOperationEnum::Set, + OutMemoryDataOperation, 1, false>( out_grid_desc_m, diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index 0ad78423fe..5e81c6a469 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -325,7 +325,7 @@ struct DynamicBuffer { if(is_valid_element) { - atomic_add(c_style_pointer_cast(&p_data_[i]), x); + atomic_add(c_style_pointer_cast(&p_data_[i]), x); } } } diff --git a/include/ck/utility/generic_memory_space_atomic.hpp b/include/ck/utility/generic_memory_space_atomic.hpp index 712d815f52..1a2dacb5c5 100644 --- a/include/ck/utility/generic_memory_space_atomic.hpp +++ b/include/ck/utility/generic_memory_space_atomic.hpp @@ -28,6 +28,12 @@ __device__ 
float atomic_add<float>(float* p_dst, const float& x) return atomicAdd(p_dst, x); } +template <> +__device__ double atomic_add<double>(double* p_dst, const double& x) +{ + return atomicAdd(p_dst, x); +} + template <> __device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x) { @@ -45,6 +51,23 @@ __device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x) return vy.template AsType<float2_t>()[I0]; } +template <> +__device__ double2_t atomic_add<double2_t>(double2_t* p_dst, const double2_t& x) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + const vector_type<double, 2> vx{x}; + vector_type<double, 2> vy{0}; + + vy.template AsType<double>()(I0) = + atomicAdd(c_style_pointer_cast<double*>(p_dst), vx.template AsType<double>()[I0]); + vy.template AsType<double>()(I1) = + atomicAdd(c_style_pointer_cast<double*>(p_dst) + 1, vx.template AsType<double>()[I1]); + + return vy.template AsType<double2_t>()[I0]; +} + // Caution: DO NOT REMOVE // intentionally have only declaration but no definition to cause compilation failure when trying to // instantiate this template. The purpose is to make the implementation of atomic_max explicit for diff --git a/include/ck/utility/reduction_operator.hpp b/include/ck/utility/reduction_operator.hpp index 5893f60547..e7a8db8c01 100644 --- a/include/ck/utility/reduction_operator.hpp +++ b/include/ck/utility/reduction_operator.hpp @@ -26,7 +26,8 @@ #ifndef CK_REDUCTION_OPERATOR_HPP #define CK_REDUCTION_OPERATOR_HPP -#include "common_header.hpp" +#include "config.hpp" +#include "data_type.hpp" namespace ck { @@ -41,12 +42,10 @@ namespace reduce { // when operated against them, and the concept is similar to zero vector in // vector space // (http://pages.cs.wisc.edu/~matthewb/pages/notes/pdf/linearalgebra/VectorSpaces.pdf). -// 2) indexable -- boolean value indicating whether indices of the operated elements could be -// recorded. Usually, Min/Max operator could -// need to record the indices of elements. For operator like Add/Mul, no need to -// record the indices. -// 3) operator() -- the first argument of the operator must be both an input & output, and the -// corresponding variable usually stores +// 2) IsCompatibleInMemoryDataOperation() -- returns true if the reduction task corresponding to this +// operator can use the InMemoryDataOperation to finalize, or else it returns false +// 3) operator() -- +// the first argument of the operator must be both an input & output, and the corresponding variable +// usually stores // the accumulated result of many operator() calls; the second argument is only an // input.
For indexable binary // operator, the second version of operator() has third argument (which is an @@ -62,6 +61,13 @@ struct Add __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast(0.0f); }; + __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + return operation == InMemoryDataOperationEnum::AtomicAdd || + operation == InMemoryDataOperationEnum::Set; + }; + __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; } }; @@ -72,6 +78,12 @@ struct Mul __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast(1.0f); }; + __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + return operation == InMemoryDataOperationEnum::Set; + }; + __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; } }; @@ -85,6 +97,13 @@ struct Max return NumericLimits::Lowest(); }; + __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + // ToChange: atomic_max to be added + return operation == InMemoryDataOperationEnum::Set; + }; + __host__ __device__ inline constexpr void operator()(T& a, T b) const { if(a < b) @@ -111,6 +130,13 @@ struct Min return NumericLimits::Max(); }; + __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + // ToChange: atomic_min to be added + return operation == InMemoryDataOperationEnum::Set; + }; + __host__ __device__ inline constexpr void operator()(T& a, T b) const { if(a > b) @@ -134,6 +160,13 @@ struct AMax __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast(0.0f); }; + __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + // ToChange: atomic_max to be added + return operation == InMemoryDataOperationEnum::Set; + }; + __host__ __device__ inline constexpr void operator()(T& a, T b) const { if(a < b) @@ -150,6 +183,17 @@ struct AMax } }; +template +T GetReductionZeroValueForInMemoryDataOperation(InMemoryDataOperationEnum operation) +{ + T result = ck::type_convert(0.0f); + + if(operation == InMemoryDataOperationEnum::AtomicMax) + result = ck::NumericLimits::Lowest(); + + return (result); +}; + }; // end of namespace reduce } // end of namespace ck diff --git a/library/include/ck/library/host_tensor/host_common_util.hpp b/library/include/ck/library/host_tensor/host_common_util.hpp new file mode 100644 index 0000000000..8fc1d36430 --- /dev/null +++ b/library/include/ck/library/host_tensor/host_common_util.hpp @@ -0,0 +1,102 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_HOST_COMMON_UTIL_HPP +#define GUARD_HOST_COMMON_UTIL_HPP + +#include +#include +#include +#include + +#include "config.hpp" + +namespace ck { + +namespace host_common { + +template +static inline void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems) +{ + std::ofstream outFile(fileName, std::ios::binary); + if(outFile) + { + outFile.write(reinterpret_cast(data), dataNumItems * sizeof(T)); + outFile.close(); + std::cout << "Write output to file " << fileName << std::endl; + } + else + { + std::cout << "Could not open file " << fileName << " for writing" << std::endl; + } +}; + +template +static inline T getSingleValueFromString(const std::string& valueStr) +{ + std::istringstream iss(valueStr); + + T val; + + iss >> val; + + return (val); +}; + +template +static inline std::vector getTypeValuesFromString(const char* cstr_values) +{ + std::string valuesStr(cstr_values); + + std::vector values; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = valuesStr.find(',', pos); + while(new_pos != std::string::npos) + { + const std::string sliceStr = valuesStr.substr(pos, new_pos - pos); + + T val = getSingleValueFromString(sliceStr); + + values.push_back(val); + + pos = new_pos + 1; + new_pos = valuesStr.find(',', pos); + }; + + std::string sliceStr = valuesStr.substr(pos); + T val = getSingleValueFromString(sliceStr); + + values.push_back(val); + + return (values); +} + +}; // namespace host_common + +}; // namespace ck + +#endif diff --git a/library/include/ck/library/host_tensor/host_reduce_util.hpp b/library/include/ck/library/host_tensor/host_reduce_util.hpp index 53e17bcb5c..095bb03426 100644 --- a/library/include/ck/library/host_tensor/host_reduce_util.hpp +++ b/library/include/ck/library/host_tensor/host_reduce_util.hpp @@ -28,9 +28,7 @@ #include #include -#include -#include -#include +#include #include "reduction_enums.hpp" #include "data_type.hpp" @@ -214,13 +212,13 @@ binop_with_nan_check(std::function opReduce, }; }; -template +template __host__ static inline void -binop_with_nan_check2(std::function opReduce, - AccDataType& accuVal, - AccDataType currVal, - int& accuIndex, - int currIndex) +binop_with_index_and_nan_check(std::function opReduce, + AccDataType& accuVal, + AccDataType currVal, + IndexDataType& accuIndex, + IndexDataType currIndex) { using ck::math::isnan; @@ -254,16 +252,6 @@ binop_with_nan_check2(std::function opRe }; // namespace host_reduce -static inline std::vector to_int_vector(const std::vector& inData) -{ - std::vector outData; - - for(auto elem : inData) - outData.push_back(static_cast(elem)); - - return (outData); -}; - }; // namespace ck #endif diff --git a/library/include/ck/library/host_tensor/host_reduction.hpp b/library/include/ck/library/host_tensor/host_reduction.hpp index b67f794505..1add62d1b5 100644 --- a/library/include/ck/library/host_tensor/host_reduction.hpp +++ b/library/include/ck/library/host_tensor/host_reduction.hpp @@ -34,6 +34,7 @@ 
#include "reduction_enums.hpp" #include "reduction_common.hpp" #include "host_reduce_util.hpp" +#include "host_common_util.hpp" #include "host_tensor.hpp" #include "data_type.hpp" @@ -200,7 +201,7 @@ struct ReductionHost using ck::float_equal_one; using ck::float_equal_zero; using ck::type_convert; - using ck::host_reduce::binop_with_nan_check2; + using ck::host_reduce::binop_with_index_and_nan_check; using ck::host_reduce::ReduceOpFn2; using ck::host_reduce::ReduceOpZeroVal; @@ -211,8 +212,7 @@ struct ReductionHost AccDataType accuVal = ReduceOpZeroVal(); IndexDataType accuIndex = 0; - for(IndexDataType i = 0; i < ck::type_convert(reduce_dim_indexes.size()); - i++) + for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++) { auto offset_reduce = get_offset_from_index(reduceStrides, reduce_dim_indexes[i]); @@ -221,9 +221,9 @@ struct ReductionHost preUnaryOp(currVal); - auto currIndex = i; + auto currIndex = static_cast(i); - binop_with_nan_check2( + binop_with_index_and_nan_check( opReduce2, accuVal, currVal, accuIndex, currIndex); }; @@ -247,9 +247,7 @@ struct ReductionHost auto offset_invariant = get_offset_from_index(invariantStrides, invariant_index); - for(IndexDataType i = 0; - i < ck::type_convert(reduce_dim_indexes.size()); - i++) + for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++) { auto offset_reduce = get_offset_from_index(reduceStrides, reduce_dim_indexes[i]); @@ -259,9 +257,9 @@ struct ReductionHost preUnaryOp(currVal); - auto currIndex = i; + auto currIndex = static_cast(i); - binop_with_nan_check2( + binop_with_index_and_nan_check( opReduce2, accuVal, currVal, accuIndex, currIndex); }; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp index fafbe120b9..6f0dbe75ff 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp @@ -9,26 +9,11 @@ #include "device_reduce_instance_blockwise_i8_i8_i8.hpp" #include "device_reduce_instance_blockwise_i8_i32_i8.hpp" #include "device_reduce_instance_blockwise_b16_f32_b16.hpp" -#include "device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp" -#include "device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp" -#include "device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp" -#include "device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp" -#include "device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp" -#include "device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp" -#include "device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp" -#include "device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp" #include "device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp" #include "device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp" #include "device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp" +#include "device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp" #include "device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp" -#include "device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp" -#include "device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp" -#include "device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp" -#include "device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp" -#include 
"device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp" -#include "device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp" -#include "device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp" -#include "device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp" #include "device_reduce_instance_threadwise_f16_f16_f16.hpp" #include "device_reduce_instance_threadwise_f16_f32_f16.hpp" #include "device_reduce_instance_threadwise_f32_f32_f32.hpp" diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp index e4b06cf96d..e31d4e769e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp @@ -3,13 +3,27 @@ #include "reduction_operator_mapping.hpp" #include "device_reduce_instance_impl_common.hpp" -#include "device_reduce_blockwise.hpp" +#include "device_reduce_multiblock.hpp" namespace ck { namespace tensor_operation { namespace device { namespace device_reduce_instance { +using reduce_configuration_1_instances_blockwise = std::tuple< + // clang-format off + // BlockSize | MThreadClusterSize | KThreadClusterSize + ReductionConfiguration_1<256, 128, 2>, + ReductionConfiguration_1<256, 64, 4>, + ReductionConfiguration_1<256, 32, 8>, + ReductionConfiguration_1<256, 16, 16>, + ReductionConfiguration_1<256, 8, 32>, + ReductionConfiguration_1<256, 4, 64>, + ReductionConfiguration_1<256, 2, 128>, + ReductionConfiguration_1<256, 1, 256> + // clang-format on + >; + #ifdef QUICK_REDUCE_TEST using reduce_configuration_2_instances_blockwise = std::tuple< // clang-format off @@ -58,8 +72,8 @@ template + bool PropagateNan, + bool UseIndex> void add_device_reduce_instance_blockwise( std::vector>& device_op_instances) { @@ -73,92 +87,94 @@ void add_device_reduce_instance_blockwise( constexpr bool Indexable = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); + constexpr bool OutputIndex = Indexable && UseIndex; - constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
index e4b06cf96d..e31d4e769e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
@@ -3,13 +3,27 @@
 #include "reduction_operator_mapping.hpp"
 #include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_blockwise.hpp"
+#include "device_reduce_multiblock.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace device_reduce_instance {
 
+using reduce_configuration_1_instances_blockwise = std::tuple<
+    // clang-format off
+    // BlockSize | MThreadClusterSize | KThreadClusterSize
+    ReductionConfiguration_1<256, 128, 2>,
+    ReductionConfiguration_1<256, 64, 4>,
+    ReductionConfiguration_1<256, 32, 8>,
+    ReductionConfiguration_1<256, 16, 16>,
+    ReductionConfiguration_1<256, 8, 32>,
+    ReductionConfiguration_1<256, 4, 64>,
+    ReductionConfiguration_1<256, 2, 128>,
+    ReductionConfiguration_1<256, 1, 256>
+    // clang-format on
+    >;
+
 #ifdef QUICK_REDUCE_TEST
 using reduce_configuration_2_instances_blockwise = std::tuple<
     // clang-format off
@@ -58,8 +72,8 @@ template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
           ReduceTensorOp ReduceOpId,
           index_t Rank,
           index_t NumReduceDim,
-          NanPropagation NanOpt,
-          ReduceTensorIndices IndicesOpt>
+          bool PropagateNan,
+          bool UseIndex>
 void add_device_reduce_instance_blockwise(
     std::vector<deviceReducePtrType<AccDataType, ReduceOpId>>& device_op_instances)
 {
@@ -73,92 +87,94 @@ void add_device_reduce_instance_blockwise(
     constexpr bool Indexable =
         (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
          ReduceOpId == ReduceTensorOp::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool OutputIndex = Indexable && UseIndex;
 
-    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
+    static_for<0, std::tuple_size<reduce_configuration_1_instances_blockwise>::value, 1>{}(
+        [&](auto i) {
+            using cfg1 = remove_cvref_t<decltype(
+                std::get<i.value>(reduce_configuration_1_instances_blockwise{}))>;
 
-    static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
-        using cfg1 =
-            remove_cvref_t<decltype(std::get<i.value>(reduce_configuration_1_instances{}))>;
+            static_for<0, std::tuple_size<reduce_configuration_2_instances_blockwise>::value, 1>{}(
+                [&](auto j) {
+                    using cfg2 = remove_cvref_t<decltype(
+                        std::get<j.value>(reduce_configuration_2_instances_blockwise{}))>;
 
-        static_for<0, std::tuple_size<reduce_configuration_2_instances_blockwise>::value, 1>{}(
-            [&](auto j) {
-                using cfg2 = remove_cvref_t<decltype(
-                    std::get<j.value>(reduce_configuration_2_instances_blockwise{}))>;
+                    using ReduceOpInstance =
+                        DeviceReduceMultiBlock<InDataType,
+                                               AccDataType,
+                                               OutDataType,
+                                               Rank,
+                                               NumReduceDim,
+                                               ReduceOperation,
+                                               InElementwiseOperation,
+                                               AccElementwiseOperation,
+                                               InMemoryDataOperationEnum::Set,
+                                               PropagateNan,
+                                               OutputIndex,
+                                               false, // HaveIndexInputIfOutputIndex
+                                               cfg1::BlockSize_,
+                                               cfg1::MThreadClusterSize_,
+                                               cfg1::KThreadClusterSize_,
+                                               cfg2::MThreadSliceSize_,
+                                               cfg2::KThreadSliceSize_,
+                                               cfg2::InSrcVectorDim_,
+                                               cfg2::InSrcVectorSize_,
+                                               cfg2::OutDstVectorSize_>;
 
-                using ReduceOpInstance = DeviceReduceBlockWise<InDataType,
-                                                               AccDataType,
-                                                               OutDataType,
-                                                               Rank,
-                                                               NumReduceDim,
-                                                               ReduceOperation,
-                                                               InElementwiseOperation,
-                                                               AccElementwiseOperation,
-                                                               PropagateNan,
-                                                               NeedIndices,
-                                                               cfg1::BlockSize_,
-                                                               cfg1::MThreadClusterSize_,
-                                                               cfg1::KThreadClusterSize_,
-                                                               cfg2::MThreadSliceSize_,
-                                                               cfg2::KThreadSliceSize_,
-                                                               cfg2::InSrcVectorDim_,
-                                                               cfg2::InSrcVectorSize_,
-                                                               cfg2::OutDstVectorSize_>;
-
-                device_op_instances.push_back(
-                    std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
-            });
-    });
+                    device_op_instances.push_back(
+                        std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
+                });
+        });
 };
 
-#define ADD_BLOCKWISE_INST_BY_TYPE( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    template void add_device_reduce_instance_blockwise< \
-        inT, compT, outT, ReduceOpId, Rank, NumReduceDim, NanOpt, IndicesOpt>( \
+#define ADD_BLOCKWISE_INST_BY_TYPE( \
+    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
+    template void add_device_reduce_instance_blockwise< \
+        inT, compT, outT, ReduceOpId, Rank, NumReduceDim, PropagateNan, UseIndex>( \
         std::vector<deviceReducePtrType<compT, ReduceOpId>> & device_op_instances)
 
-#define ADD_BLOCKWISE_INST_BY_ID( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    ADD_BLOCKWISE_INST_BY_TYPE(inT, \
-                               compT, \
-                               outT, \
-                               static_cast<ReduceTensorOp>(ReduceOpId), \
-                               static_cast<NanPropagation>(NanOpt), \
-                               static_cast<ReduceTensorIndices>(IndicesOpt), \
-                               Rank, \
+#define ADD_BLOCKWISE_INST_BY_ID( \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    ADD_BLOCKWISE_INST_BY_TYPE(inT, \
+                               compT, \
+                               outT, \
+                               static_cast<ReduceTensorOp>(ReduceOpId), \
+                               static_cast<bool>(NanOpt), \
+                               static_cast<bool>(IndicesOpt), \
+                               Rank, \
                                NumReduceDim)
 
 #define ADD_BLOCKWISE_INST_REF_BY_TYPE( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
     extern template void add_device_reduce_instance_blockwise< \
-        inT, compT, outT, ReduceOpId, Rank, NumReduceDim, NanOpt, IndicesOpt>( \
+        inT, compT, outT, ReduceOpId, Rank, NumReduceDim, PropagateNan, UseIndex>( \
         std::vector<DeviceReducePtr< \
            typename reduce_unary_operator<compT, ReduceOpId, true, true>:: \
                InElementwiseOperation, \
            typename reduce_unary_operator<compT, ReduceOpId, true, true>:: \
                AccElementwiseOperation>> & \
            device_op_instances)
 
-#define ADD_BLOCKWISE_INST_REF_BY_ID( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \
-                                   compT, \
-                                   outT, \
-                                   static_cast<ReduceTensorOp>(ReduceOpId), \
-                                   static_cast<NanPropagation>(NanOpt), \
-                                   static_cast<ReduceTensorIndices>(IndicesOpt), \
-                                   Rank, \
+#define ADD_BLOCKWISE_INST_REF_BY_ID( \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \
+                                   compT, \
+                                   outT, \
+                                   static_cast<ReduceTensorOp>(ReduceOpId), \
+                                   static_cast<bool>(NanOpt), \
+                                   static_cast<bool>(IndicesOpt), \
+                                   Rank, \
                                    NumReduceDim)
 
 } // namespace device_reduce_instance
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp
index 0ae3289a0d..3cad45f2e5 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp
@@ -1,8 +1,7 @@
 #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
 #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
 
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
+#include "data_type.hpp"
 #include
"device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp index e7bdb15d92..441c1aec3f 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp @@ -1,8 +1,7 @@ #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" +#include "data_type.hpp" #include "device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp index dad0d86350..ca8532a458 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp @@ -1,8 +1,7 @@ #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" +#include "data_type.hpp" #include "device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp index 34ec15db2b..64f504c9da 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp index b08f35ad09..9e84ee34fb 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp index 65cdd45340..a37e3bdeb9 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp index f4a6677b3e..1d8695bbb0 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp index 7f67138e6b..b5c19b7207 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP #define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_blockwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp deleted file mode 100644 index 8e47bbfb6a..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp +++ /dev/null @@ -1,165 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_HPP - -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_impl_common.hpp" -#include "device_reduce_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -#ifdef QUICK_REDUCE_TEST -using reduce_configuration_2_instances_blockwise_second_call = std::tuple< - // clang-format off - // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize - ReductionConfiguration_2<1, 2, 1, 1, 2>, - ReductionConfiguration_2<1, 1, 1, 1, 3> - // clang-format on - >; -#else -using reduce_configuration_2_instances_blockwise_second_call = std::tuple< - // clang-format off - // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize - ReductionConfiguration_2<1, 4, 1, 1, 8>, - ReductionConfiguration_2<1, 4, 1, 1, 4>, - ReductionConfiguration_2<1, 2, 1, 1, 2>, - - ReductionConfiguration_2<1, 1, 1, 1, 
3>, - ReductionConfiguration_2<1, 1, 1, 1, 5>, - ReductionConfiguration_2<1, 1, 1, 1, 7>, - ReductionConfiguration_2<1, 1, 1, 1, 11> - // clang-format on - >; -#endif - -template -using deviceReduceBlockWiseSecondCallPtrType = DeviceReducePtr< - typename reduce_unary_operator::InElementwiseOperation, - typename reduce_unary_operator::AccElementwiseOperation>; - -template -void add_device_reduce_instance_blockwise_second_call( - std::vector>& - device_op_instances) -{ - using ReduceOperation = typename reduce_binary_operator::opType; - using InElementwiseOperation = - typename reduce_unary_operator:: - InElementwiseOperation; - using AccElementwiseOperation = - typename reduce_unary_operator:: - AccElementwiseOperation; - - constexpr bool Indexable = - (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || - ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); - - constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true; - - static_assert(std::is_same::value, - "InDataType and AccDataType should be the same to use " - "add_device_reduce_instance_blockwise_second_call!"); - - static_for<0, std::tuple_size::value, 1>{}([&](auto i) { - using cfg1 = - remove_cvref_t(reduce_configuration_1_instances{}))>; - - static_for<0, - std::tuple_size::value, - 1>{}([&](auto j) { - using cfg2 = remove_cvref_t(reduce_configuration_2_instances_blockwise_second_call{}))>; - - using ReduceOpInstance = DeviceReduceBlockWiseSecondCall; - - device_op_instances.push_back(std::make_unique(ReduceOpInstance{})); - }); - }); -}; - -#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - template void add_device_reduce_instance_blockwise_second_call( \ - std::vector> & \ - device_op_instances) - -#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - -#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - extern template void add_device_reduce_instance_blockwise_second_call( \ - std::vector< \ - DeviceReducePtr:: \ - InElementwiseOperation, \ - typename reduce_unary_operator:: \ - AccElementwiseOperation>> & \ - device_op_instances) - -#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp deleted file mode 100644 index 4ce19c7d0c..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F16_F16_F16_HPP 
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F16_F16_F16_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp deleted file mode 100644 index c85419befc..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_B16_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_B16_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | 
ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp deleted file mode 100644 index d42e7e020f..0000000000 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F16_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F16_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp deleted file mode 100644 index fcf244d1d3..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F32_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F32_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 
-ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp deleted file mode 100644 index 72e806ee60..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F32_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F32_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 4); 
-ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp deleted file mode 100644 index 476c3a7d8f..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F64_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F64_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4); 
-ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp deleted file mode 100644 index d46780483b..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I32_I32_I8_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I32_I32_I8_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp deleted file mode 100644 index 7b020fb439..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I8_I8_I8_HPP -#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I8_I8_I8_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp index b25645034c..721d98a718 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp @@ -30,20 +30,6 @@ struct ReductionConfiguration_2 static constexpr int KThreadSliceSize_ = KThreadSliceSize; }; -using reduce_configuration_1_instances = std::tuple< - // clang-format off - // BlockSize | MThreadClusterSize | KThreadClusterSize - ReductionConfiguration_1<256, 128, 2>, - ReductionConfiguration_1<256, 64, 4>, - ReductionConfiguration_1<256, 32, 8>, - ReductionConfiguration_1<256, 16, 16>, - ReductionConfiguration_1<256, 8, 32>, - ReductionConfiguration_1<256, 4, 64>, - ReductionConfiguration_1<256, 2, 128>, - ReductionConfiguration_1<256, 1, 256> - // clang-format on - >; - #define QUICK_REDUCE_TEST 1 } // namespace device_reduce_instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp index bf10080b5e..605109d077 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp @@ -3,13 +3,27 @@ #include "reduction_operator_mapping.hpp" #include "device_reduce_instance_impl_common.hpp" -#include "device_reduce_multiblock_atomic_add.hpp" +#include "device_reduce_multiblock.hpp" namespace ck { namespace tensor_operation { namespace device { namespace device_reduce_instance { +using reduce_configuration_1_instances_multiblock_atomic_add = std::tuple< + // clang-format off + // BlockSize | MThreadClusterSize | KThreadClusterSize + ReductionConfiguration_1<256, 128, 2>, + ReductionConfiguration_1<256, 64, 4>, + ReductionConfiguration_1<256, 32, 8>, + ReductionConfiguration_1<256, 16, 16>, + ReductionConfiguration_1<256, 8, 32>, + ReductionConfiguration_1<256, 4, 64>, + ReductionConfiguration_1<256, 2, 128>, + ReductionConfiguration_1<256, 1, 256> + // clang-format on + >; + #ifdef QUICK_REDUCE_TEST using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple< // clang-format off @@ -60,8 +74,8 @@ template + bool PropagateNan, + bool UseIndex> void add_device_reduce_instance_multiblock_atomic_add( std::vector>& device_op_instances) @@ -76,12 +90,10 @@ void add_device_reduce_instance_multiblock_atomic_add( constexpr bool Indexable = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); + constexpr bool OutputIndex = Indexable && UseIndex; - constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? 
false : true; - - static_assert(IndicesOpt == ReduceTensorIndices::NO_INDICES, - "AtomicAdd can only be used with reduction operations without indices!"); + static_assert(UseIndex == false, + "AtomicAdd can only be used with reduction operations using no index!"); constexpr bool op_acceptable = (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::MUL || @@ -94,9 +106,11 @@ void add_device_reduce_instance_multiblock_atomic_add( return; else { - static_for<0, std::tuple_size::value, 1>{}([&](auto i) { - using cfg1 = - remove_cvref_t(reduce_configuration_1_instances{}))>; + static_for<0, + std::tuple_size::value, + 1>{}([&](auto i) { + using cfg1 = remove_cvref_t(reduce_configuration_1_instances_multiblock_atomic_add{}))>; static_for< 0, @@ -105,24 +119,27 @@ void add_device_reduce_instance_multiblock_atomic_add( using cfg2 = remove_cvref_t(reduce_configuration_2_instances_multiblock_atomic_add{}))>; - using ReduceOpInstance = DeviceReduceMultiBlockAtomicAdd; + using ReduceOpInstance = + DeviceReduceMultiBlock; device_op_instances.push_back( std::make_unique(ReduceOpInstance{})); @@ -132,54 +149,54 @@ void add_device_reduce_instance_multiblock_atomic_add( }; #define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ template void add_device_reduce_instance_multiblock_atomic_add( \ + PropagateNan, \ + UseIndex>( \ std::vector> & \ device_op_instances) -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) #define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ extern template void add_device_reduce_instance_multiblock_atomic_add( \ + PropagateNan, \ + UseIndex>( \ std::vector::InElementwiseOperation, \ typename reduce_unary_operator:: \ AccElementwiseOperation>> & \ device_op_instances) -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) } // namespace device_reduce_instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp index 58f90bb94f..4e39cf49f6 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
@@ -1,8 +1,7 @@
 #ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
 #define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
 
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
+#include "data_type.hpp"
 #include "device_reduce_instance_multiblock_atomic_add.hpp"
 
 namespace ck {
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
index f4c766ca03..73424322ae 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
@@ -1,8 +1,7 @@
 #ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
 #define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
 
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
+#include "data_type.hpp"
 #include "device_reduce_instance_multiblock_atomic_add.hpp"
 
 namespace ck {
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
index c2f2564fc9..ecc9c4ea87 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
@@ -1,8 +1,6 @@
 #ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
 #define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
 
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
 #include "device_reduce_instance_multiblock_atomic_add.hpp"
 
 namespace ck {
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
index 830dcf9407..41a60d5b70 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
@@ -1,8 +1,6 @@
 #ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
 #define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
 
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
 #include "device_reduce_instance_multiblock_atomic_add.hpp"
 
 namespace ck {
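The new f64_f64_f64 instance header below registers fp64 multiblock atomic-add reductions; only ADD and AVG are instantiated there, since the atomic-add finalization is restricted to index-free accumulation operations (see the static_assert and op_acceptable check in the factory above). As a host-side mental model of what an atomic add on double must guarantee, here is a compare-and-swap loop equivalent in plain C++ (illustrative only; the actual device path lives in generic_memory_space_atomic.hpp):

```cpp
#include <atomic>

// CAS-loop equivalent of an atomic add on double: read the current value,
// try to install old + value, and retry if another thread won the race.
inline void atomic_add_double(std::atomic<double>& target, double value)
{
    double expected = target.load(std::memory_order_relaxed);
    while(!target.compare_exchange_weak(
        expected, expected + value, std::memory_order_relaxed))
    {
        // compare_exchange_weak refreshes 'expected' on failure; just retry.
    }
}

int main()
{
    std::atomic<double> acc{0.0};
    atomic_add_double(acc, 2.5);
    atomic_add_double(acc, 0.5);
    return acc.load() == 3.0 ? 0 : 1;
}
```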
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
new file mode 100644
index 0000000000..bdcca274d7
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
@@ -0,0 +1,29 @@
+#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
+#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
+
+#include "device_reduce_instance_multiblock_atomic_add.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp
deleted file mode 100644
index 5c323ec175..0000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp
+++ /dev/null
@@ -1,174 +0,0 @@
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
-
-#include "reduction_operator_mapping.hpp"
-#include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_multiblock_partial_reduce.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_reduce_instance {
-
-#ifdef QUICK_REDUCE_TEST
-using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
-    // clang-format off
-    // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
-    ReductionConfiguration_2<0, 1, 1, 2, 1>,
-    ReductionConfiguration_2<1, 2, 1, 1, 2>,
-    ReductionConfiguration_2<0, 1, 1, 3, 1>,
-    ReductionConfiguration_2<1, 1, 1, 1, 3>
-    // clang-format on
-    >;
-#else
-using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
-    // clang-format off
-    // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
-    ReductionConfiguration_2<0, 4, 1, 8, 1>,
-    ReductionConfiguration_2<0, 4, 1, 4, 1>,
-    ReductionConfiguration_2<0, 2, 1, 2, 1>,
-
-    ReductionConfiguration_2<1, 4, 1, 1, 8>,
-    ReductionConfiguration_2<1, 4, 1, 1, 4>,
-    ReductionConfiguration_2<1, 2, 1, 1, 2>,
-
-    // special instances
-    ReductionConfiguration_2<0, 1, 1, 3, 1>,
-    ReductionConfiguration_2<0, 1, 1, 5, 1>,
-    ReductionConfiguration_2<0, 1, 1, 7, 1>,
-    ReductionConfiguration_2<0, 1, 1, 11, 1>,
-
-    ReductionConfiguration_2<0, 1, 1, 1, 3>,
-    ReductionConfiguration_2<0, 1, 1, 1, 5>,
-    ReductionConfiguration_2<0, 1, 1, 1, 7>,
-    ReductionConfiguration_2<0, 1, 1, 1, 11>
-    // clang-format on
-    >;
-#endif
-
-template
-using
deviceReduceMultiBlockPartialReducePtrType = DeviceReducePtr< - typename reduce_unary_operator::InElementwiseOperation, - typename reduce_unary_operator::AccElementwiseOperation>; - -template -void add_device_reduce_instance_multiblock_partial_reduce( - std::vector>& - device_op_instances) -{ - using ReduceOperation = typename reduce_binary_operator::opType; - using InElementwiseOperation = - typename reduce_unary_operator:: - InElementwiseOperation; - using AccElementwiseOperation = - typename reduce_unary_operator:: - AccElementwiseOperation; - - constexpr bool Indexable = - (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || - ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); - - constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true; - - static_for<0, std::tuple_size::value, 1>{}([&](auto i) { - using cfg1 = - remove_cvref_t(reduce_configuration_1_instances{}))>; - - static_for< - 0, - std::tuple_size::value, - 1>{}([&](auto j) { - using cfg2 = remove_cvref_t(reduce_configuration_2_instances_multiblock_partial_reduce{}))>; - - using ReduceOpInstance = DeviceReduceMultiBlockPartialReduce; - - device_op_instances.push_back(std::make_unique(ReduceOpInstance{})); - }); - }); -}; - -#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - template void add_device_reduce_instance_multiblock_partial_reduce( \ - std::vector> & \ - device_op_instances) - -#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - -#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - extern template void add_device_reduce_instance_multiblock_partial_reduce( \ - std::vector< \ - DeviceReducePtr:: \ - InElementwiseOperation, \ - typename reduce_unary_operator:: \ - AccElementwiseOperation>> & \ - device_op_instances) - -#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp deleted file mode 100644 index d25645ad1e..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_B16_F32_B16_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_B16_F32_B16_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace 
device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - 
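// Editorial aid (not part of the patch): every *_INST_REF_BY_ID / *_INST_BY_ID list in
// this patch uses the same positional arguments -- (InDataType, AccDataType, OutDataType,
// ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) -- and the _BY_ID macros simply
// static_cast the three integer ids to ReduceTensorOp, NanPropagation and
// ReduceTensorIndices. From the "// for ..." comments, the op ids in use are
// 0=ADD, 2=MIN, 3=MAX, 4=AMAX, 5=AVG, 7=NORM2; NanOpt 0 is NOT_PROPAGATE_NAN, and
// IndicesOpt 1 requests index output (meaningful only for MIN/MAX/AMAX).
// A minimal decoder for reading these tables; reduce_op_name is a hypothetical helper,
// not a library function:
inline const char* reduce_op_name(int id)
{
    switch(id)
    {
    case 0: return "ADD";
    case 2: return "MIN";
    case 3: return "MAX";
    case 4: return "AMAX";
    case 5: return "AVG";
    case 7: return "NORM2";
    default: return "UNKNOWN";
    }
}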
-#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp deleted file mode 100644 index 05549fc702..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F16_F16_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F16_F16_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp 
b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp deleted file mode 100644 index 3e4aaef51b..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F32_F16_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F32_F16_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp deleted file mode 100644 index 2a1e4e7bf0..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F32_F32_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F32_F32_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX 
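// Editorial sketch (not part of the patch): add_device_reduce_instance_multiblock_partial_reduce,
// deleted above, is a small compile-time factory: static_for walks
// reduce_configuration_1_instances and reduce_configuration_2_instances_multiblock_partial_reduce,
// and each (cfg1, cfg2) pair yields one DeviceReduceMultiBlockPartialReduce instance pushed into
// the caller's vector of type-erased DeviceReducePtr. The runtime analogue of that pattern, with
// hypothetical Base/Config/Instance types standing in for the library ones:
#include <memory>
#include <vector>

struct Base { virtual ~Base() = default; };
struct Config { int block_size; int m_slice; int k_slice; };
struct Instance : Base
{
    Config cfg1, cfg2;
    Instance(Config a, Config b) : cfg1(a), cfg2(b) {}
};

void add_instances(std::vector<std::unique_ptr<Base>>& out)
{
    const Config table1[] = {{256, 1, 1}, {256, 2, 2}};        // stands in for reduce_configuration_1_instances
    const Config table2[] = {{0, 1, 1}, {0, 2, 2}, {0, 4, 4}}; // stands in for reduce_configuration_2_instances_*
    for(const auto& c1 : table1)
        for(const auto& c2 : table2)
            out.push_back(std::make_unique<Instance>(c1, c2)); // one device op per configuration pair
}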
-ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); - -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp deleted file mode 100644 index f95e3001ee..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F64_F32_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F64_F32_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // 
namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp deleted file mode 100644 index fac65128b6..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F64_F64_F64_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F64_F64_F64_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); - -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, 
double, double, 7, 0, 0, 2, 1); - -// Will be moved to use MultiBlockAtomicAdd -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp deleted file mode 100644 index 895c144c66..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I32_I8_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I32_I8_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp deleted file mode 100644 index d6bee57fcd..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I8_I8_HPP -#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I8_I8_HPP - -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" -#include 
"device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck - -#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp index f3a0781c2b..a2b4ae22be 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp @@ -58,8 +58,8 @@ template + bool PropagateNan, + bool UseIndex> void add_device_reduce_instance_threadwise( std::vector>& device_op_instances) { @@ -73,9 +73,7 @@ void add_device_reduce_instance_threadwise( constexpr bool Indexable = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); - - constexpr bool PropagateNan = (NanOpt == 
NanPropagation::NOT_PROPAGATE_NAN) ? false : true; + constexpr bool OutputIndex = Indexable && UseIndex; using cfg1 = ReductionConfiguration_1<256, 256, 1>; @@ -93,10 +91,9 @@ void add_device_reduce_instance_threadwise( InElementwiseOperation, AccElementwiseOperation, PropagateNan, - NeedIndices, + OutputIndex, + false, // HaveIndexInputIfOutputIndex cfg1::BlockSize_, - cfg1::MThreadClusterSize_, - cfg1::KThreadClusterSize_, cfg2::MThreadSliceSize_, cfg2::KThreadSliceSize_, cfg2::InSrcVectorDim_, @@ -107,54 +104,54 @@ void add_device_reduce_instance_threadwise( }); }; -#define ADD_THREADWISE_INST_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - template void add_device_reduce_instance_threadwise( \ +#define ADD_THREADWISE_INST_BY_TYPE( \ + inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ + template void add_device_reduce_instance_threadwise( \ std::vector> & device_op_instances) -#define ADD_THREADWISE_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_THREADWISE_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_THREADWISE_INST_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_THREADWISE_INST_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) #define ADD_THREADWISE_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ extern template void add_device_reduce_instance_threadwise( \ + PropagateNan, \ + UseIndex>( \ std::vector::InElementwiseOperation, \ typename reduce_unary_operator:: \ AccElementwiseOperation>> & \ device_op_instances) -#define ADD_THREADWISE_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_THREADWISE_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_THREADWISE_INST_REF_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_THREADWISE_INST_REF_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) } // namespace device_reduce_instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp index f11d9118c9..0291f33214 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp @@ -1,8 +1,7 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" +#include "data_type.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp 
b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp index fe220335c5..7ab1bebc5f 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp @@ -1,8 +1,7 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" +#include "data_type.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp index 970559cfac..39c3d10660 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp @@ -1,8 +1,7 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" +#include "data_type.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp index 66c33a72a4..3c47bfd189 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp index 196f142dbf..9df9f6f1fa 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp index 4f3e1448d0..00ab218f20 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp index 8f19a5d0a2..de7445b043 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp index 83bd48cd3f..1ea1ee745e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp @@ -1,8 +1,6 @@ #ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP #define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP -#include "reduction_enums.hpp" -#include "reduction_operator_mapping.hpp" #include "device_reduce_instance_threadwise.hpp" namespace ck { diff --git a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt index 81987ac0d4..d566796c13 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt @@ -16,26 +16,11 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE device_reduce_instance_threadwise_i8_i32_i8.cpp; device_reduce_instance_threadwise_i8_i8_i8.cpp; device_reduce_instance_threadwise_b16_f32_b16.cpp; - device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp; - device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp; - device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp; - device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp; - device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp; - device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp; - device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp; - device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp; device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp; device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp; device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp; + device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp; device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp; - device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp; - device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp; - device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp; - device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp; - 
device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp; - device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp; - device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp; - device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp; ) add_library(device_reduce_instance OBJECT ${DEVICE_REDUCE_INSTANCE_SOURCE}) diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp deleted file mode 100644 index 82a9c11413..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp deleted file mode 100644 index 6b8139c32c..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp 
+++ /dev/null @@ -1,53 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp 
b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp deleted file mode 100644 index 267b9d4d9d..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp deleted file mode 100644 index 0036a89542..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp +++ /dev/null @@ -1,52 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4); 
-ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp deleted file mode 100644 index 0512fa4158..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 2, 1); -// clang-format on - 
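// Editorial sketch (not part of the patch): the blockwise_second_call instances removed in
// this patch take the accumulator type of a preceding pass as their input type (float/float
// -> bhalf_t, double/double -> float, int32/int32 -> int8), i.e. they existed to finish, in a
// second kernel launch, a reduction whose first launch had written per-block partial results
// to a workspace. With the multiblock partial-reduce path deleted alongside them in
// CMakeLists.txt, the two-launch driver pattern below loses its last user. All names here are
// hypothetical stand-ins, not library types:
#include <cstddef>

struct PartialReduceOp
{
    // first launch: one partial accumulator per block, written to the workspace
    void Run(const float* in, float* workspace, std::size_t n) { (void)in; (void)workspace; (void)n; }
};

struct SecondCallOp
{
    // second launch: reduce the per-block partials down to the final output
    void Run(const float* workspace, float* out, std::size_t num_blocks) { (void)workspace; (void)out; (void)num_blocks; }
};

void run_two_stage(PartialReduceOp& first, SecondCallOp& second,
                   const float* in, float* workspace, float* out,
                   std::size_t n, std::size_t num_blocks)
{
    first.Run(in, workspace, n);
    second.Run(workspace, out, num_blocks);
}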
-} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp deleted file mode 100644 index afe7f0752e..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp +++ /dev/null @@ -1,52 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX 
-ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp deleted file mode 100644 index 9cb3b8684f..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp deleted file mode 100644 index 8783a75486..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include "device_reduce_instance_blockwise_second_call.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); 
-ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp new file mode 100644 index 0000000000..497f2695be --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp @@ -0,0 +1,24 @@ +#include "device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp deleted file mode 100644 index d740fcfa8f..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp +++ /dev/null @@ -1,53 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD 
-ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp deleted file mode 100644 index f57ed5ad86..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp 
+++ /dev/null @@ -1,40 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp deleted file mode 100644 index 724b364104..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, 
float, half_t, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp deleted file mode 100644 index 15028a0b4c..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); - 
-ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp deleted file mode 100644 index ec0ba3cf8e..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp deleted file mode 100644 index 9ff2dcd93b..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp +++ /dev/null @@ -1,55 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN 
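Note that the MIN/MAX/AMAX groups in these lists appear twice, distinguished only by the IndicesOpt flag (0 vs 1): the flagged variants also return the index of the winning element. As a reference for what the index-producing variant computes per output element, here is a small host-side sketch; it is illustrative only, not the kernel or host-library code, and the names are hypothetical.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Host-side reference for an index-producing MAX reduction over one slice,
    // where 'slice' stands for the gathered elements of a single reduced fiber.
    template <typename T>
    void reduce_max_with_index(const std::vector<T>& slice, T& out_val, int32_t& out_idx)
    {
        assert(!slice.empty());
        out_val = slice[0];
        out_idx = 0;
        for(int32_t i = 1; i < static_cast<int32_t>(slice.size()); ++i)
            if(slice[i] > out_val) // AMAX would compare absolute values instead
            {
                out_val = slice[i];
                out_idx = i;
            }
    }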
-ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); - -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); - -// Will be moved to use MultiBlockAtomicAdd -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp deleted file mode 100644 index 0e37c2947f..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 
1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp deleted file mode 100644 index 4634faed06..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include "device_reduce_instance_multiblock_partial_reduce.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_reduce_instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace device_reduce_instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp index 33c7929ddd..a87694754e 100644 --- a/profiler/include/profile_reduce_impl.hpp +++ b/profiler/include/profile_reduce_impl.hpp @@ -5,74 +5,77 @@ #include "device_reduce_instance.hpp" #include "reduction_enums.hpp" #include "host_reduction.hpp" +#include "host_common_util.hpp" +#include "host_tensor_generator.hpp" namespace ck { namespace tensor_operation { namespace device { namespace 
device_reduce_instance {

-template <int Rank, int NumReduceDim, int ReduceOpId, int NanOpt, int IndicesOpt>
+template <int Rank, int NumReduceDim, int ReduceOpId, bool PropagateNan, bool UseIndex>
 struct ReduceDescription
 {
     static constexpr int Rank_         = Rank;
     static constexpr int NumReduceDim_ = NumReduceDim;
     static constexpr int ReduceOpId_   = ReduceOpId;
-    static constexpr int NanOpt_       = NanOpt;
-    static constexpr int IndicesOpt_   = IndicesOpt;
+    static constexpr int PropagateNan_ = PropagateNan;
+    static constexpr int UseIndex_     = UseIndex;
 };

-using reduce_description_instances = std::tuple<ReduceDescription<4, 3, 0, 0, 0>, // for ADD
-    ReduceDescription<4, 4, 0, 0, 0>,
-    ReduceDescription<4, 1, 0, 0, 0>,
-    ReduceDescription<2, 1, 0, 0, 0>,
+using reduce_description_instances =
+    std::tuple<ReduceDescription<4, 3, 0, false, false>, // for ADD
+    ReduceDescription<4, 4, 0, false, false>,
+    ReduceDescription<4, 1, 0, false, false>,
+    ReduceDescription<2, 1, 0, false, false>,

-    ReduceDescription<4, 3, 5, 0, 0>, // for AVG
-    ReduceDescription<4, 4, 5, 0, 0>,
-    ReduceDescription<4, 1, 5, 0, 0>,
-    ReduceDescription<2, 1, 5, 0, 0>,
+    ReduceDescription<4, 3, 5, false, false>, // for AVG
+    ReduceDescription<4, 4, 5, false, false>,
+    ReduceDescription<4, 1, 5, false, false>,
+    ReduceDescription<2, 1, 5, false, false>,

-    ReduceDescription<4, 3, 7, 0, 0>, // for NORM2
-    ReduceDescription<4, 4, 7, 0, 0>,
-    ReduceDescription<4, 1, 7, 0, 0>,
-    ReduceDescription<2, 1, 7, 0, 0>,
+    ReduceDescription<4, 3, 7, false, false>, // for NORM2
+    ReduceDescription<4, 4, 7, false, false>,
+    ReduceDescription<4, 1, 7, false, false>,
+    ReduceDescription<2, 1, 7, false, false>,

-    ReduceDescription<4, 3, 2, 0, 0>, // for MIN
-    ReduceDescription<4, 4, 2, 0, 0>,
-    ReduceDescription<4, 1, 2, 0, 0>,
-    ReduceDescription<2, 1, 2, 0, 0>,
-    ReduceDescription<4, 3, 3, 0, 0>, // for MAX
-    ReduceDescription<4, 4, 3, 0, 0>,
-    ReduceDescription<4, 1, 3, 0, 0>,
-    ReduceDescription<2, 1, 3, 0, 0>,
-    ReduceDescription<4, 3, 4, 0, 0>, // for AMAX
-    ReduceDescription<4, 4, 4, 0, 0>,
-    ReduceDescription<4, 1, 4, 0, 0>,
-    ReduceDescription<2, 1, 4, 0, 0>,
+    ReduceDescription<4, 3, 2, false, false>, // for MIN
+    ReduceDescription<4, 4, 2, false, false>,
+    ReduceDescription<4, 1, 2, false, false>,
+    ReduceDescription<2, 1, 2, false, false>,
+    ReduceDescription<4, 3, 3, false, false>, // for MAX
+    ReduceDescription<4, 4, 3, false, false>,
+    ReduceDescription<4, 1, 3, false, false>,
+    ReduceDescription<2, 1, 3, false, false>,
+    ReduceDescription<4, 3, 4, false, false>, // for AMAX
+    ReduceDescription<4, 4, 4, false, false>,
+    ReduceDescription<4, 1, 4, false, false>,
+    ReduceDescription<2, 1, 4, false, false>,

-    ReduceDescription<4, 3, 2, 0, 1>, // for MIN
-    ReduceDescription<4, 4, 2, 0, 1>,
-    ReduceDescription<4, 1, 2, 0, 1>,
-    ReduceDescription<2, 1, 2, 0, 1>,
-    ReduceDescription<4, 3, 3, 0, 1>, // for MAX
-    ReduceDescription<4, 4, 3, 0, 1>,
-    ReduceDescription<4, 1, 3, 0, 1>,
-    ReduceDescription<2, 1, 3, 0, 1>,
-    ReduceDescription<4, 3, 4, 0, 1>, // for AMAX
-    ReduceDescription<4, 4, 4, 0, 1>,
-    ReduceDescription<4, 1, 4, 0, 1>,
-    ReduceDescription<2, 1, 4, 0, 1>>;
+    ReduceDescription<4, 3, 2, false, true>, // for MIN
+    ReduceDescription<4, 4, 2, false, true>,
+    ReduceDescription<4, 1, 2, false, true>,
+    ReduceDescription<2, 1, 2, false, true>,
+    ReduceDescription<4, 3, 3, false, true>, // for MAX
+    ReduceDescription<4, 4, 3, false, true>,
+    ReduceDescription<4, 1, 3, false, true>,
+    ReduceDescription<2, 1, 3, false, true>,
+    ReduceDescription<4, 3, 4, false, true>, // for AMAX
+    ReduceDescription<4, 4, 4, false, true>,
+    ReduceDescription<4, 1, 4, false, true>,
+    ReduceDescription<2, 1, 4, false, true>>;

 template <typename DescriptionType>
 bool description_match(const DescriptionType&
description, int Rank, const std::vector& reduceDims, ReduceTensorOp ReduceOpId, - NanPropagation NanOpt, - ReduceTensorIndices IndicesOpt) + bool PropagateNan, + bool UseIndex) { if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast(ReduceOpId) || - description.NanOpt_ != static_cast(NanOpt) || - description.IndicesOpt_ != static_cast(IndicesOpt)) + description.PropagateNan_ != static_cast(PropagateNan) || + description.UseIndex_ != static_cast(UseIndex)) return (false); if(DescriptionType::NumReduceDim_ != reduceDims.size()) @@ -116,46 +119,16 @@ static inline std::vector get_invariant_dims(const std::vector& reduce return invariantDims; }; -template -static void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems) -{ - std::ofstream outFile(fileName, std::ios::binary); - if(outFile) - { - outFile.write(reinterpret_cast(data), dataNumItems * sizeof(T)); - outFile.close(); - std::cout << "Write output to file " << fileName << std::endl; - } - else - { - std::cout << "Could not open file " << fileName << " for writing" << std::endl; - } -}; - -// map the data type used by the GPU kernels to the corresponding type used by the host codes -template -struct type_mapping -{ - using OutType = InType; -}; - -template <> -struct type_mapping -{ - using OutType = half_float::half; -}; - template -void profile_reduce_impl_impl(bool do_verification, + bool PropagateNan, + bool UseIndex> +bool profile_reduce_impl_impl(bool do_verification, int init_method, - bool do_log, bool do_dumpout, bool time_kernel, const std::vector& inLengths, @@ -166,15 +139,13 @@ void profile_reduce_impl_impl(bool do_verification, using namespace ck::tensor_operation::device; using namespace ck::tensor_operation::device::device_reduce_instance; using namespace ck::host_reduce; + using ck::host_common::dumpBufferToFile; constexpr bool op_support_indices = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool NeedIndices = - (op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES)); - - constexpr bool PropagateNan = (NanOpt == NanPropagation::PROPAGATE_NAN); + constexpr bool OutputIndex = (op_support_indices && UseIndex); constexpr bool out_support_atomic_add = std::is_same::value; constexpr bool op_support_atomic_add = @@ -195,8 +166,7 @@ void profile_reduce_impl_impl(bool do_verification, (op_support_indices && !std::is_same::value); // 1) The indices can only be used when the reduction operation is indexable - constexpr bool invalid_reduce_3 = - (!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES); + constexpr bool invalid_reduce_3 = (!op_support_indices && UseIndex); // 1) If InDataType is int8_t, must use int8_t as AccDataType for indexable reduction operations // 2) If InDataType is int8_t, must use int32_t as AccDataType for non-indexable reduction @@ -219,6 +189,8 @@ void profile_reduce_impl_impl(bool do_verification, constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 || invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6); + bool pass = true; + if constexpr(!invalid_reduce) { Tensor in(inLengths); @@ -282,7 +254,7 @@ void profile_reduce_impl_impl(bool do_verification, if(beta != 0.0f) out_dev.ToDevice(out.mData.data()); - size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0; + size_t indicesSizeInBytes = OutputIndex ? 
out.mDesc.GetElementSize() * sizeof(int) : 0; DeviceMem out_indices_dev(indicesSizeInBytes); @@ -295,29 +267,11 @@ void profile_reduce_impl_impl(bool do_verification, using AccElementwiseOperation_0 = typename reduce_unary_operator:: AccElementwiseOperation; - using InElementwiseOperation_1 = - typename reduce_unary_operator:: - InElementwiseOperation; - using AccElementwiseOperation_1 = - typename reduce_unary_operator:: - AccElementwiseOperation; - using InElementwiseOperation_2 = - typename reduce_unary_operator:: - InElementwiseOperation; - using AccElementwiseOperation_2 = - typename reduce_unary_operator:: - AccElementwiseOperation; using DeviceReduceInstPtr0 = DeviceReducePtr; - using DeviceReduceInstPtr1 = - DeviceReducePtr; - using DeviceReduceInstPtr2 = - DeviceReducePtr; std::vector reduce0_ptrs; - std::vector reduce1_ptrs; - std::vector reduce2_ptrs; add_device_reduce_instance_threadwise(reduce0_ptrs); + PropagateNan, + UseIndex>(reduce0_ptrs); add_device_reduce_instance_blockwise(reduce0_ptrs); + PropagateNan, + UseIndex>(reduce0_ptrs); if constexpr(use_atomic_add) { @@ -345,35 +299,11 @@ void profile_reduce_impl_impl(bool do_verification, Rank, NumReduceDim, ReduceOpId, - NanOpt, - IndicesOpt>(reduce0_ptrs); + PropagateNan, + UseIndex>(reduce0_ptrs); } - else - { - add_device_reduce_instance_multiblock_partial_reduce(reduce1_ptrs); - }; - // used for secondary reduction - if constexpr(!use_atomic_add) - { - add_device_reduce_instance_blockwise_second_call(reduce2_ptrs); - }; - - if(reduce0_ptrs.empty() && reduce1_ptrs.empty()) + if(reduce0_ptrs.empty()) { throw std::runtime_error("Wrong! No device REDUCE instance found"); }; @@ -387,23 +317,25 @@ void profile_reduce_impl_impl(bool do_verification, Rank, NumReduceDim, PropagateNan, - NeedIndices> + OutputIndex> hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); hostReduce.Run( alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data()); }; - const auto i_inLengths = to_int_vector(inLengths); - const auto i_inStrides = to_int_vector(inStrides); - const auto i_outLengths = to_int_vector(outLengths); - const auto i_outStrides = to_int_vector(outStrides); + std::vector i_inLengths; + std::vector i_inStrides; + std::vector i_outLengths; + std::vector i_outStrides; + + i_inLengths.assign(inLengths.begin(), inLengths.end()); + i_inStrides.assign(inStrides.begin(), inStrides.end()); + i_outLengths.assign(outLengths.begin(), outLengths.end()); + i_outStrides.assign(outStrides.begin(), outStrides.end()); for(auto& reduce_ptr : reduce0_ptrs) { - auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); - - DeviceMem ws_dev(wsSizeInBytes); InElementwiseOperation_0 in_elementwise_op_0(static_cast(reduce_total_length)); AccElementwiseOperation_0 acc_elementwise_op_0( @@ -417,9 +349,9 @@ void profile_reduce_impl_impl(bool do_verification, alpha, beta, in_dev.GetDeviceBuffer(), + nullptr, out_dev.GetDeviceBuffer(), out_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), in_elementwise_op_0, acc_elementwise_op_0); @@ -439,8 +371,9 @@ void profile_reduce_impl_impl(bool do_verification, float gb_per_sec = num_bytes / 1.E6 / avg_time; - std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name - << std::endl; + if(time_kernel) + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " + << reduce_name << std::endl; if(gb_per_sec > best_gb_per_sec) { @@ -450,22 +383,24 @@ void profile_reduce_impl_impl(bool do_verification, 
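To summarize the control flow after this change: the profiler keeps a single instance list (reduce0_ptrs) and runs the whole reduction in one pass, passing nullptr in the slot where the separately allocated workspace buffer used to go. The shape of the loop, condensed from the hunks above (argument order follows the patch; perf bookkeeping, verification, and dump-out are elided):

    // One-pass invocation shape after this patch (condensed, not verbatim).
    for(auto& reduce_ptr : reduce0_ptrs)
    {
        auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
                                                            i_inStrides,
                                                            i_outLengths,
                                                            i_outStrides,
                                                            reduceDims,
                                                            alpha,
                                                            beta,
                                                            in_dev.GetDeviceBuffer(),
                                                            nullptr, // workspace no longer allocated
                                                            out_dev.GetDeviceBuffer(),
                                                            out_indices_dev.GetDeviceBuffer(),
                                                            in_elementwise_op_0,
                                                            acc_elementwise_op_0);

        if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
            continue; // skip instances that cannot handle this problem shape

        auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
        // ... perf bookkeeping, verification, and dump-out as in the hunk above ...
    }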
if(do_verification) { - out_dev.FromDevice(out.mData.data()); - ck::utils::check_err(out.mData, out_ref.mData); + bool single_pass; - if(NeedIndices) + out_dev.FromDevice(out.mData.data()); + single_pass = ck::utils::check_err(out.mData, out_ref.mData); + + if(OutputIndex) { out_indices_dev.FromDevice(out_indices.mData.data()); - ck::utils::check_err(out_indices.mData, out_indices_ref.mData); - ; + single_pass = single_pass && + ck::utils::check_err(out_indices.mData, out_indices_ref.mData); }; - if(do_log) + if(!single_pass) { - LogRangeAsType(std::cout << "out_host : ", out_ref.mData, ",") - << std::endl; - LogRangeAsType(std::cout << "out_device: ", out.mData, ",") << std::endl; - }; + std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl; + } + + pass = pass && single_pass; }; if(do_dumpout) @@ -474,7 +409,7 @@ void profile_reduce_impl_impl(bool do_verification, dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize()); dumpBufferToFile( "dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize()); - if(NeedIndices) + if(OutputIndex) { dumpBufferToFile("dump_indices.bin", out_indices.mData.data(), @@ -486,158 +421,34 @@ void profile_reduce_impl_impl(bool do_verification, }; }; - for(auto& reduce_ptr : reduce1_ptrs) - { - auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); - - DeviceMem ws_dev(wsSizeInBytes); - - InElementwiseOperation_1 in_elementwise_op_1(static_cast(reduce_total_length)); - AccElementwiseOperation_1 acc_elementwise_op_1( - static_cast(reduce_total_length)); - - auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - out_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), - in_elementwise_op_1, - acc_elementwise_op_1); - - if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) - continue; - - std::string reduce_name = reduce_ptr->GetTypeString(); - - auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); - - float avg_time = - invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); - - std::size_t num_bytes = - invariant_total_length * reduce_total_length * sizeof(InDataType) + - invariant_total_length * sizeof(OutDataType); - - std::vector inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get()); - std::vector inStrides2{inLengths2[1], 1}; - - for(auto& reduce2_ptr : reduce2_ptrs) - { - InElementwiseOperation_2 in_elementwise_op_2( - static_cast(reduce_total_length)); - AccElementwiseOperation_2 acc_elementwise_op_2( - static_cast(reduce_total_length)); - - auto argument2_ptr = - reduce2_ptr->MakeArgumentPointer(inLengths2, - inStrides2, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - ws_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - out_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), - in_elementwise_op_2, - acc_elementwise_op_2); - - if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get())) - continue; - - std::string reduce2_name = reduce2_ptr->GetTypeString(); - - auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer(); - - float avg_time_2 = - invoker2_ptr->Run(argument2_ptr.get(), StreamConfig{nullptr, time_kernel}); - - std::size_t num_bytes_2 = - static_cast(inLengths2[0]) * inLengths2[1] * sizeof(AccDataType); - - float gb_per_sec = (num_bytes + num_bytes_2) / 1.E6 / (avg_time + avg_time_2); - - std::cout << "Perf: " << (avg_time + avg_time_2) << " ms, " << 
gb_per_sec - << " GB/s, " << reduce_name << " => " << reduce2_name << std::endl; - - if(gb_per_sec > best_gb_per_sec) - { - best_avg_time = avg_time + avg_time_2; - best_gb_per_sec = gb_per_sec; - } - - if(do_verification) - { - out_dev.FromDevice(out.mData.data()); - ck::utils::check_err(out.mData, out_ref.mData); - - if(NeedIndices) - { - out_indices_dev.FromDevice(out_indices.mData.data()); - ck::utils::check_err(out_indices.mData, out_indices_ref.mData); - ; - }; - - if(do_log) - { - LogRangeAsType(std::cout << "out_host : ", out_ref.mData, ",") - << std::endl; - LogRangeAsType(std::cout << "out_device: ", out.mData, ",") - << std::endl; - } - } - - if(do_dumpout) - { - dumpBufferToFile("dump_in.bin", in.mData.data(), in.mDesc.GetElementSize()); - dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize()); - dumpBufferToFile( - "dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize()); - if(NeedIndices) - { - dumpBufferToFile("dump_indices.bin", - out_indices.mData.data(), - out_indices.mDesc.GetElementSize()); - dumpBufferToFile("dump_indices_host.bin", - out_indices_ref.mData.data(), - out_indices_ref.mDesc.GetElementSize()); - }; - }; - }; - }; - - std::cout << "Best Perf: " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s" - << std::endl; + if(time_kernel) + std::cout << "Best Perf: " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s" + << std::endl; } else { std::cout << "The requested reduction operation is not supported, please check !!!" << std::endl; }; + + return pass; }; template -void profile_reduce_impl(bool do_verification, +bool profile_reduce_impl(bool do_verification, int init_method, - bool do_log, bool do_dumpout, bool time_kernel, const std::vector& inLengths, const std::vector& reduceDims, ReduceTensorOp ReduceOpId, - NanPropagation NanOpt, - ReduceTensorIndices IndicesOpt, + bool PropagateNan, + bool UseIndex, float alpha, float beta) { bool matched = false; + bool pass = true; using tuple_of_description_instances = tensor_operation::device::device_reduce_instance::reduce_description_instances; @@ -651,29 +462,30 @@ void profile_reduce_impl(bool do_verification, using descType = remove_cvref_t(tuple_object))>; if(!description_match( - descType{}, inLengths.size(), reduceDims, ReduceOpId, NanOpt, IndicesOpt)) + descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex)) return; - profile_reduce_impl_impl(descType::ReduceOpId_), - static_cast(descType::NanOpt_), - static_cast(descType::IndicesOpt_)>( - do_verification, - init_method, - do_log, - do_dumpout, - time_kernel, - inLengths, - reduceDims, - alpha, - beta); + pass = pass && + profile_reduce_impl_impl(descType::ReduceOpId_), + static_cast(descType::PropagateNan_), + static_cast(descType::UseIndex_)>(do_verification, + init_method, + do_dumpout, + time_kernel, + inLengths, + reduceDims, + alpha, + beta); matched = true; }); + + return pass; }; } // namespace profiler diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp index 5e91a1d2d1..bdbac4fab4 100644 --- a/profiler/src/profile_reduce.cpp +++ b/profiler/src/profile_reduce.cpp @@ -1,27 +1,19 @@ #include #include -#include -#include #include #include #include #include #include -#include "config.hpp" -#include "print.hpp" -#include "device.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "device_tensor.hpp" +#include "data_type_enum.hpp" #include "reduction_enums.hpp" +#include "host_common_util.hpp" #include 
"profile_reduce_impl.hpp" using namespace std; -using ck::NanPropagation; -using ck::ReduceTensorIndices; using ck::ReduceTensorOp; static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, @@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr, {"bf16", no_argument, nullptr, '?'}, {"dumpout", required_argument, nullptr, 'o'}, {"verify", required_argument, nullptr, 'v'}, - {"log", required_argument, nullptr, 'l'}, {"help", no_argument, nullptr, '?'}, {nullptr, 0, nullptr, 0}}; -template -static T getSingleValueFromString(const string& valueStr) -{ - std::istringstream iss(valueStr); - - T val; - - iss >> val; - - return (val); -}; - -template -static std::vector getTypeValuesFromString(const char* cstr_values) -{ - std::string valuesStr(cstr_values); - - std::vector values; - std::size_t pos = 0; - std::size_t new_pos; - - new_pos = valuesStr.find(',', pos); - while(new_pos != std::string::npos) - { - const std::string sliceStr = valuesStr.substr(pos, new_pos - pos); - - T val = getSingleValueFromString(sliceStr); - - values.push_back(val); - - pos = new_pos + 1; - new_pos = valuesStr.find(',', pos); - }; - - std::string sliceStr = valuesStr.substr(pos); - T val = getSingleValueFromString(sliceStr); - - values.push_back(val); - - return (values); -} - -enum struct AppDataType -{ - appHalf = 0, - appFloat = 1, - appInt32 = 2, - appInt8 = 3, - appInt8x4 = 4, - appBFloat16 = 5, - appDouble = 6, -}; - static void check_reduce_dims(const int rank, const std::vector& reduceDims) { for(auto dim : reduceDims) @@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector& reduceDims }; }; -class AppArgs +class ReduceProfilerArgs { private: int option_index = 0; @@ -130,26 +68,23 @@ class AppArgs std::vector scales; - ReduceTensorOp reduceOp = ReduceTensorOp::ADD; - AppDataType compTypeId = AppDataType::appFloat; - AppDataType outTypeId = AppDataType::appFloat; + ReduceTensorOp reduceOp = ReduceTensorOp::ADD; + ck::DataTypeEnum compTypeId = ck::DataTypeEnum::Float; + ck::DataTypeEnum outTypeId = ck::DataTypeEnum::Float; bool compType_assigned = false; bool outType_assigned = false; - NanPropagation nanOpt = NanPropagation::NOT_PROPAGATE_NAN; - ReduceTensorIndices indicesOpt = ReduceTensorIndices::NO_INDICES; - bool do_log = false; - bool do_verification = false; - bool do_dumpout = false; + int nanOpt = 0; + int indicesOpt = 0; + bool do_verification = false; + bool do_dumpout = false; int init_method; bool time_kernel; - bool need_indices = false; - - AppArgs() = default; - ~AppArgs() = default; + ReduceProfilerArgs() = default; + ~ReduceProfilerArgs() = default; void show_usage(const char* cmd) { @@ -166,8 +101,11 @@ class AppArgs std::cout << "--outType or -W, optional enum value indicating the type of the reduced " "output, which could be float when the input data is half" << std::endl; - std::cout << "--nanOpt or -N, enum value indicates the selection for NanOpt" << std::endl; - std::cout << "--indicesOpt or -I, enum value indicates the selection for IndicesOpt" + std::cout + << "--nanOpt or -N, 1/0 value indicates the selection to use or not use Nan-Propagation" + << std::endl; + std::cout << "--indicesOpt or -I, 1/0 value indicates the selection to use or not use " + "index in reduction" << std::endl; std::cout << "--scales or -S, comma separated two float values for alpha and beta" << std::endl; @@ -181,18 +119,19 @@ class AppArgs std::cout << "--dumpout or -o, 1/0 to indicate where to save the 
reduction result to files " "for further analysis" << std::endl;
-        std::cout << "--log or -l, 1/0 to indicate whether to log some information" << std::endl;
     };

     int processArgs(int argc, char* argv[])
     {
+        using ck::host_common::getTypeValuesFromString;
+
         int ch;

         optind++; // to skip the "reduce" module name

         while(1)
         {
-            ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:l:", long_options, &option_index);
+            ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:", long_options, &option_index);
             if(ch == -1)
                 break;

             switch(ch)
@@ -219,27 +158,27 @@
                 if(!optarg)
                     throw std::runtime_error("Invalid option format!");

-                compTypeId        = static_cast<AppDataType>(std::atoi(optarg));
+                compTypeId        = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
                 compType_assigned = true;
                 break;
             case 'W':
                 if(!optarg)
                     throw std::runtime_error("Invalid option format!");

-                outTypeId        = static_cast<AppDataType>(std::atoi(optarg));
+                outTypeId        = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
                 outType_assigned = true;
                 break;
             case 'N':
                 if(!optarg)
                     throw std::runtime_error("Invalid option format!");

-                nanOpt = static_cast<NanPropagation>(std::atoi(optarg));
+                nanOpt = std::atoi(optarg);
                 break;
             case 'I':
                 if(!optarg)
                     throw std::runtime_error("Invalid option format!");

-                indicesOpt = static_cast<ReduceTensorIndices>(std::atoi(optarg));
+                indicesOpt = std::atoi(optarg);
                 break;
             case 'S':
                 if(!optarg)
@@ -262,12 +201,6 @@
                 do_dumpout = static_cast<bool>(std::atoi(optarg));
                 break;

-            case 'l':
-                if(!optarg)
-                    throw std::runtime_error("Invalid option format!");
-
-                do_log = static_cast<bool>(std::atoi(optarg));
-                break;
             case '?':
                 if(std::string(long_options[option_index].name) == "half")
                     use_half = true;
@@ -295,7 +228,7 @@
             throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");

         init_method = std::atoi(argv[optind++]);
-        time_kernel = std::atoi(argv[optind]);
+        time_kernel = static_cast<bool>(std::atoi(argv[optind]));

         if(scales.empty())
         {
@@ -306,9 +239,6 @@
         if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX ||
            reduceOp == ReduceTensorOp::AMAX)
         {
-            if(indicesOpt != ReduceTensorIndices::NO_INDICES)
-                need_indices = true;
-
             // for indexable operations, no need to assign compType and outType, just let them be
             // same as inType
             compType_assigned = false;
@@ -322,9 +252,10 @@

 int profile_reduce(int argc, char* argv[])
 {
-    using namespace ck::profiler;
+    using ck::DataTypeEnum;
+    using ck::profiler::profile_reduce_impl;

-    AppArgs args;
+    ReduceProfilerArgs args;

     if(args.processArgs(argc, argv) < 0)
         return (-1);
@@ -339,42 +270,41 @@
     if(args.use_half)
     {
         if(!args.compType_assigned)
-            args.compTypeId = AppDataType::appHalf;
+            args.compTypeId = DataTypeEnum::Half;

         if(args.outType_assigned &&
-           (args.outTypeId != AppDataType::appHalf && args.outTypeId != AppDataType::appFloat))
-            args.outTypeId = AppDataType::appFloat;
+           (args.outTypeId != DataTypeEnum::Half && args.outTypeId != DataTypeEnum::Float))
+            args.outTypeId = DataTypeEnum::Float;

         if(!args.outType_assigned)
-            args.outTypeId = AppDataType::appHalf;
+            args.outTypeId = DataTypeEnum::Half;

-        if(args.compTypeId == AppDataType::appHalf)
+        if(args.compTypeId == DataTypeEnum::Half)
         {
-            profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification,
-                                                                    args.init_method,
-                                                                    args.do_log,
-                                                                    args.do_dumpout,
-                                                                    args.time_kernel,
-                                                                    args.inLengths,
-                                                                    args.reduceDims,
-                                                                    args.reduceOp,
-                                                                    args.nanOpt,
-                                                                    args.indicesOpt,
-                                                                    args.scales[0],
-                                                                    args.scales[1]);
+            profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(
+                args.do_verification,
+                args.init_method,
+                args.do_dumpout,
+                args.time_kernel,
+                args.inLengths,
+                args.reduceDims,
+
args.reduceOp, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), + args.scales[0], + args.scales[1]); } - else if(args.compTypeId == AppDataType::appFloat) + else if(args.compTypeId == DataTypeEnum::Float) { profile_reduce_impl(args.do_verification, args.init_method, - args.do_log, args.do_dumpout, args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, - args.nanOpt, - args.indicesOpt, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), args.scales[0], args.scales[1]); } @@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[]) { profile_reduce_impl(args.do_verification, args.init_method, - args.do_log, args.do_dumpout, args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, - args.nanOpt, - args.indicesOpt, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), args.scales[0], args.scales[1]); } else if(args.use_int8) { if(!args.compType_assigned) - args.compTypeId = AppDataType::appInt8; + args.compTypeId = DataTypeEnum::Int8; if(args.outType_assigned && - (args.outTypeId != AppDataType::appInt8 && args.outTypeId != AppDataType::appInt32)) - args.outTypeId = AppDataType::appInt32; + (args.outTypeId != DataTypeEnum::Int8 && args.outTypeId != DataTypeEnum::Int32)) + args.outTypeId = DataTypeEnum::Int32; if(!args.outType_assigned) - args.outTypeId = AppDataType::appInt8; + args.outTypeId = DataTypeEnum::Int8; - if(args.compTypeId == AppDataType::appInt8) + if(args.compTypeId == DataTypeEnum::Int8) { profile_reduce_impl(args.do_verification, args.init_method, - args.do_log, args.do_dumpout, args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, - args.nanOpt, - args.indicesOpt, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), args.scales[0], args.scales[1]); } - else if(args.compTypeId == AppDataType::appInt32) + else if(args.compTypeId == DataTypeEnum::Int32) { profile_reduce_impl(args.do_verification, args.init_method, - args.do_log, args.do_dumpout, args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, - args.nanOpt, - args.indicesOpt, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), args.scales[0], args.scales[1]); } @@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[]) else if(args.use_bf16) { if(args.outType_assigned && - (args.outTypeId != AppDataType::appBFloat16 && args.outTypeId != AppDataType::appFloat)) - args.outTypeId = AppDataType::appFloat; + (args.outTypeId != DataTypeEnum::BFloat16 && args.outTypeId != DataTypeEnum::Float)) + args.outTypeId = DataTypeEnum::Float; if(!args.outType_assigned) - args.outTypeId = AppDataType::appBFloat16; + args.outTypeId = DataTypeEnum::BFloat16; profile_reduce_impl(args.do_verification, args.init_method, - args.do_log, args.do_dumpout, args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, - args.nanOpt, - args.indicesOpt, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), args.scales[0], args.scales[1]); } else { - if(args.compTypeId == AppDataType::appFloat) + if(args.compTypeId == DataTypeEnum::Float) { profile_reduce_impl(args.do_verification, args.init_method, - args.do_log, args.do_dumpout, args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, - args.nanOpt, - args.indicesOpt, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), args.scales[0], args.scales[1]); } - else if(args.compTypeId == AppDataType::appDouble) + else if(args.compTypeId == DataTypeEnum::Double) { profile_reduce_impl(args.do_verification, args.init_method, - args.do_log, args.do_dumpout, args.time_kernel, args.inLengths, 
args.reduceDims, args.reduceOp, - args.nanOpt, - args.indicesOpt, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), args.scales[0], args.scales[1]); } diff --git a/script/test_reduce_no_index.sh b/script/test_reduce_no_index.sh index 95e563c93c..b956303837 100755 --- a/script/test_reduce_no_index.sh +++ b/script/test_reduce_no_index.sh @@ -15,6 +15,17 @@ bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2 bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2 bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2 +## for float64 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 1 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 2 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 3 6 2 + ## for float16 bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2 bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2 diff --git a/script/test_reduce_with_index.sh b/script/test_reduce_with_index.sh index 8e7ed33847..b0843ba6c1 100755 --- a/script/test_reduce_with_index.sh +++ b/script/test_reduce_with_index.sh @@ -15,6 +15,17 @@ bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2 bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2 bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2 +## for float64 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 1 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 2 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 3 6 2 + ## for float16 bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2 bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2 diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp index 317abab53a..20030392b5 100644 --- a/test/reduce/reduce_no_index.cpp +++ b/test/reduce/reduce_no_index.cpp @@ -1,384 +1,10 @@ #include "getopt.h" -#include "check_err.hpp" -#include "device_reduce_instance.hpp" -#include "reduction_enums.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "host_reduction.hpp" -#include "reduce_util.hpp" +#include "host_common_util.hpp" +#include "profile_reduce_impl.hpp" using namespace ck; -namespace { - -template -static inline std::vector get_invariant_dims(const std::vector& reduceDims) -{ - assert(NumReduceDim == reduceDims.size()); - - int reduceFlag = 0; - - // flag the bits for the reduceDims - for(int i = 0; i < NumReduceDim; i++) - { - reduceFlag |= 1 << reduceDims[i]; - }; - - std::vector invariantDims; - - // collect invariant dimensions - for(int i = 0; i < Rank; i++) - if((reduceFlag & (1 << i)) == 0) - { - invariantDims.push_back(i); - }; - - return invariantDims; -}; - -constexpr int Rank = 4; - -constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG; -constexpr NanPropagation NanOpt = NanPropagation::PROPAGATE_NAN; -constexpr bool PropagateNan = false; -constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES; -constexpr bool NeedIndices = false; - -template -bool test_reduce_no_index_impl(int init_method, - const 
std::vector& inLengths, - const std::vector& reduceDims, - float alpha, - float beta) -{ - using namespace ck::tensor_operation::device; - using namespace ck::tensor_operation::device::device_reduce_instance; - using namespace ck::host_reduce; - - constexpr bool out_support_atomic_add = std::is_same::value; - constexpr bool op_support_atomic_add = true; - constexpr bool use_atomic_add = (out_support_atomic_add && op_support_atomic_add); - - Tensor in(inLengths); - - std::vector outLengths; - - const auto invariantDims = get_invariant_dims(reduceDims); - - if(reduceDims.size() == Rank) - outLengths.push_back(1); - else - for(auto dim : invariantDims) - outLengths.push_back(inLengths[dim]); - - Tensor out_ref(outLengths); - Tensor out(outLengths); - - // only used when the OutDataType is bhalf_t - Tensor out_ref_fp32(outLengths); - Tensor out_fp32(outLengths); - - auto inStrides = in.mDesc.GetStrides(); - auto outStrides = out.mDesc.GetStrides(); - - size_t invariant_total_length = out.mDesc.GetElementSize(); - size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; - - std::size_t num_thread = 1; - - switch(init_method) - { - case 0: break; - case 1: - in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); - if(beta != 0.0f) - out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); - break; - case 2: - in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - if(beta != 0.0f) - out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - break; - default: - in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); - if(beta != 0.0f) - out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); - } - - if(beta != 0.0f) - for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++) - out.mData[i] = out_ref.mData[i]; - - // these buffers are usually provided by the user application - DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace()); - DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace()); - - in_dev.ToDevice(in.mData.data()); - - if(beta != 0.0f) - out_dev.ToDevice(out.mData.data()); - - using InElementwiseOperation_0 = - typename reduce_unary_operator::InElementwiseOperation; - using AccElementwiseOperation_0 = - typename reduce_unary_operator:: - AccElementwiseOperation; - using InElementwiseOperation_1 = - typename reduce_unary_operator:: - InElementwiseOperation; - using AccElementwiseOperation_1 = - typename reduce_unary_operator:: - AccElementwiseOperation; - using InElementwiseOperation_2 = - typename reduce_unary_operator:: - InElementwiseOperation; - using AccElementwiseOperation_2 = - typename reduce_unary_operator:: - AccElementwiseOperation; - - using DeviceReduceInstPtr0 = - DeviceReducePtr; - using DeviceReduceInstPtr1 = - DeviceReducePtr; - using DeviceReduceInstPtr2 = - DeviceReducePtr; - - std::vector reduce0_ptrs; - std::vector reduce1_ptrs; - std::vector reduce2_ptrs; - - add_device_reduce_instance_threadwise(reduce0_ptrs); - - add_device_reduce_instance_blockwise(reduce0_ptrs); - - if constexpr(use_atomic_add) - { - add_device_reduce_instance_multiblock_atomic_add(reduce0_ptrs); - } - else - { - add_device_reduce_instance_multiblock_partial_reduce(reduce1_ptrs); - }; - - // used for secondary reduction - if constexpr(!use_atomic_add) - { - add_device_reduce_instance_blockwise_second_call(reduce2_ptrs); - }; - - if(reduce0_ptrs.empty() && reduce1_ptrs.empty()) - { - throw std::runtime_error("Wrong! 
No device REDUCE instance found"); - }; - - bool result = true; - - ReductionHost - hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); - - hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr); - - const auto i_inLengths = to_int_vector(inLengths); - const auto i_inStrides = to_int_vector(inStrides); - const auto i_outLengths = to_int_vector(outLengths); - const auto i_outStrides = to_int_vector(outStrides); - - for(auto& reduce_ptr : reduce0_ptrs) - { - auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); - - DeviceMem ws_dev(wsSizeInBytes); - - InElementwiseOperation_0 in_elementwise_op_0(static_cast(reduce_total_length)); - AccElementwiseOperation_0 acc_elementwise_op_0(static_cast(reduce_total_length)); - - auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - nullptr, - ws_dev.GetDeviceBuffer(), - in_elementwise_op_0, - acc_elementwise_op_0); - - if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) - continue; - - auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); - - (void)invoker_ptr->Run(argument_ptr.get()); - - out_dev.FromDevice(out.mData.data()); - - bool single_result = true; - - if constexpr(std::is_same::value || - std::is_same::value) - { - reduce_util::to_f32_vector(out, out_fp32); - reduce_util::to_f32_vector(out_ref, out_ref_fp32); - single_result = ck::utils::check_err( - out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!"); - } - else - { - single_result = - ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!"); - }; - - if(!single_result) - { - std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl; - result = false; - } - }; - - for(auto& reduce_ptr : reduce1_ptrs) - { - auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); - - DeviceMem ws_dev(wsSizeInBytes); - - InElementwiseOperation_1 in_elementwise_op_1(static_cast(reduce_total_length)); - AccElementwiseOperation_1 acc_elementwise_op_1(static_cast(reduce_total_length)); - - auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - nullptr, - ws_dev.GetDeviceBuffer(), - in_elementwise_op_1, - acc_elementwise_op_1); - - if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) - continue; - - auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); - - (void)invoker_ptr->Run(argument_ptr.get()); - - std::vector inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get()); - std::vector inStrides2{inLengths2[1], 1}; - - for(auto& reduce2_ptr : reduce2_ptrs) - { - InElementwiseOperation_2 in_elementwise_op_2(static_cast(reduce_total_length)); - AccElementwiseOperation_2 acc_elementwise_op_2( - static_cast(reduce_total_length)); - - auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(inLengths2, - inStrides2, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - ws_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - nullptr, - ws_dev.GetDeviceBuffer(), - in_elementwise_op_2, - acc_elementwise_op_2); - - if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get())) - continue; - - std::string reduce2_name = reduce2_ptr->GetTypeString(); - - auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer(); - - (void)invoker2_ptr->Run(argument2_ptr.get()); - - 
 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                        {"reduceDimensions", required_argument, nullptr, 'R'},
                                        {"scales", required_argument, nullptr, 'S'},
@@ -387,48 +13,6 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
 class SimpleAppArgs
 {
-    template <typename T>
-    static T getSingleValueFromString(const std::string& valueStr)
-    {
-        std::istringstream iss(valueStr);
-
-        T ret;
-
-        iss >> ret;
-
-        return (ret);
-    };
-
-    template <typename T>
-    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
-    {
-        std::string valuesStr(cstr_values);
-
-        std::vector<T> values;
-        std::size_t pos = 0;
-        std::size_t new_pos;
-
-        new_pos = valuesStr.find(',', pos);
-        while(new_pos != std::string::npos)
-        {
-            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
-
-            T val = getSingleValueFromString<T>(sliceStr);
-
-            values.push_back(val);
-
-            pos     = new_pos + 1;
-            new_pos = valuesStr.find(',', pos);
-        };
-
-        std::string sliceStr = valuesStr.substr(pos);
-        T val                = getSingleValueFromString<T>(sliceStr);
-
-        values.push_back(val);
-
-        return (values);
-    };
-
     private:
     int option_index = 0;
 
@@ -460,6 +44,8 @@ class SimpleAppArgs
     int processArgs(int argc, char* argv[])
     {
+        using ck::host_common::getTypeValuesFromString;
+
         int ch;
 
         while(1)
@@ -514,7 +100,7 @@ class SimpleAppArgs
            (reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
             return (-1);
 
-        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5)
+        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
             return (-1);
 
         return (0);
@@ -525,87 +111,92 @@
 bool test_reduce_no_index(int data_type,
                           int init_method,
                           std::vector<int> reduceDims,
                           std::vector<size_t> inLengths,
+                          ReduceTensorOp reduceOpId,
+                          bool propagateNan,
                           float alpha,
                           float beta)
 {
+    using ck::profiler::profile_reduce_impl;
+
     bool result = true;
 
     if(data_type == 0)
     {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_no_index_impl<float, float, float, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_no_index_impl<float, float, float, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_no_index_impl<float, float, float, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<float, float, float>(
+            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, false, alpha, beta);
     }
     else if(data_type == 1)
     {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_no_index_impl<half_t, float, half_t, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_no_index_impl<half_t, float, half_t, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_no_index_impl<half_t, float, half_t, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<half_t, float, half_t>(
+            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, false, alpha, beta);
    }
    else if(data_type == 3)
    {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<int8_t, int32_t, int8_t>(
+            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, false, alpha, beta);
    }
    else if(data_type == 5)
    {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_no_index_impl<bhalf_t, float, bhalf_t, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_no_index_impl<bhalf_t, float, bhalf_t, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_no_index_impl<bhalf_t, float, bhalf_t, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<bhalf_t, float, bhalf_t>(
+            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, false, alpha, beta);
+    }
+    else if(data_type == 6)
+    {
+        result = profile_reduce_impl<double, double, double>(
+            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, false, alpha, beta);
    }
 
     return (result);
 };
 
+constexpr ReduceTensorOp reduceOpId = ReduceTensorOp::AVG;
+constexpr bool propagateNan         = false;
+
 int main(int argc, char* argv[])
 {
     SimpleAppArgs args;
 
@@ -621,8 +212,14 @@ int main(int argc, char* argv[])
             {0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};
 
         for(auto& reduceDims : v_reduceDims)
-            result = result && test_reduce_no_index(data_type, init_method, reduceDims, inLengths, 1.0f, 0.0f);
+            result = result && test_reduce_no_index(data_type, init_method, reduceDims, inLengths, reduceOpId, propagateNan, 1.0f, 0.0f);
     }
     else
     {
@@ -636,6 +233,8 @@ int main(int argc, char* argv[])
                                        args.init_method,
                                        args.reduceDims,
                                        args.inLengths,
+                                       reduceOpId,
+                                       propagateNan,
                                        args.scales[0],
                                        args.scales[1]);
     }
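Both test drivers validate the same contract, out = alpha * reduce(in) + beta * out_prev, against ReductionHost. As a self-contained illustration of those alpha/beta semantics (a minimal stand-in for the reference computation, not CK's ReductionHost), an AVG reduction over the rows of a row-major 2-D tensor looks like:

    #include <cstddef>
    #include <vector>

    // Minimal reference for out[i] = alpha * avg(in[i][:]) + beta * out[i].
    // Mirrors the alpha/beta convention exercised by the tests above.
    void avg_reduce_rows(const std::vector<float>& in, std::vector<float>& out,
                         std::size_t rows, std::size_t cols, float alpha, float beta)
    {
        for(std::size_t i = 0; i < rows; ++i)
        {
            float acc = 0.0f;
            for(std::size_t j = 0; j < cols; ++j)
                acc += in[i * cols + j];

            acc /= static_cast<float>(cols); // AVG divides by the reduce length

            out[i] = alpha * acc + beta * out[i];
        }
    }

This is also why the tests only pre-initialize out_ref (and copy it into out) when beta is nonzero: with beta == 0 the previous output never contributes.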
diff --git a/test/reduce/reduce_util.hpp b/test/reduce/reduce_util.hpp
deleted file mode 100644
index 9eb66513bf..0000000000
--- a/test/reduce/reduce_util.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef REDUCE_UTILS_HPP
-#define REDUCE_UTILS_HPP
-
-#include "data_type.hpp"
-
-namespace ck {
-namespace reduce_util {
-
-template <typename InT, typename OutT>
-void to_f32_vector(const Tensor<InT>& src, Tensor<OutT>& dst)
-{
-    for(std::size_t i = 0; i < src.mData.size(); ++i)
-        dst.mData[i] = type_convert<OutT>(src.mData[i]);
-}
-
-} // namespace reduce_util
-
-} // namespace ck
-#endif
diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp
index d7d5e551a2..c1918bf388 100644
--- a/test/reduce/reduce_with_index.cpp
+++ b/test/reduce/reduce_with_index.cpp
@@ -1,387 +1,10 @@
 #include "getopt.h"
 
-#include "device_reduce_instance.hpp"
-#include "reduction_enums.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "host_reduction.hpp"
-#include "check_err.hpp"
-#include "reduce_util.hpp"
+
+#include "host_common_util.hpp"
+#include "profile_reduce_impl.hpp"
 
 using namespace ck;
 
-namespace {
-
-template <int Rank, int NumReduceDim>
-static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
-{
-    assert(NumReduceDim == reduceDims.size());
-
-    int reduceFlag = 0;
-
-    // flag the bits for the reduceDims
-    for(int i = 0; i < NumReduceDim; i++)
-    {
-        reduceFlag |= 1 << reduceDims[i];
-    };
-
-    std::vector<int> invariantDims;
-
-    // collect invariant dimensions
-    for(int i = 0; i < Rank; i++)
-        if((reduceFlag & (1 << i)) == 0)
-        {
-            invariantDims.push_back(i);
-        };
-
-    return invariantDims;
-};
-
-constexpr int Rank = 4;
-
-constexpr ReduceTensorOp ReduceOpId      = ReduceTensorOp::AMAX;
-constexpr NanPropagation NanOpt          = NanPropagation::PROPAGATE_NAN;
-constexpr bool PropagateNan              = false;
-constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::FLATTENED_INDICES;
-constexpr bool NeedIndices               = true;
-
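The deleted get_invariant_dims above relies on a small bitmask trick: set bit d for every reduced dimension d, then keep exactly the dimensions whose bits stay clear. The same technique as a standalone, runnable function:

    #include <vector>

    // Collect the dimensions of a rank-`rank` tensor that are NOT reduced.
    // Bit d of reduceFlag is set iff dimension d appears in reduceDims.
    std::vector<int> invariant_dims(int rank, const std::vector<int>& reduceDims)
    {
        int reduceFlag = 0;
        for(int d : reduceDims)
            reduceFlag |= 1 << d;

        std::vector<int> invariantDims;
        for(int d = 0; d < rank; ++d)
            if((reduceFlag & (1 << d)) == 0)
                invariantDims.push_back(d);

        return invariantDims;
    }

    // e.g. invariant_dims(4, {0, 2, 3}) == {1}

Because the flag fits in an int, this works for any rank up to the bit width of int, which comfortably covers the rank-4 tensors used by these tests.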
-template <typename InDataType, typename AccDataType, typename OutDataType, int Rank, int NumReduceDim>
-bool test_reduce_with_index_impl(int init_method,
-                                 const std::vector<size_t>& inLengths,
-                                 const std::vector<int>& reduceDims,
-                                 float alpha,
-                                 float beta)
-{
-    using namespace ck::tensor_operation::device;
-    using namespace ck::tensor_operation::device::device_reduce_instance;
-    using namespace ck::host_reduce;
-
-    Tensor<InDataType> in(inLengths);
-
-    std::vector<size_t> outLengths;
-
-    const auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
-
-    if(reduceDims.size() == Rank)
-        outLengths.push_back(1);
-    else
-        for(auto dim : invariantDims)
-            outLengths.push_back(inLengths[dim]);
-
-    Tensor<OutDataType> out_ref(outLengths);
-    Tensor<OutDataType> out(outLengths);
-    Tensor<int> out_indices_ref(outLengths);
-    Tensor<int> out_indices(outLengths);
-
-    // only used when the OutDataType is bhalf_t
-    Tensor<float> out_ref_fp32(outLengths);
-    Tensor<float> out_fp32(outLengths);
-
-    auto inStrides  = in.mDesc.GetStrides();
-    auto outStrides = out.mDesc.GetStrides();
-
-    size_t invariant_total_length = out.mDesc.GetElementSize();
-    size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
-
-    std::size_t num_thread = 1;
-
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1}, num_thread);
-        break;
-    case 2:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}, num_thread);
-        break;
-    default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-5.0, 5.0}, num_thread);
-    }
-
-    if(beta != 0.0f)
-        for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
-            out.mData[i] = out_ref.mData[i];
-
-    // these buffers are usually provided by the user application
-    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
-    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
-
-    in_dev.ToDevice(in.mData.data());
-
-    if(beta != 0.0f)
-        out_dev.ToDevice(out.mData.data());
-
-    size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0;
-
-    DeviceMem out_indices_dev(indicesSizeInBytes);
-
-    using InElementwiseOperation_0 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
-    using AccElementwiseOperation_0 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;
-    using InElementwiseOperation_1 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::InElementwiseOperation;
-    using AccElementwiseOperation_1 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::AccElementwiseOperation;
-    using InElementwiseOperation_2 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::InElementwiseOperation;
-    using AccElementwiseOperation_2 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::AccElementwiseOperation;
-
-    using DeviceReduceInstPtr0 = DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>;
-    using DeviceReduceInstPtr1 = DeviceReducePtr<InElementwiseOperation_1, AccElementwiseOperation_1>;
-    using DeviceReduceInstPtr2 = DeviceReducePtr<InElementwiseOperation_2, AccElementwiseOperation_2>;
-
-    std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
-    std::vector<DeviceReduceInstPtr1> reduce1_ptrs;
-    std::vector<DeviceReduceInstPtr2> reduce2_ptrs;
-
-    add_device_reduce_instance_threadwise<InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOpId, NanOpt, IndicesOpt>(reduce0_ptrs);
-
-    add_device_reduce_instance_blockwise<InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOpId, NanOpt, IndicesOpt>(reduce0_ptrs);
-
-    add_device_reduce_instance_multiblock_partial_reduce<InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOpId, NanOpt, IndicesOpt>(reduce1_ptrs);
-
-    add_device_reduce_instance_blockwise_second_call<AccDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOpId, NanOpt, IndicesOpt>(reduce2_ptrs);
-
-    if(reduce0_ptrs.empty() && reduce1_ptrs.empty())
-    {
-        throw std::runtime_error("Wrong! No device REDUCE instance found");
-    };
-
-    bool result = true;
-
-    ReductionHost<InDataType, AccDataType, OutDataType, ReduceOpId, Rank, NumReduceDim, PropagateNan, NeedIndices>
-        hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-
-    hostReduce.Run(
-        alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
-
-    const auto i_inLengths  = to_int_vector(inLengths);
-    const auto i_inStrides  = to_int_vector(inStrides);
-    const auto i_outLengths = to_int_vector(outLengths);
-    const auto i_outStrides = to_int_vector(outStrides);
-
-    for(auto& reduce_ptr : reduce0_ptrs)
-    {
-        auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-
-        DeviceMem ws_dev(wsSizeInBytes);
-
-        InElementwiseOperation_0 in_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-        AccElementwiseOperation_0 acc_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-
-        auto argument_ptr = reduce_ptr->MakeArgumentPointer(
-            i_inLengths, i_inStrides, i_outLengths, i_outStrides, reduceDims, alpha, beta,
-            in_dev.GetDeviceBuffer(), out_dev.GetDeviceBuffer(),
-            out_indices_dev.GetDeviceBuffer(), ws_dev.GetDeviceBuffer(),
-            in_elementwise_op_0, acc_elementwise_op_0);
-
-        if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
-            continue;
-
-        auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
-
-        (void)invoker_ptr->Run(argument_ptr.get());
-
-        out_dev.FromDevice(out.mData.data());
-
-        bool single_result = true;
-
-        if constexpr(std::is_same<OutDataType, bhalf_t>::value ||
-                     std::is_same<OutDataType, half_t>::value)
-        {
-            reduce_util::to_f32_vector(out, out_fp32);
-            reduce_util::to_f32_vector(out_ref, out_ref_fp32);
-            single_result = ck::utils::check_err(
-                out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
-        }
-        else
-        {
-            single_result =
-                ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
-        };
-
-        if(NeedIndices)
-        {
-            out_indices_dev.FromDevice(out_indices.mData.data());
-            single_result = single_result && ck::utils::check_err(out_indices_ref.mData,
-                                                                  out_indices.mData,
-                                                                  "Error: incorrect index result!");
-        };
-
-        if(!single_result)
-        {
-            std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl;
-            result = false;
-        }
-    };
-
-    for(auto& reduce_ptr : reduce1_ptrs)
-    {
-        auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-
-        DeviceMem ws_dev(wsSizeInBytes);
-
-        InElementwiseOperation_1 in_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
-        AccElementwiseOperation_1 acc_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
-
-        auto argument_ptr = reduce_ptr->MakeArgumentPointer(
-            i_inLengths, i_inStrides, i_outLengths, i_outStrides, reduceDims, alpha, beta,
-            in_dev.GetDeviceBuffer(), out_dev.GetDeviceBuffer(),
-            out_indices_dev.GetDeviceBuffer(), ws_dev.GetDeviceBuffer(),
-            in_elementwise_op_1, acc_elementwise_op_1);
-
-        if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
-            continue;
-
-        std::string reduce_name = reduce_ptr->GetTypeString();
-
-        auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
-
-        (void)invoker_ptr->Run(argument_ptr.get());
-
-        std::vector<int> inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get());
-        std::vector<int> inStrides2{inLengths2[1], 1};
-
-        for(auto& reduce2_ptr : reduce2_ptrs)
-        {
-            InElementwiseOperation_2 in_elementwise_op_2(static_cast<int32_t>(reduce_total_length));
-            AccElementwiseOperation_2 acc_elementwise_op_2(static_cast<int32_t>(reduce_total_length));
-
-            auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(
-                inLengths2, inStrides2, i_outLengths, i_outStrides, reduceDims, alpha, beta,
-                ws_dev.GetDeviceBuffer(), out_dev.GetDeviceBuffer(),
-                out_indices_dev.GetDeviceBuffer(), ws_dev.GetDeviceBuffer(),
-                in_elementwise_op_2, acc_elementwise_op_2);
-
-            if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
-                continue;
-
-            std::string reduce2_name = reduce2_ptr->GetTypeString();
-
-            auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
-
-            (void)invoker2_ptr->Run(argument2_ptr.get());
-
-            out_dev.FromDevice(out.mData.data());
-
-            bool single_result = true;
-
-            if constexpr(std::is_same<OutDataType, bhalf_t>::value ||
-                         std::is_same<OutDataType, half_t>::value)
-            {
-                reduce_util::to_f32_vector(out, out_fp32);
-                reduce_util::to_f32_vector(out_ref, out_ref_fp32);
-                single_result = ck::utils::check_err(
-                    out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
-            }
-            else
-            {
-                single_result =
-                    ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
-            };
-
-            if(NeedIndices)
-            {
-                out_indices_dev.FromDevice(out_indices.mData.data());
-                single_result =
-                    single_result && ck::utils::check_err(out_indices_ref.mData,
                                                           out_indices.mData,
                                                           "Error: incorrect index result!");
-            };
-
-            if(!single_result)
-            {
-                std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << " => "
-                          << reduce2_ptr->GetTypeString() << std::endl;
-                result = false;
-            }
-        };
-    };
-
-    return (result);
-};
-
-} // anonymous namespace
-
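For ReduceTensorOp::AMAX with FLATTENED_INDICES, the device instances return, per output element, the flattened offset of the largest-magnitude input within the reduced slice; the index checks above compare that against out_indices_ref. A self-contained host analogue for a single reduced slice (an illustrative sketch, not the ReductionHost implementation):

    #include <cmath>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // For one reduced slice, return {amax_value, flattened_index_of_amax},
    // mirroring what the per-element index check verifies.
    std::pair<float, int> amax_with_index(const std::vector<float>& slice)
    {
        float best  = 0.0f;
        int best_id = 0;
        for(std::size_t k = 0; k < slice.size(); ++k)
            if(std::abs(slice[k]) > best)
            {
                best    = std::abs(slice[k]);
                best_id = static_cast<int>(k);
            }
        return {best, best_id};
    }

Indexed reductions are also the reason this file never adds an atomic-add instance: partial results landing in arbitrary order cannot carry a winning index, so only the threadwise, blockwise, and partial-reduce-plus-second-call paths are populated above.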
 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                        {"reduceDimensions", required_argument, nullptr, 'R'},
                                        {"scales", required_argument, nullptr, 'S'},
@@ -390,48 +13,6 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
 class SimpleAppArgs
 {
-    template <typename T>
-    static T getSingleValueFromString(const std::string& valueStr)
-    {
-        std::istringstream iss(valueStr);
-
-        T ret;
-
-        iss >> ret;
-
-        return (ret);
-    };
-
-    template <typename T>
-    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
-    {
-        std::string valuesStr(cstr_values);
-
-        std::vector<T> values;
-        std::size_t pos = 0;
-        std::size_t new_pos;
-
-        new_pos = valuesStr.find(',', pos);
-        while(new_pos != std::string::npos)
-        {
-            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
-
-            T val = getSingleValueFromString<T>(sliceStr);
-
-            values.push_back(val);
-
-            pos     = new_pos + 1;
-            new_pos = valuesStr.find(',', pos);
-        };
-
-        std::string sliceStr = valuesStr.substr(pos);
-        T val                = getSingleValueFromString<T>(sliceStr);
-
-        values.push_back(val);
-
-        return (values);
-    };
-
     private:
     int option_index = 0;
 
@@ -463,6 +44,8 @@ class SimpleAppArgs
     int processArgs(int argc, char* argv[])
     {
+        using ck::host_common::getTypeValuesFromString;
+
         int ch;
 
         while(1)
@@ -517,7 +100,7 @@ class SimpleAppArgs
            (reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
             return (-1);
 
-        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5)
+        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
             return (-1);
 
         return (0);
@@ -528,87 +111,92 @@
 bool test_reduce_with_index(int data_type,
                             int init_method,
                             std::vector<int> reduceDims,
                             std::vector<size_t> inLengths,
+                            ReduceTensorOp reduceOpId,
+                            bool propagateNan,
                             float alpha,
                             float beta)
 {
+    using ck::profiler::profile_reduce_impl;
+
     bool result = true;
 
     if(data_type == 0)
     {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_with_index_impl<float, float, float, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_with_index_impl<float, float, float, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_with_index_impl<float, float, float, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<float, float, float>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, true, alpha, beta);
     }
     else if(data_type == 1)
     {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_with_index_impl<half_t, half_t, half_t, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_with_index_impl<half_t, half_t, half_t, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_with_index_impl<half_t, half_t, half_t, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<half_t, half_t, half_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, true, alpha, beta);
     }
     else if(data_type == 3)
     {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<int8_t, int8_t, int8_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, true, alpha, beta);
     }
     else if(data_type == 5)
     {
-        switch(reduceDims.size())
-        {
-        case 1:
-            result = test_reduce_with_index_impl<bhalf_t, float, bhalf_t, Rank, 1>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 3:
-            result = test_reduce_with_index_impl<bhalf_t, float, bhalf_t, Rank, 3>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        case 4:
-            result = test_reduce_with_index_impl<bhalf_t, float, bhalf_t, Rank, 4>(init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
+        result = profile_reduce_impl<bhalf_t, float, bhalf_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, true, alpha, beta);
+    }
+    else if(data_type == 6)
+    {
+        result = profile_reduce_impl<double, double, double>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan, true, alpha, beta);
     }
 
     return (result);
 };
 
+constexpr ReduceTensorOp reduceOpId = ReduceTensorOp::AMAX;
+constexpr bool propagateNan         = false;
+
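Since both tests now funnel into the shared ck::profiler::profile_reduce_impl, the same entry point can be driven directly. A hypothetical standalone caller is sketched below; the positional booleans are read here as (do_verification, do_log, time_kernel, use_index) to match the calls above, but those parameter names, as well as the example lengths, are assumptions:

    #include <cstddef>
    #include <vector>

    #include "profile_reduce_impl.hpp"

    // Hypothetical driver: verify an fp32 AMAX reduction (with indices) over
    // all four dimensions, matching test_reduce_with_index's data_type == 0 path.
    bool amax_fp32_all_dims()
    {
        std::vector<std::size_t> inLengths{16, 64, 32, 960}; // example shape (assumed)
        std::vector<int> reduceDims{0, 1, 2, 3};

        return ck::profiler::profile_reduce_impl<float, float, float>(
            true,  // do_verification (assumed name)
            2,     // init_method: random integer data in [-5, 5)
            false, // do_log (assumed name)
            false, // time_kernel (assumed name)
            inLengths,
            reduceDims,
            ck::ReduceTensorOp::AMAX,
            false, // propagateNan
            true,  // request index output
            1.0f,  // alpha
            0.0f); // beta
    }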
 int main(int argc, char* argv[])
 {
     SimpleAppArgs args;
 
@@ -624,8 +212,14 @@ int main(int argc, char* argv[])
             {0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};
 
         for(auto& reduceDims : v_reduceDims)
-            result = result && test_reduce_with_index(data_type, init_method, reduceDims, inLengths, 1.0f, 0.0f);
+            result = result && test_reduce_with_index(data_type, init_method, reduceDims, inLengths, reduceOpId, propagateNan, 1.0f, 0.0f);
     }
     else
     {
@@ -639,6 +233,8 @@ int main(int argc, char* argv[])
                                          args.init_method,
                                          args.reduceDims,
                                          args.inLengths,
+                                         reduceOpId,
+                                         propagateNan,
                                          args.scales[0],
                                          args.scales[1]);
     }