From 3c3d1a0cdf00db4841e8e3705473c28a481a1858 Mon Sep 17 00:00:00 2001
From: Qianfeng
Date: Thu, 2 Jun 2022 10:49:53 +0800
Subject: [PATCH] Unify the naming of the math functions used by the host and kernel (#262)

* Use the unified naming for math functions on host and HIP kernel

* Corresponding change/simplification in reduction host/profiler/examples due to unified math functions renaming

* Renaming GetReductionZeroVal() to GetIdentityValue()

* Tiny renaming in profile_reduce_impl.hpp

* More renaming in profile_reduce_impl.hpp

* Replace zeroVal by identityVal

* Remove ck_ prefix in the naming of ck::math provided functions

[ROCm/composable_kernel commit: 86185bd7ce1b84696f064822e05837dd63e4f218]
---
 example/12_reduce/reduce_blockwise.cpp        |   6 +-
 .../12_reduce/reduce_blockwise_two_call.cpp   |   6 +-
 example/13_pool2d_fwd/pool2d_fwd_common.hpp   |  46 ++--
 example/13_pool2d_fwd/pool2d_fwd_fp16.cpp     |   2 -
 example/13_pool2d_fwd/pool2d_fwd_fp32.cpp     |   2 -
 .../gemm_reduce_xdl_max_fp16.cpp              |   2 +-
 .../gemm_reduce_xdl_mean_squaremean_fp16.cpp  |   4 +-
 .../batched_gemm_reduce_xdl_fp16.cpp          |   4 +-
 .../gemm_layernorm_xdl_fp16.cpp               |   4 +-
 .../gpu/device/device_reduce_multiblock.hpp   |   6 +-
 .../gpu/element/element_wise_operation.hpp    |  21 +-
 .../grid/gridwise_2d_reduction_multiblock.hpp |  16 +-
 .../grid/gridwise_2d_reduction_threadwise.hpp |  12 +-
 .../gridwise_gemm_reduce_xdl_cshuffle_v1.hpp  |   4 +-
 include/ck/utility/math_v2.hpp                |  70 ++++-
 .../reduction_functions_accumulate.hpp        |  35 +--
 include/ck/utility/reduction_operator.hpp     |  17 +-
 .../library/host_tensor/host_reduce_util.hpp  | 257 ------------------
 .../ck/library/host_tensor/host_reduction.hpp |  71 +++--
 .../profile_batched_gemm_reduce_impl.hpp      |   4 +-
 profiler/include/profile_gemm_reduce_impl.hpp |   4 +-
 profiler/include/profile_reduce_impl.hpp      |  22 +-
 22 files changed, 198 insertions(+), 417 deletions(-)
 delete mode 100644 library/include/ck/library/host_tensor/host_reduce_util.hpp
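
The renaming below is easiest to read with the algebra in mind: GetIdentityValue() returns the identity element of the reduction's binary operator (0 for Add, 1 for Mul, NumericLimits lowest for Max, NumericLimits max for Min), so an accumulator initialized with it is never perturbed by the first real input. A minimal host-side sketch of the pattern, assuming the reduce::Add/Max functors and include path from include/ck/utility/reduction_operator.hpp, with made-up test values:

    #include <cstdio>
    #include "ck/utility/reduction_operator.hpp"

    int main()
    {
        // Start each accumulator from the operator's identity element.
        float sum = ck::reduce::Add<float>::GetIdentityValue(); // 0.0f
        float mx  = ck::reduce::Max<float>::GetIdentityValue(); // lowest float

        const float data[4] = {2.0f, -1.0f, 5.0f, 0.5f};

        for(float v : data)
        {
            ck::reduce::Add<float>{}(sum, v); // sum = sum + v
            ck::reduce::Max<float>{}(mx, v);  // mx  = max(mx, v)
        }

        std::printf("sum = %f, max = %f\n", sum, mx); // 6.5, 5.0
        return 0;
    }
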
diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp
index e1e3afc58a..cc75bbad60 100644
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -147,8 +147,6 @@ class SimpleAppArgs
 
 int main(int argc, char* argv[])
 {
-    using namespace ck::host_reduce;
-
     const std::vector reduceDims{0, 1, 2};
     const std::vector invariantDims{3};
@@ -254,7 +252,9 @@ int main(int argc, char* argv[])
         ReductionHost
diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp
 outLengths = {64, 320, 80};
 
-    using namespace ck::host_reduce;
-
     if(argc == 1)
     {
         do_verify = true;
@@ -191,7 +189,9 @@ int main(int argc, char* argv[])
         ReductionHost
diff --git a/example/13_pool2d_fwd/pool2d_fwd_common.hpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
 static void pool_host_verify(const Tensor& in,
                              const std::array& in_left_pads,
                              const std::array& /*in_right_pads*/)
 {
-    using namespace ck::host_reduce;
-
     const int32_t divider = window_spatial_lengths[0] * window_spatial_lengths[1];
 
-    const auto PreUnaryOp = PreUnaryOpFn(divider);
-    const auto PosUnaryOp = PosUnaryOpFn(divider);
+    using ReduceOperation = typename ck::reduce_binary_operator::opType;
+    using InElementwiseOperation = typename ck::
+        reduce_unary_operator::InElementwiseOperation;
+    using AccElementwiseOperation = typename ck::
+        reduce_unary_operator::AccElementwiseOperation;
+
+    const InElementwiseOperation in_elementwise_op(divider);
+    const AccElementwiseOperation acc_elementwise_op(divider);
 
     if constexpr(!OutputIndex)
     {
-        auto opReduce = ReduceOpFn();
+        using Accumulation =
+            ck::detail::AccumulateWithNanCheck;
 
         auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
-            auto accuVal = ReduceOpZeroVal();
+            auto accuVal = ReduceOperation::GetIdentityValue();
 
             for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y)
             {
@@ -54,14 +61,14 @@ static void pool_host_verify(const Tensor& in,
                     {
                         AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
 
-                        PreUnaryOp(currVal);
+                        in_elementwise_op(currVal, currVal);
 
-                        binop_with_nan_check(opReduce, accuVal, currVal);
+                        Accumulation::Calculate(accuVal, currVal);
                     }
                 }
             }
 
-            PosUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);
 
             out(n, c, ho, wo) = accuVal;
         };
@@ -74,10 +81,12 @@ static void pool_host_verify(const Tensor& in,
     }
     else
     {
-        auto opReduce = ReduceOpFn2();
-
-        auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
-            auto accuVal = ReduceOpZeroVal();
+        using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck;
+
+        auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
+            auto accuVal            = ReduceOperation::GetIdentityValue();
             IndexDataType accuIndex = 0;
 
             for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y)
@@ -92,15 +101,14 @@ static void pool_host_verify(const Tensor& in,
                         AccDataType currVal     = static_cast<AccDataType>(in(n, c, hi, wi));
                         IndexDataType currIndex = y * window_spatial_lengths[1] + x;
 
-                        PreUnaryOp(currVal);
+                        in_elementwise_op(currVal, currVal);
 
-                        binop_with_index_and_nan_check(
-                            opReduce, accuVal, currVal, accuIndex, currIndex);
+                        Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
                     }
                 }
             }
 
-            PosUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);
 
             out(n, c, ho, wo)         = accuVal;
             out_indices(n, c, ho, wo) = accuIndex;
@@ -139,8 +147,6 @@ bool pool_test(bool do_verification,
                ck::index_t in_right_pad_h,
                ck::index_t in_right_pad_w)
 {
-    using namespace ck::host_reduce;
-
     using DevicePoolFwdInstance =
         ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
             InDataType, // InDataType
diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
index 624c8ad6cd..74507fdfb3 100644
--- a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
@@ -27,8 +27,6 @@ static constexpr bool PropagateNan = false;
 
 int main(int argc, char* argv[])
 {
-    using namespace ck::host_reduce;
-
     bool do_verification;
     int init_method;
     bool time_kernel;
diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
index d2d2ae05d1..7ca5b1aab7 100644
--- a/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
@@ -27,8 +27,6 @@ static constexpr bool PropagateNan = false;
 
 int main(int argc, char* argv[])
 {
-    using namespace ck::host_reduce;
-
     bool do_verification;
     int init_method;
     bool time_kernel;
diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
index 6f3f7708a2..4469130502 100644
--- a/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
+++ b/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
@@ -236,7 +236,7 @@ int main(int argc, char* argv[])
 
         for(int m = 0; m < M; ++m)
         {
-            ReduceAccDataType d_acc = d_reduce_op.GetReductionZeroVal();
+            ReduceAccDataType d_acc = d_reduce_op.GetIdentityValue();
 
             for(int n = 0; n < N; ++n)
                 d_reduce_op(d_acc, c_m_n_host_result(m, n));
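
The pool example above now verifies with the same ck::detail::AccumulateWithNanCheck helper that the kernels use, instead of the deleted host_reduce lambdas. Roughly, the helper behaves like this sketch (simplified from include/ck/utility/reduction_functions_accumulate.hpp; not the verbatim implementation):

    // Simplified shape of AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>:
    // when PropagateNan is true, a NaN input is copied into the accumulator
    // instead of being fed through the (NaN-insensitive) binary operator.
    template <bool PropagateNan, typename ReduceOperation, typename AccDataType>
    struct AccumulateWithNanCheckSketch
    {
        __host__ __device__ static void Calculate(AccDataType& accuVal, AccDataType currVal)
        {
            if(PropagateNan && ck::math::isnan(currVal))
                accuVal = currVal;                   // keep the NaN
            else
                ReduceOperation{}(accuVal, currVal); // e.g. accuVal = max(accuVal, currVal)
        }
    };
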
diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_mean_squaremean_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_mean_squaremean_fp16.cpp
index 92e67d31b6..e73e61c532 100644
--- a/example/16_gemm_reduce/gemm_reduce_xdl_mean_squaremean_fp16.cpp
+++ b/example/16_gemm_reduce/gemm_reduce_xdl_mean_squaremean_fp16.cpp
@@ -261,8 +261,8 @@ int main(int argc, char* argv[])
 
         for(int m = 0; m < M; ++m)
         {
-            float d0_acc = d0_reduce_op.GetReductionZeroVal();
-            float d1_acc = d1_reduce_op.GetReductionZeroVal();
+            float d0_acc = d0_reduce_op.GetIdentityValue();
+            float d1_acc = d1_reduce_op.GetIdentityValue();
 
             for(int n = 0; n < N; ++n)
             {
diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
index c579763c0b..685762fc13 100644
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -259,8 +259,8 @@ int main(int argc, char* argv[])
     {
         for(int m = 0; m < M; ++m)
         {
-            float d0_acc = d0_reduce_op.GetReductionZeroVal();
-            float d1_acc = d1_reduce_op.GetReductionZeroVal();
+            float d0_acc = d0_reduce_op.GetIdentityValue();
+            float d1_acc = d1_reduce_op.GetIdentityValue();
 
             for(int n = 0; n < N; ++n)
             {
diff --git a/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
index feedb2338e..630f8df1f8 100644
--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
@@ -157,8 +157,8 @@ void host_gemm_layernorm(Tensor& out_m_n,
     auto reduceSumOpInst = ReduceSumOp{};
     for(int m = 0; m < M; ++m)
     {
-        float mean_acc        = reduceSumOpInst.GetReductionZeroVal();
-        float square_mean_acc = reduceSumOpInst.GetReductionZeroVal();
+        float mean_acc        = reduceSumOpInst.GetIdentityValue();
+        float square_mean_acc = reduceSumOpInst.GetIdentityValue();
 
         for(int n = 0; n < N; ++n)
         {
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp
index 2f447c0979..575c6bff1d 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp
@@ -348,8 +348,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce
-            const auto zeroVal =
-                ck::reduce::GetReductionZeroValueForInMemoryDataOperation(
+            const auto identityVal =
+                ck::reduce::GetIdentityValueForInMemoryDataOperation(
                     OutMemoryDataOperation);
 
             const auto kernel_pre =
@@ -362,7 +362,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce
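
A pattern worth calling out before the element_wise_operation.hpp hunks: because ck::math now provides __host__ overloads (std:: based) and __device__ overloads (HIP builtins) under one name, a single functor body can serve host verification code and HIP kernels alike. A hedged illustration of the mechanism (UnaryAbsSketch is a stand-in, not the full CK definition):

    // ck::math::abs resolves per compilation side: std::abs on the host,
    // ::abs / __habs on the device. One spelling, two implementations.
    struct UnaryAbsSketch
    {
        __host__ __device__ void operator()(float& y, const float& x) const
        {
            y = ck::math::abs(x); // host: std::abs(x); device: ::abs(x)
        }
    };
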
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
 {
     __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
 
-    __host__ __device__ void operator()(float& y, const float& x) const { y = abs(x); };
+    __host__ __device__ void operator()(float& y, const float& x) const { y = ck::math::abs(x); };
 };
 
 template <>
@@ -304,7 +305,7 @@ struct UnaryAbs
 {
     __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
 
-    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = __habs(x); };
+    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = ck::math::abs(x); };
 };
 
 template <>
@@ -312,7 +313,7 @@ struct UnaryAbs
 {
     __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
 
-    __host__ __device__ void operator()(double& y, const double& x) const { y = abs(x); };
+    __host__ __device__ void operator()(double& y, const double& x) const { y = ck::math::abs(x); };
 };
 
 template <>
@@ -320,12 +321,7 @@ struct UnaryAbs
 {
     __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
 
-    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const
-    {
-        int8_t sgn = x >> (8 - 1);
-
-        y = (x ^ sgn) - sgn;
-    };
+    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = ck::math::abs(x); };
 };
 
 template
@@ -336,7 +332,7 @@ struct UnarySqrt
 {
     __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };
 
-    __host__ __device__ void operator()(float& y, const float& x) const { y = sqrtf(x); };
+    __host__ __device__ void operator()(float& y, const float& x) const { y = ck::math::sqrt(x); };
 };
 
 template <>
@@ -344,7 +340,10 @@ struct UnarySqrt
 {
     __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };
 
-    __host__ __device__ void operator()(double& y, const double& x) const { y = sqrt(x); };
+    __host__ __device__ void operator()(double& y, const double& x) const
+    {
+        y = ck::math::sqrt(x);
+    };
 };
 
 } // namespace element_wise
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
index f3e9836d4f..b2f06c03c6 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
@@ -171,7 +171,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                                AccDataType beta,
                                OutDataType* const __restrict__ p_out_value_global)
     {
-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::GetIdentityValue();
 
         // LDS
         __shared__ AccDataType p_reduce_work_buffer[BlockSize];
@@ -179,7 +179,7 @@ struct GridwiseReduction_mk_to_m_multiblock
         const auto in_global_val_buf =
             make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
                                                           in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(zeroVal));
+                                                          type_convert<InDataType>(identityVal));
 
         auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_out_value_global, out_grid_desc_m.GetElementSpaceSize());
@@ -191,7 +191,7 @@ struct GridwiseReduction_mk_to_m_multiblock
         StaticBuffer accu_value_buf;
 
-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = identityVal; });
 
         const index_t thread_local_id = get_thread_local_1d_id();
         const index_t block_global_id = get_block_1d_id();
@@ -358,12 +358,12 @@ struct GridwiseReduction_mk_to_m_multiblock
         __shared__ AccDataType p_reduce_work_val_buffer[BlockSize];
         __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize];
 
-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::GetIdentityValue();
 
         const auto in_global_val_buf =
             make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
                                                           in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(zeroVal));
+                                                          type_convert<InDataType>(identityVal));
         const auto in_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize());
         auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_out_value_global, out_grid_desc_m.GetElementSpaceSize());
@@ -418,7 +418,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                                      thread_k_cluster_id * KThreadSliceSize));
 
         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            accu_value_buf(I) = zeroVal;
+            accu_value_buf(I) = identityVal;
             accu_index_buf(I) = 0;
         });
 
@@ -459,7 +459,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                 in_thread_idx_buf);
 
             static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
-                AccDataType tmpValue   = zeroVal;
+                AccDataType tmpValue   = identityVal;
                 IndexDataType tmpIndex = 0;
 
                 static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
@@ -512,7 +512,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                               in_thread_val_buf(Number{}));
             });
 
-            AccDataType tmpValue   = zeroVal;
+            AccDataType tmpValue   = identityVal;
             IndexDataType tmpIndex = 0;
 
             static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
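
One subtlety in the gridwise kernels above: the input buffer's "invalid element" is now the identity value, so padded or out-of-bounds reads feed the reduction a value that cannot change the result. A self-contained toy version of the trick (plain C++; padded_max is a hypothetical helper, not a CK function):

    #include <algorithm>
    #include <limits>

    // Out-of-range reads return the identity of max, so looping over the
    // padded length is still correct -- exactly why the kernels fill
    // in_global_val_buf's invalid elements with identityVal.
    float padded_max(const float* data, int n, int padded_n)
    {
        float acc = std::numeric_limits<float>::lowest(); // identity of max
        for(int i = 0; i < padded_n; ++i)
        {
            float v = (i < n) ? data[i] : std::numeric_limits<float>::lowest();
            acc     = std::max(acc, v); // identity reads leave acc unchanged
        }
        return acc;
    }
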
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
index ff01b88146..074aafb9d4 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
@@ -135,12 +135,12 @@ struct GridwiseReduction_mk_to_m_threadwise
                                                    ReduceOperation,
                                                    PropagateNan>;
 
-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::GetIdentityValue();
 
         const auto in_global_val_buf =
             make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
                                                           in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(zeroVal));
+                                                          type_convert<InDataType>(identityVal));
 
         auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_out_value_global, out_grid_desc_m.GetElementSpaceSize());
@@ -149,7 +149,7 @@ struct GridwiseReduction_mk_to_m_threadwise
 
         StaticBuffer accu_value_buf;
 
-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = identityVal; });
 
         const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
 
@@ -276,12 +276,12 @@ struct GridwiseReduction_mk_to_m_threadwise
 
         (void)acc_elementwise_op;
 
-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::GetIdentityValue();
 
         const auto in_global_val_buf =
             make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
                                                           in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(zeroVal));
+                                                          type_convert<InDataType>(identityVal));
         const auto in_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize());
 
@@ -303,7 +303,7 @@ struct GridwiseReduction_mk_to_m_threadwise
         StaticBuffer accu_index_buf;
 
         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            accu_value_buf(I) = zeroVal;
+            accu_value_buf(I) = identityVal;
             accu_index_buf(I) = 0;
         });
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
index e8ab8c7d8e..c178e29496 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
@@ -816,10 +816,10 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                     false>;
 
                 // Global write Gemm shuffle + reduction
-                const auto d_zeroVal = DReduceOperation::GetReductionZeroVal();
+                const auto d_identityVal = DReduceOperation::GetIdentityValue();
 
                 static_for<0, mreduce_per_thread, 1>{}(
-                    [&](auto I) { d_thread_buf(I) = d_zeroVal; });
+                    [&](auto I) { d_thread_buf(I) = d_identityVal; });
 
                 // reduce in VGPR
                 static_for<0, mreduce_per_thread, 1>{}([&](auto im) {
diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp
index 572d576e7a..438f5e12bd 100644
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
@@ -3,11 +3,13 @@
 #include <cmath>
 
 #include "data_type.hpp"
-#include "half.hpp"
+#include "type.hpp"
 
 namespace ck {
 namespace math {
 
+// math functions for the host, some are implemented by calling C++ std functions
+
 static inline __host__ float abs(float x) { return std::abs(x); };
 
 static inline __host__ double abs(double x) { return std::abs(x); };
@@ -28,26 +30,26 @@ static inline __host__ int32_t abs(int32_t x)
 
 static inline __host__ half_t abs(half_t x)
 {
-    half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
+    uint16_t xx = ck::bit_cast<uint16_t>(x);
 
-    half_float::half abs_xx = half_float::abs(xx);
+    uint16_t abs_xx = xx & 0x7fff;
 
-    half_t abs_x = *reinterpret_cast<half_t*>(&abs_xx);
+    half_t abs_x = ck::bit_cast<half_t>(abs_xx);
 
     return abs_x;
 };
 
-static inline __host__ float isnan(float x) { return std::isnan(x); };
+static inline __host__ bool isnan(float x) { return std::isnan(x); };
 
-static inline __host__ double isnan(double x) { return std::isnan(x); };
+static inline __host__ bool isnan(double x) { return std::isnan(x); };
 
-static inline __host__ int8_t isnan(int8_t x)
+static inline __host__ bool isnan(int8_t x)
 {
     (void)x;
     return false;
 };
 
-static inline __host__ int32_t isnan(int32_t x)
+static inline __host__ bool isnan(int32_t x)
 {
     (void)x;
     return false;
@@ -55,11 +57,59 @@
 
 static inline __host__ bool isnan(half_t x)
 {
-    half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
+    uint16_t xx = ck::bit_cast<uint16_t>(x);
 
-    return half_float::isnan(xx);
+    return (xx & 0x7FFF) > 0x7C00;
 };
 
+static inline __host__ float sqrt(float x) { return std::sqrt(x); };
+
+static inline __host__ double sqrt(double x) { return std::sqrt(x); };
+
+// math functions for the HIP kernel, some are implemented by calling hip builtin functions
+
+static inline __device__ float abs(float x) { return ::abs(x); };
+
+static inline __device__ double abs(double x) { return ::abs(x); };
+
+static inline __device__ int8_t abs(int8_t x)
+{
+    int8_t sgn = x >> (8 - 1);
+
+    return (x ^ sgn) - sgn;
+};
+
+static inline __device__ int32_t abs(int32_t x)
+{
+    int32_t sgn = x >> (32 - 1);
+
+    return (x ^ sgn) - sgn;
+};
+
+static inline __device__ half_t abs(half_t x) { return ::__habs(x); };
+
+static inline __device__ bool isnan(float x) { return ::isnan(x); };
+
+static inline __device__ bool isnan(double x) { return ::isnan(x); };
+
+static inline __device__ bool isnan(int8_t x)
+{
+    (void)x;
+    return false;
+};
+
+static inline __device__ bool isnan(int32_t x)
+{
+    (void)x;
+    return false;
+};
+
+static inline __device__ bool isnan(half_t x) { return ::__hisnan(x); };
+
+static inline __device__ float sqrt(float x) { return ::sqrtf(x); };
+
+static inline __device__ double sqrt(double x) { return ::sqrt(x); };
+
 } // namespace math
 } // namespace ck
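
The new host-side half_t helpers above work directly on the IEEE binary16 bit pattern: clearing bit 15 drops the sign, and a value is NaN exactly when the masked bits exceed 0x7C00 (all-ones exponent with a nonzero mantissa). A standalone check of the same arithmetic (plain uint16_t stand-in for half_t; the helper names are illustrative):

    #include <cassert>
    #include <cstdint>

    uint16_t abs_bits(uint16_t h) { return h & 0x7fff; }             // clear sign bit
    bool isnan_bits(uint16_t h)   { return (h & 0x7fff) > 0x7c00; }  // exp all ones, mantissa != 0

    int main()
    {
        assert(abs_bits(0xC000) == 0x4000); // -2.0h -> 2.0h
        assert(!isnan_bits(0x7C00));        // +inf is not NaN
        assert(!isnan_bits(0xFC00));        // -inf is not NaN
        assert(isnan_bits(0x7E00));         // a quiet NaN
        return 0;
    }
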
diff --git a/include/ck/utility/reduction_functions_accumulate.hpp b/include/ck/utility/reduction_functions_accumulate.hpp
index 4e8636e5b2..22175c5bcc 100644
--- a/include/ck/utility/reduction_functions_accumulate.hpp
+++ b/include/ck/utility/reduction_functions_accumulate.hpp
@@ -27,6 +27,7 @@
 #define CK_REDUCTION_FUNCTIONS_BINOP_HPP
 
 #include "data_type.hpp"
+#include "math_v2.hpp"
 #include "reduction_common.hpp"
 #include "reduction_operator.hpp"
 
@@ -34,18 +35,6 @@ namespace ck {
 
 namespace detail {
 
-template
-static inline __device__ bool is_nan(T x)
-{
-    return (isnan(x));
-};
-
-template <>
-inline __device__ bool is_nan(half_t x)
-{
-    return (__hisnan(x));
-};
-
 template
 struct AccumulateWithNanCheck;
 
@@ -53,7 +42,7 @@ template
 struct AccumulateWithNanCheck
 {
     // cppcheck-suppress constParameter
-    __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
+    __host__ __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
     {
         ReduceOperation{}(accuVal, currVal);
     };
@@ -62,9 +51,11 @@ template
 struct AccumulateWithNanCheck
 {
-    __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
+    __host__ __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
     {
-        if(is_nan(currVal))
+        using ck::math::isnan;
+
+        if(isnan(currVal))
         {
             accuVal = currVal;
         }
@@ -81,7 +72,7 @@ struct AccumulateWithIndexAndNanCheck;
 template
 struct AccumulateWithIndexAndNanCheck
 {
-    __device__ static inline void
+    __host__ __device__ static inline void
     // cppcheck-suppress constParameter
     Calculate(AccDataType& accuVal,
               AccDataType currVal,
@@ -101,12 +92,14 @@ template
 {
     // The method is called when the ReduceOperation is indexable and the user asked for indices
-    __device__ static inline void Calculate(AccDataType& accuVal,
-                                            AccDataType currVal,
-                                            IndexDataType& accuIndex,
-                                            IndexDataType currIndex)
+    __host__ __device__ static inline void Calculate(AccDataType& accuVal,
+                                                     AccDataType currVal,
+                                                     IndexDataType& accuIndex,
+                                                     IndexDataType currIndex)
     {
-        if(is_nan(currVal))
+        using ck::math::isnan;
+
+        if(isnan(currVal))
         {
             accuVal   = currVal;
             accuIndex = currIndex;
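
Because Calculate() is now __host__ __device__, the same index-tracking accumulator can drive both the kernels and host verification. A self-contained host-side argmax sketch in the style of AccumulateWithIndexAndNanCheck (Max semantics, NaN handling omitted for brevity; the free function is illustrative, not CK's API):

    #include <cassert>
    #include <cstdint>

    // The accumulator keeps the best value and the index where it was seen.
    void calculate(float& accuVal, float currVal, int32_t& accuIndex, int32_t currIndex)
    {
        if(accuVal < currVal) // Max semantics; Min would flip the comparison
        {
            accuVal   = currVal;
            accuIndex = currIndex;
        }
    }

    int main()
    {
        const float data[5] = {1.0f, 7.0f, 3.0f, 7.0f, 2.0f};
        float best          = -3.402823466e+38f; // identity of Max: lowest float
        int32_t bestIndex   = 0;
        for(int32_t i = 0; i < 5; ++i)
            calculate(best, data[i], bestIndex, i);
        assert(best == 7.0f && bestIndex == 1); // first occurrence wins on ties
        return 0;
    }
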
diff --git a/include/ck/utility/reduction_operator.hpp b/include/ck/utility/reduction_operator.hpp
index e7a8db8c01..ee40398d25 100644
--- a/include/ck/utility/reduction_operator.hpp
+++ b/include/ck/utility/reduction_operator.hpp
@@ -36,7 +36,7 @@ namespace reduce {
 // Every binary operator used in reduction is represented by a templated functor class. Each functor
 // class must provide at least
 // three members:
-// 1) GetReductionZeroVal() -- the interface to return the "identity element" for the binary
+// 1) GetIdentityValue() -- the interface to return the "identity element" for the binary
 // operator, "identity element" is the unique
 // element in the algebraic space that doesn't affect the value of other elements
 // when operated against them, and the concept is similar to zero vector in
@@ -59,7 +59,7 @@ struct Add
 {
     using dataType = T;
 
-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); };
 
     __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -76,7 +76,7 @@ struct Mul
 {
     using dataType = T;
 
-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(1.0f); };
 
     __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -92,7 +92,7 @@ struct Max
 {
     using dataType = T;
 
-    __host__ __device__ static constexpr T GetReductionZeroVal()
+    __host__ __device__ static constexpr T GetIdentityValue()
     {
         return NumericLimits<T>::Lowest();
     };
@@ -125,10 +125,7 @@ struct Min
 {
     using dataType = T;
 
-    __host__ __device__ static constexpr T GetReductionZeroVal()
-    {
-        return NumericLimits<T>::Max();
-    };
+    __host__ __device__ static constexpr T GetIdentityValue() { return NumericLimits<T>::Max(); };
 
     __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -158,7 +155,7 @@ struct AMax
 {
     using dataType = T;
 
-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); };
 
     __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -184,7 +181,7 @@ struct AMax
 };
 
 template <typename T>
-T GetReductionZeroValueForInMemoryDataOperation(InMemoryDataOperationEnum operation)
+T GetIdentityValueForInMemoryDataOperation(InMemoryDataOperationEnum operation)
 {
     T result = ck::type_convert<T>(0.0f);
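
The free function at the end of reduction_operator.hpp serves the multi-block path: when partial results are combined through an in-memory operation such as AtomicAdd, the destination must be pre-filled with that operation's identity (zero for addition), not necessarily the reduce op's own identity. A toy illustration of why, with two "workgroups" standing in for thread blocks and std::atomic standing in for global-memory atomics:

    #include <atomic>
    #include <cassert>

    int main()
    {
        // Destination starts at the identity of the in-memory op (0 for AtomicAdd);
        // each workgroup then adds its partial result independently.
        std::atomic<int> out{0};

        int partial0 = 10; // partial sum from workgroup 0
        int partial1 = 32; // partial sum from workgroup 1

        out.fetch_add(partial0);
        out.fetch_add(partial1);

        assert(out.load() == 42); // correct only because out began at 0
        return 0;
    }
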
diff --git a/library/include/ck/library/host_tensor/host_reduce_util.hpp b/library/include/ck/library/host_tensor/host_reduce_util.hpp
deleted file mode 100644
index 095bb03426..0000000000
--- a/library/include/ck/library/host_tensor/host_reduce_util.hpp
+++ /dev/null
@@ -1,257 +0,0 @@
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2020 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef GUARD_HOST_REDUCE_UTIL_HPP
-#define GUARD_HOST_REDUCE_UTIL_HPP
-
-#include
-#include
-#include
-
-#include "reduction_enums.hpp"
-#include "data_type.hpp"
-#include "math_v2.hpp"
-
-namespace ck {
-
-namespace host_reduce {
-
-using ck::NanPropagation;
-using ck::ReduceTensorOp;
-
-template
-__host__ static inline std::function PreUnaryOpFn(int)
-{
-    using ck::math::abs;
-
-    if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
-    {
-        return ([&](AccDataType& a_) { a_ = abs(a_); });
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
-    {
-        return ([&](AccDataType& a_) { a_ = a_ * a_; });
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
-    {
-        return ([&](AccDataType& a_) { a_ = abs(a_); });
-    }
-    else
-    {
-        // ReduceTensorOp::AVG:
-        // ReduceTensorOp::ADD:
-        // ReduceTensorOp::MUL:
-        // ReduceTensorOp::MIN:
-        // ReduceTensorOp::MAX:
-        return ([&](AccDataType&) {});
-    };
-};
-
-template
-__host__ static inline std::function PosUnaryOpFn(int32_t divider)
-{
-    using std::sqrt;
-
-    if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
-    {
-        return ([&](AccDataType& a_) { a_ = sqrt(a_); });
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::AVG)
-    {
-        return ([&, divider](AccDataType& a_) {
-            a_ = a_ / static_cast(static_cast(divider));
-        });
-    }
-    else
-    {
-        // ReduceTensorOp::ADD:
-        // ReduceTensorOp::NORM1:
-        // ReduceTensorOp::MUL:
-        // ReduceTensorOp::MIN:
-        // ReduceTensorOp::MAX:
-        // ReduceTensorOp::AMAX:
-        return ([&](AccDataType&) {});
-    }
-};
-
-template
-__host__ static inline std::function ReduceOpFn()
-{
-    if constexpr(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG ||
-                 ReduceOpId == ReduceTensorOp::NORM1 || ReduceOpId == ReduceTensorOp::NORM2)
-    {
-        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; });
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::MUL)
-    {
-        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; });
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
-    {
-        return ([&](AccDataType& a_, AccDataType b_) {
-            if(a_ > b_)
-                a_ = b_;
-        });
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
-    {
-        return ([&](AccDataType& a_, AccDataType b_) {
-            if(a_ < b_)
-                a_ = b_;
-        });
-    }
-};
-
-template
-__host__ static inline std::function ReduceOpFn2()
-{
-    if constexpr(ReduceOpId == ReduceTensorOp::MIN)
-    {
-        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
-            if(a_ > b_)
-            {
-                a_      = b_;
-                changed = true;
-            }
-            else
-                changed = false;
-        });
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
-    {
-        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
-            if(a_ < b_)
-            {
-                a_      = b_;
-                changed = true;
-            }
-            else
-                changed = false;
-        });
-    }
-    else
-    {
-        // ReduceTensorOp::ADD:
-        // ReduceTensorOp::MUL:
-        // ReduceTensorOp::AVG:
-        // ReduceTensorOp::NORM1:
-        // ReduceTensorOp::NORM2:
-        return (std::function{});
-    };
-};
-
-template
-__host__ static inline AccDataType ReduceOpZeroVal()
-{
-    if constexpr(ReduceOpId == ReduceTensorOp::MUL)
-    {
-        return (static_cast(1.0f));
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
-    {
-        return (ck::NumericLimits::Max());
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
-    {
-        return (ck::NumericLimits::Lowest());
-    }
-    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
-    {
-        return (static_cast(0.0f));
-    }
-    else
-    {
-        // ReduceTensorOp::ADD
-        // ReduceTensorOp::AVG
-        // ReduceTensorOp::NORM1
-        // ReduceTensorOp::NORM2
-        return (static_cast(0.0f));
-    };
-};
-
-template
-__host__ static inline void
-binop_with_nan_check(std::function opReduce,
-                     AccDataType& accuVal,
-                     AccDataType currVal)
-{
-    using ck::math::isnan;
-
-    if constexpr(!PropagateNan)
-    {
-        opReduce(accuVal, currVal);
-    }
-    else
-    {
-        if(isnan(currVal))
-            accuVal = currVal;
-        else
-            opReduce(accuVal, currVal);
-    };
-};
-
-template
-__host__ static inline void
-binop_with_index_and_nan_check(std::function opReduce,
-                               AccDataType& accuVal,
-                               AccDataType currVal,
-                               IndexDataType& accuIndex,
-                               IndexDataType currIndex)
-{
-    using ck::math::isnan;
-
-    if constexpr(!PropagateNan)
-    {
-        bool changed;
-
-        opReduce(accuVal, currVal, changed);
-
-        if(changed)
-            accuIndex = currIndex;
-    }
-    else
-    {
-        if(isnan(currVal))
-        {
-            accuVal   = currVal;
-            accuIndex = currIndex;
-        }
-        else
-        {
-            bool changed;
-
-            opReduce(accuVal, currVal, changed);
-
-            if(changed)
-                accuIndex = currIndex;
-        };
-    };
-};
-
-}; // namespace host_reduce
-
-}; // namespace ck
-
-#endif
diff --git a/library/include/ck/library/host_tensor/host_reduction.hpp b/library/include/ck/library/host_tensor/host_reduction.hpp
index 1add62d1b5..0e94095639 100644
--- a/library/include/ck/library/host_tensor/host_reduction.hpp
+++ b/library/include/ck/library/host_tensor/host_reduction.hpp
@@ -33,10 +33,10 @@
 
 #include "reduction_enums.hpp"
 #include "reduction_common.hpp"
-#include "host_reduce_util.hpp"
 #include "host_common_util.hpp"
 #include "host_tensor.hpp"
 #include "data_type.hpp"
+#include "reduction_functions_accumulate.hpp"
 
 template
 static void get_all_indexes(const std::array& dimLengths,
@@ -106,11 +106,13 @@ static size_t get_offset_from_index(const std::vector& strides,
 template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
-          ReduceTensorOp ReduceOpId,
+          typename ReduceOperation,
+          typename InElementwiseOperation,
+          typename AccElementwiseOperation,
           ck::index_t Rank,
           ck::index_t NumReduceDim,
           bool PropagateNan,
-          bool NeedIndices>
+          bool OutputIndex>
 struct ReductionHost
 {
     using IndexDataType = int32_t;
@@ -122,8 +124,6 @@ struct ReductionHost
     std::vector reduceDims;
     IndexDataType divider;
 
-    std::function preUnaryOp;
-    std::function posUnaryOp;
     std::array reduceLengths;
     std::array reduceStrides;
     std::array invariantLengths;
@@ -137,9 +137,6 @@ struct ReductionHost
                   const std::vector& invariantDims_,
                   const std::vector& reduceDims_)
     {
-        using ck::host_reduce::PosUnaryOpFn;
-        using ck::host_reduce::PreUnaryOpFn;
-
         // this->outLengths = to_int_vector(outDesc.GetLengths());
         this->outStrides = outDesc.GetStrides();
 
@@ -171,9 +168,6 @@ struct ReductionHost
             invariant_dim_indexes.clear();
 
             get_all_indexes(invariantLengths, invariant_dim_indexes);
         };
-
-        preUnaryOp = PreUnaryOpFn(divider);
-        posUnaryOp = PosUnaryOpFn(divider);
     };
 
     void Run(float alpha,
@@ -182,7 +176,7 @@ struct ReductionHost
              OutDataType* out_data,
              IndexDataType* out_indices)
     {
-        if constexpr(NeedIndices)
+        if constexpr(OutputIndex)
        {
             RunImpl_with_index(alpha, in_data, beta, out_data, out_indices);
         }
@@ -201,15 +195,17 @@ struct ReductionHost
         using ck::float_equal_one;
         using ck::float_equal_zero;
         using ck::type_convert;
-        using ck::host_reduce::binop_with_index_and_nan_check;
-        using ck::host_reduce::ReduceOpFn2;
-        using ck::host_reduce::ReduceOpZeroVal;
 
-        auto opReduce2 = ReduceOpFn2();
+        using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck;
 
+        InElementwiseOperation in_elementwise_op(divider);
+        AccElementwiseOperation acc_elementwise_op(divider);
 
         if constexpr(NumInvariantDim == 0)
         {
-            AccDataType accuVal     = ReduceOpZeroVal();
+            AccDataType accuVal     = ReduceOperation::GetIdentityValue();
             IndexDataType accuIndex = 0;
 
             for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
@@ -219,15 +215,14 @@ struct ReductionHost
 
                 auto currVal = type_convert(in_data[offset_reduce]);
 
-                preUnaryOp(currVal);
+                in_elementwise_op(currVal, currVal);
 
                 auto currIndex = static_cast(i);
 
-                binop_with_index_and_nan_check(
-                    opReduce2, accuVal, currVal, accuIndex, currIndex);
+                Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
             };
 
-            posUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);
 
             if(!float_equal_one{}(alpha))
                 accuVal *= type_convert(alpha);
@@ -241,7 +236,7 @@ struct ReductionHost
         else
         {
             auto thread_reduce_func = [&](auto invariant_index) {
-                AccDataType accuVal     = ReduceOpZeroVal();
+                AccDataType accuVal     = ReduceOperation::GetIdentityValue();
                 IndexDataType accuIndex = 0;
 
                 auto offset_invariant =
@@ -255,15 +250,14 @@ struct ReductionHost
 
                     auto currVal =
                         type_convert(in_data[offset_invariant + offset_reduce]);
 
-                    preUnaryOp(currVal);
+                    in_elementwise_op(currVal, currVal);
 
                     auto currIndex = static_cast(i);
 
-                    binop_with_index_and_nan_check(
-                        opReduce2, accuVal, currVal, accuIndex, currIndex);
+                    Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
                 };
 
-                posUnaryOp(accuVal);
+                acc_elementwise_op(accuVal, accuVal);
 
                 if(!float_equal_one{}(alpha))
                     accuVal *= type_convert(alpha);
@@ -308,15 +302,16 @@ struct ReductionHost
         using ck::float_equal_one;
         using ck::float_equal_zero;
         using ck::type_convert;
-        using ck::host_reduce::binop_with_nan_check;
-        using ck::host_reduce::ReduceOpFn;
-        using ck::host_reduce::ReduceOpZeroVal;
 
-        auto opReduce = ReduceOpFn();
+        using Accumulation =
+            ck::detail::AccumulateWithNanCheck;
+
+        InElementwiseOperation in_elementwise_op(divider);
+        AccElementwiseOperation acc_elementwise_op(divider);
 
         if constexpr(NumInvariantDim == 0)
        {
-            AccDataType accuVal = ReduceOpZeroVal();
+            AccDataType accuVal = ReduceOperation::GetIdentityValue();
 
             for(const auto& reduce_index : reduce_dim_indexes)
             {
@@ -325,12 +320,12 @@ struct ReductionHost
 
                 auto currVal = type_convert(in_data[offset_reduce]);
 
-                preUnaryOp(currVal);
+                in_elementwise_op(currVal, currVal);
 
-                binop_with_nan_check(opReduce, accuVal, currVal);
+                Accumulation::Calculate(accuVal, currVal);
             };
 
-            posUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);
 
             if(!float_equal_one{}(alpha))
                 accuVal *= type_convert(alpha);
@@ -343,7 +338,7 @@ struct ReductionHost
         else
         {
             auto thread_reduce_func = [&](auto invariant_index) {
-                AccDataType accuVal = ReduceOpZeroVal();
+                AccDataType accuVal = ReduceOperation::GetIdentityValue();
 
                 auto offset_invariant =
                     get_offset_from_index(invariantStrides, invariant_index);
@@ -356,12 +351,12 @@ struct ReductionHost
 
                     auto currVal =
                         type_convert(in_data[offset_invariant + offset_reduce]);
 
-                    preUnaryOp(currVal);
+                    in_elementwise_op(currVal, currVal);
 
-                    binop_with_nan_check(opReduce, accuVal, currVal);
+                    Accumulation::Calculate(accuVal, currVal);
                 };
 
-                posUnaryOp(accuVal);
+                acc_elementwise_op(accuVal, accuVal);
 
                 if(!float_equal_one{}(alpha))
                     accuVal *= type_convert(alpha);
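
With the rework above, ReductionHost no longer hard-codes host-only lambdas; the caller supplies the same operation types the device instances use. A hedged sketch of the new instantiation pattern (the template-argument lists of reduce_binary_operator/reduce_unary_operator are reconstructed from the surrounding code, so treat names and order as illustrative):

    // Derive the operation types once from the reduce op id, then share them
    // between the device instance and the host reference implementation.
    using ReduceOperation = typename ck::reduce_binary_operator<AccDataType, ReduceOpId>::opType;
    using InElementwiseOperation = typename ck::
        reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
    using AccElementwiseOperation = typename ck::
        reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;

    ReductionHost<InDataType,
                  AccDataType,
                  OutDataType,
                  ReduceOperation,
                  InElementwiseOperation,
                  AccElementwiseOperation,
                  Rank,
                  NumReduceDim,
                  PropagateNan,
                  OutputIndex>
        hostReduce(inDesc, outDesc, invariantDims, reduceDims);

    hostReduce.Run(alpha, in.mData.data(), beta, out_host.mData.data(), out_indices.mData.data());
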
diff --git a/profiler/include/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profile_batched_gemm_reduce_impl.hpp
index 56ca2cbebe..7ba0472686 100644
--- a/profiler/include/profile_batched_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_batched_gemm_reduce_impl.hpp
@@ -171,8 +171,8 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
         {
             for(int m = 0; m < M; ++m)
             {
-                float d0_acc = d0_reduce_op.GetReductionZeroVal();
-                float d1_acc = d1_reduce_op.GetReductionZeroVal();
+                float d0_acc = d0_reduce_op.GetIdentityValue();
+                float d1_acc = d1_reduce_op.GetIdentityValue();
 
                 for(int n = 0; n < N; ++n)
                 {
diff --git a/profiler/include/profile_gemm_reduce_impl.hpp b/profiler/include/profile_gemm_reduce_impl.hpp
index 752a1d9641..dbdc9fd9d8 100644
--- a/profiler/include/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_reduce_impl.hpp
@@ -165,8 +165,8 @@ bool profile_gemm_reduce_impl(int do_verification,
 
         for(int m = 0; m < M; ++m)
         {
-            float d0_acc = d0_reduce_op.GetReductionZeroVal();
-            float d1_acc = d1_reduce_op.GetReductionZeroVal();
+            float d0_acc = d0_reduce_op.GetIdentityValue();
+            float d1_acc = d1_reduce_op.GetIdentityValue();
 
             for(int n = 0; n < N; ++n)
             {
diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp
index a87694754e..fd519d1033 100644
--- a/profiler/include/profile_reduce_impl.hpp
+++ b/profiler/include/profile_reduce_impl.hpp
@@ -138,7 +138,6 @@ bool profile_reduce_impl_impl(bool do_verification,
 {
     using namespace ck::tensor_operation::device;
     using namespace ck::tensor_operation::device::device_reduce_instance;
-    using namespace ck::host_reduce;
     using ck::host_common::dumpBufferToFile;
 
     constexpr bool op_support_indices =
@@ -261,15 +260,17 @@ bool profile_reduce_impl_impl(bool do_verification,
     float best_avg_time   = 0;
     float best_gb_per_sec = 0;
 
-    using InElementwiseOperation_0 =
+    using InElementwiseOperation =
         typename reduce_unary_operator::
             InElementwiseOperation;
-    using AccElementwiseOperation_0 =
+    using AccElementwiseOperation =
         typename reduce_unary_operator::
             AccElementwiseOperation;
+    using ReduceOperation = typename reduce_binary_operator::opType;
+
     using DeviceReduceInstPtr0 =
-        DeviceReducePtr;
+        DeviceReducePtr;
 
     std::vector reduce0_ptrs;
 
@@ -313,7 +314,9 @@ bool profile_reduce_impl_impl(bool do_verification,
                 ReductionHost(reduce_total_length));
-            AccElementwiseOperation_0 acc_elementwise_op_0(
-                static_cast<int32_t>(reduce_total_length));
+            InElementwiseOperation in_elementwise_op(static_cast<int32_t>(reduce_total_length));
+            AccElementwiseOperation acc_elementwise_op(static_cast<int32_t>(reduce_total_length));
 
             auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
                                                                 i_inStrides,
@@ -352,8 +354,8 @@ bool profile_reduce_impl_impl(bool do_verification,
                                                                 nullptr,
                                                                 out_dev.GetDeviceBuffer(),
                                                                 out_indices_dev.GetDeviceBuffer(),
-                                                                in_elementwise_op_0,
-                                                                acc_elementwise_op_0);
+                                                                in_elementwise_op,
+                                                                acc_elementwise_op);
 
             if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
                 continue;
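
Taken together, the patch lets one elementwise-operation pair express pre/post processing for every reduce op on both sides of the host/device boundary. For ReduceTensorOp::AVG, for example, the input op is a pass-through and the accumulator op divides by the element count, so the average falls out of a plain ADD reduction; the profiler code above constructs exactly such ops with divider = reduce_total_length. A hedged sketch of the divide-style post-op (the real CK type lives inside reduce_unary_operator; name and layout here are assumptions):

    // AVG as an ADD reduction plus a dividing post-op:
    // average = (x0 + x1 + ... + x_{n-1}) / n, applied as y = x / divider.
    struct UnaryDivideSketch
    {
        __host__ __device__ UnaryDivideSketch(const int32_t divider = 1) : divider_(divider) {}

        __host__ __device__ void operator()(float& y, const float& x) const
        {
            y = x / static_cast<float>(divider_);
        }

        int32_t divider_;
    };
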