diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index b8fc980e10..293b593902 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include "check_err.hpp" #include "config.hpp" @@ -27,10 +26,6 @@ using InDataType = ck::half_t; using OutDataType = ck::half_t; using AccDataType = float; -using HostInDataType = half_float::half; -using HostOutDataType = half_float::half; -using HostAccDataType = float; - constexpr int Rank = 4; constexpr int NumReduceDim = 3; @@ -306,9 +301,9 @@ int main(int argc, char* argv[]) if(args.do_verification) { - ReductionHost hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); - hostReduce.Run(alpha, - reinterpret_cast(in.mData.data()), - beta, - reinterpret_cast(out_ref.mData.data()), - out_indices_ref.mData.data()); + hostReduce.Run( + alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data()); }; const auto i_inLengths = to_int_vector(args.inLengths); diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp index 25604149d4..572d576e7a 100644 --- a/include/ck/utility/math_v2.hpp +++ b/include/ck/utility/math_v2.hpp @@ -1,14 +1,64 @@ #ifndef CK_MATH_V2_HPP #define CK_MATH_V2_HPP +#include #include "data_type.hpp" +#include "half.hpp" namespace ck { namespace math { -static inline __device__ half_t abs(half_t x) { return __habs(x); }; -static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); }; -static inline __device__ bool isnan(half_t x) { return __hisnan(x); }; +static inline __host__ float abs(float x) { return std::abs(x); }; + +static inline __host__ double abs(double x) { return std::abs(x); }; + +static inline __host__ int8_t abs(int8_t x) +{ + int8_t sgn = x >> (8 - 1); + + return (x ^ sgn) - sgn; +}; + +static inline __host__ int32_t abs(int32_t x) +{ + int32_t sgn = x >> (32 - 1); + + return (x ^ sgn) - sgn; +}; + +static 
inline __host__ half_t abs(half_t x) +{ + half_float::half xx = *reinterpret_cast<half_float::half*>(&x); + + half_float::half abs_xx = half_float::abs(xx); + + half_t abs_x = *reinterpret_cast<half_t*>(&abs_xx); + + return abs_x; +}; + +static inline __host__ float isnan(float x) { return std::isnan(x); }; + +static inline __host__ double isnan(double x) { return std::isnan(x); }; + +static inline __host__ int8_t isnan(int8_t x) +{ + (void)x; + return false; +}; + +static inline __host__ int32_t isnan(int32_t x) +{ + (void)x; + return false; +}; + +static inline __host__ bool isnan(half_t x) +{ + half_float::half xx = *reinterpret_cast<half_float::half*>(&x); + + return half_float::isnan(xx); +}; } // namespace math } // namespace ck diff --git a/include/ck/utility/reduction_common.hpp b/include/ck/utility/reduction_common.hpp index 0cf6d31ed6..a34cfce837 100644 --- a/include/ck/utility/reduction_common.hpp +++ b/include/ck/utility/reduction_common.hpp @@ -33,7 +33,7 @@ namespace ck { struct float_equal_one { template <typename T> - __device__ inline bool operator()(T x) + __host__ __device__ inline bool operator()(T x) { return x <= static_cast<T>(1.0f) and x >= static_cast<T>(1.0f); }; @@ -42,7 +42,7 @@ struct float_equal_zero { template <typename T> - __device__ inline bool operator()(T x) + __host__ __device__ inline bool operator()(T x) { return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f); }; diff --git a/library/include/ck/library/host_tensor/host_reduce_util.hpp b/library/include/ck/library/host_tensor/host_reduce_util.hpp index cf301bb18a..53e17bcb5c 100644 --- a/library/include/ck/library/host_tensor/host_reduce_util.hpp +++ b/library/include/ck/library/host_tensor/host_reduce_util.hpp @@ -26,7 +26,6 @@ #ifndef GUARD_HOST_REDUCE_UTIL_HPP #define GUARD_HOST_REDUCE_UTIL_HPP -#include <half.hpp> #include #include #include @@ -34,6 +33,8 @@ #include #include "reduction_enums.hpp" +#include "data_type.hpp" +#include "math_v2.hpp" namespace ck { namespace host_reduce { using ck::NanPropagation; using
ck::ReduceTensorOp; -template <typename T> -static inline bool float_equal_one(T); - -static inline bool float_equal_one(float x) { return x == 1.0f; }; - -static inline bool float_equal_one(double x) { return x == 1.0; }; - -static inline bool float_equal_one(half_float::half x) -{ - return x == static_cast<half_float::half>(1.0f); -}; - -template <typename T> -static inline bool float_equal_zero(T x); - -static inline bool float_equal_zero(float x) { return x == 0.0f; }; - -static inline bool float_equal_zero(double x) { return x == 0.0; }; - -static inline bool float_equal_zero(half_float::half x) -{ - return x == static_cast<half_float::half>(0.0f); -}; - template <typename AccDataType, ReduceTensorOp ReduceOpId> __host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int) { - using std::abs; + using ck::math::abs; if constexpr(ReduceOpId == ReduceTensorOp::NORM1) { @@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal() } else if constexpr(ReduceOpId == ReduceTensorOp::MIN) { - return (std::numeric_limits<AccDataType>::max()); + return (ck::NumericLimits<AccDataType>::Max()); } else if constexpr(ReduceOpId == ReduceTensorOp::MAX) { - return (std::numeric_limits<AccDataType>::lowest()); + return (ck::NumericLimits<AccDataType>::Lowest()); } else if constexpr(ReduceOpId == ReduceTensorOp::AMAX) { @@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce, AccDataType& accuVal, AccDataType currVal) { - using std::isnan; + using ck::math::isnan; if constexpr(!PropagateNan) { @@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe int& accuIndex, int currIndex) - using std::isnan; + using ck::math::isnan; if constexpr(!PropagateNan) { diff --git a/library/include/ck/library/host_tensor/host_reduction.hpp b/library/include/ck/library/host_tensor/host_reduction.hpp index f25d753a46..786d34b73a 100644 --- a/library/include/ck/library/host_tensor/host_reduction.hpp +++ b/library/include/ck/library/host_tensor/host_reduction.hpp @@ -32,6 +32,7 @@ #include #include "reduction_enums.hpp" +#include "reduction_common.hpp" #include "host_reduce_util.hpp" #include "host_tensor.hpp" #include "data_type.hpp" @@
-196,10 +197,10 @@ struct ReductionHost OutDataType* out_data, IndexDataType* out_indices) { + using ck::float_equal_one; + using ck::float_equal_zero; using ck::type_convert; using ck::host_reduce::binop_with_nan_check2; - using ck::host_reduce::float_equal_one; - using ck::host_reduce::float_equal_zero; using ck::host_reduce::ReduceOpFn2; using ck::host_reduce::ReduceOpZeroVal; @@ -227,10 +228,10 @@ struct ReductionHost posUnaryOp(accuVal); - if(!float_equal_one(alpha)) + if(!float_equal_one{}(alpha)) accuVal *= type_convert(alpha); - if(!float_equal_zero(beta)) + if(!float_equal_zero{}(beta)) accuVal += type_convert(out_data[0]) * type_convert(beta); out_data[0] = type_convert(accuVal); @@ -263,13 +264,13 @@ struct ReductionHost posUnaryOp(accuVal); - if(!float_equal_one(alpha)) + if(!float_equal_one{}(alpha)) accuVal *= type_convert(alpha); auto dst_offset = get_offset_from_index(outStrides, invariant_index); - if(!float_equal_zero(beta)) + if(!float_equal_zero{}(beta)) accuVal += type_convert(out_data[dst_offset]) * type_convert(beta); @@ -303,10 +304,10 @@ struct ReductionHost void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data) { + using ck::float_equal_one; + using ck::float_equal_zero; using ck::type_convert; using ck::host_reduce::binop_with_nan_check; - using ck::host_reduce::float_equal_one; - using ck::host_reduce::float_equal_zero; using ck::host_reduce::ReduceOpFn; using ck::host_reduce::ReduceOpZeroVal; @@ -330,10 +331,10 @@ struct ReductionHost posUnaryOp(accuVal); - if(!float_equal_one(alpha)) + if(!float_equal_one{}(alpha)) accuVal *= type_convert(alpha); - if(!float_equal_zero(beta)) + if(!float_equal_zero{}(beta)) accuVal += type_convert(out_data[0]) * type_convert(beta); out_data[0] = type_convert(accuVal); @@ -361,13 +362,13 @@ struct ReductionHost posUnaryOp(accuVal); - if(!float_equal_one(alpha)) + if(!float_equal_one{}(alpha)) accuVal *= type_convert(alpha); auto dst_offset = 
get_offset_from_index(outStrides, invariant_index); - if(!float_equal_zero(beta)) + if(!float_equal_zero{}(beta)) accuVal += type_convert(out_data[dst_offset]) * type_convert(beta); diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp index db7886e4b0..678134f60b 100644 --- a/profiler/include/profile_reduce_impl.hpp +++ b/profiler/include/profile_reduce_impl.hpp @@ -380,13 +380,9 @@ void profile_reduce_impl_impl(bool do_verification, if(do_verification) { - using HostInDataType = typename type_mapping::OutType; - using HostOutDataType = typename type_mapping::OutType; - using HostAccDataType = typename type_mapping::OutType; - - ReductionHost hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); - hostReduce.Run(alpha, - reinterpret_cast(in.mData.data()), - beta, - reinterpret_cast(out_ref.mData.data()), - out_indices_ref.mData.data()); + hostReduce.Run( + alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data()); }; const auto i_inLengths = to_int_vector(inLengths); diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp index 6bb35f3fa6..28370cb2cd 100644 --- a/test/reduce/reduce_no_index.cpp +++ b/test/reduce/reduce_no_index.cpp @@ -37,19 +37,6 @@ static inline std::vector get_invariant_dims(const std::vector& reduce return invariantDims; }; -// map the data type used by the GPU kernels to the corresponding type used by the host codes -template -struct type_mapping -{ - using OutType = InType; -}; - -template <> -struct type_mapping -{ - using OutType = half_float::half; -}; - constexpr int Rank = 4; constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG; @@ -226,13 +213,9 @@ bool test_reduce_no_index_impl(int init_method, bool result = true; - using HostInDataType = typename type_mapping::OutType; - using HostOutDataType = typename type_mapping::OutType; - using HostAccDataType = typename type_mapping::OutType; - - ReductionHost hostReduce(in.mDesc, 
out_ref.mDesc, invariantDims, reduceDims); - hostReduce.Run(alpha, - reinterpret_cast(in.mData.data()), - beta, - reinterpret_cast(out_ref.mData.data()), - nullptr); + hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr); const auto i_inLengths = to_int_vector(inLengths); const auto i_inStrides = to_int_vector(inStrides); diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp index de67da9352..667b84a8dc 100644 --- a/test/reduce/reduce_with_index.cpp +++ b/test/reduce/reduce_with_index.cpp @@ -36,19 +36,6 @@ static inline std::vector get_invariant_dims(const std::vector& reduce return invariantDims; }; -// map the data type used by the GPU kernels to the corresponding type used by the host codes -template -struct type_mapping -{ - using OutType = InType; -}; - -template <> -struct type_mapping -{ - using OutType = half_float::half; -}; - constexpr int Rank = 4; constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AMAX; @@ -209,13 +196,9 @@ bool test_reduce_with_index_impl(int init_method, bool result = true; - using HostInDataType = typename type_mapping::OutType; - using HostOutDataType = typename type_mapping::OutType; - using HostAccDataType = typename type_mapping::OutType; - - ReductionHost hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); - hostReduce.Run(alpha, - reinterpret_cast(in.mData.data()), - beta, - reinterpret_cast(out_ref.mData.data()), - out_indices_ref.mData.data()); + hostReduce.Run( + alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data()); const auto i_inLengths = to_int_vector(inLengths); const auto i_inStrides = to_int_vector(inStrides);