diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index b8fc980e10..293b593902 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include "check_err.hpp" #include "config.hpp" @@ -27,10 +26,6 @@ using InDataType = ck::half_t; using OutDataType = ck::half_t; using AccDataType = float; -using HostInDataType = half_float::half; -using HostOutDataType = half_float::half; -using HostAccDataType = float; - constexpr int Rank = 4; constexpr int NumReduceDim = 3; @@ -306,9 +301,9 @@ int main(int argc, char* argv[]) if(args.do_verification) { - ReductionHost hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); - hostReduce.Run(alpha, - reinterpret_cast(in.mData.data()), - beta, - reinterpret_cast(out_ref.mData.data()), - out_indices_ref.mData.data()); + hostReduce.Run( + alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data()); }; const auto i_inLengths = to_int_vector(args.inLengths); diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp index 25604149d4..572d576e7a 100644 --- a/include/ck/utility/math_v2.hpp +++ b/include/ck/utility/math_v2.hpp @@ -1,14 +1,64 @@ #ifndef CK_MATH_V2_HPP #define CK_MATH_V2_HPP +#include #include "data_type.hpp" +#include "half.hpp" namespace ck { namespace math { -static inline __device__ half_t abs(half_t x) { return __habs(x); }; -static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); }; -static inline __device__ bool isnan(half_t x) { return __hisnan(x); }; +static inline __host__ float abs(float x) { return std::abs(x); }; + +static inline __host__ double abs(double x) { return std::abs(x); }; + +static inline __host__ int8_t abs(int8_t x) +{ + int8_t sgn = x >> (8 - 1); + + return (x ^ sgn) - sgn; +}; + +static inline __host__ int32_t abs(int32_t x) +{ + int32_t sgn = x >> (32 - 1); + + return (x ^ sgn) - sgn; +}; + +static 
inline __host__ half_t abs(half_t x) +{ + half_float::half xx = *reinterpret_cast<half_float::half*>(&x); + + half_float::half abs_xx = half_float::abs(xx); + + half_t abs_x = *reinterpret_cast<half_t*>(&abs_xx); + + return abs_x; +}; + +static inline __host__ float isnan(float x) { return std::isnan(x); }; + +static inline __host__ double isnan(double x) { return std::isnan(x); }; + +static inline __host__ int8_t isnan(int8_t x) +{ + (void)x; + return false; +}; + +static inline __host__ int32_t isnan(int32_t x) +{ + (void)x; + return false; +}; + +static inline __host__ bool isnan(half_t x) +{ + half_float::half xx = *reinterpret_cast<half_float::half*>(&x); + + return half_float::isnan(xx); +}; } // namespace math } // namespace ck diff --git a/include/ck/utility/reduction_common.hpp b/include/ck/utility/reduction_common.hpp index 0cf6d31ed6..a34cfce837 100644 --- a/include/ck/utility/reduction_common.hpp +++ b/include/ck/utility/reduction_common.hpp @@ -33,7 +33,7 @@ namespace ck { struct float_equal_one { template <typename T> - __device__ inline bool operator()(T x) + __host__ __device__ inline bool operator()(T x) { return x <= static_cast<T>(1.0f) and x >= static_cast<T>(1.0f); }; @@ -42,7 +42,7 @@ struct float_equal_zero { template <typename T> - __device__ inline bool operator()(T x) + __host__ __device__ inline bool operator()(T x) { return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f); }; diff --git a/library/include/ck/library/host_tensor/host_reduce_util.hpp b/library/include/ck/library/host_tensor/host_reduce_util.hpp index cf301bb18a..53e17bcb5c 100644 --- a/library/include/ck/library/host_tensor/host_reduce_util.hpp +++ b/library/include/ck/library/host_tensor/host_reduce_util.hpp @@ -26,7 +26,6 @@ #ifndef GUARD_HOST_REDUCE_UTIL_HPP #define GUARD_HOST_REDUCE_UTIL_HPP -#include <half.hpp> #include #include #include @@ -34,6 +33,8 @@ #include #include "reduction_enums.hpp" +#include "data_type.hpp" +#include "math_v2.hpp" namespace ck { namespace host_reduce { using ck::NanPropagation; using
ck::ReduceTensorOp; -template <typename T> -static inline bool float_equal_one(T); - -static inline bool float_equal_one(float x) { return x == 1.0f; }; - -static inline bool float_equal_one(double x) { return x == 1.0; }; - -static inline bool float_equal_one(half_float::half x) -{ - return x == static_cast<half_float::half>(1.0f); -}; - -template <typename T> -static inline bool float_equal_zero(T x); - -static inline bool float_equal_zero(float x) { return x == 0.0f; }; - -static inline bool float_equal_zero(double x) { return x == 0.0; }; - -static inline bool float_equal_zero(half_float::half x) -{ - return x == static_cast<half_float::half>(0.0f); -}; - template <typename AccDataType, ReduceTensorOp ReduceOpId> __host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int) { - using std::abs; + using ck::math::abs; if constexpr(ReduceOpId == ReduceTensorOp::NORM1) { @@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal() } else if constexpr(ReduceOpId == ReduceTensorOp::MIN) { - return (std::numeric_limits<AccDataType>::max()); + return (ck::NumericLimits<AccDataType>::Max()); } else if constexpr(ReduceOpId == ReduceTensorOp::MAX) { - return (std::numeric_limits<AccDataType>::lowest()); + return (ck::NumericLimits<AccDataType>::Lowest()); } else if constexpr(ReduceOpId == ReduceTensorOp::AMAX) { @@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce, AccDataType& accuVal, AccDataType currVal) { - using std::isnan; + using ck::math::isnan; if constexpr(!PropagateNan) { @@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe int& accuIndex, int currIndex) - using std::isnan; + using ck::math::isnan; if constexpr(!PropagateNan) { diff --git a/library/include/ck/library/host_tensor/host_reduction.hpp b/library/include/ck/library/host_tensor/host_reduction.hpp index f25d753a46..786d34b73a 100644 --- a/library/include/ck/library/host_tensor/host_reduction.hpp +++ b/library/include/ck/library/host_tensor/host_reduction.hpp @@ -32,6 +32,7 @@ #include #include "reduction_enums.hpp" +#include "reduction_common.hpp" #include "host_reduce_util.hpp" #include "host_tensor.hpp" #include "data_type.hpp" @@
-196,10 +197,10 @@ struct ReductionHost OutDataType* out_data, IndexDataType* out_indices) { + using ck::float_equal_one; + using ck::float_equal_zero; using ck::type_convert; using ck::host_reduce::binop_with_nan_check2; - using ck::host_reduce::float_equal_one; - using ck::host_reduce::float_equal_zero; using ck::host_reduce::ReduceOpFn2; using ck::host_reduce::ReduceOpZeroVal; @@ -227,10 +228,10 @@ struct ReductionHost posUnaryOp(accuVal); - if(!float_equal_one(alpha)) + if(!float_equal_one{}(alpha)) accuVal *= type_convert(alpha); - if(!float_equal_zero(beta)) + if(!float_equal_zero{}(beta)) accuVal += type_convert(out_data[0]) * type_convert(beta); out_data[0] = type_convert(accuVal); @@ -263,13 +264,13 @@ struct ReductionHost posUnaryOp(accuVal); - if(!float_equal_one(alpha)) + if(!float_equal_one{}(alpha)) accuVal *= type_convert(alpha); auto dst_offset = get_offset_from_index(outStrides, invariant_index); - if(!float_equal_zero(beta)) + if(!float_equal_zero{}(beta)) accuVal += type_convert(out_data[dst_offset]) * type_convert(beta); @@ -303,10 +304,10 @@ struct ReductionHost void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data) { + using ck::float_equal_one; + using ck::float_equal_zero; using ck::type_convert; using ck::host_reduce::binop_with_nan_check; - using ck::host_reduce::float_equal_one; - using ck::host_reduce::float_equal_zero; using ck::host_reduce::ReduceOpFn; using ck::host_reduce::ReduceOpZeroVal; @@ -330,10 +331,10 @@ struct ReductionHost posUnaryOp(accuVal); - if(!float_equal_one(alpha)) + if(!float_equal_one{}(alpha)) accuVal *= type_convert(alpha); - if(!float_equal_zero(beta)) + if(!float_equal_zero{}(beta)) accuVal += type_convert(out_data[0]) * type_convert(beta); out_data[0] = type_convert(accuVal); @@ -361,13 +362,13 @@ struct ReductionHost posUnaryOp(accuVal); - if(!float_equal_one(alpha)) + if(!float_equal_one{}(alpha)) accuVal *= type_convert(alpha); auto dst_offset = 
get_offset_from_index(outStrides, invariant_index); - if(!float_equal_zero(beta)) + if(!float_equal_zero{}(beta)) accuVal += type_convert(out_data[dst_offset]) * type_convert(beta); diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp index db7886e4b0..678134f60b 100644 --- a/profiler/include/profile_reduce_impl.hpp +++ b/profiler/include/profile_reduce_impl.hpp @@ -380,13 +380,9 @@ void profile_reduce_impl_impl(bool do_verification, if(do_verification) { - using HostInDataType = typename type_mapping::OutType; - using HostOutDataType = typename type_mapping::OutType; - using HostAccDataType = typename type_mapping::OutType; - - ReductionHost hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); - hostReduce.Run(alpha, - reinterpret_cast(in.mData.data()), - beta, - reinterpret_cast(out_ref.mData.data()), - out_indices_ref.mData.data()); + hostReduce.Run( + alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data()); }; const auto i_inLengths = to_int_vector(inLengths); diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp index 6bb35f3fa6..28370cb2cd 100644 --- a/test/reduce/reduce_no_index.cpp +++ b/test/reduce/reduce_no_index.cpp @@ -37,19 +37,6 @@ static inline std::vector get_invariant_dims(const std::vector& reduce return invariantDims; }; -// map the data type used by the GPU kernels to the corresponding type used by the host codes -template -struct type_mapping -{ - using OutType = InType; -}; - -template <> -struct type_mapping -{ - using OutType = half_float::half; -}; - constexpr int Rank = 4; constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG; @@ -226,13 +213,9 @@ bool test_reduce_no_index_impl(int init_method, bool result = true; - using HostInDataType = typename type_mapping::OutType; - using HostOutDataType = typename type_mapping::OutType; - using HostAccDataType = typename type_mapping::OutType; - - ReductionHost hostReduce(in.mDesc, 
out_ref.mDesc, invariantDims, reduceDims); - hostReduce.Run(alpha, - reinterpret_cast(in.mData.data()), - beta, - reinterpret_cast(out_ref.mData.data()), - nullptr); + hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr); const auto i_inLengths = to_int_vector(inLengths); const auto i_inStrides = to_int_vector(inStrides); diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp index de67da9352..667b84a8dc 100644 --- a/test/reduce/reduce_with_index.cpp +++ b/test/reduce/reduce_with_index.cpp @@ -36,19 +36,6 @@ static inline std::vector get_invariant_dims(const std::vector& reduce return invariantDims; }; -// map the data type used by the GPU kernels to the corresponding type used by the host codes -template -struct type_mapping -{ - using OutType = InType; -}; - -template <> -struct type_mapping -{ - using OutType = half_float::half; -}; - constexpr int Rank = 4; constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AMAX; @@ -209,13 +196,9 @@ bool test_reduce_with_index_impl(int init_method, bool result = true; - using HostInDataType = typename type_mapping::OutType; - using HostOutDataType = typename type_mapping::OutType; - using HostAccDataType = typename type_mapping::OutType; - - ReductionHost hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); - hostReduce.Run(alpha, - reinterpret_cast(in.mData.data()), - beta, - reinterpret_cast(out_ref.mData.data()), - out_indices_ref.mData.data()); + hostReduce.Run( + alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data()); const auto i_inLengths = to_int_vector(inLengths); const auto i_inStrides = to_int_vector(inStrides);