mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
Use ck::half_t for Host Reduction (#195)
* Add math functions for host * Change to host reduction to use ck::math: * Remove the using of half_float::half and half.hpp from reduction example/profiler/ctest
This commit is contained in:
@@ -3,7 +3,6 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <getopt.h>
|
||||
#include <half.hpp>
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
@@ -27,10 +26,6 @@ using InDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
|
||||
using HostInDataType = half_float::half;
|
||||
using HostOutDataType = half_float::half;
|
||||
using HostAccDataType = float;
|
||||
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
@@ -306,9 +301,9 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(args.do_verification)
|
||||
{
|
||||
ReductionHost<HostInDataType,
|
||||
HostAccDataType,
|
||||
HostOutDataType,
|
||||
ReductionHost<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
ReduceOpId,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
@@ -316,11 +311,8 @@ int main(int argc, char* argv[])
|
||||
NeedIndices>
|
||||
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
|
||||
|
||||
hostReduce.Run(alpha,
|
||||
reinterpret_cast<const HostInDataType*>(in.mData.data()),
|
||||
beta,
|
||||
reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
|
||||
out_indices_ref.mData.data());
|
||||
hostReduce.Run(
|
||||
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
|
||||
};
|
||||
|
||||
const auto i_inLengths = to_int_vector(args.inLengths);
|
||||
|
||||
@@ -1,14 +1,64 @@
|
||||
#ifndef CK_MATH_V2_HPP
|
||||
#define CK_MATH_V2_HPP
|
||||
|
||||
#include <cmath>
|
||||
#include "data_type.hpp"
|
||||
#include "half.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace math {
|
||||
|
||||
static inline __device__ half_t abs(half_t x) { return __habs(x); };
|
||||
static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); };
|
||||
static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
|
||||
static inline __host__ float abs(float x) { return std::abs(x); };
|
||||
|
||||
static inline __host__ double abs(double x) { return std::abs(x); };
|
||||
|
||||
static inline __host__ int8_t abs(int8_t x)
|
||||
{
|
||||
int8_t sgn = x >> (8 - 1);
|
||||
|
||||
return (x ^ sgn) - sgn;
|
||||
};
|
||||
|
||||
static inline __host__ int32_t abs(int32_t x)
|
||||
{
|
||||
int32_t sgn = x >> (32 - 1);
|
||||
|
||||
return (x ^ sgn) - sgn;
|
||||
};
|
||||
|
||||
static inline __host__ half_t abs(half_t x)
|
||||
{
|
||||
half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
|
||||
|
||||
half_float::half abs_xx = half_float::abs(xx);
|
||||
|
||||
half_t abs_x = *reinterpret_cast<half_t*>(&abs_xx);
|
||||
|
||||
return abs_x;
|
||||
};
|
||||
|
||||
static inline __host__ float isnan(float x) { return std::isnan(x); };
|
||||
|
||||
static inline __host__ double isnan(double x) { return std::isnan(x); };
|
||||
|
||||
static inline __host__ int8_t isnan(int8_t x)
|
||||
{
|
||||
(void)x;
|
||||
return false;
|
||||
};
|
||||
|
||||
static inline __host__ int32_t isnan(int32_t x)
|
||||
{
|
||||
(void)x;
|
||||
return false;
|
||||
};
|
||||
|
||||
static inline __host__ bool isnan(half_t x)
|
||||
{
|
||||
half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
|
||||
|
||||
return half_float::isnan(xx);
|
||||
};
|
||||
|
||||
} // namespace math
|
||||
} // namespace ck
|
||||
|
||||
@@ -33,7 +33,7 @@ namespace ck {
|
||||
struct float_equal_one
|
||||
{
|
||||
template <class T>
|
||||
__device__ inline bool operator()(T x)
|
||||
__host__ __device__ inline bool operator()(T x)
|
||||
{
|
||||
return x <= static_cast<T>(1.0f) and x >= static_cast<T>(1.0f);
|
||||
};
|
||||
@@ -42,7 +42,7 @@ struct float_equal_one
|
||||
struct float_equal_zero
|
||||
{
|
||||
template <class T>
|
||||
__device__ inline bool operator()(T x)
|
||||
__host__ __device__ inline bool operator()(T x)
|
||||
{
|
||||
return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f);
|
||||
};
|
||||
|
||||
@@ -26,7 +26,6 @@
|
||||
#ifndef GUARD_HOST_REDUCE_UTIL_HPP
|
||||
#define GUARD_HOST_REDUCE_UTIL_HPP
|
||||
|
||||
#include <half.hpp>
|
||||
#include <limits>
|
||||
#include <cmath>
|
||||
#include <cassert>
|
||||
@@ -34,6 +33,8 @@
|
||||
#include <string>
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "data_type.hpp"
|
||||
#include "math_v2.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -42,34 +43,10 @@ namespace host_reduce {
|
||||
using ck::NanPropagation;
|
||||
using ck::ReduceTensorOp;
|
||||
|
||||
template <typename T>
|
||||
static inline bool float_equal_one(T);
|
||||
|
||||
static inline bool float_equal_one(float x) { return x == 1.0f; };
|
||||
|
||||
static inline bool float_equal_one(double x) { return x == 1.0; };
|
||||
|
||||
static inline bool float_equal_one(half_float::half x)
|
||||
{
|
||||
return x == static_cast<half_float::half>(1.0f);
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
static inline bool float_equal_zero(T x);
|
||||
|
||||
static inline bool float_equal_zero(float x) { return x == 0.0f; };
|
||||
|
||||
static inline bool float_equal_zero(double x) { return x == 0.0; };
|
||||
|
||||
static inline bool float_equal_zero(half_float::half x)
|
||||
{
|
||||
return x == static_cast<half_float::half>(0.0f);
|
||||
};
|
||||
|
||||
template <typename AccDataType, ReduceTensorOp ReduceOpId>
|
||||
__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
|
||||
{
|
||||
using std::abs;
|
||||
using ck::math::abs;
|
||||
|
||||
if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
|
||||
{
|
||||
@@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal()
|
||||
}
|
||||
else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
|
||||
{
|
||||
return (std::numeric_limits<AccDataType>::max());
|
||||
return (ck::NumericLimits<AccDataType>::Max());
|
||||
}
|
||||
else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
|
||||
{
|
||||
return (std::numeric_limits<AccDataType>::lowest());
|
||||
return (ck::NumericLimits<AccDataType>::Lowest());
|
||||
}
|
||||
else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
|
||||
{
|
||||
@@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
|
||||
AccDataType& accuVal,
|
||||
AccDataType currVal)
|
||||
{
|
||||
using std::isnan;
|
||||
using ck::math::isnan;
|
||||
|
||||
if constexpr(!PropagateNan)
|
||||
{
|
||||
@@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe
|
||||
int& accuIndex,
|
||||
int currIndex)
|
||||
{
|
||||
using std::isnan;
|
||||
using ck::math::isnan;
|
||||
|
||||
if constexpr(!PropagateNan)
|
||||
{
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
#include <functional>
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_common.hpp"
|
||||
#include "host_reduce_util.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "data_type.hpp"
|
||||
@@ -196,10 +197,10 @@ struct ReductionHost
|
||||
OutDataType* out_data,
|
||||
IndexDataType* out_indices)
|
||||
{
|
||||
using ck::float_equal_one;
|
||||
using ck::float_equal_zero;
|
||||
using ck::type_convert;
|
||||
using ck::host_reduce::binop_with_nan_check2;
|
||||
using ck::host_reduce::float_equal_one;
|
||||
using ck::host_reduce::float_equal_zero;
|
||||
using ck::host_reduce::ReduceOpFn2;
|
||||
using ck::host_reduce::ReduceOpZeroVal;
|
||||
|
||||
@@ -227,10 +228,10 @@ struct ReductionHost
|
||||
|
||||
posUnaryOp(accuVal);
|
||||
|
||||
if(!float_equal_one(alpha))
|
||||
if(!float_equal_one{}(alpha))
|
||||
accuVal *= type_convert<AccDataType>(alpha);
|
||||
|
||||
if(!float_equal_zero(beta))
|
||||
if(!float_equal_zero{}(beta))
|
||||
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
|
||||
|
||||
out_data[0] = type_convert<OutDataType>(accuVal);
|
||||
@@ -263,13 +264,13 @@ struct ReductionHost
|
||||
|
||||
posUnaryOp(accuVal);
|
||||
|
||||
if(!float_equal_one(alpha))
|
||||
if(!float_equal_one{}(alpha))
|
||||
accuVal *= type_convert<AccDataType>(alpha);
|
||||
|
||||
auto dst_offset =
|
||||
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
|
||||
|
||||
if(!float_equal_zero(beta))
|
||||
if(!float_equal_zero{}(beta))
|
||||
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
|
||||
type_convert<AccDataType>(beta);
|
||||
|
||||
@@ -303,10 +304,10 @@ struct ReductionHost
|
||||
|
||||
void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
|
||||
{
|
||||
using ck::float_equal_one;
|
||||
using ck::float_equal_zero;
|
||||
using ck::type_convert;
|
||||
using ck::host_reduce::binop_with_nan_check;
|
||||
using ck::host_reduce::float_equal_one;
|
||||
using ck::host_reduce::float_equal_zero;
|
||||
using ck::host_reduce::ReduceOpFn;
|
||||
using ck::host_reduce::ReduceOpZeroVal;
|
||||
|
||||
@@ -330,10 +331,10 @@ struct ReductionHost
|
||||
|
||||
posUnaryOp(accuVal);
|
||||
|
||||
if(!float_equal_one(alpha))
|
||||
if(!float_equal_one{}(alpha))
|
||||
accuVal *= type_convert<AccDataType>(alpha);
|
||||
|
||||
if(!float_equal_zero(beta))
|
||||
if(!float_equal_zero{}(beta))
|
||||
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
|
||||
|
||||
out_data[0] = type_convert<OutDataType>(accuVal);
|
||||
@@ -361,13 +362,13 @@ struct ReductionHost
|
||||
|
||||
posUnaryOp(accuVal);
|
||||
|
||||
if(!float_equal_one(alpha))
|
||||
if(!float_equal_one{}(alpha))
|
||||
accuVal *= type_convert<AccDataType>(alpha);
|
||||
|
||||
auto dst_offset =
|
||||
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
|
||||
|
||||
if(!float_equal_zero(beta))
|
||||
if(!float_equal_zero{}(beta))
|
||||
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
|
||||
type_convert<AccDataType>(beta);
|
||||
|
||||
|
||||
@@ -380,13 +380,9 @@ void profile_reduce_impl_impl(bool do_verification,
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
using HostInDataType = typename type_mapping<InDataType>::OutType;
|
||||
using HostOutDataType = typename type_mapping<OutDataType>::OutType;
|
||||
using HostAccDataType = typename type_mapping<AccDataType>::OutType;
|
||||
|
||||
ReductionHost<HostInDataType,
|
||||
HostAccDataType,
|
||||
HostOutDataType,
|
||||
ReductionHost<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
ReduceOpId,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
@@ -394,11 +390,8 @@ void profile_reduce_impl_impl(bool do_verification,
|
||||
NeedIndices>
|
||||
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
|
||||
|
||||
hostReduce.Run(alpha,
|
||||
reinterpret_cast<const HostInDataType*>(in.mData.data()),
|
||||
beta,
|
||||
reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
|
||||
out_indices_ref.mData.data());
|
||||
hostReduce.Run(
|
||||
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
|
||||
};
|
||||
|
||||
const auto i_inLengths = to_int_vector(inLengths);
|
||||
|
||||
@@ -37,19 +37,6 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
|
||||
return invariantDims;
|
||||
};
|
||||
|
||||
// map the data type used by the GPU kernels to the corresponding type used by the host codes
|
||||
template <typename InType>
|
||||
struct type_mapping
|
||||
{
|
||||
using OutType = InType;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_mapping<ck::half_t>
|
||||
{
|
||||
using OutType = half_float::half;
|
||||
};
|
||||
|
||||
constexpr int Rank = 4;
|
||||
|
||||
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG;
|
||||
@@ -226,13 +213,9 @@ bool test_reduce_no_index_impl(int init_method,
|
||||
|
||||
bool result = true;
|
||||
|
||||
using HostInDataType = typename type_mapping<InDataType>::OutType;
|
||||
using HostOutDataType = typename type_mapping<OutDataType>::OutType;
|
||||
using HostAccDataType = typename type_mapping<AccDataType>::OutType;
|
||||
|
||||
ReductionHost<HostInDataType,
|
||||
HostAccDataType,
|
||||
HostOutDataType,
|
||||
ReductionHost<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
ReduceOpId,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
@@ -240,11 +223,7 @@ bool test_reduce_no_index_impl(int init_method,
|
||||
NeedIndices>
|
||||
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
|
||||
|
||||
hostReduce.Run(alpha,
|
||||
reinterpret_cast<const HostInDataType*>(in.mData.data()),
|
||||
beta,
|
||||
reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
|
||||
nullptr);
|
||||
hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr);
|
||||
|
||||
const auto i_inLengths = to_int_vector(inLengths);
|
||||
const auto i_inStrides = to_int_vector(inStrides);
|
||||
|
||||
@@ -36,19 +36,6 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
|
||||
return invariantDims;
|
||||
};
|
||||
|
||||
// map the data type used by the GPU kernels to the corresponding type used by the host codes
|
||||
template <typename InType>
|
||||
struct type_mapping
|
||||
{
|
||||
using OutType = InType;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_mapping<ck::half_t>
|
||||
{
|
||||
using OutType = half_float::half;
|
||||
};
|
||||
|
||||
constexpr int Rank = 4;
|
||||
|
||||
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AMAX;
|
||||
@@ -209,13 +196,9 @@ bool test_reduce_with_index_impl(int init_method,
|
||||
|
||||
bool result = true;
|
||||
|
||||
using HostInDataType = typename type_mapping<InDataType>::OutType;
|
||||
using HostOutDataType = typename type_mapping<OutDataType>::OutType;
|
||||
using HostAccDataType = typename type_mapping<AccDataType>::OutType;
|
||||
|
||||
ReductionHost<HostInDataType,
|
||||
HostAccDataType,
|
||||
HostOutDataType,
|
||||
ReductionHost<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
ReduceOpId,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
@@ -223,11 +206,8 @@ bool test_reduce_with_index_impl(int init_method,
|
||||
NeedIndices>
|
||||
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
|
||||
|
||||
hostReduce.Run(alpha,
|
||||
reinterpret_cast<const HostInDataType*>(in.mData.data()),
|
||||
beta,
|
||||
reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
|
||||
out_indices_ref.mData.data());
|
||||
hostReduce.Run(
|
||||
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
|
||||
|
||||
const auto i_inLengths = to_int_vector(inLengths);
|
||||
const auto i_inStrides = to_int_vector(inStrides);
|
||||
|
||||
Reference in New Issue
Block a user