mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-07 08:15:04 +00:00
Merge remote-tracking branch 'origin/develop' into improve_pipeline
This commit is contained in:
@@ -3,7 +3,6 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <getopt.h>
|
||||
#include <half.hpp>
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
@@ -27,10 +26,6 @@ using InDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
|
||||
using HostInDataType = half_float::half;
|
||||
using HostOutDataType = half_float::half;
|
||||
using HostAccDataType = float;
|
||||
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
@@ -306,9 +301,9 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(args.do_verification)
|
||||
{
|
||||
ReductionHost<HostInDataType,
|
||||
HostAccDataType,
|
||||
HostOutDataType,
|
||||
ReductionHost<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
ReduceOpId,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
@@ -316,11 +311,8 @@ int main(int argc, char* argv[])
|
||||
NeedIndices>
|
||||
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
|
||||
|
||||
hostReduce.Run(alpha,
|
||||
reinterpret_cast<const HostInDataType*>(in.mData.data()),
|
||||
beta,
|
||||
reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
|
||||
out_indices_ref.mData.data());
|
||||
hostReduce.Run(
|
||||
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
|
||||
};
|
||||
|
||||
const auto i_inLengths = to_int_vector(args.inLengths);
|
||||
|
||||
@@ -37,6 +37,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
|
||||
|
||||
static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack>{};
|
||||
|
||||
static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
|
||||
|
||||
static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
|
||||
static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
|
||||
|
||||
@@ -69,7 +71,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
|
||||
|
||||
const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex();
|
||||
|
||||
return make_tuple(0, waveId_m, xdlops_a_idx[I1], Number<KPack>{} * xdlops_a_idx[I0]);
|
||||
return make_tuple(0, waveId_m, xdlops_a_idx[I1], KPerThread * xdlops_a_idx[I0]);
|
||||
}
|
||||
|
||||
__device__ static auto CalculateBThreadOriginDataIndex()
|
||||
@@ -80,7 +82,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
|
||||
|
||||
const auto xdlops_b_idx = xdlops_gemm.CalculateBThreadOriginDataIndex();
|
||||
|
||||
return make_tuple(0, waveId_n, xdlops_b_idx[I1], Number<KPack>{} * xdlops_b_idx[I0]);
|
||||
return make_tuple(0, waveId_n, xdlops_b_idx[I1], KPerThread * xdlops_b_idx[I0]);
|
||||
}
|
||||
|
||||
template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
|
||||
@@ -271,7 +273,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
|
||||
make_tuple(I0, I0, I0, I0),
|
||||
b_thread_buf);
|
||||
|
||||
static_for<0, KPerBlock, KPack * xdlops_gemm.K0PerXdlops>{}([&](auto k) {
|
||||
static_for<0, KPerThread, KPack>{}([&](auto k) {
|
||||
vector_type<FloatAB, KPack> a_thread_vec;
|
||||
vector_type<FloatAB, KPack> b_thread_vec;
|
||||
|
||||
@@ -298,13 +300,13 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
|
||||
}
|
||||
|
||||
private:
|
||||
// A[M0, M1, M2, KPerBlock]
|
||||
// A[M0, M1, M2, KPerThread]
|
||||
static constexpr auto a_thread_desc_ =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerBlock>{}));
|
||||
make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
|
||||
|
||||
// B[N0, N1, N2, KPerBlock]
|
||||
// B[N0, N1, N2, KPerThread]
|
||||
static constexpr auto b_thread_desc_ =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerBlock>{}));
|
||||
make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
|
||||
|
||||
// C[M, N, NumRegXdlops]
|
||||
static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
|
||||
@@ -314,7 +316,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
|
||||
FloatAB,
|
||||
decltype(a_block_desc_m0_m1_m2_k),
|
||||
decltype(a_thread_desc_),
|
||||
Sequence<1, 1, 1, KPerBlock>,
|
||||
Sequence<1, 1, 1, KPerThread>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
3,
|
||||
A_K1,
|
||||
@@ -324,7 +326,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
|
||||
FloatAB,
|
||||
decltype(b_block_desc_n0_n1_n2_k),
|
||||
decltype(b_thread_desc_),
|
||||
Sequence<1, 1, 1, KPerBlock>,
|
||||
Sequence<1, 1, 1, KPerThread>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
3,
|
||||
B_K1,
|
||||
|
||||
@@ -1,14 +1,64 @@
|
||||
#ifndef CK_MATH_V2_HPP
|
||||
#define CK_MATH_V2_HPP
|
||||
|
||||
#include <cmath>
|
||||
#include "data_type.hpp"
|
||||
#include "half.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace math {
|
||||
|
||||
static inline __device__ half_t abs(half_t x) { return __habs(x); };
|
||||
static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); };
|
||||
static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
|
||||
static inline __host__ float abs(float x) { return std::abs(x); };
|
||||
|
||||
static inline __host__ double abs(double x) { return std::abs(x); };
|
||||
|
||||
static inline __host__ int8_t abs(int8_t x)
|
||||
{
|
||||
int8_t sgn = x >> (8 - 1);
|
||||
|
||||
return (x ^ sgn) - sgn;
|
||||
};
|
||||
|
||||
static inline __host__ int32_t abs(int32_t x)
|
||||
{
|
||||
int32_t sgn = x >> (32 - 1);
|
||||
|
||||
return (x ^ sgn) - sgn;
|
||||
};
|
||||
|
||||
static inline __host__ half_t abs(half_t x)
|
||||
{
|
||||
half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
|
||||
|
||||
half_float::half abs_xx = half_float::abs(xx);
|
||||
|
||||
half_t abs_x = *reinterpret_cast<half_t*>(&abs_xx);
|
||||
|
||||
return abs_x;
|
||||
};
|
||||
|
||||
static inline __host__ float isnan(float x) { return std::isnan(x); };
|
||||
|
||||
static inline __host__ double isnan(double x) { return std::isnan(x); };
|
||||
|
||||
static inline __host__ int8_t isnan(int8_t x)
|
||||
{
|
||||
(void)x;
|
||||
return false;
|
||||
};
|
||||
|
||||
static inline __host__ int32_t isnan(int32_t x)
|
||||
{
|
||||
(void)x;
|
||||
return false;
|
||||
};
|
||||
|
||||
static inline __host__ bool isnan(half_t x)
|
||||
{
|
||||
half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
|
||||
|
||||
return half_float::isnan(xx);
|
||||
};
|
||||
|
||||
} // namespace math
|
||||
} // namespace ck
|
||||
|
||||
@@ -33,7 +33,7 @@ namespace ck {
|
||||
struct float_equal_one
|
||||
{
|
||||
template <class T>
|
||||
__device__ inline bool operator()(T x)
|
||||
__host__ __device__ inline bool operator()(T x)
|
||||
{
|
||||
return x <= static_cast<T>(1.0f) and x >= static_cast<T>(1.0f);
|
||||
};
|
||||
@@ -42,7 +42,7 @@ struct float_equal_one
|
||||
struct float_equal_zero
|
||||
{
|
||||
template <class T>
|
||||
__device__ inline bool operator()(T x)
|
||||
__host__ __device__ inline bool operator()(T x)
|
||||
{
|
||||
return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f);
|
||||
};
|
||||
|
||||
@@ -26,7 +26,6 @@
|
||||
#ifndef GUARD_HOST_REDUCE_UTIL_HPP
|
||||
#define GUARD_HOST_REDUCE_UTIL_HPP
|
||||
|
||||
#include <half.hpp>
|
||||
#include <limits>
|
||||
#include <cmath>
|
||||
#include <cassert>
|
||||
@@ -34,6 +33,8 @@
|
||||
#include <string>
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "data_type.hpp"
|
||||
#include "math_v2.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -42,34 +43,10 @@ namespace host_reduce {
|
||||
using ck::NanPropagation;
|
||||
using ck::ReduceTensorOp;
|
||||
|
||||
template <typename T>
|
||||
static inline bool float_equal_one(T);
|
||||
|
||||
static inline bool float_equal_one(float x) { return x == 1.0f; };
|
||||
|
||||
static inline bool float_equal_one(double x) { return x == 1.0; };
|
||||
|
||||
static inline bool float_equal_one(half_float::half x)
|
||||
{
|
||||
return x == static_cast<half_float::half>(1.0f);
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
static inline bool float_equal_zero(T x);
|
||||
|
||||
static inline bool float_equal_zero(float x) { return x == 0.0f; };
|
||||
|
||||
static inline bool float_equal_zero(double x) { return x == 0.0; };
|
||||
|
||||
static inline bool float_equal_zero(half_float::half x)
|
||||
{
|
||||
return x == static_cast<half_float::half>(0.0f);
|
||||
};
|
||||
|
||||
template <typename AccDataType, ReduceTensorOp ReduceOpId>
|
||||
__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
|
||||
{
|
||||
using std::abs;
|
||||
using ck::math::abs;
|
||||
|
||||
if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
|
||||
{
|
||||
@@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal()
|
||||
}
|
||||
else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
|
||||
{
|
||||
return (std::numeric_limits<AccDataType>::max());
|
||||
return (ck::NumericLimits<AccDataType>::Max());
|
||||
}
|
||||
else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
|
||||
{
|
||||
return (std::numeric_limits<AccDataType>::lowest());
|
||||
return (ck::NumericLimits<AccDataType>::Lowest());
|
||||
}
|
||||
else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
|
||||
{
|
||||
@@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
|
||||
AccDataType& accuVal,
|
||||
AccDataType currVal)
|
||||
{
|
||||
using std::isnan;
|
||||
using ck::math::isnan;
|
||||
|
||||
if constexpr(!PropagateNan)
|
||||
{
|
||||
@@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe
|
||||
int& accuIndex,
|
||||
int currIndex)
|
||||
{
|
||||
using std::isnan;
|
||||
using ck::math::isnan;
|
||||
|
||||
if constexpr(!PropagateNan)
|
||||
{
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
#include <functional>
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_common.hpp"
|
||||
#include "host_reduce_util.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "data_type.hpp"
|
||||
@@ -196,10 +197,10 @@ struct ReductionHost
|
||||
OutDataType* out_data,
|
||||
IndexDataType* out_indices)
|
||||
{
|
||||
using ck::float_equal_one;
|
||||
using ck::float_equal_zero;
|
||||
using ck::type_convert;
|
||||
using ck::host_reduce::binop_with_nan_check2;
|
||||
using ck::host_reduce::float_equal_one;
|
||||
using ck::host_reduce::float_equal_zero;
|
||||
using ck::host_reduce::ReduceOpFn2;
|
||||
using ck::host_reduce::ReduceOpZeroVal;
|
||||
|
||||
@@ -227,10 +228,10 @@ struct ReductionHost
|
||||
|
||||
posUnaryOp(accuVal);
|
||||
|
||||
if(!float_equal_one(alpha))
|
||||
if(!float_equal_one{}(alpha))
|
||||
accuVal *= type_convert<AccDataType>(alpha);
|
||||
|
||||
if(!float_equal_zero(beta))
|
||||
if(!float_equal_zero{}(beta))
|
||||
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
|
||||
|
||||
out_data[0] = type_convert<OutDataType>(accuVal);
|
||||
@@ -263,13 +264,13 @@ struct ReductionHost
|
||||
|
||||
posUnaryOp(accuVal);
|
||||
|
||||
if(!float_equal_one(alpha))
|
||||
if(!float_equal_one{}(alpha))
|
||||
accuVal *= type_convert<AccDataType>(alpha);
|
||||
|
||||
auto dst_offset =
|
||||
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
|
||||
|
||||
if(!float_equal_zero(beta))
|
||||
if(!float_equal_zero{}(beta))
|
||||
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
|
||||
type_convert<AccDataType>(beta);
|
||||
|
||||
@@ -303,10 +304,10 @@ struct ReductionHost
|
||||
|
||||
void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
|
||||
{
|
||||
using ck::float_equal_one;
|
||||
using ck::float_equal_zero;
|
||||
using ck::type_convert;
|
||||
using ck::host_reduce::binop_with_nan_check;
|
||||
using ck::host_reduce::float_equal_one;
|
||||
using ck::host_reduce::float_equal_zero;
|
||||
using ck::host_reduce::ReduceOpFn;
|
||||
using ck::host_reduce::ReduceOpZeroVal;
|
||||
|
||||
@@ -330,10 +331,10 @@ struct ReductionHost
|
||||
|
||||
posUnaryOp(accuVal);
|
||||
|
||||
if(!float_equal_one(alpha))
|
||||
if(!float_equal_one{}(alpha))
|
||||
accuVal *= type_convert<AccDataType>(alpha);
|
||||
|
||||
if(!float_equal_zero(beta))
|
||||
if(!float_equal_zero{}(beta))
|
||||
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
|
||||
|
||||
out_data[0] = type_convert<OutDataType>(accuVal);
|
||||
@@ -361,13 +362,13 @@ struct ReductionHost
|
||||
|
||||
posUnaryOp(accuVal);
|
||||
|
||||
if(!float_equal_one(alpha))
|
||||
if(!float_equal_one{}(alpha))
|
||||
accuVal *= type_convert<AccDataType>(alpha);
|
||||
|
||||
auto dst_offset =
|
||||
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
|
||||
|
||||
if(!float_equal_zero(beta))
|
||||
if(!float_equal_zero{}(beta))
|
||||
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
|
||||
type_convert<AccDataType>(beta);
|
||||
|
||||
|
||||
@@ -380,13 +380,9 @@ void profile_reduce_impl_impl(bool do_verification,
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
using HostInDataType = typename type_mapping<InDataType>::OutType;
|
||||
using HostOutDataType = typename type_mapping<OutDataType>::OutType;
|
||||
using HostAccDataType = typename type_mapping<AccDataType>::OutType;
|
||||
|
||||
ReductionHost<HostInDataType,
|
||||
HostAccDataType,
|
||||
HostOutDataType,
|
||||
ReductionHost<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
ReduceOpId,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
@@ -394,11 +390,8 @@ void profile_reduce_impl_impl(bool do_verification,
|
||||
NeedIndices>
|
||||
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
|
||||
|
||||
hostReduce.Run(alpha,
|
||||
reinterpret_cast<const HostInDataType*>(in.mData.data()),
|
||||
beta,
|
||||
reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
|
||||
out_indices_ref.mData.data());
|
||||
hostReduce.Run(
|
||||
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
|
||||
};
|
||||
|
||||
const auto i_inLengths = to_int_vector(inLengths);
|
||||
|
||||
@@ -37,19 +37,6 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
|
||||
return invariantDims;
|
||||
};
|
||||
|
||||
// map the data type used by the GPU kernels to the corresponding type used by the host codes
|
||||
template <typename InType>
|
||||
struct type_mapping
|
||||
{
|
||||
using OutType = InType;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_mapping<ck::half_t>
|
||||
{
|
||||
using OutType = half_float::half;
|
||||
};
|
||||
|
||||
constexpr int Rank = 4;
|
||||
|
||||
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG;
|
||||
@@ -226,13 +213,9 @@ bool test_reduce_no_index_impl(int init_method,
|
||||
|
||||
bool result = true;
|
||||
|
||||
using HostInDataType = typename type_mapping<InDataType>::OutType;
|
||||
using HostOutDataType = typename type_mapping<OutDataType>::OutType;
|
||||
using HostAccDataType = typename type_mapping<AccDataType>::OutType;
|
||||
|
||||
ReductionHost<HostInDataType,
|
||||
HostAccDataType,
|
||||
HostOutDataType,
|
||||
ReductionHost<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
ReduceOpId,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
@@ -240,11 +223,7 @@ bool test_reduce_no_index_impl(int init_method,
|
||||
NeedIndices>
|
||||
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
|
||||
|
||||
hostReduce.Run(alpha,
|
||||
reinterpret_cast<const HostInDataType*>(in.mData.data()),
|
||||
beta,
|
||||
reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
|
||||
nullptr);
|
||||
hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr);
|
||||
|
||||
const auto i_inLengths = to_int_vector(inLengths);
|
||||
const auto i_inStrides = to_int_vector(inStrides);
|
||||
|
||||
@@ -36,19 +36,6 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
|
||||
return invariantDims;
|
||||
};
|
||||
|
||||
// map the data type used by the GPU kernels to the corresponding type used by the host codes
|
||||
template <typename InType>
|
||||
struct type_mapping
|
||||
{
|
||||
using OutType = InType;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_mapping<ck::half_t>
|
||||
{
|
||||
using OutType = half_float::half;
|
||||
};
|
||||
|
||||
constexpr int Rank = 4;
|
||||
|
||||
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AMAX;
|
||||
@@ -209,13 +196,9 @@ bool test_reduce_with_index_impl(int init_method,
|
||||
|
||||
bool result = true;
|
||||
|
||||
using HostInDataType = typename type_mapping<InDataType>::OutType;
|
||||
using HostOutDataType = typename type_mapping<OutDataType>::OutType;
|
||||
using HostAccDataType = typename type_mapping<AccDataType>::OutType;
|
||||
|
||||
ReductionHost<HostInDataType,
|
||||
HostAccDataType,
|
||||
HostOutDataType,
|
||||
ReductionHost<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
ReduceOpId,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
@@ -223,11 +206,8 @@ bool test_reduce_with_index_impl(int init_method,
|
||||
NeedIndices>
|
||||
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
|
||||
|
||||
hostReduce.Run(alpha,
|
||||
reinterpret_cast<const HostInDataType*>(in.mData.data()),
|
||||
beta,
|
||||
reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
|
||||
out_indices_ref.mData.data());
|
||||
hostReduce.Run(
|
||||
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
|
||||
|
||||
const auto i_inLengths = to_int_vector(inLengths);
|
||||
const auto i_inStrides = to_int_vector(inStrides);
|
||||
|
||||
Reference in New Issue
Block a user