mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-13 01:36:06 +00:00)
Compile for gfx908 and gfx90a (#130)

* adding compilation for multiple targets
* fix build
* clean
* update Jenkinsfile
* update readme
* update Jenkins
* use ck::half_t instead of ushort for bf16
* rename enum classes
* clean
* rename
* clean
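Editor's note: "rename enum classes" refers to dropping the _t suffix from the scoped enums used throughout the diff below (ck::ActivTypeEnum_t -> ck::ActivTypeEnum, ck::ReduceTensorOp_t -> ck::ReduceTensorOp, ck::NanPropagation_t -> ck::NanPropagation, ck::ReduceTensorIndices_t -> ck::ReduceTensorIndices, ck::InMemoryDataOperationEnum_t -> ck::InMemoryDataOperationEnum). A minimal sketch of the pattern; the enumerator list is taken from the first hunk below, and the definition shown here is illustrative rather than the library's actual header:

    namespace ck {
    // before this commit (assumed spelling):
    // enum class ActivTypeEnum_t { None, LeakyRelu, Sigmoid };
    // after this commit:
    enum class ActivTypeEnum { None, LeakyRelu, Sigmoid };
    } // namespace ck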
@@ -75,14 +75,14 @@ calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDes
 }
 
 template <typename T>
-inline auto activ(T v, const ck::ActivTypeEnum_t activ_type)
+inline auto activ(T v, const ck::ActivTypeEnum activ_type)
 {
     const T alpha = 0.3;
     switch(activ_type)
     {
-    case ck::ActivTypeEnum_t::None: return v;
-    case ck::ActivTypeEnum_t::LeakyRelu: return (v >= 0 ? v : alpha * v);
-    case ck::ActivTypeEnum_t::Sigmoid: return (1 / (1 + exp(-v)));
+    case ck::ActivTypeEnum::None: return v;
+    case ck::ActivTypeEnum::LeakyRelu: return (v >= 0 ? v : alpha * v);
+    case ck::ActivTypeEnum::Sigmoid: return (1 / (1 + exp(-v)));
     default: throw std::runtime_error("unsupported activ type"); break;
     }
 }
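A minimal host-side caller sketch for the renamed signature above, assuming float inputs; the 0.3 leak factor comes from the function body shown in this hunk, and the calls themselves are illustrative rather than part of the diff:

    // Hypothetical usage of activ() after the rename.
    float leaky = activ(-2.0f, ck::ActivTypeEnum::LeakyRelu); // -2.0f * 0.3f = -0.6f
    float sig   = activ(0.0f, ck::ActivTypeEnum::Sigmoid);    // 1 / (1 + exp(0)) = 0.5f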
@@ -1,6 +1,5 @@
 #pragma once
 #include "host_tensor.hpp"
 #include "common_header.hpp"
 
 template <typename TensorDesc>
 void ostream_tensor_descriptor(TensorDesc, std::ostream& os = std::cout)
@@ -39,8 +39,8 @@ namespace ck {
 
 namespace host_reduce {
 
-using ck::NanPropagation_t;
-using ck::ReduceTensorOp_t;
+using ck::NanPropagation;
+using ck::ReduceTensorOp;
 
 template <typename T>
 static inline bool float_equal_one(T);
@@ -66,44 +66,44 @@ static inline bool float_equal_zero(half_float::half x)
     return x == static_cast<half_float::half>(0.0f);
 };
 
-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 __host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
 {
     using std::abs;
 
-    if constexpr(ReduceOpId == ReduceTensorOp_t::NORM1)
+    if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
     {
         return ([&](AccDataType& a_) { a_ = abs(a_); });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
+    else if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
     {
         return ([&](AccDataType& a_) { a_ = a_ * a_; });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
     {
         return ([&](AccDataType& a_) { a_ = abs(a_); });
     }
     else
     {
-        // ReduceTensorOp_t::AVG:
-        // ReduceTensorOp_t::ADD:
-        // ReduceTensorOp_t::MUL:
-        // ReduceTensorOp_t::MIN:
-        // ReduceTensorOp_t::MAX:
+        // ReduceTensorOp::AVG:
+        // ReduceTensorOp::ADD:
+        // ReduceTensorOp::MUL:
+        // ReduceTensorOp::MIN:
+        // ReduceTensorOp::MAX:
         return ([&](AccDataType&) {});
     };
 };
 
-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 __host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t divider)
 {
     using std::sqrt;
 
-    if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
+    if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
     {
         return ([&](AccDataType& a_) { a_ = sqrt(a_); });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::AVG)
+    else if constexpr(ReduceOpId == ReduceTensorOp::AVG)
     {
         return ([&, divider](AccDataType& a_) {
             a_ = a_ / static_cast<AccDataType>(static_cast<float>(divider));
@@ -111,36 +111,36 @@ __host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t di
     }
     else
     {
-        // ReduceTensorOp_t::ADD:
-        // ReduceTensorOp_t::NORM1:
-        // ReduceTensorOp_t::MUL:
-        // ReduceTensorOp_t::MIN:
-        // ReduceTensorOp_t::MAX:
-        // ReduceTensorOp_t::AMAX:
+        // ReduceTensorOp::ADD:
+        // ReduceTensorOp::NORM1:
+        // ReduceTensorOp::MUL:
+        // ReduceTensorOp::MIN:
+        // ReduceTensorOp::MAX:
+        // ReduceTensorOp::AMAX:
         return ([&](AccDataType&) {});
     }
 };
 
-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 __host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn()
 {
-    if constexpr(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::AVG ||
-                 ReduceOpId == ReduceTensorOp_t::NORM1 || ReduceOpId == ReduceTensorOp_t::NORM2)
+    if constexpr(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG ||
+                 ReduceOpId == ReduceTensorOp::NORM1 || ReduceOpId == ReduceTensorOp::NORM2)
     {
         return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MUL)
     {
         return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
     {
         return ([&](AccDataType& a_, AccDataType b_) {
             if(a_ > b_)
                 a_ = b_;
         });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
     {
         return ([&](AccDataType& a_, AccDataType b_) {
             if(a_ < b_)
@@ -149,10 +149,10 @@ __host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn
     }
 };
 
-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 __host__ static inline std::function<void(AccDataType&, AccDataType, bool& changed)> ReduceOpFn2()
 {
-    if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
+    if constexpr(ReduceOpId == ReduceTensorOp::MIN)
     {
         return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
             if(a_ > b_)
@@ -164,7 +164,7 @@ __host__ static inline std::function<void(AccDataType&, AccDataType, bool& chang
                 changed = false;
         });
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
     {
         return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
             if(a_ < b_)
@@ -178,40 +178,40 @@ __host__ static inline std::function<void(AccDataType&, AccDataType, bool& chang
     }
     else
     {
-        // ReduceTensorOp_t::ADD:
-        // ReduceTensorOp_t::MUL:
-        // ReduceTensorOp_t::AVG:
-        // ReduceTensorOp_t::NORM1:
-        // ReduceTensorOp_t::NORM2:
+        // ReduceTensorOp::ADD:
+        // ReduceTensorOp::MUL:
+        // ReduceTensorOp::AVG:
+        // ReduceTensorOp::NORM1:
+        // ReduceTensorOp::NORM2:
         return (std::function<void(AccDataType&, AccDataType, bool&)>{});
     };
 };
 
-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 __host__ static inline AccDataType ReduceOpZeroVal()
 {
-    if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
+    if constexpr(ReduceOpId == ReduceTensorOp::MUL)
     {
         return (static_cast<AccDataType>(1.0f));
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
     {
         return (std::numeric_limits<AccDataType>::max());
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
     {
         return (std::numeric_limits<AccDataType>::lowest());
     }
-    else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
+    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
    {
         return (static_cast<AccDataType>(0.0f));
     }
     else
     {
-        // ReduceTensorOp_t::ADD
-        // ReduceTensorOp_t::AVG
-        // ReduceTensorOp_t::NORM1
-        // ReduceTensorOp_t::NORM2
+        // ReduceTensorOp::ADD
+        // ReduceTensorOp::AVG
+        // ReduceTensorOp::NORM1
+        // ReduceTensorOp::NORM2
         return (static_cast<AccDataType>(0.0f));
     };
 };
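Taken together, the helpers above split a host-side reduction into a pre-op, an accumulation, and a post-op, all selected at compile time from the renamed ReduceTensorOp. A minimal sketch of that composition for a NORM2 (L2-norm) reduction; it assumes the ck::host_reduce helpers shown in the hunks are visible and is illustrative only, not code from this commit:

    // Illustrative host-side L2-norm using the helpers above.
    // using namespace ck::host_reduce;  (assumed; the helpers live in that namespace)
    float data[4] = {1.0f, -2.0f, 2.0f, 4.0f};

    auto pre_op    = PreUnaryOpFn<float, ReduceTensorOp::NORM2>(0); // a_ = a_ * a_
    auto reduce_op = ReduceOpFn<float, ReduceTensorOp::NORM2>();    // a_ = a_ + b_
    auto post_op   = PosUnaryOpFn<float, ReduceTensorOp::NORM2>(4); // a_ = sqrt(a_)

    float acc = ReduceOpZeroVal<float, ReduceTensorOp::NORM2>();    // 0.0f
    for(float v : data)
    {
        pre_op(v);          // square each element
        reduce_op(acc, v);  // accumulate
    }
    post_op(acc);           // acc == sqrt(1 + 4 + 4 + 16) == 5.0f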
@@ -104,7 +104,7 @@ static size_t get_offset_from_index(const std::vector<size_t>& strides,
 template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
-          ck::ReduceTensorOp_t ReduceOpId,
+          ck::ReduceTensorOp ReduceOpId,
           int Rank,
           int NumReduceDim,
           bool PropagateNan,

@@ -6,7 +6,7 @@
 template <typename TInWei,
           typename TAcc,
           typename TOut,
-          ck::ActivTypeEnum_t activ_type,
+          ck::ActivTypeEnum activ_type,
           typename InLengths,
           typename WeiLengths,
           typename AddLengths,
@@ -231,7 +231,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk(
         TInWei,
         TAcc,
         TOut,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(out_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(in_gemmm_gemmn_grid_desc),

@@ -338,7 +338,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
         TInWei,
         TAcc,
         TOut,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(out_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(wei_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(in_gemmm_gemmn_grid_desc),

@@ -307,7 +307,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
         TInWei,
         TAcc,
         TOut,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(out_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(wei_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(in_gemmm_gemmn_grid_desc),

@@ -171,7 +171,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_atomic_nchw_
         TIn,
         TAcc,
         TWei,
-        InMemoryDataOperationEnum_t::AtomicAdd,
+        InMemoryDataOperationEnum::AtomicAdd,
         decltype(out_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(wei_gemmm_gemmn_grid_desc),

@@ -168,7 +168,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
         TIn,
         TAcc,
         TWei,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(out_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(wei_gemmm_gemmn_grid_desc),

@@ -200,7 +200,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_atomic_nhwc_
         TIn,
         TAcc,
         TWei,
-        InMemoryDataOperationEnum_t::AtomicAdd,
+        InMemoryDataOperationEnum::AtomicAdd,
         decltype(in_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(out_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(wei_gemmm_gemmn_grid_desc),

@@ -199,7 +199,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh
         TIn,
         TAcc,
         TWei,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(in_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(out_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(wei_gemmm_gemmn_grid_desc),

@@ -367,7 +367,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r5_xdlops_atomic_nhwc_
         TIn,
         TAcc,
         TWei,
-        InMemoryDataOperationEnum_t::AtomicAdd,
+        InMemoryDataOperationEnum::AtomicAdd,
         decltype(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(wei_gemmm_gemmn_grid_desc),

@@ -138,7 +138,7 @@ void device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
         TInWei,
         TAcc,
         TOut,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(wei_gemmk_gemmm_grid_desc),
         decltype(in_gemmk_gemmn_grid_desc),
         decltype(out_gemmm_gemmn_grid_desc),

@@ -202,7 +202,7 @@ void device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk(
         TInWei,
         TAcc,
         TOut,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(in_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(wei_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(out_gemmm_gemmn_grid_desc),

@@ -167,7 +167,7 @@ void device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw(
         TInWei,
         TAcc,
         TOut,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(out_gemmm_gemmn_grid_desc),

@@ -522,7 +522,7 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
         TInWei,
         TAcc,
         TOut,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(in_gemmk0_gemmm_gemmk1_grid_desc),
         decltype(wei_gemmk0_gemmn_gemmk1_grid_desc),
         decltype(out_gemmm_gemmn_grid_desc),
@@ -6,7 +6,7 @@
 template <typename TInWei,
           typename TAcc,
           typename TOut,
-          ck::ActivTypeEnum_t activ_type,
+          ck::ActivTypeEnum activ_type,
           typename InLengths,
           typename WeiLengths,
           typename OutLengths,

@@ -182,7 +182,7 @@ void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
         TInWei,
         TAcc,
         TOut,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(wei_grid_desc_gk0_gm0_gm1_gk1),
         decltype(in_grid_desc_gk0_gn0_gn1_gk1),
         decltype(out_grid_desc_gm0_gm1_gn0_gn1),

@@ -6,7 +6,7 @@
 template <typename TInWei,
           typename TAcc,
           typename TOut,
-          ck::ActivTypeEnum_t activ_type,
+          ck::ActivTypeEnum activ_type,
           typename InLengths,
           typename WeiLengths,
           typename MaxLengths,
@@ -398,7 +398,7 @@ void device_gemm_xdlops_km_kn_mn(const Tensor<ABType>& a_k_m,
         ABType,
         AccType,
         CType,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(a_k0_m_k1_grid_desc),
         decltype(b_k0_n_k1_grid_desc),
         decltype(c_m_n_grid_desc),

@@ -202,7 +202,7 @@ void device_gemm_xdlops_km_kn_nm(const Tensor<ABType>& a_k_m,
         ABType,
         AccType,
         CType,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(a_k0_m_k1_grid_desc),
         decltype(b_k0_n_k1_grid_desc),
         decltype(c_m_n_grid_desc),

@@ -398,7 +398,7 @@ void device_gemm_xdlops_km_nk_mn(const Tensor<ABType>& a_k_m,
         ABType,
         AccType,
         CType,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(a_k0_m_k1_grid_desc),
         decltype(b_k0_n_k1_grid_desc),
         decltype(c_m_n_grid_desc),

@@ -202,7 +202,7 @@ void device_gemm_xdlops_km_nk_nm(const Tensor<ABType>& a_k_m,
         ABType,
         AccType,
         CType,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(a_k0_m_k1_grid_desc),
         decltype(b_k0_n_k1_grid_desc),
         decltype(c_m_n_grid_desc),

@@ -398,7 +398,7 @@ void device_gemm_xdlops_mk_kn_mn(const Tensor<ABType>& a_m_k,
         ABType,
         AccType,
         CType,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(a_k0_m_k1_grid_desc),
         decltype(b_k0_n_k1_grid_desc),
         decltype(c_m_n_grid_desc),

@@ -230,7 +230,7 @@ void device_gemm_xdlops_mk_kn_nm(const Tensor<ABType>& a_m_k,
         ABType,
         AccType,
         CType,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(a_k0_m_k1_grid_desc),
         decltype(b_k0_n_k1_grid_desc),
         decltype(c_m_n_grid_desc),

@@ -499,7 +499,7 @@ void device_gemm_xdlops_mk_nk_mn(const Tensor<ABType>& a_m_k,
         ABType,
         AccType,
         CType,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(a_k0_m_k1_grid_desc),
         decltype(b_k0_n_k1_grid_desc),
         decltype(c_m_n_grid_desc),

@@ -286,7 +286,7 @@ void device_gemm_xdlops_mk_nk_nm(const Tensor<ABType>& a_m_k,
         ABType,
         AccType,
         CType,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(a_k0_m_k1_grid_desc),
         decltype(b_k0_n_k1_grid_desc),
         decltype(c_m_n_grid_desc),
@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename AGridDesc_GK0_GM0_GM1_GK1,
           typename BGridDesc_GK0_GN0_GN1_GK1,
           typename CGridDesc_GM0_GM1_GN0_GN1,

@@ -27,7 +27,7 @@ template <ck::index_t BlockSize,
           ck::index_t ABlockTransferDstScalarPerVector_E2,
           ck::index_t BThreadTransferSrcScalarPerVector_E2,
           ck::index_t CThreadTransferDstScalarPerVector_K,
-          ck::ActivTypeEnum_t activ_type>
+          ck::ActivTypeEnum activ_type>
 struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add
 {
     template <typename... Wei,

@@ -294,7 +294,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
         FloatAB,
         FloatAcc,
         FloatC,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(a_e0_e1_k_e2_grid_desc),
         decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
         decltype(c_k_n_hop_wop_grid_desc),

@@ -27,7 +27,7 @@ template <ck::index_t BlockSize,
           ck::index_t ABlockTransferDstScalarPerVector_E2,
           ck::index_t BThreadTransferSrcScalarPerVector_E2,
           ck::index_t CThreadTransferDstScalarPerVector_K,
-          ck::ActivTypeEnum_t activ_type>
+          ck::ActivTypeEnum activ_type>
 struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad
 {
     template <typename... Wei,

@@ -260,7 +260,7 @@
         FloatAB,
         FloatAcc,
         FloatC,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(a_e0_e1_k_e2_grid_desc),
         decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
         decltype(c_k_n_hop_wop_grid_desc),

@@ -27,7 +27,7 @@ template <ck::index_t BlockSize,
           ck::index_t ABlockTransferDstScalarPerVector_E2,
           ck::index_t BThreadTransferSrcScalarPerVector_E2,
           ck::index_t CThreadTransferDstScalarPerVector_K,
-          ck::ActivTypeEnum_t activ_type>
+          ck::ActivTypeEnum activ_type>
 struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool
 {
     template <typename... Wei,

@@ -305,7 +305,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
         FloatAB,
         FloatAcc,
         FloatC,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         decltype(a_e0_e1_k_e2_grid_desc),
         decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
         decltype(c_k_n_hop_wop_grid_desc),
@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename AKMGridDesc,
           typename BKNGridDesc,
           typename CMNGridDesc,

@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename AK0MK1GridDesc,
           typename BK0NK1GridDesc,
           typename CMNGridDesc,

@@ -11,7 +11,7 @@ template <ck::index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename AGridDesc_K0_M_K1,
           typename BGridDesc_K0_N_K,
           typename CMNGridDesc,

@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename ABK0MK1GridDesc,
           typename BBK0NK1GridDesc,
           typename CMNGridDesc,
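In the gridwise GEMM and convolution drivers above, the renamed ck::InMemoryDataOperationEnum template parameter (CGlobalMemoryDataOperation) selects how the computed C tile is written back to global memory: the plain forward and backward-data paths instantiate it with ::Set, while the atomic backward-weight variants use ::AtomicAdd so partial results from different workgroups can be accumulated. A minimal sketch of that kind of compile-time dispatch, assuming nothing about the library's actual store path:

    // Hypothetical illustration only; the real kernels' write-out is far more involved.
    template <ck::InMemoryDataOperationEnum Op, typename T>
    __device__ void write_c_element(T* p_dst, T value)
    {
        if constexpr(Op == ck::InMemoryDataOperationEnum::Set)
            *p_dst = value;           // overwrite the destination
        else if constexpr(Op == ck::InMemoryDataOperationEnum::AtomicAdd)
            atomicAdd(p_dst, value);  // accumulate across workgroups
    }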
@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_blockwise = std::tuple<
     >;
 #endif
 
-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 using deviceReduceBlockWisePtrType = DeviceReducePtr<
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;

@@ -57,9 +57,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_blockwise(
     std::vector<deviceReduceBlockWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
 {

@@ -71,11 +71,11 @@ void add_device_reduce_instance_blockwise(
         AccElementwiseOperation;
 
     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
 
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
 
     static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
         using cfg1 =

@@ -123,15 +123,15 @@ void add_device_reduce_instance_blockwise(
                                 IndicesOpt>( \
         std::vector<deviceReduceBlockWisePtrType<compT, ReduceOpId>> & device_op_instances)
 
-#define ADD_BLOCKWISE_INST_BY_ID( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    ADD_BLOCKWISE_INST_BY_TYPE(inT, \
-                               compT, \
-                               outT, \
-                               static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                               static_cast<NanPropagation_t>(NanOpt), \
-                               static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                               Rank, \
+#define ADD_BLOCKWISE_INST_BY_ID( \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    ADD_BLOCKWISE_INST_BY_TYPE(inT, \
+                               compT, \
+                               outT, \
+                               static_cast<ReduceTensorOp>(ReduceOpId), \
+                               static_cast<NanPropagation>(NanOpt), \
+                               static_cast<ReduceTensorIndices>(IndicesOpt), \
+                               Rank, \
                                NumReduceDim)
 
 #define ADD_BLOCKWISE_INST_REF_BY_TYPE( \

@@ -150,15 +150,15 @@ void add_device_reduce_instance_blockwise(
                                     AccElementwiseOperation>> & \
         device_op_instances)
 
-#define ADD_BLOCKWISE_INST_REF_BY_ID( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \
-                                   compT, \
-                                   outT, \
-                                   static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                   static_cast<NanPropagation_t>(NanOpt), \
-                                   static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                   Rank, \
+#define ADD_BLOCKWISE_INST_REF_BY_ID( \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \
+                                   compT, \
+                                   outT, \
+                                   static_cast<ReduceTensorOp>(ReduceOpId), \
+                                   static_cast<NanPropagation>(NanOpt), \
+                                   static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                   Rank, \
                                    NumReduceDim)
 
 } // namespace device_reduce_instance
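The *_BY_ID macro variants above take plain integer IDs and cast them back to the renamed enums; the casts are required because these are scoped enum classes, which do not convert implicitly from int. A minimal sketch of why the explicit static_cast is needed, with the numeric ID value made up for the example:

    // Illustrative only; assumes <type_traits> is included and the ck enums are visible.
    constexpr auto op = static_cast<ck::ReduceTensorOp>(0); // integer ID -> enumerator
    static_assert(!std::is_convertible<int, ck::ReduceTensorOp>::value,
                  "no implicit int-to-enum conversion, hence the casts in the macros");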
@@ -34,7 +34,7 @@ using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
     >;
 #endif
 
-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 using deviceReduceBlockWiseSecondCallPtrType = DeviceReducePtr<
     typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::InElementwiseOperation,
     typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::AccElementwiseOperation>;

@@ -44,9 +44,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_blockwise_second_call(
     std::vector<deviceReduceBlockWiseSecondCallPtrType<AccDataType, ReduceOpId>>&
         device_op_instances)

@@ -60,11 +60,11 @@ void add_device_reduce_instance_blockwise_second_call(
         AccElementwiseOperation;
 
     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
 
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
 
     static_assert(std::is_same<InDataType, AccDataType>::value,
                   "InDataType and AccDataType should be the same to use "

@@ -117,15 +117,15 @@ void add_device_reduce_instance_blockwise_second_call(
         std::vector<deviceReduceBlockWiseSecondCallPtrType<compT, ReduceOpId>> & \
             device_op_instances)
 
-#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \
-                                           compT, \
-                                           outT, \
-                                           static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                           static_cast<NanPropagation_t>(NanOpt), \
-                                           static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                           Rank, \
+#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID( \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \
+                                           compT, \
+                                           outT, \
+                                           static_cast<ReduceTensorOp>(ReduceOpId), \
+                                           static_cast<NanPropagation>(NanOpt), \
+                                           static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                           Rank, \
                                            NumReduceDim)
 
 #define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE( \

@@ -145,15 +145,15 @@ void add_device_reduce_instance_blockwise_second_call(
                                     AccElementwiseOperation>> & \
         device_op_instances)
 
-#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \
-                                               compT, \
-                                               outT, \
-                                               static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                               static_cast<NanPropagation_t>(NanOpt), \
-                                               static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                               Rank, \
+#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID( \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \
+                                               compT, \
+                                               outT, \
+                                               static_cast<ReduceTensorOp>(ReduceOpId), \
+                                               static_cast<NanPropagation>(NanOpt), \
+                                               static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                               Rank, \
                                                NumReduceDim)
 
 } // namespace device_reduce_instance
@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple<
     >;
 #endif
 
-template <typename AccDataType, ReduceTensorOp_t ReduceOperation>
+template <typename AccDataType, ReduceTensorOp ReduceOperation>
 using deviceReduceMultiBlockAtomicAddPtrType =
     DeviceReducePtr<typename reduce_unary_operator<AccDataType, ReduceOperation, true, true>::
                         InElementwiseOperation,

@@ -59,9 +59,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_multiblock_atomic_add(
     std::vector<deviceReduceMultiBlockAtomicAddPtrType<AccDataType, ReduceOpId>>&
         device_op_instances)

@@ -74,18 +74,18 @@ void add_device_reduce_instance_multiblock_atomic_add(
         AccElementwiseOperation;
 
     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
 
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
 
-    static_assert(IndicesOpt == ReduceTensorIndices_t::NO_INDICES,
+    static_assert(IndicesOpt == ReduceTensorIndices::NO_INDICES,
                   "AtomicAdd can only be used with reduction operations without indices!");
 
     constexpr bool op_acceptable =
-        (ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::MUL ||
-         ReduceOpId == ReduceTensorOp_t::AVG || ReduceOpId == ReduceTensorOp_t::NORM1);
+        (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::MUL ||
+         ReduceOpId == ReduceTensorOp::AVG || ReduceOpId == ReduceTensorOp::NORM1);
 
     constexpr bool out_type_acceptable =
         (std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value);

@@ -144,15 +144,15 @@ void add_device_reduce_instance_multiblock_atomic_add(
         std::vector<deviceReduceMultiBlockAtomicAddPtrType<compT, ReduceOpId>> & \
             device_op_instances)
 
-#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \
-                                           compT, \
-                                           outT, \
-                                           static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                           static_cast<NanPropagation_t>(NanOpt), \
-                                           static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                           Rank, \
+#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \
+                                           compT, \
+                                           outT, \
+                                           static_cast<ReduceTensorOp>(ReduceOpId), \
+                                           static_cast<NanPropagation>(NanOpt), \
+                                           static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                           Rank, \
                                            NumReduceDim)
 
 #define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \

@@ -171,15 +171,15 @@ void add_device_reduce_instance_multiblock_atomic_add(
                                     AccElementwiseOperation>> & \
         device_op_instances)
 
-#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \
-                                               compT, \
-                                               outT, \
-                                               static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                               static_cast<NanPropagation_t>(NanOpt), \
-                                               static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                               Rank, \
+#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \
+                                               compT, \
+                                               outT, \
+                                               static_cast<ReduceTensorOp>(ReduceOpId), \
+                                               static_cast<NanPropagation>(NanOpt), \
+                                               static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                               Rank, \
                                                NumReduceDim)
 
 } // namespace device_reduce_instance
@@ -46,7 +46,7 @@ using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
     >;
 #endif
 
-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 using deviceReduceMultiBlockPartialReducePtrType = DeviceReducePtr<
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::InElementwiseOperation,
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::AccElementwiseOperation>;

@@ -56,9 +56,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_multiblock_partial_reduce(
     std::vector<deviceReduceMultiBlockPartialReducePtrType<AccDataType, ReduceOpId>>&
         device_op_instances)

@@ -72,11 +72,11 @@ void add_device_reduce_instance_multiblock_partial_reduce(
         AccElementwiseOperation;
 
     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
 
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
 
     static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
         using cfg1 =

@@ -126,15 +126,15 @@ void add_device_reduce_instance_multiblock_partial_reduce(
         std::vector<deviceReduceMultiBlockPartialReducePtrType<compT, ReduceOpId>> & \
             device_op_instances)
 
-#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \
-                                               compT, \
-                                               outT, \
-                                               static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                               static_cast<NanPropagation_t>(NanOpt), \
-                                               static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                               Rank, \
+#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID( \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \
+                                               compT, \
+                                               outT, \
+                                               static_cast<ReduceTensorOp>(ReduceOpId), \
+                                               static_cast<NanPropagation>(NanOpt), \
+                                               static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                               Rank, \
                                                NumReduceDim)
 
 #define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE( \

@@ -154,15 +154,15 @@ void add_device_reduce_instance_multiblock_partial_reduce(
                                     AccElementwiseOperation>> & \
         device_op_instances)
 
-#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \
-                                                   compT, \
-                                                   outT, \
-                                                   static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                                   static_cast<NanPropagation_t>(NanOpt), \
-                                                   static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                                   Rank, \
+#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID( \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \
+                                                   compT, \
+                                                   outT, \
+                                                   static_cast<ReduceTensorOp>(ReduceOpId), \
+                                                   static_cast<NanPropagation>(NanOpt), \
+                                                   static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                                   Rank, \
                                                    NumReduceDim)
 
 } // namespace device_reduce_instance
@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
     >;
 #endif
 
-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 using deviceReduceThreadWisePtrType = DeviceReducePtr<
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;

@@ -57,9 +57,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_threadwise(
     std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
 {

@@ -71,11 +71,11 @@ void add_device_reduce_instance_threadwise(
         AccElementwiseOperation;
 
     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
 
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
 
     using cfg1 = ReductionConfiguration_1<256, 256, 1>;

@@ -119,15 +119,15 @@ void add_device_reduce_instance_threadwise(
                                 IndicesOpt>( \
         std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)
 
-#define ADD_THREADWISE_INST_BY_ID( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    ADD_THREADWISE_INST_BY_TYPE(inT, \
-                                compT, \
-                                outT, \
-                                static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                static_cast<NanPropagation_t>(NanOpt), \
-                                static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                Rank, \
+#define ADD_THREADWISE_INST_BY_ID( \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    ADD_THREADWISE_INST_BY_TYPE(inT, \
+                                compT, \
+                                outT, \
+                                static_cast<ReduceTensorOp>(ReduceOpId), \
+                                static_cast<NanPropagation>(NanOpt), \
+                                static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                Rank, \
                                 NumReduceDim)
 
 #define ADD_THREADWISE_INST_REF_BY_TYPE( \

@@ -146,15 +146,15 @@ void add_device_reduce_instance_threadwise(
                                     AccElementwiseOperation>> & \
         device_op_instances)
 
-#define ADD_THREADWISE_INST_REF_BY_ID( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
-    ADD_THREADWISE_INST_REF_BY_TYPE(inT, \
-                                    compT, \
-                                    outT, \
-                                    static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                    static_cast<NanPropagation_t>(NanOpt), \
-                                    static_cast<ReduceTensorIndices_t>(IndicesOpt), \
-                                    Rank, \
+#define ADD_THREADWISE_INST_REF_BY_ID( \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    ADD_THREADWISE_INST_REF_BY_TYPE(inT, \
+                                    compT, \
+                                    outT, \
+                                    static_cast<ReduceTensorOp>(ReduceOpId), \
+                                    static_cast<NanPropagation>(NanOpt), \
+                                    static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                    Rank, \
                                     NumReduceDim)
 
 } // namespace device_reduce_instance