mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-03 13:11:25 +00:00
Pr82 followup (#115)
* Use thread cluster descriptor and explicit M_K 2d descriptor to simplify Blockwise Reduction * Change by replacing ReduceDims with NumReduceDims as the Device Reduce interface template parameter * Rename the folder name for the pool2d and reduce examples * Update to reduction test scripts * Add Readme for pool2d_fwd and reduce_blockwise examples * Tiny fix in reduce profiler and tiny update in reduce testing scripts * Tiny fix in testing script profile_reduce_no_index.sh * Tiny change in script/profile_reduce_with_index.sh * Renaming and refining in Reduction profiler/device layer/examples * Renaming and refining in Reduction profiler/device layer/examples * Renaming all NumReduceDims to NumReduceDim
This commit is contained in:
@@ -36,14 +36,15 @@ struct DeviceReduce : public BaseOperator
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
const std::vector<int>& reduceDims,
|
||||
float alpha,
|
||||
float beta,
|
||||
const void* in_dev,
|
||||
void* out_dev,
|
||||
void* out_indices_dev,
|
||||
void* workspace_dev,
|
||||
const InElementwiseOperation& inElementwiseOp,
|
||||
const AccElementwiseOperation& accElementwiseOp) = 0;
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const AccElementwiseOperation& acc_elementwise_op) = 0;
|
||||
|
||||
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
|
||||
};
|
||||
|
||||
@@ -15,8 +15,8 @@ namespace device {
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
typename ReduceDims,
|
||||
index_t Rank,
|
||||
index_t NumReduceDim,
|
||||
typename ReduceOperation,
|
||||
typename InElementwiseOperation,
|
||||
typename AccElementwiseOperation,
|
||||
@@ -40,7 +40,12 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
|
||||
|
||||
static constexpr bool BetaIsZero = NeedIndices;
|
||||
|
||||
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
|
||||
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
|
||||
using InvariantDims =
|
||||
typename conditional<NumInvariantDim == 0,
|
||||
Sequence<>,
|
||||
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
|
||||
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
|
||||
|
||||
static constexpr index_t srcDims = Rank;
|
||||
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
|
||||
@@ -74,7 +79,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto toReduceDimLengths =
|
||||
const auto reduceDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
|
||||
const auto invariantDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
|
||||
@@ -82,7 +87,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
|
||||
return transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(invariantDimLengths),
|
||||
make_merge_transform(toReduceDimLengths)),
|
||||
make_merge_transform(reduceDimLengths)),
|
||||
make_tuple(InvariantDims{}, ReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
}
|
||||
@@ -136,6 +141,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
const std::vector<int>& reduceDims,
|
||||
float alpha,
|
||||
float beta,
|
||||
const InDataType* in_dev,
|
||||
@@ -144,30 +150,31 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
|
||||
AccDataType* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const AccElementwiseOperation& acc_elementwise_op)
|
||||
: in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev}
|
||||
: outLengths_{outLengths},
|
||||
outStrides_{outStrides},
|
||||
in_dev_{in_dev},
|
||||
out_dev_{out_dev},
|
||||
out_indices_dev_{out_indices_dev},
|
||||
in_elementwise_op_{in_elementwise_op},
|
||||
acc_elementwise_op_{acc_elementwise_op}
|
||||
{
|
||||
(void)workspace_dev;
|
||||
|
||||
inLengths_ = inLengths;
|
||||
inStrides_ = inStrides;
|
||||
outLengths_ = outLengths;
|
||||
outStrides_ = outStrides;
|
||||
|
||||
in_elementwise_op_ = in_elementwise_op;
|
||||
acc_elementwise_op_ = acc_elementwise_op;
|
||||
std::tie(inLengths_, inStrides_) =
|
||||
shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
|
||||
|
||||
alpha_ = static_cast<AccDataType>(alpha);
|
||||
beta_ = static_cast<OutDataType>(beta);
|
||||
|
||||
std::tie(invariant_total_length, reduce_total_length) =
|
||||
get_2d_lengths<Rank, ReduceDims>(inLengths);
|
||||
get_2d_lengths<Rank, ReduceDims>(inLengths_);
|
||||
|
||||
if constexpr(InvariantDims::Size() == 0)
|
||||
invariant_lowest_length = 1;
|
||||
else
|
||||
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
|
||||
invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
|
||||
|
||||
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
|
||||
reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
|
||||
|
||||
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
|
||||
M_BlockTileSize;
|
||||
@@ -305,6 +312,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
const std::vector<int>& reduceDims,
|
||||
float alpha,
|
||||
float beta,
|
||||
const void* in_dev,
|
||||
@@ -318,6 +326,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
|
||||
inStrides,
|
||||
outLengths,
|
||||
outStrides,
|
||||
reduceDims,
|
||||
alpha,
|
||||
beta,
|
||||
static_cast<const InDataType*>(in_dev),
|
||||
|
||||
@@ -15,8 +15,8 @@ namespace device {
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
typename ReduceDims,
|
||||
index_t Rank,
|
||||
index_t NumReduceDim,
|
||||
typename ReduceOperation,
|
||||
typename InElementwiseOperation,
|
||||
typename AccElementwiseOperation,
|
||||
@@ -45,7 +45,11 @@ struct DeviceReduceBlockWiseSecondCall
|
||||
std::is_same<InDataType, AccDataType>::value,
|
||||
"InDataType and AccDataType should be the same to use DEviceReduceBlockWiseSecondCall!");
|
||||
|
||||
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
|
||||
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
|
||||
using InvariantDims =
|
||||
typename conditional<NumInvariantDim == 0,
|
||||
Sequence<>,
|
||||
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
|
||||
|
||||
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
|
||||
|
||||
@@ -117,16 +121,16 @@ struct DeviceReduceBlockWiseSecondCall
|
||||
AccDataType* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const AccElementwiseOperation& acc_elementwise_op)
|
||||
: in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev}
|
||||
: inLengths_(inLengths),
|
||||
inStrides_(inStrides),
|
||||
outLengths_(outLengths),
|
||||
outStrides_(outStrides),
|
||||
in_dev_{in_dev},
|
||||
out_dev_{out_dev},
|
||||
out_indices_dev_{out_indices_dev},
|
||||
in_elementwise_op_(in_elementwise_op),
|
||||
acc_elementwise_op_(acc_elementwise_op)
|
||||
{
|
||||
inLengths_ = inLengths;
|
||||
inStrides_ = inStrides;
|
||||
outLengths_ = outLengths;
|
||||
outStrides_ = outStrides;
|
||||
|
||||
in_elementwise_op_ = in_elementwise_op;
|
||||
acc_elementwise_op_ = acc_elementwise_op;
|
||||
|
||||
alpha_ = static_cast<AccDataType>(alpha);
|
||||
beta_ = static_cast<OutDataType>(beta);
|
||||
|
||||
@@ -268,6 +272,7 @@ struct DeviceReduceBlockWiseSecondCall
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
const std::vector<int>& reduceDims,
|
||||
float alpha,
|
||||
float beta,
|
||||
const void* in_dev,
|
||||
@@ -277,6 +282,8 @@ struct DeviceReduceBlockWiseSecondCall
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const AccElementwiseOperation& acc_elementwise_op) override
|
||||
{
|
||||
(void)reduceDims;
|
||||
|
||||
return std::make_unique<Argument>(inLengths,
|
||||
inStrides,
|
||||
outLengths,
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#define DEVICE_REDUCE_COMMON_HPP
|
||||
|
||||
#include <vector>
|
||||
#include <cassert>
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "reduction_enums.hpp"
|
||||
@@ -40,23 +41,6 @@ constexpr bool belong()
|
||||
return (inside);
|
||||
};
|
||||
|
||||
// Compile-time helper: recursively walks the dimension indices start, start + 1, ..., Rank - 1
// and returns a Sequence<> holding, in ascending order, every index for which
// belong<index, ReduceDims>() is false. The recursion ends with an empty Sequence<> once
// start >= Rank.
// Rank is limited to 6 by the static_assert below.
// NOTE(review): belong<> is defined outside this view; presumably it tests whether the index
// is one of the ReduceDims entries -- confirm against its definition.
template <int Rank, typename ReduceDims, int start = 0>
|
||||
constexpr auto get_invariant_dims()
|
||||
{
|
||||
static_assert(Rank <= 6, "bigger Rank size not supported!");
|
||||
|
||||
if constexpr(start >= Rank)
|
||||
return Sequence<>{};
|
||||
else
|
||||
{
|
||||
if constexpr(!belong<start, ReduceDims>())
|
||||
return merge_sequences(Sequence<start>{},
|
||||
get_invariant_dims<Rank, ReduceDims, start + 1>());
|
||||
else
|
||||
return get_invariant_dims<Rank, ReduceDims, start + 1>();
|
||||
};
|
||||
};
|
||||
|
||||
// helper functions using variadic template arguments
|
||||
template <index_t... Ns>
|
||||
static auto make_tuple_from_array_and_index_seq(const std::vector<int>& lengths, Sequence<Ns...>)
|
||||
@@ -74,6 +58,45 @@ static auto make_tuple_from_array(const std::vector<int>& lengths, Number<arrayS
|
||||
return make_tuple_from_array_and_index_seq(lengths, index_seq);
|
||||
};
|
||||
|
||||
template <index_t Rank, index_t NumReduceDim>
|
||||
static inline std::pair<std::vector<int>, std::vector<int>>
|
||||
shuffle_tensor_dimensions(const std::vector<int>& dimLengths,
|
||||
const std::vector<int>& dimStrides,
|
||||
const std::vector<int>& reduceDims)
|
||||
{
|
||||
std::vector<int> newDimLengths;
|
||||
std::vector<int> newDimStrides;
|
||||
|
||||
assert(Rank == dimLengths.size() && Rank == dimStrides.size() &&
|
||||
NumReduceDim == reduceDims.size());
|
||||
|
||||
int reduceFlag = 0;
|
||||
|
||||
// flag the bits for the reduceDims
|
||||
for(int i = 0; i < NumReduceDim; i++)
|
||||
{
|
||||
reduceFlag |= 1 << reduceDims[i];
|
||||
};
|
||||
|
||||
// collect invariant dimensions
|
||||
for(int i = 0; i < Rank; i++)
|
||||
if((reduceFlag & (1 << i)) == 0)
|
||||
{
|
||||
newDimLengths.push_back(dimLengths[i]);
|
||||
newDimStrides.push_back(dimStrides[i]);
|
||||
};
|
||||
|
||||
// collect reduce dimensions
|
||||
for(int i = 0; i < Rank; i++)
|
||||
if((reduceFlag & (1 << i)) > 0)
|
||||
{
|
||||
newDimLengths.push_back(dimLengths[i]);
|
||||
newDimStrides.push_back(dimStrides[i]);
|
||||
};
|
||||
|
||||
return std::make_pair(newDimLengths, newDimStrides);
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
|
||||
@@ -17,8 +17,8 @@ namespace device {
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
typename ReduceDims,
|
||||
index_t Rank,
|
||||
index_t NumReduceDim,
|
||||
typename ReduceOperation,
|
||||
typename InElementwiseOperation,
|
||||
typename AccElementwiseOperation,
|
||||
@@ -41,7 +41,12 @@ struct DeviceReduceMultiBlockAtomicAdd
|
||||
|
||||
using IndexDataType = int32_t;
|
||||
|
||||
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
|
||||
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
|
||||
using InvariantDims =
|
||||
typename conditional<NumInvariantDim == 0,
|
||||
Sequence<>,
|
||||
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
|
||||
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
|
||||
|
||||
static constexpr index_t srcDims = Rank;
|
||||
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
|
||||
@@ -84,7 +89,7 @@ struct DeviceReduceMultiBlockAtomicAdd
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto toReduceDimLengths =
|
||||
const auto reduceDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
|
||||
const auto invariantDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
|
||||
@@ -92,7 +97,7 @@ struct DeviceReduceMultiBlockAtomicAdd
|
||||
return transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(invariantDimLengths),
|
||||
make_merge_transform(toReduceDimLengths)),
|
||||
make_merge_transform(reduceDimLengths)),
|
||||
make_tuple(InvariantDims{}, ReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
}
|
||||
@@ -147,6 +152,7 @@ struct DeviceReduceMultiBlockAtomicAdd
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
const std::vector<int>& reduceDims,
|
||||
float alpha,
|
||||
float beta,
|
||||
const InDataType* in_dev,
|
||||
@@ -155,31 +161,31 @@ struct DeviceReduceMultiBlockAtomicAdd
|
||||
AccDataType* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const AccElementwiseOperation& acc_elementwise_op)
|
||||
: in_dev_{in_dev}, out_dev_{out_dev}
|
||||
: outLengths_{outLengths},
|
||||
outStrides_{outStrides},
|
||||
in_dev_{in_dev},
|
||||
out_dev_{out_dev},
|
||||
in_elementwise_op_{in_elementwise_op},
|
||||
acc_elementwise_op_{acc_elementwise_op}
|
||||
{
|
||||
(void)out_indices_dev;
|
||||
(void)workspace_dev;
|
||||
|
||||
inLengths_ = inLengths;
|
||||
inStrides_ = inStrides;
|
||||
outLengths_ = outLengths;
|
||||
outStrides_ = outStrides;
|
||||
|
||||
in_elementwise_op_ = in_elementwise_op;
|
||||
acc_elementwise_op_ = acc_elementwise_op;
|
||||
std::tie(inLengths_, inStrides_) =
|
||||
shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
|
||||
|
||||
alpha_ = static_cast<AccDataType>(alpha);
|
||||
beta_ = static_cast<OutDataType>(beta);
|
||||
|
||||
std::tie(invariant_total_length, reduce_total_length) =
|
||||
get_2d_lengths<Rank, ReduceDims>(inLengths);
|
||||
get_2d_lengths<Rank, ReduceDims>(inLengths_);
|
||||
|
||||
if constexpr(InvariantDims::Size() == 0)
|
||||
invariant_lowest_length = 1;
|
||||
else
|
||||
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
|
||||
invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
|
||||
|
||||
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
|
||||
reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
|
||||
|
||||
int iterations = 1;
|
||||
while(true)
|
||||
@@ -369,6 +375,7 @@ struct DeviceReduceMultiBlockAtomicAdd
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
const std::vector<int>& reduceDims,
|
||||
float alpha,
|
||||
float beta,
|
||||
const void* in_dev,
|
||||
@@ -382,6 +389,7 @@ struct DeviceReduceMultiBlockAtomicAdd
|
||||
inStrides,
|
||||
outLengths,
|
||||
outStrides,
|
||||
reduceDims,
|
||||
alpha,
|
||||
beta,
|
||||
static_cast<const InDataType*>(in_dev),
|
||||
|
||||
@@ -15,8 +15,8 @@ namespace device {
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
typename ReduceDims,
|
||||
index_t Rank,
|
||||
index_t NumReduceDim,
|
||||
typename ReduceOperation,
|
||||
typename InElementwiseOperation,
|
||||
typename AccElementwiseOperation,
|
||||
@@ -41,7 +41,12 @@ struct DeviceReduceMultiBlockPartialReduce
|
||||
|
||||
using IndexDataType = int32_t;
|
||||
|
||||
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
|
||||
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
|
||||
using InvariantDims =
|
||||
typename conditional<NumInvariantDim == 0,
|
||||
Sequence<>,
|
||||
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
|
||||
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
|
||||
|
||||
static constexpr index_t srcDims = Rank;
|
||||
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
|
||||
@@ -112,7 +117,7 @@ struct DeviceReduceMultiBlockPartialReduce
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto toReduceDimLengths =
|
||||
const auto reduceDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
|
||||
const auto invariantDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
|
||||
@@ -120,7 +125,7 @@ struct DeviceReduceMultiBlockPartialReduce
|
||||
return transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(invariantDimLengths),
|
||||
make_merge_transform(toReduceDimLengths)),
|
||||
make_merge_transform(reduceDimLengths)),
|
||||
make_tuple(InvariantDims{}, ReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
}
|
||||
@@ -161,10 +166,11 @@ struct DeviceReduceMultiBlockPartialReduce
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const std::vector<index_t>& inLengths,
|
||||
const std::vector<index_t>& inStrides,
|
||||
const std::vector<index_t>& outLengths,
|
||||
const std::vector<index_t>& outStrides,
|
||||
Argument(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
const std::vector<int>& reduceDims,
|
||||
float alpha,
|
||||
float beta,
|
||||
const InDataType* in_dev,
|
||||
@@ -173,31 +179,30 @@ struct DeviceReduceMultiBlockPartialReduce
|
||||
AccDataType* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const AccElementwiseOperation& acc_elementwise_op)
|
||||
: in_dev_{in_dev},
|
||||
: outLengths_{outLengths},
|
||||
outStrides_{outStrides},
|
||||
in_dev_{in_dev},
|
||||
out_dev_{out_dev},
|
||||
out_indices_dev_{out_indices_dev},
|
||||
workspace_dev_{workspace_dev}
|
||||
workspace_dev_{workspace_dev},
|
||||
in_elementwise_op_{in_elementwise_op},
|
||||
acc_elementwise_op_{acc_elementwise_op}
|
||||
{
|
||||
inLengths_ = inLengths;
|
||||
inStrides_ = inStrides;
|
||||
outLengths_ = outLengths;
|
||||
outStrides_ = outStrides;
|
||||
|
||||
in_elementwise_op_ = in_elementwise_op;
|
||||
acc_elementwise_op_ = acc_elementwise_op;
|
||||
std::tie(inLengths_, inStrides_) =
|
||||
shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
|
||||
|
||||
alpha_ = static_cast<AccDataType>(alpha);
|
||||
beta_ = static_cast<OutDataType>(beta);
|
||||
|
||||
std::tie(invariant_total_length, reduce_total_length) =
|
||||
get_2d_lengths<Rank, ReduceDims>(inLengths);
|
||||
get_2d_lengths<Rank, ReduceDims>(inLengths_);
|
||||
|
||||
if constexpr(InvariantDims::Size() == 0)
|
||||
invariant_lowest_length = 1;
|
||||
else
|
||||
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
|
||||
invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
|
||||
|
||||
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
|
||||
reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
|
||||
|
||||
int iterations = 1;
|
||||
while(true)
|
||||
@@ -370,6 +375,7 @@ struct DeviceReduceMultiBlockPartialReduce
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
const std::vector<int>& reduceDims,
|
||||
float alpha,
|
||||
float beta,
|
||||
const void* in_dev,
|
||||
@@ -383,6 +389,7 @@ struct DeviceReduceMultiBlockPartialReduce
|
||||
inStrides,
|
||||
outLengths,
|
||||
outStrides,
|
||||
reduceDims,
|
||||
alpha,
|
||||
beta,
|
||||
static_cast<const InDataType*>(in_dev),
|
||||
|
||||
@@ -16,7 +16,7 @@ template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
index_t Rank,
|
||||
typename ReduceDims,
|
||||
index_t NumReduceDim,
|
||||
typename ReduceOperation,
|
||||
typename InElementwiseOperation,
|
||||
typename OutElementwiseOperation,
|
||||
@@ -40,7 +40,12 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
|
||||
|
||||
static constexpr bool BetaIsZero = NeedIndices;
|
||||
|
||||
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
|
||||
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
|
||||
using InvariantDims =
|
||||
typename conditional<NumInvariantDim == 0,
|
||||
Sequence<>,
|
||||
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
|
||||
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
|
||||
|
||||
static constexpr index_t srcDims = Rank;
|
||||
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
|
||||
@@ -74,7 +79,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto toReduceDimLengths =
|
||||
const auto reduceDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
|
||||
const auto invariantDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
|
||||
@@ -82,7 +87,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
|
||||
return transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(invariantDimLengths),
|
||||
make_merge_transform(toReduceDimLengths)),
|
||||
make_merge_transform(reduceDimLengths)),
|
||||
make_tuple(InvariantDims{}, ReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
}
|
||||
@@ -136,6 +141,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
const std::vector<int>& reduceDims,
|
||||
float alpha,
|
||||
float beta,
|
||||
const InDataType* in_dev,
|
||||
@@ -144,30 +150,32 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
|
||||
AccDataType* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const OutElementwiseOperation& acc_elementwise_op)
|
||||
: in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev}
|
||||
: outLengths_{outLengths},
|
||||
outStrides_{outStrides},
|
||||
in_dev_{in_dev},
|
||||
out_dev_{out_dev},
|
||||
out_indices_dev_{out_indices_dev},
|
||||
in_elementwise_op_{in_elementwise_op},
|
||||
acc_elementwise_op_{acc_elementwise_op}
|
||||
|
||||
{
|
||||
(void)workspace_dev;
|
||||
|
||||
inLengths_ = inLengths;
|
||||
inStrides_ = inStrides;
|
||||
outLengths_ = outLengths;
|
||||
outStrides_ = outStrides;
|
||||
|
||||
in_elementwise_op_ = in_elementwise_op;
|
||||
acc_elementwise_op_ = acc_elementwise_op;
|
||||
std::tie(inLengths_, inStrides_) =
|
||||
shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
|
||||
|
||||
alpha_ = static_cast<AccDataType>(alpha);
|
||||
beta_ = static_cast<OutDataType>(beta);
|
||||
|
||||
std::tie(invariant_total_length, reduce_total_length) =
|
||||
get_2d_lengths<Rank, ReduceDims>(inLengths);
|
||||
get_2d_lengths<Rank, ReduceDims>(inLengths_);
|
||||
|
||||
if constexpr(InvariantDims::Size() == 0)
|
||||
invariant_lowest_length = 1;
|
||||
else
|
||||
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
|
||||
invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
|
||||
|
||||
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
|
||||
reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
|
||||
|
||||
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
|
||||
M_BlockTileSize;
|
||||
@@ -306,6 +314,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
const std::vector<int>& reduceDims,
|
||||
float alpha,
|
||||
float beta,
|
||||
const void* in_dev,
|
||||
@@ -319,6 +328,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
|
||||
inStrides,
|
||||
outLengths,
|
||||
outStrides,
|
||||
reduceDims,
|
||||
alpha,
|
||||
beta,
|
||||
static_cast<const InDataType*>(in_dev),
|
||||
|
||||
Reference in New Issue
Block a user