Mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-13 09:45:56 +00:00)
* Initial adding of generic reduction
* Initial adding of generic reduction ...
* Updates to make compiling done
* clang-format all files
* clang-format some files again
* Renaming in profiler/include/profile_reduce.hpp
* Updates and make BlockWise cases passed
* Updates and make ThreadWise and MultiBlockTwoCall cases passed
* Remove the support for MUL and NORM1 reduceOp from the profiler and the device instances
* Change to replace the dim0_max_vector_size/dim1_max_vector_size template argument in the device reduce classes
* format
* adding pooling
* added max and average pooling
* comment out cout and kernel timing
* Tiny simplification in profiler/reduce_profiler.cpp
* Add example for reduce_blockwise
* Tiny updates
* Change to pass the ElementWiseOp from device layer to kernel
* Fix the vectorDim and vectorSize in Device layer
* Enable vector load on both dim0 and dim1 for Threadwise method
* Tiny updates
* Change to let the user to pass the preUnaryOp and posUnaryOp
* Make pooling example work
* split device_reduce_instance into two libraries
* Tiny update
* Replace nanPropaOpt enum by boolean propagate_nan
* Simplification in DeviceReduce layer codes
* update build
* Change to clarify the difference between ck::half_t and half_float::half
* Renaming in all the reduction codes
* Add VectorSize as template parameter for device layer
* Add BetaIsZero as kernel template and as AccDataType for alpha
* print
* Small updates for pooling
* Updates for host_generic_reduction for reference
* Update to make AVG pooling pass
* Update to make MAX pooling with indices output pass
* fix
* add OutDst vector store to threadwise reduction and pooling
* tweak
* turn off check_indices that caused build issue
* refactor pooling
* clean up
* turn off check_indices for building issue for php-compiler
* add more tile size for odd C
* tweak conv for odd C
* update script
* clean up elementwise op
* add hack in reduction_operator.hpp to avoid compile error. To fix it, need to use element_wise_op in reduction op
* Add OutVectorSize as device and kernel tunable, also update to Elementwise Operations
* Move reduce operator mapping to host layer file reduction_operator_mapping.hpp from reduction_operator.hpp
* Change to the unary operators
* Move the definitions of unary operations to element_wise_operation.hpp
* re-org files
* Refine in device interfaces and multiblock kernels
* Split the reduction configurations into instances for specific methods
* Update in getTypeString() of device pool2d
* Renaming in host and kernel
* Tiny update in profiler/src/profiler.cpp
* Uncomment in device_operation/CMakeLists.txt to enable the building of all operations
* Make check_indices a templated function to remove some linking issue
* Renaming in the profiler reduce module
* Add support for double Reduction (but disable MultiblockAtomicAdd for double)
* Tiny correction of literal string
* Rename DevicePoolFwd to DevicePool2dFwd
* Split device_reduce_instance_xxx.cpp files according to the data types to speed up compiling
* Add comments for lists of configurations, lists of instances and references of add_reduce_instances_xxx
* Remove un-used header file gridwise_generic_reduction_wrapper_common.hpp
* Renaming and refining in the Reduction codes
* Tiny change in the unary operators
* Renaming symbols and files
* Renaming symbols in the kernels
* Move kernel kernel_set_buffer_value to separate file
* Add IndexDataType template parameter for kernels and use int32_t as index data type in device layer
* Tiny update in the kernels
* Remove definition of sqrtf()/isnan()/abs() for half_t due to some ADL issue
* Simplify a helper function in device layer
* Tiny adjustment in testing data initialization
* Renaming in kernel/device/host
* Add two testing scripts for reduction
* Refine the Unary operators in element_wise_operation.hpp
* Update in the reduce profiler module
* Update to the reduction testing scripts
* reduce compile parallelism
* change CI docker to rocm5.0
* remove unused variables
* fix build

Co-authored-by: Chao Liu <chao.liu2@amd.com>
356 lines
15 KiB
C++
#ifndef DEVICE_REDUCE_THREADWISE_HPP
#define DEVICE_REDUCE_THREADWISE_HPP

#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_reduce.hpp"
#include "device_reduce_common.hpp"
#include "gridwise_2d_reduction_threadwise.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          index_t Rank,
          typename ReduceDims,
          typename ReduceOperation,
          typename InElementwiseOperation,
          typename OutElementwiseOperation,
          bool PropagateNan,
          bool NeedIndices,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          index_t OutDstVectorSize>
struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutElementwiseOperation>
{
    static_assert(Rank <= 6, "Bigger Rank size is not supported!");
    static_assert((BlockSize == MThreadClusterSize) && (KThreadClusterSize == 1),
                  "Threadwise can only be called with KThreadClusterSize be 1 !");

    using IndexDataType = int32_t;

    static constexpr bool BetaIsZero = NeedIndices;

    using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());

    static constexpr index_t srcDims = Rank;
    static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
    static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);

    static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
                                    const std::vector<int>& inStrides)
    {
        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});

        const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);

        const auto in_grid_desc_m_k = [&]() {
            if constexpr(reduceAllDims)
            {
                const auto one_dim_inDesc = transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(tupleSrcLengths)),
                    make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
                    make_tuple(Sequence<0>{}));

                return transform_tensor_descriptor(one_dim_inDesc,
                                                   make_tuple(make_unmerge_transform(make_tuple(
                                                       1, one_dim_inDesc.GetLength(Number<0>{})))),
                                                   make_tuple(Sequence<0>{}),
                                                   make_tuple(Sequence<0, 1>{}));
            }
            else
            {
                const auto toReduceDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                const auto invariantDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});

                return transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(invariantDimLengths),
                               make_merge_transform(toReduceDimLengths)),
                    make_tuple(InvariantDims{}, ReduceDims{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));
            }
        }();

        const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
        const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});

        const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
        const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;

        auto in_grid_desc_m_k_padded =
            transform_tensor_descriptor(in_grid_desc_m_k,
                                        make_tuple(make_right_pad_transform(outerLen, inPad_M),
                                                   make_right_pad_transform(innerLen, inPad_K)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (in_grid_desc_m_k_padded);
    };

    static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
                                    const std::vector<int>& outStrides)
    {
        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});

        auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

        auto out_grid_desc_m = transform_tensor_descriptor(
            outDesc,
            make_tuple(make_merge_transform(tupleDstLengths)),
            make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
            make_tuple(Sequence<0>{}));

        const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});

        const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;

        auto out_grid_desc_m_padded =
            transform_tensor_descriptor(out_grid_desc_m,
                                        make_tuple(make_right_pad_transform(outerLen, outPad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));
        return (out_grid_desc_m_padded);
    };

    struct Argument : public BaseArgument
    {
        Argument(const std::vector<int>& inLengths,
                 const std::vector<int>& inStrides,
                 const std::vector<int>& outLengths,
                 const std::vector<int>& outStrides,
                 float alpha,
                 float beta,
                 const InDataType* in_dev,
                 OutDataType* out_dev,
                 IndexDataType* out_indices_dev,
                 AccDataType* workspace_dev,
                 const InElementwiseOperation& in_elementwise_op,
                 const OutElementwiseOperation& acc_elementwise_op)
            : in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev}
        {
            (void)workspace_dev;

            inLengths_  = inLengths;
            inStrides_  = inStrides;
            outLengths_ = outLengths;
            outStrides_ = outStrides;

            in_elementwise_op_  = in_elementwise_op;
            acc_elementwise_op_ = acc_elementwise_op;

            alpha_ = static_cast<AccDataType>(alpha);
            beta_  = static_cast<OutDataType>(beta);

            std::tie(invariant_total_length, reduce_total_length) =
                get_2d_lengths<Rank, ReduceDims>(inLengths);

            if constexpr(InvariantDims::Size() == 0)
                invariant_lowest_length = 1;
            else
                invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];

            reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];

            gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                       M_BlockTileSize;
        }

        std::vector<int> inLengths_;
        std::vector<int> inStrides_;
        std::vector<int> outLengths_;
        std::vector<int> outStrides_;

        AccDataType alpha_;
        OutDataType beta_;

        const InDataType* in_dev_;
        OutDataType* out_dev_;
        IndexDataType* out_indices_dev_;

        InElementwiseOperation in_elementwise_op_;
        OutElementwiseOperation acc_elementwise_op_;

        int invariant_lowest_length;
        int reduce_lowest_length;
        size_t invariant_total_length;
        size_t reduce_total_length;

        size_t gridSize;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, int nrepeat = 1)
        {
            const auto in_grid_desc_m_k =
                DeviceReduceThreadWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
            const auto out_grid_desc_m =
                DeviceReduceThreadWise::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_);
            using InGridDesc_M_K = decltype(in_grid_desc_m_k);
            using OutGridDesc_M  = decltype(out_grid_desc_m);

            using GridwiseReduce = GridwiseReduction_mk_to_m_threadwise<InDataType,
                                                                        OutDataType,
                                                                        AccDataType,
                                                                        IndexDataType,
                                                                        InGridDesc_M_K,
                                                                        OutGridDesc_M,
                                                                        ReduceOperation,
                                                                        InElementwiseOperation,
                                                                        OutElementwiseOperation,
                                                                        PropagateNan,
                                                                        BetaIsZero,
                                                                        BlockSize,
                                                                        MThreadClusterSize,
                                                                        KThreadClusterSize,
                                                                        MThreadSliceSize,
                                                                        KThreadSliceSize,
                                                                        InSrcVectorDim,
                                                                        InSrcVectorSize,
                                                                        OutDstVectorSize>;

            float avg_time = 0;

            const auto kernel = kernel_reduce_threadwise<GridwiseReduce,
                                                         NeedIndices,
                                                         InDataType,
                                                         OutDataType,
                                                         AccDataType,
                                                         IndexDataType,
                                                         InGridDesc_M_K,
                                                         OutGridDesc_M,
                                                         InElementwiseOperation,
                                                         OutElementwiseOperation>;

            avg_time = launch_and_time_kernel(kernel,
                                              nrepeat,
                                              dim3(arg.gridSize),
                                              dim3(BlockSize),
                                              0,
                                              in_grid_desc_m_k,
                                              out_grid_desc_m,
                                              arg.in_elementwise_op_,
                                              arg.acc_elementwise_op_,
                                              arg.alpha_,
                                              arg.in_dev_,
                                              arg.beta_,
                                              arg.out_dev_,
                                              arg.out_indices_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
        };
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        if constexpr(InSrcVectorDim == 0)
        {
            if constexpr(InvariantDims::Size() == 0)
                return (false);

            if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
                return (false);

            if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
                return (false);
        }
        else
        {
            if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
                return (false);

            if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
                return (false);
        };

        // To improve
        if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
            return (false);

        // TODO: remove this. Should return true as long as this DeviceOp instance supports the
        // case; for larger reduce_total_length, the BlockWise method is expected to give better
        // performance.
        if(pArg->reduce_total_length / KThreadSliceSize >= 32)
            return (false);

        return (true);
    };

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<int>& inLengths,
                        const std::vector<int>& inStrides,
                        const std::vector<int>& outLengths,
                        const std::vector<int>& outStrides,
                        float alpha,
                        float beta,
                        const void* in_dev,
                        void* out_dev,
                        void* out_indices_dev,
                        void* workspace_dev,
                        const InElementwiseOperation& in_elementwise_op,
                        const OutElementwiseOperation& acc_elementwise_op) override
    {
        return std::make_unique<Argument>(inLengths,
                                          inStrides,
                                          outLengths,
                                          outStrides,
                                          alpha,
                                          beta,
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev),
                                          static_cast<IndexDataType*>(out_indices_dev),
                                          static_cast<AccDataType*>(workspace_dev),
                                          in_elementwise_op,
                                          acc_elementwise_op);
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
str << "DeviceReducceThreadWise<" << BlockSize << ",";
|
|
str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
|
|
str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
|
|
str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
|
|
// clang-format on
|
|
|
|
return str.str();
|
|
}
|
|
};
|
|
|
|
} // namespace device
|
|
} // namespace tensor_operation
|
|
} // namespace ck
|
|
#endif
|
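
For context, a minimal host-side usage sketch of the interface above (not part of this header). The MakeArgumentPointer / IsSupportedArgument / MakeInvokerPointer flow and the template parameter order are taken directly from DeviceReduceThreadWise; the operator type names (ck::reduce::Add, element_wise::PassThrough), the include paths, and the tuning numbers are assumptions based on the reduction example and operator headers mentioned in the commit log, so treat them as placeholders rather than the library's exact API.

// Hypothetical usage sketch -- reduce a rank-4 float tensor over dims {1, 2, 3}.
// Assumed names: ck::reduce::Add, ck::tensor_operation::element_wise::PassThrough, and the
// include paths; only the DeviceReduceThreadWise interface itself comes from this header.
#include <vector>
#include "device_reduce_threadwise.hpp"
#include "reduction_operator.hpp"
#include "element_wise_operation.hpp"

using PassThrough = ck::tensor_operation::element_wise::PassThrough; // assumed name

using DeviceOp = ck::tensor_operation::device::DeviceReduceThreadWise<
    float,                  // InDataType
    float,                  // AccDataType
    float,                  // OutDataType
    4,                      // Rank
    ck::Sequence<1, 2, 3>,  // ReduceDims
    ck::reduce::Add<float>, // ReduceOperation (assumed name)
    PassThrough,            // InElementwiseOperation
    PassThrough,            // OutElementwiseOperation
    false,                  // PropagateNan
    false,                  // NeedIndices
    256,                    // BlockSize (must equal MThreadClusterSize)
    256,                    // MThreadClusterSize
    1,                      // KThreadClusterSize (must be 1 for the threadwise method)
    1,                      // MThreadSliceSize
    1,                      // KThreadSliceSize
    1,                      // InSrcVectorDim
    1,                      // InSrcVectorSize
    1>;                     // OutDstVectorSize

float run_reduce(const std::vector<int>& inLengths,
                 const std::vector<int>& inStrides,
                 const std::vector<int>& outLengths,
                 const std::vector<int>& outStrides,
                 const float* in_dev, // device pointers allocated and filled elsewhere
                 float* out_dev)
{
    auto device_op = DeviceOp{};

    // alpha = 1, beta = 0; no indices output and no workspace are needed for a plain sum.
    auto arg_ptr = device_op.MakeArgumentPointer(inLengths, inStrides, outLengths, outStrides,
                                                 1.0f, 0.0f,
                                                 in_dev, out_dev,
                                                 nullptr, nullptr,
                                                 PassThrough{}, PassThrough{});

    if(!device_op.IsSupportedArgument(arg_ptr.get()))
        return -1.0f; // fall back to a blockwise/multiblock instance instead

    auto invoker_ptr = device_op.MakeInvokerPointer();
    return invoker_ptr->Run(arg_ptr.get(), /*nrepeat=*/1); // returns averaged kernel time
}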