mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-20 21:09:08 +00:00
* Initial adding of generic reduction
* Initial adding of generic reduction ...
* Updates to make compiling done
* clang-format all files
* clang-format some files again
* Renaming in profiler/include/profile_reduce.hpp
* Updates and make BlockWise cases passed
* Updates and make ThreadWise and MultiBlockTwoCall cases passed
* Remove the support for MUL and NORM1 reduceOp from the profiler and the device instances
* Change to replace the dim0_max_vector_size/dim1_max_vector_size template argument in the device reduce classes
* format
* adding pooling
* added max and average pooling
* comment out cout and kernel timing
* Tiny simplification in profiler/reduce_profiler.cpp
* Add example for reduce_blockwise
* Tiny updates
* Change to pass the ElementWiseOp from device layer to kernel
* Fix the vectorDim and vectorSize in Device layer
* Enable vector load on both dim0 and dim1 for Threadwise method
* Tiny updates
* Change to let the user to pass the preUnaryOp and posUnaryOp
* Make pooling example work
* split device_reduce_instance into two libraries
* Tiny update
* Replace nanPropaOpt enum by boolean propagate_nan
* Simplification in DeviceReduce layer codes
* update build
* Change to clarify the difference between ck::half_t and half_float::half
* Renaming in all the reduction codes
* Add VectorSize as template parameter for device layer
* Add BetaIsZero as kernel template and as AccDataType for alpha
* print
* Small updates for pooling
* Updates for host_generic_reduction for reference
* Update to make AVG pooling pass
* Update to make MAX pooling with indices output pass
* fix
* add OutDst vector store to threadwise reduction and pooling
* tweak
* turn off check_indices that caused build issue
* refactor pooling
* clean up
* turn off check_indices for building issue for php-compiler
* add more tile size for odd C
* tweak conv for odd C
* update script
* clean up elementwise op
* add hack in reduction_operator.hpp to avoid compile error. To fix it, need to use element_wise_op in reduction op
* Add OutVectorSize as device and kernel tunable, also update to Elementwise Operations
* Move reduce operator mapping to host layer file reduction_operator_mapping.hpp from reduction_operator.hpp
* Change to the unary operators
* Move the definitions of unary operations to element_wise_operation.hpp
* re-org files
* Refine in device interfaces and multiblock kernels
* Split the reduction configurations into instances for specific methods
* Update in getTypeString() of device pool2d
* Renaming in host and kernel
* Tiny update in profiler/src/profiler.cpp
* Uncomment in device_operation/CMakeLists.txt to enable the building of all operations
* Make check_indices a templated function to remove some linking issue
* Renaming in the profiler reduce module
* Add support for double Reduction (but disable MultiblockAtomicAdd for double)
* Tiny correction of literal string
* Rename DevicePoolFwd to DevicePool2dFwd
* Split device_reduce_instance_xxx.cpp files according to the data types to speed up compiling
* Add comments for lists of configurations, lists of instances and references of add_reduce_instances_xxx
* Remove un-used header file gridwise_generic_reduction_wrapper_common.hpp
* Renaming and refining in the Reduction codes
* Tiny change in the unary operators
* Renaming symbols and files
* Renaming symbols in the kernels
* Move kernel kernel_set_buffer_value to separate file
* Add IndexDataType template parameter for kernels and use int32_t as index data type in device layer
* Tiny update in the kernels
* Remove definition of sqrtf()/isnan()/abs() for half_t due to some ADL issue
* Simplify a helper function in device layer
* Tiny adjustment in testing data initialization
* Renaming in kernel/device/host
* Add two testing scripts for reduction
* Refine the Unary operators in element_wise_operation.hpp
* Update in the reduce profiler module
* Update to the reduction testing scripts
* reduce compile parallelism
* change CI docker to rocm5.0
* remove unused variables
* fix build
Co-authored-by: Chao Liu <chao.liu2@amd.com>
[ROCm/composable_kernel commit: e17c0d8008]
420 lines
18 KiB
C++
420 lines
18 KiB
C++
#ifndef DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
|
|
#define DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
|
|
|
|
#include <iostream>
|
|
#include <sstream>
|
|
#include "device.hpp"
|
|
#include "device_reduce.hpp"
|
|
#include "device_reduce_common.hpp"
|
|
#include "gridwise_2d_reduction_multiblock_partial_reduce.hpp"
|
|
|
|
namespace ck {
|
|
namespace tensor_operation {
|
|
namespace device {
|
|
|
|
template <typename InDataType,
|
|
typename AccDataType,
|
|
typename OutDataType,
|
|
int Rank,
|
|
typename ReduceDims,
|
|
typename ReduceOperation,
|
|
typename InElementwiseOperation,
|
|
typename AccElementwiseOperation,
|
|
bool PropagateNan,
|
|
bool NeedIndices,
|
|
index_t BlockSize,
|
|
index_t MThreadClusterSize,
|
|
index_t KThreadClusterSize,
|
|
index_t MThreadSliceSize,
|
|
index_t KThreadSliceSize,
|
|
index_t InSrcVectorDim,
|
|
index_t InSrcVectorSize,
|
|
index_t OutDstVectorSize>
|
|
struct DeviceReduceMultiBlockPartialReduce
|
|
: public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
|
|
{
|
|
static_assert(Rank <= 6, "Bigger Rank size is not supported!");
|
|
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
|
|
"Invalid thread cluster size assignments!");
|
|
|
|
static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!");
|
|
|
|
using IndexDataType = int32_t;
|
|
|
|
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
|
|
|
|
static constexpr index_t srcDims = Rank;
|
|
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
|
|
static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
|
|
|
|
static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
|
|
static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
|
|
|
|
size_t GetWorkspaceSizeInBytes(const std::vector<int>& inLengths) override
|
|
{
|
|
size_t invariant_total_length;
|
|
size_t reduce_total_length;
|
|
|
|
std::tie(invariant_total_length, reduce_total_length) =
|
|
get_2d_lengths<Rank, ReduceDims>(inLengths);
|
|
|
|
int iterations = 1;
|
|
while(true)
|
|
{
|
|
int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
|
|
(K_BlockTileSize * iterations);
|
|
|
|
// we want the blkGroupSize be not more than 128
|
|
if(testBlkGroupSize <= 128)
|
|
break;
|
|
|
|
iterations++;
|
|
};
|
|
|
|
int blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
|
|
(K_BlockTileSize * iterations);
|
|
|
|
size_t workspace_size = invariant_total_length * blkGroupSize;
|
|
|
|
size_t wsSizeInBytes =
|
|
!NeedIndices ? workspace_size * sizeof(AccDataType)
|
|
: workspace_size * (sizeof(AccDataType) + sizeof(int)) + 64 + sizeof(int);
|
|
|
|
return (wsSizeInBytes);
|
|
};
|
|
|
|
bool HasFurtherCall() override { return (true); };
|
|
|
|
static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
|
|
const std::vector<int>& inStrides,
|
|
int blkGroupSize,
|
|
int kBlockTileIterations)
|
|
{
|
|
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
|
|
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
|
|
|
|
const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
|
|
|
const auto in_grid_desc_m_k = [&]() {
|
|
if constexpr(reduceAllDims)
|
|
{
|
|
const auto one_dim_inDesc = transform_tensor_descriptor(
|
|
inDesc,
|
|
make_tuple(make_merge_transform(tupleSrcLengths)),
|
|
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
|
|
make_tuple(Sequence<0>{}));
|
|
|
|
return transform_tensor_descriptor(one_dim_inDesc,
|
|
make_tuple(make_unmerge_transform(make_tuple(
|
|
1, one_dim_inDesc.GetLength(Number<0>{})))),
|
|
make_tuple(Sequence<0>{}),
|
|
make_tuple(Sequence<0, 1>{}));
|
|
}
|
|
else
|
|
{
|
|
const auto toReduceDimLengths =
|
|
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
|
|
const auto invariantDimLengths =
|
|
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
|
|
|
|
return transform_tensor_descriptor(
|
|
inDesc,
|
|
make_tuple(make_merge_transform(invariantDimLengths),
|
|
make_merge_transform(toReduceDimLengths)),
|
|
make_tuple(InvariantDims{}, ReduceDims{}),
|
|
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
|
}
|
|
}();
|
|
|
|
const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
|
|
const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
|
|
|
|
const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations;
|
|
const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
|
|
const auto inPad_K = reduceSizePerBlock * blkGroupSize - innerLen;
|
|
|
|
auto in_grid_desc_m_k_padded =
|
|
transform_tensor_descriptor(in_grid_desc_m_k,
|
|
make_tuple(make_right_pad_transform(outerLen, inPad_M),
|
|
make_right_pad_transform(innerLen, inPad_K)),
|
|
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
|
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
|
|
|
return (in_grid_desc_m_k_padded);
|
|
};
|
|
|
|
static auto MakeWorkspace2dDescriptor(int outerLen, int blkGroupSize)
|
|
{
|
|
auto ws_desc_m_k = make_naive_tensor_descriptor_packed(make_tuple(outerLen, blkGroupSize));
|
|
|
|
const auto wsPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
|
|
|
|
auto ws_desc_m_k_padded =
|
|
transform_tensor_descriptor(ws_desc_m_k,
|
|
make_tuple(make_right_pad_transform(outerLen, wsPad),
|
|
make_pass_through_transform(blkGroupSize)),
|
|
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
|
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
|
|
|
return (ws_desc_m_k_padded);
|
|
};
|
|
|
|
struct Argument : public BaseArgument
|
|
{
|
|
Argument(const std::vector<index_t>& inLengths,
|
|
const std::vector<index_t>& inStrides,
|
|
const std::vector<index_t>& outLengths,
|
|
const std::vector<index_t>& outStrides,
|
|
float alpha,
|
|
float beta,
|
|
const InDataType* in_dev,
|
|
OutDataType* out_dev,
|
|
IndexDataType* out_indices_dev,
|
|
AccDataType* workspace_dev,
|
|
const InElementwiseOperation& in_elementwise_op,
|
|
const AccElementwiseOperation& acc_elementwise_op)
|
|
: in_dev_{in_dev},
|
|
out_dev_{out_dev},
|
|
out_indices_dev_{out_indices_dev},
|
|
workspace_dev_{workspace_dev}
|
|
{
|
|
inLengths_ = inLengths;
|
|
inStrides_ = inStrides;
|
|
outLengths_ = outLengths;
|
|
outStrides_ = outStrides;
|
|
|
|
in_elementwise_op_ = in_elementwise_op;
|
|
acc_elementwise_op_ = acc_elementwise_op;
|
|
|
|
alpha_ = static_cast<AccDataType>(alpha);
|
|
beta_ = static_cast<OutDataType>(beta);
|
|
|
|
std::tie(invariant_total_length, reduce_total_length) =
|
|
get_2d_lengths<Rank, ReduceDims>(inLengths);
|
|
|
|
if constexpr(InvariantDims::Size() == 0)
|
|
invariant_lowest_length = 1;
|
|
else
|
|
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
|
|
|
|
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
|
|
|
|
int iterations = 1;
|
|
while(true)
|
|
{
|
|
int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
|
|
(K_BlockTileSize * iterations);
|
|
|
|
// we want the blkGroupSize be not more than 128
|
|
if(testBlkGroupSize <= 128)
|
|
break;
|
|
|
|
iterations++;
|
|
};
|
|
|
|
blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
|
|
(K_BlockTileSize * iterations);
|
|
|
|
kBlockTileIterations = iterations;
|
|
|
|
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
|
|
M_BlockTileSize * blkGroupSize;
|
|
|
|
size_t ws_buf2_bytes_offset = math::integer_least_multiple(
|
|
invariant_total_length * blkGroupSize * sizeof(AccDataType), 64);
|
|
|
|
if constexpr(NeedIndices)
|
|
workspace_indices_dev_ = reinterpret_cast<int*>(
|
|
reinterpret_cast<char*>(workspace_dev_) + ws_buf2_bytes_offset);
|
|
else
|
|
workspace_indices_dev_ = nullptr;
|
|
}
|
|
|
|
std::vector<int> inLengths_;
|
|
std::vector<int> inStrides_;
|
|
std::vector<int> outLengths_;
|
|
std::vector<int> outStrides_;
|
|
|
|
AccDataType alpha_;
|
|
OutDataType beta_;
|
|
|
|
const InDataType* in_dev_;
|
|
OutDataType* out_dev_;
|
|
IndexDataType* out_indices_dev_;
|
|
AccDataType* workspace_dev_;
|
|
IndexDataType* workspace_indices_dev_;
|
|
|
|
InElementwiseOperation in_elementwise_op_;
|
|
AccElementwiseOperation acc_elementwise_op_;
|
|
|
|
int invariant_lowest_length;
|
|
int reduce_lowest_length;
|
|
size_t invariant_total_length;
|
|
size_t reduce_total_length;
|
|
|
|
index_t blkGroupSize;
|
|
index_t kBlockTileIterations;
|
|
size_t gridSize;
|
|
};
|
|
|
|
struct Invoker : public BaseInvoker
|
|
{
|
|
float Run(const Argument& arg, int nrepeat = 1)
|
|
{
|
|
const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor(
|
|
arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
|
|
const auto ws_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeWorkspace2dDescriptor(
|
|
arg.invariant_total_length, arg.blkGroupSize);
|
|
using InGridDesc_M_K = decltype(in_grid_desc_m_k);
|
|
using WorkspaceDesc_M_K = decltype(ws_desc_m_k);
|
|
|
|
using GridwiseReduce =
|
|
GridwiseReduction_mk_to_mk_multiblock_partial_reduce<InDataType,
|
|
AccDataType,
|
|
IndexDataType,
|
|
InGridDesc_M_K,
|
|
WorkspaceDesc_M_K,
|
|
ReduceOperation,
|
|
InElementwiseOperation,
|
|
AccElementwiseOperation,
|
|
PropagateNan,
|
|
BlockSize,
|
|
MThreadClusterSize,
|
|
KThreadClusterSize,
|
|
MThreadSliceSize,
|
|
KThreadSliceSize,
|
|
InSrcVectorDim,
|
|
InSrcVectorSize,
|
|
OutDstVectorSize>;
|
|
|
|
float avg_time = 0;
|
|
|
|
const auto kernel = kernel_partial_reduce_multiblock<GridwiseReduce,
|
|
NeedIndices,
|
|
InDataType,
|
|
AccDataType,
|
|
IndexDataType,
|
|
InGridDesc_M_K,
|
|
WorkspaceDesc_M_K,
|
|
InElementwiseOperation,
|
|
AccElementwiseOperation>;
|
|
|
|
avg_time = launch_and_time_kernel(kernel,
|
|
nrepeat,
|
|
dim3(arg.gridSize),
|
|
dim3(BlockSize),
|
|
0,
|
|
in_grid_desc_m_k,
|
|
ws_desc_m_k,
|
|
arg.in_elementwise_op_,
|
|
arg.acc_elementwise_op_,
|
|
arg.blkGroupSize,
|
|
arg.kBlockTileIterations,
|
|
arg.in_dev_,
|
|
arg.workspace_dev_,
|
|
arg.workspace_indices_dev_);
|
|
|
|
return (avg_time);
|
|
};
|
|
|
|
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
|
{
|
|
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
|
};
|
|
};
|
|
|
|
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
|
{
|
|
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
|
|
|
if constexpr(OutDstVectorSize != 1)
|
|
return (false);
|
|
|
|
if constexpr(InSrcVectorDim == 0)
|
|
{
|
|
if constexpr(InvariantDims::Size() == 0)
|
|
return (false);
|
|
|
|
if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
|
|
return (false);
|
|
|
|
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
|
|
return (false);
|
|
}
|
|
else
|
|
{
|
|
if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
|
|
return (false);
|
|
|
|
if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
|
|
return (false);
|
|
};
|
|
|
|
// cases with small reduce_total_length should be handled by the BlockWise method
|
|
if(pArg->reduce_total_length <= BlockSize * KThreadSliceSize)
|
|
return (false);
|
|
|
|
return (true);
|
|
};
|
|
|
|
std::vector<int> GetWorkspace2dLengths(const BaseArgument* p_arg) override
|
|
{
|
|
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
|
|
|
return (
|
|
std::vector<int>{static_cast<int>(pArg->invariant_total_length), pArg->blkGroupSize});
|
|
};
|
|
|
|
std::unique_ptr<BaseArgument>
|
|
MakeArgumentPointer(const std::vector<int>& inLengths,
|
|
const std::vector<int>& inStrides,
|
|
const std::vector<int>& outLengths,
|
|
const std::vector<int>& outStrides,
|
|
float alpha,
|
|
float beta,
|
|
const void* in_dev,
|
|
void* out_dev,
|
|
void* out_indices_dev,
|
|
void* workspace_dev,
|
|
const InElementwiseOperation& in_elementwise_op,
|
|
const AccElementwiseOperation& acc_elementwise_op) override
|
|
{
|
|
return std::make_unique<Argument>(inLengths,
|
|
inStrides,
|
|
outLengths,
|
|
outStrides,
|
|
alpha,
|
|
beta,
|
|
static_cast<const InDataType*>(in_dev),
|
|
static_cast<OutDataType*>(out_dev),
|
|
static_cast<IndexDataType*>(out_indices_dev),
|
|
static_cast<AccDataType*>(workspace_dev),
|
|
in_elementwise_op,
|
|
acc_elementwise_op);
|
|
};
|
|
|
|
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
|
|
{
|
|
return std::make_unique<Invoker>();
|
|
};
|
|
|
|
std::string GetTypeString() const override
|
|
{
|
|
auto str = std::stringstream();
|
|
|
|
// clang-format off
|
|
str << "DeviceReduceMultiBlockPartialReduce<" << BlockSize << ",";
|
|
str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
|
|
str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
|
|
str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
|
|
// clang-format on
|
|
|
|
return str.str();
|
|
}
|
|
};
|
|
|
|
} // namespace device
|
|
} // namespace tensor_operation
|
|
} // namespace ck
|
|
#endif
|