mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-13 01:36:06 +00:00
* Initial adding of generic reduction * Initial adding of generic reduction ... * Updates to make compiling done * clang-format all files * clang-format some files again * Renaming in profiler/include/profile_reduce.hpp * Updates and make BlockWise cases passed * Updates and make ThreadWise and MultiBlockTwoCall cases passed * Remove the support for MUL and NORM1 reduceOp from the profiler and the device instances * Change to replace the dim0_max_vector_size/dim1_max_vector_size template argument in the device reduce classes * format * adding pooling * added max and average pooling * comment out cout and kernel timing * Tiny simplification in profiler/reduce_profiler.cpp * Add example for reduce_blockwise * Tiny updates * Change to pass the ElementWiseOp from device layer to kernel * Fix the vectorDim and vectorSize in Device layer * Enable vector load on both dim0 and dim1 for Threadwise method * Tiny updates * Change to let the user to pass the preUnaryOp and posUnaryOp * Make pooling example work * split device_reduce_instance into two libraries * Tiny update * Replace nanPropaOpt enum by boolean propagate_nan * Simplification in DeviceReduce layer codes * update build * Change to clarify the difference between ck::half_t and half_float::half * Renaming in all the reduction codes * Add VectorSize as template parameter for device layer * Add BetaIsZero as kernel template and as AccDataType for alpha * print * Small updates for pooling * Updates for host_generic_reduction for reference * Update to make AVG pooling pass * Update to make MAX pooling with indices output pass * fix * add OutDst vector store to threadwise reduction and pooling * tweak * turn off check_indices that caused build issue * refactor pooling * clean up * turn off check_indices for building issue for php-compiler * add more tile size for odd C * tweak conv for odd C * update script * clean up elementwise op * add hack in reduction_operator.hpp to avoid compile error. 
To fix it, need to use element_wise_op in reduction op * Add OutVectorSize as device and kernel tunable, also update to Elementwise Operations * Move reduce operator mapping to host layer file reduction_operator_mapping.hpp from reduction_operator.hpp * Change to the unary operators * Move the definitions of unary operations to element_wise_operation.hpp * re-org files * Refine in device interfaces and multiblock kernels * Split the reduction configurations into instances for specific methods * Update in getTypeString() of device pool2d * Renaming in host and kernel * Tiny update in profiler/src/profiler.cpp * Uncomment in device_operation/CMakeLists.txt to enable the building of all operations * Make check_indices a templated function to remove some linking issue * Renaming in the profiler reduce module * Add support for double Reduction (but disable MultiblockAtomicAdd for double) * Tiny correction of literal string * Rename DevicePoolFwd to DevicePool2dFwd * Split device_reduce_instance_xxx.cpp files according to the data types to speed up compiling * Add comments for lists of configurations, lists of instances and references of add_reduce_instances_xxx * Remove un-used header file gridwise_generic_reduction_wrapper_common.hpp * Renaming and refining in the Reduction codes * Tiny change in the unary operators * Renaming symbols and files * Renaming symbols in the kernels * Move kernel kernel_set_buffer_value to separate file * Add IndexDataType template parameter for kernels and use int32_t as index data type in device layer * Tiny update in the kernels * Remove definition of sqrtf()/isnan()/abs() for half_t due to some ADL issue * Simplify a helper function in device layer * Tiny adjustment in testing data initialization * Renaming in kernel/device/host * Add two testing scripts for reduction * Refine the Unary operators in element_wise_operation.hpp * Update in the reduce profiler module * Update to the reduction testing scripts * reduce compile parallelism * 
change CI docker to rocm5.0 * remove unused variables * fix build Co-authored-by: Chao Liu <chao.liu2@amd.com>
165 lines
9.0 KiB
C++
165 lines
9.0 KiB
C++
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
|
|
#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
|
|
|
|
#include "reduction_operator_mapping.hpp"
|
|
#include "device_reduce_instance_impl_common.hpp"
|
|
#include "device_reduce_threadwise.hpp"
|
|
|
|
namespace ck {
|
|
namespace tensor_operation {
|
|
namespace device {
|
|
namespace device_reduce_instance {
|
|
|
|
// Tuning-parameter sets (cfg2) tried for the threadwise reduction method.
// Each ReductionConfiguration_2 is:
//   <InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, MThreadSliceSize, KThreadSliceSize>
// When QUICK_REDUCE_TEST is defined, only a small subset is instantiated to
// keep test builds fast; otherwise the full tuning space below is used.
#ifdef QUICK_REDUCE_TEST
using reduce_configuration_2_instances_threadwise = std::tuple<
    // clang-format off
    // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
    ReductionConfiguration_2<0, 2, 2, 2, 1>,
    ReductionConfiguration_2<0, 1, 1, 2, 1>,
    ReductionConfiguration_2<1, 2, 1, 1, 2>,
    ReductionConfiguration_2<1, 2, 2, 1, 2>,
    ReductionConfiguration_2<0, 1, 1, 3, 1>,
    ReductionConfiguration_2<1, 1, 1, 1, 3>
    // clang-format on
    >;
#else
using reduce_configuration_2_instances_threadwise = std::tuple<
    // clang-format off
    // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
    ReductionConfiguration_2<0, 4, 4, 8, 1>,
    ReductionConfiguration_2<0, 4, 4, 4, 1>,
    ReductionConfiguration_2<0, 2, 2, 2, 1>,

    ReductionConfiguration_2<1, 4, 1, 1, 8>,
    ReductionConfiguration_2<1, 4, 1, 1, 4>,
    ReductionConfiguration_2<1, 2, 1, 1, 2>,

    // special instances
    // NOTE(review): odd slice sizes (3/5/7/11) with vector size 1 —
    // presumably for tensor lengths not divisible by a power of two; confirm.
    ReductionConfiguration_2<0, 1, 1, 3, 1>,
    ReductionConfiguration_2<0, 1, 1, 5, 1>,
    ReductionConfiguration_2<0, 1, 1, 7, 1>,
    ReductionConfiguration_2<0, 1, 1, 11, 1>,

    ReductionConfiguration_2<1, 1, 1, 1, 3>,
    ReductionConfiguration_2<1, 1, 1, 1, 5>,
    ReductionConfiguration_2<1, 1, 1, 1, 7>,
    ReductionConfiguration_2<1, 1, 1, 1, 11>
    // clang-format on
    >;
#endif
|
|
|
|
// Type-erased handle for a threadwise reduction instance. The element-wise
// operation types are derived from the reduce-operation id via
// reduce_unary_operator.
// NOTE(review): the two 'true' flags are the last two template arguments of
// reduce_unary_operator — their exact meaning is defined in
// reduction_operator_mapping.hpp; confirm there before changing them.
template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
using deviceReduceThreadWisePtrType = DeviceReducePtr<
    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
|
|
|
|
template <typename InDataType,
|
|
typename AccDataType,
|
|
typename OutDataType,
|
|
int Rank,
|
|
typename ReduceDims,
|
|
ReduceTensorOp_t ReduceOpId,
|
|
NanPropagation_t NanOpt,
|
|
ReduceTensorIndices_t IndicesOpt>
|
|
void add_device_reduce_instance_threadwise(
|
|
std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
|
|
{
|
|
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
|
|
using InElementwiseOperation =
|
|
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
|
|
using AccElementwiseOperation =
|
|
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
|
|
AccElementwiseOperation;
|
|
|
|
constexpr bool Indexable =
|
|
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
|
|
ReduceOpId == ReduceTensorOp_t::AMAX);
|
|
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
|
|
|
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
|
|
|
|
using cfg1 = ReductionConfiguration_1<256, 256, 1>;
|
|
|
|
static_for<0, std::tuple_size<reduce_configuration_2_instances_threadwise>::value, 1>{}(
|
|
[&](auto j) {
|
|
using cfg2 = remove_cvref_t<decltype(
|
|
std::get<j.value>(reduce_configuration_2_instances_threadwise{}))>;
|
|
|
|
using ReduceOpInstance = DeviceReduceThreadWise<InDataType,
|
|
AccDataType,
|
|
OutDataType,
|
|
Rank,
|
|
ReduceDims,
|
|
ReduceOperation,
|
|
InElementwiseOperation,
|
|
AccElementwiseOperation,
|
|
PropagateNan,
|
|
NeedIndices,
|
|
cfg1::BlockSize_,
|
|
cfg1::MThreadClusterSize_,
|
|
cfg1::KThreadClusterSize_,
|
|
cfg2::MThreadSliceSize_,
|
|
cfg2::KThreadSliceSize_,
|
|
cfg2::InSrcVectorDim_,
|
|
cfg2::InSrcVectorSize_,
|
|
cfg2::OutDstVectorSize_>;
|
|
|
|
device_op_instances.push_back(std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
|
|
});
|
|
};
|
|
|
|
// Explicitly instantiate add_device_reduce_instance_threadwise for the given
// input/accumulation/output types, reduce operation and options; the reduced
// dimensions are the trailing variadic arguments, packed into a Sequence<>.
// Emitted in a .cpp so other TUs can link against the instantiation.
#define ADD_THREADWISE_INST_BY_TYPE(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
    template void add_device_reduce_instance_threadwise<inT,                                     \
                                                        compT,                                   \
                                                        outT,                                    \
                                                        Rank,                                    \
                                                        Sequence<__VA_ARGS__>,                   \
                                                        ReduceOpId,                              \
                                                        NanOpt,                                  \
                                                        IndicesOpt>(                             \
        std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)
|
|
|
|
// Convenience wrapper over ADD_THREADWISE_INST_BY_TYPE that accepts plain
// integer ids for the reduce operation / NaN / indices options and casts them
// to the corresponding enum types.
#define ADD_THREADWISE_INST_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)  \
    ADD_THREADWISE_INST_BY_TYPE(inT,                                                           \
                                compT,                                                         \
                                outT,                                                          \
                                static_cast<ReduceTensorOp_t>(ReduceOpId),                     \
                                static_cast<NanPropagation_t>(NanOpt),                         \
                                static_cast<ReduceTensorIndices_t>(IndicesOpt),                \
                                Rank,                                                          \
                                __VA_ARGS__)
|
|
|
|
// Declare (extern template) the explicit instantiation produced elsewhere by
// ADD_THREADWISE_INST_BY_TYPE, so including TUs reference it instead of
// re-instantiating. The parameter type now uses the deviceReduceThreadWisePtrType
// alias — the identical DeviceReducePtr<...> type the original spelled out
// longhand — for consistency with ADD_THREADWISE_INST_BY_TYPE above.
#define ADD_THREADWISE_INST_REF_BY_TYPE(                                                          \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                                  \
    extern template void add_device_reduce_instance_threadwise<inT,                               \
                                                               compT,                             \
                                                               outT,                              \
                                                               Rank,                              \
                                                               Sequence<__VA_ARGS__>,             \
                                                               ReduceOpId,                        \
                                                               NanOpt,                            \
                                                               IndicesOpt>(                       \
        std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)
|
|
|
|
// Convenience wrapper over ADD_THREADWISE_INST_REF_BY_TYPE that accepts plain
// integer ids for the reduce operation / NaN / indices options and casts them
// to the corresponding enum types.
#define ADD_THREADWISE_INST_REF_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
    ADD_THREADWISE_INST_REF_BY_TYPE(inT,                                                           \
                                    compT,                                                         \
                                    outT,                                                          \
                                    static_cast<ReduceTensorOp_t>(ReduceOpId),                     \
                                    static_cast<NanPropagation_t>(NanOpt),                         \
                                    static_cast<ReduceTensorIndices_t>(IndicesOpt),                \
                                    Rank,                                                          \
                                    __VA_ARGS__)
|
|
|
|
} // namespace device_reduce_instance
|
|
} // namespace device
|
|
} // namespace tensor_operation
|
|
|
|
} // namespace ck
|
|
|
|
#endif
|