mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 10:59:55 +00:00
* Initial adding of generic reduction
* Initial adding of generic reduction ...
* Updates to make compiling done
* clang-format all files
* clang-format some files again
* Renaming in profiler/include/profile_reduce.hpp
* Updates and make BlockWise cases passed
* Updates and make ThreadWise and MultiBlockTwoCall cases passed
* Remove the support for MUL and NORM1 reduceOp from the profiler and the device instances
* Change to replace the dim0_max_vector_size/dim1_max_vector_size template argument in the device reduce classes
* format
* adding pooling
* added max and average pooling
* comment out cout and kernel timing
* Tiny simplification in profiler/reduce_profiler.cpp
* Add example for reduce_blockwise
* Tiny updates
* Change to pass the ElementWiseOp from device layer to kernel
* Fix the vectorDim and vectorSize in Device layer
* Enable vector load on both dim0 and dim1 for Threadwise method
* Tiny updates
* Change to let the user pass the preUnaryOp and posUnaryOp
* Make pooling example work
* split device_reduce_instance into two libraries
* Tiny update
* Replace nanPropaOpt enum by boolean propagate_nan
* Simplification in DeviceReduce layer codes
* update build
* Change to clarify the difference between ck::half_t and half_float::half
* Renaming in all the reduction codes
* Add VectorSize as template parameter for device layer
* Add BetaIsZero as kernel template and as AccDataType for alpha
* print
* Small updates for pooling
* Updates for host_generic_reduction for reference
* Update to make AVG pooling pass
* Update to make MAX pooling with indices output pass
* fix
* add OutDst vector store to threadwise reduction and pooling
* tweak
* turn off check_indices that caused build issue
* refactor pooling
* clean up
* turn off check_indices for building issue for php-compiler
* add more tile size for odd C
* tweak conv for odd C
* update script
* clean up elementwise op
* add hack in reduction_operator.hpp to avoid compile error. To fix it, need to use element_wise_op in reduction op
* Add OutVectorSize as device and kernel tunable, also update to Elementwise Operations
* Move reduce operator mapping to host layer file reduction_operator_mapping.hpp from reduction_operator.hpp
* Change to the unary operators
* Move the definitions of unary operations to element_wise_operation.hpp
* re-org files
* Refine in device interfaces and multiblock kernels
* Split the reduction configurations into instances for specific methods
* Update in getTypeString() of device pool2d
* Renaming in host and kernel
* Tiny update in profiler/src/profiler.cpp
* Uncomment in device_operation/CMakeLists.txt to enable the building of all operations
* Make check_indices a templated function to remove some linking issue
* Renaming in the profiler reduce module
* Add support for double Reduction (but disable MultiblockAtomicAdd for double)
* Tiny correction of literal string
* Rename DevicePoolFwd to DevicePool2dFwd
* Split device_reduce_instance_xxx.cpp files according to the data types to speed up compiling
* Add comments for lists of configurations, lists of instances and references of add_reduce_instances_xxx
* Remove un-used header file gridwise_generic_reduction_wrapper_common.hpp
* Renaming and refining in the Reduction codes
* Tiny change in the unary operators
* Renaming symbols and files
* Renaming symbols in the kernels
* Move kernel kernel_set_buffer_value to separate file
* Add IndexDataType template parameter for kernels and use int32_t as index data type in device layer
* Tiny update in the kernels
* Remove definition of sqrtf()/isnan()/abs() for half_t due to some ADL issue
* Simplify a helper function in device layer
* Tiny adjustment in testing data initialization
* Renaming in kernel/device/host
* Add two testing scripts for reduction
* Refine the Unary operators in element_wise_operation.hpp
* Update in the reduce profiler module
* Update to the reduction testing scripts
* reduce compile parallelism
* change CI docker to rocm5.0
* remove unused variables
* fix build
Co-authored-by: Chao Liu <chao.liu2@amd.com>
[ROCm/composable_kernel commit: e17c0d8008]
165 lines
9.0 KiB
C++
165 lines
9.0 KiB
C++
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
|
|
#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
|
|
|
|
#include "reduction_operator_mapping.hpp"
|
|
#include "device_reduce_instance_impl_common.hpp"
|
|
#include "device_reduce_threadwise.hpp"
|
|
|
|
namespace ck {
|
|
namespace tensor_operation {
|
|
namespace device {
|
|
namespace device_reduce_instance {
|
|
|
|
#ifdef QUICK_REDUCE_TEST
// Reduced list of per-thread tuning configurations, used to cut instantiation
// count (and hence compile time) when quick-testing the reduction profiler.
using reduce_configuration_2_instances_threadwise = std::tuple<
    // clang-format off
    // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
    ReductionConfiguration_2<0, 2, 2, 2, 1>,
    ReductionConfiguration_2<0, 1, 1, 2, 1>,
    ReductionConfiguration_2<1, 2, 1, 1, 2>,
    ReductionConfiguration_2<1, 2, 2, 1, 2>,
    ReductionConfiguration_2<0, 1, 1, 3, 1>,
    ReductionConfiguration_2<1, 1, 1, 1, 3>
    // clang-format on
    >;
#else
// Full list of per-thread tuning configurations for the threadwise reduction
// method. Each entry becomes one DeviceReduceThreadWise instance in
// add_device_reduce_instance_threadwise.
using reduce_configuration_2_instances_threadwise = std::tuple<
    // clang-format off
    // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
    ReductionConfiguration_2<0, 4, 4, 8, 1>,
    ReductionConfiguration_2<0, 4, 4, 4, 1>,
    ReductionConfiguration_2<0, 2, 2, 2, 1>,

    ReductionConfiguration_2<1, 4, 1, 1, 8>,
    ReductionConfiguration_2<1, 4, 1, 1, 4>,
    ReductionConfiguration_2<1, 2, 1, 1, 2>,

    // special instances
    // NOTE(review): the odd slice sizes (3/5/7/11) with vector size 1 appear
    // intended to cover lengths not divisible by a power of two (see the
    // "add more tile size for odd C" history above) — confirm with maintainers.
    ReductionConfiguration_2<0, 1, 1, 3, 1>,
    ReductionConfiguration_2<0, 1, 1, 5, 1>,
    ReductionConfiguration_2<0, 1, 1, 7, 1>,
    ReductionConfiguration_2<0, 1, 1, 11, 1>,

    ReductionConfiguration_2<1, 1, 1, 1, 3>,
    ReductionConfiguration_2<1, 1, 1, 1, 5>,
    ReductionConfiguration_2<1, 1, 1, 1, 7>,
    ReductionConfiguration_2<1, 1, 1, 1, 11>
    // clang-format on
    >;
#endif
|
|
|
|
// Convenience alias: the abstract device-op pointer type for a threadwise
// reduction, with the input/accumulator elementwise-operation types derived
// from the reduction operation id via reduce_unary_operator.
// NOTE(review): the two 'true' template arguments presumably select
// is-first-call/is-last-call behavior of the unary operators — confirm against
// reduction_operator_mapping.hpp.
template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
using deviceReduceThreadWisePtrType = DeviceReducePtr<
    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
|
|
|
|
template <typename InDataType,
|
|
typename AccDataType,
|
|
typename OutDataType,
|
|
int Rank,
|
|
typename ReduceDims,
|
|
ReduceTensorOp_t ReduceOpId,
|
|
NanPropagation_t NanOpt,
|
|
ReduceTensorIndices_t IndicesOpt>
|
|
void add_device_reduce_instance_threadwise(
|
|
std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
|
|
{
|
|
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
|
|
using InElementwiseOperation =
|
|
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
|
|
using AccElementwiseOperation =
|
|
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
|
|
AccElementwiseOperation;
|
|
|
|
constexpr bool Indexable =
|
|
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
|
|
ReduceOpId == ReduceTensorOp_t::AMAX);
|
|
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
|
|
|
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
|
|
|
|
using cfg1 = ReductionConfiguration_1<256, 256, 1>;
|
|
|
|
static_for<0, std::tuple_size<reduce_configuration_2_instances_threadwise>::value, 1>{}(
|
|
[&](auto j) {
|
|
using cfg2 = remove_cvref_t<decltype(
|
|
std::get<j.value>(reduce_configuration_2_instances_threadwise{}))>;
|
|
|
|
using ReduceOpInstance = DeviceReduceThreadWise<InDataType,
|
|
AccDataType,
|
|
OutDataType,
|
|
Rank,
|
|
ReduceDims,
|
|
ReduceOperation,
|
|
InElementwiseOperation,
|
|
AccElementwiseOperation,
|
|
PropagateNan,
|
|
NeedIndices,
|
|
cfg1::BlockSize_,
|
|
cfg1::MThreadClusterSize_,
|
|
cfg1::KThreadClusterSize_,
|
|
cfg2::MThreadSliceSize_,
|
|
cfg2::KThreadSliceSize_,
|
|
cfg2::InSrcVectorDim_,
|
|
cfg2::InSrcVectorSize_,
|
|
cfg2::OutDstVectorSize_>;
|
|
|
|
device_op_instances.push_back(std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
|
|
});
|
|
};
|
|
|
|
// Explicitly instantiate add_device_reduce_instance_threadwise for the given
// data types and reduction settings; the variadic tail is the list of reduced
// dimensions, packed into a Sequence<>. Comments must stay outside the macro
// body: a '//' inside would swallow the remaining spliced continuation lines.
#define ADD_THREADWISE_INST_BY_TYPE(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
    template void add_device_reduce_instance_threadwise<inT,                                     \
                                                        compT,                                   \
                                                        outT,                                    \
                                                        Rank,                                    \
                                                        Sequence<__VA_ARGS__>,                   \
                                                        ReduceOpId,                              \
                                                        NanOpt,                                  \
                                                        IndicesOpt>(                             \
        std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)
|
|
|
|
// Same as ADD_THREADWISE_INST_BY_TYPE but takes the op/nan/indices options as
// raw integer ids (e.g. from a generated table) and casts them to the enums.
#define ADD_THREADWISE_INST_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
    ADD_THREADWISE_INST_BY_TYPE(inT,                                                           \
                                compT,                                                         \
                                outT,                                                          \
                                static_cast<ReduceTensorOp_t>(ReduceOpId),                     \
                                static_cast<NanPropagation_t>(NanOpt),                         \
                                static_cast<ReduceTensorIndices_t>(IndicesOpt),                \
                                Rank,                                                          \
                                __VA_ARGS__)
|
|
|
|
// Declare (extern template) an instantiation produced elsewhere by
// ADD_THREADWISE_INST_BY_TYPE, so client translation units can reference it
// without re-instantiating. The DeviceReducePtr vector type is spelled out
// here rather than via deviceReduceThreadWisePtrType; the two spellings must
// stay in sync. Comments must stay outside the macro body ('//' would swallow
// the spliced continuation lines).
#define ADD_THREADWISE_INST_REF_BY_TYPE(                                                          \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                                  \
    extern template void add_device_reduce_instance_threadwise<inT,                               \
                                                               compT,                             \
                                                               outT,                              \
                                                               Rank,                              \
                                                               Sequence<__VA_ARGS__>,             \
                                                               ReduceOpId,                        \
                                                               NanOpt,                            \
                                                               IndicesOpt>(                       \
        std::vector<DeviceReducePtr<                                                              \
            typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
            typename reduce_unary_operator<compT, ReduceOpId, true, true>::                       \
                AccElementwiseOperation>> &                                                       \
            device_op_instances)
|
|
|
|
// Same as ADD_THREADWISE_INST_REF_BY_TYPE but takes the op/nan/indices options
// as raw integer ids and casts them to the enums.
#define ADD_THREADWISE_INST_REF_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
    ADD_THREADWISE_INST_REF_BY_TYPE(inT,                                                           \
                                    compT,                                                         \
                                    outT,                                                          \
                                    static_cast<ReduceTensorOp_t>(ReduceOpId),                     \
                                    static_cast<NanPropagation_t>(NanOpt),                         \
                                    static_cast<ReduceTensorIndices_t>(IndicesOpt),                \
                                    Rank,                                                          \
                                    __VA_ARGS__)
|
|
|
|
} // namespace device_reduce_instance
|
|
} // namespace device
|
|
} // namespace tensor_operation
|
|
|
|
} // namespace ck
|
|
|
|
#endif
|