mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 01:10:17 +00:00
Overhaul to Reduction and its dependants (#237)
* Tiny fix in dynamic_buffer.hpp to support vectorized AtomicAdd for double type * Update to host layer and host reduction * Merge and remove reduction kernels * Merge and remove reduction device interfaces and update pooling device interface * Merge and remove useless reduction device instances * Update to reduction profiler and reduction ctests * Update to reduction and pooling examples and add one reduction example * Change reduction examples to make them testable by ctest * Add explicit pass checking for reduction and pooling examples * Explicit assignment of tensor shapes in example reduce_blockwise_two_call * Use atomic_add to replace atomicAdd and add atomic_add for double type * Add reduce ctest support for double data type * Replace to_int_vector() by using C++ std::vector::assign() * Keep DeviceReduceThreadWise separated from DeviceReduceBlockWise * Merge DeviceReduceBlockWise and DeviceReduceMultiBlockAtomicAdd into DeviceReduceMultiBlock * Add GetAtomicOperationZeroValue() support for AtomicMax * Tiny change to reduce example README.md * Fix some tiny issues due to branch merging * Revoke previous change in dynamic_buffer.hpp and add atomic_add for double2_t * Add reduce multiblock_atomic_add instances for fp64 to verify vectorized atomic_add on fp64 * Renaming * Clean up the header includes in device_reduce instances header files
This commit is contained in:
102
library/include/ck/library/host_tensor/host_common_util.hpp
Normal file
102
library/include/ck/library/host_tensor/host_common_util.hpp
Normal file
@@ -0,0 +1,102 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2020 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*
|
||||
*******************************************************************************/
|
||||
#ifndef GUARD_HOST_COMMON_UTIL_HPP
|
||||
#define GUARD_HOST_COMMON_UTIL_HPP
|
||||
|
||||
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
|
||||
|
||||
#include "config.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
namespace host_common {
|
||||
|
||||
// Writes dataNumItems elements of type T, starting at data, to fileName as raw bytes
// (the file is opened in binary mode and any existing content is replaced).
//
// fileName      path of the file to create/overwrite
// data          pointer to the first element; may point to const data
// dataNumItems  number of T elements (not bytes) to write
//
// The outcome is reported on std::cout only; nothing is returned or thrown.
template <typename T>
static inline void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems)
{
    std::ofstream outFile(fileName, std::ios::binary);

    if(!outFile)
    {
        std::cout << "Could not open file " << fileName << " for writing" << std::endl;
        return;
    }

    // Casting to const char* (rather than char*) lets T be a const-qualified type, so
    // read-only buffers can be dumped too.
    outFile.write(reinterpret_cast<const char*>(data),
                  static_cast<std::streamsize>(dataNumItems * sizeof(T)));
    outFile.close();

    // A failed write() or close() leaves the stream in a failed state; do not claim
    // success in that case.
    if(!outFile)
    {
        std::cout << "Could not write all data to file " << fileName << std::endl;
        return;
    }

    std::cout << "Write output to file " << fileName << std::endl;
}
|
||||
|
||||
// Parses a single value of type T from valueStr using stream extraction (operator>>).
//
// valueStr  text holding one value, e.g. "42" or "1.5"
// returns   the extracted value; if nothing can be extracted (for example valueStr is
//           empty) the result is a value-initialized T (or whatever operator>> stored
//           on failure) instead of an indeterminate value
template <typename T>
static inline T getSingleValueFromString(const std::string& valueStr)
{
    std::istringstream iss(valueStr);

    // Value-initialize: when extraction does not touch val (e.g. empty input), returning
    // a default-initialized arithmetic T would read an indeterminate value.
    T val{};

    iss >> val;

    return val;
}
|
||||
|
||||
template <typename T>
|
||||
static inline std::vector<T> getTypeValuesFromString(const char* cstr_values)
|
||||
{
|
||||
std::string valuesStr(cstr_values);
|
||||
|
||||
std::vector<T> values;
|
||||
std::size_t pos = 0;
|
||||
std::size_t new_pos;
|
||||
|
||||
new_pos = valuesStr.find(',', pos);
|
||||
while(new_pos != std::string::npos)
|
||||
{
|
||||
const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
|
||||
|
||||
T val = getSingleValueFromString<T>(sliceStr);
|
||||
|
||||
values.push_back(val);
|
||||
|
||||
pos = new_pos + 1;
|
||||
new_pos = valuesStr.find(',', pos);
|
||||
};
|
||||
|
||||
std::string sliceStr = valuesStr.substr(pos);
|
||||
T val = getSingleValueFromString<T>(sliceStr);
|
||||
|
||||
values.push_back(val);
|
||||
|
||||
return (values);
|
||||
}
|
||||
|
||||
}; // namespace host_common
|
||||
|
||||
}; // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -28,9 +28,7 @@
|
||||
|
||||
#include <limits>
|
||||
#include <cmath>
|
||||
#include <cassert>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <functional>
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "data_type.hpp"
|
||||
@@ -214,13 +212,13 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
|
||||
};
|
||||
};
|
||||
|
||||
template <typename AccDataType, bool PropagateNan>
|
||||
template <typename AccDataType, typename IndexDataType, bool PropagateNan>
|
||||
__host__ static inline void
|
||||
binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
|
||||
AccDataType& accuVal,
|
||||
AccDataType currVal,
|
||||
int& accuIndex,
|
||||
int currIndex)
|
||||
binop_with_index_and_nan_check(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
|
||||
AccDataType& accuVal,
|
||||
AccDataType currVal,
|
||||
IndexDataType& accuIndex,
|
||||
IndexDataType currIndex)
|
||||
{
|
||||
using ck::math::isnan;
|
||||
|
||||
@@ -254,16 +252,6 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe
|
||||
|
||||
}; // namespace host_reduce
|
||||
|
||||
// Converts a vector of size_t values into a vector of int, element by element.
//
// inData   source values
// returns  a vector of the same length where each element is static_cast<int> of the
//          corresponding input; values that do not fit in int are narrowed by the cast
static inline std::vector<int> to_int_vector(const std::vector<size_t>& inData)
{
    std::vector<int> outData;

    // The final size is known up front, so allocate once instead of growing repeatedly.
    outData.reserve(inData.size());

    for(const auto elem : inData)
        outData.push_back(static_cast<int>(elem));

    return outData;
}
|
||||
|
||||
}; // namespace ck
|
||||
|
||||
#endif
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_common.hpp"
|
||||
#include "host_reduce_util.hpp"
|
||||
#include "host_common_util.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "data_type.hpp"
|
||||
|
||||
@@ -200,7 +201,7 @@ struct ReductionHost
|
||||
using ck::float_equal_one;
|
||||
using ck::float_equal_zero;
|
||||
using ck::type_convert;
|
||||
using ck::host_reduce::binop_with_nan_check2;
|
||||
using ck::host_reduce::binop_with_index_and_nan_check;
|
||||
using ck::host_reduce::ReduceOpFn2;
|
||||
using ck::host_reduce::ReduceOpZeroVal;
|
||||
|
||||
@@ -211,8 +212,7 @@ struct ReductionHost
|
||||
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
|
||||
IndexDataType accuIndex = 0;
|
||||
|
||||
for(IndexDataType i = 0; i < ck::type_convert<IndexDataType>(reduce_dim_indexes.size());
|
||||
i++)
|
||||
for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
|
||||
{
|
||||
auto offset_reduce =
|
||||
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
|
||||
@@ -221,9 +221,9 @@ struct ReductionHost
|
||||
|
||||
preUnaryOp(currVal);
|
||||
|
||||
auto currIndex = i;
|
||||
auto currIndex = static_cast<IndexDataType>(i);
|
||||
|
||||
binop_with_nan_check2<AccDataType, PropagateNan>(
|
||||
binop_with_index_and_nan_check<AccDataType, IndexDataType, PropagateNan>(
|
||||
opReduce2, accuVal, currVal, accuIndex, currIndex);
|
||||
};
|
||||
|
||||
@@ -247,9 +247,7 @@ struct ReductionHost
|
||||
auto offset_invariant =
|
||||
get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
|
||||
|
||||
for(IndexDataType i = 0;
|
||||
i < ck::type_convert<IndexDataType>(reduce_dim_indexes.size());
|
||||
i++)
|
||||
for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
|
||||
{
|
||||
auto offset_reduce =
|
||||
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
|
||||
@@ -259,9 +257,9 @@ struct ReductionHost
|
||||
|
||||
preUnaryOp(currVal);
|
||||
|
||||
auto currIndex = i;
|
||||
auto currIndex = static_cast<IndexDataType>(i);
|
||||
|
||||
binop_with_nan_check2<AccDataType, PropagateNan>(
|
||||
binop_with_index_and_nan_check<AccDataType, IndexDataType, PropagateNan>(
|
||||
opReduce2, accuVal, currVal, accuIndex, currIndex);
|
||||
};
|
||||
|
||||
|
||||
@@ -9,26 +9,11 @@
|
||||
#include "device_reduce_instance_blockwise_i8_i8_i8.hpp"
|
||||
#include "device_reduce_instance_blockwise_i8_i32_i8.hpp"
|
||||
#include "device_reduce_instance_blockwise_b16_f32_b16.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp"
|
||||
#include "device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp"
|
||||
#include "device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp"
|
||||
#include "device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp"
|
||||
#include "device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp"
|
||||
#include "device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp"
|
||||
#include "device_reduce_instance_threadwise_f16_f16_f16.hpp"
|
||||
#include "device_reduce_instance_threadwise_f16_f32_f16.hpp"
|
||||
#include "device_reduce_instance_threadwise_f32_f32_f32.hpp"
|
||||
|
||||
@@ -3,13 +3,27 @@
|
||||
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_impl_common.hpp"
|
||||
#include "device_reduce_blockwise.hpp"
|
||||
#include "device_reduce_multiblock.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// Compile-time list of thread-cluster configurations that
// add_device_reduce_instance_blockwise() iterates over when it generates instances.
// Each entry is ReductionConfiguration_1<BlockSize, MThreadClusterSize, KThreadClusterSize>
// (column order as given in the header comment below); in every entry listed here
// MThreadClusterSize * KThreadClusterSize equals the BlockSize of 256.
using reduce_configuration_1_instances_blockwise = std::tuple<
|
||||
// clang-format off
|
||||
// BlockSize | MThreadClusterSize | KThreadClusterSize
|
||||
ReductionConfiguration_1<256, 128, 2>,
|
||||
ReductionConfiguration_1<256, 64, 4>,
|
||||
ReductionConfiguration_1<256, 32, 8>,
|
||||
ReductionConfiguration_1<256, 16, 16>,
|
||||
ReductionConfiguration_1<256, 8, 32>,
|
||||
ReductionConfiguration_1<256, 4, 64>,
|
||||
ReductionConfiguration_1<256, 2, 128>,
|
||||
ReductionConfiguration_1<256, 1, 256>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
#ifdef QUICK_REDUCE_TEST
|
||||
using reduce_configuration_2_instances_blockwise = std::tuple<
|
||||
// clang-format off
|
||||
@@ -58,8 +72,8 @@ template <typename InDataType,
|
||||
int Rank,
|
||||
int NumReduceDim,
|
||||
ReduceTensorOp ReduceOpId,
|
||||
NanPropagation NanOpt,
|
||||
ReduceTensorIndices IndicesOpt>
|
||||
bool PropagateNan,
|
||||
bool UseIndex>
|
||||
void add_device_reduce_instance_blockwise(
|
||||
std::vector<deviceReduceBlockWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
|
||||
{
|
||||
@@ -73,92 +87,94 @@ void add_device_reduce_instance_blockwise(
|
||||
constexpr bool Indexable =
|
||||
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
|
||||
ReduceOpId == ReduceTensorOp::AMAX);
|
||||
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
|
||||
constexpr bool OutputIndex = Indexable && UseIndex;
|
||||
|
||||
constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
|
||||
static_for<0, std::tuple_size<reduce_configuration_1_instances_blockwise>::value, 1>{}(
|
||||
[&](auto i) {
|
||||
using cfg1 = remove_cvref_t<decltype(
|
||||
std::get<i.value>(reduce_configuration_1_instances_blockwise{}))>;
|
||||
|
||||
static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
|
||||
using cfg1 =
|
||||
remove_cvref_t<decltype(std::get<i.value>(reduce_configuration_1_instances{}))>;
|
||||
static_for<0, std::tuple_size<reduce_configuration_2_instances_blockwise>::value, 1>{}(
|
||||
[&](auto j) {
|
||||
using cfg2 = remove_cvref_t<decltype(
|
||||
std::get<j.value>(reduce_configuration_2_instances_blockwise{}))>;
|
||||
|
||||
static_for<0, std::tuple_size<reduce_configuration_2_instances_blockwise>::value, 1>{}(
|
||||
[&](auto j) {
|
||||
using cfg2 = remove_cvref_t<decltype(
|
||||
std::get<j.value>(reduce_configuration_2_instances_blockwise{}))>;
|
||||
using ReduceOpInstance =
|
||||
DeviceReduceMultiBlock<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
InMemoryDataOperationEnum::Set,
|
||||
PropagateNan,
|
||||
OutputIndex,
|
||||
false, // HaveIndexInputIfOutputIndex
|
||||
cfg1::BlockSize_,
|
||||
cfg1::MThreadClusterSize_,
|
||||
cfg1::KThreadClusterSize_,
|
||||
cfg2::MThreadSliceSize_,
|
||||
cfg2::KThreadSliceSize_,
|
||||
cfg2::InSrcVectorDim_,
|
||||
cfg2::InSrcVectorSize_,
|
||||
cfg2::OutDstVectorSize_>;
|
||||
|
||||
using ReduceOpInstance = DeviceReduceBlockWise<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
PropagateNan,
|
||||
NeedIndices,
|
||||
cfg1::BlockSize_,
|
||||
cfg1::MThreadClusterSize_,
|
||||
cfg1::KThreadClusterSize_,
|
||||
cfg2::MThreadSliceSize_,
|
||||
cfg2::KThreadSliceSize_,
|
||||
cfg2::InSrcVectorDim_,
|
||||
cfg2::InSrcVectorSize_,
|
||||
cfg2::OutDstVectorSize_>;
|
||||
|
||||
device_op_instances.push_back(
|
||||
std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
|
||||
});
|
||||
});
|
||||
device_op_instances.push_back(
|
||||
std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
#define ADD_BLOCKWISE_INST_BY_TYPE( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
template void add_device_reduce_instance_blockwise<inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
Rank, \
|
||||
NumReduceDim, \
|
||||
ReduceOpId, \
|
||||
NanOpt, \
|
||||
IndicesOpt>( \
|
||||
#define ADD_BLOCKWISE_INST_BY_TYPE( \
|
||||
inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
|
||||
template void add_device_reduce_instance_blockwise<inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
Rank, \
|
||||
NumReduceDim, \
|
||||
ReduceOpId, \
|
||||
PropagateNan, \
|
||||
UseIndex>( \
|
||||
std::vector<deviceReduceBlockWisePtrType<compT, ReduceOpId>> & device_op_instances)
|
||||
|
||||
#define ADD_BLOCKWISE_INST_BY_ID( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
ADD_BLOCKWISE_INST_BY_TYPE(inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
static_cast<ReduceTensorOp>(ReduceOpId), \
|
||||
static_cast<NanPropagation>(NanOpt), \
|
||||
static_cast<ReduceTensorIndices>(IndicesOpt), \
|
||||
Rank, \
|
||||
#define ADD_BLOCKWISE_INST_BY_ID( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
ADD_BLOCKWISE_INST_BY_TYPE(inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
static_cast<ReduceTensorOp>(ReduceOpId), \
|
||||
static_cast<bool>(NanOpt), \
|
||||
static_cast<bool>(IndicesOpt), \
|
||||
Rank, \
|
||||
NumReduceDim)
|
||||
|
||||
#define ADD_BLOCKWISE_INST_REF_BY_TYPE( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
|
||||
extern template void add_device_reduce_instance_blockwise<inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
Rank, \
|
||||
NumReduceDim, \
|
||||
ReduceOpId, \
|
||||
NanOpt, \
|
||||
IndicesOpt>( \
|
||||
PropagateNan, \
|
||||
UseIndex>( \
|
||||
std::vector<DeviceReducePtr< \
|
||||
typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
|
||||
typename reduce_unary_operator<compT, ReduceOpId, true, true>:: \
|
||||
AccElementwiseOperation>> & \
|
||||
device_op_instances)
|
||||
|
||||
#define ADD_BLOCKWISE_INST_REF_BY_ID( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
static_cast<ReduceTensorOp>(ReduceOpId), \
|
||||
static_cast<NanPropagation>(NanOpt), \
|
||||
static_cast<ReduceTensorIndices>(IndicesOpt), \
|
||||
Rank, \
|
||||
#define ADD_BLOCKWISE_INST_REF_BY_ID( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
static_cast<ReduceTensorOp>(ReduceOpId), \
|
||||
static_cast<bool>(NanOpt), \
|
||||
static_cast<bool>(IndicesOpt), \
|
||||
Rank, \
|
||||
NumReduceDim)
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "data_type.hpp"
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "data_type.hpp"
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "data_type.hpp"
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,165 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_HPP
|
||||
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_impl_common.hpp"
|
||||
#include "device_reduce_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// Compile-time list of ReductionConfiguration_2 entries that
// add_device_reduce_instance_blockwise_second_call() pairs with every entry of
// reduce_configuration_1_instances. Defining QUICK_REDUCE_TEST selects a short list of
// two entries; otherwise the list has seven entries. Going by the column header comment,
// only InSrcVectorSize and KThreadSliceSize vary here; the other three columns are 1 in
// every entry.
#ifdef QUICK_REDUCE_TEST
|
||||
using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
|
||||
// clang-format off
|
||||
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
|
||||
ReductionConfiguration_2<1, 2, 1, 1, 2>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 3>
|
||||
// clang-format on
|
||||
>;
|
||||
#else
|
||||
using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
|
||||
// clang-format off
|
||||
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
|
||||
ReductionConfiguration_2<1, 4, 1, 1, 8>,
|
||||
ReductionConfiguration_2<1, 4, 1, 1, 4>,
|
||||
ReductionConfiguration_2<1, 2, 1, 1, 2>,
|
||||
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 3>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 5>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 7>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 11>
|
||||
// clang-format on
|
||||
>;
|
||||
#endif
|
||||
|
||||
// Element type of the instance vector filled by
// add_device_reduce_instance_blockwise_second_call(): a DeviceReducePtr parameterized by
// the InElementwiseOperation / AccElementwiseOperation that
// reduce_unary_operator<AccDataType, ReduceOpId, false, true> provides.
// NOTE(review): the two boolean arguments (false, true) presumably mean "not the first
// reduce call, is the last reduce call" -- confirm against reduce_unary_operator.
template <typename AccDataType, ReduceTensorOp ReduceOpId>
|
||||
using deviceReduceBlockWiseSecondCallPtrType = DeviceReducePtr<
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::InElementwiseOperation,
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::AccElementwiseOperation>;
|
||||
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
int NumReduceDim,
|
||||
ReduceTensorOp ReduceOpId,
|
||||
NanPropagation NanOpt,
|
||||
ReduceTensorIndices IndicesOpt>
|
||||
void add_device_reduce_instance_blockwise_second_call(
|
||||
std::vector<deviceReduceBlockWiseSecondCallPtrType<AccDataType, ReduceOpId>>&
|
||||
device_op_instances)
|
||||
{
|
||||
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
|
||||
using InElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
|
||||
InElementwiseOperation;
|
||||
using AccElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
|
||||
AccElementwiseOperation;
|
||||
|
||||
constexpr bool Indexable =
|
||||
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
|
||||
ReduceOpId == ReduceTensorOp::AMAX);
|
||||
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
|
||||
|
||||
constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
|
||||
|
||||
static_assert(std::is_same<InDataType, AccDataType>::value,
|
||||
"InDataType and AccDataType should be the same to use "
|
||||
"add_device_reduce_instance_blockwise_second_call!");
|
||||
|
||||
static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
|
||||
using cfg1 =
|
||||
remove_cvref_t<decltype(std::get<i.value>(reduce_configuration_1_instances{}))>;
|
||||
|
||||
static_for<0,
|
||||
std::tuple_size<reduce_configuration_2_instances_blockwise_second_call>::value,
|
||||
1>{}([&](auto j) {
|
||||
using cfg2 = remove_cvref_t<decltype(
|
||||
std::get<j.value>(reduce_configuration_2_instances_blockwise_second_call{}))>;
|
||||
|
||||
using ReduceOpInstance = DeviceReduceBlockWiseSecondCall<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
PropagateNan,
|
||||
NeedIndices,
|
||||
cfg1::BlockSize_,
|
||||
cfg1::MThreadClusterSize_,
|
||||
cfg1::KThreadClusterSize_,
|
||||
cfg2::MThreadSliceSize_,
|
||||
cfg2::KThreadSliceSize_,
|
||||
cfg2::InSrcVectorDim_,
|
||||
cfg2::InSrcVectorSize_,
|
||||
cfg2::OutDstVectorSize_>;
|
||||
|
||||
device_op_instances.push_back(std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
// Expands to an explicit instantiation definition ("template void ...") of
// add_device_reduce_instance_blockwise_second_call for the given data types, Rank and
// NumReduceDim. ReduceOpId, NanOpt and IndicesOpt are passed through unchanged as
// template arguments, so they must already be values of the corresponding enum types.
#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
template void add_device_reduce_instance_blockwise_second_call<inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
Rank, \
|
||||
NumReduceDim, \
|
||||
ReduceOpId, \
|
||||
NanOpt, \
|
||||
IndicesOpt>( \
|
||||
std::vector<deviceReduceBlockWiseSecondCallPtrType<compT, ReduceOpId>> & \
|
||||
device_op_instances)
|
||||
|
||||
// Convenience form of ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE that accepts integer ids:
// ReduceOpId, NanOpt and IndicesOpt are static_cast to ReduceTensorOp, NanPropagation
// and ReduceTensorIndices respectively before being forwarded.
#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
static_cast<ReduceTensorOp>(ReduceOpId), \
|
||||
static_cast<NanPropagation>(NanOpt), \
|
||||
static_cast<ReduceTensorIndices>(IndicesOpt), \
|
||||
Rank, \
|
||||
NumReduceDim)
|
||||
|
||||
// Expands to an explicit instantiation declaration ("extern template void ...") of
// add_device_reduce_instance_blockwise_second_call for the given arguments. The parameter
// type is spelled out with DeviceReducePtr and
// reduce_unary_operator<compT, ReduceOpId, false, true> instead of using the
// deviceReduceBlockWiseSecondCallPtrType alias.
#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
extern template void add_device_reduce_instance_blockwise_second_call<inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
Rank, \
|
||||
NumReduceDim, \
|
||||
ReduceOpId, \
|
||||
NanOpt, \
|
||||
IndicesOpt>( \
|
||||
std::vector< \
|
||||
DeviceReducePtr<typename reduce_unary_operator<compT, ReduceOpId, false, true>:: \
|
||||
InElementwiseOperation, \
|
||||
typename reduce_unary_operator<compT, ReduceOpId, false, true>:: \
|
||||
AccElementwiseOperation>> & \
|
||||
device_op_instances)
|
||||
|
||||
// Convenience form of ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE that accepts integer
// ids: ReduceOpId, NanOpt and IndicesOpt are static_cast to ReduceTensorOp,
// NanPropagation and ReduceTensorIndices respectively before being forwarded.
#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
static_cast<ReduceTensorOp>(ReduceOpId), \
|
||||
static_cast<NanPropagation>(NanOpt), \
|
||||
static_cast<ReduceTensorIndices>(IndicesOpt), \
|
||||
Rank, \
|
||||
NumReduceDim)
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,47 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F16_F16_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F16_F16_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,60 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_B16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_B16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
// Instance references for the "blockwise second call" reduction with
// InDataType = float, AccDataType = float, OutDataType = bhalf_t.
// Each ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID line names one combination of reduce-op id,
// NaN option, indices option, tensor rank and number of reduced dimensions (see the column
// legend below). Every op is listed for the same four (Rank, NumReduceDim) pairs:
// (4,3), (4,4), (4,1) and (2,1).
// NOTE(review): the macro is not defined in this header; presumably it comes from the included
// device_reduce_instance_blockwise_second_call.hpp and expands to an extern template
// declaration -- confirm.
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
// Ops listed with IndicesOpt = 0: ADD (0), AVG (5), NORM2 (7), MIN (2), MAX (3), AMAX (4).
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 2, 1);
|
||||
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 2, 1);
|
||||
// MIN / MAX / AMAX repeated with the IndicesOpt column set to 1.
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,35 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
// Instance references for the "blockwise second call" reduction with
// InDataType = float, AccDataType = float, OutDataType = half_t.
// Only ADD (0), AVG (5) and NORM2 (7) are listed, all with NanPropaOpt = 0 and IndicesOpt = 0,
// each for the (Rank, NumReduceDim) pairs (4,3), (4,4), (4,1) and (2,1).
// NOTE(review): the macro is not defined in this header; presumably it comes from the included
// device_reduce_instance_blockwise_second_call.hpp and expands to an extern template
// declaration -- confirm.
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,59 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
// Instance references for the "blockwise second call" reduction with
// InDataType = float, AccDataType = float, OutDataType = float.
// Each ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID line names one combination of reduce-op id,
// NaN option, indices option, tensor rank and number of reduced dimensions (see the column
// legend below). Every op is listed for the same four (Rank, NumReduceDim) pairs:
// (4,3), (4,4), (4,1) and (2,1).
// NOTE(review): the macro is not defined in this header; presumably it comes from the included
// device_reduce_instance_blockwise_second_call.hpp and expands to an extern template
// declaration -- confirm.
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
// Ops listed with IndicesOpt = 0: ADD (0), AVG (5), NORM2 (7), MIN (2), MAX (3), AMAX (4).
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
|
||||
// MIN / MAX / AMAX repeated with the IndicesOpt column set to 1.
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,35 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
// Instance references for the "blockwise second call" reduction with
// InDataType = double, AccDataType = double, OutDataType = float.
// Only ADD (0), AVG (5) and NORM2 (7) are listed, all with NanPropaOpt = 0 and IndicesOpt = 0,
// each for the (Rank, NumReduceDim) pairs (4,3), (4,4), (4,1) and (2,1).
// NOTE(review): the macro is not defined in this header; presumably it comes from the included
// device_reduce_instance_blockwise_second_call.hpp and expands to an extern template
// declaration -- confirm.
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,59 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F64_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F64_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
// Instance references for the "blockwise second call" reduction with
// InDataType = double, AccDataType = double, OutDataType = double.
// Each ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID line names one combination of reduce-op id,
// NaN option, indices option, tensor rank and number of reduced dimensions (see the column
// legend below). Every op is listed for the same four (Rank, NumReduceDim) pairs:
// (4,3), (4,4), (4,1) and (2,1).
// NOTE(review): the macro is not defined in this header; presumably it comes from the included
// device_reduce_instance_blockwise_second_call.hpp and expands to an extern template
// declaration -- confirm.
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
// Ops listed with IndicesOpt = 0: ADD (0), AVG (5), NORM2 (7), MIN (2), MAX (3), AMAX (4).
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1);
|
||||
// MIN / MAX / AMAX repeated with the IndicesOpt column set to 1.
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,31 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I32_I32_I8_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I32_I32_I8_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
// Instance references for the "blockwise second call" reduction with
// InDataType = int32_t, AccDataType = int32_t, OutDataType = int8_t.
// Only ADD (0) and AVG (5) are listed, both with NanPropaOpt = 0 and IndicesOpt = 0,
// each for the (Rank, NumReduceDim) pairs (4,3), (4,4), (4,1) and (2,1).
// NOTE(review): the macro is not defined in this header; presumably it comes from the included
// device_reduce_instance_blockwise_second_call.hpp and expands to an extern template
// declaration -- confirm.
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,47 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I8_I8_I8_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I8_I8_I8_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
// Instance references for the "blockwise second call" reduction with
// InDataType = int8_t, AccDataType = int8_t, OutDataType = int8_t.
// Only MIN (2), MAX (3) and AMAX (4) are listed: first with IndicesOpt = 0, then with
// IndicesOpt = 1. Every op is listed for the (Rank, NumReduceDim) pairs
// (4,3), (4,4), (4,1) and (2,1).
// NOTE(review): the macro is not defined in this header; presumably it comes from the included
// device_reduce_instance_blockwise_second_call.hpp and expands to an extern template
// declaration -- confirm.
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
|
||||
// MIN / MAX / AMAX repeated with the IndicesOpt column set to 1.
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -30,20 +30,6 @@ struct ReductionConfiguration_2
|
||||
static constexpr int KThreadSliceSize_ = KThreadSliceSize;
|
||||
};
|
||||
|
||||
// Compile-time list (a std::tuple of types) of ReductionConfiguration_1 parameter sets.
// Every entry uses BlockSize = 256 and a different MThreadClusterSize x KThreadClusterSize
// split whose product is always 256, ranging from 128 x 2 down to 1 x 256.
using reduce_configuration_1_instances = std::tuple<
|
||||
// clang-format off
|
||||
// BlockSize | MThreadClusterSize | KThreadClusterSize
|
||||
ReductionConfiguration_1<256, 128, 2>,
|
||||
ReductionConfiguration_1<256, 64, 4>,
|
||||
ReductionConfiguration_1<256, 32, 8>,
|
||||
ReductionConfiguration_1<256, 16, 16>,
|
||||
ReductionConfiguration_1<256, 8, 32>,
|
||||
ReductionConfiguration_1<256, 4, 64>,
|
||||
ReductionConfiguration_1<256, 2, 128>,
|
||||
ReductionConfiguration_1<256, 1, 256>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
#define QUICK_REDUCE_TEST 1
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
|
||||
@@ -3,13 +3,27 @@
|
||||
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_impl_common.hpp"
|
||||
#include "device_reduce_multiblock_atomic_add.hpp"
|
||||
#include "device_reduce_multiblock.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// Compile-time list (a std::tuple of types) of ReductionConfiguration_1 parameter sets for the
// multiblock atomic-add instances.
// Every entry uses BlockSize = 256 and a different MThreadClusterSize x KThreadClusterSize
// split whose product is always 256, ranging from 128 x 2 down to 1 x 256.
using reduce_configuration_1_instances_multiblock_atomic_add = std::tuple<
|
||||
// clang-format off
|
||||
// BlockSize | MThreadClusterSize | KThreadClusterSize
|
||||
ReductionConfiguration_1<256, 128, 2>,
|
||||
ReductionConfiguration_1<256, 64, 4>,
|
||||
ReductionConfiguration_1<256, 32, 8>,
|
||||
ReductionConfiguration_1<256, 16, 16>,
|
||||
ReductionConfiguration_1<256, 8, 32>,
|
||||
ReductionConfiguration_1<256, 4, 64>,
|
||||
ReductionConfiguration_1<256, 2, 128>,
|
||||
ReductionConfiguration_1<256, 1, 256>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
#ifdef QUICK_REDUCE_TEST
|
||||
using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple<
|
||||
// clang-format off
|
||||
@@ -60,8 +74,8 @@ template <typename InDataType,
|
||||
int Rank,
|
||||
int NumReduceDim,
|
||||
ReduceTensorOp ReduceOpId,
|
||||
NanPropagation NanOpt,
|
||||
ReduceTensorIndices IndicesOpt>
|
||||
bool PropagateNan,
|
||||
bool UseIndex>
|
||||
void add_device_reduce_instance_multiblock_atomic_add(
|
||||
std::vector<deviceReduceMultiBlockAtomicAddPtrType<AccDataType, ReduceOpId>>&
|
||||
device_op_instances)
|
||||
@@ -76,12 +90,10 @@ void add_device_reduce_instance_multiblock_atomic_add(
|
||||
constexpr bool Indexable =
|
||||
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
|
||||
ReduceOpId == ReduceTensorOp::AMAX);
|
||||
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
|
||||
constexpr bool OutputIndex = Indexable && UseIndex;
|
||||
|
||||
constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
|
||||
|
||||
static_assert(IndicesOpt == ReduceTensorIndices::NO_INDICES,
|
||||
"AtomicAdd can only be used with reduction operations without indices!");
|
||||
static_assert(UseIndex == false,
|
||||
"AtomicAdd can only be used with reduction operations using no index!");
|
||||
|
||||
constexpr bool op_acceptable =
|
||||
(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::MUL ||
|
||||
@@ -94,9 +106,11 @@ void add_device_reduce_instance_multiblock_atomic_add(
|
||||
return;
|
||||
else
|
||||
{
|
||||
static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
|
||||
using cfg1 =
|
||||
remove_cvref_t<decltype(std::get<i.value>(reduce_configuration_1_instances{}))>;
|
||||
static_for<0,
|
||||
std::tuple_size<reduce_configuration_1_instances_multiblock_atomic_add>::value,
|
||||
1>{}([&](auto i) {
|
||||
using cfg1 = remove_cvref_t<decltype(
|
||||
std::get<i.value>(reduce_configuration_1_instances_multiblock_atomic_add{}))>;
|
||||
|
||||
static_for<
|
||||
0,
|
||||
@@ -105,24 +119,27 @@ void add_device_reduce_instance_multiblock_atomic_add(
|
||||
using cfg2 = remove_cvref_t<decltype(
|
||||
std::get<j.value>(reduce_configuration_2_instances_multiblock_atomic_add{}))>;
|
||||
|
||||
using ReduceOpInstance = DeviceReduceMultiBlockAtomicAdd<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
PropagateNan,
|
||||
NeedIndices,
|
||||
cfg1::BlockSize_,
|
||||
cfg1::MThreadClusterSize_,
|
||||
cfg1::KThreadClusterSize_,
|
||||
cfg2::MThreadSliceSize_,
|
||||
cfg2::KThreadSliceSize_,
|
||||
cfg2::InSrcVectorDim_,
|
||||
cfg2::InSrcVectorSize_,
|
||||
cfg2::OutDstVectorSize_>;
|
||||
using ReduceOpInstance =
|
||||
DeviceReduceMultiBlock<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
InMemoryDataOperationEnum::AtomicAdd,
|
||||
PropagateNan,
|
||||
OutputIndex,
|
||||
false, // HaveIndexInputIfOutputIndex
|
||||
cfg1::BlockSize_,
|
||||
cfg1::MThreadClusterSize_,
|
||||
cfg1::KThreadClusterSize_,
|
||||
cfg2::MThreadSliceSize_,
|
||||
cfg2::KThreadSliceSize_,
|
||||
cfg2::InSrcVectorDim_,
|
||||
cfg2::InSrcVectorSize_,
|
||||
cfg2::OutDstVectorSize_>;
|
||||
|
||||
device_op_instances.push_back(
|
||||
std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
|
||||
@@ -132,54 +149,54 @@ void add_device_reduce_instance_multiblock_atomic_add(
|
||||
};
|
||||
|
||||
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
|
||||
template void add_device_reduce_instance_multiblock_atomic_add<inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
Rank, \
|
||||
NumReduceDim, \
|
||||
ReduceOpId, \
|
||||
NanOpt, \
|
||||
IndicesOpt>( \
|
||||
PropagateNan, \
|
||||
UseIndex>( \
|
||||
std::vector<deviceReduceMultiBlockAtomicAddPtrType<compT, ReduceOpId>> & \
|
||||
device_op_instances)
|
||||
|
||||
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
static_cast<ReduceTensorOp>(ReduceOpId), \
|
||||
static_cast<NanPropagation>(NanOpt), \
|
||||
static_cast<ReduceTensorIndices>(IndicesOpt), \
|
||||
Rank, \
|
||||
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
static_cast<ReduceTensorOp>(ReduceOpId), \
|
||||
static_cast<bool>(NanOpt), \
|
||||
static_cast<bool>(IndicesOpt), \
|
||||
Rank, \
|
||||
NumReduceDim)
|
||||
|
||||
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
|
||||
extern template void add_device_reduce_instance_multiblock_atomic_add<inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
Rank, \
|
||||
NumReduceDim, \
|
||||
ReduceOpId, \
|
||||
NanOpt, \
|
||||
IndicesOpt>( \
|
||||
PropagateNan, \
|
||||
UseIndex>( \
|
||||
std::vector<DeviceReducePtr< \
|
||||
typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
|
||||
typename reduce_unary_operator<compT, ReduceOpId, true, true>:: \
|
||||
AccElementwiseOperation>> & \
|
||||
device_op_instances)
|
||||
|
||||
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
static_cast<ReduceTensorOp>(ReduceOpId), \
|
||||
static_cast<NanPropagation>(NanOpt), \
|
||||
static_cast<ReduceTensorIndices>(IndicesOpt), \
|
||||
Rank, \
|
||||
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
static_cast<ReduceTensorOp>(ReduceOpId), \
|
||||
static_cast<bool>(NanOpt), \
|
||||
static_cast<bool>(IndicesOpt), \
|
||||
Rank, \
|
||||
NumReduceDim)
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "data_type.hpp"
|
||||
#include "device_reduce_instance_multiblock_atomic_add.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "data_type.hpp"
|
||||
#include "device_reduce_instance_multiblock_atomic_add.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_atomic_add.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_atomic_add.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
|
||||
|
||||
#include "device_reduce_instance_multiblock_atomic_add.hpp"
|
||||
|
||||
// Instance references for the multiblock atomic-add reduction with
// InDataType = double, AccDataType = double, OutDataType = double.
// Only ADD (0) and AVG (5) are listed, both with NanPropaOpt = 0 and IndicesOpt = 0,
// each for the (Rank, NumReduceDim) pairs (4,3), (4,4), (4,1) and (2,1).
// NOTE(review): ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID is expected to come from the included
// device_reduce_instance_multiblock_atomic_add.hpp and to expand to an extern template
// declaration of add_device_reduce_instance_multiblock_atomic_add -- confirm against that header.
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,174 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
|
||||
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_impl_common.hpp"
|
||||
#include "device_reduce_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// Compile-time list (a std::tuple of types) of ReductionConfiguration_2 parameter sets for the
// multiblock partial-reduce instances. The column legend inside each list names the five
// template arguments. When QUICK_REDUCE_TEST is defined a short list of 4 configurations is
// used; otherwise the full list of 14 configurations is used.
#ifdef QUICK_REDUCE_TEST
|
||||
using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
|
||||
// clang-format off
|
||||
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
|
||||
ReductionConfiguration_2<0, 1, 1, 2, 1>,
|
||||
ReductionConfiguration_2<1, 2, 1, 1, 2>,
|
||||
ReductionConfiguration_2<0, 1, 1, 3, 1>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 3>
|
||||
// clang-format on
|
||||
>;
|
||||
#else
|
||||
using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
|
||||
// clang-format off
|
||||
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
|
||||
// InSrcVectorDim = 0: MThreadSliceSize equals InSrcVectorSize, KThreadSliceSize = 1.
ReductionConfiguration_2<0, 4, 1, 8, 1>,
|
||||
ReductionConfiguration_2<0, 4, 1, 4, 1>,
|
||||
ReductionConfiguration_2<0, 2, 1, 2, 1>,
|
||||
|
||||
// InSrcVectorDim = 1: MThreadSliceSize = 1, KThreadSliceSize varies.
ReductionConfiguration_2<1, 4, 1, 1, 8>,
|
||||
ReductionConfiguration_2<1, 4, 1, 1, 4>,
|
||||
ReductionConfiguration_2<1, 2, 1, 1, 2>,
|
||||
|
||||
// special instances
|
||||
// All of these use InSrcVectorSize = 1 with slice sizes of 3, 5, 7 and 11.
ReductionConfiguration_2<0, 1, 1, 3, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 5, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 7, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 11, 1>,
|
||||
|
||||
// NOTE(review): the four entries below slice along K but keep InSrcVectorDim = 0, whereas the
// QUICK_REDUCE_TEST list uses InSrcVectorDim = 1 for its <1, 1, 1, 1, 3> entry -- confirm
// whether 0 is intended here.
ReductionConfiguration_2<0, 1, 1, 1, 3>,
|
||||
ReductionConfiguration_2<0, 1, 1, 1, 5>,
|
||||
ReductionConfiguration_2<0, 1, 1, 1, 7>,
|
||||
ReductionConfiguration_2<0, 1, 1, 1, 11>
|
||||
// clang-format on
|
||||
>;
|
||||
#endif
|
||||
|
||||
template <typename AccDataType, ReduceTensorOp ReduceOpId>
|
||||
using deviceReduceMultiBlockPartialReducePtrType = DeviceReducePtr<
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::InElementwiseOperation,
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::AccElementwiseOperation>;
|
||||
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
int NumReduceDim,
|
||||
ReduceTensorOp ReduceOpId,
|
||||
NanPropagation NanOpt,
|
||||
ReduceTensorIndices IndicesOpt>
|
||||
void add_device_reduce_instance_multiblock_partial_reduce(
|
||||
std::vector<deviceReduceMultiBlockPartialReducePtrType<AccDataType, ReduceOpId>>&
|
||||
device_op_instances)
|
||||
{
|
||||
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
|
||||
using InElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
|
||||
InElementwiseOperation;
|
||||
using AccElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
|
||||
AccElementwiseOperation;
|
||||
|
||||
constexpr bool Indexable =
|
||||
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
|
||||
ReduceOpId == ReduceTensorOp::AMAX);
|
||||
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
|
||||
|
||||
constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
|
||||
|
||||
static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
|
||||
using cfg1 =
|
||||
remove_cvref_t<decltype(std::get<i.value>(reduce_configuration_1_instances{}))>;
|
||||
|
||||
static_for<
|
||||
0,
|
||||
std::tuple_size<reduce_configuration_2_instances_multiblock_partial_reduce>::value,
|
||||
1>{}([&](auto j) {
|
||||
using cfg2 = remove_cvref_t<decltype(
|
||||
std::get<j.value>(reduce_configuration_2_instances_multiblock_partial_reduce{}))>;
|
||||
|
||||
using ReduceOpInstance = DeviceReduceMultiBlockPartialReduce<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
PropagateNan,
|
||||
NeedIndices,
|
||||
cfg1::BlockSize_,
|
||||
cfg1::MThreadClusterSize_,
|
||||
cfg1::KThreadClusterSize_,
|
||||
cfg2::MThreadSliceSize_,
|
||||
cfg2::KThreadSliceSize_,
|
||||
cfg2::InSrcVectorDim_,
|
||||
cfg2::InSrcVectorSize_,
|
||||
cfg2::OutDstVectorSize_>;
|
||||
|
||||
device_op_instances.push_back(std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
// Emits an explicit instantiation definition of
// add_device_reduce_instance_multiblock_partial_reduce for one combination of data types,
// reduction settings and ranks. ReduceOpId, NanOpt and IndicesOpt are used directly as template
// arguments, so they must already have the enum types the function template expects.
// Note the reordering: the macro takes (ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)
// while the template takes (Rank, NumReduceDim, ReduceOpId, NanOpt, IndicesOpt).
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(                                  \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)            \
    template void add_device_reduce_instance_multiblock_partial_reduce<              \
        inT, compT, outT, Rank, NumReduceDim, ReduceOpId, NanOpt, IndicesOpt>(       \
        std::vector<deviceReduceMultiBlockPartialReducePtrType<compT, ReduceOpId>> & \
            device_op_instances)
|
||||
|
||||
// Integer-id front end for ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE: ReduceOpId, NanOpt and
// IndicesOpt are given as integral values and converted with static_cast to ReduceTensorOp,
// NanPropagation and ReduceTensorIndices respectively. All other arguments pass through as-is.
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(                                          \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                  \
    ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT,                                        \
                                               compT,                                      \
                                               outT,                                       \
                                               static_cast<ReduceTensorOp>(ReduceOpId),    \
                                               static_cast<NanPropagation>(NanOpt),        \
                                               static_cast<ReduceTensorIndices>(IndicesOpt), \
                                               Rank,                                       \
                                               NumReduceDim)
|
||||
|
||||
// Emits an explicit instantiation declaration (extern template) of
// add_device_reduce_instance_multiblock_partial_reduce, i.e. the declaration-side counterpart of
// ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE. ReduceOpId, NanOpt and IndicesOpt must already
// have the enum types the function template expects.
// The parameter type is written with deviceReduceMultiBlockPartialReducePtrType, the same alias
// the function template and the _INST_BY_TYPE macro use, so declaration and definition cannot
// drift apart. The alias expands to exactly
// DeviceReducePtr<reduce_unary_operator<compT, ReduceOpId, true, false>::InElementwiseOperation,
//                 reduce_unary_operator<compT, ReduceOpId, true, false>::AccElementwiseOperation>.
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(                              \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)            \
    extern template void add_device_reduce_instance_multiblock_partial_reduce<       \
        inT, compT, outT, Rank, NumReduceDim, ReduceOpId, NanOpt, IndicesOpt>(       \
        std::vector<deviceReduceMultiBlockPartialReducePtrType<compT, ReduceOpId>> & \
            device_op_instances)
|
||||
|
||||
// Integer-id front end for ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE: ReduceOpId, NanOpt and
// IndicesOpt are given as integral values and converted with static_cast to ReduceTensorOp,
// NanPropagation and ReduceTensorIndices respectively. All other arguments pass through as-is.
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(                                          \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                      \
    ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT,                                        \
                                                   compT,                                      \
                                                   outT,                                       \
                                                   static_cast<ReduceTensorOp>(ReduceOpId),    \
                                                   static_cast<NanPropagation>(NanOpt),        \
                                                   static_cast<ReduceTensorIndices>(IndicesOpt), \
                                                   Rank,                                       \
                                                   NumReduceDim)
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,60 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_B16_F32_B16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_B16_F32_B16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
|
||||
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,47 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F16_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F16_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,35 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F32_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F32_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,52 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F32_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F32_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
|
||||
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,27 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F64_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F64_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,62 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F64_F64_F64_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F64_F64_F64_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
|
||||
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1);
|
||||
|
||||
// Will be moved to use MultiBlockAtomicAdd
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,31 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I32_I8_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I32_I8_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,47 +0,0 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I8_I8_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I8_I8_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -58,8 +58,8 @@ template <typename InDataType,
|
||||
int Rank,
|
||||
int NumReduceDim,
|
||||
ReduceTensorOp ReduceOpId,
|
||||
NanPropagation NanOpt,
|
||||
ReduceTensorIndices IndicesOpt>
|
||||
bool PropagateNan,
|
||||
bool UseIndex>
|
||||
void add_device_reduce_instance_threadwise(
|
||||
std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
|
||||
{
|
||||
@@ -73,9 +73,7 @@ void add_device_reduce_instance_threadwise(
|
||||
constexpr bool Indexable =
|
||||
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
|
||||
ReduceOpId == ReduceTensorOp::AMAX);
|
||||
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
|
||||
|
||||
constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
|
||||
constexpr bool OutputIndex = Indexable && UseIndex;
|
||||
|
||||
using cfg1 = ReductionConfiguration_1<256, 256, 1>;
|
||||
|
||||
@@ -93,10 +91,9 @@ void add_device_reduce_instance_threadwise(
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
PropagateNan,
|
||||
NeedIndices,
|
||||
OutputIndex,
|
||||
false, // HaveIndexInputIfOutputIndex
|
||||
cfg1::BlockSize_,
|
||||
cfg1::MThreadClusterSize_,
|
||||
cfg1::KThreadClusterSize_,
|
||||
cfg2::MThreadSliceSize_,
|
||||
cfg2::KThreadSliceSize_,
|
||||
cfg2::InSrcVectorDim_,
|
||||
@@ -107,54 +104,54 @@ void add_device_reduce_instance_threadwise(
|
||||
});
|
||||
};
|
||||
|
||||
#define ADD_THREADWISE_INST_BY_TYPE( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
template void add_device_reduce_instance_threadwise<inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
Rank, \
|
||||
NumReduceDim, \
|
||||
ReduceOpId, \
|
||||
NanOpt, \
|
||||
IndicesOpt>( \
|
||||
#define ADD_THREADWISE_INST_BY_TYPE( \
|
||||
inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
|
||||
template void add_device_reduce_instance_threadwise<inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
Rank, \
|
||||
NumReduceDim, \
|
||||
ReduceOpId, \
|
||||
PropagateNan, \
|
||||
UseIndex>( \
|
||||
std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)
|
||||
|
||||
#define ADD_THREADWISE_INST_BY_ID( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
ADD_THREADWISE_INST_BY_TYPE(inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
static_cast<ReduceTensorOp>(ReduceOpId), \
|
||||
static_cast<NanPropagation>(NanOpt), \
|
||||
static_cast<ReduceTensorIndices>(IndicesOpt), \
|
||||
Rank, \
|
||||
#define ADD_THREADWISE_INST_BY_ID( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
ADD_THREADWISE_INST_BY_TYPE(inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
static_cast<ReduceTensorOp>(ReduceOpId), \
|
||||
static_cast<bool>(NanOpt), \
|
||||
static_cast<bool>(IndicesOpt), \
|
||||
Rank, \
|
||||
NumReduceDim)
|
||||
|
||||
#define ADD_THREADWISE_INST_REF_BY_TYPE( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
|
||||
extern template void add_device_reduce_instance_threadwise<inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
Rank, \
|
||||
NumReduceDim, \
|
||||
ReduceOpId, \
|
||||
NanOpt, \
|
||||
IndicesOpt>( \
|
||||
PropagateNan, \
|
||||
UseIndex>( \
|
||||
std::vector<DeviceReducePtr< \
|
||||
typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
|
||||
typename reduce_unary_operator<compT, ReduceOpId, true, true>:: \
|
||||
AccElementwiseOperation>> & \
|
||||
device_op_instances)
|
||||
|
||||
#define ADD_THREADWISE_INST_REF_BY_ID( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
ADD_THREADWISE_INST_REF_BY_TYPE(inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
static_cast<ReduceTensorOp>(ReduceOpId), \
|
||||
static_cast<NanPropagation>(NanOpt), \
|
||||
static_cast<ReduceTensorIndices>(IndicesOpt), \
|
||||
Rank, \
|
||||
#define ADD_THREADWISE_INST_REF_BY_ID( \
|
||||
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
|
||||
ADD_THREADWISE_INST_REF_BY_TYPE(inT, \
|
||||
compT, \
|
||||
outT, \
|
||||
static_cast<ReduceTensorOp>(ReduceOpId), \
|
||||
static_cast<bool>(NanOpt), \
|
||||
static_cast<bool>(IndicesOpt), \
|
||||
Rank, \
|
||||
NumReduceDim)
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "data_type.hpp"
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "data_type.hpp"
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "data_type.hpp"
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -16,26 +16,11 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE
|
||||
device_reduce_instance_threadwise_i8_i32_i8.cpp;
|
||||
device_reduce_instance_threadwise_i8_i8_i8.cpp;
|
||||
device_reduce_instance_threadwise_b16_f32_b16.cpp;
|
||||
device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp;
|
||||
device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp;
|
||||
device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp;
|
||||
device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp;
|
||||
device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp;
|
||||
device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp;
|
||||
device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp;
|
||||
device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp;
|
||||
device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp;
|
||||
device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp;
|
||||
device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp;
|
||||
device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp;
|
||||
device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp;
|
||||
device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp;
|
||||
device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp;
|
||||
device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp;
|
||||
device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp;
|
||||
device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp;
|
||||
device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp;
|
||||
device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp;
|
||||
device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp;
|
||||
)
|
||||
|
||||
add_library(device_reduce_instance OBJECT ${DEVICE_REDUCE_INSTANCE_SOURCE})
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,53 +0,0 @@
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 2, 1);
|
||||
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,28 +0,0 @@
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,52 +0,0 @@
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,28 +0,0 @@
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,52 +0,0 @@
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,24 +0,0 @@
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,40 +0,0 @@
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,24 @@
|
||||
#include "device_reduce_instance_multiblock_atomic_add.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,53 +0,0 @@
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
|
||||
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,40 +0,0 @@
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
// Multiblock partial-reduce instance list for InDataType = half_t, AccDataType = half_t,
// OutDataType = half_t. Each entry invokes ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
// (presumably provided by the header included by this file -- confirm) with the
// argument order spelled out in the column comment below.
// ReduceOpId values used here, per the trailing comments: 2 = MIN, 3 = MAX, 4 = AMAX.
// The three ops are listed first with IndicesOpt = 0 and then again with IndicesOpt = 1;
// NanPropaOpt is 0 throughout. Every group covers the same four (Rank, NumReduceDim)
// pairs: (4,3), (4,4), (4,1), (2,1).
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,28 +0,0 @@
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
// Multiblock partial-reduce instance list for InDataType = half_t, AccDataType = float,
// OutDataType = half_t. Each entry invokes ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
// (presumably provided by the header included by this file -- confirm) with the
// argument order spelled out in the column comment below.
// ReduceOpId values used here, per the trailing comments: 0 = ADD, 5 = AVG, 7 = NORM2.
// NanPropaOpt and IndicesOpt are 0 for every entry. Every op covers the same four
// (Rank, NumReduceDim) pairs: (4,3), (4,4), (4,1), (2,1).
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,45 +0,0 @@
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
// Multiblock partial-reduce instance list for InDataType = float, AccDataType = float,
// OutDataType = float. Each entry invokes ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
// (presumably provided by the header included by this file -- confirm) with the
// argument order spelled out in the column comment below.
// ReduceOpId values used here, per the trailing comments: 2 = MIN, 3 = MAX, 4 = AMAX,
// 7 = NORM2. MIN/MAX/AMAX are listed with IndicesOpt = 0 and again with IndicesOpt = 1;
// NORM2 appears only with IndicesOpt = 0. NanPropaOpt is 0 throughout. Every group
// covers the same four (Rank, NumReduceDim) pairs: (4,3), (4,4), (4,1), (2,1).
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1);
|
||||
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,20 +0,0 @@
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
// Multiblock partial-reduce instance list for InDataType = float, AccDataType = double,
// OutDataType = float. Each entry invokes ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
// (presumably provided by the header included by this file -- confirm) with the
// argument order spelled out in the column comment below.
// Only ReduceOpId 7 (NORM2, per the trailing comment) is listed, with NanPropaOpt = 0
// and IndicesOpt = 0, for the (Rank, NumReduceDim) pairs (4,3), (4,4), (4,1), (2,1).
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,55 +0,0 @@
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
// Multiblock partial-reduce instance list for InDataType = double, AccDataType = double,
// OutDataType = double. Each entry invokes ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
// (presumably provided by the header included by this file -- confirm) with the
// argument order spelled out in the column comment below.
// ReduceOpId values used here, per the trailing comments: 0 = ADD, 2 = MIN, 3 = MAX,
// 4 = AMAX, 5 = AVG, 7 = NORM2. MIN/MAX/AMAX are listed with IndicesOpt = 0 and again
// with IndicesOpt = 1; NORM2, ADD and AVG appear only with IndicesOpt = 0.
// NanPropaOpt is 0 throughout. Every group covers the same four (Rank, NumReduceDim)
// pairs: (4,3), (4,4), (4,1), (2,1).
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1);
|
||||
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1);
|
||||
|
||||
// Will be moved to use MultiBlockAtomicAdd
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,24 +0,0 @@
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
// Multiblock partial-reduce instance list for InDataType = int8_t, AccDataType = int32_t,
// OutDataType = int8_t. Each entry invokes ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
// (presumably provided by the header included by this file -- confirm) with the
// argument order spelled out in the column comment below.
// ReduceOpId values used here, per the trailing comments: 0 = ADD, 5 = AVG.
// NanPropaOpt and IndicesOpt are 0 for every entry. Both ops cover the same four
// (Rank, NumReduceDim) pairs: (4,3), (4,4), (4,1), (2,1).
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,40 +0,0 @@
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
// Multiblock partial-reduce instance list for InDataType = int8_t, AccDataType = int8_t,
// OutDataType = int8_t. Each entry invokes ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
// (presumably provided by the header included by this file -- confirm) with the
// argument order spelled out in the column comment below.
// ReduceOpId values used here, per the trailing comments: 2 = MIN, 3 = MAX, 4 = AMAX.
// The three ops are listed first with IndicesOpt = 0 and then again with IndicesOpt = 1;
// NanPropaOpt is 0 throughout. Every group covers the same four (Rank, NumReduceDim)
// pairs: (4,3), (4,4), (4,1), (2,1).
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
Reference in New Issue
Block a user