mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-30 03:37:38 +00:00
add reduce_multi_d
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
|
||||
add_example_executable(example_reduce_threadwise reduce_threadwise.cpp)
|
||||
add_example_executable(example_reduce_threadwise_multi_d reduce_threadwise_multi_d.cpp)
|
||||
add_example_executable(example_reduce_multiblock_atomic_add reduce_multiblock_atomic_add.cpp)
|
||||
add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp)
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/utility/reduction_enums.hpp"
|
||||
#include "reduce_threadwise_impl.hpp"
|
||||
#include "reduce_threadwise_multi_d_impl.hpp"
|
||||
#include "reduce_example_common.hpp"
|
||||
|
||||
using namespace ck;
|
||||
@@ -25,7 +25,7 @@ class SimpleAppArgs
|
||||
|
||||
public:
|
||||
std::vector<size_t> inLengths = {16, 64, 32, 16};
|
||||
std::vector<int> reduceDims = {0, 1, 2};
|
||||
std::vector<int> reduceDims = {0};
|
||||
std::vector<float> scales = {1.0f, 0.0f};
|
||||
|
||||
bool do_verification = true;
|
||||
@@ -118,13 +118,13 @@ template <typename InOutDataType,
|
||||
ReduceTensorOp ReduceOpId,
|
||||
index_t PropagateNan,
|
||||
index_t OutputIndex>
|
||||
bool reduce_threadwise_test(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const std::vector<size_t>& inLengths,
|
||||
const std::vector<int>& reduceDims,
|
||||
float alpha,
|
||||
float beta)
|
||||
bool reduce_threadwise_multi_d_test(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const std::vector<size_t>& inLengths,
|
||||
const std::vector<int>& reduceDims,
|
||||
float alpha,
|
||||
float beta)
|
||||
{
|
||||
bool matched = false;
|
||||
int result = 0;
|
||||
@@ -144,13 +144,13 @@ bool reduce_threadwise_test(bool do_verification,
|
||||
|
||||
ck::ranges::copy(reduceDims, arrReduceDims.begin());
|
||||
|
||||
result = reduce_threadwise_impl<InOutDataType,
|
||||
AccDataType,
|
||||
ReduceOpId,
|
||||
ShapeType::Rank_,
|
||||
ShapeType::NumReduceDim_,
|
||||
PropagateNan,
|
||||
OutputIndex>(
|
||||
result = reduce_threadwise_multi_d_impl<InOutDataType,
|
||||
AccDataType,
|
||||
ReduceOpId,
|
||||
ShapeType::Rank_,
|
||||
ShapeType::NumReduceDim_,
|
||||
PropagateNan,
|
||||
OutputIndex>(
|
||||
do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta);
|
||||
|
||||
matched = true;
|
||||
@@ -176,96 +176,53 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(arg.data_type == 0)
|
||||
{
|
||||
pass = reduce_threadwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
|
||||
arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inLengths,
|
||||
arg.reduceDims,
|
||||
arg.scales[0],
|
||||
arg.scales[1]);
|
||||
pass = reduce_threadwise_multi_d_test<ck::half_t,
|
||||
float,
|
||||
ReduceOpId,
|
||||
PropagateNan,
|
||||
OutputIndex>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inLengths,
|
||||
arg.reduceDims,
|
||||
arg.scales[0],
|
||||
arg.scales[1]);
|
||||
}
|
||||
else if(arg.data_type == 1)
|
||||
{
|
||||
pass = reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
|
||||
arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inLengths,
|
||||
arg.reduceDims,
|
||||
arg.scales[0],
|
||||
arg.scales[1]);
|
||||
pass =
|
||||
reduce_threadwise_multi_d_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
|
||||
arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inLengths,
|
||||
arg.reduceDims,
|
||||
arg.scales[0],
|
||||
arg.scales[1]);
|
||||
}
|
||||
#if 0
|
||||
else if(arg.data_type == 3)
|
||||
{
|
||||
pass = reduce_threadwise_test<int8_t, float, ReduceOpId, PropagateNan, OutputIndex>(
|
||||
arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inLengths,
|
||||
arg.reduceDims,
|
||||
arg.scales[0],
|
||||
arg.scales[1]);
|
||||
}
|
||||
else if(arg.data_type == 5)
|
||||
{
|
||||
pass = reduce_threadwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
|
||||
arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inLengths,
|
||||
arg.reduceDims,
|
||||
arg.scales[0],
|
||||
arg.scales[1]);
|
||||
}
|
||||
else if(arg.data_type == 6)
|
||||
{
|
||||
pass = reduce_threadwise_test<double, double, ReduceOpId, PropagateNan, OutputIndex>(
|
||||
arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inLengths,
|
||||
arg.reduceDims,
|
||||
arg.scales[0],
|
||||
arg.scales[1]);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
// for testing half_t
|
||||
pass = pass &&
|
||||
reduce_threadwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
|
||||
true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
|
||||
pass = pass && reduce_threadwise_multi_d_test<ck::half_t,
|
||||
float,
|
||||
ReduceOpId,
|
||||
PropagateNan,
|
||||
OutputIndex>(
|
||||
true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
|
||||
|
||||
// for testing float
|
||||
pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
|
||||
true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
|
||||
|
||||
// for testing double
|
||||
pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
|
||||
true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
|
||||
|
||||
// for testing bhalf_t
|
||||
pass = pass &&
|
||||
reduce_threadwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
|
||||
reduce_threadwise_multi_d_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
|
||||
true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
|
||||
|
||||
#if 0
|
||||
// for testing int8_t
|
||||
pass =
|
||||
pass && reduce_threadwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
|
||||
true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
|
||||
|
||||
// for testing 3D input
|
||||
pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
|
||||
true, 2, true, {16, 64, 960}, {0}, 1.0f, 0.0f);
|
||||
|
||||
// for testing 5D input
|
||||
pass = pass && reduce_threadwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
|
||||
true, 2, true, {16, 64, 32, 2, 960}, {0}, 1.0f, 0.0f);
|
||||
#endif
|
||||
// for testing bhalf_t
|
||||
pass = pass && reduce_threadwise_multi_d_test<ck::bhalf_t,
|
||||
float,
|
||||
ReduceOpId,
|
||||
PropagateNan,
|
||||
OutputIndex>(
|
||||
true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f);
|
||||
}
|
||||
|
||||
return (pass ? 0 : 1);
|
||||
@@ -8,7 +8,6 @@
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/reduction_enums.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
|
||||
//#include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
|
||||
|
||||
@@ -28,13 +27,13 @@ template <typename InOutDataType,
|
||||
ck::index_t NumReduceDim,
|
||||
bool PropagateNan,
|
||||
bool OutputIndex>
|
||||
int reduce_threadwise_impl(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const std::vector<size_t>& inLengths,
|
||||
const std::array<int, NumReduceDim>& reduceDims,
|
||||
float alpha,
|
||||
float beta)
|
||||
int reduce_threadwise_multi_d_impl(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const std::vector<size_t>& inLengths,
|
||||
const std::array<int, NumReduceDim>& reduceDims,
|
||||
float alpha,
|
||||
float beta)
|
||||
|
||||
{
|
||||
using namespace ck;
|
||||
@@ -90,17 +89,17 @@ int reduce_threadwise_impl(bool do_verification,
|
||||
};
|
||||
|
||||
using PassThrough = tensor_operation::element_wise::PassThrough;
|
||||
// using Add = tensor_operation::element_wise::Add;
|
||||
using Add = tensor_operation::element_wise::Add;
|
||||
|
||||
using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
|
||||
using InElementwiseOperation = PassThrough;
|
||||
using OutElementwiseOperation = PassThrough;
|
||||
using OutElementwiseOperation = Add;
|
||||
|
||||
using InOutDataTypeInDevice = InOutDataType;
|
||||
|
||||
using DeviceReduceInstance =
|
||||
ck::tensor_operation::device::DeviceReduceThreadWiseMultiD<InOutDataTypeInDevice,
|
||||
ck::Tuple<>,
|
||||
ck::Tuple<InOutDataTypeInDevice>,
|
||||
AccDataType,
|
||||
InOutDataTypeInDevice,
|
||||
Rank,
|
||||
@@ -129,6 +128,9 @@ int reduce_threadwise_impl(bool do_verification,
|
||||
|
||||
Tensor<InOutDataType> out_ref(outLengths);
|
||||
Tensor<InOutDataType> out(outLengths);
|
||||
|
||||
Tensor<InOutDataType> d0(outLengths);
|
||||
|
||||
Tensor<int> out_indices_ref(outLengths);
|
||||
Tensor<int> out_indices(outLengths);
|
||||
|
||||
@@ -147,16 +149,19 @@ int reduce_threadwise_impl(bool do_verification,
|
||||
case 0: break;
|
||||
case 1:
|
||||
in.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
|
||||
d0.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
|
||||
if(beta != 0.0f)
|
||||
out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
|
||||
break;
|
||||
case 2:
|
||||
in.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
|
||||
d0.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
|
||||
if(beta != 0.0f)
|
||||
out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
|
||||
break;
|
||||
default:
|
||||
in.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
|
||||
d0.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
|
||||
if(beta != 0.0f)
|
||||
out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0},
|
||||
num_thread);
|
||||
@@ -169,13 +174,14 @@ int reduce_threadwise_impl(bool do_verification,
|
||||
|
||||
// these buffers are usually provided by the user application
|
||||
DeviceMem in_dev(sizeof(InOutDataTypeInDevice) * in.mDesc.GetElementSpaceSize());
|
||||
DeviceMem d0_dev(sizeof(InOutDataTypeInDevice) * d0.mDesc.GetElementSpaceSize());
|
||||
DeviceMem out_dev(sizeof(InOutDataTypeInDevice) * out.mDesc.GetElementSpaceSize());
|
||||
|
||||
in_dev.ToDevice(in.mData.data());
|
||||
d0_dev.ToDevice(d0.mData.data());
|
||||
|
||||
if(beta != 0.0f)
|
||||
{
|
||||
|
||||
out_dev.ToDevice(out.mData.data());
|
||||
};
|
||||
|
||||
@@ -188,11 +194,13 @@ int reduce_threadwise_impl(bool do_verification,
|
||||
|
||||
std::array<index_t, Rank> arrInLengths;
|
||||
std::array<index_t, Rank> arrInStrides;
|
||||
|
||||
std::array<index_t, NumOutDim> arrOutLengths;
|
||||
std::array<index_t, NumOutDim> arrOutStrides;
|
||||
|
||||
ck::ranges::copy(inLengths, arrInLengths.begin());
|
||||
ck::ranges::copy(inStrides, arrInStrides.begin());
|
||||
|
||||
ck::ranges::copy(outLengths, arrOutLengths.begin());
|
||||
ck::ranges::copy(outStrides, arrOutStrides.begin());
|
||||
|
||||
@@ -236,19 +244,22 @@ int reduce_threadwise_impl(bool do_verification,
|
||||
auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
|
||||
|
||||
invoker_ptr_ref->Run(argument_ptr_ref.get());
|
||||
|
||||
for(std::size_t i = 0; i < out_ref.GetElementSize(); i++)
|
||||
out_elementwise_op(out_ref.mData[i], out_ref.mData[i], d0.mData[i]);
|
||||
};
|
||||
|
||||
auto reduce = DeviceReduceInstance{};
|
||||
|
||||
auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths,
|
||||
arrInStrides,
|
||||
{},
|
||||
{},
|
||||
{arrOutLengths},
|
||||
{arrOutStrides},
|
||||
arrOutLengths,
|
||||
arrOutStrides,
|
||||
reduceDims,
|
||||
in_dev.GetDeviceBuffer(),
|
||||
{},
|
||||
{d0_dev.GetDeviceBuffer()},
|
||||
out_dev.GetDeviceBuffer(),
|
||||
in_elementwise_op,
|
||||
out_elementwise_op);
|
||||
Reference in New Issue
Block a user