From 033ea8069a58f2fe3feac5bd77dc0d3d75aed2ce Mon Sep 17 00:00:00 2001 From: root Date: Fri, 12 Apr 2024 04:36:34 +0000 Subject: [PATCH] add reduce_multi_d --- example/12_reduce/CMakeLists.txt | 2 +- ...wise.cpp => reduce_threadwise_multi_d.cpp} | 143 ++++++------------ ...hpp => reduce_threadwise_multi_d_impl.hpp} | 41 +++-- 3 files changed, 77 insertions(+), 109 deletions(-) rename example/12_reduce/{reduce_threadwise.cpp => reduce_threadwise_multi_d.cpp} (59%) rename example/12_reduce/{reduce_threadwise_impl.hpp => reduce_threadwise_multi_d_impl.hpp} (89%) diff --git a/example/12_reduce/CMakeLists.txt b/example/12_reduce/CMakeLists.txt index 968ef4c6eb..03381a449f 100644 --- a/example/12_reduce/CMakeLists.txt +++ b/example/12_reduce/CMakeLists.txt @@ -1,4 +1,4 @@ add_example_executable(example_reduce_blockwise reduce_blockwise.cpp) -add_example_executable(example_reduce_threadwise reduce_threadwise.cpp) +add_example_executable(example_reduce_threadwise_multi_d reduce_threadwise_multi_d.cpp) add_example_executable(example_reduce_multiblock_atomic_add reduce_multiblock_atomic_add.cpp) add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp) diff --git a/example/12_reduce/reduce_threadwise.cpp b/example/12_reduce/reduce_threadwise_multi_d.cpp similarity index 59% rename from example/12_reduce/reduce_threadwise.cpp rename to example/12_reduce/reduce_threadwise_multi_d.cpp index 6cd7ad6c1a..3d031f298e 100644 --- a/example/12_reduce/reduce_threadwise.cpp +++ b/example/12_reduce/reduce_threadwise_multi_d.cpp @@ -7,7 +7,7 @@ #include #include "ck/utility/reduction_enums.hpp" -#include "reduce_threadwise_impl.hpp" +#include "reduce_threadwise_multi_d_impl.hpp" #include "reduce_example_common.hpp" using namespace ck; @@ -25,7 +25,7 @@ class SimpleAppArgs public: std::vector inLengths = {16, 64, 32, 16}; - std::vector reduceDims = {0, 1, 2}; + std::vector reduceDims = {0}; std::vector scales = {1.0f, 0.0f}; bool do_verification = true; @@ -118,13 +118,13 @@ template -bool reduce_threadwise_test(bool do_verification, - int init_method, - bool time_kernel, - const std::vector& inLengths, - const std::vector& reduceDims, - float alpha, - float beta) +bool reduce_threadwise_multi_d_test(bool do_verification, + int init_method, + bool time_kernel, + const std::vector& inLengths, + const std::vector& reduceDims, + float alpha, + float beta) { bool matched = false; int result = 0; @@ -144,13 +144,13 @@ bool reduce_threadwise_test(bool do_verification, ck::ranges::copy(reduceDims, arrReduceDims.begin()); - result = reduce_threadwise_impl( + result = reduce_threadwise_multi_d_impl( do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta); matched = true; @@ -176,96 +176,53 @@ int main(int argc, char* argv[]) if(arg.data_type == 0) { - pass = reduce_threadwise_test( - arg.do_verification, - arg.init_method, - arg.time_kernel, - arg.inLengths, - arg.reduceDims, - arg.scales[0], - arg.scales[1]); + pass = reduce_threadwise_multi_d_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); } else if(arg.data_type == 1) { - pass = reduce_threadwise_test( - arg.do_verification, - arg.init_method, - arg.time_kernel, - arg.inLengths, - arg.reduceDims, - arg.scales[0], - arg.scales[1]); + pass = + reduce_threadwise_multi_d_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); } -#if 0 - else if(arg.data_type == 3) - { - pass = reduce_threadwise_test( - arg.do_verification, - arg.init_method, - arg.time_kernel, - arg.inLengths, - arg.reduceDims, - arg.scales[0], - arg.scales[1]); - } - else if(arg.data_type == 5) - { - pass = reduce_threadwise_test( - arg.do_verification, - arg.init_method, - arg.time_kernel, - arg.inLengths, - arg.reduceDims, - arg.scales[0], - arg.scales[1]); - } - else if(arg.data_type == 6) - { - pass = reduce_threadwise_test( - arg.do_verification, - arg.init_method, - arg.time_kernel, - arg.inLengths, - arg.reduceDims, - arg.scales[0], - arg.scales[1]); - } -#endif } else { // for testing half_t - pass = pass && - reduce_threadwise_test( - true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f); + pass = pass && reduce_threadwise_multi_d_test( + true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f); // for testing float - pass = pass && reduce_threadwise_test( - true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f); - - // for testing double - pass = pass && reduce_threadwise_test( - true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f); - - // for testing bhalf_t pass = pass && - reduce_threadwise_test( + reduce_threadwise_multi_d_test( true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f); -#if 0 - // for testing int8_t - pass = - pass && reduce_threadwise_test( - true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f); - - // for testing 3D input - pass = pass && reduce_threadwise_test( - true, 2, true, {16, 64, 960}, {0}, 1.0f, 0.0f); - - // for testing 5D input - pass = pass && reduce_threadwise_test( - true, 2, true, {16, 64, 32, 2, 960}, {0}, 1.0f, 0.0f); -#endif + // for testing bhalf_t + pass = pass && reduce_threadwise_multi_d_test( + true, 2, true, {16, 64, 32, 960}, {0}, 1.0f, 0.0f); } return (pass ? 0 : 1); diff --git a/example/12_reduce/reduce_threadwise_impl.hpp b/example/12_reduce/reduce_threadwise_multi_d_impl.hpp similarity index 89% rename from example/12_reduce/reduce_threadwise_impl.hpp rename to example/12_reduce/reduce_threadwise_multi_d_impl.hpp index 631d0535a4..457a2a637d 100644 --- a/example/12_reduce/reduce_threadwise_impl.hpp +++ b/example/12_reduce/reduce_threadwise_multi_d_impl.hpp @@ -8,7 +8,6 @@ #include "ck/ck.hpp" #include "ck/utility/reduction_enums.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" -//#include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp" @@ -28,13 +27,13 @@ template -int reduce_threadwise_impl(bool do_verification, - int init_method, - bool time_kernel, - const std::vector& inLengths, - const std::array& reduceDims, - float alpha, - float beta) +int reduce_threadwise_multi_d_impl(bool do_verification, + int init_method, + bool time_kernel, + const std::vector& inLengths, + const std::array& reduceDims, + float alpha, + float beta) { using namespace ck; @@ -90,17 +89,17 @@ int reduce_threadwise_impl(bool do_verification, }; using PassThrough = tensor_operation::element_wise::PassThrough; - // using Add = tensor_operation::element_wise::Add; + using Add = tensor_operation::element_wise::Add; using ReduceOperation = typename reduce_binary_operator::opType; using InElementwiseOperation = PassThrough; - using OutElementwiseOperation = PassThrough; + using OutElementwiseOperation = Add; using InOutDataTypeInDevice = InOutDataType; using DeviceReduceInstance = ck::tensor_operation::device::DeviceReduceThreadWiseMultiD, + ck::Tuple, AccDataType, InOutDataTypeInDevice, Rank, @@ -129,6 +128,9 @@ int reduce_threadwise_impl(bool do_verification, Tensor out_ref(outLengths); Tensor out(outLengths); + + Tensor d0(outLengths); + Tensor out_indices_ref(outLengths); Tensor out_indices(outLengths); @@ -147,16 +149,19 @@ int reduce_threadwise_impl(bool do_verification, case 0: break; case 1: in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + d0.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); if(beta != 0.0f) out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); break; case 2: in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + d0.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); if(beta != 0.0f) out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); break; default: in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + d0.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); if(beta != 0.0f) out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); @@ -169,13 +174,14 @@ int reduce_threadwise_impl(bool do_verification, // these buffers are usually provided by the user application DeviceMem in_dev(sizeof(InOutDataTypeInDevice) * in.mDesc.GetElementSpaceSize()); + DeviceMem d0_dev(sizeof(InOutDataTypeInDevice) * d0.mDesc.GetElementSpaceSize()); DeviceMem out_dev(sizeof(InOutDataTypeInDevice) * out.mDesc.GetElementSpaceSize()); in_dev.ToDevice(in.mData.data()); + d0_dev.ToDevice(d0.mData.data()); if(beta != 0.0f) { - out_dev.ToDevice(out.mData.data()); }; @@ -188,11 +194,13 @@ int reduce_threadwise_impl(bool do_verification, std::array arrInLengths; std::array arrInStrides; + std::array arrOutLengths; std::array arrOutStrides; ck::ranges::copy(inLengths, arrInLengths.begin()); ck::ranges::copy(inStrides, arrInStrides.begin()); + ck::ranges::copy(outLengths, arrOutLengths.begin()); ck::ranges::copy(outStrides, arrOutStrides.begin()); @@ -236,19 +244,22 @@ int reduce_threadwise_impl(bool do_verification, auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer(); invoker_ptr_ref->Run(argument_ptr_ref.get()); + + for(std::size_t i = 0; i < out_ref.GetElementSize(); i++) + out_elementwise_op(out_ref.mData[i], out_ref.mData[i], d0.mData[i]); }; auto reduce = DeviceReduceInstance{}; auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths, arrInStrides, - {}, - {}, + {arrOutLengths}, + {arrOutStrides}, arrOutLengths, arrOutStrides, reduceDims, in_dev.GetDeviceBuffer(), - {}, + {d0_dev.GetDeviceBuffer()}, out_dev.GetDeviceBuffer(), in_elementwise_op, out_elementwise_op);