diff --git a/example/19_binary_elementwise/CMakeLists.txt b/example/19_binary_elementwise/CMakeLists.txt index 202a9e1fcb..6c95b2e55e 100644 --- a/example/19_binary_elementwise/CMakeLists.txt +++ b/example/19_binary_elementwise/CMakeLists.txt @@ -1,2 +1,3 @@ add_example_executable(example_broadcast_add_2d broadcast_add_2d.cpp) -add_example_executable(example_elementwise_add_1d elementwise_add_1d.cpp) \ No newline at end of file +add_example_executable(example_elementwise_add_1d elementwise_add_1d.cpp) +add_example_executable(example_elementwise_add_4d elementwise_add_4d.cpp) \ No newline at end of file diff --git a/example/19_binary_elementwise/broadcast_add_2d.cpp b/example/19_binary_elementwise/broadcast_add_2d.cpp index 4a6d2038e3..0f67499984 100644 --- a/example/19_binary_elementwise/broadcast_add_2d.cpp +++ b/example/19_binary_elementwise/broadcast_add_2d.cpp @@ -101,8 +101,7 @@ int main() {Stride, 1}, {0, 1}, // broadcast in first dimension {Stride, 1}, - Add{}, - 256); + Add{}); if(!broadcastAdd.IsSupportedArgument(argument.get())) { diff --git a/example/19_binary_elementwise/elementwise_add_1d.cpp b/example/19_binary_elementwise/elementwise_add_1d.cpp index c9d0f77724..602e055290 100644 --- a/example/19_binary_elementwise/elementwise_add_1d.cpp +++ b/example/19_binary_elementwise/elementwise_add_1d.cpp @@ -32,8 +32,7 @@ template + typename Functor> void host_elementwise1D( HostTensorC& C, const HostTensorA& A, const HostTensorB& B, int M, Functor functor) { @@ -60,9 +59,7 @@ int main() }; Tensor a_m(f_host_tensor_descriptor1d(M, 1)); - Tensor b_m(f_host_tensor_descriptor1d(M, 1)); - Tensor c_m(f_host_tensor_descriptor1d(M, 1)); a_m.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); @@ -83,8 +80,7 @@ int main() {1}, {1}, {1}, - Add{}, - 256); + Add{}); if(!broadcastAdd.IsSupportedArgument(argument.get())) { @@ -108,8 +104,7 @@ int main() Tensor, Tensor, EltwiseComputeDataType, - Add, - 0>(host_c_m, a_m, b_m, M, Add{}); + Add>(host_c_m, a_m, b_m, M, Add{}); pass &= ck::utils::check_err( c_m.mData, host_c_m.mData, "Error: Incorrect results d1", 1e-3, 1e-3); diff --git a/example/19_binary_elementwise/elementwise_add_4d.cpp b/example/19_binary_elementwise/elementwise_add_4d.cpp new file mode 100644 index 0000000000..9d468771f2 --- /dev/null +++ b/example/19_binary_elementwise/elementwise_add_4d.cpp @@ -0,0 +1,116 @@ +#include +#include +#include +#include +#include +#include +#include +#include "check_err.hpp" +#include "config.hpp" +#include "device.hpp" +#include "host_reduce_util.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" + +#include "device_tensor.hpp" +#include "binary_element_wise_operation.hpp" + +#include "device_binary_elementwise.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using ABDataType = F16; +using CDataType = F16; +using EltwiseComputeDataType = F32; + +using Add = ck::tensor_operation::binary_element_wise::Add; + +using DeviceElementwiseAddInstance = ck::tensor_operation::device:: + DeviceBinaryElementwise; + +template +void host_elementwise4D(HostTensorC& C, + const HostTensorA& A, + const HostTensorB& B, + const std::vector& shape, + Functor functor) +{ + for(std::size_t n = 0; n < shape[0]; ++n) + for(std::size_t c = 0; c < shape[1]; ++c) + for(std::size_t h = 0; h < shape[2]; ++h) + for(std::size_t w = 0; w < shape[3]; ++w) + { + ComputeDataType a_val = static_cast(A(n, c, h, w)); + ComputeDataType b_val = static_cast(B(n, c, h, w)); + ComputeDataType c_val = 0; + functor(c_val, a_val, b_val); + C(n, c, h, w) = static_cast(c_val); + } +} + +int main() +{ + bool do_verification = true; + bool time_kernel = false; + + std::vector nchw = {4, 16, 32, 32}; + + Tensor a_m(nchw); + Tensor b_m(nchw); + Tensor c_m(nchw); + + a_m.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_m.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem a_m_device_buf(sizeof(ABDataType) * a_m.mDesc.GetElementSpace()); + DeviceMem b_m_device_buf(sizeof(ABDataType) * b_m.mDesc.GetElementSpace()); + DeviceMem c_m_device_buf(sizeof(CDataType) * c_m.mDesc.GetElementSpace()); + + a_m_device_buf.ToDevice(a_m.mData.data()); + b_m_device_buf.ToDevice(b_m.mData.data()); + + auto broadcastAdd = DeviceElementwiseAddInstance{}; + auto argument = broadcastAdd.MakeArgumentPointer(a_m_device_buf.GetDeviceBuffer(), + b_m_device_buf.GetDeviceBuffer(), + c_m_device_buf.GetDeviceBuffer(), + ck::to_int_vector(nchw), + ck::to_int_vector(a_m.mDesc.GetStrides()), + ck::to_int_vector(b_m.mDesc.GetStrides()), + ck::to_int_vector(c_m.mDesc.GetStrides()), + Add{}); + + if(!broadcastAdd.IsSupportedArgument(argument.get())) + { + throw std::runtime_error("The runtime parameters seems not supported by the " + "DeviceBinaryElementwise_2D instance, exiting!"); + }; + + auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); + float ave_time = + broadcastAdd_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); + + std::cout << "Perf: " << ave_time << " ms" << std::endl; + + bool pass = true; + if(do_verification) + { + c_m_device_buf.FromDevice(c_m.mData.data()); + Tensor host_c_m(nchw); + + host_elementwise4D, + Tensor, + Tensor, + EltwiseComputeDataType, + Add>(host_c_m, a_m, b_m, nchw, Add{}); + + pass &= ck::utils::check_err( + c_m.mData, host_c_m.mData, "Error: Incorrect results d1", 1e-3, 1e-3); + } + + return pass ? 0 : 1; +} diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp index 0b08b818a3..a3a2c89eb7 100644 --- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp +++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp @@ -19,49 +19,16 @@ template struct DeviceBinaryElementwise : public BaseOperator { - static constexpr auto I0 = Number<0>{}; - - static auto MakeDescriptor_M0_1d(const std::vector& shape, - const std::vector& stride, - index_t gridSize, - index_t threadPerBlock) + DeviceBinaryElementwise(index_t threadPerBlock = 256) + : BaseOperator(), threadPerBlock_(threadPerBlock) { - // 1d desc - [m] - const auto desc_m0 = - make_naive_tensor_descriptor(make_tuple(shape[0]), make_tuple(stride[0])); - - // pad - const auto m0 = desc_m0.GetLength(I0); - const index_t loop_step = gridSize * threadPerBlock * ScalarPerVector; - const auto pad = math::integer_least_multiple(m0, loop_step) - m0; - const auto desc_m0_pad = - transform_tensor_descriptor(desc_m0, - make_tuple(make_right_pad_transform(m0, pad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - return desc_m0_pad; } - static auto MakeDescriptor_M0_2d(const std::vector& shape, - const std::vector& stride, - index_t gridSize, - index_t threadPerBlock) + static constexpr auto I0 = Number<0>{}; + + template + static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t threadPerBlock) { - const int m = shape[0]; - const int n = shape[1]; - - // 2d desc - [m, n] - const auto desc_m_n = - make_naive_tensor_descriptor(make_tuple(m, n), make_tuple(stride[0], stride[1])); - - // 1d desc - [m * n] - const auto desc_m0 = - transform_tensor_descriptor(desc_m_n, - make_tuple(make_merge_transform(make_tuple(m, n))), - make_tuple(Sequence<0, 1>{}), - make_tuple(Sequence<0>{})); - - // pad const auto m0 = desc_m0.GetLength(I0); const index_t loop_step = gridSize * threadPerBlock * ScalarPerVector; const auto pad = math::integer_least_multiple(m0, loop_step) - m0; @@ -78,16 +45,25 @@ struct DeviceBinaryElementwise : public BaseOperator index_t gridSize, index_t threadPerBlock) { - static_assert(Dim == 1 || Dim == 2, - "wrong! DeviceBinaryElementwise not support this dimension"); + auto tupleOfShape = generate_tuple([&](auto I) { return shape[I]; }, Number{}); + auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number{}); - // TODO - 3D, 4D, 5D - if constexpr(Dim == 1) - return MakeDescriptor_M0_1d(shape, stride, gridSize, threadPerBlock); - else if constexpr(Dim == 2) - return MakeDescriptor_M0_2d(shape, stride, gridSize, threadPerBlock); + // nd desc - [s0, s1, s2, ...] + const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); + + // merge nd to 1d desc - [s0 * s1 * ...] + if constexpr(Dim > 1) + { + const auto desc_m0 = transform_tensor_descriptor( + desc, + make_tuple(make_merge_transform(tupleOfShape)), + make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number{})), + make_tuple(Sequence<0>{})); + + return PadDescriptor_M0_1d(desc_m0, gridSize, threadPerBlock); + } else - return make_naive_tensor_descriptor(make_tuple(0), make_tuple(0)); + return PadDescriptor_M0_1d(desc, gridSize, threadPerBlock); } using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1)); @@ -114,12 +90,11 @@ struct DeviceBinaryElementwise : public BaseOperator p_b_(p_b), p_c_(p_c), functor_(functor), - threadPerBlock_(threadPerBlock), - gridSize_(128) // FIXME - Calculate the grid size by number of CU in the future + gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future { - a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, threadPerBlock_); - b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, threadPerBlock_); - c_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_c, gridSize_, threadPerBlock_); + a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, threadPerBlock); + b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, threadPerBlock); + c_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_c, gridSize_, threadPerBlock); } const ADataType* p_a_; @@ -129,12 +104,13 @@ struct DeviceBinaryElementwise : public BaseOperator GridDesc_M0 b_grid_desc_m0_; GridDesc_M0 c_grid_desc_m0_; ElementwiseFunctor functor_; - index_t threadPerBlock_; index_t gridSize_; }; struct Invoker : public BaseInvoker { + Invoker(index_t threadPerBlock) : BaseInvoker(), threadPerBlock_(threadPerBlock) {} + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { const auto kernel = kernel_elementwise_1d(p_arg), stream_config); } + + index_t threadPerBlock_; }; bool IsSupportedArgument(const BaseArgument* p_arg) override @@ -174,7 +152,7 @@ struct DeviceBinaryElementwise : public BaseOperator if(pArg == nullptr) return false; - // m * n + // shape[0] * shape[1] * shape[2] * ... const auto m0 = pArg->c_grid_desc_m0_.GetLength(I0); if(m0 % ScalarPerVector != 0) @@ -190,8 +168,7 @@ struct DeviceBinaryElementwise : public BaseOperator std::vector stride_a, std::vector stride_b, std::vector stride_c, - ElementwiseFunctor functor, - index_t threadPerBlock) + ElementwiseFunctor functor) { return std::make_unique(static_cast(p_a), static_cast(p_b), @@ -201,12 +178,12 @@ struct DeviceBinaryElementwise : public BaseOperator stride_b, stride_c, functor, - threadPerBlock); + threadPerBlock_); } std::unique_ptr MakeInvokerPointer() { - return std::make_unique(Invoker{}); + return std::make_unique(Invoker{threadPerBlock_}); } std::string GetTypeString() const override @@ -222,6 +199,8 @@ struct DeviceBinaryElementwise : public BaseOperator return str.str(); } + + index_t threadPerBlock_; }; } // namespace device