From 6aa106daee4e641f926fa537161c2385f3d4dae7 Mon Sep 17 00:00:00 2001 From: arai713 <67439843+arai713@users.noreply.github.com> Date: Mon, 17 Oct 2022 12:59:34 -0700 Subject: [PATCH] adding tensor_permutation example folder (#389) * adding tensor_permutation example folder * fixed formatting * adding tensor_permutation example folder * fixed formatting * changed deviceelementwise parameters for outscalar * removed .swo file * updated folder/file name * changed function call in verification for better consistency with hostelementwist parameters * formatted again * fixed shape in verification function call * changed verification function call, added definition for nhwc * added elementwise permute example * updated CMakeLists file in folder * Delete CmakeLists.txt * Delete tensor_permute.cpp * first version of 2d gridwise_elementwise kernel * temporary fix for stride problem * formatting * format * changed directory name * Delete gridwise_elementwise_2d.hpp * Delete CMakeLists.txt * Delete extra file * delete extra file * got rid of extraneous code * added 2d device elementwise file * deleted accidently added file * update * stride values generalized with equations * updated stride for output matrix * Update CMakeLists.txt * removed extraneous commented code * removed shape_nchw vector, replaced with GetLength for each dimension * changed vector load in kernel call * removed extra space in CMake [ROCm/composable_kernel commit: cee440fe4c006021a3b4c875bc416e68525a8fd9] --- example/38_elementwise_permute/CMakeLists.txt | 1 + .../elementwise_permute_4D_fp16.cpp | 105 ++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 example/38_elementwise_permute/CMakeLists.txt create mode 100644 example/38_elementwise_permute/elementwise_permute_4D_fp16.cpp diff --git a/example/38_elementwise_permute/CMakeLists.txt b/example/38_elementwise_permute/CMakeLists.txt new file mode 100644 index 0000000000..280797ad71 --- /dev/null +++ b/example/38_elementwise_permute/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp) diff --git a/example/38_elementwise_permute/elementwise_permute_4D_fp16.cpp b/example/38_elementwise_permute/elementwise_permute_4D_fp16.cpp new file mode 100644 index 0000000000..31defbc0cd --- /dev/null +++ b/example/38_elementwise_permute/elementwise_permute_4D_fp16.cpp @@ -0,0 +1,105 @@ +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using ADataType = F16; +using BDataType = F16; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using DeviceElementwisePermuteInstance = + ck::tensor_operation::device::DeviceElementwise, + ck::Tuple, + PassThrough, + 4, + 8, + ck::Sequence<8>, + ck::Sequence<1>>; + +template +void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor) +{ + for(std::size_t n = 0; n < A_nchw.mDesc.GetLengths()[0]; ++n) + for(std::size_t c = 0; c < A_nchw.mDesc.GetLengths()[1]; ++c) + for(std::size_t h = 0; h < A_nchw.mDesc.GetLengths()[2]; ++h) + for(std::size_t w = 0; w < A_nchw.mDesc.GetLengths()[3]; ++w) + { + auto a_val = A_nchw(n, c, h, w); + functor(B_nhwc(n, h, w, c), a_val); + } +} + +int main() +{ + bool do_verification = true; + bool time_kernel = false; + + std::vector nchw = {4, 4, 8, 8}; + std::vector nhwc = {4, 8, 8, 4}; + Tensor a(nchw); + Tensor b(nhwc); + + a.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a.mData.data()); + // LogRangeAsType(std::cout << "Tensor a : ", a.mData, ",") << std::endl; + + std::array input = {a_device_buf.GetDeviceBuffer()}; + std::array output = {b_device_buf.GetDeviceBuffer()}; + + std::array ab_lengths; + std::array a_strides = {static_cast(nchw[1] * nchw[2] * nchw[3]), + static_cast(nchw[2] * nchw[3]), + static_cast(nchw[3]), + 1}; + std::array b_strides = {static_cast(nhwc[1] * nhwc[2] * nhwc[3]), + 1, + static_cast(nhwc[2] * nhwc[3]), + static_cast(nhwc[3])}; + + std::copy(nchw.begin(), nchw.end(), ab_lengths.begin()); + + auto broadcastPermute = DeviceElementwisePermuteInstance{}; + auto argument = broadcastPermute.MakeArgumentPointer( + ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{}); + + if(!broadcastPermute.IsSupportedArgument(argument.get())) + { + throw std::runtime_error( + "The runtime parameters seems not supported by the device instance, exiting!"); + }; + auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer(); + float ave_time = + broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); + + std::cout << "Perf: " << ave_time << " ms" << std::endl; + + bool pass = true; + + if(do_verification) + { + b_device_buf.FromDevice(b.mData.data()); + // LogRangeAsType(std::cout << "Tensor b : ", b.mData, ",") << std::endl; + Tensor host_b(nhwc); + host_elementwise4D(host_b, a, PassThrough{}); + + // LogRangeAsType(std::cout << "Host b : ", host_b.mData, ",") << std::endl; + pass &= + ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); + } + + return pass ? 0 : 1; +}