From 8bdd05f3666a3d5bf4992d095999418d480a1374 Mon Sep 17 00:00:00 2001 From: rocking5566 Date: Thu, 19 May 2022 12:34:35 +0800 Subject: [PATCH] elementwise op (#238) * Add elementwise operation kernel and example * Add comment * Add template argument of dim . Prepare to support multiple dimension * Rename example * Support 1 dimension * Add static assert * Add comment * Extract pad * Remove redundant argument * Support any dimension for elementwise operation * Remove line * Let it be the multiple number of CU * Move thread per block to the parameter of constructor * rename threadPerBlock with blockSize * Support double * rename kernel function name * remove redundant include header * Refine type * Need to the final dimension * Refine variable name * Refine type * Use index_t instead of int in API Co-authored-by: rocking [ROCm/composable_kernel commit: aafc3ac27a4d448b728a241fd6072005b87df22f] --- example/19_binary_elementwise/CMakeLists.txt | 3 + .../broadcast_add_2d.cpp | 132 ++++++++++++ .../elementwise_add_1d.cpp | 110 ++++++++++ .../elementwise_add_4d.cpp | 113 ++++++++++ example/CMakeLists.txt | 1 + .../gpu/device/device_binary_elementwise.hpp | 204 ++++++++++++++++++ .../element/binary_element_wise_operation.hpp | 25 +++ .../grid/gridwise_binary_elementwise_1d.hpp | 150 +++++++++++++ include/ck/utility/get_id.hpp | 4 + .../ck/library/host_tensor/host_utility.hpp | 17 ++ 10 files changed, 759 insertions(+) create mode 100644 example/19_binary_elementwise/CMakeLists.txt create mode 100644 example/19_binary_elementwise/broadcast_add_2d.cpp create mode 100644 example/19_binary_elementwise/elementwise_add_1d.cpp create mode 100644 example/19_binary_elementwise/elementwise_add_4d.cpp create mode 100644 include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp create mode 100644 include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp create mode 100644 library/include/ck/library/host_tensor/host_utility.hpp diff --git a/example/19_binary_elementwise/CMakeLists.txt b/example/19_binary_elementwise/CMakeLists.txt new file mode 100644 index 0000000000..6c95b2e55e --- /dev/null +++ b/example/19_binary_elementwise/CMakeLists.txt @@ -0,0 +1,3 @@ +add_example_executable(example_broadcast_add_2d broadcast_add_2d.cpp) +add_example_executable(example_elementwise_add_1d elementwise_add_1d.cpp) +add_example_executable(example_elementwise_add_4d elementwise_add_4d.cpp) \ No newline at end of file diff --git a/example/19_binary_elementwise/broadcast_add_2d.cpp b/example/19_binary_elementwise/broadcast_add_2d.cpp new file mode 100644 index 0000000000..181d0e6a2d --- /dev/null +++ b/example/19_binary_elementwise/broadcast_add_2d.cpp @@ -0,0 +1,132 @@ +#include +#include +#include "check_err.hpp" +#include "config.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" + +#include "device_tensor.hpp" +#include "binary_element_wise_operation.hpp" +#include "device_binary_elementwise.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using ABDataType = F16; +using CDataType = F16; +using EltwiseComputeDataType = F32; + +using Add = ck::tensor_operation::binary_element_wise::Add; + +using DeviceElementwiseAddInstance = ck::tensor_operation::device:: + DeviceBinaryElementwise; + +template +void host_broadcast2D( + HostTensorC& C, const HostTensorA& A, const HostTensorB& B, int M, int N, Functor functor) +{ + using ctype = ck::remove_reference_t; + + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + ComputeDataType Amn = static_cast(A(m, n)); + ComputeDataType Cmn = 0; + if constexpr(broadcastDim == 0) + { + ComputeDataType Bn = static_cast(B(n)); + functor(Cmn, Amn, Bn); + } + else + { + ComputeDataType Bm = static_cast(B(m)); + functor(Cmn, Amn, Bm); + } + C(m, n) = static_cast(Cmn); + } + } +} + +int main() +{ + bool do_verification = true; + bool time_kernel = false; + + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t Stride = 1024; + + auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor(std::vector({len}), + std::vector({stride})); + }; + + auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({stride, 1})); + }; + + Tensor a_m_n(f_host_tensor_descriptor2d(M, N, Stride)); + + Tensor b_n(f_host_tensor_descriptor1d(N, 1)); + + Tensor c_m_n(f_host_tensor_descriptor2d(M, N, Stride)); + + a_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem a_m_n_device_buf(sizeof(ABDataType) * a_m_n.mDesc.GetElementSpace()); + DeviceMem b_n_device_buf(sizeof(ABDataType) * b_n.mDesc.GetElementSpace()); + DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n.mDesc.GetElementSpace()); + + a_m_n_device_buf.ToDevice(a_m_n.mData.data()); + b_n_device_buf.ToDevice(b_n.mData.data()); + + auto broadcastAdd = DeviceElementwiseAddInstance{}; + auto argument = broadcastAdd.MakeArgumentPointer(a_m_n_device_buf.GetDeviceBuffer(), + b_n_device_buf.GetDeviceBuffer(), + c_m_n_device_buf.GetDeviceBuffer(), + {M, N}, + {Stride, 1}, + {0, 1}, // broadcast in first dimension + {Stride, 1}, + Add{}); + + if(!broadcastAdd.IsSupportedArgument(argument.get())) + { + throw std::runtime_error("The runtime parameters seems not supported by the " + "DeviceBinaryElementwise_2D instance, exiting!"); + }; + + auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); + float ave_time = + broadcastAdd_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); + + std::cout << "Perf: " << ave_time << " ms" << std::endl; + + bool pass = true; + if(do_verification) + { + c_m_n_device_buf.FromDevice(c_m_n.mData.data()); + Tensor host_c_m_n(f_host_tensor_descriptor2d(M, N, Stride)); + + host_broadcast2D, + Tensor, + Tensor, + EltwiseComputeDataType, + Add, + 0>(host_c_m_n, a_m_n, b_n, M, N, Add{}); + + pass &= ck::utils::check_err( + c_m_n.mData, host_c_m_n.mData, "Error: Incorrect results d1", 1e-3, 1e-3); + } + + return pass ? 0 : 1; +} diff --git a/example/19_binary_elementwise/elementwise_add_1d.cpp b/example/19_binary_elementwise/elementwise_add_1d.cpp new file mode 100644 index 0000000000..f94c19f1d1 --- /dev/null +++ b/example/19_binary_elementwise/elementwise_add_1d.cpp @@ -0,0 +1,110 @@ +#include +#include +#include "check_err.hpp" +#include "config.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" + +#include "device_tensor.hpp" +#include "binary_element_wise_operation.hpp" +#include "device_binary_elementwise.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using ABDataType = F16; +using CDataType = F16; +using EltwiseComputeDataType = F32; + +using Add = ck::tensor_operation::binary_element_wise::Add; + +using DeviceElementwiseAddInstance = ck::tensor_operation::device:: + DeviceBinaryElementwise; + +template +void host_elementwise1D( + HostTensorC& C, const HostTensorA& A, const HostTensorB& B, int M, Functor functor) +{ + using ctype = ck::remove_reference_t; + + for(int m = 0; m < M; ++m) + { + ComputeDataType Am = static_cast(A(m)); + ComputeDataType Bm = static_cast(B(m)); + ComputeDataType Cm = 0; + functor(Cm, Am, Bm); + C(m) = static_cast(Cm); + } +} + +int main() +{ + bool do_verification = true; + bool time_kernel = false; + + ck::index_t M = 1024; + + auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor(std::vector({len}), + std::vector({stride})); + }; + + Tensor a_m(f_host_tensor_descriptor1d(M, 1)); + Tensor b_m(f_host_tensor_descriptor1d(M, 1)); + Tensor c_m(f_host_tensor_descriptor1d(M, 1)); + + a_m.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_m.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem a_m_device_buf(sizeof(ABDataType) * a_m.mDesc.GetElementSpace()); + DeviceMem b_m_device_buf(sizeof(ABDataType) * b_m.mDesc.GetElementSpace()); + DeviceMem c_m_device_buf(sizeof(CDataType) * c_m.mDesc.GetElementSpace()); + + a_m_device_buf.ToDevice(a_m.mData.data()); + b_m_device_buf.ToDevice(b_m.mData.data()); + + auto broadcastAdd = DeviceElementwiseAddInstance{}; + auto argument = broadcastAdd.MakeArgumentPointer(a_m_device_buf.GetDeviceBuffer(), + b_m_device_buf.GetDeviceBuffer(), + c_m_device_buf.GetDeviceBuffer(), + {M}, + {1}, + {1}, + {1}, + Add{}); + + if(!broadcastAdd.IsSupportedArgument(argument.get())) + { + throw std::runtime_error("The runtime parameters seems not supported by the " + "DeviceBinaryElementwise_2D instance, exiting!"); + }; + + auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); + float ave_time = + broadcastAdd_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); + + std::cout << "Perf: " << ave_time << " ms" << std::endl; + + bool pass = true; + if(do_verification) + { + c_m_device_buf.FromDevice(c_m.mData.data()); + Tensor host_c_m(f_host_tensor_descriptor1d(M, 1)); + + host_elementwise1D, + Tensor, + Tensor, + EltwiseComputeDataType, + Add>(host_c_m, a_m, b_m, M, Add{}); + + pass &= ck::utils::check_err( + c_m.mData, host_c_m.mData, "Error: Incorrect results d1", 1e-3, 1e-3); + } + + return pass ? 0 : 1; +} diff --git a/example/19_binary_elementwise/elementwise_add_4d.cpp b/example/19_binary_elementwise/elementwise_add_4d.cpp new file mode 100644 index 0000000000..e358e993b0 --- /dev/null +++ b/example/19_binary_elementwise/elementwise_add_4d.cpp @@ -0,0 +1,113 @@ +#include +#include +#include "check_err.hpp" +#include "config.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "host_utility.hpp" + +#include "device_tensor.hpp" +#include "binary_element_wise_operation.hpp" +#include "device_binary_elementwise.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using ABDataType = F16; +using CDataType = F16; +using EltwiseComputeDataType = F32; + +using Add = ck::tensor_operation::binary_element_wise::Add; + +using DeviceElementwiseAddInstance = ck::tensor_operation::device:: + DeviceBinaryElementwise; + +template +void host_elementwise4D(HostTensorC& C, + const HostTensorA& A, + const HostTensorB& B, + const std::vector& shape, + Functor functor) +{ + using ctype = ck::remove_reference_t; + + for(std::size_t n = 0; n < shape[0]; ++n) + for(std::size_t c = 0; c < shape[1]; ++c) + for(std::size_t h = 0; h < shape[2]; ++h) + for(std::size_t w = 0; w < shape[3]; ++w) + { + ComputeDataType a_val = static_cast(A(n, c, h, w)); + ComputeDataType b_val = static_cast(B(n, c, h, w)); + ComputeDataType c_val = 0; + functor(c_val, a_val, b_val); + C(n, c, h, w) = static_cast(c_val); + } +} + +int main() +{ + bool do_verification = true; + bool time_kernel = false; + + std::vector nchw = {4, 16, 32, 32}; + + Tensor a_m(nchw); + Tensor b_m(nchw); + Tensor c_m(nchw); + + a_m.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_m.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem a_m_device_buf(sizeof(ABDataType) * a_m.mDesc.GetElementSpace()); + DeviceMem b_m_device_buf(sizeof(ABDataType) * b_m.mDesc.GetElementSpace()); + DeviceMem c_m_device_buf(sizeof(CDataType) * c_m.mDesc.GetElementSpace()); + + a_m_device_buf.ToDevice(a_m.mData.data()); + b_m_device_buf.ToDevice(b_m.mData.data()); + + auto broadcastAdd = DeviceElementwiseAddInstance{}; + auto argument = broadcastAdd.MakeArgumentPointer( + a_m_device_buf.GetDeviceBuffer(), + b_m_device_buf.GetDeviceBuffer(), + c_m_device_buf.GetDeviceBuffer(), + ck::convert_vector_element_type(nchw), + ck::convert_vector_element_type(a_m.mDesc.GetStrides()), + ck::convert_vector_element_type(b_m.mDesc.GetStrides()), + ck::convert_vector_element_type(c_m.mDesc.GetStrides()), + Add{}); + + if(!broadcastAdd.IsSupportedArgument(argument.get())) + { + throw std::runtime_error("The runtime parameters seems not supported by the " + "DeviceBinaryElementwise_2D instance, exiting!"); + }; + + auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); + float ave_time = + broadcastAdd_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); + + std::cout << "Perf: " << ave_time << " ms" << std::endl; + + bool pass = true; + if(do_verification) + { + c_m_device_buf.FromDevice(c_m.mData.data()); + Tensor host_c_m(nchw); + + host_elementwise4D, + Tensor, + Tensor, + EltwiseComputeDataType, + Add>(host_c_m, a_m, b_m, nchw, Add{}); + + pass &= ck::utils::check_err( + c_m.mData, host_c_m.mData, "Error: Incorrect results d1", 1e-3, 1e-3); + } + + return pass ? 0 : 1; +} diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 4d81e84c01..1f4ed01de1 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -51,3 +51,4 @@ add_subdirectory(17_convnd_bwd_data_xdl) add_subdirectory(15_grouped_gemm) add_subdirectory(16_gemm_reduce) add_subdirectory(18_batched_gemm_reduce) +add_subdirectory(19_binary_elementwise) diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp new file mode 100644 index 0000000000..8bf6604f18 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp @@ -0,0 +1,204 @@ +#pragma once +#include +#include + +#include "device.hpp" +#include "device_base.hpp" +#include "gridwise_binary_elementwise_1d.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBinaryElementwise : public BaseOperator +{ + DeviceBinaryElementwise(index_t blockSize = 256) : BaseOperator(), blockSize_(blockSize) {} + + static constexpr auto I0 = Number<0>{}; + + template + static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize) + { + const auto m0 = desc_m0.GetLength(I0); + const index_t loop_step = gridSize * blockSize * ScalarPerVector; + const auto pad = math::integer_least_multiple(m0, loop_step) - m0; + const auto desc_m0_pad = + transform_tensor_descriptor(desc_m0, + make_tuple(make_right_pad_transform(m0, pad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return desc_m0_pad; + } + + static auto MakeDescriptor_M0(const std::vector& shape, + const std::vector& stride, + index_t gridSize, + index_t blockSize) + { + auto tupleOfShape = generate_tuple([&](auto I) { return shape[I]; }, Number{}); + auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number{}); + + // nd desc - [s0, s1, s2, ...] + const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); + + // merge nd to 1d desc - [s0 * s1 * ...] + if constexpr(Dim > 1) + { + const auto desc_m0 = transform_tensor_descriptor( + desc, + make_tuple(make_merge_transform(tupleOfShape)), + make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number{})), + make_tuple(Sequence<0>{})); + + return PadDescriptor_M0_1d(desc_m0, gridSize, blockSize); + } + else + return PadDescriptor_M0_1d(desc, gridSize, blockSize); + } + + using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1)); + using GridwiseBinEltwise = GridwiseBinaryElementwise_1D; + + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a, + const BDataType* p_b, + CDataType* p_c, + const std::vector& shape, + const std::vector& stride_a, + const std::vector& stride_b, + const std::vector& stride_c, + ElementwiseFunctor functor, + index_t blockSize) + : p_a_(p_a), + p_b_(p_b), + p_c_(p_c), + shape_(shape), + functor_(functor), + gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future + { + a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize); + b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize); + c_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_c, gridSize_, blockSize); + } + + const ADataType* p_a_; + const BDataType* p_b_; + CDataType* p_c_; + std::vector shape_; + GridDesc_M0 a_grid_desc_m0_; + GridDesc_M0 b_grid_desc_m0_; + GridDesc_M0 c_grid_desc_m0_; + ElementwiseFunctor functor_; + index_t gridSize_; + }; + + struct Invoker : public BaseInvoker + { + Invoker(index_t blockSize) : BaseInvoker(), blockSize_(blockSize) {} + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto kernel = kernel_binary_elementwise_1d; + + float elapsed_time = launch_and_time_kernel(stream_config, + kernel, + dim3(arg.gridSize_), + dim3(blockSize_), + 0, + arg.p_a_, + arg.p_b_, + arg.p_c_, + arg.a_grid_desc_m0_, + arg.b_grid_desc_m0_, + arg.c_grid_desc_m0_, + arg.functor_); + return elapsed_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + + index_t blockSize_; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if(pArg == nullptr) + return false; + + if(pArg->shape_.back() % ScalarPerVector != 0) + return false; + + return true; + }; + + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + std::vector shape, + std::vector stride_a, + std::vector stride_b, + std::vector stride_c, + ElementwiseFunctor functor) + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + shape, + stride_a, + stride_b, + stride_c, + functor, + blockSize_); + } + + std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{blockSize_}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceBinaryElementwise" + << "<" + << "ScalarPerVector = " << ScalarPerVector + << ">"; + // clang-format on + + return str.str(); + } + + index_t blockSize_; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp new file mode 100644 index 0000000000..d2c7e1c1b5 --- /dev/null +++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp @@ -0,0 +1,25 @@ +#pragma once +#include "data_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace binary_element_wise { + +struct Add +{ + __host__ __device__ constexpr void + operator()(double& dst, const double& src1, const double& src2) const + { + dst = src1 + src2; + } + + __host__ __device__ constexpr void + operator()(float& dst, const float& src1, const float& src2) const + { + dst = src1 + src2; + } +}; + +} // namespace binary_element_wise +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp new file mode 100644 index 0000000000..c77d49ae94 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp @@ -0,0 +1,150 @@ +#pragma once + +#include "cluster_descriptor.hpp" +#include "data_type.hpp" +#include "element_wise_operation.hpp" +#include "threadwise_tensor_slice_transfer.hpp" + +namespace ck { + +template +__global__ void kernel_binary_elementwise_1d(const ADataType* __restrict__ p_a_global, + const BDataType* __restrict__ p_b_global, + CDataType* __restrict__ p_c_global, + const GridDesc_M0 a_grid_desc_m0, + const GridDesc_M0 b_grid_desc_m0, + const GridDesc_M0 c_grid_desc_m0, + const ElementwiseFunctor functor) +{ + GridwiseBinEltwise::Run(p_a_global, + p_b_global, + p_c_global, + a_grid_desc_m0, + b_grid_desc_m0, + c_grid_desc_m0, + functor); +} + +template +struct GridwiseBinaryElementwise_1D +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto thread_desc_m0 = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + using PassThrough = tensor_operation::element_wise::PassThrough; + + static __device__ auto CalculateElementwiseIndex() + { + const index_t global_thread_id = get_thread_global_1d_id(); + return make_multi_index(global_thread_id * ScalarPerVector); + } + + __device__ static void Run(const ADataType* __restrict__ p_a_global, + const BDataType* __restrict__ p_b_global, + CDataType* __restrict__ p_c_global, + const GridDesc_M0 a_grid_desc_m0, + const GridDesc_M0 b_grid_desc_m0, + const GridDesc_M0 c_grid_desc_m0, + const ElementwiseFunctor functor) + { + const auto a_global_buf = make_dynamic_buffer( + p_a_global, a_grid_desc_m0.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_global, b_grid_desc_m0.GetElementSpaceSize()); + auto c_global_buf = make_dynamic_buffer( + p_c_global, c_grid_desc_m0.GetElementSpaceSize()); + + StaticBuffer a_thread_buf; + StaticBuffer b_thread_buf; + StaticBuffer c_thread_buf; + + const auto thread_store_global_offset = CalculateElementwiseIndex(); + + auto a_global_load = + ThreadwiseTensorSliceTransfer_v2, // SliceLengths + Sequence<0>, // DimAccessOrder + 0, // SrcVectorDim + ScalarPerVector, + 1, // SrcScalarStrideInVector + false>{a_grid_desc_m0, thread_store_global_offset}; + + auto b_global_load = + ThreadwiseTensorSliceTransfer_v2, // SliceLengths + Sequence<0>, // DimAccessOrder + 0, // SrcVectorDim + ScalarPerVector, + 1, // SrcScalarStrideInVector + false>{b_grid_desc_m0, thread_store_global_offset}; + + auto c_global_write = + ThreadwiseTensorSliceTransfer_v1r3, // SliceLengths + Sequence<0>, // DimAccessOrder + 0, // DstVectorDim + ScalarPerVector, + InMemoryDataOperationEnum::Set, + 1, // DstScalarStrideInVector + false>{ + c_grid_desc_m0, thread_store_global_offset, PassThrough{}}; + + const index_t blockSize = get_block_size(); + const index_t blockPerGrid = get_grid_size(); + const auto m0 = c_grid_desc_m0.GetLength(I0); + const index_t loop_step = blockPerGrid * blockSize * ScalarPerVector; + const auto loop_step_index = make_multi_index(loop_step); + + index_t num_iter = m0 / (loop_step); + do + { + // read and process ScalarPerVector elements + a_global_load.Run( + a_grid_desc_m0, a_global_buf, thread_desc_m0, make_tuple(I0), a_thread_buf); + + b_global_load.Run( + b_grid_desc_m0, b_global_buf, thread_desc_m0, make_tuple(I0), b_thread_buf); + + static_for<0, ScalarPerVector, 1>{}([&](auto m) { + constexpr auto offset = thread_desc_m0.CalculateOffset(make_tuple(m)); + functor(c_thread_buf(Number{}), + a_thread_buf(Number{}), + b_thread_buf(Number{})); + }); + + c_global_write.Run(thread_desc_m0, + make_tuple(I0), // SrcSliceOriginIdx + c_thread_buf, + c_grid_desc_m0, + c_global_buf); + + a_global_load.MoveSrcSliceWindow(a_grid_desc_m0, loop_step_index); + b_global_load.MoveSrcSliceWindow(b_grid_desc_m0, loop_step_index); + c_global_write.MoveDstSliceWindow(c_grid_desc_m0, loop_step_index); + } while(--num_iter); + } +}; + +} // namespace ck diff --git a/include/ck/utility/get_id.hpp b/include/ck/utility/get_id.hpp index d1288a2274..7c62b890c7 100644 --- a/include/ck/utility/get_id.hpp +++ b/include/ck/utility/get_id.hpp @@ -11,10 +11,14 @@ __host__ __device__ constexpr index_t get_warp_size() __device__ index_t get_thread_local_1d_id() { return threadIdx.x; } +__device__ index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; } + __device__ index_t get_warp_local_1d_id() { return threadIdx.x / get_warp_size(); } __device__ index_t get_block_1d_id() { return blockIdx.x; } __device__ index_t get_grid_size() { return gridDim.x; } +__device__ index_t get_block_size() { return blockDim.x; } + } // namespace ck diff --git a/library/include/ck/library/host_tensor/host_utility.hpp b/library/include/ck/library/host_tensor/host_utility.hpp new file mode 100644 index 0000000000..2ff76e58c3 --- /dev/null +++ b/library/include/ck/library/host_tensor/host_utility.hpp @@ -0,0 +1,17 @@ +#pragma once +#include + +namespace ck { + +template +inline std::vector convert_vector_element_type(const std::vector& inData) +{ + std::vector outData; + + for(auto elem : inData) + outData.push_back(static_cast(elem)); + + return (outData); +}; + +}; // namespace ck