From 2df5929e5e711f0fac08ca2b9bc5fcd3ce83109b Mon Sep 17 00:00:00 2001
From: Po Yen Chen
Date: Tue, 20 Sep 2022 10:30:25 +0800
Subject: [PATCH] Add 'Permute' device op & example (#408)

* Add example folder for 'DeviceElementwise'
* Re-structure example files
* Move common parts into common.hpp
* Use stricter input
* Add more helper methods in 'DeviceElementwise'
* Use more specific method to write example
* Allow specifying problem through command line argument
* Allow specifying problem 'axes' through command line argument
* Add check to template type argument
* Add transpose_shape() to generalize shape permute
* Generalize transpose utility functions
* Use better name for tensor indices
* Add checks in helper functions
* Remove debug messages
* Refine error message for check_err()
* Generalize variable naming in example code
* Add device op 'DevicePermute'. This device op is a clone of 'DeviceElementwise'
* Use 'DevicePermute' device op in example
* Remove 'elementwise' from identifiers
* Remove 'elementwise' from file paths
* Remove base class of 'DevicePermute'
* Let 'DevicePermute' inherit from 'BaseOperator'
* Add simple type traits to validate device op type
* Add static_assert() to check type constraints
* Create 'DevicePermuteBase' to generate methods
* Use indirect base type to generate methods
* Remove 'is_device_op<>' type traits
* Only accept single-input-single-output for 'DevicePermute'
* Simplify 'DevicePermute' interface
* Re-format 'DeviceElementwise'
* Use CRTP to generate overridden virtual method
* Remove unnecessary include directives
* Distinguish input & output shape in 'DevicePermute'
* Pass 'axes' to 'DevicePermute'
* Use more reasonable return value for Invoker::Run()
* Add 'GridwisePermute' kernel. This kernel is a clone of 'GridwiseElementwise_1D'
* Remove no-longer used type argument
* Check if input/output shapes meet the requirement
* Remove no-longer used method
* Remove never-entered if-clause
* Change problem description for 'DevicePermute'
* Transform descriptor into 3 dimensions
* Add debug code to verify result
* Add comment to indicate template argument location
* Add N/H/WPerBlock template parameter to 'DevicePermute'
* Rename 'GridwisePermute' to 'GridwiseCopy'
* Check tensor descriptor dimensions in 'GridwiseElementwise_1D'
* Add missing include directive
* Add 'BlockSize' parameter to 'DevicePermute'
* Remove no-longer used method
* Add 'BlockToTileMap' for 'GridwiseCopy'
* Use the normal Block2TileMap convention
* Rename 'BlockToTileMap' as 'Block2TileMap'
* Fix most of compilation errors
* Let 'Block2TileMap' map block to 2d coordinate
* Allow data transfer in 'GridwiseCopy'
* Fix wrong output descriptor for 2nd blockwise copy
* Rename 'GridwiseCopy' as 'GridwisePermute'
* Remove '1d' in identifiers
* Remove commented-out code
* Remove 'MPerThread' template parameter
* Separate template parameters
* Unify variable naming convention
* Use more verbose way to create expressions
* Add template parameter 'InBlockLdsExtraW'
* Release the constraint on In/OutGridDesc
* Use data type directly as template argument
* Re-arrange template arguments for blockwise copy
* Remove no-longer used template parameters
* Embed layout in the variable names
* Add GridwisePermute::CheckValidity()
* Extract local types as template parameters
* Rename local type alias
* Add more template parameters (vector width related)
* Calculate new SrcVectorDim/DstVectorDim after merging descriptor dimensions
* Fill tensor values starting from 1
* Re-format example code
* Avoid too-large block id
* Add comment
* Make sure 'SrcVectorDim' is not same as 'DstVectorDim'
* Add check for the 'VectorDim' & 'ScalarPerVector' template params
* Let 'DstVectorDim' equal 'SrcVectorDim' after transposing out grid desc
* Remove no-longer used template parameter 'NPerBlock'
* Fix wrong descriptor creation logic
* Specify problem in each example
* Use better example name
* Add new example 'example_permute_NxHxW_fp32'
* Add example demonstrating bundling multiple elements in tensor
* Add support to permute multiple elements together
* Change the default problem size
* Add span<> class template
* Use span<> to generalize check_err() interface
* Fix ambiguous ctor call
* Avoid creating unnecessary objects
* Use helper functions to simplify example code
* Add example for 4xfp16 permute
* Disable failed-to-compile example
* Add check for the NUM_ELEMS_IN_BUNDLE
* Remove redundant parameter in helper lambda function
* Add check for the input tensor type's byte-size
* Check scalar-per-vector with padded length
* Use more verbose name to avoid name collision
* Use fixed 'VectorDim' & 'ScalarPerVector' for LDS
* Embed shape info in name of descriptor constructor
* Rename example folder '36_permute' into '37_permute'
* Avoid using too-large LDS in kernel code
* Remove redundant example
* Use switch() to group similar code
* Add const to the span<> type argument
* Simply initialize tensor with floating point values
* Use fp16 as data type in all examples
* Enlarge tensor size in example
* Enlarge N-dim in example
* Add check for the bundled type in example
* Use stricter error threshold
* Remove global load/store loop in kernel code
* Measure execution time by default
* Use faster device op config for example 'NxHxW_fp16'
* Use faster device op config for example '1xHxW_fp16'
* Use faster device op config for example 'HxWx4_fp16'
* Remove cmd arg parsing logic
* Rename functions
* Extract bundle permutation logic out
* Simplify permute bundle example
* Add Tensor<>::GetElementSpaceSizeInBytes()
* Add Tensor<>::data()
* Use new methods to simplify code
* Use type alias to replace duplicated code
* Use existing method to shorten code
* Allow FillUniformDistribution to accept range argument
* Initialize random values in range
* Add Tensor<>::size()
* Use more meaningful names in permute bundle example
* Use more meaningful names in permute element examples
* Use rangified copy() to copy elements
* Use function return value directly to eliminate variables
* Add to_array() conversion tool to eliminate more variables
* Add Tensor<>::AsSpan<>() to create view of tensor values
* Use AsSpan() to shorten check_err() calls
* Remove no-longer-used 'using' directives
* Move 'using' directive to proper code position
* Remove redundant variables
* Remove useless static_assert()
* Add check for range types
* Declare variable right before first use
* Move long return type to trailing return type
* Add BaseInvokerCRTP<> class template to generate method
* Create new base type for 'DevicePermute' implementations
* Move 'NumDim' template param to first position
* Rename 'DevicePermute' to 'DevicePermuteImpl'
* Add 'noexcept' specifier to CRTP generated method
* Move 'Block2TileMap' definition into 'GridwisePermute'
* Use type alias to reduce code
* Unify naming style in 'DevicePermute'
* Add comments in 'GridwisePermute'
* Rename permute example folder
* Use std::cerr to report error
* Use larger shape in examples
* Rename '38_permute' to '39_permute'
* Make sure we use unsigned type for shape & indices
* Remove
opt-ed out assertion * Remove template BaseInvokerCRTP<> [ROCm/composable_kernel commit: f584ab0c545ade05ae793a8b36fa282d47d0f698] --- example/39_permute/CMakeLists.txt | 9 + example/39_permute/common.hpp | 468 ++++++++++++++++++ example/39_permute/permute_1xHxW_fp16.cpp | 20 + example/39_permute/permute_HxWx4_fp16.cpp | 22 + example/39_permute/permute_NxHxW_fp16.cpp | 20 + .../39_permute/run_permute_bundle_example.inc | 78 +++ .../run_permute_element_example.inc | 65 +++ .../gpu/device/device_base.hpp | 1 + .../gpu/device/device_elementwise.hpp | 34 +- .../gpu/device/device_permute.hpp | 37 ++ .../gpu/device/impl/device_permute_impl.hpp | 282 +++++++++++ .../gpu/grid/gridwise_elementwise_1d.hpp | 4 + .../gpu/grid/gridwise_permute.hpp | 339 +++++++++++++ .../threadwise_tensor_slice_transfer_v3r1.hpp | 1 + include/ck/utility/span.hpp | 67 +++ .../include/ck/library/utility/check_err.hpp | 38 +- library/include/ck/library/utility/fill.hpp | 12 + .../ck/library/utility/host_tensor.hpp | 58 ++- 18 files changed, 1520 insertions(+), 35 deletions(-) create mode 100644 example/39_permute/CMakeLists.txt create mode 100644 example/39_permute/common.hpp create mode 100644 example/39_permute/permute_1xHxW_fp16.cpp create mode 100644 example/39_permute/permute_HxWx4_fp16.cpp create mode 100644 example/39_permute/permute_NxHxW_fp16.cpp create mode 100644 example/39_permute/run_permute_bundle_example.inc create mode 100644 example/39_permute/run_permute_element_example.inc create mode 100644 include/ck/tensor_operation/gpu/device/device_permute.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp create mode 100644 include/ck/utility/span.hpp diff --git a/example/39_permute/CMakeLists.txt b/example/39_permute/CMakeLists.txt new file mode 100644 index 0000000000..573ad7239e --- /dev/null +++ b/example/39_permute/CMakeLists.txt @@ -0,0 +1,9 @@ +add_custom_target(example_permute) + +add_example_executable(example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp) +add_example_executable(example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp) +add_example_executable(example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp) + +add_dependencies(example_permute example_permute_1xHxW_fp16) +add_dependencies(example_permute example_permute_NxHxW_fp16) +add_dependencies(example_permute example_permute_HxWx4_fp16) diff --git a/example/39_permute/common.hpp b/example/39_permute/common.hpp new file mode 100644 index 0000000000..1c26f3d9a6 --- /dev/null +++ b/example/39_permute/common.hpp @@ -0,0 +1,468 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
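The helper header that begins here ultimately defines host_permute(), the CPU reference the examples check the GPU results against. As orientation for readers skimming the diff, the following standalone sketch (illustrative only; the function name and fixed rank-3 shape are assumptions, not part of the patch) shows the index mapping host_permute() implements: the destination shape satisfies out_shape[k] == in_shape[axes[k]], and the source element at (i0, i1, i2) lands at destination position (idx[axes[0]], idx[axes[1]], idx[axes[2]]).

#include <array>
#include <cstddef>
#include <vector>

// Naive rank-3 permute following host_permute()'s convention below.
// 'out' must be pre-sized to in.size(); PassThrough elementwise op assumed.
template <typename T>
void naive_permute3(const std::vector<T>& in,
                    const std::array<std::size_t, 3>& shape, // e.g. [N, H, W]
                    const std::array<std::size_t, 3>& axes,  // e.g. {0, 2, 1}
                    std::vector<T>& out)
{
    const std::array<std::size_t, 3> tshape{shape[axes[0]], shape[axes[1]], shape[axes[2]]};
    for(std::size_t i0 = 0; i0 < shape[0]; ++i0)
        for(std::size_t i1 = 0; i1 < shape[1]; ++i1)
            for(std::size_t i2 = 0; i2 < shape[2]; ++i2)
            {
                const std::array<std::size_t, 3> idx{i0, i1, i2};
                // row-major linearized offsets in source and destination
                const std::size_t src = (i0 * shape[1] + i1) * shape[2] + i2;
                const std::size_t dst =
                    (idx[axes[0]] * tshape[1] + idx[axes[1]]) * tshape[2] + idx[axes[2]];
                out[dst] = in[src];
            }
}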
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" +#include "ck/utility/type.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/fill.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +using F16 = ck::half_t; +using F32 = float; +using F64 = double; + +struct Problem final +{ + static constexpr std::size_t NumDim = 3; + + using Shape = std::array; + using Axes = Shape; + + Problem() = delete; + + explicit Problem(const Shape& default_shape, const Axes& default_axes) + : shape(default_shape), axes(default_axes) + { + } + + Shape shape; + Axes axes; +}; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +namespace detail { + +template +struct enlarge_array_size; + +template +struct enlarge_array_size, Difference> +{ + using type = std::array; +}; + +template +using enlarge_array_size_t = typename enlarge_array_size::type; + +template +struct get_array_size; + +template +struct get_array_size> : std::integral_constant +{ +}; + +template +inline constexpr std::size_t get_array_size_v = get_array_size::value; + +template +struct is_iterator : std::false_type +{ +}; + +template +struct is_iterator()), + decltype(++std::declval>()), + decltype(std::declval>()++)>> + : std::true_type +{ +}; + +template +inline constexpr bool is_iterator_v = is_iterator::value; + +struct Placeholder final +{ + template + constexpr inline operator T() const noexcept; +}; + +template +struct is_output_iterator : std::false_type +{ +}; + +template +struct is_output_iterator< + Iterator, + std::void_t() = std::declval())>> + : std::bool_constant> +{ +}; + +template +inline constexpr bool is_output_iterator_v = is_output_iterator::value; + +template +struct is_bidirectional_iterator : std::false_type +{ +}; + +template +struct is_bidirectional_iterator< + Iterator, + std::void_t>()), + decltype(std::declval>()--)>> + : std::bool_constant> +{ +}; + +template +inline constexpr bool is_bidirectional_iterator_v = is_bidirectional_iterator::value; + +template +struct is_random_access_iterator : std::false_type +{ +}; + +template +struct is_random_access_iterator() + 1), + decltype(std::declval() - 1), + decltype(std::declval()[1])>> + : std::bool_constant> +{ +}; + +template +inline constexpr bool is_random_access_iterator_v = is_random_access_iterator::value; + +template +struct is_range : std::false_type +{ +}; + +template +struct is_range())), + decltype(end(std::declval())), + decltype(begin(std::declval()) != end(std::declval()))>> + : std::bool_constant()))>>> +{ +}; + +template +inline constexpr bool is_range_v = is_range::value; + +template +struct is_sized_range : std::false_type +{ +}; + +template +struct is_sized_range()))>> + : std::bool_constant> +{ +}; + +template +inline constexpr bool is_sized_range_v = is_sized_range::value; + +template +struct is_bidirectional_range : std::false_type +{ +}; + +template +struct is_bidirectional_range> + : std::bool_constant< + is_range_v && + is_bidirectional_iterator_v()))>>> +{ +}; + +template +inline constexpr bool is_bidirectional_range_v = is_bidirectional_range::value; + +template +struct is_random_access_range : std::false_type +{ +}; 
+ +template +struct is_random_access_range> + : std::bool_constant< + is_range_v && + is_random_access_iterator_v()))>>> +{ +}; + +template +inline constexpr bool is_random_access_range_v = is_random_access_range::value; + +template +class to_array_proxy +{ + static_assert(is_range_v); + + public: + explicit to_array_proxy(const Range& source) noexcept : source_(source) {} + + template + operator std::array() const + { + std::array destination; + + std::copy_n(std::begin(source_), + std::min(Size, std::size(source_)), + std::begin(destination)); + + return destination; + } + + private: + const Range& source_; +}; + +} // namespace detail + +template +inline auto to_array(Range& range) noexcept + -> std::enable_if_t, + detail::to_array_proxy>> +{ + return detail::to_array_proxy>{range}; +} + +namespace ranges { +template +inline auto copy(InputRange&& range, OutputIterator iter) + -> decltype(std::copy(std::begin(std::forward(range)), + std::end(std::forward(range)), + iter)) +{ + return std::copy(std::begin(std::forward(range)), + std::end(std::forward(range)), + iter); +} +} // namespace ranges + +template +inline auto is_valid_axes(const Axes& axes) + -> std::enable_if_t, bool> +{ + using std::empty; + if(empty(axes)) + { + return false; + } + + using std::begin, std::end; + std::vector sorted_axes(begin(axes), end(axes)); + + std::sort(begin(sorted_axes), end(sorted_axes)); + const auto last = std::unique(begin(sorted_axes), end(sorted_axes)); + + return (last == end(sorted_axes)) && (*begin(sorted_axes) == 0) && + (*std::prev(last) == size(axes) - 1); +} + +template +inline auto is_valid_shape(const Shape& shape) -> std::enable_if_t, bool> +{ + static_assert(std::is_unsigned_v>); + + using std::begin, std::end; + using std::empty; + return !empty(shape) && std::all_of(begin(shape), end(shape), [](auto dim) { return 0 < dim; }); +} + +template +inline auto is_valid_indices(const Shape& shape, const Indices& indices) + -> std::enable_if_t && detail::is_sized_range_v, bool> +{ + static_assert(std::is_unsigned_v>); + + if(!is_valid_shape(shape)) + { + return false; + } + + using std::empty; + if(empty(indices)) + { + return false; + } + + using std::size; + if(size(shape) != size(indices)) + { + return false; + } + + using std::begin, std::end; + + auto dim = begin(shape); + auto idx = begin(indices); + for(; dim != end(shape) && idx != end(indices); ++dim, ++idx) + { + if(*dim <= *idx) + { + return false; + } + } + + return true; +} + +template +std::array transpose(const std::array& shape, + const std::array& axes) +{ + assert(is_valid_shape(shape) && is_valid_axes(axes)); + + std::array transposed; + auto iter = std::begin(transposed); + for(const auto axis : axes) + { + *iter++ = shape[axis]; + } + + return transposed; +} + +auto extend_shape(const Problem::Shape& shape, std::size_t new_dim) +{ + detail::enlarge_array_size_t extended_shape; + + using std::begin, std::end; + + std::copy(begin(shape), end(shape), begin(extended_shape)); + extended_shape.back() = new_dim; + + return extended_shape; +} + +auto extend_axes(const Problem::Axes& axes) +{ + detail::enlarge_array_size_t extended_axes; + + using std::begin, std::end; + + std::copy(begin(axes), end(axes), begin(extended_axes)); + extended_axes.back() = detail::get_array_size_v; + + return extended_axes; +} + +template +auto advance_indices(const Shape& shape, Indices& indices) -> std::enable_if_t< + detail::is_bidirectional_range_v && detail::is_sized_range_v && + detail::is_bidirectional_range_v && detail::is_sized_range_v, + 
bool> +{ + using std::size; + if(!(is_valid_shape(shape) && is_valid_indices(shape, indices) && size(shape) == size(indices))) + { + return false; + } + + bool carry = true; + + using std::rbegin, std::rend; + auto dim = rbegin(shape); + auto idx = rbegin(indices); + for(; carry && dim != rend(shape) && idx != rend(indices); ++dim, ++idx) + { + *idx = (*idx + carry); + carry = ((*idx == *dim) ? (*idx = 0, true) : false); + } + + return !carry; +} + +template +auto host_permute(const Tensor& src, const Axes& axes, Functor functor, Tensor& dest) + -> std::enable_if_t && detail::is_sized_range_v && + std::is_invocable_v, + std::add_lvalue_reference_t>, + bool> +{ + const auto& shape = src.mDesc.GetLengths(); + const auto& transposed_shape = dest.mDesc.GetLengths(); + if(!(is_valid_shape(shape) && is_valid_shape(transposed_shape))) + { + return false; + } + + using std::size; + if(!is_valid_axes(axes)) + { + return false; + } + + static_assert(detail::is_sized_range_v> && + detail::is_sized_range_v>); + + if(size(shape) != size(transposed_shape)) + { + return false; + } + + static_assert(detail::is_random_access_range_v> && + detail::is_random_access_range_v>); + { + for(std::size_t idx = 0; idx < size(shape); ++idx) + { + if(transposed_shape[idx] != shape[axes[idx]]) + { + return false; + } + } + } + + std::vector indices(size(shape), 0); + if(!is_valid_indices(shape, indices)) + { + return false; + } + + switch(size(shape)) + { + case 3: { + do + { + Dest output = 0; + functor(output, src(indices[0], indices[1], indices[2])); + dest(indices[axes[0]], indices[axes[1]], indices[axes[2]]) = output; + } while(advance_indices(shape, indices)); + } + break; + case 4: { + do + { + Dest output = 0; + functor(output, src(indices[0], indices[1], indices[2], indices[3])); + dest(indices[axes[0]], indices[axes[1]], indices[axes[2]], indices[axes[3]]) = output; + } while(advance_indices(shape, indices)); + } + break; + default: return false; + } + + return true; +} diff --git a/example/39_permute/permute_1xHxW_fp16.cpp b/example/39_permute/permute_1xHxW_fp16.cpp new file mode 100644 index 0000000000..d7f9b80544 --- /dev/null +++ b/example/39_permute/permute_1xHxW_fp16.cpp @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using InDataType = F16; +using OutDataType = F16; + +// clang-format off +using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl +// ######| NumDim| InData| OutData| Elementwise| Block| NPer| HPer| WPer| InBlock| InBlockTransfer| InBlockTransfer| Src| Dst| Src| Dst| +// ######| | Type| Type| Operation| Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector| +// ######| | | | | | | | | | | | | | | | +// ######| | | | | | | | | | | | | | | | + < 3, InDataType, OutDataType, PassThrough, 256, 1, 32, 32, 3, S<1, 32, 8>, S<0, 1, 2>, 2, 1, 2, 1>; +// clang-format on + +#include "run_permute_element_example.inc" + +int main() { return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}); } diff --git a/example/39_permute/permute_HxWx4_fp16.cpp b/example/39_permute/permute_HxWx4_fp16.cpp new file mode 100644 index 0000000000..342aa134ec --- /dev/null +++ b/example/39_permute/permute_HxWx4_fp16.cpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
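The permute_HxWx4_fp16.cpp example that begins here treats each group of four contiguous fp16 values as one 64-bit bundle (DataType = F16, BundleType = F64), so an [H, W, 4] element problem becomes a [1, H, W] bundle problem and the trailing dimension of 4 always moves as a unit. A minimal sketch of the reinterpretation (illustrative only; std::uint16_t stands in for fp16, whose numeric value never matters here):

#include <cstdint>
#include <cstring>

// Four 16-bit payloads occupy exactly one 64-bit slot; the example's
// static_assert(sizeof(BundleType) % sizeof(DataType) == 0) relies on this.
static_assert(4 * sizeof(std::uint16_t) == sizeof(std::uint64_t));

// Bit-copy four fp16 payloads into one bundle; no numeric conversion happens.
inline std::uint64_t load_bundle(const std::uint16_t* four_halves)
{
    std::uint64_t bundle;
    std::memcpy(&bundle, four_halves, sizeof(bundle));
    return bundle;
}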
+ +#include "common.hpp" + +using DataType = F16; +using BundleType = F64; + +static_assert(sizeof(BundleType) % sizeof(DataType) == 0); + +// clang-format off +using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl +// ######| NumDim| InData| OutData| Elementwise| Block| NPer| HPer| WPer| InBlock| InBlockTransfer| InBlockTransfer| Src| Dst| Src| Dst| +// ######| | Type| Type| Operation| Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector| +// ######| | | | | | | | | | | | | | | | +// ######| | | | | | | | | | | | | | | | + < 3, BundleType, BundleType, PassThrough, 256, 1, 32, 32, 5, S<1, 32, 8>, S<0, 1, 2>, 2, 1, 4, 1>; +// clang-format on + +#include "run_permute_bundle_example.inc" + +int main() { return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}); } diff --git a/example/39_permute/permute_NxHxW_fp16.cpp b/example/39_permute/permute_NxHxW_fp16.cpp new file mode 100644 index 0000000000..b53975eb2c --- /dev/null +++ b/example/39_permute/permute_NxHxW_fp16.cpp @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using InDataType = F16; +using OutDataType = F16; + +// clang-format off +using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl +// ######| NumDim| InData| OutData| Elementwise| Block| NPer| HPer| WPer| InBlock| InBlockTransfer| InBlockTransfer| Src| Dst| Src| Dst| +// ######| | Type| Type| Operation| Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector| +// ######| | | | | | | | | | | | | | | | +// ######| | | | | | | | | | | | | | | | + < 3, InDataType, OutDataType, PassThrough, 128, 4, 16, 8, 6, S<2, 16, 4>, S<0, 1, 2>, 2, 1, 2, 1>; +// clang-format on + +#include "run_permute_element_example.inc" + +int main() { return !run_permute_element_example({121, 768, 80}, {0, 2, 1}); } diff --git a/example/39_permute/run_permute_bundle_example.inc b/example/39_permute/run_permute_bundle_example.inc new file mode 100644 index 0000000000..ae23257022 --- /dev/null +++ b/example/39_permute/run_permute_bundle_example.inc @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
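For readers decoding the DevicePermuteImpl instance declarations in the three example sources above, here is the NxHxW_fp16 one re-spelled with each argument labeled inline. The values are exactly those from that example and the labels repeat its column comments; only the alias name is changed so this sketch does not collide with the real one.

using DevicePermuteInstanceAnnotated = ck::tensor_operation::device::DevicePermuteImpl<
    3,            // NumDim
    InDataType,   // InDataType
    OutDataType,  // OutDataType
    PassThrough,  // ElementwiseOperation
    128,          // BlockSize
    4,            // NPerBlock
    16,           // HPerBlock
    8,            // WPerBlock
    6,            // InBlockLdsExtraW
    S<2, 16, 4>,  // InBlockTransferThreadClusterLengths
    S<0, 1, 2>,   // InBlockTransferThreadClusterArrangeOrder
    2,            // SrcVectorDim
    1,            // DstVectorDim
    2,            // SrcScalarPerVector
    1>;           // DstScalarPerVector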
+ +#pragma once + +bool run_permute_bundle(const Problem& problem) +{ + const auto& input_bundle_shape = problem.shape; + const auto& input_bundle_axes = problem.axes; + + const auto output_bundle_shape = transpose(input_bundle_shape, input_bundle_axes); + + Tensor input_bundle_tensor(input_bundle_shape); + Tensor output_bundle_tensor(output_bundle_shape); + + // initialize tensor by assigning DataType values + ck::utils::FillUniformDistribution{-1.f, 1.f}(input_bundle_tensor.AsSpan()); + + DeviceMem input_device_buf(input_bundle_tensor.GetElementSpaceSizeInBytes()); + DeviceMem output_device_buf(output_bundle_tensor.GetElementSpaceSizeInBytes()); + + using std::data; + input_device_buf.ToDevice(data(input_bundle_tensor)); + + static_assert(std::is_default_constructible_v); + + auto permute = DevicePermuteInstance{}; + auto argument = permute.MakeArgument(to_array(input_bundle_shape), + to_array(input_bundle_tensor.GetStrides()), + to_array(output_bundle_shape), + to_array(output_bundle_tensor.GetStrides()), + input_device_buf.GetDeviceBuffer(), + output_device_buf.GetDeviceBuffer(), + PassThrough{}); + + if(!permute.IsSupportedArgument(argument)) + { + std::cerr << "The runtime parameters seems not supported by the device instance, exiting!" + << std::endl; + return false; + }; + + auto invoker = permute.MakeInvoker(); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, true}); + + std::cout << "Perf: " << ave_time << " ms" << std::endl; + + output_device_buf.FromDevice(data(output_bundle_tensor)); + + constexpr std::size_t NumElemsInBundle = sizeof(BundleType) / sizeof(DataType); + + // extend tensor shape from [N, H, W] to [N, H, W, NumElemsInBundle] + // axes from [0, 2, 1] to [0, 2, 1, 3] + const auto input_shape = extend_shape(input_bundle_shape, NumElemsInBundle); + const auto input_axes = extend_axes(input_bundle_axes); + + using std::begin; + + Tensor input_tensor(input_shape); + ranges::copy(input_bundle_tensor.AsSpan(), begin(input_tensor)); + + Tensor output_tensor(transpose(input_shape, input_axes)); + if(!host_permute(input_tensor, input_axes, PassThrough{}, output_tensor)) + { + return false; + } + + return ck::utils::check_err(output_bundle_tensor.AsSpan(), + output_tensor.AsSpan(), + "Error: incorrect results in output tensor", + 1e-6, + 1e-6); +} + +bool run_permute_bundle_example(const Problem::Shape& shape, const Problem::Axes& axes) +{ + return run_permute_bundle(Problem{shape, axes}); +} diff --git a/example/39_permute/run_permute_element_example.inc b/example/39_permute/run_permute_element_example.inc new file mode 100644 index 0000000000..bc62353030 --- /dev/null +++ b/example/39_permute/run_permute_element_example.inc @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
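run_permute_element() below follows the same five-step flow as run_permute_bundle() above. Stripped of tensor setup and verification, the shared skeleton is as follows (a condensed excerpt of calls already present in this patch; error handling elided):

// 1. describe the problem  2. validate  3. build invoker  4. launch  5. read back
auto permute  = DevicePermuteInstance{};
auto argument = permute.MakeArgument(to_array(input_shape),
                                     to_array(input_tensor.GetStrides()),
                                     to_array(output_shape),
                                     to_array(output_tensor.GetStrides()),
                                     input_device_buf.GetDeviceBuffer(),
                                     output_device_buf.GetDeviceBuffer(),
                                     PassThrough{});
if(!permute.IsSupportedArgument(argument))
    return false; // this instance cannot serve the given shape/strides

auto invoker   = permute.MakeInvoker();
float ave_time = invoker.Run(argument, StreamConfig{nullptr, true}); // times the kernel

output_device_buf.FromDevice(data(output_tensor)); // copy the result back to host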
+ +#pragma once + +bool run_permute_element(const Problem& problem) +{ + const auto& input_shape = problem.shape; + const auto& input_axes = problem.axes; + + const auto output_shape = transpose(input_shape, input_axes); + + Tensor input_tensor(input_shape); + Tensor output_tensor(output_shape); + + ck::utils::FillUniformDistribution{-1.f, 1.f}(input_tensor); + + DeviceMem input_device_buf(input_tensor.GetElementSpaceSizeInBytes()); + DeviceMem output_device_buf(output_tensor.GetElementSpaceSizeInBytes()); + + using std::data; + input_device_buf.ToDevice(data(input_tensor)); + + static_assert(std::is_default_constructible_v); + + auto permute = DevicePermuteInstance{}; + auto argument = permute.MakeArgument(to_array(input_shape), + to_array(input_tensor.GetStrides()), + to_array(output_shape), + to_array(output_tensor.GetStrides()), + input_device_buf.GetDeviceBuffer(), + output_device_buf.GetDeviceBuffer(), + PassThrough{}); + + if(!permute.IsSupportedArgument(argument)) + { + std::cerr << "The runtime parameters seems not supported by the device instance, exiting!" + << std::endl; + return false; + }; + + auto invoker = permute.MakeInvoker(); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, true}); + + std::cout << "Perf: " << ave_time << " ms" << std::endl; + + output_device_buf.FromDevice(data(output_tensor)); + + Tensor output_tensor_host(output_shape); + if(!host_permute(input_tensor, input_axes, PassThrough{}, output_tensor_host)) + { + return false; + } + + return ck::utils::check_err(output_tensor.AsSpan(), + output_tensor_host.AsSpan(), + "Error: incorrect results in output tensor", + 1e-6, + 1e-6); +} + +bool run_permute_element_example(const Problem::Shape& shape, const Problem::Axes& axes) +{ + return run_permute_element(Problem{shape, axes}); +} diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp index f41f65d76b..65906bd03c 100644 --- a/include/ck/tensor_operation/gpu/device/device_base.hpp +++ b/include/ck/tensor_operation/gpu/device/device_base.hpp @@ -3,6 +3,7 @@ #pragma once +#include #include #include "ck/stream_config.hpp" diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp index d0bf49f891..8e62880098 100644 --- a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp +++ b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp @@ -222,14 +222,9 @@ struct DeviceElementwise } }; - bool IsSupportedArgument(const BaseArgument* p_arg) override + static bool IsSupportedArgument(const Argument& arg) { - const Argument* pArg = dynamic_cast(p_arg); - - if(pArg == nullptr) - return false; - - if(pArg->lengths_.back() % MPerThread != 0) + if(arg.lengths_.back() % MPerThread != 0) return false; auto IsScalarPerVectorValid = [&](const std::array& lengths, @@ -247,19 +242,40 @@ struct DeviceElementwise bool valid = true; static_for<0, NumInput, 1>{}([&](auto I) { if(!IsScalarPerVectorValid( - pArg->lengths_, pArg->inStridesArray_[I.value], InScalarPerVectorSeq::At(I))) + arg.lengths_, arg.inStridesArray_[I.value], InScalarPerVectorSeq::At(I))) valid = false; }); static_for<0, NumOutput, 1>{}([&](auto I) { if(!IsScalarPerVectorValid( - pArg->lengths_, pArg->outStridesArray_[I.value], OutScalarPerVectorSeq::At(I))) + arg.lengths_, arg.outStridesArray_[I.value], OutScalarPerVectorSeq::At(I))) valid = false; }); return valid; }; + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + 
return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto + MakeArgument(const std::array lengths, + const std::array, NumInput> inStridesArray, + const std::array, NumOutput> outStridesArray, + const std::array in_dev_buffers, + const std::array out_dev_buffers, + ElementwiseOperation elementwise_op) + { + return Argument{lengths, + inStridesArray, + outStridesArray, + in_dev_buffers, + out_dev_buffers, + elementwise_op}; + } + std::unique_ptr MakeArgumentPointer(const std::array lengths, const std::array, NumInput> inStridesArray, diff --git a/include/ck/tensor_operation/gpu/device/device_permute.hpp b/include/ck/tensor_operation/gpu/device/device_permute.hpp new file mode 100644 index 0000000000..baa9144775 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_permute.hpp @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DevicePermute : BaseOperator +{ + using Lengths = std::array; + using Strides = Lengths; + + virtual std::unique_ptr + MakeArgumentPointer(const Lengths& in_lengths, + const Strides& in_strides, + const Lengths& out_lengths, + const Strides& out_strides, + const void* in_dev_buffer, + void* out_dev_buffer, + ElementwiseOperation elementwise_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp new file mode 100644 index 0000000000..7b96373c0f --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
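device_permute.hpp above defines the type-erased interface; device_permute_impl.hpp, beginning here, supplies the concrete template. Client code that sees only the base class would drive it roughly as below. This is a sketch: it assumes the base is parameterized over NumDim and the elementwise operation (as the impl's 'DevicePermute' base type suggests), and 'run_through_base' is a hypothetical helper, not part of the patch.

using Base = ck::tensor_operation::device::DevicePermute<3, PassThrough>;

// Hypothetical driver that knows nothing about the concrete DevicePermuteImpl.
float run_through_base(Base& op,
                       const Base::Lengths& in_lengths,
                       const Base::Strides& in_strides,
                       const Base::Lengths& out_lengths,
                       const Base::Strides& out_strides,
                       const void* in_dev_buffer,
                       void* out_dev_buffer)
{
    auto argument = op.MakeArgumentPointer(in_lengths, in_strides, out_lengths, out_strides,
                                           in_dev_buffer, out_dev_buffer, PassThrough{});
    if(!op.IsSupportedArgument(argument.get()))
        return -1.f; // unsupported problem for this instance

    auto invoker = op.MakeInvokerPointer();
    return invoker->Run(argument.get(), StreamConfig{nullptr, true});
}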
+ +#pragma once + +#include +#include +#include + +#include "ck/utility/math.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/device/device_permute.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_permute.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" + +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Swap last 2 dimensions +// input shape: [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-2], d[NumDim-1]] +// ^^^^^^^^^^^ +// output shape: [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-1], d[NumDim-2]] +// ^^^^^^^^^^^ +template +struct DevicePermuteImpl : DevicePermute +{ + using BaseType = DevicePermute; + using typename BaseType::Lengths; + using typename BaseType::Strides; + + static_assert(3 <= NumDim, "Only accept at least 3D dimension tensor"); + static_assert((NumDim - 2) <= SrcVectorDim && SrcVectorDim < NumDim); + static_assert((NumDim - 2) <= DstVectorDim && DstVectorDim < NumDim); + static_assert(SrcVectorDim != DstVectorDim); + + template + static auto ConvertArrayToTuple(const std::array& array) + { + static_assert(1 <= N && N <= NumDim); + + return generate_tuple([&](auto I) { return array[I]; }, Number{}); + } + + static auto MakeDescriptor_N_H_W(const Lengths& lengths, const Strides& stride) + { + // create nd descriptor, shape: [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-2], + // d[NumDim-1]] + const auto desc = + make_naive_tensor_descriptor(ConvertArrayToTuple(lengths), ConvertArrayToTuple(stride)); + + // merge nd to 3d descriptor, shape: [(d[0] * d[1] * d[2] * ... * d[NumDim-3]), d[NumDim-2], + // d[NumDim-1]] + // => [N, H, W] + const index_t H = *std::next(rbegin(lengths)); + const index_t W = *rbegin(lengths); + const auto desc_n_h_w = transform_tensor_descriptor( + desc, + make_tuple(make_merge_transform(ConvertArrayToTuple(lengths)), + make_pass_through_transform(H), + make_pass_through_transform(W)), + make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number{}), + Sequence{}, + Sequence{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return PadTensorDescriptor( + desc_n_h_w, make_tuple(NPerBlock, HPerBlock, WPerBlock), Sequence{}); + } + + using InGridDesc = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1})); + using OutGridDesc = InGridDesc; + + using GridwisePermute = GridwisePermute< + InGridDesc, + OutGridDesc, + InDataType, + OutDataType, + ElementwiseOperation, + BlockSize, + NPerBlock, + HPerBlock, + WPerBlock, + InBlockLdsExtraW, + InBlockTransferThreadClusterLengths, + InBlockTransferThreadClusterArrangeOrder, + SrcVectorDim - (NumDim - 3), // calculate new SrcVectorDim for the merged descriptor + DstVectorDim - (NumDim - 3), // calculate new DstVectorDim for the merged descriptor + SrcScalarPerVector, + DstScalarPerVector>; + + using Block2TileMap = typename GridwisePermute::DefaultBlock2TileMap; + + struct Argument : public BaseArgument + { + Argument(const Lengths& in_lengths, + const Strides& in_strides, + const Lengths& out_lengths, + const Strides& out_strides, + const void* in_dev_buffer, + void* out_dev_buffer, + ElementwiseOperation elementwise_op) + : in_dev_buffer_(static_cast(in_dev_buffer)), + out_dev_buffer_(static_cast(out_dev_buffer)), + in_grid_desc_(MakeDescriptor_N_H_W(in_lengths, in_strides)), + out_grid_desc_(MakeDescriptor_N_H_W(out_lengths, out_strides)), + 
in_lengths_(in_lengths), + in_strides_(in_strides), + out_lengths_(out_lengths), + out_strides_(out_strides), + elementwise_op_(elementwise_op), + block_2_tile_map_(GridwisePermute::MakeDefaultBlock2TileMap(in_grid_desc_)) + { + } + + const InDataType* in_dev_buffer_; + OutDataType* out_dev_buffer_; + InGridDesc in_grid_desc_; + OutGridDesc out_grid_desc_; + + Lengths in_lengths_; + Strides in_strides_; + Lengths out_lengths_; + Strides out_strides_; + + ElementwiseOperation elementwise_op_; + + Block2TileMap block_2_tile_map_; + }; + + struct Invoker : BaseInvoker + { + static float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const index_t grid_size = arg.block_2_tile_map_.CalculateGridSize(arg.in_grid_desc_); + + const auto kernel = kernel_nd_permute; + + float elapsed_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.in_grid_desc_, + arg.out_grid_desc_, + arg.in_dev_buffer_, + arg.out_dev_buffer_, + arg.elementwise_op_, + arg.block_2_tile_map_); + return elapsed_time; + } + + float Run(const BaseArgument* arg, + const StreamConfig& stream_config = StreamConfig{}) override final + { + const auto* const argument = dynamic_cast(arg); + if(!argument) + { + return NAN; + } + + return Run(*argument, stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + constexpr auto GetPaddedLength = [](index_t length, index_t tile_length) { + return math::integer_divide_ceil(length, tile_length) * tile_length; + }; + + constexpr auto IsScalarPerVectorValid = + [](index_t length, index_t stride, index_t scalar_per_vector) { + if(stride == 1 && length % scalar_per_vector == 0) + { + return true; + } + else if(stride != 1 && scalar_per_vector == 1) + { + return true; + } + + return false; + }; + + return IsScalarPerVectorValid(arg.in_lengths_[SrcVectorDim], + arg.in_strides_[SrcVectorDim], + SrcScalarPerVector) && + IsScalarPerVectorValid( + GetPaddedLength(arg.in_lengths_[SrcVectorDim], + (SrcVectorDim == NumDim - 2 ? HPerBlock : WPerBlock)), + arg.in_strides_[SrcVectorDim], + SrcScalarPerVector) && + IsScalarPerVectorValid(arg.out_lengths_[DstVectorDim], + arg.out_strides_[DstVectorDim], + DstScalarPerVector) && + IsScalarPerVectorValid( + GetPaddedLength(arg.out_lengths_[DstVectorDim], + (DstVectorDim == NumDim - 2 ? HPerBlock : WPerBlock)), + arg.in_strides_[DstVectorDim], + DstScalarPerVector) && + GridwisePermute::CheckValidity(arg.in_grid_desc_, arg.out_grid_desc_); + }; + + // override methods inherited from 'BaseOperator' + bool IsSupportedArgument(const BaseArgument* arg) override final + { + const auto* const argument = dynamic_cast(arg); + if(!argument) + { + return false; + } + + return IsSupportedArgument(*argument); + } + + // override methods inherited from 'DevicePermute' + std::unique_ptr + MakeArgumentPointer(const Lengths& in_lengths, + const Strides& in_strides, + const Lengths& out_lengths, + const Strides& out_strides, + const void* in_dev_buffer, + void* out_dev_buffer, + ElementwiseOperation elementwise_op) override final + { + return std::make_unique(in_lengths, + in_strides, + out_lengths, + out_strides, + in_dev_buffer, + out_dev_buffer, + elementwise_op); + } + + std::unique_ptr MakeInvokerPointer() override final + { + return std::make_unique(); + }; + + // other constructor methods + template + static std::enable_if_t, Argument> + MakeArgument(Args&&... 
args) noexcept(std::is_nothrow_constructible_v) + { + return Argument{std::forward(args)...}; + } + + static std::enable_if_t, Invoker> + MakeInvoker() noexcept(std::is_nothrow_default_constructible_v) + { + return Invoker{}; + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp index 4feb948156..8b82b65540 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp @@ -83,6 +83,8 @@ struct GridwiseElementwise_1D auto in_global_buf_tuple = generate_tuple( [&](auto I) { + static_assert(in_grid_1d_desc_tuple[I].GetNumOfDimension() == 1); + return make_dynamic_buffer( p_in_global_tuple[I], in_grid_1d_desc_tuple[I].GetElementSpaceSize()); }, @@ -90,6 +92,8 @@ struct GridwiseElementwise_1D auto out_global_buf_tuple = generate_tuple( [&](auto I) { + static_assert(out_grid_1d_desc_tuple[I].GetNumOfDimension() == 1); + return make_dynamic_buffer( p_out_global_tuple[I], out_grid_1d_desc_tuple[I].GetElementSpaceSize()); }, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp new file mode 100644 index 0000000000..de1ae91592 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp @@ -0,0 +1,339 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/tensor_description/cluster_descriptor.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_nd_permute(const InGridDesc in_grid_desc, + const OutGridDesc out_grid_desc, + const InDataType* p_in_global, + OutDataType* p_out_global, + const ElementwiseOperation elementwise_op, + const Block2TileMap block_2_tile_map) +{ + __shared__ char p_shared[GridwisePermute::GetSharedMemoryNumberOfByte()]; + + GridwisePermute::Run(in_grid_desc, + out_grid_desc, + p_in_global, + p_out_global, + p_shared, + elementwise_op, + block_2_tile_map); +} + +template +struct GridwisePermute +{ + static_assert(InGridDesc::GetNumOfDimension() == OutGridDesc::GetNumOfDimension()); + static_assert(3 <= InGridDesc::GetNumOfDimension()); + static_assert((InGridDesc::GetNumOfDimension() - 2) <= SrcVectorDim && + SrcVectorDim < InGridDesc::GetNumOfDimension()); + static_assert((OutGridDesc::GetNumOfDimension() - 2) <= DstVectorDim && + DstVectorDim < OutGridDesc::GetNumOfDimension()); + static_assert(SrcVectorDim != DstVectorDim); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + using ThisThreadBlock = ThisThreadBlock; + + struct Block2TileMap + { + static constexpr index_t NumDim = InGridDesc::GetNumOfDimension(); + static_assert(3 <= NumDim); + + static constexpr auto I0 = Number<0>{}; + + Block2TileMap() = delete; + Block2TileMap(const Block2TileMap&) = default; + Block2TileMap(Block2TileMap&&) = delete; + + ~Block2TileMap() = default; + + Block2TileMap& operator=(const Block2TileMap&) = delete; + Block2TileMap& operator=(Block2TileMap&&) = delete; + + 
explicit Block2TileMap(const InGridDesc& desc) : desc_(desc) {} + + __host__ constexpr index_t CalculateGridSize(const InGridDesc& desc) const + { + const auto N0 = + math::integer_divide_ceil(desc.GetLength(Number{}), NPerBlock); + const auto H0 = + math::integer_divide_ceil(desc.GetLength(Number{}), HPerBlock); + const auto W0 = + math::integer_divide_ceil(desc.GetLength(Number{}), WPerBlock); + + const index_t grid_size = N0 * H0 * W0; + + return grid_size; + } + + template + __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const + { + static_assert(TopIdx::Size() == 1); + + auto block_1d_id = idx_top[I0]; + + const auto N0 = + math::integer_divide_ceil(desc_.GetLength(Number{}), NPerBlock); + const auto H0 = + math::integer_divide_ceil(desc_.GetLength(Number{}), HPerBlock); + const auto W0 = + math::integer_divide_ceil(desc_.GetLength(Number{}), WPerBlock); + + block_1d_id = block_1d_id % (N0 * H0 * W0); + + index_t idx_N0 = block_1d_id / (H0 * W0); + index_t idx_H0 = (block_1d_id % (H0 * W0)) / W0; + index_t idx_W0 = block_1d_id % W0; + + return make_tuple(idx_N0, idx_H0, idx_W0); + } + + private: + const InGridDesc desc_; + }; + + using DefaultBlock2TileMap = Block2TileMap; + + // use an [NPerBlock, HPerBlock, WPerBlock] tensor as element-copy relay + __host__ __device__ static constexpr auto GetInBlockDesc_NPerBlock_HPerBlock_WPerBlock() + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, Number{}), + make_tuple(Number{}, + Number{}, + I1)); + } + + // for N-dimension descriptor, reserve its last 2 dimensions, then merge its leading dimensions + // into single one. finally, form a 3D descriptor: [d(0), d(1), ..., d(N - 2), d(N - 1)] -> + // [(d(0) x d(1) x ...), d(N - 2), d(N - 1)] + template + __host__ __device__ static constexpr auto GetMergedDesc(const GridDesc& desc) + { + constexpr index_t NumDim = GridDesc::GetNumOfDimension(); + static_assert(3 <= NumDim); + + const auto merged_desc = transform_tensor_descriptor( + desc, + make_tuple(make_merge_transform(generate_tuple( + [&](auto I) { return desc.GetLength(I); }, Number{})), + make_pass_through_transform(desc.GetLength(Number{})), + make_pass_through_transform(desc.GetLength(Number{}))), + make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number{}), + Sequence{}, + Sequence{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + return merged_desc; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + constexpr auto in_block_desc_nperblock_hperblock_wperblock = + GetInBlockDesc_NPerBlock_HPerBlock_WPerBlock(); + + return in_block_desc_nperblock_hperblock_wperblock.GetElementSpaceSize() * + sizeof(InDataType); + } + + __host__ __device__ static constexpr auto MakeDefaultBlock2TileMap(const InGridDesc& desc) + { + return DefaultBlock2TileMap{desc}; + } + + __host__ __device__ static constexpr bool CheckValidity(const InGridDesc& in_grid_desc, + const OutGridDesc& out_grid_desc) + { + constexpr index_t NumDim = InGridDesc::GetNumOfDimension(); + + // check if we only swap last 2 dimensions + bool valid = true; + static_for<0, NumDim - 2, 1>{}([&](auto I) { + if(valid && in_grid_desc.GetLength(I) != out_grid_desc.GetLength(I)) + { + valid = false; + } + }); + + return valid && + (in_grid_desc.GetLength(Number{}) == + out_grid_desc.GetLength(Number{})) && + (in_grid_desc.GetLength(Number{}) == + out_grid_desc.GetLength(Number{})); + } + + template + __device__ static void Run(const InGridDesc in_grid_desc, + const 
OutGridDesc out_grid_desc, + const InDataType* p_in_global, + OutDataType* p_out_global, + void* __restrict__ p_shared, + const ElementwiseOperation elementwise_op, + const Block2TileMap& block_2_tile_map) + { + auto in_global_buf = make_dynamic_buffer( + p_in_global, in_grid_desc.GetElementSpaceSize()); + + auto out_global_buf = make_dynamic_buffer( + p_out_global, out_grid_desc.GetElementSpaceSize()); + + // each workgroup handles an [NPerBlock, HPerBlock, WPerBLock] slice-transpose problem + const auto block_work_idx = + block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * NPerBlock); + + const index_t h_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * HPerBlock); + + const index_t w_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I2] * WPerBlock); + + // create [NPerBlock, HPerBlock, WPerBLock] shaped LDS buffer + constexpr auto in_block_desc_nperblock_hperblock_wperblock = + GetInBlockDesc_NPerBlock_HPerBlock_WPerBlock(); + + auto in_block_buf = make_dynamic_buffer( + static_cast(p_shared), + in_block_desc_nperblock_hperblock_wperblock.GetElementSpaceSize()); + + using BlockSliceLengths = Sequence; + using InBlockTransferAccessOrder = Sequence<0, 1, 2>; + + constexpr index_t SrcVectorDimAfterMerge = + SrcVectorDim - (InGridDesc::GetNumOfDimension() - 3); + constexpr index_t DstVectorDimAfterMerge = SrcVectorDimAfterMerge; + + using ck::tensor_operation::element_wise::PassThrough; + + // merge input descriptor into [(in_grid_desc.GetLength(0) x in_grid_desc.GetLength(1) x + // ...), in_grid_desc.GetLength(NumDim - 2), in_grid_desc.GetLength(NumDim - 1)] + const auto in_grid_desc_n_h_w = GetMergedDesc(in_grid_desc); + + // a workgroup copies an [NPerBlock, HPerBlock, WPerBlock] slice from global memory to LDS + auto in_global_load = ThreadGroupTensorSliceTransfer_v4r1< + ThisThreadBlock, + ElementwiseOperation, + PassThrough, + InMemoryDataOperationEnum::Set, + BlockSliceLengths, + InBlockTransferThreadClusterLengths, + InBlockTransferThreadClusterArrangeOrder, + InDataType, + InDataType, + decltype(in_grid_desc_n_h_w), + decltype(in_block_desc_nperblock_hperblock_wperblock), + InBlockTransferAccessOrder, + InBlockTransferAccessOrder, + SrcVectorDimAfterMerge, + 2, + SrcScalarPerVector, + 1, + 1, + 1, + true, + true>(in_grid_desc_n_h_w, + make_multi_index( + n_block_data_idx_on_grid, h_block_data_idx_on_grid, w_block_data_idx_on_grid), + PassThrough{}, + in_block_desc_nperblock_hperblock_wperblock, + make_multi_index(0, 0, 0), + PassThrough{}); + + // merge output descriptor into [(out_grid_desc.GetLength(0) x out_grid_desc.GetLength(1) x + // ...), out_grid_desc.GetLength(NumDim - 2), out_grid_desc.GetLength(NumDim - 1)] + const auto out_grid_desc_n_w_h = GetMergedDesc(out_grid_desc); + + // create transposed view of output tensor + const auto out_grid_desc_n_h_w = transform_tensor_descriptor( + out_grid_desc_n_w_h, + make_tuple(make_pass_through_transform(out_grid_desc_n_w_h.GetLength(I0)), + make_pass_through_transform(out_grid_desc_n_w_h.GetLength(I1)), + make_pass_through_transform(out_grid_desc_n_w_h.GetLength(I2))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<2>{}, Sequence<1>{})); + + // a workgroup copies an [NPerBlock, HPerBlock, WPerBlock] slice from LDS to global memory + auto out_global_store = ThreadGroupTensorSliceTransfer_v4r1< + ThisThreadBlock, 
+ ElementwiseOperation, + PassThrough, + InMemoryDataOperationEnum::Set, + BlockSliceLengths, + InBlockTransferThreadClusterLengths, + InBlockTransferThreadClusterArrangeOrder, + InDataType, + OutDataType, + decltype(in_block_desc_nperblock_hperblock_wperblock), + decltype(out_grid_desc_n_h_w), + InBlockTransferAccessOrder, + InBlockTransferAccessOrder, + 2, + DstVectorDimAfterMerge, + 1, + DstScalarPerVector, + 1, + 1, + true, + true>(in_block_desc_nperblock_hperblock_wperblock, + make_multi_index(0, 0, 0), + PassThrough{}, + out_grid_desc_n_h_w, + make_multi_index( + n_block_data_idx_on_grid, h_block_data_idx_on_grid, w_block_data_idx_on_grid), + elementwise_op); + + in_global_load.Run(in_grid_desc_n_h_w, + in_global_buf, + in_block_desc_nperblock_hperblock_wperblock, + in_block_buf, + I0); + + out_global_store.Run(in_block_desc_nperblock_hperblock_wperblock, + in_block_buf, + out_grid_desc_n_h_w, + out_global_buf, + I0); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp index 005f35e909..bb28c194f4 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -6,6 +6,7 @@ #include "ck/utility/common_header.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" #include "ck/tensor/static_tensor.hpp" namespace ck { diff --git a/include/ck/utility/span.hpp b/include/ck/utility/span.hpp new file mode 100644 index 0000000000..1e50121454 --- /dev/null +++ b/include/ck/utility/span.hpp @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
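Stepping back to the GridwisePermute kernel completed above: it is the classic shared-memory transpose, generalized. The first blockwise copy reads an [NPerBlock, HPerBlock, WPerBlock] tile from global memory into LDS in source order; the second writes it out through a transposed view of the output descriptor, so both sides keep vectorized, coalesced access, with InBlockLdsExtraW padding the LDS tile to sidestep bank conflicts. A minimal HIP/CUDA-style sketch of that underlying pattern (illustrative only, not the CK code; a 32x32 thread block and tile are assumed):

// Transpose one [H, W] slice via shared memory; the +1 column padding plays
// the same bank-conflict-avoidance role as InBlockLdsExtraW above.
__global__ void transpose_tile(const float* in, float* out, int H, int W)
{
    __shared__ float tile[32][32 + 1];

    int x = blockIdx.x * 32 + threadIdx.x; // W coordinate (fast dim of 'in')
    int y = blockIdx.y * 32 + threadIdx.y; // H coordinate
    if(x < W && y < H)
        tile[threadIdx.y][threadIdx.x] = in[y * W + x]; // coalesced load
    __syncthreads();

    x = blockIdx.y * 32 + threadIdx.x; // H coordinate (fast dim of 'out')
    y = blockIdx.x * 32 + threadIdx.y; // W coordinate
    if(x < H && y < W)
        out[y * H + x] = tile[threadIdx.x][threadIdx.y]; // coalesced store
}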
+ +#pragma once + +#include +#include +#include + +namespace ck { + +template +class span +{ + public: + using element_type = T; + using value_type = std::remove_cv_t; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using pointer = element_type*; + using const_pointer = const element_type*; + using reference = element_type&; + using const_reference = const element_type&; + using iterator = pointer; + using const_iterator = pointer; + + constexpr span() : span(nullptr, size_type{0}) {} + + constexpr span(pointer first, size_type count) : ptr_(first), size_(count) {} + + constexpr span(pointer first, pointer last) : span(first, last - first) {} + + template + constexpr span(element_type (&arr)[N]) noexcept : span(arr, N) + { + } + + template + constexpr span(std::array& arr) noexcept : span(arr.data(), N) + { + } + + template + constexpr span(const Container& container) : span(container.data(), container.size()) + { + } + + constexpr iterator begin() const noexcept { return ptr_; } + constexpr const_iterator cbegin() const noexcept { return begin(); } + + constexpr iterator end() const noexcept { return begin() + size(); } + constexpr const_iterator cend() const noexcept { return end(); } + + constexpr reference front() const { return *begin(); } + constexpr reference back() const { return *(--end()); } + + constexpr reference operator[](size_type idx) const { return *(begin() + idx); } + constexpr pointer data() const noexcept { return ptr_; } + + constexpr size_type size() const noexcept { return size_; } + + private: + pointer ptr_; + size_type size_; +}; + +} // namespace ck diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp index d116d44be9..3a5cd1da76 100644 --- a/library/include/ck/library/utility/check_err.hpp +++ b/library/include/ck/library/utility/check_err.hpp @@ -15,6 +15,7 @@ #include "ck/ck.hpp" #include "ck/utility/data_type.hpp" +#include "ck/utility/span.hpp" #include "ck/utility/type.hpp" #include "ck/host_utility/io.hpp" @@ -32,7 +33,7 @@ check_err(const std::vector& out, { if(out.size() != ref.size()) { - std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size() + std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size() << std::endl; return false; } @@ -50,7 +51,7 @@ check_err(const std::vector& out, err_count++; if(err_count < 5) { - std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i + std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i << "] != ref[" << i << "]: " << out[i] << " != " << ref[i] << std::endl; } res = false; @@ -58,7 +59,7 @@ check_err(const std::vector& out, } if(!res) { - std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; + std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; } return res; } @@ -73,7 +74,7 @@ check_err(const std::vector& out, { if(out.size() != ref.size()) { - std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size() + std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size() << std::endl; return false; } @@ -94,7 +95,7 @@ check_err(const std::vector& out, err_count++; if(err_count < 5) { - std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i + std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r << std::endl; } 
res = false; @@ -102,22 +103,22 @@ check_err(const std::vector& out, } if(!res) { - std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; + std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; } return res; } template -typename std::enable_if::value, bool>::type -check_err(const std::vector& out, - const std::vector& ref, +typename std::enable_if, bool>::type +check_err(span out, + span ref, const std::string& msg = "Error: Incorrect results!", double rtol = 1e-3, double atol = 1e-3) { if(out.size() != ref.size()) { - std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size() + std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size() << std::endl; return false; } @@ -137,7 +138,7 @@ check_err(const std::vector& out, err_count++; if(err_count < 5) { - std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i + std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r << std::endl; } res = false; @@ -145,11 +146,22 @@ check_err(const std::vector& out, } if(!res) { - std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; + std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; } return res; } +template +typename std::enable_if::value, bool>::type +check_err(const std::vector& out, + const std::vector& ref, + const std::string& msg = "Error: Incorrect results!", + double rtol = 1e-3, + double atol = 1e-3) +{ + return check_err(span{out}, span{ref}, msg, rtol, atol); +} + template std::enable_if_t<(std::is_integral_v && !std::is_same_v) #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 @@ -194,7 +206,7 @@ check_err(const std::vector& out, } if(!res) { - std::cout << "max err: " << max_err << std::endl; + std::cerr << "max err: " << max_err << std::endl; } return res; } diff --git a/library/include/ck/library/utility/fill.hpp b/library/include/ck/library/utility/fill.hpp index 6a76442779..d717738dc4 100644 --- a/library/include/ck/library/utility/fill.hpp +++ b/library/include/ck/library/utility/fill.hpp @@ -5,7 +5,10 @@ #include #include +#include #include +#include +#include #include "ck/utility/data_type.hpp" @@ -25,6 +28,15 @@ struct FillUniformDistribution std::uniform_real_distribution dis(a_, b_); std::generate(first, last, [&dis, &gen]() { return ck::type_convert(dis(gen)); }); } + + template + auto operator()(ForwardRange&& range) -> std::void_t()(std::begin(std::forward(range)), + std::end(std::forward(range))))> + { + (*this)(std::begin(std::forward(range)), + std::end(std::forward(range))); + } }; // Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below. diff --git a/library/include/ck/library/utility/host_tensor.hpp b/library/include/ck/library/utility/host_tensor.hpp index c85c37aabd..5ca34266a1 100644 --- a/library/include/ck/library/utility/host_tensor.hpp +++ b/library/include/ck/library/utility/host_tensor.hpp @@ -3,15 +3,16 @@ #pragma once -#include -#include -#include #include -#include #include #include +#include +#include +#include +#include #include "ck/utility/data_type.hpp" +#include "ck/utility/span.hpp" template std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim) @@ -235,6 +236,9 @@ auto make_ParallelTensorFunctor(F f, Xs... 
xs) template struct Tensor { + using Descriptor = HostTensorDescriptor; + using Data = std::vector; + template Tensor(std::initializer_list lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize()) { @@ -251,7 +255,7 @@ struct Tensor { } - Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpaceSize()) {} + Tensor(const Descriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpaceSize()) {} template Tensor CopyAsType() const @@ -278,9 +282,9 @@ struct Tensor { } - const std::vector& GetLengths() const { return mDesc.GetLengths(); } + decltype(auto) GetLengths() const { return mDesc.GetLengths(); } - const std::vector& GetStrides() const { return mDesc.GetStrides(); } + decltype(auto) GetStrides() const { return mDesc.GetStrides(); } std::size_t GetNumOfDimension() const { return mDesc.GetNumOfDimension(); } @@ -288,6 +292,8 @@ struct Tensor std::size_t GetElementSpaceSize() const { return mDesc.GetElementSpaceSize(); } + std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); } + void SetZero() { for(auto& v : mData) @@ -425,14 +431,40 @@ struct Tensor return mData[mDesc.GetOffsetFromMultiIndex(idx)]; } - typename std::vector::iterator begin() { return mData.begin(); } + typename Data::iterator begin() { return mData.begin(); } - typename std::vector::iterator end() { return mData.end(); } + typename Data::iterator end() { return mData.end(); } - typename std::vector::const_iterator begin() const { return mData.begin(); } + typename Data::pointer data() { return mData.data(); } - typename std::vector::const_iterator end() const { return mData.end(); } + typename Data::const_iterator begin() const { return mData.begin(); } - HostTensorDescriptor mDesc; - std::vector mData; + typename Data::const_iterator end() const { return mData.end(); } + + typename Data::const_pointer data() const { return mData.data(); } + + typename Data::size_type size() const { return mData.size(); } + + template + auto AsSpan() const + { + constexpr std::size_t FromSize = sizeof(T); + constexpr std::size_t ToSize = sizeof(U); + + using Element = std::add_const_t>; + return ck::span{reinterpret_cast(data()), size() * FromSize / ToSize}; + } + + template + auto AsSpan() + { + constexpr std::size_t FromSize = sizeof(T); + constexpr std::size_t ToSize = sizeof(U); + + using Element = std::remove_reference_t; + return ck::span{reinterpret_cast(data()), size() * FromSize / ToSize}; + } + + Descriptor mDesc; + Data mData; };
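Tying the host_tensor.hpp additions back to the examples: AsSpan<U>() is a reinterpreting view, scaling the element count by sizeof(T) / sizeof(U), which is what lets the bundle example compare a Tensor of F64 bundles against an fp16 reference element by element. A short usage sketch, assuming DataType = F16 and BundleType = F64 as in permute_HxWx4_fp16.cpp ('reference_tensor' is a hypothetical host-side reference of the same byte size):

Tensor<BundleType> output_bundle_tensor(output_bundle_shape); // stores F64 bundles
// ... run the device permute and copy the result back ...

// View the same bytes as fp16 elements: size() grows by
// sizeof(BundleType) / sizeof(DataType) == 4. (needs <cassert>)
ck::span<const DataType> elements = output_bundle_tensor.AsSpan<const DataType>();
assert(elements.size() == output_bundle_tensor.size() * 4);

// check_err() accepts the span overload directly, as in the bundle example.
ck::utils::check_err(elements,
                     reference_tensor.AsSpan<const DataType>(),
                     "Error: incorrect results in output tensor",
                     1e-6,
                     1e-6);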