mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
Add 'Permute' device op & example (#408)
* Add example folder for 'DeviceElementwise'
* Re-structure example files
* Move common parts into common.hpp
* Use more strict input
* Add more helper methods in 'DeviceElementwise'
* Use more specific method to write example
* Allow specify problem through command line argument
* Allow specify problem 'axes' through command line argument
* Add check to template type argument
* Add transpose_shape() to generalize shape permute
* Generalize transpose utility functions
* Use better name for tensor indices
* Add checks in helper functions
* Remove debug messages
* Refine error message for check_err()
* Generalize variable naming in example code
* Add device op 'DevicePermute'
This device op is clone of 'DeviceElementwise'
* Use 'DevicePermute' device op in example
* Remove 'elementwise' from identifiers
* Remove 'elementwise' from file paths
* Remove base class of 'DevicePermute'
* Let 'DevicePermute' inherit from 'BaseOperator'
* Add simple type traits to validate device op type
* Add static_assert() to check type constraints
* Create 'DevicePermuteBase' to generate methods
* Use indirect base type to generate methods
* Remove 'is_device_op<>' type traits
* Only accept single-input-single-output for 'DevicePermute'
* Simplify 'DevicePermute' interface
* Re-format 'DeviceElementwise'
* Use CRTP to generate overridden virtual method
* Remove unnecessary include directives
* Distinguish input & output shape in 'DevicePermute'
* Passing 'axes' to 'DevicePermute'
* Use more reasonable return value for Invoker::Run()
* Add 'GridwisePermute' kernel
This kernel is a clone of 'GridwiseElementwise_1D'
* Remove no-longer used type argument
* Check if input/output shape meet the requirement
* Remove no-longer used method
* Remove never-entered-if-clause
* Change problem description for 'DevicePermute'
* Transform descriptor into 3 dimensions
* Add debug code to verify the result
* Add comment to indicate template argument location
* Add N/H/WPerBlock template parameter to 'DevicePermute'
* Rename 'GridwisePermute' to 'GridwiseCopy'
* Check tensor descriptor dimensions in 'GridwiseElementwise_1D'
* Add missing include directive
* Add 'BlockSize' parameter to 'DevicePermute'
* Remove no-longer used method
* Add 'BlockToTileMap' for 'GridwiseCopy'
* Use the normal Block2TileMap convention
* Rename 'BlockToTileMap' as 'Block2TileMap'
* Fix most of compilation errors
* Let 'Block2TileMap' map block to 2d coordinate
* Allow data transfer in 'GridwiseCopy'
* Fix wrong output descriptor for 2nd blockwise copy
* Rename 'GridwiseCopy' as 'GridwisePermute'
* Remove '1d' in identifiers
* Remove commented-out codes
* Remove 'MPerThread' template parameter
* Separate template parameters
* Unify variable naming convention
* Use more verbose way to create expressions
* Add template parameter 'InBlockLdsExtraW'
* Release the constraint on In/OutGridDesc
* Use date type directly as template argument
* Re-arrange template arguments for blockwise copy
* Remove no-longer used template parameters
* Embed layout in the variable names
* Add GridwisePermute::CheckValidity()
* Extract local types as template parameters
* Rename local type alias
* Add more template parameters (vector width related)
* Calculate new SrcVectorDim/DstVectorDim after merge descriptor dimensions
* Fill tensor values start from 1
* Re-format example code
* Avoid too-large block id
* Add comment
* Make sure 'SrcVectorDim' is not same as 'DstVectorDim'
* Add check for the 'VectorDim' & 'ScalarPerVector' template params
* Let 'DstVectorDim' equals 'SrcVectorDim' after transpose out grid desc
* Remove no-longer used template parameter 'NPerBlock'
* Fix wrong descriptor creation logics
* Specify problem in each examples
* Use better example name
* Add new example 'example_permute_NxHxW_fp32'
* Add example for demonstrating bundle multiple elems in tensor
* Add support to permute multiple elements together
* Change the default problem size
* Add span<> class template
* Use span<> to generalize check_err() interface
* Fix ambiguous ctor call
* Avoid creating unnecessary objects
* Use helper functions to simplify example code
* Add example for 4xfp16 permute
* Disable failed-to-compile example
* Add check for the NUM_ELEMS_IN_BUNDLE
* Remove redundant parameter in helper lambda function
* Add check for the input tensor type's byte-size
* Check scalar-per-vector with padded length
* Use more verbose name to avoid name collision
* Use fixed 'VectorDim' & 'ScalarPerVector' for LDS
* Embed shape info in name of descriptor constructor
* Rename example folder '36_permute' into '37_permute'
* Avoid using too-large LDS in kernel code
* Remove redundant example
* Use switch() to group similar codes
* Add const to the span<> type argument
* Simply initialize tensor with floating point values
* Use fp16 as data type in all examples
* Enlarge tensor size in example
* Enlarge N-dim in example
* Add check for the bundled type in example
* Use a stricter error threshold
* Remove global load/store loop in kernel code
* Measure execution time by default
* Use faster device op config for example 'NxHxW_fp16'
* Use faster device op config for example '1xHxW_fp16'
* Use faster device op config for example 'HxWx4_fp16'
* Remove cmd arg parsing logics
* Rename functions
* Extract bundle permutation logic out
* Simplify permute bundle example
* Add Tensor<>::GetElementSpaceSizeInBytes()
* Add Tensor<>::data()
* Use new methods to simplify code
* Use type alias to replace duplicated code
* Use existing method to shorten code
* Allow FillUniformDistribution to accept a range argument
* Initialize random values in range
* Add Tensor<>::size()
* Use more meaningful names in permute bundle example
* Use more meaningful names in permute element examples
* Use rangified copy() to copy elements
* Use function return value directly to eliminate variables
* Add to_array() conversion tool to eliminate more variables
* Add Tensor<>::AsSpan<>() to create view of tensor values
* Use AsSpan() to shorten check_err() calls
* Remove no-longer-used 'using' directives
* Move 'using' directive to proper code position
* Remove redundant variables
* Remove useless static_assert()
* Add check for range types
* Declare variable right before first use
* Move long return type as tailing return type
* Add BaseInvokerCRTP<> class template to generate method
* Create new base type for 'DevicePermute' implementations
* Move 'NumDim' template param to the first
* Rename 'DevicePermute' to 'DevicePermuteImpl'
* Add 'noexcept' specifier to CRTP generated method
* Move 'Block2TileMap' definition into 'GridwisePermute'
* Use type alias to reduce code
* Unify naming style in 'DevicePermute'
* Add comments in 'GridwisePermute'
* Rename permute example folder
* Use std::cerr to report error
* Use larger shape in examples
* Rename '38_permute' to '39_permute'
* Make sure we use unsigned type for shape & indices
* Remove opt-ed out assertion
* Remove template BaseInvokerCRTP<>
[ROCm/composable_kernel commit: f584ab0c54]
This commit is contained in:
9
example/39_permute/CMakeLists.txt
Normal file
9
example/39_permute/CMakeLists.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
# Umbrella target so `make example_permute` builds all permute examples.
add_custom_target(example_permute)

# add_example_executable() is a project-provided helper — presumably it creates
# an executable target and registers it with the example infrastructure; see the
# top-level CMake modules to confirm.
add_example_executable(example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp)
add_example_executable(example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp)
add_example_executable(example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp)

# Hook every example executable onto the umbrella target.
add_dependencies(example_permute example_permute_1xHxW_fp16)
add_dependencies(example_permute example_permute_NxHxW_fp16)
add_dependencies(example_permute example_permute_HxWx4_fp16)
468
example/39_permute/common.hpp
Normal file
468
example/39_permute/common.hpp
Normal file
@@ -0,0 +1,468 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <iterator>
#include <numeric>
#include <type_traits>
#include <utility>
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
|
||||
#include "ck/utility/type.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/fill.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
|
||||
// Short names for the element data types used throughout the examples.
using F16 = ck::half_t; // 16-bit half-precision float (project type)
using F32 = float;
using F64 = double;
||||
/// Description of a permute problem: a 3-D tensor shape plus the axis
/// permutation to apply. Immutable rank (NumDim == 3); values are public.
struct Problem final
{
    static constexpr std::size_t NumDim = 3;

    using Shape = std::array<std::size_t, NumDim>;
    using Axes  = Shape; // axes are indices into Shape, same arity

    // A Problem without a shape/axes pair is meaningless.
    Problem() = delete;

    /// Construct from an explicit shape and axis permutation.
    explicit Problem(const Shape& shape_, const Axes& axes_) : shape(shape_), axes(axes_) {}

    Shape shape;
    Axes axes;
};
|
||||
|
||||
// Shorthand for compile-time integer sequences used in the device-op
// template configuration tables below.
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

// Elementwise operation that forwards values unchanged; permute only
// rearranges data, it does not transform it.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
||||
namespace detail {
|
||||
|
||||
template <typename Array, std::size_t Difference>
|
||||
struct enlarge_array_size;
|
||||
|
||||
template <typename T, std::size_t Size, std::size_t Difference>
|
||||
struct enlarge_array_size<std::array<T, Size>, Difference>
|
||||
{
|
||||
using type = std::array<T, Size + Difference>;
|
||||
};
|
||||
|
||||
template <typename Array, std::size_t Difference>
|
||||
using enlarge_array_size_t = typename enlarge_array_size<Array, Difference>::type;
|
||||
|
||||
template <typename Array>
|
||||
struct get_array_size;
|
||||
|
||||
template <typename T, std::size_t Size>
|
||||
struct get_array_size<std::array<T, Size>> : std::integral_constant<std::size_t, Size>
|
||||
{
|
||||
};
|
||||
|
||||
template <typename Array>
|
||||
inline constexpr std::size_t get_array_size_v = get_array_size<Array>::value;
|
||||
|
||||
template <typename T, typename = void>
|
||||
struct is_iterator : std::false_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct is_iterator<T,
|
||||
std::void_t<decltype(*std::declval<T>()),
|
||||
decltype(++std::declval<std::add_lvalue_reference_t<T>>()),
|
||||
decltype(std::declval<std::add_lvalue_reference_t<T>>()++)>>
|
||||
: std::true_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
inline constexpr bool is_iterator_v = is_iterator<T>::value;
|
||||
|
||||
struct Placeholder final
|
||||
{
|
||||
template <typename T>
|
||||
constexpr inline operator T() const noexcept;
|
||||
};
|
||||
|
||||
template <typename Iterator, typename = void>
|
||||
struct is_output_iterator : std::false_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename Iterator>
|
||||
struct is_output_iterator<
|
||||
Iterator,
|
||||
std::void_t<decltype(*std::declval<Iterator>() = std::declval<Placeholder>())>>
|
||||
: std::bool_constant<is_iterator_v<Iterator>>
|
||||
{
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
inline constexpr bool is_output_iterator_v = is_output_iterator<T>::value;
|
||||
|
||||
template <typename Iterator, typename = void>
|
||||
struct is_bidirectional_iterator : std::false_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename Iterator>
|
||||
struct is_bidirectional_iterator<
|
||||
Iterator,
|
||||
std::void_t<decltype(--std::declval<std::add_lvalue_reference_t<Iterator>>()),
|
||||
decltype(std::declval<std::add_lvalue_reference_t<Iterator>>()--)>>
|
||||
: std::bool_constant<is_iterator_v<Iterator>>
|
||||
{
|
||||
};
|
||||
|
||||
template <typename Iterator>
|
||||
inline constexpr bool is_bidirectional_iterator_v = is_bidirectional_iterator<Iterator>::value;
|
||||
|
||||
template <typename Iterator, typename = void>
|
||||
struct is_random_access_iterator : std::false_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename Iterator>
|
||||
struct is_random_access_iterator<Iterator,
|
||||
std::void_t<decltype(std::declval<Iterator>() + 1),
|
||||
decltype(std::declval<Iterator>() - 1),
|
||||
decltype(std::declval<Iterator>()[1])>>
|
||||
: std::bool_constant<is_iterator_v<Iterator>>
|
||||
{
|
||||
};
|
||||
|
||||
template <typename Iterator>
|
||||
inline constexpr bool is_random_access_iterator_v = is_random_access_iterator<Iterator>::value;
|
||||
|
||||
template <typename T, typename = void>
|
||||
struct is_range : std::false_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct is_range<T,
|
||||
std::void_t<decltype(begin(std::declval<T>())),
|
||||
decltype(end(std::declval<T>())),
|
||||
decltype(begin(std::declval<T>()) != end(std::declval<T>()))>>
|
||||
: std::bool_constant<is_iterator_v<ck::remove_cvref_t<decltype(begin(std::declval<T>()))>>>
|
||||
{
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
inline constexpr bool is_range_v = is_range<T>::value;
|
||||
|
||||
template <typename Range, typename = void>
|
||||
struct is_sized_range : std::false_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename Range>
|
||||
struct is_sized_range<Range, std::void_t<decltype(size(std::declval<Range>()))>>
|
||||
: std::bool_constant<is_range_v<Range>>
|
||||
{
|
||||
};
|
||||
|
||||
template <typename Range>
|
||||
inline constexpr bool is_sized_range_v = is_sized_range<Range>::value;
|
||||
|
||||
template <typename Range, typename = void>
|
||||
struct is_bidirectional_range : std::false_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename Range>
|
||||
struct is_bidirectional_range<Range, std::void_t<>>
|
||||
: std::bool_constant<
|
||||
is_range_v<Range> &&
|
||||
is_bidirectional_iterator_v<ck::remove_cvref_t<decltype(begin(std::declval<Range>()))>>>
|
||||
{
|
||||
};
|
||||
|
||||
template <typename Range>
|
||||
inline constexpr bool is_bidirectional_range_v = is_bidirectional_range<Range>::value;
|
||||
|
||||
template <typename Range, typename = void>
|
||||
struct is_random_access_range : std::false_type
|
||||
{
|
||||
};
|
||||
|
||||
template <typename Range>
|
||||
struct is_random_access_range<Range, std::void_t<>>
|
||||
: std::bool_constant<
|
||||
is_range_v<Range> &&
|
||||
is_random_access_iterator_v<ck::remove_cvref_t<decltype(begin(std::declval<Range>()))>>>
|
||||
{
|
||||
};
|
||||
|
||||
template <typename Range>
|
||||
inline constexpr bool is_random_access_range_v = is_random_access_range<Range>::value;
|
||||
|
||||
template <typename Range>
|
||||
class to_array_proxy
|
||||
{
|
||||
static_assert(is_range_v<Range>);
|
||||
|
||||
public:
|
||||
explicit to_array_proxy(const Range& source) noexcept : source_(source) {}
|
||||
|
||||
template <typename T, std::size_t Size>
|
||||
operator std::array<T, Size>() const
|
||||
{
|
||||
std::array<T, Size> destination;
|
||||
|
||||
std::copy_n(std::begin(source_),
|
||||
std::min<std::size_t>(Size, std::size(source_)),
|
||||
std::begin(destination));
|
||||
|
||||
return destination;
|
||||
}
|
||||
|
||||
private:
|
||||
const Range& source_;
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Creates a conversion proxy for 'range' that can materialize as any
/// std::array<T, Size> (see detail::to_array_proxy). Lets callers pass
/// project containers where the device op expects std::array arguments.
/// The proxy references 'range'; it must be consumed before 'range' dies.
template <typename Range>
inline auto to_array(Range& range) noexcept
    -> std::enable_if_t<detail::is_range_v<Range>,
                        detail::to_array_proxy<ck::remove_cvref_t<Range>>>
{
    return detail::to_array_proxy<ck::remove_cvref_t<Range>>{range};
}
|
||||
|
||||
namespace ranges {
/// Rangified std::copy: copies all elements of 'range' to 'iter'.
/// The trailing decltype return type doubles as SFINAE — the overload only
/// exists when std::copy is well-formed for the given range/iterator pair.
/// Returns the output iterator one past the last written element.
template <typename InputRange, typename OutputIterator>
inline auto copy(InputRange&& range, OutputIterator iter)
    -> decltype(std::copy(std::begin(std::forward<InputRange>(range)),
                          std::end(std::forward<InputRange>(range)),
                          iter))
{
    return std::copy(std::begin(std::forward<InputRange>(range)),
                     std::end(std::forward<InputRange>(range)),
                     iter);
}
} // namespace ranges
|
||||
|
||||
template <typename Axes>
|
||||
inline auto is_valid_axes(const Axes& axes)
|
||||
-> std::enable_if_t<detail::is_random_access_range_v<Axes>, bool>
|
||||
{
|
||||
using std::empty;
|
||||
if(empty(axes))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
using std::begin, std::end;
|
||||
std::vector<std::size_t> sorted_axes(begin(axes), end(axes));
|
||||
|
||||
std::sort(begin(sorted_axes), end(sorted_axes));
|
||||
const auto last = std::unique(begin(sorted_axes), end(sorted_axes));
|
||||
|
||||
return (last == end(sorted_axes)) && (*begin(sorted_axes) == 0) &&
|
||||
(*std::prev(last) == size(axes) - 1);
|
||||
}
|
||||
|
||||
template <typename Shape>
|
||||
inline auto is_valid_shape(const Shape& shape) -> std::enable_if_t<detail::is_range_v<Shape>, bool>
|
||||
{
|
||||
static_assert(std::is_unsigned_v<ck::remove_cvref_t<decltype(*std::begin(shape))>>);
|
||||
|
||||
using std::begin, std::end;
|
||||
using std::empty;
|
||||
return !empty(shape) && std::all_of(begin(shape), end(shape), [](auto dim) { return 0 < dim; });
|
||||
}
|
||||
|
||||
/// Returns true when 'indices' is a valid coordinate into a tensor of
/// 'shape': shape is valid, both ranges are non-empty, have equal rank, and
/// each index is strictly less than its corresponding extent.
template <typename Shape, typename Indices>
inline auto is_valid_indices(const Shape& shape, const Indices& indices)
    -> std::enable_if_t<detail::is_sized_range_v<Shape> && detail::is_sized_range_v<Indices>, bool>
{
    static_assert(std::is_unsigned_v<ck::remove_cvref_t<decltype(*std::begin(indices))>>);

    // A coordinate into an invalid shape is meaningless.
    if(!is_valid_shape(shape))
    {
        return false;
    }

    using std::empty;
    if(empty(indices))
    {
        return false;
    }

    // Rank of the coordinate must match the rank of the shape.
    using std::size;
    if(size(shape) != size(indices))
    {
        return false;
    }

    using std::begin, std::end;

    // Pairwise bounds check: every index must satisfy idx < dim.
    auto dim = begin(shape);
    auto idx = begin(indices);
    for(; dim != end(shape) && idx != end(indices); ++dim, ++idx)
    {
        if(*dim <= *idx)
        {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
/// Applies the axis permutation 'axes' to 'shape': result[i] = shape[axes[i]].
/// Precondition (debug-asserted): shape is valid and axes is a permutation
/// of 0..Size-1.
template <std::size_t Size>
std::array<std::size_t, Size> transpose(const std::array<std::size_t, Size>& shape,
                                        const std::array<std::size_t, Size>& axes)
{
    assert(is_valid_shape(shape) && is_valid_axes(axes));

    std::array<std::size_t, Size> transposed;
    std::transform(std::begin(axes),
                   std::end(axes),
                   std::begin(transposed),
                   [&shape](auto axis) { return shape[axis]; });

    return transposed;
}
|
||||
|
||||
/// Appends one extra dimension of extent 'new_dim' to 'shape', turning a
/// rank-NumDim shape into rank-(NumDim+1). Used by the bundle example to
/// view a tensor of bundles as a tensor of scalars: [N, H, W] -> [N, H, W, k].
auto extend_shape(const Problem::Shape& shape, std::size_t new_dim)
{
    detail::enlarge_array_size_t<Problem::Shape, 1> extended_shape;

    using std::begin, std::end;

    // Copy the original extents, then place the new innermost extent last.
    std::copy(begin(shape), end(shape), begin(extended_shape));
    extended_shape.back() = new_dim;

    return extended_shape;
}
|
||||
|
||||
/// Appends the identity axis for the new innermost dimension created by
/// extend_shape(): e.g. [0, 2, 1] -> [0, 2, 1, 3]. The appended value is
/// NumDim (the old rank), i.e. the new dimension stays in place.
auto extend_axes(const Problem::Axes& axes)
{
    detail::enlarge_array_size_t<Problem::Axes, 1> extended_axes;

    using std::begin, std::end;

    std::copy(begin(axes), end(axes), begin(extended_axes));
    // get_array_size_v<Problem::Axes> == NumDim == index of the new last axis.
    extended_axes.back() = detail::get_array_size_v<Problem::Axes>;

    return extended_axes;
}
|
||||
|
||||
/// Advances a multi-dimensional coordinate to the next position in row-major
/// order (odometer-style: rightmost index increments first, carrying left).
/// Returns true when 'indices' now holds a valid next coordinate; returns
/// false when the carry propagates past the leftmost dimension (iteration
/// finished — indices have wrapped back to all zeros) or when the inputs are
/// invalid. Note: validity is re-checked on every call, which is O(rank)
/// extra work per step — presumably acceptable for host-side verification.
template <typename Shape, typename Indices>
auto advance_indices(const Shape& shape, Indices& indices) -> std::enable_if_t<
    detail::is_bidirectional_range_v<Shape> && detail::is_sized_range_v<Shape> &&
        detail::is_bidirectional_range_v<Indices> && detail::is_sized_range_v<Indices>,
    bool>
{
    using std::size;
    if(!(is_valid_shape(shape) && is_valid_indices(shape, indices) && size(shape) == size(indices)))
    {
        return false;
    }

    bool carry = true;

    // Walk dimensions from innermost (rightmost) to outermost, propagating
    // the carry like a mixed-radix counter.
    using std::rbegin, std::rend;
    auto dim = rbegin(shape);
    auto idx = rbegin(indices);
    for(; carry && dim != rend(shape) && idx != rend(indices); ++dim, ++idx)
    {
        // bool carry converts to 0/1 here.
        *idx  = (*idx + carry);
        // On overflow of this digit: reset it to 0 and keep carrying.
        carry = ((*idx == *dim) ? (*idx = 0, true) : false);
    }

    // A surviving carry means we wrapped past the last coordinate.
    return !carry;
}
|
||||
|
||||
/// Reference (host-side) permute used to verify the device result.
/// Applies 'functor' elementwise while scattering src into dest according to
/// 'axes': dest[i[axes[0]], i[axes[1]], ...] = functor(src[i[0], i[1], ...]).
/// Supports rank 3 and rank 4 only; returns false for any other rank or for
/// inconsistent shapes/axes. 'functor' writes its result through its first
/// (output-reference) argument, matching the device elementwise-op signature.
template <typename Src, typename Axes, typename Functor, typename Dest>
auto host_permute(const Tensor<Src>& src, const Axes& axes, Functor functor, Tensor<Dest>& dest)
    -> std::enable_if_t<detail::is_random_access_range_v<Axes> && detail::is_sized_range_v<Axes> &&
                            std::is_invocable_v<Functor,
                                                std::add_lvalue_reference_t<Dest>,
                                                std::add_lvalue_reference_t<Src>>,
                        bool>
{
    const auto& shape            = src.mDesc.GetLengths();
    const auto& transposed_shape = dest.mDesc.GetLengths();
    if(!(is_valid_shape(shape) && is_valid_shape(transposed_shape)))
    {
        return false;
    }

    using std::size;
    if(!is_valid_axes(axes))
    {
        return false;
    }

    static_assert(detail::is_sized_range_v<ck::remove_cvref_t<decltype(shape)>> &&
                  detail::is_sized_range_v<ck::remove_cvref_t<decltype(transposed_shape)>>);

    // Source and destination must have the same rank.
    if(size(shape) != size(transposed_shape))
    {
        return false;
    }

    static_assert(detail::is_random_access_range_v<ck::remove_cvref_t<decltype(shape)>> &&
                  detail::is_random_access_range_v<ck::remove_cvref_t<decltype(transposed_shape)>>);
    {
        // Destination shape must be exactly the permuted source shape.
        for(std::size_t idx = 0; idx < size(shape); ++idx)
        {
            if(transposed_shape[idx] != shape[axes[idx]])
            {
                return false;
            }
        }
    }

    // Start iteration at the origin coordinate (all zeros).
    std::vector<std::size_t> indices(size(shape), 0);
    if(!is_valid_indices(shape, indices))
    {
        return false;
    }

    // Rank-specific loops: Tensor's operator() takes one argument per
    // dimension, so each supported rank needs its own call expression.
    switch(size(shape))
    {
    case 3: {
        do
        {
            Dest output = 0;
            functor(output, src(indices[0], indices[1], indices[2]));
            dest(indices[axes[0]], indices[axes[1]], indices[axes[2]]) = output;
        } while(advance_indices(shape, indices));
    }
    break;
    case 4: {
        do
        {
            Dest output = 0;
            functor(output, src(indices[0], indices[1], indices[2], indices[3]));
            dest(indices[axes[0]], indices[axes[1]], indices[axes[2]], indices[axes[3]]) = output;
        } while(advance_indices(shape, indices));
    }
    break;
    default: return false;
    }

    return true;
}
|
||||
20
example/39_permute/permute_1xHxW_fp16.cpp
Normal file
20
example/39_permute/permute_1xHxW_fp16.cpp
Normal file
@@ -0,0 +1,20 @@
|
||||
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

// Example: permute a [1, H, W] fp16 tensor with axes (0, 2, 1), i.e. a plain
// 2-D transpose of the H/W plane.

#include "common.hpp"

using InDataType = F16;
using OutDataType = F16;

// Device-op instance configuration; the comment table names each template
// argument column-by-column.
// clang-format off
using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
// ######| NumDim| InData| OutData| Elementwise| Block| NPer| HPer| WPer| InBlock| InBlockTransfer| InBlockTransfer| Src| Dst| Src| Dst|
// ######| | Type| Type| Operation| Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector|
// ######| | | | | | | | | | | | | | | |
// ######| | | | | | | | | | | | | | | |
< 3, InDataType, OutDataType, PassThrough, 256, 1, 32, 32, 3, S<1, 32, 8>, S<0, 1, 2>, 2, 1, 2, 1>;
// clang-format on

// Shared driver; expects DevicePermuteInstance/InDataType/OutDataType above.
#include "run_permute_element_example.inc"

// run_* returns true on success; negate so the process exits 0 on success.
int main() { return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}); }
|
||||
22
example/39_permute/permute_HxWx4_fp16.cpp
Normal file
22
example/39_permute/permute_HxWx4_fp16.cpp
Normal file
@@ -0,0 +1,22 @@
|
||||
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

// Example: permute fp16 elements in bundles of 4 — the device op moves F64
// 'bundles' (4 x F16 packed) while verification unpacks them back to F16.

#include "common.hpp"

using DataType = F16;
using BundleType = F64;

// A bundle must hold a whole number of elements (here 8/2 = 4).
static_assert(sizeof(BundleType) % sizeof(DataType) == 0);

// Device-op instance configuration; the comment table names each template
// argument column-by-column.
// clang-format off
using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
// ######| NumDim| InData| OutData| Elementwise| Block| NPer| HPer| WPer| InBlock| InBlockTransfer| InBlockTransfer| Src| Dst| Src| Dst|
// ######| | Type| Type| Operation| Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector|
// ######| | | | | | | | | | | | | | | |
// ######| | | | | | | | | | | | | | | |
< 3, BundleType, BundleType, PassThrough, 256, 1, 32, 32, 5, S<1, 32, 8>, S<0, 1, 2>, 2, 1, 4, 1>;
// clang-format on

// Shared driver; expects DevicePermuteInstance/DataType/BundleType above.
#include "run_permute_bundle_example.inc"

// run_* returns true on success; negate so the process exits 0 on success.
int main() { return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}); }
|
||||
20
example/39_permute/permute_NxHxW_fp16.cpp
Normal file
20
example/39_permute/permute_NxHxW_fp16.cpp
Normal file
@@ -0,0 +1,20 @@
|
||||
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

// Example: permute a batched [N, H, W] fp16 tensor with axes (0, 2, 1) —
// transpose H/W within each of the N batches.

#include "common.hpp"

using InDataType = F16;
using OutDataType = F16;

// Device-op instance configuration; the comment table names each template
// argument column-by-column.
// clang-format off
using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
// ######| NumDim| InData| OutData| Elementwise| Block| NPer| HPer| WPer| InBlock| InBlockTransfer| InBlockTransfer| Src| Dst| Src| Dst|
// ######| | Type| Type| Operation| Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector|
// ######| | | | | | | | | | | | | | | |
// ######| | | | | | | | | | | | | | | |
< 3, InDataType, OutDataType, PassThrough, 128, 4, 16, 8, 6, S<2, 16, 4>, S<0, 1, 2>, 2, 1, 2, 1>;
// clang-format on

// Shared driver; expects DevicePermuteInstance/InDataType/OutDataType above.
#include "run_permute_element_example.inc"

// run_* returns true on success; negate so the process exits 0 on success.
int main() { return !run_permute_element_example({121, 768, 80}, {0, 2, 1}); }
|
||||
78
example/39_permute/run_permute_bundle_example.inc
Normal file
78
example/39_permute/run_permute_bundle_example.inc
Normal file
@@ -0,0 +1,78 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
/// Runs the bundle-permute example: permutes BundleType values on the device,
/// then verifies by re-running the permute on the host at DataType
/// granularity (each bundle viewed as sizeof(BundleType)/sizeof(DataType)
/// contiguous elements). Returns true when the device output matches.
bool run_permute_bundle(const Problem& problem)
{
    const auto& input_bundle_shape = problem.shape;
    const auto& input_bundle_axes = problem.axes;

    const auto output_bundle_shape = transpose(input_bundle_shape, input_bundle_axes);

    Tensor<BundleType> input_bundle_tensor(input_bundle_shape);
    Tensor<BundleType> output_bundle_tensor(output_bundle_shape);

    // initialize tensor by assigning DataType values
    ck::utils::FillUniformDistribution<DataType>{-1.f, 1.f}(input_bundle_tensor.AsSpan<DataType>());

    DeviceMem input_device_buf(input_bundle_tensor.GetElementSpaceSizeInBytes());
    DeviceMem output_device_buf(output_bundle_tensor.GetElementSpaceSizeInBytes());

    using std::data;
    input_device_buf.ToDevice(data(input_bundle_tensor));

    static_assert(std::is_default_constructible_v<DevicePermuteInstance>);

    // Build the device-op argument; to_array() adapts tensor metadata to the
    // std::array parameters MakeArgument expects.
    auto permute = DevicePermuteInstance{};
    auto argument = permute.MakeArgument(to_array(input_bundle_shape),
                                         to_array(input_bundle_tensor.GetStrides()),
                                         to_array(output_bundle_shape),
                                         to_array(output_bundle_tensor.GetStrides()),
                                         input_device_buf.GetDeviceBuffer(),
                                         output_device_buf.GetDeviceBuffer(),
                                         PassThrough{});

    if(!permute.IsSupportedArgument(argument))
    {
        std::cerr << "The runtime parameters seems not supported by the device instance, exiting!"
                  << std::endl;
        return false;
    };

    // StreamConfig{nullptr, true} — second argument presumably enables timing;
    // Run() returns the averaged kernel time in ms.
    auto invoker = permute.MakeInvoker();
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});

    std::cout << "Perf: " << ave_time << " ms" << std::endl;

    output_device_buf.FromDevice(data(output_bundle_tensor));

    constexpr std::size_t NumElemsInBundle = sizeof(BundleType) / sizeof(DataType);

    // extend tensor shape from [N, H, W] to [N, H, W, NumElemsInBundle]
    // axes from [0, 2, 1] to [0, 2, 1, 3]
    const auto input_shape = extend_shape(input_bundle_shape, NumElemsInBundle);
    const auto input_axes = extend_axes(input_bundle_axes);

    using std::begin;

    // Re-interpret the bundle tensor as a rank-4 element tensor for the
    // host-side reference permute.
    Tensor<DataType> input_tensor(input_shape);
    ranges::copy(input_bundle_tensor.AsSpan<const DataType>(), begin(input_tensor));

    Tensor<DataType> output_tensor(transpose(input_shape, input_axes));
    if(!host_permute(input_tensor, input_axes, PassThrough{}, output_tensor))
    {
        return false;
    }

    // Compare device output against the host reference at element granularity.
    return ck::utils::check_err(output_bundle_tensor.AsSpan<const DataType>(),
                                output_tensor.AsSpan<const DataType>(),
                                "Error: incorrect results in output tensor",
                                1e-6,
                                1e-6);
}
|
||||
|
||||
/// Convenience entry point: packs shape + axes into a Problem and runs the
/// bundle-permute example.
bool run_permute_bundle_example(const Problem::Shape& shape, const Problem::Axes& axes)
{
    const Problem problem{shape, axes};
    return run_permute_bundle(problem);
}
|
||||
65
example/39_permute/run_permute_element_example.inc
Normal file
65
example/39_permute/run_permute_element_example.inc
Normal file
@@ -0,0 +1,65 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
/// Runs the elementwise permute example: permutes an InDataType tensor on the
/// device, then verifies the result against a host-side reference permute.
/// Returns true when the device output matches within the error thresholds.
bool run_permute_element(const Problem& problem)
{
    const auto& input_shape = problem.shape;
    const auto& input_axes = problem.axes;

    const auto output_shape = transpose(input_shape, input_axes);

    Tensor<InDataType> input_tensor(input_shape);
    Tensor<OutDataType> output_tensor(output_shape);

    ck::utils::FillUniformDistribution<InDataType>{-1.f, 1.f}(input_tensor);

    DeviceMem input_device_buf(input_tensor.GetElementSpaceSizeInBytes());
    DeviceMem output_device_buf(output_tensor.GetElementSpaceSizeInBytes());

    using std::data;
    input_device_buf.ToDevice(data(input_tensor));

    static_assert(std::is_default_constructible_v<DevicePermuteInstance>);

    // Build the device-op argument; to_array() adapts tensor metadata to the
    // std::array parameters MakeArgument expects.
    auto permute = DevicePermuteInstance{};
    auto argument = permute.MakeArgument(to_array(input_shape),
                                         to_array(input_tensor.GetStrides()),
                                         to_array(output_shape),
                                         to_array(output_tensor.GetStrides()),
                                         input_device_buf.GetDeviceBuffer(),
                                         output_device_buf.GetDeviceBuffer(),
                                         PassThrough{});

    if(!permute.IsSupportedArgument(argument))
    {
        std::cerr << "The runtime parameters seems not supported by the device instance, exiting!"
                  << std::endl;
        return false;
    };

    // StreamConfig{nullptr, true} — second argument presumably enables timing;
    // Run() returns the averaged kernel time in ms.
    auto invoker = permute.MakeInvoker();
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});

    std::cout << "Perf: " << ave_time << " ms" << std::endl;

    output_device_buf.FromDevice(data(output_tensor));

    // Host-side reference permute for verification.
    Tensor<OutDataType> output_tensor_host(output_shape);
    if(!host_permute(input_tensor, input_axes, PassThrough{}, output_tensor_host))
    {
        return false;
    }

    return ck::utils::check_err(output_tensor.AsSpan<const OutDataType>(),
                                output_tensor_host.AsSpan<const OutDataType>(),
                                "Error: incorrect results in output tensor",
                                1e-6,
                                1e-6);
}
|
||||
|
||||
/// Convenience entry point: packs shape + axes into a Problem and runs the
/// elementwise permute example.
bool run_permute_element_example(const Problem::Shape& shape, const Problem::Axes& axes)
{
    const Problem problem{shape, axes};
    return run_permute_element(problem);
}
|
||||
Reference in New Issue
Block a user