mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-19 04:19:36 +00:00
Reorganize project folders (#6)
This commit is contained in:
17
include/ck/wrapper/utils/kernel_utils.hpp
Normal file
17
include/ck/wrapper/utils/kernel_utils.hpp
Normal file
@@ -0,0 +1,17 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
|
||||
// Disable from doxygen docs generation
|
||||
/// @cond INTERNAL
|
||||
namespace ck {
|
||||
namespace wrapper {
|
||||
/// @endcond
|
||||
|
||||
// Shorthand for the `__launch_bounds__` attribute built from the
// CK_MAX_THREAD_PER_BLOCK and CK_MIN_BLOCK_PER_CU macros (both defined outside this file).
#define __CK_WRAPPER_LAUNCH_BOUNDS__ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
|
||||
|
||||
} // namespace wrapper
|
||||
} // namespace ck
|
||||
523
include/ck/wrapper/utils/layout_utils.hpp
Normal file
523
include/ck/wrapper/utils/layout_utils.hpp
Normal file
@@ -0,0 +1,523 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
|
||||
#include "ck/utility/number.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/tuple_helper.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck/utility/sequence_helper.hpp"
|
||||
#include "ck/utility/is_detected.hpp"
|
||||
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
#include "ck/tensor_description/multi_index_transform_helper.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
|
||||
|
||||
// Disable from doxygen docs generation
|
||||
/// @cond INTERNAL
|
||||
namespace ck {
|
||||
namespace wrapper {
|
||||
/// @endcond
|
||||
|
||||
// Disable from doxygen docs generation
|
||||
/// @cond INTERNAL
|
||||
// forward declaration
|
||||
template <typename Shape, typename UnrolledDescriptorType>
|
||||
struct Layout;
|
||||
|
||||
// Detection expression: well-formed only when an lvalue of type T has a callable
// IsTuple() member. Used in this file together with is_detected<is_tuple, X>.
template <typename T>
|
||||
using is_tuple = decltype(std::declval<T&>().IsTuple());
|
||||
|
||||
namespace {
|
||||
namespace detail {
|
||||
/**
|
||||
* \brief Generate packed (column-major) strides if not passed
|
||||
*
|
||||
* \param shape Tensor shape.
|
||||
* \return Generated column-major strides.
|
||||
*/
|
||||
template <typename... Ts>
|
||||
__host__ __device__ constexpr static auto
|
||||
GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape)
|
||||
{
|
||||
const auto unrolled_shape = UnrollNestedTuple(shape);
|
||||
return generate_tuple(
|
||||
[&](auto i) {
|
||||
if constexpr(i.value == 0)
|
||||
{
|
||||
return Number<1>{};
|
||||
}
|
||||
else
|
||||
{
|
||||
return TupleReduce<Number<0>{}.value, i.value>([](auto x, auto y) { return x * y; },
|
||||
unrolled_shape);
|
||||
}
|
||||
},
|
||||
Number<decltype(unrolled_shape)::Size()>{});
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Create naive tensor descriptor from nested shape.
|
||||
*
|
||||
* \param shape Tensor shape.
|
||||
* \param strides Tensor strides.
|
||||
* \return Unrolled descriptor
|
||||
*/
|
||||
template <typename LayoutShape, typename LayoutStrides>
|
||||
__host__ __device__ constexpr auto MakeUnrolledDescriptor(const LayoutShape& shape,
|
||||
const LayoutStrides& strides)
|
||||
{
|
||||
const auto unrolled_shape = UnrollNestedTuple(shape);
|
||||
if constexpr(is_same_v<LayoutStrides, Tuple<>>)
|
||||
{
|
||||
// if not passed, then generate
|
||||
const auto unrolled_strides = GenerateColumnMajorPackedStrides(unrolled_shape);
|
||||
static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
|
||||
"Size of strides and shape are not consistent.");
|
||||
return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto unrolled_strides = UnrollNestedTuple(strides);
|
||||
static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
|
||||
"Size of strides and shape are not consistent.");
|
||||
return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
|
||||
}
|
||||
}
|
||||
} // namespace detail
|
||||
} // namespace
|
||||
|
||||
/// @endcond
|
||||
|
||||
// make_*
|
||||
/**
|
||||
* \brief Make layout function.
|
||||
*
|
||||
* \tparam Shape Shape for layout.
|
||||
* \tparam Strides Strides for layout.
|
||||
* \return Constructed layout.
|
||||
*/
|
||||
template <typename Shape, typename Strides>
|
||||
__host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides& strides)
|
||||
{
|
||||
using UnrolledDescriptorType = decltype(detail::MakeUnrolledDescriptor(Shape{}, Strides{}));
|
||||
return Layout<Shape, UnrolledDescriptorType>(shape,
|
||||
detail::MakeUnrolledDescriptor(shape, strides));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Make layout function with packed strides
|
||||
* (column-major).
|
||||
*
|
||||
* \tparam Shape Shape for layout.
|
||||
* \return Constructed layout.
|
||||
*/
|
||||
template <typename Shape>
|
||||
__host__ __device__ constexpr auto make_layout(const Shape& shape)
|
||||
{
|
||||
using UnrolledDescriptorType = decltype(detail::MakeUnrolledDescriptor(Shape{}, Tuple<>{}));
|
||||
return Layout<Shape, UnrolledDescriptorType>(shape,
|
||||
detail::MakeUnrolledDescriptor(shape, Tuple<>{}));
|
||||
}
|
||||
// Layout helpers
|
||||
// get
|
||||
/**
|
||||
* \private
|
||||
* \brief Get dim.
|
||||
*
|
||||
* \param dim Dimension.
|
||||
* \return Returned the same dimension.
|
||||
*/
|
||||
template <typename T>
|
||||
__host__ __device__ T constexpr get(const T& dim)
|
||||
{
|
||||
return dim;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get element from tuple (Shape/Strides/Idxs).
|
||||
*
|
||||
* \tparam idx Index to lookup.
|
||||
* \param tuple Tuple to lookup.
|
||||
* \return Requsted element.
|
||||
*/
|
||||
template <index_t idx, typename... Dims>
|
||||
__host__ __device__ constexpr auto get(const Tuple<Dims...>& tuple)
|
||||
{
|
||||
return tuple.At(Number<idx>{});
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get sub layout.
|
||||
*
|
||||
* \tparam idx Index to lookup.
|
||||
* \param layout Layout to create sub layout.
|
||||
* \return Requsted sub layout.
|
||||
*/
|
||||
template <index_t idx, typename Shape, typename UnrolledDesc>
|
||||
__host__ __device__ constexpr auto get(const Layout<Shape, UnrolledDesc>& layout)
|
||||
{
|
||||
const auto& shape = layout.GetShape();
|
||||
const auto new_shape = get<idx>(shape);
|
||||
static_assert(is_detected<is_tuple, decltype(new_shape)>::value,
|
||||
"Shape of sub layout must be tuple");
|
||||
|
||||
constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size();
|
||||
constexpr auto new_shape_dims = decltype(UnrollNestedTuple(new_shape))::Size();
|
||||
constexpr auto shape_offset = decltype(UnrollNestedTuple(TupleSlice<0, idx>(shape)))::Size();
|
||||
|
||||
const auto unrolled_shape = UnrollNestedTuple(shape);
|
||||
const auto transforms = generate_tuple(
|
||||
[&](auto i) {
|
||||
// Compare Idx with shape
|
||||
if constexpr(i < shape_offset || i >= shape_offset + new_shape_dims)
|
||||
{
|
||||
// Remove dimension
|
||||
return make_freeze_transform(Number<0>{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return make_pass_through_transform(unrolled_shape.At(i));
|
||||
}
|
||||
},
|
||||
Number<old_shape_dims>{});
|
||||
|
||||
const auto lower_dims =
|
||||
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<old_shape_dims>{});
|
||||
const auto upper_dims = generate_tuple(
|
||||
[&](auto i) {
|
||||
if constexpr(i < shape_offset || i >= shape_offset + new_shape_dims)
|
||||
return Sequence<>{};
|
||||
|
||||
else
|
||||
{
|
||||
return Sequence<i.value - shape_offset>{};
|
||||
}
|
||||
},
|
||||
Number<old_shape_dims>{});
|
||||
|
||||
const auto& flatten_desc = layout.GetUnrolledDescriptor();
|
||||
auto new_desc = transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
|
||||
return Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Hierarchical get.
|
||||
*
|
||||
* \tparam Idxs Indexes to lookup.
|
||||
* \param elem Element to lookup.
|
||||
* \return Requsted element.
|
||||
*/
|
||||
template <index_t Idx, index_t... Idxs, typename T>
|
||||
__host__ __device__ constexpr auto get(const T& elem)
|
||||
{
|
||||
return get<Idxs...>(get<Idx>(elem));
|
||||
}
|
||||
|
||||
// size
|
||||
/**
|
||||
* \private
|
||||
* \brief Get size.
|
||||
*
|
||||
* \param dim Size.
|
||||
* \return Returned the same size.
|
||||
*/
|
||||
template <typename T>
|
||||
__host__ __device__ T constexpr size(const T& dim)
|
||||
{
|
||||
return dim;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Length get (product if tuple).
|
||||
*
|
||||
* \tparam idx Index to lookup.
|
||||
* \param layout Layout to get Shape of.
|
||||
* \return Requsted length.
|
||||
*/
|
||||
template <index_t idx, typename Shape, typename UnrolledDescriptorType>
|
||||
__host__ __device__ constexpr auto size(const Layout<Shape, UnrolledDescriptorType>& layout)
|
||||
{
|
||||
return layout.template GetLength<idx>();
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Shape size (product of dims).
|
||||
*
|
||||
* \param shape Shape to lookup.
|
||||
* \return Requsted size.
|
||||
*/
|
||||
template <typename... ShapeDims>
|
||||
__host__ __device__ constexpr auto size(const Tuple<ShapeDims...>& shape)
|
||||
{
|
||||
const auto unrolled_shape = UnrollNestedTuple(shape);
|
||||
return TupleReduce<0, unrolled_shape.Size()>([](auto x, auto y) { return x * y; },
|
||||
unrolled_shape);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Layout size (product of dims).
|
||||
*
|
||||
* \param layout Layout to calculate shape size.
|
||||
* \return Requsted size.
|
||||
*/
|
||||
template <typename Shape, typename UnrolledDescriptorType>
|
||||
__host__ __device__ constexpr auto size(const Layout<Shape, UnrolledDescriptorType>& layout)
|
||||
{
|
||||
return layout.GetLengths();
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Length get from tuple (product if tuple).
|
||||
*
|
||||
* \tparam idx Index to lookup.
|
||||
* \param tuple Tuple to lookup.
|
||||
* \return Requsted length.
|
||||
*/
|
||||
template <index_t idx, typename... Ts>
|
||||
__host__ __device__ constexpr auto size(const Tuple<Ts...>& tuple)
|
||||
{
|
||||
return size(tuple.At(Number<idx>{}));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Hierarchical size.
|
||||
*
|
||||
* \tparam Idx First index to lookup (to avoid empty Idxs).
|
||||
* \tparam Idxs Next indexes to lookup.
|
||||
* \param elem Element to lookup.
|
||||
* \return Requsted element.
|
||||
*/
|
||||
template <index_t Idx, index_t... Idxs, typename T>
|
||||
__host__ __device__ constexpr auto size(const T& elem)
|
||||
{
|
||||
return size(get<Idx, Idxs...>(elem));
|
||||
}
|
||||
|
||||
// rank
|
||||
/**
|
||||
* \brief Get layout rank (num elements in shape).
|
||||
*
|
||||
* \param layout Layout to calculate rank.
|
||||
* \return Requsted rank.
|
||||
*/
|
||||
template <typename Shape, typename UnrolledDescriptorType>
|
||||
__host__ __device__ constexpr auto
|
||||
rank([[maybe_unused]] const Layout<Shape, UnrolledDescriptorType>& layout)
|
||||
{
|
||||
return Shape::Size();
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get tuple rank (num elements in tuple).
|
||||
* Return 1 if scalar passed.
|
||||
*
|
||||
* \param tuple Tuple to calculate rank.
|
||||
* \return Requsted rank.
|
||||
*/
|
||||
template <typename... Dims>
|
||||
__host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple<Dims...>& tuple)
|
||||
{
|
||||
return Tuple<Dims...>::Size();
|
||||
}
|
||||
|
||||
/**
|
||||
* \private
|
||||
* \brief Rank for scalar
|
||||
*
|
||||
* \param dim Dimension scalar.
|
||||
* \return Returned 1.
|
||||
*/
|
||||
template <index_t IDim>
|
||||
__host__ __device__ constexpr index_t rank([[maybe_unused]] const Number<IDim>& dim)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* \private
|
||||
* \brief Rank for scalar
|
||||
*
|
||||
* \param dim Dimension scalar.
|
||||
* \return Returned 1.
|
||||
*/
|
||||
__host__ __device__ constexpr index_t rank([[maybe_unused]] const index_t& dim) { return 1; }
|
||||
|
||||
/**
|
||||
* \brief Hierarchical rank.
|
||||
*
|
||||
* \tparam Idxs Indexes to lookup.
|
||||
* \param elem Element to lookup.
|
||||
* \return Requsted rank.
|
||||
*/
|
||||
template <index_t... Idxs, typename T>
|
||||
__host__ __device__ constexpr auto rank(const T& elem)
|
||||
{
|
||||
return rank(get<Idxs...>(elem));
|
||||
}
|
||||
|
||||
// depth
|
||||
/**
|
||||
* \brief Get depth of the layout shape (return 0 if scalar).
|
||||
*
|
||||
* \param layout Layout to calculate depth.
|
||||
* \return Requsted depth.
|
||||
*/
|
||||
template <typename Shape, typename UnrolledDescriptorType>
|
||||
__host__ __device__ constexpr auto depth(const Layout<Shape, UnrolledDescriptorType>& layout)
|
||||
{
|
||||
const auto& shape = layout.GetShape();
|
||||
return TupleDepth(shape);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get depth of the tuple. (return 0 if scalar)
|
||||
*
|
||||
* \param tuple Tuple to calculate depth.
|
||||
* \return Requsted depth.
|
||||
*/
|
||||
template <typename... Dims>
|
||||
__host__ __device__ constexpr auto depth(const Tuple<Dims...>& tuple)
|
||||
{
|
||||
return TupleDepth(tuple);
|
||||
}
|
||||
|
||||
/**
|
||||
* \private
|
||||
* \brief Depth for scalar
|
||||
*
|
||||
* \param dim Scalar.
|
||||
* \return Returned 0.
|
||||
*/
|
||||
template <index_t IDim>
|
||||
__host__ __device__ constexpr index_t depth([[maybe_unused]] const Number<IDim>& dim)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* \private
|
||||
* \brief Depth for scalar
|
||||
*
|
||||
* \param dim Scalar.
|
||||
* \return Returned 0.
|
||||
*/
|
||||
__host__ __device__ constexpr index_t depth([[maybe_unused]] const index_t& dim) { return 0; }
|
||||
|
||||
/**
|
||||
* \brief Hierarchical depth.
|
||||
*
|
||||
* \tparam Idxs Indexes to lookup.
|
||||
* \param elem Element to lookup.
|
||||
* \return Requsted depth.
|
||||
*/
|
||||
template <index_t... Idxs, typename T>
|
||||
__host__ __device__ constexpr auto depth(const T& elem)
|
||||
{
|
||||
return depth(get<Idxs...>(elem));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get Layout shape.
|
||||
*
|
||||
* \param layout Layout to get shape from.
|
||||
* \return Requsted shape.
|
||||
*/
|
||||
template <typename LayoutType>
|
||||
__host__ __device__ constexpr const auto& shape(const LayoutType& layout)
|
||||
{
|
||||
return layout.GetShape();
|
||||
}
|
||||
|
||||
// pad
|
||||
/**
|
||||
* \brief Pad layout shapes to be adjusted to tile lengths.
|
||||
*
|
||||
*
|
||||
* \param layout Layout to pad.
|
||||
* \param tile_lengths Tile lengths to align layout shape.
|
||||
* \return Padded layout.
|
||||
*/
|
||||
template <typename Shape, typename UnrolledDesc, typename TileLengths>
|
||||
__host__ __device__ constexpr auto pad(const Layout<Shape, UnrolledDesc>& layout,
|
||||
const TileLengths& tile_lengths)
|
||||
{
|
||||
auto& unrolled_desc = layout.GetUnrolledDescriptor();
|
||||
// Generate sequence with ones to mark that all dims will be padded
|
||||
constexpr auto do_pads_seq =
|
||||
generate_sequence_v2([](auto) { return Number<1>{}; }, Number<Shape::Size()>{});
|
||||
// Create descriptor with padding
|
||||
auto padded_desc =
|
||||
tensor_operation::device::PadTensorDescriptor(unrolled_desc, tile_lengths, do_pads_seq);
|
||||
// Generate padded shape
|
||||
const auto padded_shape = generate_tuple(
|
||||
[&](auto i) { return padded_desc.GetLength(Number<i>{}); }, Number<TileLengths::Size()>{});
|
||||
// Create layout
|
||||
return Layout<decltype(padded_shape), decltype(padded_desc)>(padded_shape, padded_desc);
|
||||
}
|
||||
|
||||
// unmerge
|
||||
/**
|
||||
* \brief Unmerge selected dim in layout.
|
||||
*
|
||||
* \tparam Idx Index to dimension being unmerged.
|
||||
* \param layout Layout to pad.
|
||||
* \param new_lengths Dimensions into which the indicated dimension will be divided.
|
||||
* \param new_indexes Indexes to shuffle dims. Dims for unmerged dim should be nested.
|
||||
* \return Unmerged layout.
|
||||
*/
|
||||
template <index_t Idx, typename Shape, typename UnrolledDesc, typename NewLengths, typename NewIdxs>
|
||||
__host__ __device__ constexpr auto unmerge(const Layout<Shape, UnrolledDesc>& layout,
|
||||
const NewLengths& new_lengths,
|
||||
[[maybe_unused]] const NewIdxs& new_indexes)
|
||||
{
|
||||
const auto& layout_shape = shape(layout);
|
||||
auto& unrolled_desc = layout.GetUnrolledDescriptor();
|
||||
constexpr auto dims = Shape::Size();
|
||||
// Generate transforms
|
||||
const auto transforms = generate_tuple(
|
||||
[&](auto i) {
|
||||
if constexpr(i == Idx)
|
||||
{
|
||||
return make_unmerge_transform(new_lengths);
|
||||
}
|
||||
else
|
||||
{
|
||||
return make_pass_through_transform(layout_shape.At(i));
|
||||
}
|
||||
},
|
||||
Number<dims>{});
|
||||
|
||||
constexpr auto lower_dims =
|
||||
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<dims>{});
|
||||
constexpr auto upper_dims = generate_tuple(
|
||||
[&](auto i) {
|
||||
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, NewIdxs>>::value)
|
||||
{
|
||||
constexpr auto idxs_tuple = tuple_element_t<i.value, NewIdxs>{};
|
||||
return to_sequence(idxs_tuple);
|
||||
}
|
||||
else
|
||||
{
|
||||
constexpr index_t index = tuple_element_t<i.value, NewIdxs>{};
|
||||
return Sequence<index>{};
|
||||
}
|
||||
},
|
||||
Number<dims>{});
|
||||
|
||||
const auto unmerged_desc =
|
||||
transform_tensor_descriptor(unrolled_desc, transforms, lower_dims, upper_dims);
|
||||
const auto unmerged_shape =
|
||||
generate_tuple([&](auto i) { return unmerged_desc.GetLength(Number<i>{}); },
|
||||
Number<decltype(unmerged_desc)::GetNumOfVisibleDimension()>{});
|
||||
|
||||
// Create layout
|
||||
return Layout<decltype(unmerged_shape), decltype(unmerged_desc)>(unmerged_shape, unmerged_desc);
|
||||
}
|
||||
|
||||
} // namespace wrapper
|
||||
} // namespace ck
|
||||
470
include/ck/wrapper/utils/tensor_partition.hpp
Normal file
470
include/ck/wrapper/utils/tensor_partition.hpp
Normal file
@@ -0,0 +1,470 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensor_utils.hpp"
|
||||
#include "layout_utils.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
|
||||
#include "ck/tensor_description/cluster_descriptor.hpp"
|
||||
|
||||
// Disable from doxygen docs generation
|
||||
/// @cond INTERNAL
|
||||
namespace ck {
|
||||
namespace wrapper {
|
||||
/// @endcond
|
||||
|
||||
// Disable from doxygen docs generation
|
||||
/// @cond INTERNAL
|
||||
namespace {
|
||||
|
||||
namespace detail {
|
||||
|
||||
/**
|
||||
* \brief Calculate shape for partition based on number of threads per each dim and
|
||||
* previous shape
|
||||
*
|
||||
* \param shape Base tensor shape.
|
||||
* \param thread_lengths Tuple of thread lengths.
|
||||
* \return Partition shape.
|
||||
*/
|
||||
template <typename... Ts, typename... Ls>
|
||||
__host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts...>& shape,
|
||||
const Tuple<Ls...>& thread_lengths)
|
||||
{
|
||||
static_assert(Tuple<Ts...>::Size() == Tuple<Ls...>::Size(), "Wrong thread_lengths shape.");
|
||||
return generate_tuple(
|
||||
[&](auto i) {
|
||||
constexpr auto num_i = Number<i>{};
|
||||
const auto slice_len =
|
||||
ck::math::integer_divide_ceil(size<num_i>(shape), thread_lengths.At(num_i));
|
||||
return slice_len;
|
||||
},
|
||||
Number<Tuple<Ls...>::Size()>{});
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Apply projection.
|
||||
*
|
||||
* \param base_tuple Tuple to apply projection.
|
||||
* \param projection Projection is used to remove selected dim from
|
||||
* partitioning. Use `slice(X)` to remove dimension, where X is dim
|
||||
* size. Use `Number<1>{}` to keep it.
|
||||
* \return Multi index after projection.
|
||||
*/
|
||||
template <typename MultiIndex, typename ProjectionTuple>
|
||||
__host__ __device__ constexpr auto
|
||||
ApplyProjection([[maybe_unused]] const MultiIndex& base_tuple,
|
||||
[[maybe_unused]] const ProjectionTuple& projection)
|
||||
{
|
||||
if constexpr(is_same_v<ProjectionTuple, Tuple<>>)
|
||||
{
|
||||
return Tuple<>{};
|
||||
}
|
||||
else
|
||||
{
|
||||
auto base_tuple_after_projection = generate_tuple(
|
||||
[&](auto i) {
|
||||
const auto i_num = Number<i.value>{};
|
||||
static_assert(
|
||||
is_detected<is_slice, tuple_element_t<i_num, ProjectionTuple>>::value ||
|
||||
is_same_v<tuple_element_t<i_num, ProjectionTuple>, Number<1>>);
|
||||
if constexpr(is_detected<is_slice, tuple_element_t<i_num, ProjectionTuple>>::value)
|
||||
{
|
||||
// When slice (to remove), then insert empty tuple (will be removed in next
|
||||
// step).
|
||||
return Tuple<>{};
|
||||
}
|
||||
else
|
||||
{
|
||||
return make_tuple(base_tuple.At(i_num));
|
||||
}
|
||||
},
|
||||
Number<MultiIndex::Size()>{});
|
||||
// Remove empty tuples
|
||||
return UnrollNestedTuple<0, 1>(base_tuple_after_projection);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Calculate shape with dims from projection.
|
||||
*
|
||||
* \param shape Base tensor shape.
|
||||
* \param projection Projection is used to remove selected dim from
|
||||
* partitioning. Use `slice(X)` to remove dimension, where X is dim
|
||||
* size. Use `Number<1>{}` to keep it.
|
||||
* \return Shape with dims from projection
|
||||
*/
|
||||
template <typename... Ts, typename... Ps>
|
||||
__host__ __device__ constexpr auto CalculateShapeWithProjection(const Tuple<Ts...>& shape,
|
||||
const Tuple<Ps...>& projection)
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto i) {
|
||||
if constexpr(is_detected<is_slice, tuple_element_t<i, Tuple<Ps...>>>::value)
|
||||
{
|
||||
return size<i>(projection).to_;
|
||||
}
|
||||
else
|
||||
{
|
||||
// number of shape element in actual fragment of shape and projection (method to
|
||||
// calculate shape idx)
|
||||
constexpr index_t shape_i =
|
||||
detail::ApplyProjection(TupleSlice<0, i>(Tuple<Ts...>{}),
|
||||
TupleSlice<0, i>(Tuple<Ps...>{}))
|
||||
.Size();
|
||||
return size<shape_i>(shape);
|
||||
}
|
||||
},
|
||||
Number<Tuple<Ps...>::Size()>{});
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Calculate total number of blocks.
|
||||
*
|
||||
* \param shape Base tensor shape.
|
||||
* \param tile_shape Tile shape.
|
||||
* \return Tuple with blocks number.
|
||||
*/
|
||||
template <typename... Ts, typename... Ls, typename... Ps>
|
||||
__host__ __device__ constexpr auto CalculateGridSize(const Tuple<Ts...>& shape,
|
||||
const Tuple<Ls...>& tile_shape)
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto i) { return ck::math::integer_divide_ceil(size<i>(shape), size<i>(tile_shape)); },
|
||||
Number<Tuple<Ls...>::Size()>{});
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Calculate scaled offset for new partition/tile.
|
||||
*
|
||||
* \param thread_idxs Thread 1d id.
|
||||
* \param partition_lengths_seq Sequence of partition shape.
|
||||
* \param old_offset_idxs Multi index offset from base tensor to shift values.
|
||||
* \return Partition shape.
|
||||
*/
|
||||
template <typename ThreadIdxs, typename PartitionLengthsSeq, typename OldOffsetIdxs>
|
||||
__host__ __device__ constexpr auto
|
||||
CalculateOffsetMultiIdxs(const ThreadIdxs& thread_idxs,
|
||||
const PartitionLengthsSeq& partition_lengths_seq,
|
||||
const OldOffsetIdxs& old_offset_idxs)
|
||||
{
|
||||
return thread_idxs * partition_lengths_seq + old_offset_idxs;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Select dims to partition (skip if slice).
|
||||
*
|
||||
* \param block_idxs Input block indexes.
|
||||
* \return Partitioned dims.
|
||||
*/
|
||||
template <typename BlockIdxs>
|
||||
__host__ __device__ constexpr auto GetDimsToPartition([[maybe_unused]] const BlockIdxs& block_idxs)
|
||||
{
|
||||
const auto dims_to_partition = generate_tuple(
|
||||
[&](auto i) {
|
||||
if constexpr(!is_detected<is_slice, tuple_element_t<i, BlockIdxs>>::value)
|
||||
{
|
||||
return Number<i>{};
|
||||
}
|
||||
else
|
||||
{
|
||||
return Tuple<>{};
|
||||
}
|
||||
},
|
||||
Number<BlockIdxs::Size()>{});
|
||||
// Remove empty tuples
|
||||
return UnrollNestedTuple<0, 1>(dims_to_partition);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Replace slices with zeros (Slice dims are not partitioned).
|
||||
*
|
||||
* \param block_idxs Input block indexes.
|
||||
* \return Parsed dims.
|
||||
*/
|
||||
template <typename BlockIdxs>
|
||||
__host__ __device__ constexpr auto ReplaceSlicesWithZeros(const BlockIdxs& block_idxs)
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto i) {
|
||||
if constexpr(!is_detected<is_slice, tuple_element_t<i, BlockIdxs>>::value)
|
||||
{
|
||||
return block_idxs.At(i);
|
||||
}
|
||||
else
|
||||
{
|
||||
return Number<0>{};
|
||||
}
|
||||
},
|
||||
Number<BlockIdxs::Size()>{});
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Calculate default projection.
|
||||
*
|
||||
* \param tile_shape Tile shape.
|
||||
* \return Default projection (filled with Number<1>{}).
|
||||
*/
|
||||
template <typename TileShape>
|
||||
__host__ __device__ constexpr auto
|
||||
GenerateDefaultProjection([[maybe_unused]] const TileShape tile_shape)
|
||||
{
|
||||
return generate_tuple([&](auto) { return Number<1>{}; }, Number<TileShape::Size()>{});
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Calculate thread multi index from 1d thread index.
|
||||
*
|
||||
* \param thread_layout Layout of threads (could not be nested).
|
||||
* \param thread_id Thread index represented as integer.
|
||||
* \return Multi index.
|
||||
*/
|
||||
template <typename ThreadShape, typename ThreadUnrolledDesc>
|
||||
__host__ __device__ constexpr auto CalculateThreadMultiIdx(
|
||||
[[maybe_unused]] const Layout<ThreadShape, ThreadUnrolledDesc>& thread_layout,
|
||||
const index_t thread_id)
|
||||
{
|
||||
static_assert(ThreadUnrolledDesc::GetNumOfTransform() == 1,
|
||||
"Thread layout should not be transformed.");
|
||||
constexpr auto embed_transform = ThreadUnrolledDesc{}.GetTransforms().At(Number<0>{});
|
||||
constexpr auto shape = ThreadShape{};
|
||||
constexpr auto strides = embed_transform.coefficients_;
|
||||
|
||||
return generate_tuple(
|
||||
[&](auto i) {
|
||||
constexpr auto num_i = Number<i>{};
|
||||
return (thread_id / strides.At(num_i)) % shape.At(num_i);
|
||||
},
|
||||
Number<ThreadShape::Size()>{});
|
||||
}
|
||||
} // namespace detail
|
||||
} // namespace
|
||||
/// @endcond
|
||||
|
||||
/**
|
||||
* \brief Create local partition for thread (At now only packed partition
|
||||
* is supported).
|
||||
*
|
||||
* \param tensor Tensor for partition.
|
||||
* \param thread_layout Layout of threads (could not be transformed).
|
||||
* \param thread_id Thread index represented as integer.
|
||||
* \param projection Projection is used to remove selected dim from
|
||||
* partitioning. Use `slice(X)` to remove dimension, where X is dim
|
||||
* size. Use `Number<1>{}` to keep it.
|
||||
* \return Partition tensor.
|
||||
*/
|
||||
template <typename TensorType,
|
||||
typename ThreadShape,
|
||||
typename ThreadUnrolledDesc,
|
||||
typename ProjectionTuple>
|
||||
__host__ __device__ constexpr auto
|
||||
make_local_partition(TensorType& tensor,
|
||||
[[maybe_unused]] const Layout<ThreadShape, ThreadUnrolledDesc>& thread_layout,
|
||||
const index_t thread_id,
|
||||
const ProjectionTuple& projection)
|
||||
{
|
||||
static_assert(!IsNestedTuple(ThreadShape{}));
|
||||
// Calculate new partition shape
|
||||
const auto& tensor_shape = shape(tensor);
|
||||
// Calculate projected thread lengths
|
||||
constexpr auto projected_thread_lengths =
|
||||
detail::ApplyProjection(ThreadShape{}, ProjectionTuple{});
|
||||
constexpr auto partition_shape =
|
||||
detail::CalculateLocalPartitionShape(decltype(tensor_shape){}, projected_thread_lengths);
|
||||
constexpr auto partition_shape_seq =
|
||||
generate_sequence_v2([&](auto I) { return size<I>(partition_shape); },
|
||||
Number<decltype(partition_shape)::Size()>{});
|
||||
// Calculate thread idxs and offsets
|
||||
const auto thread_idxs = detail::CalculateThreadMultiIdx(thread_layout, thread_id);
|
||||
// Apply projection on thread idxs to remove not needed idxs
|
||||
const auto projected_thread_idxs = detail::ApplyProjection(thread_idxs, projection);
|
||||
const auto offset_multi_idxs = detail::CalculateOffsetMultiIdxs(
|
||||
projected_thread_idxs, partition_shape_seq, tensor.GetMultiIdxOffsets());
|
||||
// Create new layout and tensor
|
||||
auto& unrolled_desc = layout(tensor).GetUnrolledDescriptor();
|
||||
// Slice descriptor
|
||||
const auto transforms = generate_tuple(
|
||||
[&](auto i) {
|
||||
return make_slice_transform(partition_shape.At(i),
|
||||
offset_multi_idxs.At(i),
|
||||
partition_shape.At(i) + offset_multi_idxs.At(i));
|
||||
},
|
||||
Number<remove_reference_t<decltype(tensor_shape)>::Size()>{});
|
||||
const auto lower_upper_dims =
|
||||
generate_tuple([&](auto i) { return Sequence<i.value>{}; },
|
||||
Number<remove_reference_t<decltype(tensor_shape)>::Size()>{});
|
||||
auto sliced_desc =
|
||||
transform_tensor_descriptor(unrolled_desc, transforms, lower_upper_dims, lower_upper_dims);
|
||||
// Create layout
|
||||
const auto partition_layout =
|
||||
Layout<remove_reference_t<decltype(partition_shape)>, decltype(sliced_desc)>(
|
||||
partition_shape, sliced_desc);
|
||||
auto partition_tensor =
|
||||
make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), partition_layout);
|
||||
// Apply offsets
|
||||
return partition_tensor;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Create local partition for thread (At now only packed partition
|
||||
* is supported).
|
||||
*
|
||||
* \param tensor Tensor for partition.
|
||||
* \param thread_lengths Layout of threads (could not be nested).
|
||||
* \param thread_id Thread index represented as integer.
|
||||
* \return Partition tensor.
|
||||
*/
|
||||
template <typename TensorType, typename ThreadShape, typename ThreadUnrolledDesc>
|
||||
__host__ __device__ constexpr auto
|
||||
make_local_partition(TensorType& tensor,
|
||||
const Layout<ThreadShape, ThreadUnrolledDesc>& thread_lengths,
|
||||
const index_t thread_id)
|
||||
{
|
||||
const auto projection = detail::GenerateDefaultProjection(ThreadShape{});
|
||||
return make_local_partition(tensor, thread_lengths, thread_id, projection);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Create local tile for thread block. (Currently only packed tile
|
||||
* is supported).
|
||||
*
|
||||
* \note Temporarily, to gain the best performance, use a 2d
|
||||
* tile_shape.
|
||||
*
|
||||
*
|
||||
* \param tensor Tensor for partition.
|
||||
* \param tile_shape Shapes of requested tile.
|
||||
* \param block_idxs Tuple of block indexes represented as integer. If slice,
|
||||
* then get whole dim.
|
||||
* \param projection Projection is used to remove selected dim from
|
||||
* partitioning. Use `slice(X)` to remove dimension, where X is dim
|
||||
* size. Use `Number<1>{}` to keep it.
|
||||
* \return Tile tensor.
|
||||
*/
|
||||
template <typename TensorType,
|
||||
typename BlockShapeTuple,
|
||||
typename BlockIdxs,
|
||||
typename ProjectionTuple>
|
||||
__host__ __device__ constexpr auto make_local_tile(const TensorType& tensor,
|
||||
const BlockShapeTuple& tile_shape,
|
||||
const BlockIdxs& block_idxs,
|
||||
const ProjectionTuple& projection)
|
||||
{
|
||||
static_assert(!IsNestedTuple(BlockShapeTuple{}));
|
||||
static_assert(!IsNestedTuple(BlockIdxs{}));
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
|
||||
auto& aligned_desc = layout(tensor).GetMergedNestingDescriptor();
|
||||
|
||||
constexpr auto projected_tile_shape =
|
||||
detail::ApplyProjection(BlockShapeTuple{}, ProjectionTuple{});
|
||||
// Number of dims which are partitioned
|
||||
constexpr auto dims_to_partition = detail::GetDimsToPartition(BlockIdxs{});
|
||||
const auto parsed_block_idxs = detail::ReplaceSlicesWithZeros(block_idxs);
|
||||
if constexpr(decltype(dims_to_partition)::Size() == I2)
|
||||
{
|
||||
const auto shape_with_projection_dims =
|
||||
detail::CalculateShapeWithProjection(shape(tensor), projection);
|
||||
// Set Value for M, N partition
|
||||
const auto M = shape_with_projection_dims.At(dims_to_partition.At(I0));
|
||||
const auto N = shape_with_projection_dims.At(dims_to_partition.At(I1));
|
||||
constexpr auto MPerBlock = BlockShapeTuple{}.At(dims_to_partition.At(I0));
|
||||
constexpr auto NPerBlock = BlockShapeTuple{}.At(dims_to_partition.At(I1));
|
||||
auto m_n_desc = make_naive_tensor_descriptor_packed(make_tuple(M, N));
|
||||
// Get 1D block id
|
||||
const auto grid_size = detail::CalculateGridSize(shape_with_projection_dims, tile_shape);
|
||||
const auto block_lengths_desc = make_naive_tensor_descriptor_packed(grid_size);
|
||||
const index_t block_id_1d = block_lengths_desc.CalculateOffset(parsed_block_idxs);
|
||||
// Optimized version for 2d tile shape [MxN]
|
||||
const auto block_2_tile_map =
|
||||
BlockToCTileMap_M00_N0_M01Adapt<MPerBlock,
|
||||
NPerBlock,
|
||||
remove_cvref_t<decltype(m_n_desc)>>(m_n_desc);
|
||||
const auto block_work_idx =
|
||||
block_2_tile_map.CalculateBottomIndex(make_multi_index(block_id_1d));
|
||||
const index_t m_block_data_idx_on_grid =
|
||||
__builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
|
||||
const index_t n_block_data_idx_on_grid =
|
||||
__builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
|
||||
// Apply 0 for non partitioned dims
|
||||
const auto offset_multi_idxs = generate_tuple(
|
||||
[&](auto i) {
|
||||
if constexpr(i == dims_to_partition.At(I0))
|
||||
{
|
||||
return m_block_data_idx_on_grid;
|
||||
}
|
||||
else if constexpr(i == dims_to_partition.At(I1))
|
||||
{
|
||||
return n_block_data_idx_on_grid;
|
||||
}
|
||||
else
|
||||
{
|
||||
return Number<0>{};
|
||||
}
|
||||
},
|
||||
Number<BlockShapeTuple::Size()>{});
|
||||
const auto projected_offset_multi_idxs =
|
||||
detail::ApplyProjection(offset_multi_idxs, projection);
|
||||
// Create new layout and tensor
|
||||
const auto tile_layout =
|
||||
Layout<remove_reference_t<decltype(projected_tile_shape)>, decltype(aligned_desc)>(
|
||||
projected_tile_shape, aligned_desc);
|
||||
auto tile_tensor =
|
||||
make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), tile_layout);
|
||||
// Apply offsets
|
||||
tile_tensor.SetMultiIdxOffset(to_multi_index(projected_offset_multi_idxs));
|
||||
return tile_tensor;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Calculate offsets
|
||||
// Sequence with data to process per block
|
||||
using ProjectedTileShapeTuple = decltype(projected_tile_shape);
|
||||
constexpr auto projected_tile_shape_seq =
|
||||
generate_sequence_v2([](auto I) { return ProjectedTileShapeTuple{}.At(I); },
|
||||
Number<ProjectedTileShapeTuple::Size()>{});
|
||||
// Tuple with number of blocks
|
||||
const auto projected_block_idxs =
|
||||
to_multi_index(detail::ApplyProjection(parsed_block_idxs, projection));
|
||||
const auto offset_multi_idxs = detail::CalculateOffsetMultiIdxs(
|
||||
projected_block_idxs, projected_tile_shape_seq, tensor.GetMultiIdxOffsets());
|
||||
// Create new layout and tensor
|
||||
const auto tile_layout =
|
||||
Layout<remove_reference_t<ProjectedTileShapeTuple>, decltype(aligned_desc)>(
|
||||
projected_tile_shape, aligned_desc);
|
||||
auto tile_tensor =
|
||||
make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), tile_layout);
|
||||
// Apply offsets
|
||||
tile_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs));
|
||||
return tile_tensor;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Create local tile for thread block. (At now only packed tile
|
||||
* is supported).
|
||||
*
|
||||
* \note Currently to get the best performance please use 2d shape.
|
||||
*
|
||||
* \param tensor Tensor for partition.
|
||||
* \param tile_shape Shapes of requested tile.
|
||||
* \param block_idxs Tuple of block indexes represented as integer. If slice,
|
||||
* then get whole dim.
|
||||
* \return Tile tensor.
|
||||
*/
|
||||
template <typename TensorType, typename BlockShapeTuple, typename BlockIdxs>
|
||||
__host__ __device__ constexpr auto make_local_tile(const TensorType& tensor,
|
||||
const BlockShapeTuple& tile_shape,
|
||||
const BlockIdxs& block_idxs)
|
||||
{
|
||||
const auto projection = detail::GenerateDefaultProjection(BlockShapeTuple{});
|
||||
return make_local_tile(tensor, tile_shape, block_idxs, projection);
|
||||
}
|
||||
|
||||
} // namespace wrapper
|
||||
} // namespace ck
|
||||
277
include/ck/wrapper/utils/tensor_utils.hpp
Normal file
277
include/ck/wrapper/utils/tensor_utils.hpp
Normal file
@@ -0,0 +1,277 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/number.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/tuple_helper.hpp"
|
||||
#include "ck/utility/dynamic_buffer.hpp"
|
||||
#include "ck/utility/amd_address_space.hpp"
|
||||
#include "ck/utility/multi_index.hpp"
|
||||
|
||||
// Disable from doxygen docs generation
|
||||
/// @cond INTERNAL
|
||||
namespace ck {
|
||||
namespace wrapper {
|
||||
/// @endcond
|
||||
|
||||
/**
|
||||
* \brief Memory type, allowed members:
|
||||
* - Generic,
|
||||
* - Global,
|
||||
* - Lds,
|
||||
* - Sgpr,
|
||||
* - Vgpr,
|
||||
*/
|
||||
using MemoryTypeEnum = AddressSpaceEnum;
|
||||
|
||||
// Disable from doxygen docs generation
|
||||
/// @cond INTERNAL
|
||||
// forward declarations
|
||||
template <typename Shape, typename UnrolledDescriptorType>
|
||||
struct Layout;
|
||||
template <MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename UnrolledDescriptorType>
|
||||
struct Tensor;
|
||||
|
||||
template <typename FromType, typename ToType>
|
||||
struct Slice
|
||||
{
|
||||
__host__ __device__ constexpr Slice() : from_(), to_() {}
|
||||
__host__ __device__ constexpr Slice(FromType from, ToType to) : from_(from), to_(to) {}
|
||||
|
||||
/**
|
||||
* \brief Calculate slice range.
|
||||
*
|
||||
* \param dim Dimension size.
|
||||
* \return Slice range.
|
||||
*/
|
||||
template <typename T>
|
||||
__host__ __device__ constexpr auto range(const T& dim) const
|
||||
{
|
||||
if constexpr(is_same_v<FromType, index_t> || is_same_v<ToType, index_t> ||
|
||||
is_same_v<std::remove_const_t<T>, index_t>)
|
||||
{
|
||||
if(to_ < 0)
|
||||
{
|
||||
return dim - from_ + to_ + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// workaround if one end of the interval is index_t and the second one is Number
|
||||
return static_cast<index_t>(to_) - static_cast<index_t>(from_);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
static_assert(T{} >= ToType{} && FromType{} >= Number<0>{} &&
|
||||
(ToType{} < 0 || ToType{} > FromType{}),
|
||||
"Invalid range");
|
||||
if constexpr(ToType{} < 0)
|
||||
{
|
||||
return dim - from_ + to_ + Number<1>{};
|
||||
}
|
||||
else
|
||||
{
|
||||
return to_ - from_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr bool IsSlice() { return true; }
|
||||
|
||||
const FromType from_;
|
||||
const ToType to_;
|
||||
};
|
||||
|
||||
// Names the return type of `T::IsSlice()`. The alias is ill-formed when `T`
// has no such member, which makes it usable as a detection expression
// (presumably together with is_detected - confirm against the callers).
template <typename T>
|
||||
using is_slice = decltype(std::declval<T&>().IsSlice());
|
||||
|
||||
// Names the return type of `T::IsTuple()`. The alias is ill-formed when `T`
// has no such member, which makes it usable as a detection expression
// (presumably together with is_detected - confirm against the callers).
template <typename T>
|
||||
using is_tuple = decltype(std::declval<T&>().IsTuple());
|
||||
/// @endcond
|
||||
|
||||
/**
|
||||
* \brief Make tensor function.
|
||||
*
|
||||
* \tparam MemoryType Type of memory.
|
||||
* \param pointer Pointer to the memory.
|
||||
* \param layout Tensor layout.
|
||||
* \return Constructed tensor.
|
||||
*/
|
||||
template <MemoryTypeEnum MemoryType,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename UnrolledDescriptorType>
|
||||
constexpr auto make_tensor(ElementType* pointer,
|
||||
const Layout<Shape, UnrolledDescriptorType>& layout)
|
||||
{
|
||||
return Tensor<MemoryType, ElementType, Shape, UnrolledDescriptorType>(pointer, layout);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Make SGPR or VGPR tensor function.
|
||||
*
|
||||
* \tparam MemoryType Type of memory.
|
||||
* \tparam ElementType Memory data type.
|
||||
* \return Constructed tensor.
|
||||
*/
|
||||
template <MemoryTypeEnum MemoryType,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename UnrolledDescriptorType>
|
||||
constexpr auto make_register_tensor(const Layout<Shape, UnrolledDescriptorType>& layout)
|
||||
{
|
||||
return Tensor<MemoryType, ElementType, Shape, UnrolledDescriptorType>(layout);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Clear tensor. (Only for Vpgr/Sgpr)
|
||||
*
|
||||
* \param tensor Tensor to be cleared.
|
||||
*/
|
||||
template <MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename UnrolledDescriptorType>
|
||||
__host__ __device__ void
|
||||
clear(Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
|
||||
{
|
||||
static_assert(
|
||||
!Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>::IsDynamicBuffer);
|
||||
return tensor.GetBuffer().Clear();
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get Tensor Layout.
|
||||
*
|
||||
* \param tensor Tensor to get layout of.
|
||||
* \return Requsted layout.
|
||||
*/
|
||||
template <MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename UnrolledDescriptorType>
|
||||
__host__ __device__ constexpr const auto&
|
||||
layout(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
|
||||
{
|
||||
return tensor.GetLayout();
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Product of tensor shape dims.
|
||||
*
|
||||
* \tparam Idxs Indexes to access specific shape dim (optional).
|
||||
* \param tensor Tensor to get Shape of.
|
||||
* \return Requsted size.
|
||||
*/
|
||||
template <index_t... Idxs,
|
||||
MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename UnrolledDescriptorType>
|
||||
__host__ __device__ constexpr auto
|
||||
size(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
|
||||
{
|
||||
return size<Idxs...>(tensor.GetLayout());
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Rank of Shape tuple.
|
||||
*
|
||||
* \tparam Idxs Indexes to access specific shape dim (optional).
|
||||
* \param tensor Tensor to get rank of.
|
||||
* \return Requsted rank.
|
||||
*/
|
||||
template <index_t... Idxs,
|
||||
MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename UnrolledDescriptorType>
|
||||
__host__ __device__ constexpr auto
|
||||
rank(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
|
||||
{
|
||||
return rank<Idxs...>(tensor.GetLayout());
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Depth of Shape tuple.
|
||||
*
|
||||
* \tparam Idxs Indexes to access specific shape dim (optional).
|
||||
* \param tensor Tensor to get depth of.
|
||||
* \return Requsted depth.
|
||||
*/
|
||||
template <index_t... Idxs,
|
||||
MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename UnrolledDescriptorType>
|
||||
__host__ __device__ constexpr auto
|
||||
depth(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
|
||||
{
|
||||
return depth<Idxs...>(tensor.GetLayout());
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get Tensor shape.
|
||||
*
|
||||
* \param tensor Tensor to get shape from.
|
||||
* \return Requsted shape.
|
||||
*/
|
||||
template <MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename UnrolledDescriptorType>
|
||||
__host__ __device__ constexpr const auto&
|
||||
shape(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
|
||||
{
|
||||
return shape(tensor.GetLayout());
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get dim slice.
|
||||
*
|
||||
* \param from Beginning of the interval.
|
||||
* \param to End of the interval. (could be also negative to index from the end)
|
||||
* \return Requested slice. Could be used to create sliced tensor from other tensor.
|
||||
*/
|
||||
template <typename FromType, typename ToType>
|
||||
constexpr auto slice(const FromType from, const ToType to)
|
||||
{
|
||||
return Slice<FromType, ToType>(from, to);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get dim slice. (Assumed that from is equal to 1)
|
||||
*
|
||||
* \param to End of the interval. (could be also negative to index from the end)
|
||||
* \return Requested slice. Could be used to create sliced tensor from other tensor.
|
||||
*/
|
||||
template <typename ToType>
|
||||
constexpr auto slice(const ToType to)
|
||||
{
|
||||
if constexpr(is_same_v<ToType, index_t>)
|
||||
{
|
||||
return Slice<index_t, ToType>(0, to);
|
||||
}
|
||||
else
|
||||
{
|
||||
return Slice<Number<0>, ToType>(Number<0>{}, to);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get whole dim slice (from = 0, to = -1).
|
||||
*
|
||||
* \return Requested slice. Could be used to create sliced tensor from other tensor.
|
||||
*/
|
||||
constexpr auto slice() { return Slice<Number<0>, Number<-1>>(Number<0>{}, Number<-1>{}); }
|
||||
|
||||
} // namespace wrapper
|
||||
} // namespace ck
|
||||
Reference in New Issue
Block a user