diff --git a/CHANGELOG.md b/CHANGELOG.md index 2891b8585b..abca69142e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ None - Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804) - Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799) - Support for Batched Gemm DL (#732) -- Introduce wrapper sublibrary (limited functionality). (#1071, #1098) +- Introduce wrapper sublibrary (limited functionality). (#1071, #1098, #1108) ### Changes - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817) diff --git a/docs/wrapper.rst b/docs/wrapper.rst index a2f60b97ae..da3a79eda8 100644 --- a/docs/wrapper.rst +++ b/docs/wrapper.rst @@ -71,3 +71,11 @@ Tensor helpers ------------------------------------- .. doxygenfile:: tensor_utils.hpp + +.. doxygenfile:: tensor_partition.hpp + +------------------------------------- +Operations +------------------------------------- + +.. doxygenfile:: copy.hpp diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp index 75f2693f20..f365230054 100644 --- a/include/ck/utility/tuple_helper.hpp +++ b/include/ck/utility/tuple_helper.hpp @@ -178,4 +178,15 @@ __host__ __device__ constexpr auto TupleDepth(const Tuple&) return math::max(TupleDepth(Ts{})...); } +template +__host__ __device__ constexpr auto TupleSlice(const Tuple& tuple) +{ + return generate_tuple( + [&](auto i) { + using Idx = Number; + return tuple.At(Idx{}); + }, + Number{}); +} + } // namespace ck diff --git a/include/ck/wrapper/layout.hpp b/include/ck/wrapper/layout.hpp index f20d985b49..1643eb7383 100644 --- a/include/ck/wrapper/layout.hpp +++ b/include/ck/wrapper/layout.hpp @@ -14,11 +14,9 @@ namespace wrapper { * \tparam Shape Tuple of Number<> (for compile-time layout) or index_t * (dynamic layout). It is possible to pass nested shapes * (e.g. ((4, 2), 2)), nested dimensions are merged. - * \tparam Strides Tuple of Number<> (for compile-time layout) or index_t - * (dynamic layout). Stride tuple should be nested if shape tuple is - * nested. + * \tparam UnnestedDescriptorType Tensor descriptor for unnested shape dims. 
*/ -template +template struct Layout { private: @@ -31,7 +29,7 @@ struct Layout { return generate_tuple( [&](auto) { - if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime()) + if constexpr(!UnnestedDescriptorType::IsKnownAtCompileTime()) { // runtime layout return index_t(0); @@ -45,27 +43,6 @@ struct Layout Number::Size()>{}); } - // Generate packed (column-major) strides if not passed - template - __host__ __device__ constexpr static auto - GenerateColumnMajorPackedStrides(const Tuple& shape) - { - const auto unrolled_shape = UnrollNestedTuple(shape); - return generate_tuple( - [&](auto i) { - if constexpr(i.value == 0) - { - return I1; - } - else - { - return TupleReduce([](auto x, auto y) { return x * y; }, - unrolled_shape); - } - }, - Number{}); - } - // Generate LowerDims in Compile-time for MergeTrasform using passed Type // If element of Tuple is also tuple, then merge (generate sequence for merge) // If tuple is element, then pass through (sequence with one element) @@ -207,33 +184,15 @@ struct Layout return transform_tensor_descriptor(desc, transforms, lower_dims, upper_dims); } - template - __host__ __device__ static auto MakeFlattenDescriptor(const LayoutShape& shape, - const LayoutStrides& strides) - { - const auto unrolled_shape = UnrollNestedTuple(shape); - const auto unrolled_strides = UnrollNestedTuple(strides); - static_assert(unrolled_shape.Size() == unrolled_strides.Size(), - "Size of strides and shape are not consistent."); - return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); - } - - // If the stride is not passed, you can infer it from `GenerateColumnMajorPackedStrides`. - using DeducedStrides = - std::conditional_t>, - remove_cvref_t, - Strides>; - using FlattenDescriptorType = - remove_cvref_t; using Descriptor1dType = - remove_cvref_t; + remove_cvref_t; using DefaultIdxsTupleType = remove_cvref_t; template __host__ __device__ constexpr static auto TransformDesc(const Tuple& shape, const Tuple& idx, - const FlattenDescriptorType& naive_descriptor) + const UnnestedDescriptorType& naive_descriptor) { if constexpr(Tuple::Size() == I1) { @@ -256,48 +215,33 @@ struct Layout } using MergedNestsDescriptorType = remove_cvref_t; + Shape{}, DefaultIdxsTupleType{}, UnnestedDescriptorType{}))>; public: __host__ __device__ constexpr auto GetElementSpaceSize() const { - return flatten_descriptor_.GetElementSpaceSize(); + return unnested_descriptor_.GetElementSpaceSize(); } __host__ __device__ Layout() = delete; + /** * \brief Layout constructor. * * \param shape Shape for layout. - * \param strides Strides for layout (optional if tensor is packed). + * \param unnested_descriptor Descriptor */ - __host__ __device__ constexpr Layout(const Shape& shape, const Strides& strides) - : flatten_descriptor_{}, shape_(shape), strides_(strides) + __host__ __device__ constexpr Layout(const Shape& shape, + const UnnestedDescriptorType& unnested_descriptor) + : shape_(shape) { // Construct if runtime mode - if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime()) + if constexpr(!UnnestedDescriptorType::IsKnownAtCompileTime()) { - flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_); - descriptor_1d_ = MakeMerge1d(shape_, flatten_descriptor_); + unnested_descriptor_ = unnested_descriptor; + descriptor_1d_ = MakeMerge1d(shape_, unnested_descriptor_); merged_nests_descriptor_ = - TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_); - } - } - - /** - * \brief Layout constructor (with default packed column-major strides). 
- * - * \param shape Shape for layout. - */ - __host__ __device__ constexpr Layout(const Shape& shape) - : flatten_descriptor_{}, shape_(shape), strides_(GenerateColumnMajorPackedStrides(shape_)) - { - if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime()) - { - flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_); - descriptor_1d_ = MakeMerge1d(shape_, flatten_descriptor_); - merged_nests_descriptor_ = - TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_); + TransformDesc(shape_, DefaultIdxsTupleType{}, unnested_descriptor_); } } @@ -310,9 +254,9 @@ struct Layout template __host__ __device__ constexpr index_t operator()() const { - static_assert(FlattenDescriptorType::IsKnownAtCompileTime(), + static_assert(UnnestedDescriptorType::IsKnownAtCompileTime(), "Compiletime operator used on runtime layout."); - using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, FlattenDescriptorType{})); + using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, UnnestedDescriptorType{})); using UnrolledIdx = decltype(UnrollNestedTuple(Idxs{})); return TransformedDesc{}.CalculateOffset(UnrolledIdx{}); } @@ -339,7 +283,7 @@ struct Layout else { // Custom index, need to transform descriptor - const auto transformed_desc = TransformDesc(shape_, Idx, flatten_descriptor_); + const auto transformed_desc = TransformDesc(shape_, Idx, unnested_descriptor_); return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx)); } } @@ -351,7 +295,7 @@ struct Layout * \return Calculated size. */ template - __host__ __device__ constexpr index_t GetLength() const + __host__ __device__ constexpr auto GetLength() const { const auto elem = shape_.At(Number{}); if constexpr(is_detected>::value) @@ -371,7 +315,7 @@ struct Layout * * \return Calculated size. */ - __host__ __device__ constexpr index_t GetLengths() const + __host__ __device__ constexpr auto GetLengths() const { const auto unrolled_shape = UnrollNestedTuple(shape_); return TupleReduce([](auto x, auto y) { return x * y; }, @@ -385,13 +329,6 @@ struct Layout */ __host__ __device__ constexpr const Shape& GetShape() const { return shape_; } - /** - * \brief Strides getter. - * - * \return Strides. - */ - __host__ __device__ constexpr const DeducedStrides& GetStrides() const { return strides_; } - /** * \brief Get default lengths (tuple filled with Shape length elements). * @@ -417,17 +354,26 @@ struct Layout * * \return Default descriptor. */ - __host__ __device__ constexpr MergedNestsDescriptorType GetDefaultDescriptor() + __host__ __device__ constexpr const MergedNestsDescriptorType& GetDefaultDescriptor() const { return merged_nests_descriptor_; } + /** + * \brief Get unnested descriptor (with unrolled dims) + * + * \return Flatten descriptor. + */ + __host__ __device__ constexpr const UnnestedDescriptorType& GetUnnestedDescriptor() const + { + return unnested_descriptor_; + } + private: - FlattenDescriptorType flatten_descriptor_; + UnnestedDescriptorType unnested_descriptor_; Descriptor1dType descriptor_1d_; MergedNestsDescriptorType merged_nests_descriptor_; const Shape shape_; - const DeducedStrides strides_; }; } // namespace wrapper diff --git a/include/ck/wrapper/operations/copy.hpp b/include/ck/wrapper/operations/copy.hpp new file mode 100644 index 0000000000..aec80f9ca7 --- /dev/null +++ b/include/ck/wrapper/operations/copy.hpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. 
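For reference, the TupleSlice helper added to tuple_helper.hpp above copies the elements in a half-open index range out of a tuple; the call sites TupleSlice<1, num_transforms>(...) and TupleSlice<0, idx>(...) later in this patch suggest its two template parameters are the bounds from and to. A minimal, hedged usage sketch:

// Sketch only: assumes TupleSlice<from, to> keeps elements [from, to).
const auto t    = ck::make_tuple(ck::Number<2>{}, ck::Number<4>{}, ck::Number<8>{});
const auto tail = ck::TupleSlice<1, 3>(t); // holds (Number<4>{}, Number<8>{})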
+ +#pragma once + +#include "../utils/tensor_utils.hpp" + +namespace ck { +namespace wrapper { + +/** + * \brief Perform generic copy between two tensors. Tensors must have the + * same size. + * + * \param src_tensor Source tensor. + * \param dst_tensor Destination tensor. + */ +template +__host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor) +{ + if constexpr(!SrcTensorType::IsDynamicBuffer) + { + using SizeType = decltype(size(src_tensor)); + static_for<0, SizeType{}, 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); }); + } + else if constexpr(!DstTensorType::IsDynamicBuffer) + { + using SizeType = decltype(size(dst_tensor)); + static_for<0, SizeType{}, 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); }); + } + else + { + for(int i = 0; i < size(src_tensor); i++) + { + dst_tensor(i) = src_tensor(i); + } + } +} + +} // namespace wrapper +} // namespace ck diff --git a/include/ck/wrapper/tensor.hpp b/include/ck/wrapper/tensor.hpp index 4ec6498fbc..a363641373 100644 --- a/include/ck/wrapper/tensor.hpp +++ b/include/ck/wrapper/tensor.hpp @@ -1,9 +1,10 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once #include "utils/tensor_utils.hpp" +#include "utils/tensor_partition.hpp" #include "utils/layout_utils.hpp" namespace ck { @@ -15,14 +16,14 @@ namespace wrapper { * \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR). * \tparam ElementType Element data type. * \tparam Shape Tensor shape (layout component). - * \tparam Strides Tensor strides (layout component). + * \tparam UnnestedDescriptorType Unnested descriptor (layout component). * \tparam NumVectors Number of vectors (only for VGPR, SGPR). * \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR). 
*/ template @@ -31,50 +32,20 @@ struct Tensor private: // Check if Tuple contains Slice object template - constexpr static bool IsSlicing(T&&) + __host__ __device__ constexpr static bool IsSlicing(T&&) { return is_detected::value; } template - constexpr static bool IsSlicing(Tuple&&) + __host__ __device__ constexpr static bool IsSlicing(Tuple&&) { return (IsSlicing(Ts{}) || ...); } - // Calculate first index of new tensor after slice - // It is needed to calculate offset for new tensor - template - constexpr auto GetStartIdxForSlicedTensor(const Tuple& idx) const - { - const auto start_idx_for_sliced_tensor = generate_tuple( - [&](auto i) { - constexpr auto num_i = Number{}; - if constexpr(is_detected>>::value) - { - // if tuple then recurrence - return GetStartIdxForSlicedTensor(idx.At(num_i)); - } - else if constexpr(is_detected>>::value) - { - // if slice, return the beginning of the interval - return idx.At(num_i).from_; - } - else - { - // if one dim selected - return idx.At(num_i); - } - }, - Number::Size()>{}); - - return start_idx_for_sliced_tensor; - } - // Calculate new tensor shape after slice template - constexpr auto GetShapeFromSlicedTensor(const Tuple& idx, - const ShapeTmpType& shape) const + __host__ __device__ constexpr auto GetShapeFromSlicedTensor(const Tuple& idx, + const ShapeTmpType& shape) const { // Pack each value in tuple to remove empty tuples after generation auto new_shape = generate_tuple( @@ -112,67 +83,137 @@ struct Tensor return UnrollNestedTuple<0, 1>(new_shape); } - template - constexpr auto GetStridesFromSlicedTensor(const Tuple& idx, - const StridesTmpType& strides) const + // Generate Freeze for each of nested shape + template + __host__ __device__ constexpr auto GenerateMultipleFreeze(T idx, + const ShapeTmpType& shape) const + { + const auto unrolled_shape = UnrollNestedTuple(shape); + return generate_tuple( + [&](auto i) { + // dimension offset from idx + const auto dim = unrolled_shape.At(Number{}); + const auto dim_idx = idx % dim; + idx /= dim; + return make_freeze_transform(dim_idx); + }, + Number{}); + } + + template + __host__ __device__ constexpr auto + GetTransformsFromSlicedTensor(const Tuple& idx, const ShapeTmpType& shape) const { // Pack each value in tuple to remove empty tuples after generation - auto new_strides = generate_tuple( + auto transforms = generate_tuple( [&](auto i) { constexpr auto num_i = Number{}; if constexpr(is_detected>>::value) { - if constexpr(!IsSlicing(tuple_element_t>{})) - { - // if tuple does not have any slice then we can remove dimension - return Tuple<>{}; - } - else - { - // if tuple then recurrence - return make_tuple( - GetStridesFromSlicedTensor(idx.At(num_i), strides.At(num_i))); - } + return GetTransformsFromSlicedTensor(idx.At(num_i), shape.At(num_i)); } else if constexpr(is_detected>>::value) { - // Stride will be the same - return make_tuple(strides.At(num_i)); + + const auto from = idx.At(num_i).from_; + const auto dim = shape.At(num_i); + const auto range = idx.At(num_i).range(dim); + return make_slice_transform(range, from, from + range); } else { // remove dimension for just value - return Tuple<>{}; + return GenerateMultipleFreeze(idx.At(num_i), shape.At(num_i)); } }, Number::Size()>{}); // Remove empty tuples (deleted elements) and return - return UnrollNestedTuple<0, 1>(new_strides); + return UnrollNestedTuple(transforms); + } + + // There is no output for Freeze transform + template + __host__ __device__ constexpr auto GetSequenceVal(const ck::Freeze&) const + { + return Sequence<>{}; + } + 
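In the new slicing path shown above, each plain integer index becomes one or more Freeze transforms (GenerateMultipleFreeze) and each slice(...) becomes a Slice transform, so a sliced tensor reuses the layout's unnested descriptor instead of recomputing shape and strides. A hedged sketch of the user-facing call pattern, mirroring test_tensor.cpp further down (the Generic address space and element type here are illustrative):

// Sketch only: 2x2x2 tensor over host memory, then two sliced views.
ck::index_t data[8] = {};
const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data, layout);
// slice(2) keeps a whole dimension of length 2 -> full 2x2x2 view.
auto whole = tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2));
// A plain index (here 1) freezes that dimension -> 2x2 view.
auto sub = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2));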
+ template + __host__ __device__ constexpr auto + GetSequenceVal(const ck::Slice&) const + { + return Sequence{}; + } + + template + __host__ __device__ constexpr auto GenerateUpperDims(const Tuple<>&) const + { + return Tuple<>{}; + } + + template + __host__ __device__ constexpr auto + GenerateUpperDims(const Tuple& transforms) const + { + constexpr auto num_transforms = Tuple::Size(); + // Deduce Sequence element for specific transform + const auto currect_elem = GetSequenceVal(transforms.At(Number<0>{})); + if constexpr(is_same_v>) + { + const auto next_tuple = GenerateUpperDims(TupleSlice<1, num_transforms>(transforms)); + return concat_tuple(make_tuple(currect_elem), next_tuple); + } + else + { + // Increase i if current_elem is Slice transform + const auto next_tuple = + GenerateUpperDims(TupleSlice<1, num_transforms>(transforms)); + return concat_tuple(make_tuple(currect_elem), next_tuple); + } + } + + template + __host__ __device__ constexpr auto + GetDescriptorFromSlicedTensor(const Tuple& idx, + const ShapeTmpType& shape, + const FlattenDescriptor& flatten_desc) const + { + constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size(); + + const auto transforms = GetTransformsFromSlicedTensor(idx, shape); + using TransformsTupleType = decltype(transforms); + + const auto lower_dims = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + const auto upper_dims = decltype(GenerateUpperDims<0>(TransformsTupleType{})){}; + return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); } public: - using ElementSpaceSize = decltype(Layout{ - Shape{}, Strides{}}.GetElementSpaceSize()); // SpaceSize type for buffer - using TensorElementType = ElementType; // DataType + using ElementSpaceSize = decltype(Layout{ + Shape{}, UnnestedDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer + using TensorElementType = ElementType; // DataType static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace; static constexpr bool IsDynamicBuffer = !(BufferAddressSpace == MemoryTypeEnum ::Sgpr || BufferAddressSpace == MemoryTypeEnum ::Vgpr); __host__ __device__ Tensor() = delete; - __host__ __device__ Tensor(ElementType* pointer, const Layout& layout) + __host__ __device__ Tensor(ElementType* pointer, + const Layout& layout) : layout_(layout), buffer_(make_dynamic_buffer(pointer, layout.GetElementSpaceSize())) { } - __host__ __device__ Tensor(const Layout& layout) : layout_(layout) + __host__ __device__ Tensor(const Layout& layout) + : layout_(layout) { static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register."); } - __host__ __device__ constexpr const Layout& GetLayout() const + __host__ __device__ constexpr const Layout& GetLayout() const { return layout_; } @@ -182,21 +223,14 @@ struct Tensor __host__ __device__ auto operator[](const Tuple& idx) const { static_assert(IsDynamicBuffer, "Register slice is not supported"); - // Calculate offset based on first idx for new tensor - const index_t offset = layout_(GetStartIdxForSlicedTensor(idx)); + const auto& shape = layout_.GetShape(); + auto new_shape = GetShapeFromSlicedTensor(idx, shape); - auto new_shape = GetShapeFromSlicedTensor(idx, layout_.GetShape()); - if constexpr(is_same_v>) - { - auto new_layout = make_layout(new_shape); - return make_tensor(buffer_.p_data_ + offset, new_layout); - } - else - { - auto new_strides = GetStridesFromSlicedTensor(idx, layout_.GetStrides()); - auto new_layout = make_layout(new_shape, new_strides); - return 
make_tensor(buffer_.p_data_ + offset, new_layout); - } + const auto& flatten_desc = layout_.GetUnnestedDescriptor(); + auto new_desc = GetDescriptorFromSlicedTensor(idx, shape, flatten_desc); + const auto new_layout = + Layout(new_shape, new_desc); + return make_tensor(buffer_.p_data_, new_layout); } template {}), bool> = false> @@ -222,18 +256,10 @@ struct Tensor } else { - if constexpr(is_same_v>) - { - constexpr index_t offset = - Layout{Shape{}}.template operator()>(); - return buffer_[Number{}]; - } - else - { - constexpr index_t offset = - Layout{Shape{}, Strides{}}.template operator()>(); - return buffer_[Number{}]; - } + constexpr index_t offset = Layout{ + Shape{}, + UnnestedDescriptorType{}}.template operator()>(); + return buffer_[Number{}]; } } @@ -260,18 +286,10 @@ struct Tensor } else { - if constexpr(is_same_v>) - { - constexpr index_t offset = - Layout{Shape{}}.template operator()>(); - return buffer_(Number{}); - } - else - { - constexpr index_t offset = - Layout{Shape{}, Strides{}}.template operator()>(); - return buffer_(Number{}); - } + constexpr index_t offset = Layout{ + Shape{}, + UnnestedDescriptorType{}}.template operator()>(); + return buffer_(Number{}); } } @@ -292,6 +310,8 @@ struct Tensor return layout_.GetDefaultDescriptor(); } + __host__ __device__ ElementType* GetPointer() const { return buffer_.p_data_; } + private: using DynamicBufferType = DynamicBuffer; - const Layout layout_; + const Layout layout_; Buffer buffer_; }; diff --git a/include/ck/wrapper/utils/layout_utils.hpp b/include/ck/wrapper/utils/layout_utils.hpp index 5df9dd7dea..f4ba0a969f 100644 --- a/include/ck/wrapper/utils/layout_utils.hpp +++ b/include/ck/wrapper/utils/layout_utils.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -22,11 +22,57 @@ namespace wrapper { // Disable from doxygen docs generation /// @cond // forward declaration -template +template struct Layout; template using is_tuple = decltype(std::declval().IsTuple()); + +namespace { +// Generate packed (column-major) strides if not passed +template +__host__ __device__ constexpr static auto +GenerateColumnMajorPackedStrides(const Tuple& shape) +{ + const auto unrolled_shape = UnrollNestedTuple(shape); + return generate_tuple( + [&](auto i) { + if constexpr(i.value == 0) + { + return Number<1>{}; + } + else + { + return TupleReduce{}.value, i.value>([](auto x, auto y) { return x * y; }, + unrolled_shape); + } + }, + Number{}); +} + +template +__host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shape, + const LayoutStrides& strides) +{ + const auto unrolled_shape = UnrollNestedTuple(shape); + if constexpr(is_same_v>) + { + // if not passed, then generate + const auto unrolled_strides = GenerateColumnMajorPackedStrides(unrolled_shape); + static_assert(unrolled_shape.Size() == unrolled_strides.Size(), + "Size of strides and shape are not consistent."); + return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); + } + else + { + const auto unrolled_strides = UnrollNestedTuple(strides); + static_assert(unrolled_shape.Size() == unrolled_strides.Size(), + "Size of strides and shape are not consistent."); + return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); + } +} +} // namespace + /// @endcond // make_* @@ -38,10 +84,10 @@ using is_tuple = decltype(std::declval().IsTuple()); * \return Constructed layout. 
*/ template -__host__ __device__ constexpr Layout make_layout(const Shape& shape, - const Strides& strides) +__host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides& strides) { - return Layout(shape, strides); + using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Strides{})); + return Layout(shape, MakeFlattenDescriptor(shape, strides)); } /** @@ -52,9 +98,10 @@ __host__ __device__ constexpr Layout make_layout(const Shape& sh * \return Constructed layout. */ template -__host__ __device__ constexpr Layout> make_layout(const Shape& shape) +__host__ __device__ constexpr auto make_layout(const Shape& shape) { - return Layout>(shape); + using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Tuple<>{})); + return Layout(shape, MakeFlattenDescriptor(shape, Tuple<>{})); } // Layout helpers @@ -89,26 +136,51 @@ __host__ __device__ constexpr auto get(const Tuple& tuple) * \param layout Layout to create sub layout. * \return Requsted sub layout. */ -template -__host__ __device__ constexpr auto get(const Layout& layout) +template +__host__ __device__ constexpr auto get(const Layout& layout) { - const auto& shape = layout.GetShape(); - const auto& new_shape = get(shape); + const auto& shape = layout.GetShape(); + const auto new_shape = get(shape); static_assert(is_detected::value, "Shape of sub layout must be tuple"); - if constexpr(is_same_v>) - { - // If stride not passed, create without strides - return make_layout(new_shape); - } - else - { - const auto& strides = layout.GetStrides(); - const auto& new_strides = get(strides); - static_assert(is_detected::value, - "Strides of sub layout must be tuple"); - return make_layout(new_shape, new_strides); - } + + constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size(); + constexpr auto new_shape_dims = decltype(UnrollNestedTuple(new_shape))::Size(); + constexpr auto shape_offset = decltype(UnrollNestedTuple(TupleSlice<0, idx>(shape)))::Size(); + + const auto unrolled_shape = UnrollNestedTuple(shape); + const auto transforms = generate_tuple( + [&](auto i) { + // Compare Idx with shape + if constexpr(i < shape_offset || i >= shape_offset + new_shape_dims) + { + // Remove dimension + return make_freeze_transform(Number<0>{}); + } + else + { + return make_pass_through_transform(unrolled_shape.At(i)); + } + }, + Number{}); + + const auto lower_dims = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + const auto upper_dims = generate_tuple( + [&](auto i) { + if constexpr(i < shape_offset || i >= shape_offset + new_shape_dims) + return Sequence<>{}; + + else + { + return Sequence{}; + } + }, + Number{}); + + const auto& flatten_desc = layout.GetUnnestedDescriptor(); + auto new_desc = transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); + return Layout(new_shape, new_desc); } /** @@ -142,8 +214,8 @@ __host__ __device__ T constexpr size(const T& dim) * \param layout Layout to get Shape of. * \return Requsted length. */ -template -__host__ __device__ constexpr index_t size(const Layout& layout) +template +__host__ __device__ constexpr auto size(const Layout& layout) { return layout.template GetLength(); } @@ -155,7 +227,7 @@ __host__ __device__ constexpr index_t size(const Layout& layout) * \return Requsted size. 
*/ template -__host__ __device__ constexpr index_t size(const Tuple& shape) +__host__ __device__ constexpr auto size(const Tuple& shape) { const auto unrolled_shape = UnrollNestedTuple(shape); return TupleReduce<0, unrolled_shape.Size()>([](auto x, auto y) { return x * y; }, @@ -168,8 +240,8 @@ __host__ __device__ constexpr index_t size(const Tuple& shape) * \param layout Layout to calculate shape size. * \return Requsted size. */ -template -__host__ __device__ constexpr index_t size(const Layout& layout) +template +__host__ __device__ constexpr auto size(const Layout& layout) { return layout.GetLengths(); } @@ -182,7 +254,7 @@ __host__ __device__ constexpr index_t size(const Layout& layout) * \return Requsted length. */ template -__host__ __device__ constexpr index_t size(const Tuple& tuple) +__host__ __device__ constexpr auto size(const Tuple& tuple) { return size(tuple.At(Number{})); } @@ -208,8 +280,9 @@ __host__ __device__ constexpr auto size(const T& elem) * \param layout Layout to calculate rank. * \return Requsted rank. */ -template -__host__ __device__ constexpr auto rank([[maybe_unused]] const Layout& layout) +template +__host__ __device__ constexpr auto +rank([[maybe_unused]] const Layout& layout) { return Shape::Size(); } @@ -261,8 +334,8 @@ __host__ __device__ constexpr auto rank(const T& elem) * \param layout Layout to calculate depth. * \return Requsted depth. */ -template -__host__ __device__ constexpr auto depth(const Layout& layout) +template +__host__ __device__ constexpr auto depth(const Layout& layout) { const auto& shape = layout.GetShape(); return TupleDepth(shape); @@ -307,26 +380,14 @@ __host__ __device__ constexpr auto depth(const T& elem) return depth(get(elem)); } -/** - * \brief Get Layout strides. - * - * \param layout Layout to get strides from. - * \return Requsted strides. - */ -template -__host__ __device__ constexpr const auto& stride(const Layout& layout) -{ - return layout.GetStrides(); -} - /** * \brief Get Layout shape. * * \param layout Layout to get shape from. * \return Requsted shape. */ -template -__host__ __device__ constexpr const auto& shape(const Layout& layout) +template +__host__ __device__ constexpr const auto& shape(const LayoutType& layout) { return layout.GetShape(); } diff --git a/include/ck/wrapper/utils/tensor_partition.hpp b/include/ck/wrapper/utils/tensor_partition.hpp new file mode 100644 index 0000000000..a0634f6b38 --- /dev/null +++ b/include/ck/wrapper/utils/tensor_partition.hpp @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
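GenerateColumnMajorPackedStrides, now a free helper in layout_utils.hpp above, supplies packed column-major strides when make_layout is called without a stride tuple: the first unrolled stride is 1 and stride i is the product of the first i unrolled shape entries. A short worked sketch:

// Sketch only: shape ((2, 2), 4) unrolls to (2, 2, 4), so the deduced
// packed strides are (1, 2, 4), i.e. stride[i] = product of shape[0..i).
const auto packed_layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 4));
// The same layout with the strides written out explicitly:
const auto explicit_layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 4),
                                                      ck::make_tuple(ck::make_tuple(1, 2), 4));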
+ +#pragma once + +#include "tensor_utils.hpp" +#include "layout_utils.hpp" + +namespace ck { +namespace wrapper { + +namespace { +// Calculate shape for partition based on number of threads per each dim and +// previous shape +template +__host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple& shape, + const Tuple& thread_lengths) +{ + static_assert(Tuple::Size() == Tuple::Size(), "Wrong thread_lengths shape."); + return generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + // if tuple then recurrence + return CalculateLocalPartitionShape(shape.At(num_i), thread_lengths.At(num_i)); + } + else + { + const auto slice_len = shape.At(num_i) / thread_lengths.At(num_i); + return slice_len; + } + }, + Number::Size()>{}); +} + +// Calculate shape for partition based on number of threads per each dim, +// previous strides and steps +template +__host__ __device__ constexpr auto +CalculateLocalPartitionDescriptor(const Tuple& shape, + const Tuple& thread_lengths, + const Tuple& steps, + const FlattenDescType& flatten_desc) +{ + + static_assert(Tuple::Size() == Tuple::Size(), "Wrong thread_lengths shape."); + const auto unrolled_thread_lengths = UnrollNestedTuple(thread_lengths); + const auto unrolled_shape = UnrollNestedTuple(shape); + constexpr auto dims = decltype(unrolled_thread_lengths)::Size(); + + using UnrolledStepsType = decltype(UnrollNestedTuple(steps)); + + using I1 = Number<1>; + + const auto transforms = generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_same_v, Tuple<>>) + { + // By default raked partition + const auto partition_stride = unrolled_thread_lengths.At(num_i); + return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), + make_tuple(partition_stride)); + } + else if constexpr(!is_same_v, index_t>) + { + // Compiletime partition + if constexpr(is_same_v, I1>) + { + // raked + const auto partition_stride = unrolled_thread_lengths.At(num_i); + return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), + make_tuple(partition_stride)); + } + else + { + // packed + return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), + make_tuple(I1{})); + } + } + else + { + // Runtime partition + if(steps.At(num_i) == 1) + { + // raked + const auto partition_stride = unrolled_thread_lengths.At(num_i); + return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), + make_tuple(partition_stride)); + } + else + { + // packed + return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), + make_tuple(I1{})); + } + } + }, + Number{}); + + const auto lower_dims = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + const auto upper_dims = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); +} + +template +__host__ __device__ constexpr auto CalculateLayoutOffsetIdxImpl(const Tuple& thread_lengths, + const Tuple& steps, + index_t& thread_id) +{ + return generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + // if tuple then recurrence + if constexpr(is_same_v, Tuple<>>) + { + return CalculateLayoutOffsetIdxImpl( + thread_lengths.At(num_i), Tuple<>{}, thread_id); + } + else + { + return CalculateLayoutOffsetIdxImpl( + thread_lengths.At(num_i), steps.At(num_i), thread_id); + } + } + else + { + // Update thread_id after each dim + const auto dim_thread_id = thread_id % thread_lengths.At(num_i); + 
thread_id /= thread_lengths.At(num_i); + if constexpr(is_same_v, Tuple<>>) + { + return dim_thread_id; + } + else + { + // Apply step + return steps.At(num_i) * dim_thread_id; + } + } + }, + Number::Size()>{}); +} + +// Convert integer thread_idx to tuple index with steps applied +template +__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& thread_lengths, + const Tuple& steps, + const index_t thread_id) +{ + // Create tmp thread_id copy for CalculateLayoutOffsetIdxImpl updates + index_t thread_id_copy = thread_id; + return CalculateLayoutOffsetIdxImpl(thread_lengths, steps, thread_id_copy); +} + +// Apply steps to index represented as tuple +template +__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& steps, + const Tuple& block_idxs) +{ + return generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + // if tuple then recurrence + if constexpr(is_same_v, Tuple<>>) + { + return CalculateLayoutOffsetIdx(Tuple<>{}, block_idxs.At(num_i)); + } + else + { + return CalculateLayoutOffsetIdx(steps.At(num_i), block_idxs.At(num_i)); + } + } + else + { + if constexpr(is_same_v, Tuple<>>) + { + return block_idxs.At(num_i); + } + else + { + // apply step + return steps.At(num_i) * block_idxs.At(num_i); + } + } + }, + Number::Size()>{}); +} + +// User passes only shape per block to the make_local_tile function. This function calculates +// block layout based on the shape. +template +__host__ __device__ constexpr auto CalculateBlockLengths(const Tuple& shape, + const Tuple& tile_shape) +{ + return generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + // if tuple then recurrence + return CalculateBlockLengths(shape.At(num_i), tile_shape.At(num_i)); + } + else + { + return shape.At(num_i) / tile_shape.At(num_i); + } + }, + Number::Size()>{}); +} +} // namespace + +/** + * \brief Create local partition for thread. + * + * \param tensor Tensor for partition. + * \param thread_lengths Layout of threads. + * \param thread_id Thread index represented as integer. + * \param steps Thread step (default=1, raked partition) + * \return Partition tensor. + */ +template > +__host__ __device__ constexpr auto make_local_partition(const TensorType& tensor, + const ThreadLengthsTuple& thread_lengths, + const index_t thread_id, + const StepsTuple steps = StepsTuple{}) +{ + // Create shape, strides and layout for new partition tensor + const auto partition_shape = CalculateLocalPartitionShape(shape(tensor), thread_lengths); + // Create new descriptor and layout + const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor(); + auto partition_desc = + CalculateLocalPartitionDescriptor(shape(tensor), thread_lengths, steps, flatten_desc); + const auto partition_layout = Layout( + partition_shape, partition_desc); + // Calculate offset for new partition tensor + const auto offset_idx = CalculateLayoutOffsetIdx(thread_lengths, steps, thread_id); + const auto partition_offset = layout(tensor)(offset_idx); + return make_tensor(tensor.GetPointer() + partition_offset, + partition_layout); +} + +/** + * \brief Create local tile for thread block. + * + * \param tensor Tensor for partition. + * \param tile_shape Shapes of requested tile. + * \param block_idx Block index represented as tuple. + * \param steps Block step (default=1, raked partition) + * \return Tile tensor. 
+ */ +template > +__host__ __device__ constexpr auto make_local_tile(const TensorType& tensor, + const BlockShapeTuple& tile_shape, + const BlockIdxTuple& block_idx, + const StepsTuple steps = StepsTuple{}) +{ + // Create block lengths, strides and layout for new tile tensor + const auto block_lengths = CalculateBlockLengths(shape(tensor), tile_shape); + // Create new descriptor and layout + const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor(); + auto tile_desc = + CalculateLocalPartitionDescriptor(tile_shape, block_lengths, steps, flatten_desc); + const auto tile_layout = Layout, decltype(tile_desc)>( + tile_shape, tile_desc); + // Calculate offset for new partition tensor + const auto offset_idx = CalculateLayoutOffsetIdx(steps, block_idx); + const auto tile_offset = layout(tensor)(offset_idx); + return make_tensor(tensor.GetPointer() + tile_offset, + tile_layout); +} + +} // namespace wrapper +} // namespace ck diff --git a/include/ck/wrapper/utils/tensor_utils.hpp b/include/ck/wrapper/utils/tensor_utils.hpp index 5f0dc3e500..1e932e62e1 100644 --- a/include/ck/wrapper/utils/tensor_utils.hpp +++ b/include/ck/wrapper/utils/tensor_utils.hpp @@ -27,12 +27,12 @@ using MemoryTypeEnum = AddressSpaceEnum; // Disable from doxygen docs generation /// @cond // forward declarations -template +template struct Layout; template @@ -98,11 +98,19 @@ using is_tuple = decltype(std::declval().IsTuple()); * \param layout Tensor layout. * \return Constructed tensor. */ -template -constexpr auto make_tensor(ElementType* pointer, const Layout& layout) +template +constexpr auto make_tensor(ElementType* pointer, + const Layout& layout) { - return Tensor( - pointer, layout); + return Tensor(pointer, layout); } /** @@ -112,19 +120,21 @@ constexpr auto make_tensor(ElementType* pointer, const Layout& l * \tparam NumVectors Number of vectors. * \tparam ScalarPerVector Scalars per vector. * \tparam ElementType Memory data type. - * \param layout Tensor layout. * \return Constructed tensor. */ template -constexpr auto make_register_tensor(const Layout& layout) + typename ElementType> +constexpr auto make_register_tensor() { - static_assert(!IsNestedTuple(Shape{}), "Register tensor with nested layout is not supported"); - return Tensor(layout); + const auto layout = make_layout(make_tuple(Number{}), make_tuple(Number<1>{})); + return Tensor>, + std::remove_const_t>, + NumVectors, + ScalarPerVector>(layout); } /** @@ -136,12 +146,15 @@ constexpr auto make_register_tensor(const Layout& layout) template -__host__ __device__ constexpr const auto& -layout(const Tensor& - tensor) +__host__ __device__ constexpr const auto& layout(const Tensor& tensor) { return tensor.GetLayout(); } @@ -157,12 +170,15 @@ template -__host__ __device__ constexpr index_t -size(const Tensor& - tensor) +__host__ __device__ constexpr auto size(const Tensor& tensor) { return size(tensor.GetLayout()); } @@ -178,12 +194,15 @@ template -__host__ __device__ constexpr index_t -rank(const Tensor& - tensor) +__host__ __device__ constexpr auto rank(const Tensor& tensor) { return rank(tensor.GetLayout()); } @@ -199,35 +218,19 @@ template -__host__ __device__ constexpr index_t -depth(const Tensor& - tensor) +__host__ __device__ constexpr auto depth(const Tensor& tensor) { return depth(tensor.GetLayout()); } -/** - * \brief Get Tensor strides. - * - * \param tensor Tensor to get strides from. - * \return Requsted strides. 
- */ -template -__host__ __device__ constexpr const auto& -stride(const Tensor& - tensor) -{ - return stride(tensor.GetLayout()); -} - /** * \brief Get Tensor shape. * @@ -237,12 +240,15 @@ stride(const Tensor -__host__ __device__ constexpr const auto& -shape(const Tensor& - tensor) +__host__ __device__ constexpr const auto& shape(const Tensor& tensor) { return shape(tensor.GetLayout()); } diff --git a/test/wrapper/CMakeLists.txt b/test/wrapper/CMakeLists.txt index 6b25c08a8a..6c3e29ab87 100644 --- a/test/wrapper/CMakeLists.txt +++ b/test/wrapper/CMakeLists.txt @@ -2,3 +2,7 @@ add_gtest_executable(test_layout test_layout.cpp) target_link_libraries(test_layout PRIVATE utility) add_gtest_executable(test_tensor test_tensor.cpp) target_link_libraries(test_tensor PRIVATE utility) +add_gtest_executable(test_copy test_copy.cpp) +target_link_libraries(test_copy PRIVATE utility) +add_gtest_executable(test_partition test_partition.cpp) +target_link_libraries(test_partition PRIVATE utility) diff --git a/test/wrapper/test_copy.cpp b/test/wrapper/test_copy.cpp new file mode 100644 index 0000000000..5cf09a54be --- /dev/null +++ b/test/wrapper/test_copy.cpp @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/utility/common_header.hpp" +#include "ck/wrapper/layout.hpp" +#include "ck/wrapper/tensor.hpp" +#include "ck/wrapper/operations/copy.hpp" + +// Test copy from Global to Global through LDS and VGPR +template +__global__ void TestCopyDevice(const InputTensor input_tensor, + OutputTensor output_tensor, + const BlockShape tile_shape, + const ThreadLayoutShape thread_layout, + const LocalTileSteps block_steps, + const LocalPartitionSteps thread_steps) +{ + __shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)]; + auto tensor_lds = ck::wrapper::make_tensor( + p_shared, ck::wrapper::make_layout(tile_shape)); + + const auto block_idxs = ck::make_tuple(ck::make_tuple(0, 0), blockIdx.x); + + // Get local tiles for global memory + const auto input_local_tile = + ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idxs, block_steps); + const auto output_local_tile = + ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idxs, block_steps); + + // Get partition per thread + const auto input_local_partition = ck::wrapper::make_local_partition( + input_local_tile, thread_layout, threadIdx.x, thread_steps); + auto lds_local_partition = + ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x, thread_steps); + auto output_local_partition = ck::wrapper::make_local_partition( + output_local_tile, thread_layout, threadIdx.x, thread_steps); + + // Allocate VGPR + constexpr ck::index_t scalar_per_vector = 1; + constexpr ck::index_t vgpr_size = ck::wrapper::size(lds_local_partition); + auto tensor_vgpr = ck::wrapper::make_register_tensor(); + + // Perform copy + ck::wrapper::copy(input_local_partition, lds_local_partition); + ck::wrapper::copy(lds_local_partition, tensor_vgpr); + ck::wrapper::copy(tensor_vgpr, output_local_partition); +} + +void PerformCopyGlobalToGlobalViaLDS() +{ + const auto shape = + ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<256>{}); + const auto strides = + ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<2>{}), 
ck::Number<4>{}); + const auto layout = ck::wrapper::make_layout(shape, strides); + + // 0, 1, 2, ..., size(shape) - 1 + std::vector input_data(ck::wrapper::size(shape)); + std::iota(input_data.begin(), input_data.end(), 0); + + // Global memory buffers + DeviceMem in_buf(ck::wrapper::size(layout) * sizeof(ck::index_t)); + DeviceMem out_buf(ck::wrapper::size(layout) * sizeof(ck::index_t)); + + in_buf.ToDevice(input_data.data()); + out_buf.SetZero(); + + // Create tensors for global memory + const auto input_tensor_global = ck::wrapper::make_tensor( + static_cast(in_buf.GetDeviceBuffer()), layout); + auto output_tensor_global = ck::wrapper::make_tensor( + static_cast(out_buf.GetDeviceBuffer()), layout); + + const auto thread_layout = + ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<32>{}); + const auto tile_shape = + ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<64>{}); + + const auto thread_steps = + ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<2>{}); + const auto block_steps = + ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<64>{}); + + const ck::index_t grid_size = ck::math::integer_divide_ceil( + ck::wrapper::size(input_tensor_global), ck::wrapper::size(tile_shape)); + + const auto kernel = TestCopyDevice; + launch_and_time_kernel(StreamConfig{}, + kernel, + dim3(grid_size), + dim3(ck::wrapper::size(thread_layout)), + 0, + input_tensor_global, + output_tensor_global, + tile_shape, + thread_layout, + block_steps, + thread_steps); + + // Verify results + std::vector output_data(ck::wrapper::size(shape)); + out_buf.FromDevice(output_data.data()); + EXPECT_TRUE(ck::utils::check_err(output_data, input_data)); +} + +TEST(TestCopy, CopyGlobalToGlobalViaLDS) { PerformCopyGlobalToGlobalViaLDS(); } diff --git a/test/wrapper/test_layout.cpp b/test/wrapper/test_layout.cpp index 14a8b96462..a128a6d84f 100644 --- a/test/wrapper/test_layout.cpp +++ b/test/wrapper/test_layout.cpp @@ -84,7 +84,8 @@ TEST_F(TestWrapperLayout, 2d) ck::make_tuple(ck::Sequence<0>{})); const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0)); const auto layout_compiletime = - ck::wrapper::make_layout(ck::make_tuple(ck::Number{}, ck::Number{})); + ck::wrapper::make_layout(ck::make_tuple(ck::Number{}, ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{})); std::vector> idxs; for(ck::index_t h = 0; h < d1; h++) @@ -435,19 +436,11 @@ TEST(TestLayoutHelpers, ShapeAndStrides) constexpr bool check_compiletime_shape = std::is_same_v>; - constexpr bool check_compiletime_strides = - std::is_same_v>; constexpr bool check_runtime_shape = std::is_same_v>; - constexpr bool check_runtime_strides = - std::is_same_v>; EXPECT_TRUE(check_compiletime_shape); - EXPECT_TRUE(check_compiletime_strides); EXPECT_TRUE(check_runtime_shape); - EXPECT_TRUE(check_runtime_strides); } TEST(TestLayoutHelpers, Hierarchical) diff --git a/test/wrapper/test_partition.cpp b/test/wrapper/test_partition.cpp new file mode 100644 index 0000000000..df56b879f6 --- /dev/null +++ b/test/wrapper/test_partition.cpp @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
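The tests below exercise make_local_partition and make_local_tile. For orientation, a condensed, hedged sketch of the call pattern (template arguments abbreviated; by default the split is raked, and passing per-dimension steps selects the packed variant):

// Sketch only: partition a ((16, 4), 4) tensor across 8 threads and 16 blocks.
ck::index_t data[256] = {};
const auto shape   = ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{});
const auto strides = ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{});
const auto layout  = ck::wrapper::make_layout(shape, strides);
const auto tensor  = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data, layout);

// One thread's raked partition: each dimension is divided by thread_lengths.
const auto thread_lengths = ck::make_tuple(ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}), ck::Number<1>{});
const auto per_thread     = ck::wrapper::make_local_partition(tensor, thread_lengths, /*thread_id=*/0);

// One block's raked tile, selected by a nested block index.
const auto tile_shape = ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
const auto per_block  = ck::wrapper::make_local_tile(tensor, tile_shape, ck::make_tuple(ck::make_tuple(0, 0), 0));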
+ +#include +#include +#include +#include +#include +#include + +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/utility/common_header.hpp" +#include "ck/wrapper/layout.hpp" +#include "ck/wrapper/tensor.hpp" + +TEST(TestPartition, LocalPartition) +{ + const auto shape = + ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{}); + const auto strides = + ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{}); + const auto layout = ck::wrapper::make_layout(shape, strides); + + std::vector data(ck::wrapper::size(layout)); + std::iota(data.begin(), data.end(), 0); + + const auto tensor = + ck::wrapper::make_tensor(data.data(), layout); + + const auto thread_steps = + ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<1>{}), ck::Number<1>{}); + const auto thread_layout = + ck::make_tuple(ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}), ck::Number<1>{}); + + for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++) + { + const auto raked_partition = + ck::wrapper::make_local_partition(tensor, thread_layout, thread_id); + + const auto expected_partition_size = + ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout); + EXPECT_EQ(ck::wrapper::size(raked_partition), expected_partition_size); + EXPECT_EQ(raked_partition(0), thread_id); + } + + for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++) + { + const auto packed_partition = + ck::wrapper::make_local_partition(tensor, thread_layout, thread_id, thread_steps); + + const auto expected_partition_size = + ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout); + const auto expected_partition_first_val = thread_id * ck::wrapper::size<0, 0>(thread_steps); + EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size); + EXPECT_EQ(packed_partition(0), expected_partition_first_val); + } +} + +TEST(TestPartition, LocalTile) +{ + const auto shape = + ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{}); + const auto strides = + ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{}); + const auto layout = ck::wrapper::make_layout(shape, strides); + + std::vector data(ck::wrapper::size(layout)); + std::iota(data.begin(), data.end(), 0); + + const auto tensor = + ck::wrapper::make_tensor(data.data(), layout); + + const auto block_steps = + ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{}); + const auto block_shape = + ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{}); + const auto block_layout = + ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{}); + + std::vector, ck::index_t>> block_idxs; + for(ck::index_t x = 0; x < ck::wrapper::size<0, 0>(block_layout); x++) + { + for(ck::index_t y = 0; y < ck::wrapper::size<0, 1>(block_layout); y++) + { + for(ck::index_t z = 0; z < ck::wrapper::size<1>(block_layout); z++) + { + block_idxs.emplace_back(ck::make_tuple(x, y), z); + } + } + } + + for(const auto& block_idx : block_idxs) + { + const auto raked_tile = ck::wrapper::make_local_tile(tensor, block_shape, block_idx); + + const auto expected_tile_size = ck::wrapper::size(block_shape); + EXPECT_EQ(ck::wrapper::size(raked_tile), expected_tile_size); + EXPECT_EQ(raked_tile(0), layout(block_idx)); + } + + for(const auto& block_idx : block_idxs) + { + const auto 
packed_tile = + ck::wrapper::make_local_tile(tensor, block_shape, block_idx, block_steps); + + const auto expected_tile_size = ck::wrapper::size(block_shape); + const auto expected_tile_first_val = + ck::wrapper::size<0, 0>(block_idx) * ck::wrapper::size<0, 0>(block_shape) * + ck::wrapper::size<0, 0>(strides) + + ck::wrapper::size<0, 1>(block_idx) * ck::wrapper::size<0, 1>(block_shape) * + ck::wrapper::size<0, 1>(strides) + + ck::wrapper::size<1>(block_idx) * ck::wrapper::size<1>(block_shape) * + ck::wrapper::size<1>(strides); + EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size); + EXPECT_EQ(packed_tile(0), expected_tile_first_val); + } +} diff --git a/test/wrapper/test_tensor.cpp b/test/wrapper/test_tensor.cpp index 74cf7f1316..2d4d6f2750 100644 --- a/test/wrapper/test_tensor.cpp +++ b/test/wrapper/test_tensor.cpp @@ -108,7 +108,6 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success) bool* casted_success_ptr = static_cast(success); const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2)); - constexpr auto register_layout = ck::wrapper::make_layout(ck::make_tuple(ck::Number<8>{})); auto tensor_global = ck::wrapper::make_tensor(casted_data_ptr, layout); @@ -116,11 +115,11 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success) auto tensor_vgpr = ck::wrapper::make_register_tensor(register_layout); + ck::index_t>(); auto tensor_sgpr = ck::wrapper::make_register_tensor(register_layout); + ck::index_t>(); InitTensor(tensor_global); InitTensor(tensor_lds); @@ -151,7 +150,7 @@ TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory) TestTensorReadWriteDevice, dim3(1), dim3(1), - nelems * sizeof(ck::index_t), + 0, data_buf.GetDeviceBuffer(), success_buf.GetDeviceBuffer()); @@ -173,33 +172,45 @@ TEST(TestTensor, Slicing) auto tensor2x2x2 = tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2)); + EXPECT_EQ(tensor2x2x2(0), layout(ck::make_tuple(ck::make_tuple(0, 0), 0))); EXPECT_EQ(ck::wrapper::rank(tensor2x2x2), 2); EXPECT_EQ(ck::wrapper::depth(tensor2x2x2), 2); EXPECT_EQ(ck::wrapper::size(tensor2x2x2), 8); EXPECT_TRUE(TestTensorCheck1d(tensor2x2x2)); auto tensor2x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2)); + EXPECT_EQ(tensor2x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0))); EXPECT_EQ(ck::wrapper::rank(tensor2x2), 2); EXPECT_EQ(ck::wrapper::depth(tensor2x2), 2); EXPECT_EQ(ck::wrapper::size(tensor2x2), 4); - EXPECT_TRUE(TestTensorCheck1d(tensor2x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0)))); + EXPECT_TRUE(TestTensorCheck1d(tensor2x2)); auto tensor1x1 = tensor(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2)); + EXPECT_EQ(tensor1x1(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 1))); EXPECT_EQ(rank(tensor1x1), 2); EXPECT_EQ(depth(tensor1x1), 2); EXPECT_EQ(size(tensor1x1), 1); - EXPECT_TRUE(TestTensorCheck1d(tensor1x1, layout(ck::make_tuple(ck::make_tuple(1, 1), 1)))); + EXPECT_TRUE(TestTensorCheck1d(tensor1x1)); auto tensor2 = tensor(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2)); + EXPECT_EQ(tensor2(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 0))); EXPECT_EQ(ck::wrapper::rank(tensor2), 1); EXPECT_EQ(ck::wrapper::depth(tensor2), 1); EXPECT_EQ(ck::wrapper::size(tensor2), 2); - EXPECT_TRUE(TestTensorCheck1d(tensor2, layout(ck::make_tuple(ck::make_tuple(1, 1), 0)))); + EXPECT_TRUE(TestTensorCheck1d(tensor2)); + + auto tensor2_v2 = tensor(2, ck::wrapper::slice(0, 2)); + EXPECT_EQ(tensor2_v2(0), layout(ck::make_tuple(2, 0))); + 
EXPECT_EQ(ck::wrapper::rank(tensor2_v2), 1); + EXPECT_EQ(ck::wrapper::depth(tensor2_v2), 1); + EXPECT_EQ(ck::wrapper::size(tensor2_v2), 2); + EXPECT_TRUE(TestTensorCheck1d(tensor2_v2)); // negative indexing auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice()); + EXPECT_EQ(tensor1x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0))); EXPECT_EQ(rank(tensor1x2), 2); EXPECT_EQ(depth(tensor1x2), 2); EXPECT_EQ(size(tensor1x2), 2); - EXPECT_TRUE(TestTensorCheck1d(tensor1x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0)))); + EXPECT_TRUE(TestTensorCheck1d(tensor1x2)); }
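Taken together, these changes let a kernel stage data through registers without constructing a register layout by hand: make_register_tensor now takes only template parameters, and copy moves elements one by one (using static_for when either side is a static register buffer). A condensed, hedged sketch of the device-side pattern from TestCopyDevice above; lds_partition and global_out_partition stand in for the per-thread partitions created there, and the vector size is an assumption:

// Sketch only: stage a per-thread partition through VGPRs and write it out.
constexpr ck::index_t scalar_per_vector = 1;
constexpr ck::index_t vgpr_size         = 8; // assumed size of the per-thread partition
auto vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
                                              vgpr_size,
                                              scalar_per_vector,
                                              ck::index_t>();
ck::wrapper::copy(lds_partition, vgpr);        // LDS  -> VGPR (static_for on the register side)
ck::wrapper::copy(vgpr, global_out_partition); // VGPR -> global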