From 660bfadafd88bd5705fcbea7549a281b8d62f4bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Fri, 19 Jan 2024 11:29:00 +0100 Subject: [PATCH] Add optimized copy to ck wrapper (#1126) * Add optimized copy to ck wrapper * Example optimizations * Fixes * Move img2col test to client example * Refactor example * Fix docs * Fixes * Fix * Fixes * Fixes * Fixes * Fixes * Fixes --------- Co-authored-by: zjing14 [ROCm/composable_kernel commit: 7e4eb4b800b7bec8adb9a1a766f7aba1557e8aa2] --- CHANGELOG.md | 17 +- .../25_tensor_transforms/tensor_transform.cpp | 150 ----- .../CMakeLists.txt | 4 +- .../tensor_transform_using_wrapper.cpp | 2 +- client_example/25_wrapper/wrapper_img2col.cpp | 180 ++++++ docs/wrapper.rst | 8 +- .../ck/utility/is_known_at_compile_time.hpp | 8 +- include/ck/wrapper/layout.hpp | 192 +++++-- include/ck/wrapper/operations/copy.hpp | 140 ++++- include/ck/wrapper/tensor.hpp | 511 ++++++++++-------- include/ck/wrapper/utils/layout_utils.hpp | 81 ++- include/ck/wrapper/utils/tensor_partition.hpp | 376 +++++-------- include/ck/wrapper/utils/tensor_utils.hpp | 111 ++-- .../cpu/reference_image_to_column.hpp | 3 +- test/wrapper/test_copy.cpp | 79 +-- test/wrapper/test_partition.cpp | 89 ++- test/wrapper/test_tensor.cpp | 23 +- 17 files changed, 1109 insertions(+), 865 deletions(-) delete mode 100644 client_example/25_tensor_transforms/tensor_transform.cpp rename client_example/{25_tensor_transforms => 25_wrapper}/CMakeLists.txt (55%) rename client_example/{25_tensor_transforms => 25_wrapper}/tensor_transform_using_wrapper.cpp (98%) create mode 100644 client_example/25_wrapper/wrapper_img2col.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index abca69142e..12cc4363de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,21 @@ Full documentation for Composable Kernel is not yet available. -## (Unreleased) CK for ROCm 6.0.0 +## (Unreleased) CK + +### Fixes +None + +### Optimizations +None + +### Additions +- Introduce wrapper sublibrary (limited functionality). (#1071, #1098, #1108, #1126) + +### Changes +None + +## CK for ROCm 6.0.0 ### Fixes - Fixed a hazard associated with inline v_dot (#808) @@ -19,7 +33,6 @@ None - Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804) - Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799) - Support for Batched Gemm DL (#732) -- Introduce wrapper sublibrary (limited functionality). (#1071, #1098, #1108) ### Changes - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817) diff --git a/client_example/25_tensor_transforms/tensor_transform.cpp b/client_example/25_tensor_transforms/tensor_transform.cpp deleted file mode 100644 index 41ceec1cb5..0000000000 --- a/client_example/25_tensor_transforms/tensor_transform.cpp +++ /dev/null @@ -1,150 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include "ck/ck.hpp" - -#include "ck/utility/number.hpp" -#include "ck/utility/tuple.hpp" -#include "ck/utility/sequence.hpp" - -#include "ck/tensor_description/tensor_descriptor.hpp" -#include "ck/tensor_description/tensor_descriptor_helper.hpp" -#include "ck/tensor_description/multi_index_transform_helper.hpp" - -static constexpr auto I0 = ck::Number<0>{}; -static constexpr auto I1 = ck::Number<1>{}; -static constexpr auto I2 = ck::Number<2>{}; - -using DataType = int; - -template -void Print1d(const Desc& desc) -{ - std::cout << "Print1d" << std::endl; - for(ck::index_t w = 0; w < desc.GetLength(I0); w++) - { - std::cout << desc.CalculateOffset(ck::make_tuple(w)) << " "; - } - std::cout << std::endl; -} - -template -void Print2d(const Desc& desc) -{ - std::cout << "Print2d" << std::endl; - for(ck::index_t h = 0; h < desc.GetLength(I0); h++) - { - for(ck::index_t w = 0; w < desc.GetLength(I1); w++) - { - std::cout << desc.CalculateOffset(ck::make_tuple(h, w)) << " "; - } - std::cout << std::endl; - } -} - -template -void Print3dCustom(const Desc& desc) -{ - std::cout << "Print3dCustom" << std::endl; - for(ck::index_t d = 0; d < desc.GetLength(I0); d++) - { - for(ck::index_t h = 0; h < desc.GetLength(I1); h++) - { - for(ck::index_t w = 0; w < desc.GetLength(I2); w++) - { - std::cout << desc.CalculateOffset(ck::make_tuple(d, h, w)) << " "; - } - std::cout << std::endl; - } - std::cout << std::endl; - } -} - -int main() -{ - // Tensor descriptor traverse in row-major (need to reverse dims) - std::cout << "Note: Tensor descriptor traverse in row-major" << std::endl; - // Basic descriptor 0, 1, 2, ... 30, 31 - // (dims:4,8 strides:1,4) - const auto desc_4x8_s1x4 = - ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}), - ck::make_tuple(ck::Number<1>{}, ck::Number<4>{})); - std::cout << "dims:4,8 strides:1,4" << std::endl; - Print2d(desc_4x8_s1x4); - - using Cord1x1Type = ck::Tuple, ck::Number<1>>; - constexpr ck::index_t offset_1x1 = desc_4x8_s1x4.CalculateOffset(Cord1x1Type{}); - std::cout << "Constexpr calculated [1, 1] offset:" << offset_1x1 << std::endl; - - // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (compile-time descriptor) - // dims:4,(2,4) strides:2,(1,8) - const auto desc_4x2x4_s2x1x8 = - ck::make_naive_tensor_descriptor(ck::make_tuple(4, 2, 4), ck::make_tuple(2, 1, 8)); - // Transform to 2d (column-major, need to to reverse dims) - const auto desc_4x2x4_s2x1x8_merged = ck::transform_tensor_descriptor( - desc_4x2x4_s2x1x8, - ck::make_tuple(ck::make_pass_through_transform(4), - ck::make_merge_transform(ck::make_tuple(4, 2))), - ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<2, 1>{}), - ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{})); - - std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl; - Print2d(desc_4x2x4_s2x1x8_merged); - - // Basic descriptor 0, 1, 8, 9, 16, 17, ... 
30, 31 (compile-time descriptor) - // dims:(2,2),(2,4) strides:((1,4),(2,8) - const auto desc_2x2x2x4_s1x4x2x8 = - ck::make_naive_tensor_descriptor(ck::make_tuple(2, 2, 2, 4), ck::make_tuple(1, 4, 2, 8)); - // Transform to 2d - const auto desc_2x2x2x4_s1x4x2x8_double_merged_2d = ck::transform_tensor_descriptor( - desc_2x2x2x4_s1x4x2x8, - ck::make_tuple(ck::make_merge_transform(ck::make_tuple(2, 2)), - ck::make_merge_transform(ck::make_tuple(4, 2))), - ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<3, 2>{}), - ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{})); - // Transform to 3d - const auto desc_2x2x2x4_s1x4x2x8_double_merged_3d = ck::transform_tensor_descriptor( - desc_2x2x2x4_s1x4x2x8, - ck::make_tuple(ck::make_pass_through_transform(2), - ck::make_pass_through_transform(2), - ck::make_merge_transform(ck::make_tuple(4, 2))), - ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<3, 2>{}), - ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{})); - - std::cout << "dims:(2,2),(2,4) strides:(1,4),(2,8)" << std::endl; - Print2d(desc_2x2x2x4_s1x4x2x8_double_merged_2d); - Print3dCustom(desc_2x2x2x4_s1x4x2x8_double_merged_3d); - - // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (compile-time descriptor) - // dims:((2,2),2),4 strides:((1,4),2),8 - // Transform to 2d - const auto desc_2x2x2x4_s1x4x2x8_nested = - ck::make_naive_tensor_descriptor(ck::make_tuple(2, 2, 2, 4), ck::make_tuple(1, 4, 2, 8)); - const auto desc_2x2x2x4_s1x4x2x8_nested_merged_3d = ck::transform_tensor_descriptor( - desc_2x2x2x4_s1x4x2x8_nested, - ck::make_tuple(ck::make_merge_transform(ck::make_tuple(2, 2)), - ck::make_pass_through_transform(2), - ck::make_pass_through_transform(4)), - ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<3>{}), - ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{})); - const auto desc_2x2x2x4_s1x4x2x8_nested_merged_1d = ck::transform_tensor_descriptor( - desc_2x2x2x4_s1x4x2x8_nested, - ck::make_tuple(ck::make_merge_transform(ck::make_tuple(4, 2, 2, 2))), - ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}), - ck::make_tuple(ck::Sequence<0>{})); - const auto desc_2x2x2x4_s1x4x2x8_nested_merged_2d = ck::transform_tensor_descriptor( - desc_2x2x2x4_s1x4x2x8_nested_merged_3d, - ck::make_tuple(ck::make_merge_transform(ck::make_tuple(2, 4)), - ck::make_pass_through_transform(4)), - ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}), - ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{})); - - std::cout << "dims:((2,2),2),4 strides:((1,4),2),8" << std::endl; - Print1d(desc_2x2x2x4_s1x4x2x8_nested_merged_1d); - Print2d(desc_2x2x2x4_s1x4x2x8_nested_merged_2d); - Print3dCustom(desc_2x2x2x4_s1x4x2x8_nested_merged_3d); - - return 0; -} diff --git a/client_example/25_tensor_transforms/CMakeLists.txt b/client_example/25_wrapper/CMakeLists.txt similarity index 55% rename from client_example/25_tensor_transforms/CMakeLists.txt rename to client_example/25_wrapper/CMakeLists.txt index d1543fb0ef..eb3be0e6c8 100644 --- a/client_example/25_tensor_transforms/CMakeLists.txt +++ b/client_example/25_wrapper/CMakeLists.txt @@ -1,4 +1,4 @@ -add_executable(client_tensor_transform tensor_transform.cpp) -target_link_libraries(client_tensor_transform PRIVATE composable_kernel::device_other_operations) add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp) target_link_libraries(client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations) +add_executable(client_wrapper_img2col 
wrapper_img2col.cpp) +target_link_libraries(client_wrapper_img2col PRIVATE composable_kernel::device_other_operations) diff --git a/client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp b/client_example/25_wrapper/tensor_transform_using_wrapper.cpp similarity index 98% rename from client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp rename to client_example/25_wrapper/tensor_transform_using_wrapper.cpp index de9fcde0b4..4b25d85e2d 100644 --- a/client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp +++ b/client_example/25_wrapper/tensor_transform_using_wrapper.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #include diff --git a/client_example/25_wrapper/wrapper_img2col.cpp b/client_example/25_wrapper/wrapper_img2col.cpp new file mode 100644 index 0000000000..35074be4c1 --- /dev/null +++ b/client_example/25_wrapper/wrapper_img2col.cpp @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/utility/common_header.hpp" +#include "ck/wrapper/layout.hpp" +#include "ck/wrapper/tensor.hpp" +#include "ck/wrapper/operations/copy.hpp" + +static constexpr ck::index_t NumDimSpatial = 3; +using DataType = float; +using InputLayout = ck::tensor_layout::convolution::NDHWGC; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +// Test copy from Global to Global through LDS and VGPR +template +__global__ void DeviceImageToColumnPad0(InputTensor input_tensor, + OutputTensor output_tensor, + const BlockShape tile_shape, + const ThreadLayoutShape thread_layout) +{ + const ck::index_t block_idx = static_cast(blockIdx.x); + + // Get local tiles for global memory + auto input_local_tile = ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idx); + auto output_local_tile = ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idx); + + // Get partition per thread + const auto input_local_partition = + ck::wrapper::make_local_partition(input_local_tile, thread_layout, threadIdx.x); + auto output_local_partition = + ck::wrapper::make_local_partition(output_local_tile, thread_layout, threadIdx.x); + + // Perform copy + using DimAccessOrder = ck::Tuple, ck::Number<1>>; + constexpr ck::index_t vector_dim = 1; + constexpr ck::index_t scalar_per_vector = 4; + ck::wrapper::copy(input_local_partition, + output_local_partition); +} + +void PerformImageToColumnPad0(const ck::index_t G, + const ck::index_t N, + const ck::index_t Di, + const ck::index_t Hi, + const ck::index_t Wi, + const ck::index_t Do, + const ck::index_t Ho, + const ck::index_t Wo, + const ck::index_t C, + const ck::index_t Z, + const ck::index_t Y, + const ck::index_t X, + std::array filter_strides, + std::array filter_dilations) +{ + const ck::index_t ZYXC = Z * Y * X * C; + const ck::index_t GC = G * C; + + // shape: (G, (Wo, Ho, Do, N)), (C, X, Y, Z)) + const auto shape = ck::make_tuple(ck::make_tuple(G, ck::make_tuple(Wo, Ho, Do, N)), + 
ck::make_tuple(C, X, Y, Z)); + const auto in_strides = + ck::make_tuple(ck::make_tuple(C, + ck::make_tuple(filter_strides[2] * GC, + filter_strides[1] * Wi * GC, + filter_strides[0] * Hi * Wi * GC, + Di * Hi * Wi * GC)), + ck::make_tuple(1, + filter_dilations[2] * GC, + filter_dilations[1] * Wi * GC, + filter_dilations[0] * Hi * Wi * GC)); + const auto in_layout = ck::wrapper::make_layout(shape, in_strides); + + const auto out_strides = ck::make_tuple( + ck::make_tuple( + ZYXC, + ck::make_tuple(ZYXC * G, Wo * ZYXC * G, Ho * Wo * ZYXC * G, Do * Ho * Wo * ZYXC * G)), + ck::make_tuple(1, C, X * C, Y * X * C)); + const auto out_layout = ck::wrapper::make_layout(shape, out_strides); + + const ck::index_t input_size = N * Di * Hi * Wi * GC; + // Global memory buffers + SimpleDeviceMem in_buf(input_size * sizeof(DataType)); + SimpleDeviceMem out_buf(ck::wrapper::size(out_layout) * sizeof(DataType)); + + // User can choose appropriate number of threads and sizes per block + const auto thread_layout = ck::make_tuple(ck::Number<8>{}, ck::Number<16>{}); + // This example doesn't support padding, user should select tile sizes + // which divides the shape completely + const auto tile_shape = ck::make_tuple(ck::Number<32>{}, ck::Number<64>{}); + + // Create buffers for global memory + auto input_tensor_global = ck::wrapper::make_tensor( + static_cast(in_buf.GetDeviceBuffer()), in_layout); + auto output_tensor_global = ck::wrapper::make_tensor( + static_cast(out_buf.GetDeviceBuffer()), out_layout); + + const ck::index_t grid_size = ck::math::integer_divide_ceil(ck::wrapper::size<0>(in_layout), + ck::wrapper::size<0>(tile_shape)) * + ck::math::integer_divide_ceil(ck::wrapper::size<1>(in_layout), + ck::wrapper::size<1>(tile_shape)); + + const auto kernel = DeviceImageToColumnPad0; + const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true}, + kernel, + dim3(grid_size), + dim3(ck::wrapper::size(thread_layout)), + 0, + input_tensor_global, + output_tensor_global, + tile_shape, + thread_layout); + + std::size_t num_btype = G * N * Do * Ho * Wo * ZYXC * 2 * sizeof(DataType); + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " + << std::endl; +} + +int main(int argc, char* argv[]) +{ + constexpr ck::index_t G = 4; // number of groups + constexpr ck::index_t N = 32; // batch + constexpr ck::index_t C = 64; // input channel (per group) + constexpr ck::index_t Z = 3; // filter D + constexpr ck::index_t Y = 3; // filter H + constexpr ck::index_t X = 3; // filter W + constexpr ck::index_t Di = 9; // input D + constexpr ck::index_t Hi = 9; // input H + constexpr ck::index_t Wi = 7; // input W + constexpr ck::index_t Do = 7; // output D + constexpr ck::index_t Ho = 7; // output H + constexpr ck::index_t Wo = 5; // output W + PerformImageToColumnPad0(G, + N, + Di, + Hi, + Wi, + Do, + Ho, + Wo, + C, + Z, + Y, + X, + {1, 1, 1} /*filter_strides*/, + {1, 1, 1} /*filter_dilations*/); + return 0; +} diff --git a/docs/wrapper.rst b/docs/wrapper.rst index c050f17caf..79b6c75580 100644 --- a/docs/wrapper.rst +++ b/docs/wrapper.rst @@ -18,8 +18,7 @@ Description The CK library provides a lightweight wrapper for more complex operations implemented in -the library. It allows indexing of nested layouts using a simple interface -(avoiding complex descriptor transformations) and memory access (using Tensor). +the library. 
Example: @@ -54,6 +53,11 @@ Output:: 1 5 9 13 17 21 25 29 2 6 10 14 18 22 26 30 + +Advanced examples: + +* `Image to column `_ + ------------------------------------- Layout ------------------------------------- diff --git a/include/ck/utility/is_known_at_compile_time.hpp b/include/ck/utility/is_known_at_compile_time.hpp index 2cafc3e6f2..0916e4604e 100644 --- a/include/ck/utility/is_known_at_compile_time.hpp +++ b/include/ck/utility/is_known_at_compile_time.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -19,6 +19,12 @@ struct is_known_at_compile_time static constexpr bool value = false; }; +template <> +struct is_known_at_compile_time +{ + static constexpr bool value = false; +}; + template <> struct is_known_at_compile_time { diff --git a/include/ck/wrapper/layout.hpp b/include/ck/wrapper/layout.hpp index 1643eb7383..39b5c79c67 100644 --- a/include/ck/wrapper/layout.hpp +++ b/include/ck/wrapper/layout.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -14,22 +14,28 @@ namespace wrapper { * \tparam Shape Tuple of Number<> (for compile-time layout) or index_t * (dynamic layout). It is possible to pass nested shapes * (e.g. ((4, 2), 2)), nested dimensions are merged. - * \tparam UnnestedDescriptorType Tensor descriptor for unnested shape dims. + * \tparam UnrolledDescriptorType Tensor descriptor for unnested shape dims. */ -template +template struct Layout { private: static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; - // Generate default idxs tuple (idx with all merged nested shapes) + /** + * \brief Generate default indices tuple (idx with all merged nested shapes) + * + * \param shape Shape to align. + * \return Multi idx tuple with zeros. + */ template - __host__ __device__ constexpr static auto GenerateDefaultIdxsTuple(const Tuple&) + __host__ __device__ constexpr static auto + GenerateDefaultIdxsTuple([[maybe_unused]] const Tuple& shape) { return generate_tuple( [&](auto) { - if constexpr(!UnnestedDescriptorType::IsKnownAtCompileTime()) + if constexpr(!remove_cvref_t::IsKnownAtCompileTime()) { // runtime layout return index_t(0); @@ -43,11 +49,18 @@ struct Layout Number::Size()>{}); } - // Generate LowerDims in Compile-time for MergeTrasform using passed Type - // If element of Tuple is also tuple, then merge (generate sequence for merge) - // If tuple is element, then pass through (sequence with one element) + /** + * \brief Generate lower dims in compile-time for the Merge transform using + * provided type. If element of nested Tuple is also a tuple, then + * merge (generate sequence for merge). If tuple is element, then pass + * through (sequence with one element). + * + * \param shape Shape to align. + * \return LowerDims for MergeTrasform. 
+ */ template - __host__ __device__ constexpr static auto GenerateLowerDim(const Tuple&) + __host__ __device__ constexpr static auto + GenerateLowerDim([[maybe_unused]] const Tuple& shape) { if constexpr(Idx::value == 0) { @@ -87,11 +100,17 @@ struct Layout } } - // Iterate over nested tuples in shape - // Unroll nested tuples to align Tuple to Tuple - // Example idx: (1, 1), 1, 1 - // Example shape: (2, (2, 2)), 2, (2, 2) - // Unrolled shape: 2, (2, 2), 2, (2, 2) + /** + * \brief Iterate over the nested tuples in the shape. + * Unroll nested tuples to align Tuple to Tuple + * Example idx: (1, 1), 1, 1 + * Example shape: (2, (2, 2)), 2, (2, 2) + * Unrolled shape: 2, (2, 2), 2, (2, 2) + * + * \param shape Layout shape. + * \param idx Idx to align. + * \return Algined shape. + */ template __host__ __device__ constexpr static auto AlignShapeToIdx(const Tuple& shape, const Tuple& idx) @@ -126,6 +145,13 @@ struct Layout } } + /** + * \brief Merge descriptor to 1D. + * + * \param shape Layout shape. + * \param desc Descriptor to merge. + * \return 1D descriptor. + */ template __host__ __device__ constexpr static auto MakeMerge1d(const Tuple& shape, const DescriptorToMerge& desc) @@ -137,18 +163,41 @@ struct Layout const auto lower_dims = make_tuple(MergeElemsSequence::Reverse()); const auto upper_dims = make_tuple(Sequence<0>{}); // Merge to 1d - return transform_tensor_descriptor( - desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims); + if constexpr(!remove_cvref_t::IsKnownAtCompileTime()) + { + return transform_tensor_descriptor( + desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims); + } + else + { + // If the descriptor is known at the compilation time, + // use `make_merge_transform_v1_carry_check` because it doesn't use + // memcpy. + return transform_tensor_descriptor( + desc, + make_tuple(make_merge_transform_v1_carry_check(merge_elems)), + lower_dims, + upper_dims); + } } - // Merge nested shape dims when corresponding index is also nested. - // Input desc shape: 2, 2, 2, 2, 2, 2 - // Example idx: 1, 1, 1, 1 - // Example shape: 2, (2, 2), 2, (2, 2) - // Merged shape: 2, 4, 2, 4 + /** + * \brief Merge nested shape dims when corresponding index is also merged. + * Input desc shape: 2, 2, 2, 2, 2, 2 + * Example idx: 1, 1, 1, (1, 1) + * Example shape: 2, (2, 2), 2, (2, 2) + * Merged shape: 2, 4, 2, 2, 2 + * + * \param shape Layout shape. + * \param idxs Indexes to align descriptor. + * \param desc Descriptor to merge. + * \return Aligned descriptor to idx. + */ template - __host__ __device__ constexpr static auto CreateMergedDescriptor( - const Tuple& shape, const Tuple&, DescriptorToMerge& desc) + __host__ __device__ constexpr static auto + CreateMergedDescriptor(const Tuple& shape, + [[maybe_unused]] const Tuple& idxs, + DescriptorToMerge& desc) { const auto transforms = generate_tuple( [&](auto i) { @@ -160,7 +209,17 @@ struct Layout // If shape element is tuple and idx element is Number, then merge // Unroll and reverse tuple to traverse column-major const auto merge_elems = TupleReverse(UnrollNestedTuple(shape.At(i))); - return make_merge_transform(merge_elems); + if constexpr(!remove_cvref_t::IsKnownAtCompileTime()) + { + return make_merge_transform(merge_elems); + } + else + { + // If the descriptor is known at the compilation time, + // use `make_merge_transform_v1_carry_check` because + // it doesn't use memcpy. 
+ return make_merge_transform_v1_carry_check(merge_elems); + } } else { @@ -185,14 +244,23 @@ struct Layout } using Descriptor1dType = - remove_cvref_t; + remove_cvref_t; using DefaultIdxsTupleType = remove_cvref_t; + public: + /** + * \brief Transform descriptor to align to passed indexes. + * + * \param shape Layout shape. + * \param idxs Indexes to align descriptor. + * \param naive_descriptor Descriptor to merge. + * \return Aligned descriptor to idx. + */ template __host__ __device__ constexpr static auto TransformDesc(const Tuple& shape, - const Tuple& idx, - const UnnestedDescriptorType& naive_descriptor) + const Tuple& idxs, + const UnrolledDescriptorType& naive_descriptor) { if constexpr(Tuple::Size() == I1) { @@ -208,19 +276,18 @@ struct Layout static_assert(Tuple::Size() == Tuple::Size(), "Idx rank and Shape rank must be the same (except 1d)."); // Unroll while IdxDims is nested - const auto aligned_shape = AlignShapeToIdx(shape, idx); + const auto aligned_shape = AlignShapeToIdx(shape, idxs); // Transform correct form of shape - return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idx), naive_descriptor); + return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idxs), naive_descriptor); } } using MergedNestsDescriptorType = remove_cvref_t; + Shape{}, DefaultIdxsTupleType{}, UnrolledDescriptorType{}))>; - public: __host__ __device__ constexpr auto GetElementSpaceSize() const { - return unnested_descriptor_.GetElementSpaceSize(); + return unrolled_descriptor_.GetElementSpaceSize(); } __host__ __device__ Layout() = delete; @@ -232,16 +299,15 @@ struct Layout * \param unnested_descriptor Descriptor */ __host__ __device__ constexpr Layout(const Shape& shape, - const UnnestedDescriptorType& unnested_descriptor) - : shape_(shape) + const UnrolledDescriptorType& unnested_descriptor) + : unrolled_descriptor_(unnested_descriptor), shape_(shape) { // Construct if runtime mode - if constexpr(!UnnestedDescriptorType::IsKnownAtCompileTime()) + if constexpr(!remove_cvref_t::IsKnownAtCompileTime()) { - unnested_descriptor_ = unnested_descriptor; - descriptor_1d_ = MakeMerge1d(shape_, unnested_descriptor_); + descriptor_1d_ = MakeMerge1d(shape_, unrolled_descriptor_); merged_nests_descriptor_ = - TransformDesc(shape_, DefaultIdxsTupleType{}, unnested_descriptor_); + TransformDesc(shape_, DefaultIdxsTupleType{}, unrolled_descriptor_); } } @@ -254,9 +320,9 @@ struct Layout template __host__ __device__ constexpr index_t operator()() const { - static_assert(UnnestedDescriptorType::IsKnownAtCompileTime(), + static_assert(remove_cvref_t::IsKnownAtCompileTime(), "Compiletime operator used on runtime layout."); - using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, UnnestedDescriptorType{})); + using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, UnrolledDescriptorType{})); using UnrolledIdx = decltype(UnrollNestedTuple(Idxs{})); return TransformedDesc{}.CalculateOffset(UnrolledIdx{}); } @@ -283,7 +349,7 @@ struct Layout else { // Custom index, need to transform descriptor - const auto transformed_desc = TransformDesc(shape_, Idx, unnested_descriptor_); + const auto transformed_desc = TransformDesc(shape_, Idx, unrolled_descriptor_); return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx)); } } @@ -350,29 +416,55 @@ struct Layout } /** - * \brief Get default descriptor (with the same size as Shape) + * \brief Get descriptor with all nested dimensions merged. + * Example, shape: ((2, 2), 2) + * Descriptor lengths: (4, 2) * - * \return Default descriptor. 
+ * \note The size of merged descriptor is the same as Layout's shape. + * + * \return Merged nests descriptor. */ - __host__ __device__ constexpr const MergedNestsDescriptorType& GetDefaultDescriptor() const + __host__ __device__ constexpr const MergedNestsDescriptorType& + GetMergedNestingDescriptor() const { return merged_nests_descriptor_; } /** - * \brief Get unnested descriptor (with unrolled dims) + * \brief Get descriptor with all dimensions are merged (1D). + * Example, shape: ((2, 2), 2) + * Descriptor lengths: (8) * - * \return Flatten descriptor. + * \return 1D descriptor. */ - __host__ __device__ constexpr const UnnestedDescriptorType& GetUnnestedDescriptor() const + __host__ __device__ constexpr const Descriptor1dType& Get1DDescriptor() const { - return unnested_descriptor_; + return descriptor_1d_; + } + + /** + * \brief Get unnested descriptor (with unrolled dims) + * Example, shape: ((2, 2), 2) + * Descriptor lengths: (2, 2, 2) + * + * \return Flattened descriptor. + */ + __host__ __device__ constexpr const UnrolledDescriptorType& GetUnrolledDescriptor() const + { + return unrolled_descriptor_; } private: - UnnestedDescriptorType unnested_descriptor_; + // All dimensions are unrolled + UnrolledDescriptorType unrolled_descriptor_; + // 1D descriptor Descriptor1dType descriptor_1d_; + // All nesting are merged MergedNestsDescriptorType merged_nests_descriptor_; + // Example, shape: ((2, 2), 2) + // UnrolledDescriptorType lengths: (2, 2, 2) + // Descriptor1dType lengths: (8) + // MergedNestsDescriptorType lengths: (4, 2) const Shape shape_; }; diff --git a/include/ck/wrapper/operations/copy.hpp b/include/ck/wrapper/operations/copy.hpp index aec80f9ca7..7b00fe5500 100644 --- a/include/ck/wrapper/operations/copy.hpp +++ b/include/ck/wrapper/operations/copy.hpp @@ -1,16 +1,21 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once #include "../utils/tensor_utils.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + namespace ck { namespace wrapper { /** - * \brief Perform generic copy between two tensors. Tensors must have the - * same size. + * \brief Perform generic copy between two tensors partitions (threadwise copy). + * Tensors must have the same size. * * \param src_tensor Source tensor. * \param dst_tensor Destination tensor. @@ -37,5 +42,134 @@ __host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& ds } } +/** + * \brief Perform optimized copy between two tensors partitions (threadwise copy). + * Tensors must have the same size. + * + * \tparam DimAccessOrderTuple Tuple with dimension access order. + * \tparam VectorDim Dimension for vectorized read and write. + * \tparam ScalarPerVector Number of scalar per vectorized read and write. + * \param src_tensor Source tensor. + * \param dst_tensor Destination tensor. 
+ */ +template +__device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor) +{ + static_assert(is_detected::value); + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + const auto& in_grid_desc = layout(src_tensor).GetUnrolledDescriptor(); + const auto& out_grid_desc = layout(dst_tensor).GetUnrolledDescriptor(); + + using SrcShapeType = remove_cvref_t; + constexpr index_t num_dims = SrcShapeType::Size(); + + constexpr auto thread_slice_lengths = + generate_sequence_v2([](auto I) { return size(SrcShapeType{}.At(I)); }, Number{}); + constexpr auto dim_access_order = generate_sequence_v2( + [](auto I) { return DimAccessOrderTuple{}.At(I); }, Number{}); + + if constexpr(SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer) + { + // Perform a copy between DynamicBuffers + auto transfer = ThreadwiseTensorSliceTransfer_v7< + Tuple, + Tuple, + decltype(tie(in_grid_desc)), + decltype(tie(out_grid_desc)), + tensor_operation::element_wise::PassThrough, + Sequence(InMemoryDataOperationEnum::Set)>, + decltype(thread_slice_lengths), + decltype(dim_access_order), + VectorDim, + ScalarPerVector, + Sequence, + Sequence>{in_grid_desc, + make_tuple(src_tensor.GetMultiIdxOffsets()), + out_grid_desc, + make_tuple(dst_tensor.GetMultiIdxOffsets()), + tensor_operation::element_wise::PassThrough{}}; + + transfer.Run(tie(in_grid_desc), + tie(src_tensor.GetBuffer()), + tie(out_grid_desc), + tie(dst_tensor.GetBuffer())); + } + else if constexpr(!SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer) + { + // Perform copy from StaticBuffer to DynamicBuffer + const auto src_slice_origin_idxs = + generate_tuple([&](auto) { return I0; }, Number{}); + + auto transfer = + ThreadwiseTensorSliceTransfer_v1r3, + remove_cvref_t, + tensor_operation::element_wise::PassThrough, + decltype(thread_slice_lengths), + decltype(dim_access_order), + VectorDim, + ScalarPerVector, + InMemoryDataOperationEnum::Set, + I1, + true>{out_grid_desc, + dst_tensor.GetMultiIdxOffsets(), + tensor_operation::element_wise::PassThrough{}}; + + transfer.Run(in_grid_desc, + src_slice_origin_idxs, + src_tensor.GetBuffer(), + out_grid_desc, + dst_tensor.GetBuffer()); + } + else if constexpr(SrcTensorType::IsDynamicBuffer && !DstTensorType::IsDynamicBuffer) + { + // Perform copy from DynamicBuffer to StaticBuffer + const auto src_dst_slice_origin = + generate_tuple([&](auto) { return I0; }, Number{}); + constexpr auto src_vector_tensor_lengths = generate_sequence_v2( + [&](auto I) { + if constexpr(I == VectorDim) + { + return Number{}; + } + else + { + return I1; + } + }, + Number{}); + + auto transfer = + ThreadwiseTensorSliceTransfer_v4r1, + remove_cvref_t, + decltype(thread_slice_lengths), + decltype(dim_access_order), + decltype(src_vector_tensor_lengths), + decltype(dim_access_order)>{ + src_tensor.GetMultiIdxOffsets()}; + + transfer.Run(in_grid_desc, + src_dst_slice_origin, + src_tensor.GetBuffer(), + out_grid_desc, + src_dst_slice_origin, + dst_tensor.GetBuffer()); + } + else + { + // Perform copy between StaticBuffers + copy(src_tensor, dst_tensor); + } +} + } // namespace wrapper } // namespace ck diff --git a/include/ck/wrapper/tensor.hpp b/include/ck/wrapper/tensor.hpp index a363641373..57d79c5940 100644 --- a/include/ck/wrapper/tensor.hpp +++ b/include/ck/wrapper/tensor.hpp @@ -10,189 +10,205 @@ namespace ck { namespace wrapper { +namespace detail { +namespace { +/** + * \brief Check if Tuple contains Slice object + * + * \return True if tuple contains Slice object. 
+ */ +template +__host__ __device__ constexpr bool HasSlice(T&&) +{ + return is_detected::value; +} +template +__host__ __device__ constexpr bool HasSlice(Tuple&&) +{ + return (HasSlice(Ts{}) || ...); +} + +/** + * \brief Calculate new shape after slice from parent shape. + * + * \param idxs Tuple of indexes defining slice ranges. + * \param shape Shape which will be sliced. + * \return New tensor shape. + */ +template +__host__ __device__ constexpr auto GetSlicedShape(const Tuple& idxs, + const SlicedShape& shape) +{ + // Pack each value in tuple to remove empty tuples after generation + auto new_shape = generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + if constexpr(!detail::HasSlice(tuple_element_t>{})) + { + // if tuple does not have any slice then we can remove dimension + return Tuple<>{}; + } + else + { + // if tuple then recurrence + return make_tuple(GetSlicedShape(idxs.At(num_i), shape.At(num_i))); + } + } + else if constexpr(is_detected>>::value) + { + // calculate new dimension + const auto& dim = size(shape.At(num_i)); + const auto val = idxs.At(num_i).range(dim); + return make_tuple(val); + } + else + { + // remove dimension for just value + return Tuple<>{}; + } + }, + Number::Size()>{}); + // Remove empty tuples (deleted elements) and return + return UnrollNestedTuple<0, 1>(new_shape); +} + +/** + * \brief Generate Freeze for each of nested shape. + * + * \param idx Tuple of start indices for slice. + * \param shape Shape which will be freezed. + * \return Generated freeze transforms. + */ +template +__host__ __device__ constexpr auto GenerateMultipleFreeze(T idx, const Shape& shape) +{ + const auto unrolled_shape = UnrollNestedTuple(shape); + return generate_tuple( + [&](auto i) { + // dimension offset from idx + const auto dim = unrolled_shape.At(Number{}); + const auto dim_idx = idx % dim; + idx /= dim; + return make_freeze_transform(dim_idx); + }, + Number{}); +} + +/** + * \brief Generate transforms for slice tensor. + * + * \param idx Tuple of start indices for slice. + * \param shape Shape which will be sliced. + * \return Generated transforms. 
+ */ +template +__host__ __device__ constexpr auto GenerateSliceTransforms(const Tuple& idx, + const Shape& shape) +{ + // Pack each value in tuple to remove empty tuples after generation + auto transforms = generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + return GenerateSliceTransforms(idx.At(num_i), shape.At(num_i)); + } + else if constexpr(is_detected>>::value) + { + + const auto from = idx.At(num_i).from_; + const auto dim = size(shape); + const auto range = idx.At(num_i).range(dim); + return make_slice_transform(range, from, from + range); + } + else + { + // remove dimension for just value + return GenerateMultipleFreeze(idx.At(num_i), shape.At(num_i)); + } + }, + Number::Size()>{}); + // Remove empty tuples (deleted elements) and return + return UnrollNestedTuple(transforms); +} + +template +__host__ __device__ constexpr auto GetSequenceVal(const ck::Freeze&) +{ + // There is no output for Freeze transform + return Sequence<>{}; +} + +template +__host__ __device__ constexpr auto GetSequenceVal(const ck::Slice&) +{ + return Sequence{}; +} + +template +__host__ __device__ constexpr auto GenerateUpperDims(const Tuple<>&) +{ + return Tuple<>{}; +} + +template +__host__ __device__ constexpr auto GenerateUpperDims(const Tuple& transforms) +{ + constexpr auto num_transforms = Tuple::Size(); + // Deduce Sequence element for specific transform + const auto current_elem = GetSequenceVal(transforms.At(Number<0>{})); + if constexpr(is_same_v>) + { + const auto next_tuple = GenerateUpperDims(TupleSlice<1, num_transforms>(transforms)); + return concat_tuple(make_tuple(current_elem), next_tuple); + } + else + { + // Increase i if current_elem is Slice transform + const auto next_tuple = GenerateUpperDims(TupleSlice<1, num_transforms>(transforms)); + return concat_tuple(make_tuple(current_elem), next_tuple); + } +} + +template +__host__ __device__ constexpr auto GenerateSlicedDescriptor(const Tuple& idx, + const Shape& shape, + const FlattenDescriptor& flatten_desc) +{ + constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size(); + + const auto transforms = GenerateSliceTransforms(idx, shape); + using TransformsTupleType = decltype(transforms); + + const auto lower_dims = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + const auto upper_dims = decltype(GenerateUpperDims<0>(TransformsTupleType{})){}; + return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); +} +} // namespace +} // namespace detail + /** * \brief Tensor wrapper that performs static and dynamic buffer logic. + * The tensor is based on a descriptor stored in the Layout. Additionally, + * tensor can be sliced or shifted using multi-index offset. * * \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR). * \tparam ElementType Element data type. * \tparam Shape Tensor shape (layout component). - * \tparam UnnestedDescriptorType Unnested descriptor (layout component). - * \tparam NumVectors Number of vectors (only for VGPR, SGPR). - * \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR). + * \tparam UnrolledDescriptorType Flatten descriptor (layout component). 
*/ template + typename UnrolledDescriptorType> struct Tensor { - private: - // Check if Tuple contains Slice object - template - __host__ __device__ constexpr static bool IsSlicing(T&&) - { - return is_detected::value; - } - template - __host__ __device__ constexpr static bool IsSlicing(Tuple&&) - { - return (IsSlicing(Ts{}) || ...); - } - - // Calculate new tensor shape after slice - template - __host__ __device__ constexpr auto GetShapeFromSlicedTensor(const Tuple& idx, - const ShapeTmpType& shape) const - { - // Pack each value in tuple to remove empty tuples after generation - auto new_shape = generate_tuple( - [&](auto i) { - constexpr auto num_i = Number{}; - if constexpr(is_detected>>::value) - { - if constexpr(!IsSlicing(tuple_element_t>{})) - { - // if tuple does not have any slice then we can remove dimension - return Tuple<>{}; - } - else - { - // if tuple then recurrence - return make_tuple(GetShapeFromSlicedTensor(idx.At(num_i), shape.At(num_i))); - } - } - else if constexpr(is_detected>>::value) - { - // calculate new dimension - const auto& dim = size(shape.At(num_i)); - const auto val = idx.At(num_i).range(dim); - return make_tuple(val); - } - else - { - // remove dimension for just value - return Tuple<>{}; - } - }, - Number::Size()>{}); - // Remove empty tuples (deleted elements) and return - return UnrollNestedTuple<0, 1>(new_shape); - } - - // Generate Freeze for each of nested shape - template - __host__ __device__ constexpr auto GenerateMultipleFreeze(T idx, - const ShapeTmpType& shape) const - { - const auto unrolled_shape = UnrollNestedTuple(shape); - return generate_tuple( - [&](auto i) { - // dimension offset from idx - const auto dim = unrolled_shape.At(Number{}); - const auto dim_idx = idx % dim; - idx /= dim; - return make_freeze_transform(dim_idx); - }, - Number{}); - } - - template - __host__ __device__ constexpr auto - GetTransformsFromSlicedTensor(const Tuple& idx, const ShapeTmpType& shape) const - { - // Pack each value in tuple to remove empty tuples after generation - auto transforms = generate_tuple( - [&](auto i) { - constexpr auto num_i = Number{}; - if constexpr(is_detected>>::value) - { - return GetTransformsFromSlicedTensor(idx.At(num_i), shape.At(num_i)); - } - else if constexpr(is_detected>>::value) - { - - const auto from = idx.At(num_i).from_; - const auto dim = shape.At(num_i); - const auto range = idx.At(num_i).range(dim); - return make_slice_transform(range, from, from + range); - } - else - { - // remove dimension for just value - return GenerateMultipleFreeze(idx.At(num_i), shape.At(num_i)); - } - }, - Number::Size()>{}); - // Remove empty tuples (deleted elements) and return - return UnrollNestedTuple(transforms); - } - - // There is no output for Freeze transform - template - __host__ __device__ constexpr auto GetSequenceVal(const ck::Freeze&) const - { - return Sequence<>{}; - } - - template - __host__ __device__ constexpr auto - GetSequenceVal(const ck::Slice&) const - { - return Sequence{}; - } - - template - __host__ __device__ constexpr auto GenerateUpperDims(const Tuple<>&) const - { - return Tuple<>{}; - } - - template - __host__ __device__ constexpr auto - GenerateUpperDims(const Tuple& transforms) const - { - constexpr auto num_transforms = Tuple::Size(); - // Deduce Sequence element for specific transform - const auto currect_elem = GetSequenceVal(transforms.At(Number<0>{})); - if constexpr(is_same_v>) - { - const auto next_tuple = GenerateUpperDims(TupleSlice<1, num_transforms>(transforms)); - return 
concat_tuple(make_tuple(currect_elem), next_tuple); - } - else - { - // Increase i if current_elem is Slice transform - const auto next_tuple = - GenerateUpperDims(TupleSlice<1, num_transforms>(transforms)); - return concat_tuple(make_tuple(currect_elem), next_tuple); - } - } - - template - __host__ __device__ constexpr auto - GetDescriptorFromSlicedTensor(const Tuple& idx, - const ShapeTmpType& shape, - const FlattenDescriptor& flatten_desc) const - { - constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size(); - - const auto transforms = GetTransformsFromSlicedTensor(idx, shape); - using TransformsTupleType = decltype(transforms); - - const auto lower_dims = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); - const auto upper_dims = decltype(GenerateUpperDims<0>(TransformsTupleType{})){}; - return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); - } - public: - using ElementSpaceSize = decltype(Layout{ - Shape{}, UnnestedDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer + using ElementSpaceSize = decltype(Layout{ + Shape{}, UnrolledDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer using TensorElementType = ElementType; // DataType static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace; @@ -200,134 +216,207 @@ struct Tensor BufferAddressSpace == MemoryTypeEnum ::Vgpr); __host__ __device__ Tensor() = delete; - __host__ __device__ Tensor(ElementType* pointer, - const Layout& layout) + __host__ __device__ constexpr Tensor(ElementType* pointer, + const Layout& layout) : layout_(layout), - buffer_(make_dynamic_buffer(pointer, layout.GetElementSpaceSize())) + buffer_(make_dynamic_buffer(pointer, layout.GetElementSpaceSize())), + multi_idx_offset_(make_zero_multi_index()), + base_offset_(0) { + static_assert(IsDynamicBuffer, "Wrong BufferAddressSpace for register."); } - __host__ __device__ Tensor(const Layout& layout) - : layout_(layout) + __host__ __device__ constexpr Tensor(const Layout& layout) + : layout_(layout), + multi_idx_offset_(make_zero_multi_index()), + base_offset_(0) { static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register."); } - __host__ __device__ constexpr const Layout& GetLayout() const + __host__ __device__ constexpr const Layout& GetLayout() const { return layout_; } - // Getter for new sliced tensor - template {}), bool> = false> - __host__ __device__ auto operator[](const Tuple& idx) const + /** + * \brief Get the new sliced tensor. + * + * \param idx Tuple of indices: slice(from,to) or scalar. + * \return Sliced tensor. 
+ */ + template {}), bool> = false> + __host__ __device__ auto operator[](const Tuple& idx) { static_assert(IsDynamicBuffer, "Register slice is not supported"); const auto& shape = layout_.GetShape(); - auto new_shape = GetShapeFromSlicedTensor(idx, shape); + auto new_shape = detail::GetSlicedShape(idx, shape); - const auto& flatten_desc = layout_.GetUnnestedDescriptor(); - auto new_desc = GetDescriptorFromSlicedTensor(idx, shape, flatten_desc); + const auto& flatten_desc = layout_.GetUnrolledDescriptor(); + auto new_desc = detail::GenerateSlicedDescriptor(idx, shape, flatten_desc); const auto new_layout = Layout(new_shape, new_desc); + // Update embed offset + base_offset_ -= new_layout(make_tuple(Number<0>{})); return make_tensor(buffer_.p_data_, new_layout); } - template {}), bool> = false> - __host__ __device__ auto operator()(const Tuple& idx) const + template {}), bool> = false> + __host__ __device__ auto operator()(const Tuple& idx) { return this->operator[](idx); } - template {}), bool> = false> - __host__ __device__ auto operator()(Idxs... idxs) const + template {}), bool> = false> + __host__ __device__ auto operator()(Idxs... idxs) { return this->operator[](make_tuple(idxs...)); } - // Getter for the const value - template {}), bool> = false> + /** + * \brief Getter of the tensor's const value reference. + * + * \param idx Tuple of indices. + * \return Requested value. + */ + template {}), bool> = false> __host__ __device__ const ElementType& operator[](const Tuple& idx) const { if constexpr(IsDynamicBuffer) { - const index_t offset = layout_(idx); + const index_t offset = layout_(idx) + base_offset_; return buffer_[offset]; } else { - constexpr index_t offset = Layout{ + constexpr index_t index_offset = Layout{ Shape{}, - UnnestedDescriptorType{}}.template operator()>(); - return buffer_[Number{}]; + UnrolledDescriptorType{}}.template operator()>(); + // Calculate and apply base offset in compile-time + constexpr index_t base_offset = Layout{ + Shape{}, + UnrolledDescriptorType{}}.template operator()>(); + return buffer_[Number{}]; } } - template {}), bool> = false> + template {}), bool> = false> __host__ __device__ const ElementType& operator()(const Tuple& idx) const { return this->operator[](idx); } - template {}), bool> = false> + template {}), bool> = false> __host__ __device__ const ElementType& operator()(Idxs... idxs) const { return this->operator[](make_tuple(idxs...)); } - // Getter for the value reference - template {}), bool> = false> + /** + * \brief Getter of tensor value reference. + * + * \param idx Tuple of indices. + * \return Requested value. 
+ */ + template {}), bool> = false> __host__ __device__ ElementType& operator[](const Tuple& idx) { if constexpr(IsDynamicBuffer) { - const index_t offset = layout_(idx); + const index_t offset = layout_(idx) + base_offset_; return buffer_(offset); } else { - constexpr index_t offset = Layout{ + constexpr index_t index_offset = Layout{ Shape{}, - UnnestedDescriptorType{}}.template operator()>(); - return buffer_(Number{}); + UnrolledDescriptorType{}}.template operator()>(); + // Apply embed offset (calculate in compiletime) + constexpr index_t base_offset = Layout{ + Shape{}, + UnrolledDescriptorType{}}.template operator()>(); + return buffer_(Number{}); } } - template {}), bool> = false> + template {}), bool> = false> __host__ __device__ ElementType& operator()(const Tuple& idx) { return this->operator[](idx); } - template {}), bool> = false> + template {}), bool> = false> __host__ __device__ ElementType& operator()(Idxs... idxs) { return this->operator[](make_tuple(idxs...)); } - __host__ __device__ constexpr auto GetDefaultDescriptor() + /** + * \brief Get descriptor with all nested dimensions merged. + * + * \return Merged nests descriptor. + */ + __host__ __device__ constexpr auto GetMergedNestingDescriptor() { - return layout_.GetDefaultDescriptor(); + return layout_.GetMergedNestingDescriptor(); } + /** + * \brief Get pointer to the data. + * + * \return Pointer. + */ __host__ __device__ ElementType* GetPointer() const { return buffer_.p_data_; } + __host__ __device__ constexpr auto& GetBuffer() { return buffer_; } + __host__ __device__ constexpr auto& GetBuffer() const { return buffer_; } + + /** + * \brief Get multi index offset to the data. + * + * \return Multi index offset. + */ + __host__ __device__ constexpr auto& GetMultiIdxOffsets() const { return multi_idx_offset_; } + + /** + * \brief Apply multi index offset on the tensor. + * + * \param multi_idx_offset Multi index offset. + */ + template + __host__ __device__ constexpr void SetMultiIdxOffset(const MultiIdxOffsets multi_idx_offset) + { + multi_idx_offset_ = multi_idx_offset; + base_offset_ += layout_(multi_idx_offset); + } + private: using DynamicBufferType = DynamicBuffer; - using StaticBufferType = - StaticBufferTupleOfVector; + using StaticBufferType = StaticBuffer; // If register use static buffer, else use dynamic buffer using Buffer = std::conditional_t; - const Layout layout_; + const Layout layout_; Buffer buffer_; + // We use multi_idx_offset_ to enable the creation of a descriptor in + // compile time for partitions or tiles if tile shape and thread layout + // is known at compile time (We can use the same descriptor for each + // thread). Additionally, the copy between the static and dynamic buffer + // requires a descriptor known at compile time, so we can shift data using + // such multi_idx_offset_. + MultiIndex multi_idx_offset_; + // Base offset and multi index offset are corresponding to exactly the + // same element in tensor ( and in physical memory ). Multi index offset + // is multi dimensional index. However base offset is calculated using + // tensor descriptor (thus all it's transforms) and is linear (1D). + // We store base_offset_ to avoid multiple recalculations. 
+ index_t base_offset_; }; } // namespace wrapper diff --git a/include/ck/wrapper/utils/layout_utils.hpp b/include/ck/wrapper/utils/layout_utils.hpp index f4ba0a969f..d04bd5078b 100644 --- a/include/ck/wrapper/utils/layout_utils.hpp +++ b/include/ck/wrapper/utils/layout_utils.hpp @@ -22,14 +22,19 @@ namespace wrapper { // Disable from doxygen docs generation /// @cond // forward declaration -template +template struct Layout; template using is_tuple = decltype(std::declval().IsTuple()); namespace { -// Generate packed (column-major) strides if not passed +/** + * \brief Generate packed (column-major) strides if not passed + * + * \param shape Tensor shape. + * \return Generated column-major strides. + */ template __host__ __device__ constexpr static auto GenerateColumnMajorPackedStrides(const Tuple& shape) @@ -50,9 +55,16 @@ GenerateColumnMajorPackedStrides(const Tuple& shape) Number{}); } +/** + * \brief Create naive tensor descriptor from nested shape. + * + * \param shape Tensor shape. + * \param strides Tensor strides. + * \return Unrolled descriptor + */ template -__host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shape, - const LayoutStrides& strides) +__host__ __device__ constexpr auto MakeUnrolledDescriptor(const LayoutShape& shape, + const LayoutStrides& strides) { const auto unrolled_shape = UnrollNestedTuple(shape); if constexpr(is_same_v>) @@ -86,8 +98,8 @@ __host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shap template __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides& strides) { - using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Strides{})); - return Layout(shape, MakeFlattenDescriptor(shape, strides)); + using UnrolledDescriptorType = decltype(MakeUnrolledDescriptor(Shape{}, Strides{})); + return Layout(shape, MakeUnrolledDescriptor(shape, strides)); } /** @@ -100,15 +112,19 @@ __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides template __host__ __device__ constexpr auto make_layout(const Shape& shape) { - using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Tuple<>{})); - return Layout(shape, MakeFlattenDescriptor(shape, Tuple<>{})); + using UnrolledDescriptorType = decltype(MakeUnrolledDescriptor(Shape{}, Tuple<>{})); + return Layout(shape, MakeUnrolledDescriptor(shape, Tuple<>{})); } // Layout helpers // get -// Get dim (could be returned from get with empty Idxs) + /** * \private + * \brief Get dim. + * + * \param dim Dimension. + * \return Returned the same dimension. */ template __host__ __device__ T constexpr get(const T& dim) @@ -178,7 +194,7 @@ __host__ __device__ constexpr auto get(const Layout& layout) }, Number{}); - const auto& flatten_desc = layout.GetUnnestedDescriptor(); + const auto& flatten_desc = layout.GetUnrolledDescriptor(); auto new_desc = transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); return Layout(new_shape, new_desc); } @@ -197,9 +213,12 @@ __host__ __device__ constexpr auto get(const T& elem) } // size -// Get dim size (could be returned from get function) /** * \private + * \brief Get size. + * + * \param dim Size. + * \return Returned the same size. */ template __host__ __device__ T constexpr size(const T& dim) @@ -214,8 +233,8 @@ __host__ __device__ T constexpr size(const T& dim) * \param layout Layout to get Shape of. * \return Requsted length. 
*/ -template -__host__ __device__ constexpr auto size(const Layout& layout) +template +__host__ __device__ constexpr auto size(const Layout& layout) { return layout.template GetLength(); } @@ -240,8 +259,8 @@ __host__ __device__ constexpr auto size(const Tuple& shape) * \param layout Layout to calculate shape size. * \return Requsted size. */ -template -__host__ __device__ constexpr auto size(const Layout& layout) +template +__host__ __device__ constexpr auto size(const Layout& layout) { return layout.GetLengths(); } @@ -280,9 +299,9 @@ __host__ __device__ constexpr auto size(const T& elem) * \param layout Layout to calculate rank. * \return Requsted rank. */ -template +template __host__ __device__ constexpr auto -rank([[maybe_unused]] const Layout& layout) +rank([[maybe_unused]] const Layout& layout) { return Shape::Size(); } @@ -302,17 +321,25 @@ __host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple& t /** * \private + * \brief Rank for scalar + * + * \param dim Dimension scalar. + * \return Returned 1. */ template -__host__ __device__ constexpr index_t rank(const Number&) +__host__ __device__ constexpr index_t rank([[maybe_unused]] const Number& dim) { return 1; } /** * \private + * \brief Rank for scalar + * + * \param dim Dimension scalar. + * \return Returned 1. */ -__host__ __device__ constexpr index_t rank(const index_t&) { return 1; } +__host__ __device__ constexpr index_t rank([[maybe_unused]] const index_t& dim) { return 1; } /** * \brief Hierarchical rank. @@ -334,8 +361,8 @@ __host__ __device__ constexpr auto rank(const T& elem) * \param layout Layout to calculate depth. * \return Requsted depth. */ -template -__host__ __device__ constexpr auto depth(const Layout& layout) +template +__host__ __device__ constexpr auto depth(const Layout& layout) { const auto& shape = layout.GetShape(); return TupleDepth(shape); @@ -355,17 +382,25 @@ __host__ __device__ constexpr auto depth(const Tuple& tuple) /** * \private + * \brief Depth for scalar + * + * \param dim Scalar. + * \return Returned 0. */ template -__host__ __device__ constexpr index_t depth(const Number&) +__host__ __device__ constexpr index_t depth([[maybe_unused]] const Number& dim) { return 0; } /** * \private + * \brief Depth for scalar + * + * \param dim Scalar. + * \return Returned 0. */ -__host__ __device__ constexpr index_t depth(const index_t&) { return 0; } +__host__ __device__ constexpr index_t depth([[maybe_unused]] const index_t& dim) { return 0; } /** * \brief Hierarchical depth. diff --git a/include/ck/wrapper/utils/tensor_partition.hpp b/include/ck/wrapper/utils/tensor_partition.hpp index a0634f6b38..6aae5a92fe 100644 --- a/include/ck/wrapper/utils/tensor_partition.hpp +++ b/include/ck/wrapper/utils/tensor_partition.hpp @@ -6,12 +6,22 @@ #include "tensor_utils.hpp" #include "layout_utils.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_description/cluster_descriptor.hpp" + namespace ck { namespace wrapper { namespace { -// Calculate shape for partition based on number of threads per each dim and -// previous shape + +/** + * \brief Calculate shape for partition based on number of threads per each dim and + * previous shape + * + * \param shape Base tensor shape. + * \param thread_lengths Tuple of thread lengths. + * \return Partition shape. 
+ */ template __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple& shape, const Tuple& thread_lengths) @@ -20,265 +30,165 @@ __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple{}; - if constexpr(is_detected>>::value) - { - // if tuple then recurrence - return CalculateLocalPartitionShape(shape.At(num_i), thread_lengths.At(num_i)); - } - else - { - const auto slice_len = shape.At(num_i) / thread_lengths.At(num_i); - return slice_len; - } - }, - Number::Size()>{}); -} - -// Calculate shape for partition based on number of threads per each dim, -// previous strides and steps -template -__host__ __device__ constexpr auto -CalculateLocalPartitionDescriptor(const Tuple& shape, - const Tuple& thread_lengths, - const Tuple& steps, - const FlattenDescType& flatten_desc) -{ - - static_assert(Tuple::Size() == Tuple::Size(), "Wrong thread_lengths shape."); - const auto unrolled_thread_lengths = UnrollNestedTuple(thread_lengths); - const auto unrolled_shape = UnrollNestedTuple(shape); - constexpr auto dims = decltype(unrolled_thread_lengths)::Size(); - - using UnrolledStepsType = decltype(UnrollNestedTuple(steps)); - - using I1 = Number<1>; - - const auto transforms = generate_tuple( - [&](auto i) { - constexpr auto num_i = Number{}; - if constexpr(is_same_v, Tuple<>>) - { - // By default raked partition - const auto partition_stride = unrolled_thread_lengths.At(num_i); - return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), - make_tuple(partition_stride)); - } - else if constexpr(!is_same_v, index_t>) - { - // Compiletime partition - if constexpr(is_same_v, I1>) - { - // raked - const auto partition_stride = unrolled_thread_lengths.At(num_i); - return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), - make_tuple(partition_stride)); - } - else - { - // packed - return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), - make_tuple(I1{})); - } - } - else - { - // Runtime partition - if(steps.At(num_i) == 1) - { - // raked - const auto partition_stride = unrolled_thread_lengths.At(num_i); - return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), - make_tuple(partition_stride)); - } - else - { - // packed - return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), - make_tuple(I1{})); - } - } - }, - Number{}); - - const auto lower_dims = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); - const auto upper_dims = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); - return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); -} - -template -__host__ __device__ constexpr auto CalculateLayoutOffsetIdxImpl(const Tuple& thread_lengths, - const Tuple& steps, - index_t& thread_id) -{ - return generate_tuple( - [&](auto i) { - constexpr auto num_i = Number{}; - if constexpr(is_detected>>::value) - { - // if tuple then recurrence - if constexpr(is_same_v, Tuple<>>) - { - return CalculateLayoutOffsetIdxImpl( - thread_lengths.At(num_i), Tuple<>{}, thread_id); - } - else - { - return CalculateLayoutOffsetIdxImpl( - thread_lengths.At(num_i), steps.At(num_i), thread_id); - } - } - else - { - // Update thread_id after each dim - const auto dim_thread_id = thread_id % thread_lengths.At(num_i); - thread_id /= thread_lengths.At(num_i); - if constexpr(is_same_v, Tuple<>>) - { - return dim_thread_id; - } - else - { - // Apply step - return steps.At(num_i) * dim_thread_id; - } - } + const auto slice_len = size(shape) / thread_lengths.At(num_i); + return slice_len; }, 
Number::Size()>{}); } -// Convert integer thread_idx to tuple index with steps applied -template -__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& thread_lengths, - const Tuple& steps, - const index_t thread_id) +/** + * \brief Calculate total number of blocks. + * + * \param shape Base tensor shape. + * \param tile_shape Tile shape. + * \return Tuple with the number of blocks per dimension. + */ +template +__host__ __device__ constexpr auto CalculateGridSize(const Tuple& shape, + const Tuple& tile_shape) { - // Create tmp thread_id copy for CalculateLayoutOffsetIdxImpl updates - index_t thread_id_copy = thread_id; - return CalculateLayoutOffsetIdxImpl(thread_lengths, steps, thread_id_copy); + static_assert(Tuple::Size() == Tuple::Size(), "Wrong tile_shape size."); + return generate_tuple([&](auto i) { return size(shape) / size(tile_shape); }, + Number::Size()>{}); } -// Apply steps to index represented as tuple -template -__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& steps, - const Tuple& block_idxs) +/** + * \brief Calculate scaled offset for new partition/tile. + * + * \param thread_idxs Thread multi index. + * \param partition_lengths_seq Sequence of partition lengths. + * \param old_offset_idxs Multi index offset from base tensor to shift values. + * \return Offset multi index. + */ +template +__host__ __device__ constexpr auto +CalculateOffsetMultiIdxs(const ThreadIdxs& thread_idxs, + const PartitionLengthsSeq& partition_lengths_seq, + const OldOffsetIdxs& old_offset_idxs) { - return generate_tuple( - [&](auto i) { - constexpr auto num_i = Number{}; - if constexpr(is_detected>>::value) - { - // if tuple then recurrence - if constexpr(is_same_v, Tuple<>>) - { - return CalculateLayoutOffsetIdx(Tuple<>{}, block_idxs.At(num_i)); - } - else - { - return CalculateLayoutOffsetIdx(steps.At(num_i), block_idxs.At(num_i)); - } - } - else - { - if constexpr(is_same_v, Tuple<>>) - { - return block_idxs.At(num_i); - } - else - { - // apply step - return steps.At(num_i) * block_idxs.At(num_i); - } - } - }, - Number::Size()>{}); + return thread_idxs * partition_lengths_seq + old_offset_idxs; } -// User passes only shape per block to the make_local_tile function. This function calculates -// block layout based on the shape. -template -__host__ __device__ constexpr auto CalculateBlockLengths(const Tuple& shape, - const Tuple& tile_shape) -{ - return generate_tuple( - [&](auto i) { - constexpr auto num_i = Number{}; - if constexpr(is_detected>>::value) - { - // if tuple then recurrence - return CalculateBlockLengths(shape.At(num_i), tile_shape.At(num_i)); - } - else - { - return shape.At(num_i) / tile_shape.At(num_i); - } - }, - Number::Size()>{}); -} } // namespace /** - * \brief Create local partition for thread. + * \brief Create local partition for thread (currently only packed partition + * is supported). * * \param tensor Tensor for partition. - * \param thread_lengths Layout of threads. + * \param thread_lengths Layout of threads (cannot be nested). * \param thread_id Thread index represented as integer. - * \param steps Thread step (default=1, raked partition) * \return Partition tensor. 
 */ -template > -__host__ __device__ constexpr auto make_local_partition(const TensorType& tensor, - const ThreadLengthsTuple& thread_lengths, - const index_t thread_id, - const StepsTuple steps = StepsTuple{}) +template +__host__ __device__ constexpr auto +make_local_partition(TensorType& tensor, + [[maybe_unused]] const ThreadLengthsTuple& thread_lengths, + const index_t thread_id) { - // Create shape, strides and layout for new partition tensor - const auto partition_shape = CalculateLocalPartitionShape(shape(tensor), thread_lengths); - // Create new descriptor and layout - const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor(); - auto partition_desc = - CalculateLocalPartitionDescriptor(shape(tensor), thread_lengths, steps, flatten_desc); - const auto partition_layout = Layout( - partition_shape, partition_desc); - // Calculate offset for new partition tensor - const auto offset_idx = CalculateLayoutOffsetIdx(thread_lengths, steps, thread_id); - const auto partition_offset = layout(tensor)(offset_idx); - return make_tensor(tensor.GetPointer() + partition_offset, - partition_layout); + static_assert(!IsNestedTuple(ThreadLengthsTuple{})); + // Calculate new partition shape + const auto& tensor_shape = shape(tensor); + constexpr auto partition_shape = + CalculateLocalPartitionShape(decltype(tensor_shape){}, ThreadLengthsTuple{}); + // Create Thread Cluster Descriptor + constexpr auto partition_lengths_seq = generate_sequence_v2( + [&](auto I) { return size(partition_shape); }, Number{}); + constexpr auto thread_lengths_seq = + generate_sequence_v2([&](auto I) { return size(ThreadLengthsTuple{}); }, + Number{}); + constexpr auto thread_cluster_desc_ = make_cluster_descriptor(thread_lengths_seq); + // Calculate thread idxs and offsets + const auto thread_idxs = thread_cluster_desc_.CalculateBottomIndex(make_multi_index(thread_id)); + const auto offset_multi_idxs = + CalculateOffsetMultiIdxs(thread_idxs, partition_lengths_seq, tensor.GetMultiIdxOffsets()); + // Create new layout and tensor + auto& flatten_desc = layout(tensor).GetUnrolledDescriptor(); + const auto partition_layout = + Layout, decltype(flatten_desc)>( + partition_shape, flatten_desc); + auto partition_tensor = + make_tensor(tensor.GetPointer(), partition_layout); + // Apply offsets + partition_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs)); + return partition_tensor; } /** - * \brief Create local tile for thread block. + * \brief Create local tile for thread block (currently only packed tile + * is supported). + * + * \note For now, use a 2d tile_shape to get + * the best performance. + * * * \param tensor Tensor for partition. * \param tile_shape Shapes of requested tile. - * \param block_idx Block index represented as tuple. - * \param steps Block step (default=1, raked partition) + * \param block_id Block index represented as integer. + * \return Tile tensor.
*/ -template > -__host__ __device__ constexpr auto make_local_tile(const TensorType& tensor, - const BlockShapeTuple& tile_shape, - const BlockIdxTuple& block_idx, - const StepsTuple steps = StepsTuple{}) +template +__host__ __device__ constexpr auto +make_local_tile(const TensorType& tensor, const BlockShapeTuple& tile_shape, const index_t block_id) { - // Create block lengths, strides and layout for new tile tensor - const auto block_lengths = CalculateBlockLengths(shape(tensor), tile_shape); - // Create new descriptor and layout - const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor(); - auto tile_desc = - CalculateLocalPartitionDescriptor(tile_shape, block_lengths, steps, flatten_desc); - const auto tile_layout = Layout, decltype(tile_desc)>( - tile_shape, tile_desc); - // Calculate offset for new partition tensor - const auto offset_idx = CalculateLayoutOffsetIdx(steps, block_idx); - const auto tile_offset = layout(tensor)(offset_idx); - return make_tensor(tensor.GetPointer() + tile_offset, - tile_layout); + static_assert(!IsNestedTuple(BlockShapeTuple{})); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + auto& aligned_desc = layout(tensor).GetMergedNestingDescriptor(); + + if constexpr(BlockShapeTuple::Size() == I2) + { + // Optimized version for 2d tile shape [MxK] + const auto block_2_tile_map = + BlockToCTileMap_M00_N0_M01Adapt>(aligned_desc); + const auto block_work_idx = + block_2_tile_map.CalculateBottomIndex(make_multi_index(block_id)); + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * size<0>(tile_shape)); + const index_t k_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * size<1>(tile_shape)); + const auto offset_multi_idxs = + make_tuple(m_block_data_idx_on_grid, k_block_data_idx_on_grid); + // Create new layout and tensor + const auto tile_layout = + Layout, decltype(aligned_desc)>(tile_shape, + aligned_desc); + auto tile_tensor = + make_tensor(tensor.GetPointer(), tile_layout); + // Apply offsets + tile_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs)); + return tile_tensor; + } + else + { + // Calculate offsets + // Sequence with data to process per block + constexpr auto tile_shape_seq = + generate_sequence_v2([](auto I) { return size(BlockShapeTuple{}.At(I)); }, + Number{}); + // Tuple with number of blocks + const auto block_lengths = CalculateGridSize(shape(tensor), tile_shape); + constexpr auto block_cluster_desc_ = make_cluster_descriptor(block_lengths); + const auto block_idxs = + block_cluster_desc_.CalculateBottomIndex(make_multi_index(block_id)); + const auto offset_multi_idxs = + CalculateOffsetMultiIdxs(block_idxs, tile_shape_seq, tensor.GetMultiIdxOffsets()); + // Create new layout and tensor + const auto tile_layout = + Layout, decltype(aligned_desc)>(tile_shape, + aligned_desc); + auto tile_tensor = + make_tensor(tensor.GetPointer(), tile_layout); + // Apply offsets + tile_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs)); + return tile_tensor; + } } } // namespace wrapper diff --git a/include/ck/wrapper/utils/tensor_utils.hpp b/include/ck/wrapper/utils/tensor_utils.hpp index 1e932e62e1..7ec080760a 100644 --- a/include/ck/wrapper/utils/tensor_utils.hpp +++ b/include/ck/wrapper/utils/tensor_utils.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. 
All rights reserved. #pragma once @@ -10,6 +10,7 @@ #include "ck/utility/tuple_helper.hpp" #include "ck/utility/dynamic_buffer.hpp" #include "ck/utility/amd_address_space.hpp" +#include "ck/utility/multi_index.hpp" namespace ck { namespace wrapper { @@ -27,16 +28,12 @@ using MemoryTypeEnum = AddressSpaceEnum; // Disable from doxygen docs generation /// @cond // forward declarations -template +template struct Layout; template - + typename UnrolledDescriptorType> struct Tensor; template @@ -45,13 +42,22 @@ struct Slice __host__ __device__ constexpr Slice() : from_(), to_() {} __host__ __device__ constexpr Slice(FromType from, ToType to) : from_(from), to_(to) {} + /** + * \brief Calculate slice range. + * + * \param dim Dimension size. + * \return Slice range. + */ template __host__ __device__ constexpr auto range(const T& dim) const { if constexpr(is_same_v || is_same_v || is_same_v) { - assert(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_) && "Invalid range"); + if(!(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_))) + { + throw std::runtime_error("Invalid range"); + } if(to_ < 0) { return dim - from_ + to_ + 1; @@ -101,40 +107,27 @@ using is_tuple = decltype(std::declval().IsTuple()); template + typename UnrolledDescriptorType> constexpr auto make_tensor(ElementType* pointer, - const Layout& layout) + const Layout& layout) { - return Tensor(pointer, layout); + return Tensor(pointer, layout); } /** * \brief Make SGPR or VGPR tensor function. * * \tparam MemoryType Type of memory. - * \tparam NumVectors Number of vectors. - * \tparam ScalarPerVector Scalars per vector. * \tparam ElementType Memory data type. * \return Constructed tensor. */ template -constexpr auto make_register_tensor() + typename ElementType, + typename Shape, + typename UnrolledDescriptorType> +constexpr auto make_register_tensor(const Layout& layout) { - const auto layout = make_layout(make_tuple(Number{}), make_tuple(Number<1>{})); - return Tensor>, - std::remove_const_t>, - NumVectors, - ScalarPerVector>(layout); + return Tensor(layout); } /** @@ -146,15 +139,9 @@ constexpr auto make_register_tensor() template -__host__ __device__ constexpr const auto& layout(const Tensor& tensor) + typename UnrolledDescriptorType> +__host__ __device__ constexpr const auto& +layout(const Tensor& tensor) { return tensor.GetLayout(); } @@ -170,15 +157,9 @@ template -__host__ __device__ constexpr auto size(const Tensor& tensor) + typename UnrolledDescriptorType> +__host__ __device__ constexpr auto +size(const Tensor& tensor) { return size(tensor.GetLayout()); } @@ -194,15 +175,9 @@ template -__host__ __device__ constexpr auto rank(const Tensor& tensor) + typename UnrolledDescriptorType> +__host__ __device__ constexpr auto +rank(const Tensor& tensor) { return rank(tensor.GetLayout()); } @@ -218,15 +193,9 @@ template -__host__ __device__ constexpr auto depth(const Tensor& tensor) + typename UnrolledDescriptorType> +__host__ __device__ constexpr auto +depth(const Tensor& tensor) { return depth(tensor.GetLayout()); } @@ -240,15 +209,9 @@ __host__ __device__ constexpr auto depth(const Tensor -__host__ __device__ constexpr const auto& shape(const Tensor& tensor) + typename UnrolledDescriptorType> +__host__ __device__ constexpr const auto& +shape(const Tensor& tensor) { return shape(tensor.GetLayout()); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp index 56b0ce7914..750d4d14f8 100644 --- 
a/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -10,6 +10,7 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/numeric.hpp" namespace ck { namespace tensor_operation { diff --git a/test/wrapper/test_copy.cpp b/test/wrapper/test_copy.cpp index 5cf09a54be..e7fa3c539b 100644 --- a/test/wrapper/test_copy.cpp +++ b/test/wrapper/test_copy.cpp @@ -21,49 +21,59 @@ template + bool UseOptimizedCopy> __global__ void TestCopyDevice(const InputTensor input_tensor, OutputTensor output_tensor, const BlockShape tile_shape, - const ThreadLayoutShape thread_layout, - const LocalTileSteps block_steps, - const LocalPartitionSteps thread_steps) + const ThreadLayoutShape thread_layout) { __shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)]; - auto tensor_lds = ck::wrapper::make_tensor( + const auto tensor_lds = ck::wrapper::make_tensor( p_shared, ck::wrapper::make_layout(tile_shape)); - const auto block_idxs = ck::make_tuple(ck::make_tuple(0, 0), blockIdx.x); + const auto block_idx = static_cast(blockIdx.x); // Get local tiles for global memory - const auto input_local_tile = - ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idxs, block_steps); + const auto input_local_tile = ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idx); const auto output_local_tile = - ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idxs, block_steps); + ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idx); // Get partition per thread - const auto input_local_partition = ck::wrapper::make_local_partition( - input_local_tile, thread_layout, threadIdx.x, thread_steps); + const auto input_local_partition = + ck::wrapper::make_local_partition(input_local_tile, thread_layout, threadIdx.x); auto lds_local_partition = - ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x, thread_steps); - auto output_local_partition = ck::wrapper::make_local_partition( - output_local_tile, thread_layout, threadIdx.x, thread_steps); + ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x); + auto output_local_partition = + ck::wrapper::make_local_partition(output_local_tile, thread_layout, threadIdx.x); // Allocate VGPR - constexpr ck::index_t scalar_per_vector = 1; - constexpr ck::index_t vgpr_size = ck::wrapper::size(lds_local_partition); - auto tensor_vgpr = ck::wrapper::make_register_tensor(); + auto tensor_vgpr = + ck::wrapper::make_register_tensor( + layout(lds_local_partition)); // Perform copy - ck::wrapper::copy(input_local_partition, lds_local_partition); - ck::wrapper::copy(lds_local_partition, tensor_vgpr); - ck::wrapper::copy(tensor_vgpr, output_local_partition); + if constexpr(UseOptimizedCopy) + { + using DimAccessOrder = ck::Tuple, ck::Number<0>>; + constexpr ck::index_t vector_dim = 0; + constexpr ck::index_t scalar_per_vector = 2; + ck::wrapper::copy(input_local_partition, + lds_local_partition); + // TODO: Enable optimized copy for static buffers + ck::wrapper::copy(lds_local_partition, + tensor_vgpr); + ck::wrapper::copy(tensor_vgpr, + 
output_local_partition); + } + else + { + ck::wrapper::copy(input_local_partition, lds_local_partition); + ck::wrapper::copy(lds_local_partition, tensor_vgpr); + ck::wrapper::copy(tensor_vgpr, output_local_partition); + } } +template void PerformCopyGlobalToGlobalViaLDS() { const auto shape = @@ -89,15 +99,8 @@ void PerformCopyGlobalToGlobalViaLDS() auto output_tensor_global = ck::wrapper::make_tensor( static_cast(out_buf.GetDeviceBuffer()), layout); - const auto thread_layout = - ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<32>{}); - const auto tile_shape = - ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<64>{}); - - const auto thread_steps = - ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<2>{}); - const auto block_steps = - ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<64>{}); + const auto thread_layout = ck::make_tuple(ck::Number<1>{}, ck::Number<32>{}); + const auto tile_shape = ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}); const ck::index_t grid_size = ck::math::integer_divide_ceil( ck::wrapper::size(input_tensor_global), ck::wrapper::size(tile_shape)); @@ -106,8 +109,7 @@ void PerformCopyGlobalToGlobalViaLDS() decltype(output_tensor_global), decltype(tile_shape), decltype(thread_layout), - decltype(block_steps), - decltype(thread_steps)>; + UseOptimizedCopy>; launch_and_time_kernel(StreamConfig{}, kernel, dim3(grid_size), @@ -116,9 +118,7 @@ void PerformCopyGlobalToGlobalViaLDS() input_tensor_global, output_tensor_global, tile_shape, - thread_layout, - block_steps, - thread_steps); + thread_layout); // Verify results std::vector output_data(ck::wrapper::size(shape)); @@ -126,4 +126,5 @@ void PerformCopyGlobalToGlobalViaLDS() EXPECT_TRUE(ck::utils::check_err(output_data, input_data)); } -TEST(TestCopy, CopyGlobalToGlobalViaLDS) { PerformCopyGlobalToGlobalViaLDS(); } +TEST(TestCopyGlobalToGlobalViaLDS, GenericCopy) { PerformCopyGlobalToGlobalViaLDS(); } +TEST(TestCopyGlobalToGlobalViaLDS, OptimizedCopy) { PerformCopyGlobalToGlobalViaLDS(); } diff --git a/test/wrapper/test_partition.cpp b/test/wrapper/test_partition.cpp index df56b879f6..cacbfe9d88 100644 --- a/test/wrapper/test_partition.cpp +++ b/test/wrapper/test_partition.cpp @@ -29,42 +29,29 @@ TEST(TestPartition, LocalPartition) const auto tensor = ck::wrapper::make_tensor(data.data(), layout); - const auto thread_steps = - ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<1>{}), ck::Number<1>{}); - const auto thread_layout = - ck::make_tuple(ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}), ck::Number<1>{}); - - for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++) - { - const auto raked_partition = - ck::wrapper::make_local_partition(tensor, thread_layout, thread_id); - - const auto expected_partition_size = - ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout); - EXPECT_EQ(ck::wrapper::size(raked_partition), expected_partition_size); - EXPECT_EQ(raked_partition(0), thread_id); - } + const auto thread_steps = ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}); + const auto thread_layout = ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}); for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++) { const auto packed_partition = - ck::wrapper::make_local_partition(tensor, thread_layout, thread_id, thread_steps); + ck::wrapper::make_local_partition(tensor, thread_layout, thread_id); const auto expected_partition_size = 
ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout); - const auto expected_partition_first_val = thread_id * ck::wrapper::size<0, 0>(thread_steps); + const auto expected_partition_first_val = thread_id * ck::wrapper::size<0>(thread_steps); + const auto expected_partition_second_val = expected_partition_first_val + 1; EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size); EXPECT_EQ(packed_partition(0), expected_partition_first_val); + EXPECT_EQ(packed_partition(1), expected_partition_second_val); } } TEST(TestPartition, LocalTile) { - const auto shape = - ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{}); - const auto strides = - ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{}); - const auto layout = ck::wrapper::make_layout(shape, strides); + const auto shape = ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}, ck::Number<4>{}); + const auto strides = ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}, ck::Number<64>{}); + const auto layout = ck::wrapper::make_layout(shape, strides); std::vector data(ck::wrapper::size(layout)); std::iota(data.begin(), data.end(), 0); @@ -72,48 +59,34 @@ TEST(TestPartition, LocalTile) const auto tensor = ck::wrapper::make_tensor(data.data(), layout); - const auto block_steps = - ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{}); - const auto block_shape = - ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{}); - const auto block_layout = - ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{}); + const auto block_shape = ck::make_tuple(ck::Number<2>{}, ck::Number<4>{}, ck::Number<2>{}); + const auto num_blocks = + ck::make_tuple(ck::wrapper::size<0>(shape) / ck::wrapper::size<0>(block_shape), + ck::wrapper::size<1>(shape) / ck::wrapper::size<1>(block_shape), + ck::wrapper::size<2>(shape) / ck::wrapper::size<2>(block_shape)); + std::vector block_idxs(ck::wrapper::size(num_blocks)); + std::iota(block_idxs.begin(), block_idxs.end(), 0); - std::vector, ck::index_t>> block_idxs; - for(ck::index_t x = 0; x < ck::wrapper::size<0, 0>(block_layout); x++) + for(auto block_idx : block_idxs) { - for(ck::index_t y = 0; y < ck::wrapper::size<0, 1>(block_layout); y++) - { - for(ck::index_t z = 0; z < ck::wrapper::size<1>(block_layout); z++) - { - block_idxs.emplace_back(ck::make_tuple(x, y), z); - } - } - } - - for(const auto& block_idx : block_idxs) - { - const auto raked_tile = ck::wrapper::make_local_tile(tensor, block_shape, block_idx); + const auto packed_tile = ck::wrapper::make_local_tile(tensor, block_shape, block_idx); const auto expected_tile_size = ck::wrapper::size(block_shape); - EXPECT_EQ(ck::wrapper::size(raked_tile), expected_tile_size); - EXPECT_EQ(raked_tile(0), layout(block_idx)); - } + auto expected_tile_first_val = (block_idx % ck::wrapper::size<2>(num_blocks)) * + ck::wrapper::size<2>(block_shape) * + ck::wrapper::size<2>(strides); + block_idx /= ck::wrapper::size<2>(num_blocks); + expected_tile_first_val += (block_idx % ck::wrapper::size<1>(num_blocks)) * + ck::wrapper::size<1>(block_shape) * + ck::wrapper::size<1>(strides); + block_idx /= ck::wrapper::size<1>(num_blocks); + expected_tile_first_val += (block_idx % ck::wrapper::size<0>(num_blocks)) * + ck::wrapper::size<0>(block_shape) * + ck::wrapper::size<0>(strides); - for(const auto& block_idx : block_idxs) - { - const auto packed_tile = - ck::wrapper::make_local_tile(tensor, block_shape, block_idx, block_steps); - - 
const auto expected_tile_size = ck::wrapper::size(block_shape); - const auto expected_tile_first_val = - ck::wrapper::size<0, 0>(block_idx) * ck::wrapper::size<0, 0>(block_shape) * - ck::wrapper::size<0, 0>(strides) + - ck::wrapper::size<0, 1>(block_idx) * ck::wrapper::size<0, 1>(block_shape) * - ck::wrapper::size<0, 1>(strides) + - ck::wrapper::size<1>(block_idx) * ck::wrapper::size<1>(block_shape) * - ck::wrapper::size<1>(strides); + const auto expected_tile_second_val = expected_tile_first_val + 1; EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size); EXPECT_EQ(packed_tile(0), expected_tile_first_val); + EXPECT_EQ(packed_tile(1), expected_tile_second_val); } } diff --git a/test/wrapper/test_tensor.cpp b/test/wrapper/test_tensor.cpp index 2d4d6f2750..3c7d877528 100644 --- a/test/wrapper/test_tensor.cpp +++ b/test/wrapper/test_tensor.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -100,31 +100,26 @@ TEST(TestTensor, ReadWriteHostMemory) __global__ void TestTensorReadWriteDevice(void* data, void* success) { - constexpr ck::index_t nelems = 8; - constexpr ck::index_t scalar_per_vector = 1; + constexpr ck::index_t nelems = 8; __shared__ ck::index_t p_shared[nelems]; ck::index_t* casted_data_ptr = static_cast(data); bool* casted_success_ptr = static_cast(success); const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2)); + constexpr auto vgpr_layout = + ck::wrapper::make_layout(make_tuple(ck::Number{}), make_tuple(ck::Number<1>{})); auto tensor_global = ck::wrapper::make_tensor(casted_data_ptr, layout); - auto tensor_lds = ck::wrapper::make_tensor(p_shared, layout); - auto tensor_vgpr = ck::wrapper::make_register_tensor(); - auto tensor_sgpr = ck::wrapper::make_register_tensor(); + auto tensor_lds = ck::wrapper::make_tensor(p_shared, layout); + auto tensor_vgpr = + ck::wrapper::make_register_tensor( + vgpr_layout); InitTensor(tensor_global); InitTensor(tensor_lds); StaticInitTensor(tensor_vgpr); - StaticInitTensor(tensor_sgpr); *casted_success_ptr = TestTensorCheck1d(tensor_global); *casted_success_ptr &= TestTensorCheck3d(tensor_global); @@ -133,8 +128,6 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success) *casted_success_ptr &= TestTensorCheck3d(tensor_lds); *casted_success_ptr &= StaticTestTensorCheck1d(tensor_vgpr); - - *casted_success_ptr &= StaticTestTensorCheck1d(tensor_sgpr); } TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)
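
For reference, the sketch below (illustrative only, not part of the patch) gathers the reworked wrapper API in one place: make_local_tile now takes an integer block id, make_local_partition takes an integer thread id, and the optimized copy takes a dimension access order, a vectorized dimension and a scalars-per-vector count as compile-time parameters. It is distilled from the TestCopyDevice kernel and PerformCopyGlobalToGlobalViaLDS host code in test_copy.cpp above; the kernel name is hypothetical, the register-tensor stage is omitted, and the explicit template-argument spelling of make_tensor, make_local_partition and copy is an assumption, since the hunks elide template arguments.

// Illustrative sketch only; mirrors the flow exercised by TestCopyDevice above.
// Explicit template arguments are reconstructed by assumption.
#include "ck/ck.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"

template <typename InputTensor, typename OutputTensor, typename TileShape, typename ThreadLayout>
__global__ void CopyViaLdsSketch(const InputTensor input_tensor,
                                 OutputTensor output_tensor,
                                 const TileShape tile_shape,
                                 const ThreadLayout thread_layout)
{
    // Shared-memory tensor covering one tile.
    __shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)];
    const auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
        p_shared, ck::wrapper::make_layout(tile_shape));

    // Tiles are selected with a plain integer block id (packed tiling).
    const auto block_idx = static_cast<ck::index_t>(blockIdx.x);
    const auto in_tile   = ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idx);
    const auto out_tile  = ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idx);

    // Per-thread partitions are selected with the integer thread id.
    const auto in_part = ck::wrapper::make_local_partition(in_tile, thread_layout, threadIdx.x);
    auto lds_part      = ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x);
    auto out_part      = ck::wrapper::make_local_partition(out_tile, thread_layout, threadIdx.x);

    // Optimized copy: dimension access order, vectorized dimension and
    // scalars per vector are compile-time parameters, as in the test above.
    using DimAccessOrder                    = ck::Tuple<ck::Number<1>, ck::Number<0>>;
    constexpr ck::index_t vector_dim        = 0;
    constexpr ck::index_t scalar_per_vector = 2;
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(in_part, lds_part);
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(lds_part, out_part);
}

On the host side the test pairs this kernel with tile_shape = (Number<4>, Number<64>) and thread_layout = (Number<1>, Number<32>), and launches size(shape) / size(tile_shape) blocks.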