mirror of https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
Add optimized copy to ck wrapper (#1126)
* Add optimized copy to ck wrapper
* Example optimizations
* Fixes
* Move img2col test to client example
* Refactor example
* Fix docs
* Fixes
* Fix
* Fixes
* Fixes
* Fixes
* Fixes
* Fixes
---------
Co-authored-by: zjing14 <zhangjing14@gmail.com>
[ROCm/composable_kernel commit: 7e4eb4b800]
This commit is contained in:
17	CHANGELOG.md
@@ -2,7 +2,21 @@

 Full documentation for Composable Kernel is not yet available.

-## (Unreleased) CK for ROCm 6.0.0
+## (Unreleased) CK
+
+### Fixes
+None
+
+### Optimizations
+None
+
+### Additions
+- Introduce wrapper sublibrary (limited functionality). (#1071, #1098, #1108, #1126)
+
+### Changes
+None
+
+## CK for ROCm 6.0.0

 ### Fixes
 - Fixed a hazard associated with inline v_dot (#808)
@@ -19,7 +33,6 @@ None
 - Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
 - Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
 - Support for Batched Gemm DL (#732)
-- Introduce wrapper sublibrary (limited functionality). (#1071, #1098, #1108)

 ### Changes
 - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
@@ -1,150 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-
-#include "ck/ck.hpp"
-
-#include "ck/utility/number.hpp"
-#include "ck/utility/tuple.hpp"
-#include "ck/utility/sequence.hpp"
-
-#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "ck/tensor_description/multi_index_transform_helper.hpp"
-
-static constexpr auto I0 = ck::Number<0>{};
-static constexpr auto I1 = ck::Number<1>{};
-static constexpr auto I2 = ck::Number<2>{};
-
-using DataType = int;
-
-template <typename Desc>
-void Print1d(const Desc& desc)
-{
-    std::cout << "Print1d" << std::endl;
-    for(ck::index_t w = 0; w < desc.GetLength(I0); w++)
-    {
-        std::cout << desc.CalculateOffset(ck::make_tuple(w)) << " ";
-    }
-    std::cout << std::endl;
-}
-
-template <typename Desc>
-void Print2d(const Desc& desc)
-{
-    std::cout << "Print2d" << std::endl;
-    for(ck::index_t h = 0; h < desc.GetLength(I0); h++)
-    {
-        for(ck::index_t w = 0; w < desc.GetLength(I1); w++)
-        {
-            std::cout << desc.CalculateOffset(ck::make_tuple(h, w)) << " ";
-        }
-        std::cout << std::endl;
-    }
-}
-
-template <typename Desc>
-void Print3dCustom(const Desc& desc)
-{
-    std::cout << "Print3dCustom" << std::endl;
-    for(ck::index_t d = 0; d < desc.GetLength(I0); d++)
-    {
-        for(ck::index_t h = 0; h < desc.GetLength(I1); h++)
-        {
-            for(ck::index_t w = 0; w < desc.GetLength(I2); w++)
-            {
-                std::cout << desc.CalculateOffset(ck::make_tuple(d, h, w)) << " ";
-            }
-            std::cout << std::endl;
-        }
-        std::cout << std::endl;
-    }
-}
-
-int main()
-{
-    // Tensor descriptors traverse in row-major order (need to reverse dims)
-    std::cout << "Note: Tensor descriptor traverse in row-major" << std::endl;
-    // Basic descriptor 0, 1, 2, ... 30, 31
-    // (dims:4,8 strides:1,4)
-    const auto desc_4x8_s1x4 =
-        ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}),
-                                         ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}));
-    std::cout << "dims:4,8 strides:1,4" << std::endl;
-    Print2d(desc_4x8_s1x4);
-
-    using Cord1x1Type = ck::Tuple<ck::Number<1>, ck::Number<1>>;
-    constexpr ck::index_t offset_1x1 = desc_4x8_s1x4.CalculateOffset(Cord1x1Type{});
-    std::cout << "Constexpr calculated [1, 1] offset:" << offset_1x1 << std::endl;
-
-    // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (compile-time descriptor)
-    // dims:4,(2,4) strides:2,(1,8)
-    const auto desc_4x2x4_s2x1x8 =
-        ck::make_naive_tensor_descriptor(ck::make_tuple(4, 2, 4), ck::make_tuple(2, 1, 8));
-    // Transform to 2d (column-major, need to reverse dims)
-    const auto desc_4x2x4_s2x1x8_merged = ck::transform_tensor_descriptor(
-        desc_4x2x4_s2x1x8,
-        ck::make_tuple(ck::make_pass_through_transform(4),
-                       ck::make_merge_transform(ck::make_tuple(4, 2))),
-        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<2, 1>{}),
-        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
-
-    std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl;
-    Print2d(desc_4x2x4_s2x1x8_merged);
-
-    // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (compile-time descriptor)
-    // dims:(2,2),(2,4) strides:(1,4),(2,8)
-    const auto desc_2x2x2x4_s1x4x2x8 =
-        ck::make_naive_tensor_descriptor(ck::make_tuple(2, 2, 2, 4), ck::make_tuple(1, 4, 2, 8));
-    // Transform to 2d
-    const auto desc_2x2x2x4_s1x4x2x8_double_merged_2d = ck::transform_tensor_descriptor(
-        desc_2x2x2x4_s1x4x2x8,
-        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(2, 2)),
-                       ck::make_merge_transform(ck::make_tuple(4, 2))),
-        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<3, 2>{}),
-        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
-    // Transform to 3d
-    const auto desc_2x2x2x4_s1x4x2x8_double_merged_3d = ck::transform_tensor_descriptor(
-        desc_2x2x2x4_s1x4x2x8,
-        ck::make_tuple(ck::make_pass_through_transform(2),
-                       ck::make_pass_through_transform(2),
-                       ck::make_merge_transform(ck::make_tuple(4, 2))),
-        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<3, 2>{}),
-        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
-
-    std::cout << "dims:(2,2),(2,4) strides:(1,4),(2,8)" << std::endl;
-    Print2d(desc_2x2x2x4_s1x4x2x8_double_merged_2d);
-    Print3dCustom(desc_2x2x2x4_s1x4x2x8_double_merged_3d);
-
-    // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (compile-time descriptor)
-    // dims:((2,2),2),4 strides:((1,4),2),8
-    // Transform to 2d
-    const auto desc_2x2x2x4_s1x4x2x8_nested =
-        ck::make_naive_tensor_descriptor(ck::make_tuple(2, 2, 2, 4), ck::make_tuple(1, 4, 2, 8));
-    const auto desc_2x2x2x4_s1x4x2x8_nested_merged_3d = ck::transform_tensor_descriptor(
-        desc_2x2x2x4_s1x4x2x8_nested,
-        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(2, 2)),
-                       ck::make_pass_through_transform(2),
-                       ck::make_pass_through_transform(4)),
-        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
-        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
-    const auto desc_2x2x2x4_s1x4x2x8_nested_merged_1d = ck::transform_tensor_descriptor(
-        desc_2x2x2x4_s1x4x2x8_nested,
-        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(4, 2, 2, 2))),
-        ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
-        ck::make_tuple(ck::Sequence<0>{}));
-    const auto desc_2x2x2x4_s1x4x2x8_nested_merged_2d = ck::transform_tensor_descriptor(
-        desc_2x2x2x4_s1x4x2x8_nested_merged_3d,
-        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(2, 4)),
-                       ck::make_pass_through_transform(4)),
-        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}),
-        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
-
-    std::cout << "dims:((2,2),2),4 strides:((1,4),2),8" << std::endl;
-    Print1d(desc_2x2x2x4_s1x4x2x8_nested_merged_1d);
-    Print2d(desc_2x2x2x4_s1x4x2x8_nested_merged_2d);
-    Print3dCustom(desc_2x2x2x4_s1x4x2x8_nested_merged_3d);
-
-    return 0;
-}
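The 150-line example removed above drove merged views through raw descriptor transforms; its wrapper-based counterpart (tensor_transform_using_wrapper.cpp, still built by the CMake rules below) expresses the same views with far less plumbing. A minimal sketch of that style, assuming the ck::wrapper::make_layout/size/operator() API that appears later in this commit, so treat the exact calls as illustrative:

// Illustrative sketch only (not part of this commit): a nested wrapper layout
// replaces the explicit make_merge_transform/transform_tensor_descriptor calls.
#include <iostream>

#include "ck/utility/tuple.hpp"
#include "ck/wrapper/layout.hpp"

int main()
{
    // dims:4,(2,4) strides:2,(1,8) - nested dimensions are merged by the Layout
    const auto layout = ck::wrapper::make_layout(
        ck::make_tuple(4, ck::make_tuple(2, 4)), ck::make_tuple(2, ck::make_tuple(1, 8)));
    for(ck::index_t h = 0; h < ck::wrapper::size<0>(layout); h++)
    {
        for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
        {
            // operator() computes the offset directly from the nested layout
            std::cout << layout(ck::make_tuple(h, w)) << " ";
        }
        std::cout << std::endl;
    }
    return 0;
}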
@@ -1,4 +1,4 @@
-add_executable(client_tensor_transform tensor_transform.cpp)
-target_link_libraries(client_tensor_transform PRIVATE composable_kernel::device_other_operations)
 add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp)
 target_link_libraries(client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations)
+add_executable(client_wrapper_img2col wrapper_img2col.cpp)
+target_link_libraries(client_wrapper_img2col PRIVATE composable_kernel::device_other_operations)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>

180	client_example/25_wrapper/wrapper_img2col.cpp
Normal file
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <numeric>
+#include <array>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <initializer_list>
+#include <vector>
+
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/wrapper/layout.hpp"
+#include "ck/wrapper/tensor.hpp"
+#include "ck/wrapper/operations/copy.hpp"
+
+static constexpr ck::index_t NumDimSpatial = 3;
+using DataType                             = float;
+using InputLayout                          = ck::tensor_layout::convolution::NDHWGC;
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Test copy from Global to Global through LDS and VGPR
+template <typename InputTensor,
+          typename OutputTensor,
+          typename BlockShape,
+          typename ThreadLayoutShape>
+__global__ void DeviceImageToColumnPad0(InputTensor input_tensor,
+                                        OutputTensor output_tensor,
+                                        const BlockShape tile_shape,
+                                        const ThreadLayoutShape thread_layout)
+{
+    const ck::index_t block_idx = static_cast<ck::index_t>(blockIdx.x);
+
+    // Get local tiles for global memory
+    auto input_local_tile  = ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idx);
+    auto output_local_tile = ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idx);
+
+    // Get partition per thread
+    const auto input_local_partition =
+        ck::wrapper::make_local_partition(input_local_tile, thread_layout, threadIdx.x);
+    auto output_local_partition =
+        ck::wrapper::make_local_partition(output_local_tile, thread_layout, threadIdx.x);
+
+    // Perform copy
+    using DimAccessOrder                    = ck::Tuple<ck::Number<0>, ck::Number<1>>;
+    constexpr ck::index_t vector_dim        = 1;
+    constexpr ck::index_t scalar_per_vector = 4;
+    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(input_local_partition,
+                                                                     output_local_partition);
+}
+
+void PerformImageToColumnPad0(const ck::index_t G,
+                              const ck::index_t N,
+                              const ck::index_t Di,
+                              const ck::index_t Hi,
+                              const ck::index_t Wi,
+                              const ck::index_t Do,
+                              const ck::index_t Ho,
+                              const ck::index_t Wo,
+                              const ck::index_t C,
+                              const ck::index_t Z,
+                              const ck::index_t Y,
+                              const ck::index_t X,
+                              std::array<ck::index_t, NumDimSpatial> filter_strides,
+                              std::array<ck::index_t, NumDimSpatial> filter_dilations)
+{
+    const ck::index_t ZYXC = Z * Y * X * C;
+    const ck::index_t GC   = G * C;
+
+    // shape: ((G, (Wo, Ho, Do, N)), (C, X, Y, Z))
+    const auto shape = ck::make_tuple(ck::make_tuple(G, ck::make_tuple(Wo, Ho, Do, N)),
+                                      ck::make_tuple(C, X, Y, Z));
+    const auto in_strides =
+        ck::make_tuple(ck::make_tuple(C,
+                                      ck::make_tuple(filter_strides[2] * GC,
+                                                     filter_strides[1] * Wi * GC,
+                                                     filter_strides[0] * Hi * Wi * GC,
+                                                     Di * Hi * Wi * GC)),
+                       ck::make_tuple(1,
+                                      filter_dilations[2] * GC,
+                                      filter_dilations[1] * Wi * GC,
+                                      filter_dilations[0] * Hi * Wi * GC));
+    const auto in_layout = ck::wrapper::make_layout(shape, in_strides);
+
+    const auto out_strides = ck::make_tuple(
+        ck::make_tuple(
+            ZYXC,
+            ck::make_tuple(ZYXC * G, Wo * ZYXC * G, Ho * Wo * ZYXC * G, Do * Ho * Wo * ZYXC * G)),
+        ck::make_tuple(1, C, X * C, Y * X * C));
+    const auto out_layout = ck::wrapper::make_layout(shape, out_strides);
+
+    const ck::index_t input_size = N * Di * Hi * Wi * GC;
+    // Global memory buffers
+    SimpleDeviceMem in_buf(input_size * sizeof(DataType));
+    SimpleDeviceMem out_buf(ck::wrapper::size(out_layout) * sizeof(DataType));
+
+    // User can choose an appropriate number of threads and sizes per block
+    const auto thread_layout = ck::make_tuple(ck::Number<8>{}, ck::Number<16>{});
+    // This example doesn't support padding, so the user should select tile
+    // sizes which divide the shape evenly
+    const auto tile_shape = ck::make_tuple(ck::Number<32>{}, ck::Number<64>{});
+
+    // Create buffers for global memory
+    auto input_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
+        static_cast<const DataType*>(in_buf.GetDeviceBuffer()), in_layout);
+    auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
+        static_cast<DataType*>(out_buf.GetDeviceBuffer()), out_layout);
+
+    const ck::index_t grid_size = ck::math::integer_divide_ceil(ck::wrapper::size<0>(in_layout),
+                                                                ck::wrapper::size<0>(tile_shape)) *
+                                  ck::math::integer_divide_ceil(ck::wrapper::size<1>(in_layout),
+                                                                ck::wrapper::size<1>(tile_shape));
+
+    const auto kernel    = DeviceImageToColumnPad0<decltype(input_tensor_global),
+                                                   decltype(output_tensor_global),
+                                                   decltype(tile_shape),
+                                                   decltype(thread_layout)>;
+    const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
+                                                  kernel,
+                                                  dim3(grid_size),
+                                                  dim3(ck::wrapper::size(thread_layout)),
+                                                  0,
+                                                  input_tensor_global,
+                                                  output_tensor_global,
+                                                  tile_shape,
+                                                  thread_layout);
+
+    std::size_t num_btype = G * N * Do * Ho * Wo * ZYXC * 2 * sizeof(DataType);
+    float gb_per_sec      = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s"
+              << std::endl;
+}
+
+int main(int argc, char* argv[])
+{
+    constexpr ck::index_t G  = 4;  // number of groups
+    constexpr ck::index_t N  = 32; // batch
+    constexpr ck::index_t C  = 64; // input channel (per group)
+    constexpr ck::index_t Z  = 3;  // filter D
+    constexpr ck::index_t Y  = 3;  // filter H
+    constexpr ck::index_t X  = 3;  // filter W
+    constexpr ck::index_t Di = 9;  // input D
+    constexpr ck::index_t Hi = 9;  // input H
+    constexpr ck::index_t Wi = 7;  // input W
+    constexpr ck::index_t Do = 7;  // output D
+    constexpr ck::index_t Ho = 7;  // output H
+    constexpr ck::index_t Wo = 5;  // output W
+    PerformImageToColumnPad0(G,
+                             N,
+                             Di,
+                             Hi,
+                             Wi,
+                             Do,
+                             Ho,
+                             Wo,
+                             C,
+                             Z,
+                             Y,
+                             X,
+                             {1, 1, 1} /*filter_strides*/,
+                             {1, 1, 1} /*filter_dilations*/);
+    return 0;
+}
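A quick size check for the dimensions hard-coded in main() above (added here for illustration; the numbers follow directly from the code): per group, image-to-column produces an (N*Do*Ho*Wo) x (Z*Y*X*C) matrix, and since this example has no padding, the (32, 64) tile shape must divide it evenly.

// Worked check of the tile divisibility requirement for the values in main().
constexpr int rows = 32 * 7 * 7 * 5; // N * Do * Ho * Wo = 7840
constexpr int cols = 3 * 3 * 3 * 64; // Z * Y * X * C    = 1728
static_assert(rows % 32 == 0 && cols % 64 == 0,
              "the (32, 64) tile shape must evenly divide the img2col matrix");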
@@ -18,8 +18,7 @@ Description

 The CK library provides a lightweight wrapper for more complex operations implemented in
-the library. It allows indexing of nested layouts using a simple interface
-(avoiding complex descriptor transformations) and memory access (using Tensor).
+the library.

 Example:

@@ -54,6 +53,11 @@ Output::

     1 5 9 13 17 21 25 29
     2 6 10 14 18 22 26 30

+Advanced examples:
+
+* `Image to column <https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_img2col.cpp>`_
+
 -------------------------------------
 Layout
 -------------------------------------
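For readers of the section above: the wrapper addresses merged views through nested indices. A sketch, not from the upstream docs — the make_layout/operator() usage is assumed from the headers changed in this commit:

// Hypothetical snippet: a ((2, 2), 8) nested shape behaves as a merged (4, 8) view.
const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 8),
                                             ck::make_tuple(ck::make_tuple(1, 2), 4));
// Offset of element (3, 2) of the merged view, written as a nested index:
// 1 * 1 + 1 * 2 + 2 * 4 = 11.
const ck::index_t offset = layout(ck::make_tuple(ck::make_tuple(1, 1), 2));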
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -19,6 +19,12 @@ struct is_known_at_compile_time<index_t>
     static constexpr bool value = false;
 };

+template <>
+struct is_known_at_compile_time<unsigned int>
+{
+    static constexpr bool value = false;
+};
+
 template <>
 struct is_known_at_compile_time<long_index_t>
 {
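A plausible reading of why the unsigned int specialization is needed (my inference, not stated in the commit): HIP thread/block built-ins such as threadIdx.x are unsigned int, so wrapper indices derived from them must be classified as runtime values, in contrast to Number<>:

// Sketch: runtime vs. compile-time classification by the trait above.
static_assert(!ck::is_known_at_compile_time<unsigned int>::value,
              "thread indices are runtime values");
static_assert(ck::is_known_at_compile_time<ck::Number<4>>::value,
              "Number<> remains a compile-time constant");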
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -14,22 +14,28 @@ namespace wrapper {
 * \tparam Shape Tuple of Number<> (for compile-time layout) or index_t
 * (dynamic layout). It is possible to pass nested shapes
 * (e.g. ((4, 2), 2)), nested dimensions are merged.
- * \tparam UnnestedDescriptorType Tensor descriptor for unnested shape dims.
+ * \tparam UnrolledDescriptorType Tensor descriptor for unnested shape dims.
 */
-template <typename Shape, typename UnnestedDescriptorType>
+template <typename Shape, typename UnrolledDescriptorType>
 struct Layout
 {
     private:
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};

-    // Generate default idxs tuple (idx with all merged nested shapes)
+    /**
+     * \brief Generate default indices tuple (idx with all merged nested shapes).
+     *
+     * \param shape Shape to align.
+     * \return Multi idx tuple with zeros.
+     */
     template <typename... Ts>
-    __host__ __device__ constexpr static auto GenerateDefaultIdxsTuple(const Tuple<Ts...>&)
+    __host__ __device__ constexpr static auto
+    GenerateDefaultIdxsTuple([[maybe_unused]] const Tuple<Ts...>& shape)
     {
         return generate_tuple(
             [&](auto) {
-                if constexpr(!UnnestedDescriptorType::IsKnownAtCompileTime())
+                if constexpr(!remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime())
                 {
                     // runtime layout
                     return index_t(0);
@@ -43,11 +49,18 @@ struct Layout
             Number<Tuple<Ts...>::Size()>{});
     }

-    // Generate LowerDims in Compile-time for MergeTrasform using passed Type
-    // If element of Tuple<Ts...> is also tuple, then merge (generate sequence for merge)
-    // If tuple is element, then pass through (sequence with one element)
+    /**
+     * \brief Generate lower dims in compile-time for the Merge transform using
+     * provided type. If element of nested Tuple<Ts...> is also a tuple, then
+     * merge (generate sequence for merge). If tuple is element, then pass
+     * through (sequence with one element).
+     *
+     * \param shape Shape to align.
+     * \return LowerDims for MergeTransform.
+     */
     template <typename Idx, typename... Ts>
-    __host__ __device__ constexpr static auto GenerateLowerDim(const Tuple<Ts...>&)
+    __host__ __device__ constexpr static auto
+    GenerateLowerDim([[maybe_unused]] const Tuple<Ts...>& shape)
     {
         if constexpr(Idx::value == 0)
         {
@@ -87,11 +100,17 @@ struct Layout
         }
     }

-    // Iterate over nested tuples in shape
-    // Unroll nested tuples to align Tuple<ShapeDims...> to Tuple<IdxDims...>
-    // Example idx: (1, 1), 1, 1
-    // Example shape: (2, (2, 2)), 2, (2, 2)
-    // Unrolled shape: 2, (2, 2), 2, (2, 2)
+    /**
+     * \brief Iterate over the nested tuples in the shape.
+     * Unroll nested tuples to align Tuple<ShapeDims...> to Tuple<IdxDims...>
+     * Example idx: (1, 1), 1, 1
+     * Example shape: (2, (2, 2)), 2, (2, 2)
+     * Unrolled shape: 2, (2, 2), 2, (2, 2)
+     *
+     * \param shape Layout shape.
+     * \param idx Idx to align.
+     * \return Aligned shape.
+     */
     template <typename... ShapeDims, typename... IdxDims>
     __host__ __device__ constexpr static auto AlignShapeToIdx(const Tuple<ShapeDims...>& shape,
                                                               const Tuple<IdxDims...>& idx)
@@ -126,6 +145,13 @@ struct Layout
         }
     }

+    /**
+     * \brief Merge descriptor to 1D.
+     *
+     * \param shape Layout shape.
+     * \param desc Descriptor to merge.
+     * \return 1D descriptor.
+     */
     template <typename... ShapeDims, typename DescriptorToMerge>
     __host__ __device__ constexpr static auto MakeMerge1d(const Tuple<ShapeDims...>& shape,
                                                           const DescriptorToMerge& desc)
@@ -137,18 +163,41 @@ struct Layout
         const auto lower_dims = make_tuple(MergeElemsSequence::Reverse());
         const auto upper_dims = make_tuple(Sequence<0>{});
         // Merge to 1d
-        return transform_tensor_descriptor(
-            desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims);
+        if constexpr(!remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime())
+        {
+            return transform_tensor_descriptor(
+                desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims);
+        }
+        else
+        {
+            // If the descriptor is known at the compilation time,
+            // use `make_merge_transform_v1_carry_check` because it doesn't use
+            // memcpy.
+            return transform_tensor_descriptor(
+                desc,
+                make_tuple(make_merge_transform_v1_carry_check(merge_elems)),
+                lower_dims,
+                upper_dims);
+        }
     }

-    // Merge nested shape dims when corresponding index is also nested.
-    // Input desc shape: 2, 2, 2, 2, 2, 2
-    // Example idx: 1, 1, 1, 1
-    // Example shape: 2, (2, 2), 2, (2, 2)
-    // Merged shape: 2, 4, 2, 4
+    /**
+     * \brief Merge nested shape dims when corresponding index is also merged.
+     * Input desc shape: 2, 2, 2, 2, 2, 2
+     * Example idx: 1, 1, 1, (1, 1)
+     * Example shape: 2, (2, 2), 2, (2, 2)
+     * Merged shape: 2, 4, 2, 2, 2
+     *
+     * \param shape Layout shape.
+     * \param idxs Indexes to align descriptor.
+     * \param desc Descriptor to merge.
+     * \return Aligned descriptor to idx.
+     */
     template <typename... ShapeDims, typename... IdxDims, typename DescriptorToMerge>
-    __host__ __device__ constexpr static auto CreateMergedDescriptor(
-        const Tuple<ShapeDims...>& shape, const Tuple<IdxDims...>&, DescriptorToMerge& desc)
+    __host__ __device__ constexpr static auto
+    CreateMergedDescriptor(const Tuple<ShapeDims...>& shape,
+                           [[maybe_unused]] const Tuple<IdxDims...>& idxs,
+                           DescriptorToMerge& desc)
     {
         const auto transforms = generate_tuple(
             [&](auto i) {
@@ -160,7 +209,17 @@ struct Layout
                     // If shape element is tuple and idx element is Number, then merge
                     // Unroll and reverse tuple to traverse column-major
                     const auto merge_elems = TupleReverse(UnrollNestedTuple(shape.At(i)));
-                    return make_merge_transform(merge_elems);
+                    if constexpr(!remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime())
+                    {
+                        return make_merge_transform(merge_elems);
+                    }
+                    else
+                    {
+                        // If the descriptor is known at the compilation time,
+                        // use `make_merge_transform_v1_carry_check` because
+                        // it doesn't use memcpy.
+                        return make_merge_transform_v1_carry_check(merge_elems);
+                    }
                 }
                 else
                 {
@@ -185,14 +244,23 @@ struct Layout
     }

     using Descriptor1dType =
-        remove_cvref_t<decltype(MakeMerge1d(Shape{}, UnnestedDescriptorType{}))>;
+        remove_cvref_t<decltype(MakeMerge1d(Shape{}, UnrolledDescriptorType{}))>;
     using DefaultIdxsTupleType = remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>;

     public:
+    /**
+     * \brief Transform descriptor to align to passed indexes.
+     *
+     * \param shape Layout shape.
+     * \param idxs Indexes to align descriptor.
+     * \param naive_descriptor Descriptor to merge.
+     * \return Aligned descriptor to idx.
+     */
     template <typename... ShapeDims, typename... IdxDims>
     __host__ __device__ constexpr static auto
     TransformDesc(const Tuple<ShapeDims...>& shape,
-                  const Tuple<IdxDims...>& idx,
-                  const UnnestedDescriptorType& naive_descriptor)
+                  const Tuple<IdxDims...>& idxs,
+                  const UnrolledDescriptorType& naive_descriptor)
     {
         if constexpr(Tuple<IdxDims...>::Size() == I1)
         {
@@ -208,19 +276,18 @@ struct Layout
             static_assert(Tuple<ShapeDims...>::Size() == Tuple<IdxDims...>::Size(),
                           "Idx rank and Shape rank must be the same (except 1d).");
             // Unroll while IdxDims is nested
-            const auto aligned_shape = AlignShapeToIdx(shape, idx);
+            const auto aligned_shape = AlignShapeToIdx(shape, idxs);
             // Transform correct form of shape
-            return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idx), naive_descriptor);
+            return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idxs), naive_descriptor);
         }
     }

     using MergedNestsDescriptorType = remove_cvref_t<decltype(TransformDesc(
-        Shape{}, DefaultIdxsTupleType{}, UnnestedDescriptorType{}))>;
+        Shape{}, DefaultIdxsTupleType{}, UnrolledDescriptorType{}))>;

     public:
     __host__ __device__ constexpr auto GetElementSpaceSize() const
     {
-        return unnested_descriptor_.GetElementSpaceSize();
+        return unrolled_descriptor_.GetElementSpaceSize();
     }

     __host__ __device__ Layout() = delete;
@@ -232,16 +299,15 @@ struct Layout
     * \param unnested_descriptor Descriptor
     */
     __host__ __device__ constexpr Layout(const Shape& shape,
-                                         const UnnestedDescriptorType& unnested_descriptor)
-        : shape_(shape)
+                                         const UnrolledDescriptorType& unnested_descriptor)
+        : unrolled_descriptor_(unnested_descriptor), shape_(shape)
     {
         // Construct if runtime mode
-        if constexpr(!UnnestedDescriptorType::IsKnownAtCompileTime())
+        if constexpr(!remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime())
         {
-            unnested_descriptor_ = unnested_descriptor;
-            descriptor_1d_       = MakeMerge1d(shape_, unnested_descriptor_);
+            descriptor_1d_ = MakeMerge1d(shape_, unrolled_descriptor_);
             merged_nests_descriptor_ =
-                TransformDesc(shape_, DefaultIdxsTupleType{}, unnested_descriptor_);
+                TransformDesc(shape_, DefaultIdxsTupleType{}, unrolled_descriptor_);
         }
     }

@@ -254,9 +320,9 @@ struct Layout
     template <typename Idxs>
     __host__ __device__ constexpr index_t operator()() const
     {
-        static_assert(UnnestedDescriptorType::IsKnownAtCompileTime(),
+        static_assert(remove_cvref_t<UnrolledDescriptorType>::IsKnownAtCompileTime(),
                       "Compiletime operator used on runtime layout.");
-        using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, UnnestedDescriptorType{}));
+        using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, UnrolledDescriptorType{}));
         using UnrolledIdx = decltype(UnrollNestedTuple(Idxs{}));
         return TransformedDesc{}.CalculateOffset(UnrolledIdx{});
     }
@@ -283,7 +349,7 @@ struct Layout
         else
         {
             // Custom index, need to transform descriptor
-            const auto transformed_desc = TransformDesc(shape_, Idx, unnested_descriptor_);
+            const auto transformed_desc = TransformDesc(shape_, Idx, unrolled_descriptor_);
             return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx));
         }
     }
@@ -350,29 +416,55 @@ struct Layout
     }

     /**
-     * \brief Get default descriptor (with the same size as Shape).
+     * \brief Get descriptor with all nested dimensions merged.
+     * Example, shape: ((2, 2), 2)
+     * Descriptor lengths: (4, 2)
     *
-     * \return Default descriptor.
+     * \note The size of the merged descriptor is the same as the Layout's shape.
+     *
+     * \return Merged nests descriptor.
     */
-    __host__ __device__ constexpr const MergedNestsDescriptorType& GetDefaultDescriptor() const
+    __host__ __device__ constexpr const MergedNestsDescriptorType&
+    GetMergedNestingDescriptor() const
     {
         return merged_nests_descriptor_;
     }

     /**
-     * \brief Get unnested descriptor (with unrolled dims).
+     * \brief Get descriptor with all dimensions merged (1D).
+     * Example, shape: ((2, 2), 2)
+     * Descriptor lengths: (8)
     *
-     * \return Flatten descriptor.
+     * \return 1D descriptor.
     */
-    __host__ __device__ constexpr const UnnestedDescriptorType& GetUnnestedDescriptor() const
+    __host__ __device__ constexpr const Descriptor1dType& Get1DDescriptor() const
     {
-        return unnested_descriptor_;
+        return descriptor_1d_;
     }

+    /**
+     * \brief Get unnested descriptor (with unrolled dims).
+     * Example, shape: ((2, 2), 2)
+     * Descriptor lengths: (2, 2, 2)
+     *
+     * \return Flattened descriptor.
+     */
+    __host__ __device__ constexpr const UnrolledDescriptorType& GetUnrolledDescriptor() const
+    {
+        return unrolled_descriptor_;
+    }
+
     private:
-    UnnestedDescriptorType unnested_descriptor_;
+    // All dimensions are unrolled
+    UnrolledDescriptorType unrolled_descriptor_;
+    // 1D descriptor
+    Descriptor1dType descriptor_1d_;
+    // All nesting is merged
+    MergedNestsDescriptorType merged_nests_descriptor_;
+    // Example, shape: ((2, 2), 2)
+    // UnrolledDescriptorType lengths: (2, 2, 2)
+    // Descriptor1dType lengths: (8)
+    // MergedNestsDescriptorType lengths: (4, 2)
     const Shape shape_;
 };
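To make the renamed getters concrete, a small sketch (assumed usage; the getter names and example lengths come from the comments above) for a layout with shape ((2, 2), 2):

// Hypothetical illustration of the three descriptor views of one Layout.
const auto layout    = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2),
                                                ck::make_tuple(ck::make_tuple(1, 2), 4));
const auto& unrolled = layout.GetUnrolledDescriptor();      // lengths (2, 2, 2)
const auto& flat     = layout.Get1DDescriptor();            // lengths (8)
const auto& merged   = layout.GetMergedNestingDescriptor(); // lengths (4, 2)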
@@ -1,16 +1,21 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

 #include "../utils/tensor_utils.hpp"

 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 namespace ck {
 namespace wrapper {

 /**
- * \brief Perform generic copy between two tensors. Tensors must have the
- * same size.
+ * \brief Perform generic copy between two tensor partitions (threadwise copy).
+ * Tensors must have the same size.
 *
 * \param src_tensor Source tensor.
 * \param dst_tensor Destination tensor.
@@ -37,5 +42,134 @@ __host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
     }
 }

+/**
+ * \brief Perform optimized copy between two tensor partitions (threadwise
+ * copy). Tensors must have the same size.
+ *
+ * \tparam DimAccessOrderTuple Tuple with dimension access order.
+ * \tparam VectorDim Dimension for vectorized read and write.
+ * \tparam ScalarPerVector Number of scalars per vectorized read and write.
+ * \param src_tensor Source tensor.
+ * \param dst_tensor Destination tensor.
+ */
+template <typename DimAccessOrderTuple,
+          index_t VectorDim,
+          index_t ScalarPerVector,
+          typename SrcTensorType,
+          typename DstTensorType>
+__device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
+{
+    static_assert(is_detected<is_tuple, DimAccessOrderTuple>::value);
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    const auto& in_grid_desc  = layout(src_tensor).GetUnrolledDescriptor();
+    const auto& out_grid_desc = layout(dst_tensor).GetUnrolledDescriptor();
+
+    using SrcShapeType         = remove_cvref_t<decltype(shape(src_tensor))>;
+    constexpr index_t num_dims = SrcShapeType::Size();
+
+    constexpr auto thread_slice_lengths =
+        generate_sequence_v2([](auto I) { return size(SrcShapeType{}.At(I)); }, Number<num_dims>{});
+    constexpr auto dim_access_order = generate_sequence_v2(
+        [](auto I) { return DimAccessOrderTuple{}.At(I); }, Number<num_dims>{});
+
+    if constexpr(SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
+    {
+        // Perform a copy between DynamicBuffers
+        auto transfer = ThreadwiseTensorSliceTransfer_v7<
+            Tuple<typename SrcTensorType::TensorElementType>,
+            Tuple<typename DstTensorType::TensorElementType>,
+            decltype(tie(in_grid_desc)),
+            decltype(tie(out_grid_desc)),
+            tensor_operation::element_wise::PassThrough,
+            Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
+            decltype(thread_slice_lengths),
+            decltype(dim_access_order),
+            VectorDim,
+            ScalarPerVector,
+            Sequence<false>,
+            Sequence<false>>{in_grid_desc,
+                             make_tuple(src_tensor.GetMultiIdxOffsets()),
+                             out_grid_desc,
+                             make_tuple(dst_tensor.GetMultiIdxOffsets()),
+                             tensor_operation::element_wise::PassThrough{}};
+
+        transfer.Run(tie(in_grid_desc),
+                     tie(src_tensor.GetBuffer()),
+                     tie(out_grid_desc),
+                     tie(dst_tensor.GetBuffer()));
+    }
+    else if constexpr(!SrcTensorType::IsDynamicBuffer && DstTensorType::IsDynamicBuffer)
+    {
+        // Perform copy from StaticBuffer to DynamicBuffer
+        const auto src_slice_origin_idxs =
+            generate_tuple([&](auto) { return I0; }, Number<num_dims>{});
+
+        auto transfer =
+            ThreadwiseTensorSliceTransfer_v1r3<typename SrcTensorType::TensorElementType,
+                                               typename DstTensorType::TensorElementType,
+                                               remove_cvref_t<decltype(in_grid_desc)>,
+                                               remove_cvref_t<decltype(out_grid_desc)>,
+                                               tensor_operation::element_wise::PassThrough,
+                                               decltype(thread_slice_lengths),
+                                               decltype(dim_access_order),
+                                               VectorDim,
+                                               ScalarPerVector,
+                                               InMemoryDataOperationEnum::Set,
+                                               I1,
+                                               true>{out_grid_desc,
+                                                     dst_tensor.GetMultiIdxOffsets(),
+                                                     tensor_operation::element_wise::PassThrough{}};
+
+        transfer.Run(in_grid_desc,
+                     src_slice_origin_idxs,
+                     src_tensor.GetBuffer(),
+                     out_grid_desc,
+                     dst_tensor.GetBuffer());
+    }
+    else if constexpr(SrcTensorType::IsDynamicBuffer && !DstTensorType::IsDynamicBuffer)
+    {
+        // Perform copy from DynamicBuffer to StaticBuffer
+        const auto src_dst_slice_origin =
+            generate_tuple([&](auto) { return I0; }, Number<num_dims>{});
+        constexpr auto src_vector_tensor_lengths = generate_sequence_v2(
+            [&](auto I) {
+                if constexpr(I == VectorDim)
+                {
+                    return Number<ScalarPerVector>{};
+                }
+                else
+                {
+                    return I1;
+                }
+            },
+            Number<num_dims>{});
+
+        auto transfer =
+            ThreadwiseTensorSliceTransfer_v4r1<typename SrcTensorType::TensorElementType,
+                                               typename DstTensorType::TensorElementType,
+                                               remove_cvref_t<decltype(in_grid_desc)>,
+                                               remove_cvref_t<decltype(out_grid_desc)>,
+                                               decltype(thread_slice_lengths),
+                                               decltype(dim_access_order),
+                                               decltype(src_vector_tensor_lengths),
+                                               decltype(dim_access_order)>{
+                src_tensor.GetMultiIdxOffsets()};
+
+        transfer.Run(in_grid_desc,
+                     src_dst_slice_origin,
+                     src_tensor.GetBuffer(),
+                     out_grid_desc,
+                     src_dst_slice_origin,
+                     dst_tensor.GetBuffer());
+    }
+    else
+    {
+        // Perform copy between StaticBuffers
+        copy(src_tensor, dst_tensor);
+    }
+}
+
 } // namespace wrapper
 } // namespace ck
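A condensed usage sketch of the new optimized overload, mirroring the wrapper_img2col.cpp kernel earlier in this commit (the partitions come from make_local_partition; the divisibility note is an assumption, not documented here):

// src_partition/dst_partition: thread-level partitions of equal size.
using DimAccessOrder                    = ck::Tuple<ck::Number<0>, ck::Number<1>>;
constexpr ck::index_t vector_dim        = 1; // vectorize the innermost dimension
constexpr ck::index_t scalar_per_vector = 4; // float -> 16-byte accesses; the vectorized
                                             // dim should presumably be divisible by this
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(src_partition, dst_partition);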
@@ -10,189 +10,205 @@
|
||||
namespace ck {
|
||||
namespace wrapper {
|
||||
|
||||
namespace detail {
|
||||
namespace {
|
||||
/**
|
||||
* \brief Check if Tuple contains Slice object
|
||||
*
|
||||
* \return True if tuple contains Slice object.
|
||||
*/
|
||||
template <typename T>
|
||||
__host__ __device__ constexpr bool HasSlice(T&&)
|
||||
{
|
||||
return is_detected<is_slice, T>::value;
|
||||
}
|
||||
template <typename... Ts>
|
||||
__host__ __device__ constexpr bool HasSlice(Tuple<Ts...>&&)
|
||||
{
|
||||
return (HasSlice(Ts{}) || ...);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Calculate new shape after slice from parent shape.
|
||||
*
|
||||
* \param idxs Tuple of indexes defining slice ranges.
|
||||
* \param shape Shape which will be sliced.
|
||||
* \return New tensor shape.
|
||||
*/
|
||||
template <typename... Ts, typename SlicedShape>
|
||||
__host__ __device__ constexpr auto GetSlicedShape(const Tuple<Ts...>& idxs,
|
||||
const SlicedShape& shape)
|
||||
{
|
||||
// Pack each value in tuple to remove empty tuples after generation
|
||||
auto new_shape = generate_tuple(
|
||||
[&](auto i) {
|
||||
constexpr auto num_i = Number<i>{};
|
||||
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
|
||||
{
|
||||
if constexpr(!detail::HasSlice(tuple_element_t<i.value, Tuple<Ts...>>{}))
|
||||
{
|
||||
// if tuple does not have any slice then we can remove dimension
|
||||
return Tuple<>{};
|
||||
}
|
||||
else
|
||||
{
|
||||
// if tuple then recurrence
|
||||
return make_tuple(GetSlicedShape(idxs.At(num_i), shape.At(num_i)));
|
||||
}
|
||||
}
|
||||
else if constexpr(is_detected<is_slice, tuple_element_t<i.value, Tuple<Ts...>>>::value)
|
||||
{
|
||||
// calculate new dimension
|
||||
const auto& dim = size(shape.At(num_i));
|
||||
const auto val = idxs.At(num_i).range(dim);
|
||||
return make_tuple(val);
|
||||
}
|
||||
else
|
||||
{
|
||||
// remove dimension for just value
|
||||
return Tuple<>{};
|
||||
}
|
||||
},
|
||||
Number<Tuple<Ts...>::Size()>{});
|
||||
// Remove empty tuples (deleted elements) and return
|
||||
return UnrollNestedTuple<0, 1>(new_shape);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Generate Freeze for each of nested shape.
|
||||
*
|
||||
* \param idx Tuple of start indices for slice.
|
||||
* \param shape Shape which will be freezed.
|
||||
* \return Generated freeze transforms.
|
||||
*/
|
||||
template <typename T, typename Shape>
|
||||
__host__ __device__ constexpr auto GenerateMultipleFreeze(T idx, const Shape& shape)
|
||||
{
|
||||
const auto unrolled_shape = UnrollNestedTuple(shape);
|
||||
return generate_tuple(
|
||||
[&](auto i) {
|
||||
// dimension offset from idx
|
||||
const auto dim = unrolled_shape.At(Number<i>{});
|
||||
const auto dim_idx = idx % dim;
|
||||
idx /= dim;
|
||||
return make_freeze_transform(dim_idx);
|
||||
},
|
||||
Number<decltype(unrolled_shape)::Size()>{});
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Generate transforms for slice tensor.
|
||||
*
|
||||
* \param idx Tuple of start indices for slice.
|
||||
* \param shape Shape which will be sliced.
|
||||
* \return Generated transforms.
|
||||
*/
|
||||
template <typename... Ts, typename Shape>
|
||||
__host__ __device__ constexpr auto GenerateSliceTransforms(const Tuple<Ts...>& idx,
|
||||
const Shape& shape)
|
||||
{
|
||||
// Pack each value in tuple to remove empty tuples after generation
|
||||
auto transforms = generate_tuple(
|
||||
[&](auto i) {
|
||||
constexpr auto num_i = Number<i>{};
|
||||
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
|
||||
{
|
||||
return GenerateSliceTransforms(idx.At(num_i), shape.At(num_i));
|
||||
}
|
||||
else if constexpr(is_detected<is_slice, tuple_element_t<i.value, Tuple<Ts...>>>::value)
|
||||
{
|
||||
|
||||
const auto from = idx.At(num_i).from_;
|
||||
const auto dim = size<num_i>(shape);
|
||||
const auto range = idx.At(num_i).range(dim);
|
||||
return make_slice_transform(range, from, from + range);
|
||||
}
|
||||
else
|
||||
{
|
||||
// remove dimension for just value
|
||||
return GenerateMultipleFreeze(idx.At(num_i), shape.At(num_i));
|
||||
}
|
||||
},
|
||||
Number<Tuple<Ts...>::Size()>{});
|
||||
// Remove empty tuples (deleted elements) and return
|
||||
return UnrollNestedTuple(transforms);
|
||||
}
|
||||
|
||||
template <index_t i, typename LowerIndex>
|
||||
__host__ __device__ constexpr auto GetSequenceVal(const ck::Freeze<LowerIndex>&)
|
||||
{
|
||||
// There is no output for Freeze transform
|
||||
return Sequence<>{};
|
||||
}
|
||||
|
||||
template <index_t i, typename LowLength, typename SliceBegin, typename SliceEnd>
|
||||
__host__ __device__ constexpr auto GetSequenceVal(const ck::Slice<LowLength, SliceBegin, SliceEnd>&)
|
||||
{
|
||||
return Sequence<i>{};
|
||||
}
|
||||
|
||||
template <index_t i>
|
||||
__host__ __device__ constexpr auto GenerateUpperDims(const Tuple<>&)
|
||||
{
|
||||
return Tuple<>{};
|
||||
}
|
||||
|
||||
template <index_t i, typename... Transforms>
|
||||
__host__ __device__ constexpr auto GenerateUpperDims(const Tuple<Transforms...>& transforms)
|
||||
{
|
||||
constexpr auto num_transforms = Tuple<Transforms...>::Size();
|
||||
// Deduce Sequence element for specific transform
|
||||
const auto current_elem = GetSequenceVal<i>(transforms.At(Number<0>{}));
|
||||
if constexpr(is_same_v<decltype(current_elem), const Sequence<>>)
|
||||
{
|
||||
const auto next_tuple = GenerateUpperDims<i>(TupleSlice<1, num_transforms>(transforms));
|
||||
return concat_tuple(make_tuple(current_elem), next_tuple);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Increase i if current_elem is Slice transform
|
||||
const auto next_tuple = GenerateUpperDims<i + 1>(TupleSlice<1, num_transforms>(transforms));
|
||||
return concat_tuple(make_tuple(current_elem), next_tuple);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... Ts, typename Shape, typename FlattenDescriptor>
|
||||
__host__ __device__ constexpr auto GenerateSlicedDescriptor(const Tuple<Ts...>& idx,
|
||||
const Shape& shape,
|
||||
const FlattenDescriptor& flatten_desc)
|
||||
{
|
||||
constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size();
|
||||
|
||||
const auto transforms = GenerateSliceTransforms(idx, shape);
|
||||
using TransformsTupleType = decltype(transforms);
|
||||
|
||||
const auto lower_dims =
|
||||
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<old_shape_dims>{});
|
||||
const auto upper_dims = decltype(GenerateUpperDims<0>(TransformsTupleType{})){};
|
||||
return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
|
||||
}
|
||||
} // namespace
|
||||
} // namespace detail
|
||||
|
||||
/**
|
||||
* \brief Tensor wrapper that performs static and dynamic buffer logic.
|
||||
* The tensor is based on a descriptor stored in the Layout. Additionally,
|
||||
* tensor can be sliced or shifted using multi-index offset.
|
||||
*
|
||||
* \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR).
|
||||
* \tparam ElementType Element data type.
|
||||
* \tparam Shape Tensor shape (layout component).
|
||||
* \tparam UnnestedDescriptorType Unnested descriptor (layout component).
|
||||
* \tparam NumVectors Number of vectors (only for VGPR, SGPR).
|
||||
* \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR).
|
||||
* \tparam UnrolledDescriptorType Flatten descriptor (layout component).
|
||||
*/
|
||||
template <MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename UnnestedDescriptorType,
|
||||
index_t NumVectors, // param for Register memory
|
||||
index_t ScalarPerVector // param for Register memory
|
||||
>
|
||||
typename UnrolledDescriptorType>
|
||||
struct Tensor
|
||||
{
|
||||
private:
|
||||
// Check if Tuple contains Slice object
|
||||
template <typename T>
|
||||
__host__ __device__ constexpr static bool IsSlicing(T&&)
|
||||
{
|
||||
return is_detected<is_slice, T>::value;
|
||||
}
|
||||
template <typename... Ts>
|
||||
__host__ __device__ constexpr static bool IsSlicing(Tuple<Ts...>&&)
|
||||
{
|
||||
return (IsSlicing(Ts{}) || ...);
|
||||
}
|
||||
|
||||
// Calculate new tensor shape after slice
|
||||
template <typename... Ts, typename ShapeTmpType>
|
||||
__host__ __device__ constexpr auto GetShapeFromSlicedTensor(const Tuple<Ts...>& idx,
|
||||
const ShapeTmpType& shape) const
|
||||
{
|
||||
// Pack each value in tuple to remove empty tuples after generation
|
||||
auto new_shape = generate_tuple(
|
||||
[&](auto i) {
|
||||
constexpr auto num_i = Number<i>{};
|
||||
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
|
||||
{
|
||||
if constexpr(!IsSlicing(tuple_element_t<i.value, Tuple<Ts...>>{}))
|
||||
{
|
||||
// if tuple does not have any slice then we can remove dimension
|
||||
return Tuple<>{};
|
||||
}
|
||||
else
|
||||
{
|
||||
// if tuple then recurrence
|
||||
return make_tuple(GetShapeFromSlicedTensor(idx.At(num_i), shape.At(num_i)));
|
||||
}
|
||||
}
|
||||
else if constexpr(is_detected<is_slice,
|
||||
tuple_element_t<i.value, Tuple<Ts...>>>::value)
|
||||
{
|
||||
// calculate new dimension
|
||||
const auto& dim = size(shape.At(num_i));
|
||||
const auto val = idx.At(num_i).range(dim);
|
||||
return make_tuple(val);
|
||||
}
|
||||
else
|
||||
{
|
||||
// remove dimension for just value
|
||||
return Tuple<>{};
|
||||
}
|
||||
},
|
||||
Number<Tuple<Ts...>::Size()>{});
|
||||
// Remove empty tuples (deleted elements) and return
|
||||
return UnrollNestedTuple<0, 1>(new_shape);
|
||||
}
|
||||
|
||||
// Generate Freeze for each of nested shape
|
||||
template <typename T, typename ShapeTmpType>
|
||||
__host__ __device__ constexpr auto GenerateMultipleFreeze(T idx,
|
||||
const ShapeTmpType& shape) const
|
||||
{
|
||||
const auto unrolled_shape = UnrollNestedTuple(shape);
|
||||
return generate_tuple(
|
||||
[&](auto i) {
|
||||
// dimension offset from idx
|
||||
const auto dim = unrolled_shape.At(Number<i>{});
|
||||
const auto dim_idx = idx % dim;
|
||||
idx /= dim;
|
||||
return make_freeze_transform(dim_idx);
|
||||
},
|
||||
Number<decltype(unrolled_shape)::Size()>{});
|
||||
}
|
||||
|
||||
template <typename... Ts, typename ShapeTmpType>
|
||||
__host__ __device__ constexpr auto
|
||||
GetTransformsFromSlicedTensor(const Tuple<Ts...>& idx, const ShapeTmpType& shape) const
|
||||
{
|
||||
// Pack each value in tuple to remove empty tuples after generation
|
||||
auto transforms = generate_tuple(
|
||||
[&](auto i) {
|
||||
constexpr auto num_i = Number<i>{};
|
||||
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
|
||||
{
|
||||
return GetTransformsFromSlicedTensor(idx.At(num_i), shape.At(num_i));
|
||||
}
|
||||
else if constexpr(is_detected<is_slice,
|
||||
tuple_element_t<i.value, Tuple<Ts...>>>::value)
|
||||
{
|
||||
|
||||
const auto from = idx.At(num_i).from_;
|
||||
const auto dim = shape.At(num_i);
|
||||
const auto range = idx.At(num_i).range(dim);
|
||||
return make_slice_transform(range, from, from + range);
|
||||
}
|
||||
else
|
||||
{
|
||||
// remove dimension for just value
|
||||
return GenerateMultipleFreeze(idx.At(num_i), shape.At(num_i));
|
||||
}
|
||||
},
|
||||
Number<Tuple<Ts...>::Size()>{});
|
||||
// Remove empty tuples (deleted elements) and return
|
||||
return UnrollNestedTuple(transforms);
|
||||
}
|
||||
|
||||
// There is no output for Freeze transform
|
||||
template <index_t i, typename LowerIndex>
|
||||
__host__ __device__ constexpr auto GetSequenceVal(const ck::Freeze<LowerIndex>&) const
|
||||
{
|
||||
return Sequence<>{};
|
||||
}
|
||||
|
||||
template <index_t i, typename LowLength, typename SliceBegin, typename SliceEnd>
|
||||
__host__ __device__ constexpr auto
|
||||
GetSequenceVal(const ck::Slice<LowLength, SliceBegin, SliceEnd>&) const
|
||||
{
|
||||
return Sequence<i>{};
|
||||
}
|
||||
|
||||
template <index_t i>
|
||||
__host__ __device__ constexpr auto GenerateUpperDims(const Tuple<>&) const
|
||||
{
|
||||
return Tuple<>{};
|
||||
}
|
||||
|
||||
template <index_t i, typename... Transforms>
|
||||
__host__ __device__ constexpr auto
|
||||
GenerateUpperDims(const Tuple<Transforms...>& transforms) const
|
||||
{
|
||||
constexpr auto num_transforms = Tuple<Transforms...>::Size();
|
||||
// Deduce Sequence element for specific transform
|
||||
const auto currect_elem = GetSequenceVal<i>(transforms.At(Number<0>{}));
|
||||
if constexpr(is_same_v<decltype(currect_elem), const Sequence<>>)
|
||||
{
|
||||
const auto next_tuple = GenerateUpperDims<i>(TupleSlice<1, num_transforms>(transforms));
|
||||
return concat_tuple(make_tuple(currect_elem), next_tuple);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Increase i if current_elem is Slice transform
|
||||
const auto next_tuple =
|
||||
GenerateUpperDims<i + 1>(TupleSlice<1, num_transforms>(transforms));
|
||||
return concat_tuple(make_tuple(currect_elem), next_tuple);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... Ts, typename ShapeTmpType, typename FlattenDescriptor>
|
||||
__host__ __device__ constexpr auto
|
||||
GetDescriptorFromSlicedTensor(const Tuple<Ts...>& idx,
|
||||
const ShapeTmpType& shape,
|
||||
const FlattenDescriptor& flatten_desc) const
|
||||
{
|
||||
constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size();
|
||||
|
||||
const auto transforms = GetTransformsFromSlicedTensor(idx, shape);
|
||||
using TransformsTupleType = decltype(transforms);
|
||||
|
||||
const auto lower_dims =
|
||||
generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<old_shape_dims>{});
|
||||
const auto upper_dims = decltype(GenerateUpperDims<0>(TransformsTupleType{})){};
|
||||
return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
|
||||
}
|
||||
|
||||
public:
|
||||
using ElementSpaceSize = decltype(Layout<Shape, UnnestedDescriptorType>{
|
||||
Shape{}, UnnestedDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer
|
||||
using ElementSpaceSize = decltype(Layout<Shape, UnrolledDescriptorType>{
|
||||
Shape{}, UnrolledDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer
|
||||
using TensorElementType = ElementType; // DataType
|
||||
|
||||
static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace;
|
||||
@@ -200,134 +216,207 @@ struct Tensor
|
||||
BufferAddressSpace == MemoryTypeEnum ::Vgpr);
|
||||
|
||||
__host__ __device__ Tensor() = delete;
|
||||
__host__ __device__ Tensor(ElementType* pointer,
|
||||
const Layout<Shape, UnnestedDescriptorType>& layout)
|
||||
__host__ __device__ constexpr Tensor(ElementType* pointer,
|
||||
const Layout<Shape, UnrolledDescriptorType>& layout)
|
||||
: layout_(layout),
|
||||
buffer_(make_dynamic_buffer<BufferAddressSpace>(pointer, layout.GetElementSpaceSize()))
|
||||
buffer_(make_dynamic_buffer<BufferAddressSpace>(pointer, layout.GetElementSpaceSize())),
|
||||
multi_idx_offset_(make_zero_multi_index<Shape::Size()>()),
|
||||
base_offset_(0)
|
||||
{
|
||||
static_assert(IsDynamicBuffer, "Wrong BufferAddressSpace for register.");
|
||||
}
|
||||
|
||||
__host__ __device__ Tensor(const Layout<Shape, UnnestedDescriptorType>& layout)
|
||||
: layout_(layout)
|
||||
__host__ __device__ constexpr Tensor(const Layout<Shape, UnrolledDescriptorType>& layout)
|
||||
: layout_(layout),
|
||||
multi_idx_offset_(make_zero_multi_index<Shape::Size()>()),
|
||||
base_offset_(0)
|
||||
{
|
||||
static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register.");
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr const Layout<Shape, UnnestedDescriptorType>& GetLayout() const
|
||||
__host__ __device__ constexpr const Layout<Shape, UnrolledDescriptorType>& GetLayout() const
|
||||
{
|
||||
return layout_;
|
||||
}
|
||||
|
||||
// Getter for new sliced tensor
|
||||
template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false>
|
||||
__host__ __device__ auto operator[](const Tuple<Ts...>& idx) const
|
||||
/**
|
||||
* \brief Get the new sliced tensor.
|
||||
*
|
||||
* \param idx Tuple of indices: slice(from,to) or scalar.
|
||||
* \return Sliced tensor.
|
||||
*/
|
||||
template <typename... Ts, enable_if_t<detail::HasSlice(Tuple<Ts...>{}), bool> = false>
|
||||
__host__ __device__ auto operator[](const Tuple<Ts...>& idx)
|
||||
{
|
||||
static_assert(IsDynamicBuffer, "Register slice is not supported");
|
||||
const auto& shape = layout_.GetShape();
|
||||
auto new_shape = GetShapeFromSlicedTensor(idx, shape);
|
||||
auto new_shape = detail::GetSlicedShape(idx, shape);
|
||||
|
||||
const auto& flatten_desc = layout_.GetUnnestedDescriptor();
|
||||
auto new_desc = GetDescriptorFromSlicedTensor(idx, shape, flatten_desc);
|
||||
const auto& flatten_desc = layout_.GetUnrolledDescriptor();
|
||||
auto new_desc = detail::GenerateSlicedDescriptor(idx, shape, flatten_desc);
|
||||
const auto new_layout =
|
||||
Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc);
|
||||
// Update embed offset
|
||||
base_offset_ -= new_layout(make_tuple(Number<0>{}));
|
||||
return make_tensor<BufferAddressSpace>(buffer_.p_data_, new_layout);
|
||||
}
|
||||
|
||||
template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false>
|
||||
__host__ __device__ auto operator()(const Tuple<Ts...>& idx) const
|
||||
template <typename... Ts, enable_if_t<detail::HasSlice(Tuple<Ts...>{}), bool> = false>
|
||||
__host__ __device__ auto operator()(const Tuple<Ts...>& idx)
|
||||
{
|
||||
return this->operator[](idx);
|
||||
}
|
||||
|
||||
template <typename... Idxs, enable_if_t<IsSlicing(Tuple<Idxs...>{}), bool> = false>
|
||||
__host__ __device__ auto operator()(Idxs... idxs) const
|
||||
template <typename... Idxs, enable_if_t<detail::HasSlice(Tuple<Idxs...>{}), bool> = false>
|
||||
__host__ __device__ auto operator()(Idxs... idxs)
|
||||
{
|
||||
return this->operator[](make_tuple(idxs...));
|
||||
}

    // Getter for the const value
    template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
    /**
     * \brief Getter of the tensor's const value reference.
     *
     * \param idx Tuple of indices.
     * \return Requested value.
     */
    template <typename... Ts, enable_if_t<!detail::HasSlice(Tuple<Ts...>{}), bool> = false>
    __host__ __device__ const ElementType& operator[](const Tuple<Ts...>& idx) const
    {
        if constexpr(IsDynamicBuffer)
        {
            const index_t offset = layout_(idx);
            const index_t offset = layout_(idx) + base_offset_;
            return buffer_[offset];
        }
        else
        {
            constexpr index_t offset = Layout<Shape, UnnestedDescriptorType>{
            constexpr index_t index_offset = Layout<Shape, UnrolledDescriptorType>{
                Shape{},
                UnnestedDescriptorType{}}.template operator()<Tuple<Ts...>>();
            return buffer_[Number<offset>{}];
                UnrolledDescriptorType{}}.template operator()<Tuple<Ts...>>();
            // Calculate and apply the base offset at compile time
            constexpr index_t base_offset = Layout<Shape, UnrolledDescriptorType>{
                Shape{},
                UnrolledDescriptorType{}}.template operator()<MultiIndex<Shape::Size()>>();
            return buffer_[Number<index_offset + base_offset>{}];
        }
    }

    template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
    template <typename... Ts, enable_if_t<!detail::HasSlice(Tuple<Ts...>{}), bool> = false>
    __host__ __device__ const ElementType& operator()(const Tuple<Ts...>& idx) const
    {
        return this->operator[](idx);
    }

    template <typename... Idxs, enable_if_t<!IsSlicing(Tuple<Idxs...>{}), bool> = false>
    template <typename... Idxs, enable_if_t<!detail::HasSlice(Tuple<Idxs...>{}), bool> = false>
    __host__ __device__ const ElementType& operator()(Idxs... idxs) const
    {
        return this->operator[](make_tuple(idxs...));
    }

    // Getter for the value reference
    template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
    /**
     * \brief Getter of the tensor's value reference.
     *
     * \param idx Tuple of indices.
     * \return Requested value.
     */
    template <typename... Ts, enable_if_t<!detail::HasSlice(Tuple<Ts...>{}), bool> = false>
    __host__ __device__ ElementType& operator[](const Tuple<Ts...>& idx)
    {
        if constexpr(IsDynamicBuffer)
        {
            const index_t offset = layout_(idx);
            const index_t offset = layout_(idx) + base_offset_;
            return buffer_(offset);
        }
        else
        {
            constexpr index_t offset = Layout<Shape, UnnestedDescriptorType>{
            constexpr index_t index_offset = Layout<Shape, UnrolledDescriptorType>{
                Shape{},
                UnnestedDescriptorType{}}.template operator()<Tuple<Ts...>>();
            return buffer_(Number<offset>{});
                UnrolledDescriptorType{}}.template operator()<Tuple<Ts...>>();
            // Apply the embed offset (calculated at compile time)
            constexpr index_t base_offset = Layout<Shape, UnrolledDescriptorType>{
                Shape{},
                UnrolledDescriptorType{}}.template operator()<MultiIndex<Shape::Size()>>();
            return buffer_(Number<index_offset + base_offset>{});
        }
    }

    template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
    template <typename... Ts, enable_if_t<!detail::HasSlice(Tuple<Ts...>{}), bool> = false>
    __host__ __device__ ElementType& operator()(const Tuple<Ts...>& idx)
    {
        return this->operator[](idx);
    }

    template <typename... Idxs, enable_if_t<!IsSlicing(Tuple<Idxs...>{}), bool> = false>
    template <typename... Idxs, enable_if_t<!detail::HasSlice(Tuple<Idxs...>{}), bool> = false>
    __host__ __device__ ElementType& operator()(Idxs... idxs)
    {
        return this->operator[](make_tuple(idxs...));
    }

    __host__ __device__ constexpr auto GetDefaultDescriptor()
    /**
     * \brief Get the descriptor with all nested dimensions merged.
     *
     * \return Merged-nesting descriptor.
     */
    __host__ __device__ constexpr auto GetMergedNestingDescriptor()
    {
        return layout_.GetDefaultDescriptor();
        return layout_.GetMergedNestingDescriptor();
    }

    /**
     * \brief Get the pointer to the data.
     *
     * \return Pointer.
     */
    __host__ __device__ ElementType* GetPointer() const { return buffer_.p_data_; }

    __host__ __device__ constexpr auto& GetBuffer() { return buffer_; }
    __host__ __device__ constexpr auto& GetBuffer() const { return buffer_; }

    /**
     * \brief Get the multi-index offset to the data.
     *
     * \return Multi-index offset.
     */
    __host__ __device__ constexpr auto& GetMultiIdxOffsets() const { return multi_idx_offset_; }

    /**
     * \brief Apply a multi-index offset to the tensor.
     *
     * \param multi_idx_offset Multi-index offset.
     */
    template <typename MultiIdxOffsets>
    __host__ __device__ constexpr void SetMultiIdxOffset(const MultiIdxOffsets multi_idx_offset)
    {
        multi_idx_offset_ = multi_idx_offset;
        base_offset_ += layout_(multi_idx_offset);
    }
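    // Illustration (editor's sketch, not part of this change): SetMultiIdxOffset
    // folds a multi-dimensional offset into the linear base_offset_ once, so
    // later accesses only add a precomputed scalar. Assuming a packed
    // column-major 4x6 layout, i.e. strides (1, 4):
    //
    //   tensor.SetMultiIdxOffset(make_tuple(2, 3)); // base_offset_ += 2*1 + 3*4 = 14
    //   tensor(0, 0);  // now reads element (2, 3) of the base tensor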

    private:
    using DynamicBufferType = DynamicBuffer<BufferAddressSpace,
                                            ElementType,
                                            ElementSpaceSize,
                                            true /*InvalidElementUseNumericalZeroValue*/>;
    using StaticBufferType =
        StaticBufferTupleOfVector<BufferAddressSpace,
                                  ElementType,
                                  NumVectors,
                                  ScalarPerVector,
                                  true /*InvalidElementUseNumericalZeroValue*/>;
    using StaticBufferType = StaticBuffer<BufferAddressSpace,
                                          ElementType,
                                          size(Shape{}),
                                          true /*InvalidElementUseNumericalZeroValue*/>;
    // If the tensor lives in registers, use a static buffer; otherwise use a dynamic buffer
    using Buffer = std::conditional_t<IsDynamicBuffer, DynamicBufferType, StaticBufferType>;

    const Layout<Shape, UnnestedDescriptorType> layout_;
    const Layout<Shape, UnrolledDescriptorType> layout_;
    Buffer buffer_;
    // We use multi_idx_offset_ to enable the creation of a compile-time
    // descriptor for partitions or tiles when the tile shape and thread
    // layout are known at compile time (the same descriptor can then be used
    // for each thread). Additionally, the copy between the static and dynamic
    // buffer requires a descriptor known at compile time, so we can shift
    // data using such a multi_idx_offset_.
    MultiIndex<Shape::Size()> multi_idx_offset_;
    // The base offset and the multi-index offset correspond to exactly the
    // same element in the tensor (and in physical memory). The multi-index
    // offset is a multi-dimensional index, while the base offset is
    // calculated via the tensor descriptor (thus through all its transforms)
    // and is linear (1D). We store base_offset_ to avoid repeated
    // recalculation.
    index_t base_offset_;
};

} // namespace wrapper

@@ -22,14 +22,19 @@ namespace wrapper {
// Disable from doxygen docs generation
/// @cond
// forward declaration
template <typename Shape, typename UnnestedDescriptorType>
template <typename Shape, typename UnrolledDescriptorType>
struct Layout;

template <typename T>
using is_tuple = decltype(std::declval<T&>().IsTuple());

namespace {
// Generate packed (column-major) strides if not passed
/**
 * \brief Generate packed (column-major) strides if none are passed.
 *
 * \param shape Tensor shape.
 * \return Generated column-major strides.
 */
template <typename... Ts>
__host__ __device__ constexpr static auto
GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape)
@@ -50,9 +55,16 @@ GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape)
                          Number<decltype(unrolled_shape)::Size()>{});
}
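// Illustration (editor's note, not part of this change): what "packed
// column-major strides" means on a nested shape:
//
//   shape (2, (3, 4))  ->  unrolled shape (2, 3, 4)
//                      ->  strides (1, 2, 6)
//
// i.e. each stride is the product of the lengths of all preceding unrolled
// dimensions, so the leftmost dimension varies fastest in memory.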

/**
 * \brief Create a naive tensor descriptor from a nested shape.
 *
 * \param shape Tensor shape.
 * \param strides Tensor strides.
 * \return Unrolled descriptor.
 */
template <typename LayoutShape, typename LayoutStrides>
__host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shape,
                                                         const LayoutStrides& strides)
__host__ __device__ constexpr auto MakeUnrolledDescriptor(const LayoutShape& shape,
                                                          const LayoutStrides& strides)
{
    const auto unrolled_shape = UnrollNestedTuple(shape);
    if constexpr(is_same_v<LayoutStrides, Tuple<>>)
@@ -86,8 +98,8 @@ __host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shap
template <typename Shape, typename Strides>
__host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides& strides)
{
    using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Strides{}));
    return Layout<Shape, UnnestedDescriptorType>(shape, MakeFlattenDescriptor(shape, strides));
    using UnrolledDescriptorType = decltype(MakeUnrolledDescriptor(Shape{}, Strides{}));
    return Layout<Shape, UnrolledDescriptorType>(shape, MakeUnrolledDescriptor(shape, strides));
}

/**
@@ -100,15 +112,19 @@ __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides
template <typename Shape>
__host__ __device__ constexpr auto make_layout(const Shape& shape)
{
    using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Tuple<>{}));
    return Layout<Shape, UnnestedDescriptorType>(shape, MakeFlattenDescriptor(shape, Tuple<>{}));
    using UnrolledDescriptorType = decltype(MakeUnrolledDescriptor(Shape{}, Tuple<>{}));
    return Layout<Shape, UnrolledDescriptorType>(shape, MakeUnrolledDescriptor(shape, Tuple<>{}));
}
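// Illustration (editor's sketch, not part of this change): typical usage,
// mirroring the tests later in this commit. The two-argument overload takes
// explicit strides; the one-argument overload generates packed column-major
// strides:
//
//   const auto strided = ck::wrapper::make_layout(
//       ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}),
//       ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}));
//   const auto packed = ck::wrapper::make_layout(
//       ck::make_tuple(ck::Number<16>{}, ck::Number<4>{})); // strides (1, 16)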

// Layout helpers
// get
// Get dim (could be returned from get with empty Idxs)

/**
 * \private
 * \brief Get dim.
 *
 * \param dim Dimension.
 * \return Returns the same dimension.
 */
template <typename T>
__host__ __device__ T constexpr get(const T& dim)
@@ -178,7 +194,7 @@ __host__ __device__ constexpr auto get(const Layout<Shape, FlattenDesc>& layout)
        },
        Number<old_shape_dims>{});

    const auto& flatten_desc = layout.GetUnnestedDescriptor();
    const auto& flatten_desc = layout.GetUnrolledDescriptor();
    auto new_desc = transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
    return Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc);
}
@@ -197,9 +213,12 @@ __host__ __device__ constexpr auto get(const T& elem)
}

// size
// Get dim size (could be returned from the get function)
/**
 * \private
 * \brief Get size.
 *
 * \param dim Size.
 * \return Returns the same size.
 */
template <typename T>
__host__ __device__ T constexpr size(const T& dim)
@@ -214,8 +233,8 @@ __host__ __device__ T constexpr size(const T& dim)
 * \param layout Layout to get the Shape of.
 * \return Requested length.
 */
template <index_t idx, typename Shape, typename UnnestedDescriptorType>
__host__ __device__ constexpr auto size(const Layout<Shape, UnnestedDescriptorType>& layout)
template <index_t idx, typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto size(const Layout<Shape, UnrolledDescriptorType>& layout)
{
    return layout.template GetLength<idx>();
}
@@ -240,8 +259,8 @@ __host__ __device__ constexpr auto size(const Tuple<ShapeDims...>& shape)
 * \param layout Layout to calculate the shape size of.
 * \return Requested size.
 */
template <typename Shape, typename UnnestedDescriptorType>
__host__ __device__ constexpr auto size(const Layout<Shape, UnnestedDescriptorType>& layout)
template <typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto size(const Layout<Shape, UnrolledDescriptorType>& layout)
{
    return layout.GetLengths();
}
@@ -280,9 +299,9 @@ __host__ __device__ constexpr auto size(const T& elem)
 * \param layout Layout to calculate the rank of.
 * \return Requested rank.
 */
template <typename Shape, typename UnnestedDescriptorType>
template <typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto
rank([[maybe_unused]] const Layout<Shape, UnnestedDescriptorType>& layout)
rank([[maybe_unused]] const Layout<Shape, UnrolledDescriptorType>& layout)
{
    return Shape::Size();
}
@@ -302,17 +321,25 @@ __host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple<Dims...>& t

/**
 * \private
 * \brief Rank for a scalar.
 *
 * \param dim Dimension scalar.
 * \return Returns 1.
 */
template <index_t IDim>
__host__ __device__ constexpr index_t rank(const Number<IDim>&)
__host__ __device__ constexpr index_t rank([[maybe_unused]] const Number<IDim>& dim)
{
    return 1;
}

/**
 * \private
 * \brief Rank for a scalar.
 *
 * \param dim Dimension scalar.
 * \return Returns 1.
 */
__host__ __device__ constexpr index_t rank(const index_t&) { return 1; }
__host__ __device__ constexpr index_t rank([[maybe_unused]] const index_t& dim) { return 1; }

/**
 * \brief Hierarchical rank.
@@ -334,8 +361,8 @@ __host__ __device__ constexpr auto rank(const T& elem)
 * \param layout Layout to calculate the depth of.
 * \return Requested depth.
 */
template <typename Shape, typename UnnestedDescriptorType>
__host__ __device__ constexpr auto depth(const Layout<Shape, UnnestedDescriptorType>& layout)
template <typename Shape, typename UnrolledDescriptorType>
__host__ __device__ constexpr auto depth(const Layout<Shape, UnrolledDescriptorType>& layout)
{
    const auto& shape = layout.GetShape();
    return TupleDepth(shape);
@@ -355,17 +382,25 @@ __host__ __device__ constexpr auto depth(const Tuple<Dims...>& tuple)

/**
 * \private
 * \brief Depth for a scalar.
 *
 * \param dim Scalar.
 * \return Returns 0.
 */
template <index_t IDim>
__host__ __device__ constexpr index_t depth(const Number<IDim>&)
__host__ __device__ constexpr index_t depth([[maybe_unused]] const Number<IDim>& dim)
{
    return 0;
}

/**
 * \private
 * \brief Depth for a scalar.
 *
 * \param dim Scalar.
 * \return Returns 0.
 */
__host__ __device__ constexpr index_t depth(const index_t&) { return 0; }
__host__ __device__ constexpr index_t depth([[maybe_unused]] const index_t& dim) { return 0; }

/**
 * \brief Hierarchical depth.

@@ -6,12 +6,22 @@
#include "tensor_utils.hpp"
#include "layout_utils.hpp"

#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"

namespace ck {
namespace wrapper {

namespace {
// Calculate shape for partition based on number of threads per each dim and
// previous shape

/**
 * \brief Calculate the partition shape based on the number of threads per
 * dimension and the previous shape.
 *
 * \param shape Base tensor shape.
 * \param thread_lengths Tuple of thread lengths.
 * \return Partition shape.
 */
template <typename... Ts, typename... Ls>
__host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts...>& shape,
                                                                const Tuple<Ls...>& thread_lengths)
@@ -20,265 +30,165 @@ __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts..
    return generate_tuple(
        [&](auto i) {
            constexpr auto num_i = Number<i>{};
            if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
            {
                // if tuple, then recurse
                return CalculateLocalPartitionShape(shape.At(num_i), thread_lengths.At(num_i));
            }
            else
            {
                const auto slice_len = shape.At(num_i) / thread_lengths.At(num_i);
                return slice_len;
            }
        },
        Number<Tuple<Ts...>::Size()>{});
}
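// Illustration (editor's note, not part of this change): each thread gets
// shape[i] / thread_lengths[i] elements per dimension, e.g.:
//
//   shape (4, 64), thread_lengths (1, 32)  ->  partition shape (4, 2)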

// Calculate the descriptor for a partition based on the number of threads
// per dimension, the previous strides, and the steps
template <typename... Ts, typename... Ls, typename... Steps, typename FlattenDescType>
__host__ __device__ constexpr auto
CalculateLocalPartitionDescriptor(const Tuple<Ts...>& shape,
                                  const Tuple<Ls...>& thread_lengths,
                                  const Tuple<Steps...>& steps,
                                  const FlattenDescType& flatten_desc)
{

    static_assert(Tuple<Ts...>::Size() == Tuple<Ls...>::Size(), "Wrong thread_lengths shape.");
    const auto unrolled_thread_lengths = UnrollNestedTuple(thread_lengths);
    const auto unrolled_shape = UnrollNestedTuple(shape);
    constexpr auto dims = decltype(unrolled_thread_lengths)::Size();

    using UnrolledStepsType = decltype(UnrollNestedTuple(steps));

    using I1 = Number<1>;

    const auto transforms = generate_tuple(
        [&](auto i) {
            constexpr auto num_i = Number<i>{};
            if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
            {
                // Raked partition by default
                const auto partition_stride = unrolled_thread_lengths.At(num_i);
                return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
                                            make_tuple(partition_stride));
            }
            else if constexpr(!is_same_v<tuple_element_t<i.value, UnrolledStepsType>, index_t>)
            {
                // Compile-time partition
                if constexpr(is_same_v<tuple_element_t<i.value, UnrolledStepsType>, I1>)
                {
                    // raked
                    const auto partition_stride = unrolled_thread_lengths.At(num_i);
                    return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
                                                make_tuple(partition_stride));
                }
                else
                {
                    // packed
                    return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
                                                make_tuple(I1{}));
                }
            }
            else
            {
                // Runtime partition
                if(steps.At(num_i) == 1)
                {
                    // raked
                    const auto partition_stride = unrolled_thread_lengths.At(num_i);
                    return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
                                                make_tuple(partition_stride));
                }
                else
                {
                    // packed
                    return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
                                                make_tuple(I1{}));
                }
            }
        },
        Number<dims>{});

    const auto lower_dims =
        generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<dims>{});
    const auto upper_dims =
        generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<dims>{});
    return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
}

template <typename... Ls, typename... Steps>
__host__ __device__ constexpr auto CalculateLayoutOffsetIdxImpl(const Tuple<Ls...>& thread_lengths,
                                                                const Tuple<Steps...>& steps,
                                                                index_t& thread_id)
{
    return generate_tuple(
        [&](auto i) {
            constexpr auto num_i = Number<i>{};
            if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ls...>>>::value)
            {
                // if tuple, then recurse
                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
                {
                    return CalculateLayoutOffsetIdxImpl(
                        thread_lengths.At(num_i), Tuple<>{}, thread_id);
                }
                else
                {
                    return CalculateLayoutOffsetIdxImpl(
                        thread_lengths.At(num_i), steps.At(num_i), thread_id);
                }
            }
            else
            {
                // Update thread_id after each dim
                const auto dim_thread_id = thread_id % thread_lengths.At(num_i);
                thread_id /= thread_lengths.At(num_i);
                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
                {
                    return dim_thread_id;
                }
                else
                {
                    // Apply step
                    return steps.At(num_i) * dim_thread_id;
                }
            }
            const auto slice_len = size<num_i>(shape) / thread_lengths.At(num_i);
            return slice_len;
        },
        Number<Tuple<Ls...>::Size()>{});
}

// Convert an integer thread_idx to a tuple index with steps applied
template <typename... Ls, typename... Steps>
__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Ls...>& thread_lengths,
                                                            const Tuple<Steps...>& steps,
                                                            const index_t thread_id)
/**
 * \brief Calculate the total number of blocks.
 *
 * \param shape Base tensor shape.
 * \param tile_shape Tile shape.
 * \return Tuple with the number of blocks per dimension.
 */
template <typename... Ts, typename... Ls>
__host__ __device__ constexpr auto CalculateGridSize(const Tuple<Ts...>& shape,
                                                     const Tuple<Ls...>& tile_shape)
{
    // Create a temporary thread_id copy for CalculateLayoutOffsetIdxImpl updates
    index_t thread_id_copy = thread_id;
    return CalculateLayoutOffsetIdxImpl(thread_lengths, steps, thread_id_copy);
    static_assert(Tuple<Ts...>::Size() == Tuple<Ls...>::Size(), "Wrong thread_lengths shape.");
    return generate_tuple([&](auto i) { return size<i>(shape) / size<i>(tile_shape); },
                          Number<Tuple<Ls...>::Size()>{});
}
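// Illustration (editor's note, not part of this change): per-dimension
// division of the tensor shape by the tile shape, e.g.:
//
//   shape (16, 256), tile_shape (4, 64)  ->  grid (4, 4), i.e. 16 blocks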

// Apply steps to an index represented as a tuple
template <typename... Steps, typename... Idxs>
__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Steps...>& steps,
                                                            const Tuple<Idxs...>& block_idxs)
/**
 * \brief Calculate the scaled offset for a new partition/tile.
 *
 * \param thread_idxs Thread multi-dimensional index.
 * \param partition_lengths_seq Sequence of partition shape.
 * \param old_offset_idxs Multi-index offset from the base tensor to shift values.
 * \return Offset multi-index.
 */
template <typename ThreadIdxs, typename PartitionLengthsSeq, typename OldOffsetIdxs>
__host__ __device__ constexpr auto
CalculateOffsetMultiIdxs(const ThreadIdxs& thread_idxs,
                         const PartitionLengthsSeq& partition_lengths_seq,
                         const OldOffsetIdxs& old_offset_idxs)
{
    return generate_tuple(
        [&](auto i) {
            constexpr auto num_i = Number<i>{};
            if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Idxs...>>>::value)
            {
                // if tuple, then recurse
                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
                {
                    return CalculateLayoutOffsetIdx(Tuple<>{}, block_idxs.At(num_i));
                }
                else
                {
                    return CalculateLayoutOffsetIdx(steps.At(num_i), block_idxs.At(num_i));
                }
            }
            else
            {
                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
                {
                    return block_idxs.At(num_i);
                }
                else
                {
                    // apply step
                    return steps.At(num_i) * block_idxs.At(num_i);
                }
            }
        },
        Number<Tuple<Idxs...>::Size()>{});
    return thread_idxs * partition_lengths_seq + old_offset_idxs;
}
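// Illustration (editor's note, not part of this change): the offset is an
// element-wise thread_idxs * partition_lengths + old_offset, e.g.:
//
//   thread_idxs (1, 3), partition_lengths (4, 2), old offset (0, 0)
//     ->  offset multi-index (4, 6)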

// The user passes only the per-block shape to the make_local_tile function.
// This function calculates the block layout based on that shape.
template <typename... Ts, typename... BlockDims>
__host__ __device__ constexpr auto CalculateBlockLengths(const Tuple<Ts...>& shape,
                                                         const Tuple<BlockDims...>& tile_shape)
{
    return generate_tuple(
        [&](auto i) {
            constexpr auto num_i = Number<i>{};
            if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
            {
                // if tuple, then recurse
                return CalculateBlockLengths(shape.At(num_i), tile_shape.At(num_i));
            }
            else
            {
                return shape.At(num_i) / tile_shape.At(num_i);
            }
        },
        Number<Tuple<Ts...>::Size()>{});
}
} // namespace

/**
 * \brief Create local partition for thread.
 * \brief Create a local partition for a thread (for now, only the packed
 * partition is supported).
 *
 * \param tensor Tensor for partition.
 * \param thread_lengths Layout of threads.
 * \param thread_lengths Layout of threads (must not be nested).
 * \param thread_id Thread index represented as an integer.
 * \param steps Thread step (default=1, raked partition).
 * \return Partition tensor.
 */
template <typename TensorType, typename ThreadLengthsTuple, typename StepsTuple = Tuple<>>
__host__ __device__ constexpr auto make_local_partition(const TensorType& tensor,
                                                        const ThreadLengthsTuple& thread_lengths,
                                                        const index_t thread_id,
                                                        const StepsTuple steps = StepsTuple{})
template <typename TensorType, typename ThreadLengthsTuple>
__host__ __device__ constexpr auto
make_local_partition(TensorType& tensor,
                     [[maybe_unused]] const ThreadLengthsTuple& thread_lengths,
                     const index_t thread_id)
{
    // Create shape, strides and layout for the new partition tensor
    const auto partition_shape = CalculateLocalPartitionShape(shape(tensor), thread_lengths);
    // Create new descriptor and layout
    const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor();
    auto partition_desc =
        CalculateLocalPartitionDescriptor(shape(tensor), thread_lengths, steps, flatten_desc);
    const auto partition_layout = Layout<decltype(partition_shape), decltype(partition_desc)>(
        partition_shape, partition_desc);
    // Calculate the offset for the new partition tensor
    const auto offset_idx = CalculateLayoutOffsetIdx(thread_lengths, steps, thread_id);
    const auto partition_offset = layout(tensor)(offset_idx);
    return make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer() + partition_offset,
                                                             partition_layout);
    static_assert(!IsNestedTuple(ThreadLengthsTuple{}));
    // Calculate the new partition shape
    const auto& tensor_shape = shape(tensor);
    constexpr auto partition_shape =
        CalculateLocalPartitionShape(decltype(tensor_shape){}, ThreadLengthsTuple{});
    // Create the thread cluster descriptor
    constexpr auto partition_lengths_seq = generate_sequence_v2(
        [&](auto I) { return size<I>(partition_shape); }, Number<ThreadLengthsTuple::Size()>{});
    constexpr auto thread_lengths_seq =
        generate_sequence_v2([&](auto I) { return size<I>(ThreadLengthsTuple{}); },
                             Number<ThreadLengthsTuple::Size()>{});
    constexpr auto thread_cluster_desc_ = make_cluster_descriptor(thread_lengths_seq);
    // Calculate thread idxs and offsets
    const auto thread_idxs = thread_cluster_desc_.CalculateBottomIndex(make_multi_index(thread_id));
    const auto offset_multi_idxs =
        CalculateOffsetMultiIdxs(thread_idxs, partition_lengths_seq, tensor.GetMultiIdxOffsets());
    // Create the new layout and tensor
    auto& flatten_desc = layout(tensor).GetUnrolledDescriptor();
    const auto partition_layout =
        Layout<remove_reference_t<decltype(partition_shape)>, decltype(flatten_desc)>(
            partition_shape, flatten_desc);
    auto partition_tensor =
        make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), partition_layout);
    // Apply offsets
    partition_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs));
    return partition_tensor;
}
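// Illustration (editor's sketch, not part of this change): usage mirroring
// the copy test later in this commit. A tile is created per block with
// make_local_tile (defined just below), then a per-thread partition is
// carved out of it:
//
//   const auto tile_shape    = ck::make_tuple(ck::Number<4>{}, ck::Number<64>{});
//   const auto thread_layout = ck::make_tuple(ck::Number<1>{}, ck::Number<32>{});
//   const auto tile =
//       ck::wrapper::make_local_tile(tensor, tile_shape, blockIdx.x);
//   const auto partition =
//       ck::wrapper::make_local_partition(tile, thread_layout, threadIdx.x);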

/**
 * \brief Create local tile for thread block.
 * \brief Create a local tile for a thread block (for now, only the packed
 * tile is supported).
 *
 * \note For now, use a 2D tile_shape to get the best performance.
 *
 * \param tensor Tensor for partition.
 * \param tile_shape Shape of the requested tile.
 * \param block_idx Block index represented as a tuple.
 * \param steps Block step (default=1, raked partition).
 * \param block_id Block index represented as an integer.
 *
 * \return Tile tensor.
 */
template <typename TensorType,
          typename BlockShapeTuple,
          typename BlockIdxTuple,
          typename StepsTuple = Tuple<>>
__host__ __device__ constexpr auto make_local_tile(const TensorType& tensor,
                                                   const BlockShapeTuple& tile_shape,
                                                   const BlockIdxTuple& block_idx,
                                                   const StepsTuple steps = StepsTuple{})
template <typename TensorType, typename BlockShapeTuple>
__host__ __device__ constexpr auto
make_local_tile(const TensorType& tensor, const BlockShapeTuple& tile_shape, const index_t block_id)
{
    // Create block lengths, strides and layout for the new tile tensor
    const auto block_lengths = CalculateBlockLengths(shape(tensor), tile_shape);
    // Create new descriptor and layout
    const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor();
    auto tile_desc =
        CalculateLocalPartitionDescriptor(tile_shape, block_lengths, steps, flatten_desc);
    const auto tile_layout = Layout<remove_reference_t<decltype(tile_shape)>, decltype(tile_desc)>(
        tile_shape, tile_desc);
    // Calculate the offset for the new tile tensor
    const auto offset_idx = CalculateLayoutOffsetIdx(steps, block_idx);
    const auto tile_offset = layout(tensor)(offset_idx);
    return make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer() + tile_offset,
                                                             tile_layout);
    static_assert(!IsNestedTuple(BlockShapeTuple{}));

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};

    auto& aligned_desc = layout(tensor).GetMergedNestingDescriptor();

    if constexpr(BlockShapeTuple::Size() == I2)
    {
        // Optimized version for a 2D tile shape [MxK]
        const auto block_2_tile_map =
            BlockToCTileMap_M00_N0_M01Adapt<BlockShapeTuple{}.At(I0),
                                            BlockShapeTuple{}.At(I1),
                                            remove_cvref_t<decltype(aligned_desc)>>(aligned_desc);
        const auto block_work_idx =
            block_2_tile_map.CalculateBottomIndex(make_multi_index(block_id));
        const index_t m_block_data_idx_on_grid =
            __builtin_amdgcn_readfirstlane(block_work_idx[I0] * size<0>(tile_shape));
        const index_t k_block_data_idx_on_grid =
            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * size<1>(tile_shape));
        const auto offset_multi_idxs =
            make_tuple(m_block_data_idx_on_grid, k_block_data_idx_on_grid);
        // Create the new layout and tensor
        const auto tile_layout =
            Layout<remove_reference_t<decltype(tile_shape)>, decltype(aligned_desc)>(tile_shape,
                                                                                     aligned_desc);
        auto tile_tensor =
            make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), tile_layout);
        // Apply offsets
        tile_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs));
        return tile_tensor;
    }
    else
    {
        // Calculate offsets
        // Sequence with the data to process per block
        constexpr auto tile_shape_seq =
            generate_sequence_v2([](auto I) { return size(BlockShapeTuple{}.At(I)); },
                                 Number<BlockShapeTuple::Size()>{});
        // Tuple with the number of blocks
        const auto block_lengths = CalculateGridSize(shape(tensor), tile_shape);
        constexpr auto block_cluster_desc_ = make_cluster_descriptor(block_lengths);
        const auto block_idxs =
            block_cluster_desc_.CalculateBottomIndex(make_multi_index(block_id));
        const auto offset_multi_idxs =
            CalculateOffsetMultiIdxs(block_idxs, tile_shape_seq, tensor.GetMultiIdxOffsets());
        // Create the new layout and tensor
        const auto tile_layout =
            Layout<remove_reference_t<decltype(tile_shape)>, decltype(aligned_desc)>(tile_shape,
                                                                                     aligned_desc);
        auto tile_tensor =
            make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), tile_layout);
        // Apply offsets
        tile_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs));
        return tile_tensor;
    }
}

} // namespace wrapper

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

@@ -10,6 +10,7 @@
#include "ck/utility/tuple_helper.hpp"
#include "ck/utility/dynamic_buffer.hpp"
#include "ck/utility/amd_address_space.hpp"
#include "ck/utility/multi_index.hpp"

namespace ck {
namespace wrapper {
@@ -27,16 +28,12 @@ using MemoryTypeEnum = AddressSpaceEnum;
// Disable from doxygen docs generation
/// @cond
// forward declarations
template <typename Shape, typename UnnestedDescriptorType>
template <typename Shape, typename UnrolledDescriptorType>
struct Layout;
template <MemoryTypeEnum BufferAddressSpace,
          typename ElementType,
          typename Shape,
          typename UnnestedDescriptorType,
          index_t NumVectors,     // params for Register memory
          index_t ScalarPerVector // param for Register memory
          >

          typename UnrolledDescriptorType>
struct Tensor;

template <typename FromType, typename ToType>
@@ -45,13 +42,22 @@ struct Slice
    __host__ __device__ constexpr Slice() : from_(), to_() {}
    __host__ __device__ constexpr Slice(FromType from, ToType to) : from_(from), to_(to) {}

    /**
     * \brief Calculate the slice range.
     *
     * \param dim Dimension size.
     * \return Slice range.
     */
    template <typename T>
    __host__ __device__ constexpr auto range(const T& dim) const
    {
        if constexpr(is_same_v<FromType, index_t> || is_same_v<ToType, index_t> ||
                     is_same_v<T, index_t>)
        {
            assert(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_) && "Invalid range");
            if(!(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_)))
            {
                throw std::runtime_error("Invalid range");
            }
            if(to_ < 0)
            {
                return dim - from_ + to_ + 1;
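                // Illustration (editor's note, not part of this change): a
                // negative to_ counts from the end of the dimension, e.g.:
                //   from_ = 1, to_ = -1, dim = 8
                //     ->  range = 8 - 1 + (-1) + 1 = 7
                //   (elements 1..7, i.e. everything after the first element)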
@@ -101,40 +107,27 @@ using is_tuple = decltype(std::declval<T&>().IsTuple());
template <MemoryTypeEnum MemoryType,
          typename ElementType,
          typename Shape,
          typename UnnestedDescriptorType>
          typename UnrolledDescriptorType>
constexpr auto make_tensor(ElementType* pointer,
                           const Layout<Shape, UnnestedDescriptorType>& layout)
                           const Layout<Shape, UnrolledDescriptorType>& layout)
{
    return Tensor<MemoryType,
                  ElementType,
                  Shape,
                  UnnestedDescriptorType,
                  0 /*NumVectors*/,
                  0 /*ScalarPerVector*/>(pointer, layout);
    return Tensor<MemoryType, ElementType, Shape, UnrolledDescriptorType>(pointer, layout);
}
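// Illustration (editor's sketch, not part of this change): constructing a
// global-memory tensor view over an existing device buffer, as the tests
// below do. The pointer is assumed to come from a prior allocation:
//
//   ck::index_t* ptr = /* device allocation */;
//   const auto layout = ck::wrapper::make_layout(
//       ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}));
//   auto tensor =
//       ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(ptr, layout);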

/**
 * \brief Make an SGPR or VGPR tensor.
 *
 * \tparam MemoryType Type of memory.
 * \tparam NumVectors Number of vectors.
 * \tparam ScalarPerVector Scalars per vector.
 * \tparam ElementType Memory data type.
 * \return Constructed tensor.
 */
template <MemoryTypeEnum MemoryType,
          index_t NumVectors,
          index_t ScalarPerVector,
          typename ElementType>
constexpr auto make_register_tensor()
          typename ElementType,
          typename Shape,
          typename UnrolledDescriptorType>
constexpr auto make_register_tensor(const Layout<Shape, UnrolledDescriptorType>& layout)
{
    const auto layout = make_layout(make_tuple(Number<NumVectors>{}), make_tuple(Number<1>{}));
    return Tensor<MemoryType,
                  ElementType,
                  Tuple<Number<NumVectors>>,
                  std::remove_const_t<remove_reference_t<decltype(layout.GetUnnestedDescriptor())>>,
                  NumVectors,
                  ScalarPerVector>(layout);
    return Tensor<MemoryType, ElementType, Shape, UnrolledDescriptorType>(layout);
}
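// Illustration (editor's sketch, not part of this change): with the new
// signature the register tensor takes its size from a layout instead of the
// removed NumVectors/ScalarPerVector template parameters, as in the tensor
// test later in this commit:
//
//   constexpr auto vgpr_layout = ck::wrapper::make_layout(
//       ck::make_tuple(ck::Number<8>{}), ck::make_tuple(ck::Number<1>{}));
//   auto tensor_vgpr =
//       ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
//                                         ck::index_t>(vgpr_layout);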

/**
@@ -146,15 +139,9 @@ constexpr auto make_register_tensor()
template <MemoryTypeEnum BufferAddressSpace,
          typename ElementType,
          typename Shape,
          typename UnnestedDescriptorType,
          index_t NumVectors,
          index_t ScalarPerVector>
__host__ __device__ constexpr const auto& layout(const Tensor<BufferAddressSpace,
                                                              ElementType,
                                                              Shape,
                                                              UnnestedDescriptorType,
                                                              NumVectors,
                                                              ScalarPerVector>& tensor)
          typename UnrolledDescriptorType>
__host__ __device__ constexpr const auto&
layout(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
{
    return tensor.GetLayout();
}
@@ -170,15 +157,9 @@ template <index_t... Idxs,
          MemoryTypeEnum BufferAddressSpace,
          typename ElementType,
          typename Shape,
          typename UnnestedDescriptorType,
          index_t NumVectors,
          index_t ScalarPerVector>
__host__ __device__ constexpr auto size(const Tensor<BufferAddressSpace,
                                                     ElementType,
                                                     Shape,
                                                     UnnestedDescriptorType,
                                                     NumVectors,
                                                     ScalarPerVector>& tensor)
          typename UnrolledDescriptorType>
__host__ __device__ constexpr auto
size(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
{
    return size<Idxs...>(tensor.GetLayout());
}
@@ -194,15 +175,9 @@ template <index_t... Idxs,
          MemoryTypeEnum BufferAddressSpace,
          typename ElementType,
          typename Shape,
          typename UnnestedDescriptorType,
          index_t NumVectors,
          index_t ScalarPerVector>
__host__ __device__ constexpr auto rank(const Tensor<BufferAddressSpace,
                                                     ElementType,
                                                     Shape,
                                                     UnnestedDescriptorType,
                                                     NumVectors,
                                                     ScalarPerVector>& tensor)
          typename UnrolledDescriptorType>
__host__ __device__ constexpr auto
rank(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
{
    return rank<Idxs...>(tensor.GetLayout());
}
@@ -218,15 +193,9 @@ template <index_t... Idxs,
          MemoryTypeEnum BufferAddressSpace,
          typename ElementType,
          typename Shape,
          typename UnnestedDescriptorType,
          index_t NumVectors,
          index_t ScalarPerVector>
__host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace,
                                                      ElementType,
                                                      Shape,
                                                      UnnestedDescriptorType,
                                                      NumVectors,
                                                      ScalarPerVector>& tensor)
          typename UnrolledDescriptorType>
__host__ __device__ constexpr auto
depth(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
{
    return depth<Idxs...>(tensor.GetLayout());
}
@@ -240,15 +209,9 @@ __host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace,
template <MemoryTypeEnum BufferAddressSpace,
          typename ElementType,
          typename Shape,
          typename UnnestedDescriptorType,
          index_t NumVectors,
          index_t ScalarPerVector>
__host__ __device__ constexpr const auto& shape(const Tensor<BufferAddressSpace,
                                                             ElementType,
                                                             Shape,
                                                             UnnestedDescriptorType,
                                                             NumVectors,
                                                             ScalarPerVector>& tensor)
          typename UnrolledDescriptorType>
__host__ __device__ constexpr const auto&
shape(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
{
    return shape(tensor.GetLayout());
}

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

@@ -10,6 +10,7 @@
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/numeric.hpp"

namespace ck {
namespace tensor_operation {

@@ -21,49 +21,59 @@ template <typename InputTensor,
          typename OutputTensor,
          typename BlockShape,
          typename ThreadLayoutShape,
          typename LocalTileSteps,
          typename LocalPartitionSteps>
          bool UseOptimizedCopy>
__global__ void TestCopyDevice(const InputTensor input_tensor,
                               OutputTensor output_tensor,
                               const BlockShape tile_shape,
                               const ThreadLayoutShape thread_layout,
                               const LocalTileSteps block_steps,
                               const LocalPartitionSteps thread_steps)
                               const ThreadLayoutShape thread_layout)
{
    __shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)];
    auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
    const auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
        p_shared, ck::wrapper::make_layout(tile_shape));

    const auto block_idxs = ck::make_tuple(ck::make_tuple(0, 0), blockIdx.x);
    const auto block_idx = static_cast<ck::index_t>(blockIdx.x);

    // Get local tiles for global memory
    const auto input_local_tile =
        ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idxs, block_steps);
    const auto input_local_tile = ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idx);
    const auto output_local_tile =
        ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idxs, block_steps);
        ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idx);

    // Get the partition per thread
    const auto input_local_partition = ck::wrapper::make_local_partition(
        input_local_tile, thread_layout, threadIdx.x, thread_steps);
    const auto input_local_partition =
        ck::wrapper::make_local_partition(input_local_tile, thread_layout, threadIdx.x);
    auto lds_local_partition =
        ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x, thread_steps);
    auto output_local_partition = ck::wrapper::make_local_partition(
        output_local_tile, thread_layout, threadIdx.x, thread_steps);
        ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x);
    auto output_local_partition =
        ck::wrapper::make_local_partition(output_local_tile, thread_layout, threadIdx.x);

    // Allocate VGPRs
    constexpr ck::index_t scalar_per_vector = 1;
    constexpr ck::index_t vgpr_size = ck::wrapper::size(lds_local_partition);
    auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
                                                         vgpr_size,
                                                         scalar_per_vector,
                                                         ck::index_t>();
    auto tensor_vgpr =
        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
            layout(lds_local_partition));

    // Perform the copy
    ck::wrapper::copy(input_local_partition, lds_local_partition);
    ck::wrapper::copy(lds_local_partition, tensor_vgpr);
    ck::wrapper::copy(tensor_vgpr, output_local_partition);
    if constexpr(UseOptimizedCopy)
    {
        using DimAccessOrder = ck::Tuple<ck::Number<1>, ck::Number<0>>;
        constexpr ck::index_t vector_dim = 0;
        constexpr ck::index_t scalar_per_vector = 2;
        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(input_local_partition,
                                                                         lds_local_partition);
        // TODO: Enable optimized copy for static buffers
        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(lds_local_partition,
                                                                         tensor_vgpr);
        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(tensor_vgpr,
                                                                         output_local_partition);
    }
    else
    {
        ck::wrapper::copy(input_local_partition, lds_local_partition);
        ck::wrapper::copy(lds_local_partition, tensor_vgpr);
        ck::wrapper::copy(tensor_vgpr, output_local_partition);
    }
}
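// Editor's note on the optimized path (an interpretation, not part of this
// diff): as the parameter names suggest, vector_dim = 0 with
// scalar_per_vector = 2 lets each thread move two consecutive elements of
// dimension 0 per memory instruction, and DimAccessOrder =
// Tuple<Number<1>, Number<0>> makes dimension 0 the fastest-varying axis of
// the traversal, so the vectorized accesses stay contiguous.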

template <bool UseOptimizedCopy>
void PerformCopyGlobalToGlobalViaLDS()
{
    const auto shape =
@@ -89,15 +99,8 @@ void PerformCopyGlobalToGlobalViaLDS()
    auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<ck::index_t*>(out_buf.GetDeviceBuffer()), layout);

    const auto thread_layout =
        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<32>{});
    const auto tile_shape =
        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<64>{});

    const auto thread_steps =
        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<2>{});
    const auto block_steps =
        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<64>{});
    const auto thread_layout = ck::make_tuple(ck::Number<1>{}, ck::Number<32>{});
    const auto tile_shape = ck::make_tuple(ck::Number<4>{}, ck::Number<64>{});

    const ck::index_t grid_size = ck::math::integer_divide_ceil(
        ck::wrapper::size(input_tensor_global), ck::wrapper::size(tile_shape));
@@ -106,8 +109,7 @@ void PerformCopyGlobalToGlobalViaLDS()
                                   decltype(output_tensor_global),
                                   decltype(tile_shape),
                                   decltype(thread_layout),
                                   decltype(block_steps),
                                   decltype(thread_steps)>;
                                   UseOptimizedCopy>;
    launch_and_time_kernel(StreamConfig{},
                           kernel,
                           dim3(grid_size),
@@ -116,9 +118,7 @@ void PerformCopyGlobalToGlobalViaLDS()
                           input_tensor_global,
                           output_tensor_global,
                           tile_shape,
                           thread_layout,
                           block_steps,
                           thread_steps);
                           thread_layout);

    // Verify the results
    std::vector<ck::index_t> output_data(ck::wrapper::size(shape));
@@ -126,4 +126,5 @@ void PerformCopyGlobalToGlobalViaLDS()
    EXPECT_TRUE(ck::utils::check_err(output_data, input_data));
}

TEST(TestCopy, CopyGlobalToGlobalViaLDS) { PerformCopyGlobalToGlobalViaLDS(); }
TEST(TestCopyGlobalToGlobalViaLDS, GenericCopy) { PerformCopyGlobalToGlobalViaLDS<false>(); }
TEST(TestCopyGlobalToGlobalViaLDS, OptimizedCopy) { PerformCopyGlobalToGlobalViaLDS<true>(); }

@@ -29,42 +29,29 @@ TEST(TestPartition, LocalPartition)
    const auto tensor =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);

    const auto thread_steps =
        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<1>{}), ck::Number<1>{});
    const auto thread_layout =
        ck::make_tuple(ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}), ck::Number<1>{});

    for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++)
    {
        const auto raked_partition =
            ck::wrapper::make_local_partition(tensor, thread_layout, thread_id);

        const auto expected_partition_size =
            ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout);
        EXPECT_EQ(ck::wrapper::size(raked_partition), expected_partition_size);
        EXPECT_EQ(raked_partition(0), thread_id);
    }
    const auto thread_steps = ck::make_tuple(ck::Number<8>{}, ck::Number<1>{});
    const auto thread_layout = ck::make_tuple(ck::Number<8>{}, ck::Number<1>{});

    for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++)
    {
        const auto packed_partition =
            ck::wrapper::make_local_partition(tensor, thread_layout, thread_id, thread_steps);
            ck::wrapper::make_local_partition(tensor, thread_layout, thread_id);

        const auto expected_partition_size =
            ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout);
        const auto expected_partition_first_val = thread_id * ck::wrapper::size<0, 0>(thread_steps);
        const auto expected_partition_first_val = thread_id * ck::wrapper::size<0>(thread_steps);
        const auto expected_partition_second_val = expected_partition_first_val + 1;
        EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size);
        EXPECT_EQ(packed_partition(0), expected_partition_first_val);
        EXPECT_EQ(packed_partition(1), expected_partition_second_val);
    }
}

TEST(TestPartition, LocalTile)
{
    const auto shape =
        ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{});
    const auto strides =
        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{});
    const auto layout = ck::wrapper::make_layout(shape, strides);
    const auto shape = ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}, ck::Number<4>{});
    const auto strides = ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}, ck::Number<64>{});
    const auto layout = ck::wrapper::make_layout(shape, strides);

    std::vector<ck::index_t> data(ck::wrapper::size(layout));
    std::iota(data.begin(), data.end(), 0);
@@ -72,48 +59,34 @@ TEST(TestPartition, LocalTile)
    const auto tensor =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);

    const auto block_steps =
        ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
    const auto block_shape =
        ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
    const auto block_layout =
        ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
    const auto block_shape = ck::make_tuple(ck::Number<2>{}, ck::Number<4>{}, ck::Number<2>{});
    const auto num_blocks =
        ck::make_tuple(ck::wrapper::size<0>(shape) / ck::wrapper::size<0>(block_shape),
                       ck::wrapper::size<1>(shape) / ck::wrapper::size<1>(block_shape),
                       ck::wrapper::size<2>(shape) / ck::wrapper::size<2>(block_shape));
    std::vector<ck::index_t> block_idxs(ck::wrapper::size(num_blocks));
    std::iota(block_idxs.begin(), block_idxs.end(), 0);

    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> block_idxs;
    for(ck::index_t x = 0; x < ck::wrapper::size<0, 0>(block_layout); x++)
    for(auto block_idx : block_idxs)
    {
        for(ck::index_t y = 0; y < ck::wrapper::size<0, 1>(block_layout); y++)
        {
            for(ck::index_t z = 0; z < ck::wrapper::size<1>(block_layout); z++)
            {
                block_idxs.emplace_back(ck::make_tuple(x, y), z);
            }
        }
    }

    for(const auto& block_idx : block_idxs)
    {
        const auto raked_tile = ck::wrapper::make_local_tile(tensor, block_shape, block_idx);
        const auto packed_tile = ck::wrapper::make_local_tile(tensor, block_shape, block_idx);

        const auto expected_tile_size = ck::wrapper::size(block_shape);
        EXPECT_EQ(ck::wrapper::size(raked_tile), expected_tile_size);
        EXPECT_EQ(raked_tile(0), layout(block_idx));
    }
        auto expected_tile_first_val = (block_idx % ck::wrapper::size<2>(num_blocks)) *
                                       ck::wrapper::size<2>(block_shape) *
                                       ck::wrapper::size<2>(strides);
        block_idx /= ck::wrapper::size<2>(num_blocks);
        expected_tile_first_val += (block_idx % ck::wrapper::size<1>(num_blocks)) *
                                   ck::wrapper::size<1>(block_shape) *
                                   ck::wrapper::size<1>(strides);
        block_idx /= ck::wrapper::size<1>(num_blocks);
        expected_tile_first_val += (block_idx % ck::wrapper::size<0>(num_blocks)) *
                                   ck::wrapper::size<0>(block_shape) *
                                   ck::wrapper::size<0>(strides);

    for(const auto& block_idx : block_idxs)
    {
        const auto packed_tile =
            ck::wrapper::make_local_tile(tensor, block_shape, block_idx, block_steps);

        const auto expected_tile_size = ck::wrapper::size(block_shape);
        const auto expected_tile_first_val =
            ck::wrapper::size<0, 0>(block_idx) * ck::wrapper::size<0, 0>(block_shape) *
                ck::wrapper::size<0, 0>(strides) +
            ck::wrapper::size<0, 1>(block_idx) * ck::wrapper::size<0, 1>(block_shape) *
                ck::wrapper::size<0, 1>(strides) +
            ck::wrapper::size<1>(block_idx) * ck::wrapper::size<1>(block_shape) *
                ck::wrapper::size<1>(strides);
        const auto expected_tile_second_val = expected_tile_first_val + 1;
        EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size);
        EXPECT_EQ(packed_tile(0), expected_tile_first_val);
        EXPECT_EQ(packed_tile(1), expected_tile_second_val);
    }
}

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>
#include <iostream>
@@ -100,31 +100,26 @@ TEST(TestTensor, ReadWriteHostMemory)

__global__ void TestTensorReadWriteDevice(void* data, void* success)
{
    constexpr ck::index_t nelems = 8;
    constexpr ck::index_t scalar_per_vector = 1;
    constexpr ck::index_t nelems = 8;
    __shared__ ck::index_t p_shared[nelems];

    ck::index_t* casted_data_ptr = static_cast<ck::index_t*>(data);
    bool* casted_success_ptr = static_cast<bool*>(success);

    const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
    constexpr auto vgpr_layout =
        ck::wrapper::make_layout(make_tuple(ck::Number<nelems>{}), make_tuple(ck::Number<1>{}));

    auto tensor_global =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
    auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
    auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
                                                         nelems,
                                                         scalar_per_vector,
                                                         ck::index_t>();
    auto tensor_sgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Sgpr,
                                                         nelems,
                                                         scalar_per_vector,
                                                         ck::index_t>();
    auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
    auto tensor_vgpr =
        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
            vgpr_layout);

    InitTensor(tensor_global);
    InitTensor(tensor_lds);
    StaticInitTensor<nelems>(tensor_vgpr);
    StaticInitTensor<nelems>(tensor_sgpr);

    *casted_success_ptr = TestTensorCheck1d(tensor_global);
    *casted_success_ptr &= TestTensorCheck3d(tensor_global);
@@ -133,8 +128,6 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success)
    *casted_success_ptr &= TestTensorCheck3d(tensor_lds);

    *casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_vgpr);

    *casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_sgpr);
}

TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)