Add optimized copy to ck wrapper (#1126)

* Add optimized copy to ck wrapper

* Example optimizations

* Fixes

* Move img2col test to client example

* Refactor example

* Fix docs

* Fixes

* Fix

* Fixes

* Fixes

* Fixes

* Fixes

* Fixes

---------

Co-authored-by: zjing14 <zhangjing14@gmail.com>
This commit is contained in:
Bartłomiej Kocot
2024-01-19 11:29:00 +01:00
committed by GitHub
parent 38882d8ab5
commit 7e4eb4b800
17 changed files with 1109 additions and 865 deletions

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -10,6 +10,7 @@
#include "ck/utility/tuple_helper.hpp"
#include "ck/utility/dynamic_buffer.hpp"
#include "ck/utility/amd_address_space.hpp"
#include "ck/utility/multi_index.hpp"
namespace ck {
namespace wrapper {
@@ -27,16 +28,12 @@ using MemoryTypeEnum = AddressSpaceEnum;
// Disable from doxygen docs generation
/// @cond
// forward declarations
// Forward declaration of Layout. Two template heads appear because this span is a
// diff hunk whose +/- markers are missing; they differ only in the name of the
// second parameter (UnnestedDescriptorType vs UnrolledDescriptorType).
// NOTE(review): by unified-diff ordering the Unnested* line looks like the removed
// side - confirm against the raw diff.
template <typename Shape, typename UnnestedDescriptorType>
template <typename Shape, typename UnrolledDescriptorType>
struct Layout;
// Forward declaration of Tensor. Two endings of the template parameter list are
// interleaved here (diff hunk without +/- markers):
//   - UnnestedDescriptorType, NumVectors, ScalarPerVector, closed by a lone '>'
//   - UnrolledDescriptorType, closed on the same line
// NOTE(review): presumably the longer, register-specific list is the removed side -
// confirm against the raw diff.
template <MemoryTypeEnum BufferAddressSpace,
typename ElementType,
typename Shape,
typename UnnestedDescriptorType,
index_t NumVectors, // params for Register memory
index_t ScalarPerVector // param for Register memory
>
typename UnrolledDescriptorType>
struct Tensor;
template <typename FromType, typename ToType>
@@ -45,13 +42,22 @@ struct Slice
__host__ __device__ constexpr Slice() : from_(), to_() {}
__host__ __device__ constexpr Slice(FromType from, ToType to) : from_(from), to_(to) {}
/**
* \brief Calculate slice range.
*
* \param dim Dimension size.
* \return Slice range.
*/
template <typename T>
__host__ __device__ constexpr auto range(const T& dim) const
{
if constexpr(is_same_v<FromType, index_t> || is_same_v<ToType, index_t> ||
is_same_v<T, index_t>)
{
assert(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_) && "Invalid range");
if(!(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_)))
{
throw std::runtime_error("Invalid range");
}
if(to_ < 0)
{
return dim - from_ + to_ + 1;
@@ -101,40 +107,27 @@ using is_tuple = decltype(std::declval<T&>().IsTuple());
// make_tensor: constructs a Tensor of the requested MemoryType from a raw element
// pointer and a Layout; both arguments are passed to the Tensor constructor as-is.
// This span is a diff hunk whose +/- markers are missing, so both sides appear:
//   - the UnnestedDescriptorType lines, which name six Tensor template arguments
//     and pass 0 for NumVectors and ScalarPerVector;
//   - the UnrolledDescriptorType lines, which name four Tensor template arguments.
// NOTE(review): by unified-diff ordering the Unnested* lines look like the removed
// side - confirm against the raw diff.
template <MemoryTypeEnum MemoryType,
typename ElementType,
typename Shape,
typename UnnestedDescriptorType>
typename UnrolledDescriptorType>
constexpr auto make_tensor(ElementType* pointer,
const Layout<Shape, UnnestedDescriptorType>& layout)
const Layout<Shape, UnrolledDescriptorType>& layout)
{
return Tensor<MemoryType,
ElementType,
Shape,
UnnestedDescriptorType,
0 /*NumVectors*/,
0 /*ScalarPerVector*/>(pointer, layout);
return Tensor<MemoryType, ElementType, Shape, UnrolledDescriptorType>(pointer, layout);
}
// Review notes for this hunk (diff without +/- markers, both sides interleaved):
//   - One variant takes no arguments: it calls make_layout with
//     (Number<NumVectors>) and (Number<1>) and constructs a Tensor whose Shape is
//     Tuple<Number<NumVectors>>, passing NumVectors and ScalarPerVector through.
//   - The other variant takes a Layout<Shape, UnrolledDescriptorType> and passes it
//     straight to the Tensor constructor.
// NOTE(review): the doxygen block below lists NumVectors and ScalarPerVector, which
// only the argument-less variant has, and does not mention the `layout` parameter,
// Shape or UnrolledDescriptorType of the other variant - check the docs in the
// resulting file.
/**
* \brief Make SGPR or VGPR tensor function.
*
* \tparam MemoryType Type of memory.
* \tparam NumVectors Number of vectors.
* \tparam ScalarPerVector Scalars per vector.
* \tparam ElementType Memory data type.
* \return Constructed tensor.
*/
template <MemoryTypeEnum MemoryType,
index_t NumVectors,
index_t ScalarPerVector,
typename ElementType>
constexpr auto make_register_tensor()
typename ElementType,
typename Shape,
typename UnrolledDescriptorType>
constexpr auto make_register_tensor(const Layout<Shape, UnrolledDescriptorType>& layout)
{
// Argument-less variant only: the layout is built locally from NumVectors.
const auto layout = make_layout(make_tuple(Number<NumVectors>{}), make_tuple(Number<1>{}));
return Tensor<MemoryType,
ElementType,
Tuple<Number<NumVectors>>,
std::remove_const_t<remove_reference_t<decltype(layout.GetUnnestedDescriptor())>>,
NumVectors,
ScalarPerVector>(layout);
// Layout-taking variant: the caller's layout is used unchanged.
return Tensor<MemoryType, ElementType, Shape, UnrolledDescriptorType>(layout);
}
/**
@@ -146,15 +139,9 @@ constexpr auto make_register_tensor()
template <MemoryTypeEnum BufferAddressSpace,
typename ElementType,
typename Shape,
typename UnnestedDescriptorType,
index_t NumVectors,
index_t ScalarPerVector>
__host__ __device__ constexpr const auto& layout(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
typename UnrolledDescriptorType>
__host__ __device__ constexpr const auto&
layout(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
{
// Returns, by const reference, whatever tensor.GetLayout() yields.
// The two signatures above are the two sides of this diff hunk (+/- markers are
// missing): a six-argument Tensor with UnnestedDescriptorType, NumVectors and
// ScalarPerVector, and a four-argument Tensor with UnrolledDescriptorType.
return tensor.GetLayout();
}
@@ -170,15 +157,9 @@ template <index_t... Idxs,
MemoryTypeEnum BufferAddressSpace,
typename ElementType,
typename Shape,
typename UnnestedDescriptorType,
index_t NumVectors,
index_t ScalarPerVector>
__host__ __device__ constexpr auto size(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
typename UnrolledDescriptorType>
__host__ __device__ constexpr auto
size(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
{
// Forwards to size<Idxs...>() applied to the tensor's layout; the Idxs pack comes
// from the template head, whose opening line sits in the hunk header above.
// The two signatures above are the two sides of this diff hunk (+/- markers are
// missing): a six-argument Tensor with UnnestedDescriptorType, NumVectors and
// ScalarPerVector, and a four-argument Tensor with UnrolledDescriptorType.
return size<Idxs...>(tensor.GetLayout());
}
@@ -194,15 +175,9 @@ template <index_t... Idxs,
MemoryTypeEnum BufferAddressSpace,
typename ElementType,
typename Shape,
typename UnnestedDescriptorType,
index_t NumVectors,
index_t ScalarPerVector>
__host__ __device__ constexpr auto rank(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
typename UnrolledDescriptorType>
__host__ __device__ constexpr auto
rank(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
{
// Forwards to rank<Idxs...>() applied to the tensor's layout; the Idxs pack comes
// from the template head, whose opening line sits in the hunk header above.
// The two signatures above are the two sides of this diff hunk (+/- markers are
// missing): a six-argument Tensor with UnnestedDescriptorType, NumVectors and
// ScalarPerVector, and a four-argument Tensor with UnrolledDescriptorType.
return rank<Idxs...>(tensor.GetLayout());
}
@@ -218,15 +193,9 @@ template <index_t... Idxs,
MemoryTypeEnum BufferAddressSpace,
typename ElementType,
typename Shape,
typename UnnestedDescriptorType,
index_t NumVectors,
index_t ScalarPerVector>
__host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
typename UnrolledDescriptorType>
__host__ __device__ constexpr auto
depth(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
{
// Forwards to depth<Idxs...>() applied to the tensor's layout; the Idxs pack comes
// from the template head, whose opening line sits in the hunk header above.
// The two signatures above are the two sides of this diff hunk (+/- markers are
// missing): a six-argument Tensor with UnnestedDescriptorType, NumVectors and
// ScalarPerVector, and a four-argument Tensor with UnrolledDescriptorType.
return depth<Idxs...>(tensor.GetLayout());
}
@@ -240,15 +209,9 @@ __host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace,
template <MemoryTypeEnum BufferAddressSpace,
typename ElementType,
typename Shape,
typename UnnestedDescriptorType,
index_t NumVectors,
index_t ScalarPerVector>
__host__ __device__ constexpr const auto& shape(const Tensor<BufferAddressSpace,
ElementType,
Shape,
UnnestedDescriptorType,
NumVectors,
ScalarPerVector>& tensor)
typename UnrolledDescriptorType>
__host__ __device__ constexpr const auto&
shape(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
{
// Returns, by const reference, the result of shape() applied to the tensor's layout.
// The two signatures above are the two sides of this diff hunk (+/- markers are
// missing): a six-argument Tensor with UnnestedDescriptorType, NumVectors and
// ScalarPerVector, and a four-argument Tensor with UnrolledDescriptorType.
return shape(tensor.GetLayout());
}