Use ford/for instead of static_ford/static_for in threadwise copy; somehow, register spill is greatly reduced on AMD

This commit is contained in:
Chao Liu
2019-08-07 19:09:13 -05:00
parent 5636576f9b
commit bc9ea646f8
7 changed files with 122 additions and 35 deletions

View File

@@ -22,7 +22,6 @@ namespace ck {
// repeat-length on the merged dimension need to be 1. These sanity checks are performed
// in constructor of BlockwiseGenericTensorSliceCopy_v1
template <index_t BlockSize,
class Float,
class SrcDesc,
class DstDesc,
class SliceLengths,
@@ -202,8 +201,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
return GetRegisterBufferDescriptor().GetElementSpace();
}
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_buffer) const
template <class TData>
__device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
TData* __restrict__ p_buffer) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
@@ -255,7 +255,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
thread_sub_tensor_lengths,
SrcDimAccessOrder{},
Number<SrcDataPerAccess>{});
#elif 0
#elif 1
ThreadwiseGenericTensorSliceCopy_v1r1<
SrcDesc,
decltype(thread_buffer_desc),
@@ -281,8 +281,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
});
}
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_buffer,
Float* __restrict__ p_dst) const
template <class TData>
__device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
TData* __restrict__ p_dst) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
@@ -333,7 +334,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
thread_sub_tensor_lengths,
DstDimAccessOrder{},
Number<DstDataPerAccess>{});
#elif 0
#elif 1
ThreadwiseGenericTensorSliceCopy_v1r1<
decltype(thread_buffer_desc),
DstDesc,
@@ -360,9 +361,10 @@ struct BlockwiseGenericTensorSliceCopy_v1
});
}
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
template <class TData>
__device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
{
Float p_buffer[GetRegisterBufferSize()];
TData p_buffer[GetRegisterBufferSize()];
RunLoadRegisterBuffer(p_src, p_buffer);
RunStoreRegisterBuffer(p_buffer, p_dst);

View File

@@ -10,10 +10,18 @@
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
#endif
namespace ck {
// user need to make sure alignment requirement is satisfied when setting DataPerAccesss > 1
@@ -216,6 +224,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
src_vector_access_dim,
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
constexpr auto src_data_begin_id = src_access_id.Modify(
src_vector_access_dim,
@@ -239,6 +248,31 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
});
});
#else
ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
auto src_data_begin_id = src_access_id;
src_data_begin_id(src_vector_access_dim) =
src_access_id[src_vector_access_dim] * src_data_per_access;
const index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
// load vector from src
const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
// unpack vector into buffer
for(index_t i = 0; i < SrcDataPerAccess; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(src_vector_access_dim) = i;
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
}
});
#endif
}
// copy data from buffer to dst
@@ -252,6 +286,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
dst_vector_access_dim,
SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
constexpr auto dst_data_begin_id = dst_access_id.Modify(
dst_vector_access_dim,
@@ -277,6 +312,33 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
// store vector into dst
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
});
#else
ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
auto dst_data_begin_id = dst_access_id;
dst_data_begin_id(dst_vector_access_dim) =
dst_access_id[dst_vector_access_dim] * dst_data_per_access;
vector_t vector_data;
// pack vector from buffer
for(index_t i = 0; i < DstDataPerAccess; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(dst_vector_access_dim) = i;
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);
reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
}
const index_t dst_offset =
DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
// store vector into dst
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
});
#endif
}
}
@@ -373,7 +435,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
static_ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
auto long_vector_access_id) {
@@ -524,6 +586,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2
using SrcNormalDimSliceLengthsHack =
decltype((SliceLengths{} + Number<1>{}) - SrcMergedDimSliceLengthsHack{});
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
static_ford<SrcMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id_) {
constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){};
@@ -541,6 +604,21 @@ struct ThreadwiseGenericTensorSliceCopy_v2
p_buffer[buffer_offset] = p_src_tmp[src_normal_offset];
});
});
#else
ford<SrcMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id) {
const TData* p_src_tmp = p_src + (mSrcSliceOrigin + merged_dim_data_id).GetOffset();
ford<SrcNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id) {
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
const index_t src_normal_offset =
SrcDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
p_buffer[buffer_offset] = p_src_tmp[src_normal_offset];
});
});
#endif
// DstMergedDimSliceLengthsHack has entry same as SliceLengths on dst merged dimensions,
// but 1 on normal dimensions;
@@ -553,6 +631,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2
using DstNormalDimSliceLengthsHack =
decltype((SliceLengths{} + Number<1>{}) - DstMergedDimSliceLengthsHack{});
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
static_ford<DstMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id_) {
constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){};
@@ -570,6 +649,21 @@ struct ThreadwiseGenericTensorSliceCopy_v2
p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset];
});
});
#else
ford<DstMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id) {
TData* p_dst_tmp = p_dst + (mDstSliceOrigin + merged_dim_data_id).GetOffset();
ford<DstNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id) {
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
const index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset];
});
});
#endif
}
// T can be Sequence or Array