This commit is contained in:
Chao Liu
2019-08-13 00:37:23 -05:00
parent fab2f10a55
commit 8bdaba51f8
20 changed files with 505 additions and 651 deletions

View File

@@ -563,7 +563,7 @@ struct Blockwise2dTensorCopy3
}
}
__device__ constexpr index_t GetRegisterClipboardSize() const
__device__ constexpr index_t GetRegisterBufferSize() const
{
static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
@@ -579,8 +579,8 @@ struct Blockwise2dTensorCopy3
return DataPerRead * (L0 + thread_per_d0 - 1) / thread_per_d0;
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -630,8 +630,8 @@ struct Blockwise2dTensorCopy3
}
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -681,8 +681,8 @@ struct Blockwise2dTensorCopy3
}
#if CK_USE_AMD_INLINE_ASM
__device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src,
Float* p_clipboard) const
__device__ void RunLoadRegisterBuffer_asm(const Float* __restrict__ p_src,
Float* p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -741,8 +741,8 @@ struct Blockwise2dTensorCopy3
}
}
__device__ void RunStoreRegisterClipboard_asm(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer_asm(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};

View File

@@ -237,7 +237,7 @@ struct Blockwise3dTensorCopy3
}
}
__device__ static constexpr index_t GetRegisterClipboardSize()
__device__ static constexpr index_t GetRegisterBufferSize()
{
static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
@@ -260,8 +260,8 @@ struct Blockwise3dTensorCopy3
return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2;
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -316,8 +316,8 @@ struct Blockwise3dTensorCopy3
}
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};

View File

@@ -596,7 +596,7 @@ struct Blockwise4dTensorCopy3
}
}
__device__ constexpr index_t GetRegisterClipboardSize() const
__device__ constexpr index_t GetRegisterBufferSize() const
{
static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
@@ -623,8 +623,8 @@ struct Blockwise4dTensorCopy3
return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2 * nloop_d3;
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -690,8 +690,8 @@ struct Blockwise4dTensorCopy3
}
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};

View File

@@ -420,8 +420,6 @@ struct BlockwiseGenericTensorSliceCopy_v1
template <index_t BlockSize,
class SrcDesc,
class DstDesc,
class SrcCoordinate,
class DstCoordinate,
class SliceLengths,
class SubLengths,
class ThreadClusterLengths,
@@ -436,6 +434,9 @@ struct BlockwiseGenericTensorSliceCopy_v2
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
using SrcCoordinate = typename TensorCoordinate<SrcDesc>::type;
using DstCoordinate = typename TensorCoordinate<DstDesc>::type;
__device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin,
DstCoordinate dst_block_slice_origin)
{
@@ -515,31 +516,25 @@ struct BlockwiseGenericTensorSliceCopy_v2
private:
using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
using ThreadwiseLoad =
ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
RegisterBufferDesc,
SrcCoordinate,
NormalTensorCoordinate<RegisterBufferDesc>,
SubLengths,
SrcDimAccessOrder,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcVectorAccessDim,
SrcDataPerAccess,
1>;
using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
RegisterBufferDesc,
SubLengths,
SrcDimAccessOrder,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcVectorAccessDim,
SrcDataPerAccess,
1>;
using ThreadwiseStore =
ThreadwiseGenericTensorSliceCopy_v2r1<RegisterBufferDesc,
DstDesc,
NormalTensorCoordinate<RegisterBufferDesc>,
DstCoordinate,
SubLengths,
DstDimAccessOrder,
DstDimAccessOrder,
DstVectorAccessDim,
DstVectorAccessDim,
1,
DstDataPerAccess>;
using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1<RegisterBufferDesc,
DstDesc,
SubLengths,
DstDimAccessOrder,
DstDimAccessOrder,
DstVectorAccessDim,
DstVectorAccessDim,
1,
DstDataPerAccess>;
ThreadwiseLoad mThreadwiseLoad;
ThreadwiseStore mThreadwiseStore;

View File

@@ -165,7 +165,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
#endif
}
__device__ static constexpr index_t GetRegisterClipboardSize()
__device__ static constexpr index_t GetRegisterBufferSize()
{
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
@@ -183,8 +183,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
return thread_tensor_desc.GetElementSpace();
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
@@ -219,8 +219,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
});
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
@@ -274,10 +274,10 @@ struct BlockwiseTensorSliceReorderCopy_v3
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{
Float p_clipboard[GetRegisterClipboardSize()];
Float p_clipboard[GetRegisterBufferSize()];
RunLoadRegisterClipboard(p_src, p_clipboard);
RunStoreRegisterClipboard(p_clipboard, p_dst);
RunLoadRegisterBuffer(p_src, p_clipboard);
RunStoreRegisterBuffer(p_clipboard, p_dst);
}
// this function doesn't do sanity check on whether the slicing window is out of the boundary

View File

@@ -14,10 +14,6 @@
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#endif
@@ -430,170 +426,6 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
Array<index_t, nDim> mDstSliceOrigin;
};
// Threadwise tensor-slice copy, version 2. A single thread copies a
// SliceLengths-shaped window from a SrcDesc-described tensor to a
// DstDesc-described tensor, staging the data through a thread-local buffer.
// Dimensions that the descriptor marks as "merged" (folding multiple original
// dimensions) are iterated separately from normal dimensions, so merged-dim
// offsets go through the coordinate type while normal-dim offsets come
// straight from the descriptor.
// NOTE(review): SrcCoordinate/DstCoordinate are template parameters here;
// presumably coordinate types matching SrcDesc/DstDesc — confirm at call sites.
template <class SrcDesc,
class DstDesc,
class SrcCoordinate,
class DstCoordinate,
class SliceLengths>
struct ThreadwiseGenericTensorSliceCopy_v2
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
// Construct with explicit source and destination slice origins.
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2(SrcCoordinate src_slice_origin,
DstCoordinate dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
}
// Default-construct with both slice origins at the all-zero multi-index.
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2()
: ThreadwiseGenericTensorSliceCopy_v2(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
{
}
// Reposition the source-side slicing window.
__device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
}
// Reposition the destination-side slicing window.
__device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin)
{
mDstSliceOrigin = dst_slice_origin;
}
// sequence_gen functor: keeps Seq[idim] on dimensions that TDesc reports as
// containing multiple original dimensions ("merged"), and collapses every
// other (normal) dimension to length 1. Used to split the loop nest below.
template <class TDesc, class Seq>
struct IsolateMergedDimSliceLengthsHack
{
template <class IDim>
__device__ constexpr index_t operator()(IDim idim) const
{
return TDesc::ContainMultipleOriginalDimensions(idim) ? Seq{}[idim] : 1;
}
};
// Copy the whole slice: gather src -> thread-local buffer, then scatter
// buffer -> dst. p_src/p_dst point at the tensors the slice origins index into.
template <class TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
// Thread-local staging buffer sized to hold the packed slice.
TData p_buffer_[buffer_desc.GetElementSpace()];
TData* p_buffer = p_buffer_;
// hacks to isolate merged dimension from normal dimensions, and calculate their offset
// separately
// SrcMergedDimSliceLengthsHack has entry same as SliceLengths on src merged dimensions,
// but 1 on normal dimensions;
// SrcNormalDimSliceLengthsHack has entry same as SliceLengths on src normal dimensions,
// but 1 on merged dimensions;
using SrcMergedDimSliceLengthsHack =
typename sequence_gen<SliceLengths::GetSize(),
IsolateMergedDimSliceLengthsHack<SrcDesc, SliceLengths>>::type;
using SrcNormalDimSliceLengthsHack =
decltype((SliceLengths{} + Number<1>{}) - SrcMergedDimSliceLengthsHack{});
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
// Fully unrolled compile-time loop nest: buffer/normal offsets are constexpr.
static_ford<SrcMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id_) {
constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){};
// Merged-dim offset must go through the coordinate's GetOffset().
const TData* p_src_tmp = p_src + (mSrcSliceOrigin + merged_dim_data_id).GetOffset();
static_ford<SrcNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id_) {
constexpr auto normal_dim_data_id = decltype(normal_dim_data_id_){};
constexpr index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
constexpr index_t src_normal_offset =
SrcDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
p_buffer[buffer_offset] = p_src_tmp[src_normal_offset];
});
});
#else
// Runtime loop nest; same traversal with offsets computed at runtime.
ford<SrcMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id) {
const TData* p_src_tmp = p_src + (mSrcSliceOrigin + merged_dim_data_id).GetOffset();
ford<SrcNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id) {
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
const index_t src_normal_offset =
SrcDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
p_buffer[buffer_offset] = p_src_tmp[src_normal_offset];
});
});
#endif
// DstMergedDimSliceLengthsHack has entry same as SliceLengths on dst merged dimensions,
// but 1 on normal dimensions;
// DstNormalDimSliceLengthsHack has entry same as SliceLengths on dst normal dimensions,
// but 1 on merged dimensions;
using DstMergedDimSliceLengthsHack =
typename sequence_gen<SliceLengths::GetSize(),
IsolateMergedDimSliceLengthsHack<DstDesc, SliceLengths>>::type;
using DstNormalDimSliceLengthsHack =
decltype((SliceLengths{} + Number<1>{}) - DstMergedDimSliceLengthsHack{});
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
// Compile-time scatter from the staging buffer into dst.
static_ford<DstMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id_) {
constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){};
TData* p_dst_tmp = p_dst + (mDstSliceOrigin + merged_dim_data_id).GetOffset();
static_ford<DstNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id_) {
constexpr auto normal_dim_data_id = decltype(normal_dim_data_id_){};
constexpr index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
constexpr index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset];
});
});
#else
// Runtime scatter from the staging buffer into dst.
ford<DstMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id) {
TData* p_dst_tmp = p_dst + (mDstSliceOrigin + merged_dim_data_id).GetOffset();
ford<DstNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id) {
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
const index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset];
});
});
#endif
}
// Shift the source window by step_sizes, forward or backward depending on
// the PositiveDirection compile-time flag.
// T can be Sequence or Array
template <class T, bool PositiveDirection>
__device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
{
static_if<PositiveDirection>{}([&](auto) {
mSrcSliceOrigin += step_sizes;
}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
}
// Shift the destination window by step_sizes, forward or backward depending
// on the PositiveDirection compile-time flag.
template <class T, bool PositiveDirection>
__device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
{
static_if<PositiveDirection>{}([&](auto) {
mDstSliceOrigin += step_sizes;
}).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
}
private:
// Current origins of the src/dst slicing windows.
SrcCoordinate mSrcSliceOrigin;
DstCoordinate mDstSliceOrigin;
};
// This threadwise copy allow vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
@@ -605,8 +437,6 @@ struct ThreadwiseGenericTensorSliceCopy_v2
// used for the buffer.
template <class SrcDesc,
class DstDesc,
class SrcCoordinate,
class DstCoordinate,
class SliceLengths,
class SrcDimAccessOrder,
class DstDimAccessOrder,
@@ -618,6 +448,9 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
{
static constexpr index_t nDim = SliceLengths::GetSize();
using SrcCoordinate = typename TensorCoordinate<SrcDesc>::type;
using DstCoordinate = typename TensorCoordinate<DstDesc>::type;
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(SrcCoordinate src_slice_origin,
DstCoordinate dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)