mirror of https://github.com/ROCm/composable_kernel.git

adding ThreadwiseGenericTensorSliceCopy_v1r2
@@ -449,18 +449,28 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
                arithmetic_sequence_gen<0, 8, 1>::type{},
                Number<1>{});
-#elif 1
-            ThreadwiseGenericTensorSliceCopy_v1<
+#elif 0
+            ThreadwiseGenericTensorSliceCopy_v1r1<
                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
                decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
                arithmetic_sequence_gen<0, 8, 1>::type,
                arithmetic_sequence_gen<0, 8, 1>::type,
                0,
                0,
                7,
                7,
                1,
-                1>({0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
+                1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
                .Run(p_out_thread, p_out_thread_on_global);
+#elif 1
+            ThreadwiseGenericTensorSliceCopy_v1r2<
+                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
+                decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
+                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
+                arithmetic_sequence_gen<0, 8, 1>::type,
+                7,
+                1,
+                1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
+                .Run_non_static(p_out_thread, p_out_thread_on_global);
#elif 0
            ThreadwiseGenericTensorSliceCopy_v2<
                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),

@@ -245,7 +245,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
        // dimensions need to be evenly divisible by their sub-lengths. Also, the repeat-length on
        // the merged dimension needs to be 1. These sanity checks are performed in the constructor
        // of BlockwiseGenericTensorSliceCopy_v1
-#if 0 // debug
+#if 0
            threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
                                                    p_src + src_offset + mThreadSrcOffset,
                                                    make_zero_array<index_t, nDim>(),
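The sanity check named in the comment above is a compile-time divisibility test: each slice length must split exactly into per-thread sub-lengths, so the thread cluster tiles the slice with no remainder. A minimal standalone sketch of such a check (hypothetical names, not this library's API):

    template <int SliceLen, int SubLen>
    struct check_evenly_divisible
    {
        // a slice of SliceLen elements is copied SubLen elements at a time;
        // the division must be exact, or some elements would be skipped or
        // copied twice
        static_assert(SliceLen % SubLen == 0, "wrong! cannot evenly divide");
        static constexpr int repeat = SliceLen / SubLen; // copies per thread
    };

    // e.g. a 16-wide slice with 4-wide sub-lengths repeats 4 times
    static_assert(check_evenly_divisible<16, 4>::repeat == 4, "");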
@@ -255,18 +255,28 @@ struct BlockwiseGenericTensorSliceCopy_v1
                                                    thread_sub_tensor_lengths,
                                                    SrcDimAccessOrder{},
                                                    Number<SrcDataPerAccess>{});
-#else
-            ThreadwiseGenericTensorSliceCopy_v1<SrcDesc,
-                                                decltype(thread_buffer_desc),
-                                                SubLengths,
-                                                SrcDimAccessOrder,
-                                                typename arithmetic_sequence_gen<0, nDim, 1>::type,
-                                                SrcVectorAccessDim,
-                                                0,
-                                                SrcDataPerAccess,
-                                                1>(make_zero_array<index_t, nDim>(),
-                                                   make_zero_array<index_t, nDim>())
+#elif 0
+            ThreadwiseGenericTensorSliceCopy_v1r1<
+                SrcDesc,
+                decltype(thread_buffer_desc),
+                SubLengths,
+                SrcDimAccessOrder,
+                typename arithmetic_sequence_gen<0, nDim, 1>::type,
+                SrcVectorAccessDim,
+                0,
+                SrcDataPerAccess,
+                1>(make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
+                .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
+#elif 1
+            ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
+                                                  decltype(thread_buffer_desc),
+                                                  SubLengths,
+                                                  SrcDimAccessOrder,
+                                                  SrcVectorAccessDim,
+                                                  SrcDataPerAccess,
+                                                  1>(make_zero_array<index_t, nDim>(),
+                                                     make_zero_array<index_t, nDim>())
+                .Run_non_static(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
#endif
        });
    }
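The two instantiations above differ only in their template parameter lists: v1r2 drops the separate dst access order and dst vector-access dimension because, per the struct comments later in this diff, src and dst share one DimAccessOrder and one VectorAccessDim. Side by side, with the parameter names as declared in the two structs:

    // v1r1: separate access orders and vector dimensions for src and dst
    ThreadwiseGenericTensorSliceCopy_v1r1<SrcDesc, DstDesc, SliceLengths,
                                          SrcDimAccessOrder, DstDimAccessOrder,
                                          SrcVectorAccessDim, DstVectorAccessDim,
                                          SrcDataPerAccess, DstDataPerAccess>

    // v1r2: one shared access order and one shared vector dimension
    ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc, DstDesc, SliceLengths,
                                          DimAccessOrder, VectorAccessDim,
                                          SrcDataPerAccess, DstDataPerAccess>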
@@ -312,7 +322,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
        // By setting SubLengths = 1 at the merged dimension, this is always true;
        // If, in the future, you want to enable SubLengths > 1 at the merged dimension,
        // special care in the implementation is needed
-#if 0 // debug
+#if 0
            threadwise_generic_tensor_slice_copy_v1(thread_buffer_desc,
                                                    p_buffer + buffer_offset,
                                                    make_zero_array<index_t, nDim>(),
@@ -322,18 +332,29 @@ struct BlockwiseGenericTensorSliceCopy_v1
                                                    thread_sub_tensor_lengths,
                                                    DstDimAccessOrder{},
                                                    Number<DstDataPerAccess>{});
-#else
-            ThreadwiseGenericTensorSliceCopy_v1<decltype(thread_buffer_desc),
-                                                DstDesc,
-                                                SubLengths,
-                                                typename arithmetic_sequence_gen<0, nDim, 1>::type,
-                                                DstDimAccessOrder,
-                                                0,
-                                                DstVectorAccessDim,
-                                                1,
-                                                DstDataPerAccess>(make_zero_array<index_t, nDim>(),
-                                                                  make_zero_array<index_t, nDim>())
+#elif 0
+            ThreadwiseGenericTensorSliceCopy_v1r1<
+                decltype(thread_buffer_desc),
+                DstDesc,
+                SubLengths,
+                typename arithmetic_sequence_gen<0, nDim, 1>::type,
+                DstDimAccessOrder,
+                0,
+                DstVectorAccessDim,
+                1,
+                DstDataPerAccess>(make_zero_array<index_t, nDim>(),
+                                  make_zero_array<index_t, nDim>())
+                .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
+#elif 1
+            ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
+                                                  DstDesc,
+                                                  SubLengths,
+                                                  DstDimAccessOrder,
+                                                  DstVectorAccessDim,
+                                                  1,
+                                                  DstDataPerAccess>(make_zero_array<index_t, nDim>(),
+                                                                    make_zero_array<index_t, nDim>())
+                .Run_non_static(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
#endif
        });
    }

@@ -106,7 +106,15 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
#endif
}

#if 1
// This threadwise copy allows vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows the order of access to be different on src and dst.
// It uses registers as a buffer to hold all data moving from src to dst.
// It is designed for copying a small amount of data, where src and dst are
// device memory or LDS.
// When copying a large amount of data, let's hope the compiler will reduce the
// registers used for the buffer.
template <class SrcDesc,
          class DstDesc,
          class SliceLengths,
@@ -116,12 +124,12 @@ template <class SrcDesc,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
-struct ThreadwiseGenericTensorSliceCopy_v1
+struct ThreadwiseGenericTensorSliceCopy_v1r1
{
    static constexpr index_t nDim = SliceLengths::GetSize();

-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_slice_origin,
-                                                             Array<index_t, nDim> dst_slice_origin)
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1(
+        Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
@@ -145,7 +153,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
        static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
            [&](auto fwd) {
                static_assert(
-                    (fwd(SrcDesc{}).GetStrides()[SrcVectorAccessDim] == 1 || SrcDataPerAccess == 1),
+                    (fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
                    "wrong! vectorized access is allowed only if stride == 1");
            })
            .Else([&](auto fwd) {
@@ -158,7 +166,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
        static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
            [&](auto fwd) {
                static_assert(
-                    (fwd(DstDesc{}).GetStrides()[DstVectorAccessDim] == 1 || DstDataPerAccess == 1),
+                    (fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
                    "wrong! vectorized access is allowed only if stride == 1");
            })
            .Else([&](auto fwd) {
@@ -169,9 +177,9 @@ struct ThreadwiseGenericTensorSliceCopy_v1
        });
    }

-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1()
-        : ThreadwiseGenericTensorSliceCopy_v1(make_zero_array<index_t, nDim>(),
-                                              make_zero_array<index_t, nDim>())
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1()
+        : ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array<index_t, nDim>(),
+                                                make_zero_array<index_t, nDim>())
    {
    }

@@ -205,12 +213,12 @@ struct ThreadwiseGenericTensorSliceCopy_v1
            SliceLengths::Get(src_vector_access_dim) / src_data_per_access);

        static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
-            constexpr auto src_data_id = src_access_id.Modify(
+            constexpr auto src_data_begin_id = src_access_id.Modify(
                src_vector_access_dim,
                src_access_id[src_vector_access_dim] * src_data_per_access);

            const index_t src_offset =
-                SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_id);
+                SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);

            // load vector from src
            const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
@@ -222,7 +230,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
                    i);

                constexpr index_t buffer_offset =
-                    buffer_desc.GetOffsetFromMultiIndex(src_data_id + scalar_id);
+                    buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);

                p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
            });
@@ -241,7 +249,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
            SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);

        static_ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
-            constexpr auto dst_data_id = dst_access_id.Modify(
+            constexpr auto dst_data_begin_id = dst_access_id.Modify(
                dst_vector_access_dim,
                dst_access_id[dst_vector_access_dim] * dst_data_per_access);

@@ -254,13 +262,13 @@ struct ThreadwiseGenericTensorSliceCopy_v1
                    i);

                constexpr index_t buffer_offset =
-                    buffer_desc.GetOffsetFromMultiIndex(dst_data_id + scalar_id);
+                    buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);

                reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
            });

            const index_t dst_offset =
-                DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_id);
+                DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);

            // store vector into dst
            *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
@@ -272,7 +280,196 @@ struct ThreadwiseGenericTensorSliceCopy_v1
    Array<index_t, nDim> mSrcSliceOrigin;
    Array<index_t, nDim> mDstSliceOrigin;
};
#endif
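The comment block above describes the v1r1 data path: the whole slice is staged through a thread-private register buffer, loaded entirely from src and then stored entirely to dst. Stripped of tensor descriptors and vectorization, the idea reduces to the following sketch (plain C++; the names and flat indexing are hypothetical simplifications, not this library's API):

    #include <cstddef>

    // Minimal model of the register-buffered copy: src -> buffer -> dst.
    // The real code computes src/dst offsets through tensor descriptors and
    // moves SrcDataPerAccess / DstDataPerAccess elements per memory access.
    template <typename T, std::size_t SliceSize>
    void two_phase_copy(const T* src, T* dst)
    {
        T buffer[SliceSize]; // stays in registers for small SliceSize

        for(std::size_t i = 0; i < SliceSize; ++i) // phase 1: drain src
            buffer[i] = src[i];

        for(std::size_t i = 0; i < SliceSize; ++i) // phase 2: fill dst
            dst[i] = buffer[i];
    }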

// This threadwise copy allows vector access of src and dst.
// It allows the vector size to be different on src and dst.
// The dimensions of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// It is designed for cases where one of src and dst is registers, and
// the other is device memory or LDS.
template <class SrcDesc,
          class DstDesc,
          class SliceLengths,
          class DimAccessOrder,
          index_t VectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r2
{
    static constexpr index_t nDim = SliceLengths::GetSize();

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2(
        Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
                          nDim == DimAccessOrder::GetSize(),
                      "wrong! # of dimensions not the same");

        static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");

        static_assert(
            SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
            "wrong! cannot evenly divide");

        // check vectorized memory access
        constexpr auto vector_access_dim = Number<VectorAccessDim>{};

        static_if<!SrcDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
            static_assert(
                (fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1),
                "wrong! vectorized access is allowed only if stride == 1");
        }).Else([&](auto fwd) {
            static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
                           SrcDataPerAccess == 1),
                          "wrong! vectorized access is allowed only if stride == 1");
        });

        static_if<!DstDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
            static_assert(
                (fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1),
                "wrong! vectorized access is allowed only if stride == 1");
        }).Else([&](auto fwd) {
            static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
                           DstDataPerAccess == 1),
                          "wrong! vectorized access is allowed only if stride == 1");
        });
    }

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2()
        : ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array<index_t, nDim>(),
                                                make_zero_array<index_t, nDim>())
    {
    }

    __device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
    {
        mSrcSliceOrigin = src_slice_origin;
    }

    __device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
    {
        mDstSliceOrigin = dst_slice_origin;
    }

    template <class TData>
    __device__ void Run(const TData* p_src, TData* p_dst) const
    {
        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;

        constexpr auto vector_access_dim = Number<VectorAccessDim>{};

        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};

        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};

        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);

        static_ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
            auto long_vector_access_id) {
            // data id w.r.t. slicing-window
            constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify(
                vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size);

            // buffer to hold a long-vector
            TData p_long_vector[long_vector_size];

            // load data from src to the long-vector buffer
            static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) {
                constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
                    vector_access_dim, i * src_data_per_access);

                const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
                    mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));

                constexpr index_t buffer_offset = i * src_data_per_access;

                *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
                    *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
            });

            // store data from the long-vector buffer to dst
            static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) {
                constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
                    vector_access_dim, i * dst_data_per_access);

                constexpr index_t buffer_offset = i * dst_data_per_access;

                const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
                    mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));

                *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
                    *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
            });
        });
    }
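    // Run_non_static below follows the same load/store pattern as Run()
    // above, but walks the access space with runtime ford loops and runtime
    // (index_t) offsets instead of compile-time static_ford / static_for
    // unrolling, so it can be used when the access indices are not
    // compile-time constants.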
    template <class TData>
    __device__ void Run_non_static(const TData* p_src, TData* p_dst) const
    {
        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;

        constexpr auto vector_access_dim = Number<VectorAccessDim>{};

        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};

        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};

        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);

        ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
            [&](auto long_vector_access_id) {
                // data id w.r.t. slicing-window
                auto long_vector_data_begin_id = long_vector_access_id;
                long_vector_data_begin_id(vector_access_dim) =
                    long_vector_size * long_vector_access_id[vector_access_dim];

                // buffer to hold a long-vector
                TData p_long_vector[long_vector_size];

                // load data from src to the long-vector buffer
                for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
                {
                    auto scalar_id = make_zero_array<index_t, nDim>();
                    scalar_id(vector_access_dim) = i * src_data_per_access;

                    const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
                        mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));

                    const index_t buffer_offset = i * src_data_per_access;

                    *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
                        *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
                }

                // store data from the long-vector buffer to dst
                for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
                {
                    auto scalar_id = make_zero_array<index_t, nDim>();
                    scalar_id(vector_access_dim) = i * dst_data_per_access;

                    const index_t buffer_offset = i * dst_data_per_access;

                    const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
                        mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));

                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
                }
            });
    }

    private:
    Array<index_t, nDim> mSrcSliceOrigin;
    Array<index_t, nDim> mDstSliceOrigin;
};
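In both Run() and Run_non_static(), long_vector_size = lcm(SrcDataPerAccess, DstDataPerAccess), so one register-resident long vector is filled by whole src accesses and drained by whole dst accesses. A small self-contained illustration of that bookkeeping (plain C++; gcd/lcm here are stand-ins for math::lcm, used only for this example):

    // Stand-ins for math::lcm, for this example only.
    constexpr int gcd(int a, int b) { return b == 0 ? a : gcd(b, a % b); }
    constexpr int lcm(int a, int b) { return a / gcd(a, b) * b; }

    // Example: 4-wide src reads, 2-wide dst writes.
    constexpr int SrcDataPerAccess   = 4;
    constexpr int DstDataPerAccess   = 2;
    constexpr int long_vector_size   = lcm(SrcDataPerAccess, DstDataPerAccess); // == 4

    // Each long vector is filled by one 4-wide src load ...
    static_assert(long_vector_size / SrcDataPerAccess == 1, "one src access");
    // ... and drained by two 2-wide dst stores.
    static_assert(long_vector_size / DstDataPerAccess == 2, "two dst accesses");
    // The slice length along VectorAccessDim must divide evenly, e.g. 8 % 4 == 0,
    // which is exactly what the "cannot evenly divide" static_assert enforces.
    static_assert(8 % long_vector_size == 0, "cannot evenly divide");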

template <class SrcDesc,
          class DstDesc,

@@ -59,7 +59,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,

    constexpr index_t B = (N * Ho * Wo) / (N1 * N2);

-#if 1
+#if 0
    // each thread holds 64 data
    constexpr index_t BlockSize = 256;

@@ -112,14 +112,14 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
    constexpr index_t GemmDataPerReadA = 4;
    constexpr index_t GemmDataPerReadB = 4;

-    using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 1, 4>;
-    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 16, 1>;
+    using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 4, 1>;
+    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 4, 4>;
    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
    using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>;            // [E, N1, N2, B]
    using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>;            // [E, N1, B, N2]

-    constexpr index_t InBlockCopySrcDataPerRead_B = 1;
-    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+    constexpr index_t InBlockCopySrcDataPerRead_B = 4;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 1;

    using WeiBlockCopySubLengths_E_K = Sequence<2, 2>;
    using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>;
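As a consistency check on the input-copy change: per dimension, SubLengths × ClusterLengths gives the block tile covered in one pass. The old pair <1, 1, 1, 4> × <8, 2, 16, 1> and the new pair <1, 1, 4, 1> × <8, 2, 4, 4> both cover an [E, N1, B, N2] = [8, 2, 16, 4] tile, with 8·2·16·1 = 8·2·4·4 = 256 threads, matching BlockSize = 256. What moves is where each thread's 4 contiguous elements lie: along N2 before (hence InBlockCopyDstDataPerWrite_N2 = 4), along B after (hence InBlockCopySrcDataPerRead_B = 4), presumably trading vectorized LDS writes for vectorized global-memory reads.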