added ThreadwiseGenericTensorSliceCopy_v2r1
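The new ThreadwiseGenericTensorSliceCopy_v2r1 generalizes v2 by letting the source and destination sides each choose their own dimension-access order, vector-access dimension, and vector width, staging all data through a small register buffer so the two access patterns are fully decoupled. A minimal standalone sketch of that staging idea (plain C++ with illustrative names and a 2-D row-major layout assumed; not the CK source):

#include <cstdio>
#include <cstddef>

// Minimal sketch: a two-phase slice copy that stages the whole slice in a
// local buffer, so the read pattern from src and the write pattern to dst
// can be chosen independently, as v2r1 does.
template <std::size_t H, std::size_t W>
void threadwise_slice_copy(const float* src, std::size_t src_row_stride,
                           float* dst, std::size_t dst_row_stride)
{
    float buffer[H * W]; // packed buffer, analogous to p_buffer in Run()

    // phase 1: read src row by row (one access order)
    for(std::size_t h = 0; h < H; ++h)
        for(std::size_t w = 0; w < W; ++w)
            buffer[h * W + w] = src[h * src_row_stride + w];

    // phase 2: write dst column by column (a different access order);
    // legal only because the buffer already holds the whole slice
    for(std::size_t w = 0; w < W; ++w)
        for(std::size_t h = 0; h < H; ++h)
            dst[h * dst_row_stride + w] = buffer[h * W + w];
}

int main()
{
    float src[32], dst[32];
    for(int i = 0; i < 32; ++i) src[i] = static_cast<float>(i);
    threadwise_slice_copy<4, 8>(src, 8, dst, 8);
    std::printf("%g\n", dst[31]); // prints 31
}

Because the whole slice lands in the buffer first, the write loops are free to traverse in a different order than the read loops, which is what lets v2r1 take separate SrcDimAccessOrder and DstDimAccessOrder parameters.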
@@ -47,7 +47,8 @@ template <index_t GridSize,
 class WeiBlockCopySrcAccessOrder,
 class WeiBlockCopyDstAccessOrder,
 index_t WeiBlockCopySrcDataPerRead_E,
-index_t WeiBlockCopyDstDataPerWrite_K>
+index_t WeiBlockCopyDstDataPerWrite_K,
+index_t OutThreadCopyDataPerAccess_W>
 struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 {
 __device__ void Run(const Float* const __restrict__ p_in_global,

@@ -155,7 +156,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
 "GemmDataPerReadB alignment requirement is not satisfied");
 
-#if 1
+#if 0
 // input blockwise copy
 // slice a merged tensor, reorder and copy to a normal tensor
 // this copy operator already has blockwise offset built-in

@@ -184,7 +185,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 decltype(in_e_n1_b_n2_block_desc.GetLengths()),
 InBlockCopySubLengths_E_N1_B_N2,
 InBlockCopyClusterLengths_E_N1_B_N2,
-InBlockCopyThreadClusterArrangeOrder>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+InBlockCopyThreadClusterArrangeOrder,
+InBlockCopySrcAccessOrder,
+InBlockCopyDstAccessOrder,
+2,
+3,
+InBlockCopySrcDataPerRead_B,
+InBlockCopyDstDataPerWrite_N2>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
 #endif
 
 // weight tensor

@@ -198,7 +205,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 Sequence<EPerBlock, KPerBlock>{},
 Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
 
-#if 1
+#if 0
 // operator for blockwise copy of weight into LDS
 // slice a tensor, and copy it into another tensor
 // this copy operator already has blockwise offset built-in

@@ -227,7 +234,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 decltype(wei_e_k_block_desc.GetLengths()),
 WeiBlockCopySubLengths_E_K,
 WeiBlockCopyClusterLengths_E_K,
-WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
+WeiBlockCopyThreadClusterArrangeOrder,
+WeiBlockCopySrcAccessOrder,
+WeiBlockCopyDstAccessOrder,
+0,
+1,
+WeiBlockCopySrcDataPerRead_E,
+WeiBlockCopyDstDataPerWrite_K>({0, k_block_data_on_global}, {0, 0});
 #endif
 
 // GEMM definition

@@ -322,7 +335,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
 Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
 
-#if 1
+#if 0
 blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
 // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{},
 // True);

@@ -354,7 +367,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
 Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
 
-#if 1
+#if 0
 blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
 // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
 p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);

@@ -434,7 +447,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
-#if 1
+#if 0
 ThreadwiseGenericTensorSliceCopy_v1r2<
 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
 decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),

@@ -445,12 +458,18 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
 .Run(p_out_thread, p_out_thread_on_global);
 #elif 1
-ThreadwiseGenericTensorSliceCopy_v2<
+ThreadwiseGenericTensorSliceCopy_v2r1<
 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
 decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
 NormalTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc)>,
 MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
-decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
+decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
+arithmetic_sequence_gen<0, 8, 1>::type,
+arithmetic_sequence_gen<0, 8, 1>::type,
+7,
+7,
+1,
+1>(
 {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
 .Run(p_out_thread, p_out_thread_on_global);
 #endif

@@ -44,7 +44,8 @@ template <index_t GridSize,
 class WeiBlockCopySrcAccessOrder,
 class WeiBlockCopyDstAccessOrder,
 index_t WeiBlockCopySrcDataPerRead_E,
-index_t WeiBlockCopyDstDataPerWrite_K>
+index_t WeiBlockCopyDstDataPerWrite_K,
+index_t OutThreadCopyDataPerAccess_B>
 struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
 {
 __device__ void Run(const Float* const __restrict__ p_in_global,

@@ -138,7 +139,13 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
 decltype(in_e_b_block_desc.GetLengths()),
 InBlockCopySubLengths_E_B,
 InBlockCopyClusterLengths_E_B,
-InBlockCopyThreadClusterArrangeOrder>(
+InBlockCopyThreadClusterArrangeOrder,
+InBlockCopySrcAccessOrder,
+InBlockCopyDstAccessOrder,
+1,
+1,
+InBlockCopyDataPerAccess_B,
+InBlockCopyDataPerAccess_B>(
 {0, b_block_data_on_global}, {0, 0});
 
 // weight tensor

@@ -164,7 +171,13 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
 decltype(wei_e_k_block_desc.GetLengths()),
 WeiBlockCopySubLengths_E_K,
 WeiBlockCopyClusterLengths_E_K,
-WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
+WeiBlockCopyThreadClusterArrangeOrder,
+WeiBlockCopySrcAccessOrder,
+WeiBlockCopyDstAccessOrder,
+0,
+1,
+WeiBlockCopySrcDataPerRead_E,
+WeiBlockCopyDstDataPerWrite_K>({0, k_block_data_on_global}, {0, 0});
 
 // GEMM definition
 // c_mtx += transpose(a_mtx) * b_mtx

@@ -349,15 +362,21 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
 using OutThreadCopySliceLengths =
 Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNPerThreadSubC>;
 
-auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2<
+auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2r1<
 decltype(out_k0_k1_b_thread_desc),
 decltype(out_k0_k1_b_global_desc),
 NormalTensorCoordinate<decltype(out_k0_k1_b_thread_desc)>,
 MergedTensorCoordinate<decltype(out_k0_k1_b_global_desc)>,
-OutThreadCopySliceLengths>({0, 0, 0},
-{k_thread_data_on_global / K1,
-k_thread_data_on_global % K1,
-b_thread_data_on_global});
+OutThreadCopySliceLengths,
+arithmetic_sequence_gen<0, 3, 1>::type,
+arithmetic_sequence_gen<0, 3, 1>::type,
+2,
+2,
+OutThreadCopyDataPerAccess_B,
+OutThreadCopyDataPerAccess_B>({0, 0, 0},
+{k_thread_data_on_global / K1,
+k_thread_data_on_global % K1,
+b_thread_data_on_global});
 
 for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
 {

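The slice origin above decomposes the flat K index into the (k0, k1) coordinates of the out_k0_k1_b descriptor. A small worked check of that div/mod split (illustrative values; the real K1 comes from the GEMM decomposition):

#include <cassert>

// The origin above is {k_thread_data_on_global / K1,
// k_thread_data_on_global % K1, b_thread_data_on_global},
// i.e. the flat index splits as k = k0 * K1 + k1.
int main()
{
    const int K1 = 4;                       // illustrative
    const int k_thread_data_on_global = 13; // illustrative

    const int k0 = k_thread_data_on_global / K1; // 3
    const int k1 = k_thread_data_on_global % K1; // 1

    assert(k0 * K1 + k1 == k_thread_data_on_global);
    return 0;
}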
@@ -412,7 +412,13 @@ template <index_t BlockSize,
 class SliceLengths,
 class SubLengths,
 class ThreadClusterLengths,
-class ThreadClusterArrangeOrder>
+class ThreadClusterArrangeOrder,
+class SrcDimAccessOrder,
+class DstDimAccessOrder,
+index_t SrcVectorAccessDim,
+index_t DstVectorAccessDim,
+index_t SrcDataPerAccess,
+index_t DstDataPerAccess>
 struct BlockwiseGenericTensorSliceCopy_v2
 {
 static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

@@ -496,6 +502,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
 private:
 using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
 
+#if 0
 using ThreadwiseLoad =
 ThreadwiseGenericTensorSliceCopy_v2<SrcDesc,
 RegisterBufferDesc,

@@ -509,6 +516,33 @@ struct BlockwiseGenericTensorSliceCopy_v2
 NormalTensorCoordinate<RegisterBufferDesc>,
 DstCoordinate,
 SubLengths>;
+#else
+using ThreadwiseLoad =
+ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
+RegisterBufferDesc,
+SrcCoordinate,
+NormalTensorCoordinate<RegisterBufferDesc>,
+SubLengths,
+SrcDimAccessOrder,
+SrcDimAccessOrder,
+SrcVectorAccessDim,
+SrcVectorAccessDim,
+SrcDataPerAccess,
+1>;
+
+using ThreadwiseStore =
+ThreadwiseGenericTensorSliceCopy_v2r1<RegisterBufferDesc,
+DstDesc,
+NormalTensorCoordinate<RegisterBufferDesc>,
+DstCoordinate,
+SubLengths,
+DstDimAccessOrder,
+DstDimAccessOrder,
+DstVectorAccessDim,
+DstVectorAccessDim,
+1,
+DstDataPerAccess>;
+#endif
 ThreadwiseLoad mThreadwiseLoad;
 ThreadwiseStore mThreadwiseStore;
 };

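In the new pair above, the load leg vectorizes only the source side (SrcDataPerAccess, 1) and the store leg only the destination side (1, DstDataPerAccess): the packed register buffer in between is accessed one scalar at a time, so each leg can pick its own width. A standalone sketch of that decoupling (illustrative, not CK code; memcpy stands in for a vector load/store):

#include <cstdio>
#include <cstring>

// The staging buffer lets the read width and the write width differ:
// here the load leg moves 4 floats per access and the store leg 2.
int main()
{
    float src[8], dst[8], buffer[8];
    for(int i = 0; i < 8; ++i) src[i] = static_cast<float>(i);

    for(int i = 0; i < 8; i += 4) // load leg: width-4 accesses on src
        std::memcpy(&buffer[i], &src[i], 4 * sizeof(float));

    for(int i = 0; i < 8; i += 2) // store leg: width-2 accesses on dst
        std::memcpy(&dst[i], &buffer[i], 2 * sizeof(float));

    std::printf("%g\n", dst[7]); // prints 7
    return 0;
}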
@@ -18,6 +18,10 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
 #endif
 
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+#endif
+
 namespace ck {
 
 // This threadwise copy allows vector access of src and dst.

@@ -590,5 +594,313 @@ struct ThreadwiseGenericTensorSliceCopy_v2
 DstCoordinate mDstSliceOrigin;
 };
 
+#if 1
+// This threadwise copy allows vector access of src and dst.
+// It allows the dimensions of vector access to be different on src and dst.
+// It also allows the vector size to be different on src and dst.
+// It also allows the order of access to be different on src and dst.
+// It uses registers as a buffer to hold all data moving from src to dst.
+// It is designed for copying a small amount of data, where src and dst are
+// device memory or LDS.
+// When copying a large amount of data, let's hope the compiler will reduce the
+// registers used for the buffer.
+template <class SrcDesc,
+class DstDesc,
+class SrcCoordinate,
+class DstCoordinate,
+class SliceLengths,
+class SrcDimAccessOrder,
+class DstDimAccessOrder,
+index_t SrcVectorAccessDim,
+index_t DstVectorAccessDim,
+index_t SrcDataPerAccess,
+index_t DstDataPerAccess>
+struct ThreadwiseGenericTensorSliceCopy_v2r1
+{
+static constexpr index_t nDim = SliceLengths::GetSize();
+
+__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(SrcCoordinate src_slice_origin,
+DstCoordinate dst_slice_origin)
+: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
+{
+}
+
+__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1()
+: ThreadwiseGenericTensorSliceCopy_v2r1(make_zero_array<index_t, nDim>(),
+make_zero_array<index_t, nDim>())
+{
+}
+
+__device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin)
+{
+mSrcSliceOrigin = src_slice_origin;
+}
+
+__device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin)
+{
+mDstSliceOrigin = dst_slice_origin;
+}
+
+template <class TDesc, class Lengths>
+struct IsolateMergedDimLengths
+{
+template <class IDim>
+__device__ constexpr index_t operator()(IDim idim) const
+{
+return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1;
+}
+};
+
+template <class TData>
+__device__ void Run(const TData* p_src, TData* p_dst) const
+{
+constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
+
+TData p_buffer_[buffer_desc.GetElementSpace()];
+TData* p_buffer = p_buffer_;
+
+// copy data from src into buffer
+{
+using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+
+constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
+constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
+
+constexpr auto src_access_lengths = SliceLengths::Modify(
+src_vector_access_dim,
+SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
+
+// Offsets w.r.t. merged dimensions need to be calculated at run-time; offsets
+// w.r.t. normal dimensions are known at compile time.
+// Below is a hack to isolate merged dimension ids from normal dimension ids, so the
+// corresponding offsets can be calculated separately at run-time and compile-time.
+// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
+// merged dimensions, and has value = 1 on normal dimensions;
+// src_normal_dim_access_lengths has the same value as src_access_lengths on src's
+// normal dimensions, and has value = 1 on merged dimensions.
+constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
+nDim,
+IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};
+
+constexpr auto src_normal_dim_access_lengths =
+src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
+
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+// offset w.r.t. merged dimensions needs to be computed at run-time
+static_ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
+auto src_merged_dim_access_id_) {
+
+constexpr auto src_merged_dim_access_id = decltype(src_merged_dim_access_id_){};
+
+constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify(
+src_vector_access_dim,
+src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access);
+
+const TData* p_src_tmp =
+p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
+
+// offset w.r.t. normal dimensions can be computed at compile-time
+static_ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
+auto src_normal_dim_access_id_) {
+
+constexpr auto src_normal_dim_access_id = decltype(src_normal_dim_access_id_){};
+
+constexpr auto src_normal_dim_data_id = src_normal_dim_access_id.Modify(
+src_vector_access_dim,
+src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access);
+
+constexpr index_t src_normal_offset =
+SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
+
+// load vector from src
+const src_vector_t vector_data =
+*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
+
+// unpack vector into buffer
+static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
+constexpr auto scalar_id =
+typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
+src_vector_access_dim, i);
+
+constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
+
+p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
+});
+});
+});
+#else
+ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
+auto src_merged_dim_access_id) {
+
+auto src_merged_dim_data_id = src_merged_dim_access_id;
+src_merged_dim_data_id(src_vector_access_dim) =
+src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
+
+const TData* p_src_tmp =
+p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
+
+// these should be compile-time known
+ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
+auto src_normal_dim_access_id) {
+
+auto src_normal_dim_data_id = src_normal_dim_access_id;
+src_normal_dim_data_id(src_vector_access_dim) =
+src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access;
+
+const index_t src_normal_offset =
+SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
+
+// load vector from src
+const src_vector_t vector_data =
+*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
+
+// unpack vector into buffer
+for(index_t i = 0; i < SrcDataPerAccess; ++i)
+{
+auto scalar_id = make_zero_array<index_t, nDim>();
+scalar_id(src_vector_access_dim) = i;
+
+const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
+
+p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
+}
+});
+});
+#endif
+}
+
+// copy data from buffer into dst
+{
+using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
+constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
+
+constexpr auto dst_access_lengths = SliceLengths::Modify(
+dst_vector_access_dim,
+SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
+
+constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
+nDim,
+IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};
+
+constexpr auto dst_normal_dim_access_lengths =
+dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;
+
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+// offset w.r.t. merged dimensions needs to be computed at run-time
+static_ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
+auto dst_merged_dim_access_id_) {
+
+constexpr auto dst_merged_dim_access_id = decltype(dst_merged_dim_access_id_){};
+
+constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify(
+dst_vector_access_dim,
+dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
+
+TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
+
+// offset w.r.t. normal dimensions can be computed at compile-time
+static_ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
+auto dst_normal_dim_access_id_) {
+constexpr auto dst_normal_dim_access_id = decltype(dst_normal_dim_access_id_){};
+
+constexpr auto dst_normal_dim_data_id = dst_normal_dim_access_id.Modify(
+dst_vector_access_dim,
+dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
+
+dst_vector_t vector_data;
+
+// pack vector from buffer
+static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
+constexpr auto scalar_id =
+typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
+dst_vector_access_dim, i);
+
+constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
+
+reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
+});
+
+constexpr index_t dst_normal_offset =
+DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
+
+// write vector into dst
+*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
+});
+});
+#else
+// offset w.r.t. merged dimensions needs to be computed at run-time
+ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
+auto dst_merged_dim_access_id) {
+
+auto dst_merged_dim_data_id = dst_merged_dim_access_id;
+dst_merged_dim_data_id(dst_vector_access_dim) =
+dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
+
+TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
+
+// offset w.r.t. normal dimensions can be computed at compile-time
+ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
+auto dst_normal_dim_access_id) {
+
+auto dst_normal_dim_data_id = dst_normal_dim_access_id;
+dst_normal_dim_data_id(dst_vector_access_dim) =
+dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
+
+dst_vector_t vector_data;
+
+// pack vector from buffer
+for(index_t i = 0; i < DstDataPerAccess; ++i)
+{
+auto scalar_id = make_zero_array<index_t, nDim>();
+scalar_id(dst_vector_access_dim) = i;
+
+const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
+
+reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
+}
+
+const index_t dst_normal_offset =
+DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
+
+// write vector into dst
+*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
+});
+});
+#endif
+}
+}
+
+// T can be Sequence or Array
+template <class T, bool PositiveDirection>
+__device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+{
+static_if<PositiveDirection>{}([&](auto) {
+mSrcSliceOrigin += step_sizes;
+}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
+}
+
+template <class T, bool PositiveDirection>
+__device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+{
+static_if<PositiveDirection>{}([&](auto) {
+mDstSliceOrigin += step_sizes;
+}).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
+}
+
+private:
+SrcCoordinate mSrcSliceOrigin;
+DstCoordinate mDstSliceOrigin;
+};
+#endif
+
 } // namespace ck
 #endif

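The access-length arithmetic in Run() divides the slice length along the vector-access dimension by the data-per-access width, so the loops iterate over vector transactions rather than scalars. A worked example under the divisibility assumption the code relies on (illustrative lengths):

#include <cassert>
#include <cstddef>

// stand-ins for SliceLengths, SrcVectorAccessDim, SrcDataPerAccess
int main()
{
    std::size_t slice_lengths[2] = {4, 8};
    const std::size_t vector_access_dim = 1;
    const std::size_t data_per_access = 4;

    // src_access_lengths = SliceLengths with the vectorized dimension
    // divided: {4, 8 / 4} = {4, 2}
    std::size_t access_lengths[2] = {slice_lengths[0], slice_lengths[1]};
    access_lengths[vector_access_dim] /= data_per_access;
    assert(access_lengths[0] == 4 && access_lengths[1] == 2);

    // access id {i, j} maps back to data id {i, 4 * j};
    // 4 * 2 = 8 vector loads cover all 32 scalars of the slice
    return 0;
}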
@@ -11,6 +11,7 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
 
 namespace ck {
 

@@ -13,6 +13,7 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
 
 namespace ck {
 

@@ -3,7 +3,7 @@
 #include "device.hpp"
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+//#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"
 
 template <class T,

@@ -94,6 +94,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
 
 constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
 constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+constexpr index_t OutThreadCopyDataPerAccess_W = 1;
 #elif 1
 // each thread holds 64 data
 constexpr index_t BlockSize = 256;

@@ -214,7 +216,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
 WeiBlockCopySrcAccessOrder,
 WeiBlockCopyDstAccessOrder,
 WeiBlockCopySrcDataPerRead_E,
-WeiBlockCopyDstDataPerWrite_K>{};
+WeiBlockCopyDstDataPerWrite_K,
+OutThreadCopyDataPerAccess_W>{};
 
 float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
 dim3(GridSize),

@@ -3,7 +3,7 @@
 #include "device.hpp"
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+//#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
 
 using namespace ck;

@@ -55,7 +55,6 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
 out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
 
 #if 1
-// 1x1 filter, 8x8 image
 constexpr index_t BlockSize = 256;
 
 constexpr index_t BPerBlock = 128;

@@ -86,8 +85,45 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
 using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
 using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
 
-constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
+constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
 constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+constexpr index_t OutThreadCopyDataPerAccess_B = 1;
+#elif 0 // debug
+constexpr index_t BlockSize = 256;
+
+constexpr index_t BPerBlock = 128;
+constexpr index_t KPerBlock = 128;
+constexpr index_t EPerBlock = 8;
+
+constexpr index_t GemmMPerThreadSubC = 4;
+constexpr index_t GemmNPerThreadSubC = 4;
+constexpr index_t GemmMLevel0Cluster = 4;
+constexpr index_t GemmNLevel0Cluster = 4;
+constexpr index_t GemmMLevel1Cluster = 4;
+constexpr index_t GemmNLevel1Cluster = 4;
+constexpr index_t GemmKPerThreadLoop = 1;
+constexpr index_t GemmDataPerReadA = 4;
+constexpr index_t GemmDataPerReadB = 4;
+
+using InBlockCopySubLengths_E_B = Sequence<1, 4>;
+using InBlockCopyClusterLengths_E_B = Sequence<8, 32>;
+using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
+using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
+using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
+
+constexpr index_t InBlockCopyDataPerAccess_B = 1;
+
+using WeiBlockCopySubLengths_E_K = Sequence<4, 1>;
+using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
+using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
+using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
+
+constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
+constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+constexpr index_t OutThreadCopyDataPerAccess_B = 1;
 #elif 1
 // 1x1 filter, 8x8 image
 constexpr index_t BlockSize = 256;

@@ -106,13 +142,13 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
 constexpr index_t GemmDataPerReadA = 4;
 constexpr index_t GemmDataPerReadB = 4;
 
-using InBlockCopySubLengths_E_B = Sequence<2, 2>;
-using InBlockCopyClusterLengths_E_B = Sequence<4, 64>;
+using InBlockCopySubLengths_E_B = Sequence<1, 4>;
+using InBlockCopyClusterLengths_E_B = Sequence<8, 32>;
 using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
 using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
 using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
 
-constexpr index_t InBlockCopyDataPerAccess_B = 1;
+constexpr index_t InBlockCopyDataPerAccess_B = 4;
 
 using WeiBlockCopySubLengths_E_K = Sequence<4, 1>;
 using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;

@@ -120,8 +156,10 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
 using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
 using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
 
-constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
+constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
 constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+constexpr index_t OutThreadCopyDataPerAccess_B = 4;
 #endif
 
 constexpr index_t B = N * Ho * Wo;

@@ -169,7 +207,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
 WeiBlockCopySrcAccessOrder,
 WeiBlockCopyDstAccessOrder,
 WeiBlockCopySrcDataPerRead_E,
-WeiBlockCopyDstDataPerWrite_K>{};
+WeiBlockCopyDstDataPerWrite_K,
+OutThreadCopyDataPerAccess_B>{};
 
 for(index_t i = 0; i < nrepeat; ++i)
 {