mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 09:16:52 +00:00
enabling padding for chwn format
This commit is contained in:
@@ -18,8 +18,8 @@ template <index_t GridSize,
|
||||
class InGlobalDesc,
|
||||
class WeiGlobalDesc,
|
||||
class OutGlobalDesc,
|
||||
class LowerPads,
|
||||
class UpperPads,
|
||||
class LeftPads,
|
||||
class RightPads,
|
||||
index_t NPerBlock,
|
||||
index_t KPerBlock,
|
||||
index_t CPerBlock,
|
||||
@@ -60,7 +60,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
static constexpr auto I10 = Number<10>{};
|
||||
static constexpr auto I11 = Number<11>{};
|
||||
|
||||
#if 0
|
||||
static constexpr auto True = integral_constant<bool, true>{};
|
||||
static constexpr auto False = integral_constant<bool, false>{};
|
||||
|
||||
#if 1
|
||||
__device__ void Run(const Float* const __restrict__ p_in_global,
|
||||
const Float* const __restrict__ p_wei_global,
|
||||
Float* const __restrict__ p_out_global) const
|
||||
@@ -73,14 +76,22 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
GemmNPerThreadSubC % NPerThread == 0)),
|
||||
"wrong!");
|
||||
|
||||
constexpr auto True = integral_constant<bool, true>{};
|
||||
constexpr auto False = integral_constant<bool, false>{};
|
||||
constexpr auto in_c_h_w_n_global_desc_old = InGlobalDesc{};
|
||||
constexpr auto wei_c_y_x_k_global_desc_old = WeiGlobalDesc{};
|
||||
constexpr auto out_k_h_w_n_global_desc_old = OutGlobalDesc{};
|
||||
|
||||
constexpr auto in_c_h_w_n_global_desc = InGlobalDesc{};
|
||||
constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
|
||||
constexpr auto out_k_h_w_n_global_desc = OutGlobalDesc{};
|
||||
constexpr auto in_c_h_w_n_global_desc = make_native_tensor_descriptor(
|
||||
in_c_h_w_n_global_desc_old.GetLengths(), in_c_h_w_n_global_desc_old.GetStrides());
|
||||
|
||||
constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0);
|
||||
constexpr auto wei_c_y_x_k_global_desc = make_native_tensor_descriptor(
|
||||
wei_c_y_x_k_global_desc_old.GetLengths(), wei_c_y_x_k_global_desc_old.GetStrides());
|
||||
|
||||
constexpr auto out_k_h_w_n_global_desc = make_native_tensor_descriptor(
|
||||
out_k_h_w_n_global_desc_old.GetLengths(), out_k_h_w_n_global_desc_old.GetStrides());
|
||||
|
||||
constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0);
|
||||
constexpr index_t Hi = in_c_h_w_n_global_desc.GetLength(I1);
|
||||
constexpr index_t Wi = in_c_h_w_n_global_desc.GetLength(I2);
|
||||
|
||||
constexpr index_t K = out_k_h_w_n_global_desc.GetLength(I0);
|
||||
constexpr index_t Ho = out_k_h_w_n_global_desc.GetLength(I1);
|
||||
@@ -111,11 +122,22 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
const index_t wo_block_data_begin = block_work_multi_id[2] * WoPerBlock;
|
||||
const index_t n_block_data_begin = block_work_multi_id[3] * NPerBlock;
|
||||
|
||||
const index_t hi_block_data_begin = ho_block_data_begin;
|
||||
const index_t wi_block_data_begin = wo_block_data_begin;
|
||||
const index_t hi_block_data_begin = ho_block_data_begin - LeftPads{}[0];
|
||||
const index_t wi_block_data_begin = wo_block_data_begin - LeftPads{}[1];
|
||||
|
||||
// input global tensor view
|
||||
constexpr auto in_c_hp_wp_n_global_desc = transform_tensor_descriptor(
|
||||
in_c_h_w_n_global_desc,
|
||||
make_tuple(
|
||||
PassThrough<C>{}, Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}, PassThrough<N>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
|
||||
|
||||
// global tensor view
|
||||
constexpr auto wei_c_k_global_desc = wei_c_y_x_k_global_desc.Extract(I0, I3);
|
||||
constexpr auto wei_c_k_global_desc_old = wei_c_y_x_k_global_desc_old.Extract(I0, I3);
|
||||
|
||||
constexpr auto wei_c_k_global_desc = make_native_tensor_descriptor(
|
||||
wei_c_k_global_desc_old.GetLengths(), wei_c_k_global_desc_old.GetStrides());
|
||||
|
||||
// LDS tensor view
|
||||
// be careful of alignment
|
||||
@@ -124,122 +146,81 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB);
|
||||
|
||||
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
constexpr auto in_c_h_w_n_block_desc_old = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{}, Number<max_align>{});
|
||||
|
||||
// hack
|
||||
constexpr auto in_c_h_w_n_block_desc = make_native_tensor_descriptor(
|
||||
in_c_h_w_n_block_desc_old.GetLengths(), in_c_h_w_n_block_desc_old.GetStrides());
|
||||
|
||||
// this check is ad-hoc
|
||||
// TODO: need to properly implement tensor descriptor with alignment
|
||||
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
|
||||
"GemmDataPerReadB alignment requirement is not meet");
|
||||
|
||||
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
constexpr auto wei_c_k_block_desc_old = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<CPerBlock, KPerBlock>{}, Number<max_align>{});
|
||||
|
||||
constexpr auto wei_c_1_1_k_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<CPerBlock, 1, 1, KPerBlock>{}, Number<max_align>{});
|
||||
constexpr auto wei_c_k_block_desc = make_native_tensor_descriptor(
|
||||
wei_c_k_block_desc_old.GetLengths(), wei_c_k_block_desc_old.GetStrides());
|
||||
|
||||
// LDS: be careful of alignment
|
||||
constexpr index_t in_block_space = in_c_h_w_n_block_desc.GetElementSpace();
|
||||
constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace();
|
||||
constexpr index_t in_block_space = in_c_h_w_n_block_desc_old.GetElementSpace();
|
||||
constexpr index_t wei_block_space = wei_c_k_block_desc_old.GetElementSpace();
|
||||
|
||||
__shared__ Float p_in_block[in_block_space];
|
||||
__shared__ Float p_wei_block[wei_block_space];
|
||||
|
||||
// tensor view of threadwise output in register
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
|
||||
constexpr auto out_k_h_w_n_thread_desc_old = make_ConstantTensorDescriptor_packed(
|
||||
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
|
||||
|
||||
#if 1
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_native_tensor_descriptor(
|
||||
out_k_h_w_n_thread_desc_old.GetLengths(), out_k_h_w_n_thread_desc_old.GetStrides());
|
||||
|
||||
// blockwise input copy
|
||||
// format is [C, Hi, Wi, N]
|
||||
auto blockwise_in_copy =
|
||||
BlockwiseGenericTensorSliceCopy_v2<BlockSize,
|
||||
decltype(in_c_h_w_n_global_desc),
|
||||
decltype(in_c_h_w_n_block_desc),
|
||||
decltype(in_c_h_w_n_block_desc.GetLengths()),
|
||||
InBlockCopySubLengths_CHWN,
|
||||
InBlockCopyClusterLengths_CHWN,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
3,
|
||||
3,
|
||||
InBlockCopyDataPerAccess_N,
|
||||
InBlockCopyDataPerAccess_N>({0, 0, 0, 0},
|
||||
{0, 0, 0, 0});
|
||||
#if 0
|
||||
BlockwiseGenericTensorSliceCopy_v2
|
||||
#else
|
||||
auto in_c_h_w_n_global = make_TensorView(in_c_h_w_n_global_desc, p_in_global);
|
||||
auto in_c_h_w_n_block = make_TensorView(in_c_h_w_n_block_desc, p_in_block);
|
||||
|
||||
auto blockwise_in_copy =
|
||||
BlockwiseGenericTensorSliceCopy_v3<BlockSize,
|
||||
decltype(in_c_h_w_n_global),
|
||||
decltype(in_c_h_w_n_block),
|
||||
decltype(in_c_h_w_n_block.GetLengths()),
|
||||
InBlockCopySubLengths_CHWN,
|
||||
InBlockCopyClusterLengths_CHWN,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
3,
|
||||
3,
|
||||
InBlockCopyDataPerAccess_N,
|
||||
InBlockCopyDataPerAccess_N>(
|
||||
in_c_h_w_n_global,
|
||||
{0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin},
|
||||
in_c_h_w_n_block,
|
||||
{0, 0, 0, 0});
|
||||
BlockwiseGenericTensorSliceCopy_v4
|
||||
#endif
|
||||
<BlockSize,
|
||||
decltype(in_c_hp_wp_n_global_desc),
|
||||
decltype(in_c_h_w_n_block_desc),
|
||||
decltype(in_c_h_w_n_block_desc.GetLengths()),
|
||||
InBlockCopySubLengths_CHWN,
|
||||
InBlockCopyClusterLengths_CHWN,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
3,
|
||||
3,
|
||||
InBlockCopyDataPerAccess_N,
|
||||
InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0});
|
||||
|
||||
#if 1
|
||||
// blockwise wei copy
|
||||
// format is [CPerBlock, KPerBlock]
|
||||
const auto blockwise_wei_copy =
|
||||
BlockwiseGenericTensorSliceCopy_v2<BlockSize,
|
||||
decltype(wei_c_k_global_desc),
|
||||
decltype(wei_c_k_block_desc),
|
||||
decltype(wei_c_k_block_desc.GetLengths()),
|
||||
WeiBlockCopySubLengths_CK,
|
||||
WeiBlockCopyClusterLengths_CK,
|
||||
Sequence<0, 1>,
|
||||
Sequence<0, 1>,
|
||||
Sequence<0, 1>,
|
||||
1,
|
||||
1,
|
||||
WeiBlockCopyDataPerAccess_K,
|
||||
WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0});
|
||||
#if 0
|
||||
BlockwiseGenericTensorSliceCopy_v2
|
||||
#else
|
||||
auto wei_c_y_x_k_global = make_TensorView(wei_c_y_x_k_global_desc, p_wei_global);
|
||||
auto wei_c_1_1_k_block = make_TensorView(wei_c_1_1_k_block_desc, p_wei_block);
|
||||
|
||||
constexpr index_t WeiBlockCopySubLengths_C = WeiBlockCopySubLengths_CK{}[0];
|
||||
constexpr index_t WeiBlockCopySubLengths_K = WeiBlockCopySubLengths_CK{}[1];
|
||||
|
||||
using WeiBlockCopySubLengths_CYXK =
|
||||
Sequence<WeiBlockCopySubLengths_C, 1, 1, WeiBlockCopySubLengths_K>;
|
||||
|
||||
constexpr index_t WeiBlockCopyClusterLengths_C = WeiBlockCopyClusterLengths_CK{}[0];
|
||||
constexpr index_t WeiBlockCopyClusterLengths_K = WeiBlockCopyClusterLengths_CK{}[1];
|
||||
|
||||
using WeiBlockCopyClusterLengths_CYXK =
|
||||
Sequence<WeiBlockCopyClusterLengths_C, 1, 1, WeiBlockCopyClusterLengths_K>;
|
||||
|
||||
auto blockwise_wei_copy =
|
||||
BlockwiseGenericTensorSliceCopy_v3<BlockSize,
|
||||
decltype(wei_c_y_x_k_global),
|
||||
decltype(wei_c_1_1_k_block),
|
||||
decltype(wei_c_1_1_k_block.GetLengths()),
|
||||
WeiBlockCopySubLengths_CYXK,
|
||||
WeiBlockCopyClusterLengths_CYXK,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
3,
|
||||
3,
|
||||
WeiBlockCopyDataPerAccess_K,
|
||||
WeiBlockCopyDataPerAccess_K>(
|
||||
wei_c_y_x_k_global, {0, 0, 0, k_block_data_begin}, wei_c_1_1_k_block, {0, 0, 0, 0});
|
||||
BlockwiseGenericTensorSliceCopy_v4
|
||||
#endif
|
||||
<BlockSize,
|
||||
decltype(wei_c_k_global_desc),
|
||||
decltype(wei_c_k_block_desc),
|
||||
decltype(wei_c_k_block_desc.GetLengths()),
|
||||
WeiBlockCopySubLengths_CK,
|
||||
WeiBlockCopyClusterLengths_CK,
|
||||
Sequence<0, 1>,
|
||||
Sequence<0, 1>,
|
||||
Sequence<0, 1>,
|
||||
1,
|
||||
1,
|
||||
WeiBlockCopyDataPerAccess_K,
|
||||
WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0});
|
||||
|
||||
// a series of blockwise batched GEMM
|
||||
// C_matrix += transpose(A_matrix) * B_matrix
|
||||
@@ -283,7 +264,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
|
||||
// register
|
||||
// C++ lambda doesn't capture array, use pointer instead
|
||||
Float p_out_thread_data[out_k_h_w_n_thread_desc.GetElementSpace()];
|
||||
Float p_out_thread_data[out_k_h_w_n_thread_desc_old.GetElementSpace()];
|
||||
Float* const p_out_thread = p_out_thread_data;
|
||||
|
||||
// set threadwise output tensor to 0
|
||||
@@ -296,12 +277,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
{
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin);
|
||||
in_c_h_w_n_global_desc.CalculateOffset(
|
||||
{0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin});
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
|
||||
wei_c_y_x_k_global_desc.CalculateOffset({0, y, x, k_block_data_begin});
|
||||
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C;
|
||||
c_block_data_begin += CPerBlock,
|
||||
@@ -390,25 +371,30 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
constexpr index_t K2 = GemmMPerThreadSubC;
|
||||
constexpr index_t K1 = KPerBlock / KPerThread;
|
||||
|
||||
constexpr auto out_10d_global_desc = fwd(out_k_h_w_n_global_desc)
|
||||
.Fold(I3, Number<N1>{}, Number<N2>{})
|
||||
.Fold(I2, Number<W1>{}, Number<W2>{})
|
||||
.Fold(I0, Number<K1>{}, Number<K2>{});
|
||||
constexpr auto out_10d_global_desc_old = fwd(out_k_h_w_n_global_desc_old)
|
||||
.Fold(I3, Number<N1>{}, Number<N2>{})
|
||||
.Fold(I2, Number<W1>{}, Number<W2>{})
|
||||
.Fold(I0, Number<K1>{}, Number<K2>{});
|
||||
|
||||
constexpr auto out_10d_thread_desc = fwd(out_k_h_w_n_thread_desc)
|
||||
.Fold(I3, Number<1>{}, Number<N2>{})
|
||||
.Fold(I2, Number<W1>{}, Number<1>{})
|
||||
.Fold(I0, Number<1>{}, Number<K2>{});
|
||||
constexpr auto out_10d_global_desc = make_native_tensor_descriptor(
|
||||
out_10d_global_desc_old.GetLengths(), out_10d_global_desc_old.GetStrides());
|
||||
|
||||
Float* p_out_thread_on_global = p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin);
|
||||
constexpr auto out_10d_thread_desc_old = fwd(out_k_h_w_n_thread_desc_old)
|
||||
.Fold(I3, Number<1>{}, Number<N2>{})
|
||||
.Fold(I2, Number<W1>{}, Number<1>{})
|
||||
.Fold(I0, Number<1>{}, Number<K2>{});
|
||||
|
||||
#if 1
|
||||
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(out_10d_thread_desc),
|
||||
constexpr auto out_10d_thread_desc = make_native_tensor_descriptor(
|
||||
out_10d_thread_desc_old.GetLengths(), out_10d_thread_desc_old.GetStrides());
|
||||
|
||||
Float* p_out_thread_on_global =
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.CalculateOffset({k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin});
|
||||
|
||||
ThreadwiseGenericTensorSliceCopy_v4r2<decltype(out_10d_thread_desc),
|
||||
decltype(out_10d_global_desc),
|
||||
decltype(out_10d_thread_desc.GetLengths()),
|
||||
arithmetic_sequence_gen<0, 10, 1>::type,
|
||||
@@ -417,19 +403,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
OutThreadCopyDataPerAccess_N>(
|
||||
make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
|
||||
.Run(p_out_thread, p_out_thread_on_global);
|
||||
#elif 0
|
||||
ThreadwiseGenericTensorSliceCopy_v1r1<decltype(out_10d_thread_desc),
|
||||
decltype(out_10d_global_desc),
|
||||
decltype(out_10d_thread_desc.GetLengths()),
|
||||
arithmetic_sequence_gen<0, 10, 1>::type,
|
||||
arithmetic_sequence_gen<0, 10, 1>::type,
|
||||
9,
|
||||
9,
|
||||
OutThreadCopyDataPerAccess_N,
|
||||
OutThreadCopyDataPerAccess_N>(
|
||||
make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
|
||||
.Run(p_out_thread, p_out_thread_on_global);
|
||||
#endif
|
||||
}).Else([&](auto fwd) {
|
||||
static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
|
||||
GemmNPerThreadSubC % NPerThread == 0,
|
||||
@@ -445,27 +418,32 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
constexpr index_t K2 = GemmMPerThreadSubC;
|
||||
constexpr index_t K1 = KPerBlock / KPerThread;
|
||||
|
||||
constexpr auto out_10d_global_desc =
|
||||
fwd(out_k_h_w_n_global_desc)
|
||||
constexpr auto out_10d_global_desc_old =
|
||||
fwd(out_k_h_w_n_global_desc_old)
|
||||
.Fold(I3, Number<N1>{})
|
||||
.Fold(I2, Number<W1>{}, Number<W2>{}, Number<W3>{})
|
||||
.Fold(I0, Number<K1>{}, Number<K2>{});
|
||||
|
||||
constexpr auto out_10d_thread_desc =
|
||||
fwd(out_k_h_w_n_thread_desc)
|
||||
constexpr auto out_10d_global_desc = make_native_tensor_descriptor(
|
||||
out_10d_global_desc_old.GetLengths(), out_10d_global_desc_old.GetStrides());
|
||||
|
||||
constexpr auto out_10d_thread_desc_old =
|
||||
fwd(out_k_h_w_n_thread_desc_old)
|
||||
.Fold(I3, Number<N1>{})
|
||||
.Fold(I2, Number<W1>{}, Number<1>{}, Number<W3>{})
|
||||
.Fold(I0, Number<1>{}, Number<K2>{});
|
||||
|
||||
Float* p_out_thread_on_global = p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin);
|
||||
constexpr auto out_10d_thread_desc = make_native_tensor_descriptor(
|
||||
out_10d_thread_desc_old.GetLengths(0), out_10d_thread_desc_old.GetStrides());
|
||||
|
||||
#if 1
|
||||
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(out_10d_thread_desc),
|
||||
Float* p_out_thread_on_global =
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.CalculateOffset({k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin});
|
||||
|
||||
ThreadwiseGenericTensorSliceCopy_v4r2<decltype(out_10d_thread_desc),
|
||||
decltype(out_10d_global_desc),
|
||||
decltype(out_10d_thread_desc.GetLengths()),
|
||||
arithmetic_sequence_gen<0, 10, 1>::type,
|
||||
@@ -474,58 +452,13 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
OutThreadCopyDataPerAccess_N>(
|
||||
make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
|
||||
.Run(p_out_thread, p_out_thread_on_global);
|
||||
#elif 0
|
||||
ThreadwiseGenericTensorSliceCopy_v1r1<decltype(out_10d_thread_desc),
|
||||
decltype(out_10d_global_desc),
|
||||
decltype(out_10d_thread_desc.GetLengths()),
|
||||
arithmetic_sequence_gen<0, 10, 1>::type,
|
||||
arithmetic_sequence_gen<0, 10, 1>::type,
|
||||
9,
|
||||
9,
|
||||
OutThreadCopyDataPerAccess_N,
|
||||
OutThreadCopyDataPerAccess_N>(
|
||||
make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
|
||||
.Run(p_out_thread, p_out_thread_on_global);
|
||||
#endif
|
||||
});
|
||||
}
|
||||
#else
|
||||
#elif 0
|
||||
__device__ void Run(const Float* const __restrict__ p_in_global,
|
||||
const Float* const __restrict__ p_wei_global,
|
||||
Float* const __restrict__ p_out_global) const
|
||||
{
|
||||
#if 0
|
||||
constexpr auto a = make_tuple(true, Sequence<1>{}, index_t(99));
|
||||
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
{
|
||||
printf("[0] %d\n", a.At(I0));
|
||||
print_Sequence("[1]", a.At(I1));
|
||||
printf("[2] %lu\n", a.At(I2));
|
||||
}
|
||||
|
||||
bool flag = true;
|
||||
|
||||
auto b = make_tuple(flag, Sequence<1>{}, 99);
|
||||
|
||||
b.At(I0) = false;
|
||||
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
{
|
||||
printf("[0] %d\n", b.At(I0));
|
||||
print_Sequence("[1]", b.At(I1));
|
||||
printf("[2] %lu\n", b.At(I2));
|
||||
|
||||
printf("flag %d\n", flag);
|
||||
}
|
||||
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
{
|
||||
printf("[0] %d\n", make_tuple(true, Sequence<1>(), index_t(99)).At(I0));
|
||||
print_Sequence("[1]", make_tuple(true, Sequence<1>(), index_t(99)).At(I1));
|
||||
printf("[2] %d\n", make_tuple(true, Sequence<1>(), index_t(99)).At(I2));
|
||||
}
|
||||
#elif 1
|
||||
// create a native tensor descriptor
|
||||
constexpr auto in_c_h_w_n_global_desc =
|
||||
make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides());
|
||||
@@ -540,11 +473,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
constexpr auto in_n_c_hp_wp_global_desc = transform_tensor_descriptor(
|
||||
in_c_h_w_n_global_desc,
|
||||
make_tuple(
|
||||
Pad<Sequence<Hi, Wi>, LowerPads, UpperPads>{}, PassThrough<C>{}, PassThrough<N>{}),
|
||||
Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}, PassThrough<C>{}, PassThrough<N>{}),
|
||||
make_tuple(Sequence<1, 2>{}, Sequence<0>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<2, 3>{}, Sequence<1>{}, Sequence<0>{}));
|
||||
|
||||
#if 1
|
||||
// transformation: {n, c, hp, wp} --> {c, b}
|
||||
// {n, hp, wp} --> {b}, {c} --> {c}
|
||||
constexpr auto in_c_b_global_desc = transform_tensor_descriptor(
|
||||
@@ -553,9 +485,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
PassThrough<in_n_c_hp_wp_global_desc.GetLength(I1)>{}),
|
||||
make_tuple(Sequence<0, 2, 3>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<1>{}, Sequence<0>{}));
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
{
|
||||
// 0
|
||||
@@ -577,16 +507,55 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
|
||||
printf("in_c_b_global_desc offset: %lu\n", in_c_b_global_desc.CalculateOffset(idx2));
|
||||
}
|
||||
}
|
||||
#else
|
||||
{
|
||||
index_t c = static_cast<index_t>(threadIdx.x);
|
||||
index_t h = static_cast<index_t>(threadIdx.y);
|
||||
index_t w = static_cast<index_t>(threadIdx.z);
|
||||
__device__ void Run(const Float* const __restrict__ p_in_global,
|
||||
const Float* const __restrict__ p_wei_global,
|
||||
Float* const __restrict__ p_out_global) const
|
||||
{
|
||||
// create a native tensor descriptor
|
||||
constexpr auto in_c_h_w_n_global_desc =
|
||||
make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides());
|
||||
|
||||
p_out_global[0] = in_n_c_h_w_padded_global_desc.CalculateOffset({1, c, h, w});
|
||||
constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0);
|
||||
constexpr index_t Hi = in_c_h_w_n_global_desc.GetLength(I1);
|
||||
constexpr index_t Wi = in_c_h_w_n_global_desc.GetLength(I2);
|
||||
constexpr index_t N = in_c_h_w_n_global_desc.GetLength(I3);
|
||||
|
||||
// transformation: {c, h, w, n} --> {n, c, hp, wp}
|
||||
// {h, w} --> {hp, wp}, {c} --> {c}, {n} --> {n}
|
||||
constexpr auto in_c_hp_wp_n_global_desc = transform_tensor_descriptor(
|
||||
in_c_h_w_n_global_desc,
|
||||
make_tuple(
|
||||
PassThrough<C>{}, Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}, PassThrough<N>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
|
||||
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
{
|
||||
// 0
|
||||
print_tensor_descriptor("in_c_h_w_n_global_desc", in_c_h_w_n_global_desc);
|
||||
|
||||
// 1
|
||||
print_tensor_descriptor("in_c_hp_wp_n_global_desc", in_c_hp_wp_n_global_desc);
|
||||
|
||||
constexpr auto idx1 = MultiIndex<4>{1, 2, 3, 4};
|
||||
auto idx0 = in_c_hp_wp_n_global_desc.CalculateLowerIndex(idx1);
|
||||
|
||||
print_array("idx1: ", idx1);
|
||||
print_array("idx0: ", idx0);
|
||||
|
||||
auto coord1 = make_tensor_coordinate_v2(in_c_hp_wp_n_global_desc, idx1);
|
||||
|
||||
print_array("1: ", coord1.GetIndex());
|
||||
print_array("0: ", coord1.GetLowerCoordinate().GetIndex());
|
||||
|
||||
printf("in_c_hp_wp_n_global_desc is_in_pad: %d\n",
|
||||
coord1.IsAnyLevelIndexInPaddingArea());
|
||||
|
||||
printf("in_c_hp_wp_n_global_desc offset: %lu\n",
|
||||
in_c_hp_wp_n_global_desc.CalculateOffset(idx1));
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -22,17 +22,27 @@ struct PassThrough
|
||||
|
||||
__host__ __device__ static constexpr auto GetUpperLengths() { return Sequence<Length>{}; }
|
||||
|
||||
__host__ __device__ static constexpr auto CalculateLowerIndex(UpperIndex idx_up)
|
||||
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
|
||||
{
|
||||
return idx_up;
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr auto CalculateLowerIndexDiff(UpperIndex idx_up_diff)
|
||||
__host__ __device__ static constexpr auto
|
||||
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
|
||||
const UpperIndex& /* idx_up_old */,
|
||||
const LowerIndex& /* idx_low_old */)
|
||||
{
|
||||
return idx_up_diff;
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
|
||||
|
||||
// TODO: should this function be here? should it be specific for padding check?
|
||||
__host__ __device__ static constexpr bool
|
||||
IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
// LowLengths: Sequence<...>
|
||||
@@ -55,17 +65,39 @@ struct Pad
|
||||
return GetLowerLengths() + LeftPads{} + RightPads{};
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr auto CalculateLowerIndex(UpperIndex idx_up)
|
||||
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
|
||||
{
|
||||
return idx_up - LeftPads{};
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr auto CalculateLowerIndexDiff(UpperIndex idx_up_diff)
|
||||
__host__ __device__ static constexpr auto
|
||||
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
|
||||
const UpperIndex& /* idx_up_old */,
|
||||
const LowerIndex& /* idx_low_old */)
|
||||
{
|
||||
return idx_up_diff;
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
|
||||
|
||||
// TODO: should this function be here? should it be specific for padding check?
|
||||
__host__ __device__ constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up) const
|
||||
{
|
||||
bool flag = false;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto idim) {
|
||||
// only check if there is left-padding
|
||||
static_if<(LeftPads::At(idim) != 0)>{}(
|
||||
[&](auto) { flag = flag || idx_up[idim] < LeftPads::At(idim); });
|
||||
|
||||
// only check if there is right-padding
|
||||
static_if<(RightPads::At(idim) != 0)>{}([&](auto) {
|
||||
flag = flag || idx_up[idim] >= LeftPads::At(idim) + LowLengths::At(idim);
|
||||
});
|
||||
});
|
||||
|
||||
return flag;
|
||||
}
|
||||
};
|
||||
|
||||
// LowLengths: Sequence<...>
|
||||
@@ -124,7 +156,7 @@ struct Merge
|
||||
.PushBack(Number<1>{});
|
||||
|
||||
// calculate index in each of the dimensions in the order of their dimension
|
||||
#if 1
|
||||
#if 1 // would compile to same ISA?
|
||||
static_for<0, nDimLow - 1, 1>{}(
|
||||
lambda_CalculateLowerIndex<decltype(pseudo_low_strides)>(itmp, idx_low));
|
||||
|
||||
@@ -138,8 +170,10 @@ struct Merge
|
||||
}
|
||||
|
||||
// idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date
|
||||
__host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
|
||||
const LowerIndex& idx_low_old)
|
||||
__host__ __device__ static constexpr auto
|
||||
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
|
||||
const UpperIndex& /* idx_up_old */,
|
||||
const LowerIndex& idx_low_old)
|
||||
{
|
||||
LowerIndex idx_low_diff;
|
||||
|
||||
@@ -149,6 +183,13 @@ struct Merge
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr bool IsLinearTransform() { return false; }
|
||||
|
||||
// TODO: should this function be here? should it be specific for padding check?
|
||||
__host__ __device__ static constexpr bool
|
||||
IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
// UpLengths: Sequence<...>
|
||||
@@ -189,7 +230,10 @@ struct Unmerge
|
||||
return idx_low;
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff)
|
||||
__host__ __device__ static constexpr auto
|
||||
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
|
||||
const UpperIndex& /* idx_up_old */,
|
||||
const LowerIndex& /* idx_low_old */)
|
||||
{
|
||||
return CalculateLowerIndex(idx_up_diff);
|
||||
}
|
||||
@@ -240,7 +284,10 @@ struct Embed
|
||||
return idx_low;
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff)
|
||||
__host__ __device__ static constexpr auto
|
||||
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
|
||||
const UpperIndex& /* idx_up_old */,
|
||||
const LowerIndex& /* idx_low_old */)
|
||||
{
|
||||
LowerIndex idx_low_diff{0};
|
||||
|
||||
|
||||
@@ -3,26 +3,28 @@
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "dimension.hpp"
|
||||
#include "dimension_transform.hpp"
|
||||
#include "multi_index_transform.hpp"
|
||||
#include "tensor_descriptor.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <class NativeTensorDesc>
|
||||
template <typename TensorDesc>
|
||||
struct TensorCoordinate_v2;
|
||||
|
||||
template <typename NativeTensorDesc>
|
||||
struct NativeTensorCoordinate
|
||||
{
|
||||
using type = NativeTensorCoordinate;
|
||||
using tensor_desc_type = NativeTensorDesc;
|
||||
using Index = tensor_desc_type::Index;
|
||||
|
||||
static constexpr index_t nDim = Index::GetSize();
|
||||
using type = NativeTensorCoordinate;
|
||||
using tensor_desc_type = NativeTensorDesc;
|
||||
static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
|
||||
using Index = MultiIndex<nDim>;
|
||||
|
||||
__host__ __device__ constexpr NativeTensorCoordinate(Index idx)
|
||||
: mOffset{GetTensorDesriptor().GetOffset(idx)}
|
||||
: mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx))
|
||||
{
|
||||
}
|
||||
|
||||
template <class... Xs>
|
||||
template <typename... Xs>
|
||||
__host__ __device__ constexpr NativeTensorCoordinate(Xs... xs)
|
||||
: NativeTensorCoordinate(Index{xs...})
|
||||
{
|
||||
@@ -36,82 +38,103 @@ struct NativeTensorCoordinate
|
||||
|
||||
__host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }
|
||||
|
||||
__host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
|
||||
__host__ __device__ constexpr const Index& GetIndex() const { return mIndex; }
|
||||
|
||||
__host__ __device__ type operator+=(Index idx_diff)
|
||||
__host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; }
|
||||
|
||||
__host__ __device__ constexpr type operator+=(const Index& idx_diff)
|
||||
{
|
||||
mOffset += tensor_desc_type::GetOffsetDiff(idx_diff);
|
||||
// mIndex is updated here, but some (or all) of its entries may never be used
|
||||
mIndex += idx_diff;
|
||||
|
||||
mOffset += tensor_desc_type::CalculateOffset(idx_diff);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
__host__ __device__ type operator-=(Index idx_diff)
|
||||
__host__ __device__ constexpr type operator-=(const Index& idx_diff)
|
||||
{
|
||||
mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(idx_diff);
|
||||
// mIndex is updated here, but some (or all) of its entries may never be used
|
||||
mIndex -= idx_diff;
|
||||
|
||||
mOffset -= tensor_desc_type::CalculateOffset(idx_diff);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr type operator+(Index idx_diff) const
|
||||
__host__ __device__ constexpr type operator+(const Index& idx_diff) const
|
||||
{
|
||||
type coord = *this;
|
||||
coord += idx_diff;
|
||||
return coord;
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr type operator-(Index idx_diff) const
|
||||
__host__ __device__ constexpr type operator-(const Index& idx_diff) const
|
||||
{
|
||||
type coord = *this;
|
||||
coord -= idx_diff;
|
||||
return coord;
|
||||
}
|
||||
|
||||
// TODO: should this function be here? should it be specific for padding check?
|
||||
__host__ __device__ static constexpr bool IsAnyLevelIndexInPaddingArea() { return false; }
|
||||
|
||||
private:
|
||||
// mIndex may be saved and update, however, the value of some (or all) of its entries may
|
||||
// never be used. Compiler should be able to remove these entries as well as its calculation
|
||||
// as dead code.
|
||||
// TODO: make sure compiler indeed remove these dead code
|
||||
Index mIndex;
|
||||
index_t mOffset;
|
||||
};
|
||||
|
||||
template <class TransformedTensorDesc>
|
||||
template <typename TransformedTensorDesc>
|
||||
struct TransformedTensorCoordinate
|
||||
{
|
||||
using type = TransformedTensorCoordinate;
|
||||
using tensor_desc_type = TransformedTensorDesc;
|
||||
using Index = tensor_desc_type::UpperIndex;
|
||||
using LowerCoord =
|
||||
typename TensorCoordinate_v2<decltype(tensor_desc_type::GetLowerTensorDescriptor())>::type;
|
||||
using UpperCoord = TransformedTensorCoordinate;
|
||||
static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
|
||||
using UpperIndex = MultiIndex<nDim>;
|
||||
|
||||
using lower_coordinate_type =
|
||||
TensorCoordiante_v2<decltype(GetTensorDescriptor().GetLowerTensorDescriptor())>::type;
|
||||
|
||||
static constexpr index_t nDim = Index::GetSize();
|
||||
|
||||
__host__ __device__ constexpr TransformedTensorCoordinate(Index idx)
|
||||
: mIndex{idx}, mCoordLow{GetTensorDescriptor().GetLowerIndex(idx)}
|
||||
__host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx)
|
||||
: mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)}
|
||||
{
|
||||
}
|
||||
|
||||
template <class... Xs>
|
||||
template <typename... Xs>
|
||||
__host__ __device__ constexpr TransformedTensorCoordinate(Xs... xs)
|
||||
: TransformedTensorCoordinate(Index{xs...})
|
||||
: TransformedTensorCoordinate(UpperIndex{xs...})
|
||||
{
|
||||
}
|
||||
|
||||
template <index_t... Xs>
|
||||
__host__ __device__ constexpr TransformedTensorCoordinate(Sequence<Xs...>)
|
||||
: TransformedTensorCoordinate(Index{Xs...})
|
||||
: TransformedTensorCoordinate(UpperIndex{Xs...})
|
||||
{
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }
|
||||
|
||||
__host__ __device__ constexpr index_t GetOffset() const { return mCoordLow.GetOffset(); }
|
||||
__host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const { return mCoordLow; }
|
||||
|
||||
__host__ __device__ constexpr Index GetIndex() const { return mIndex; }
|
||||
__host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; }
|
||||
|
||||
__host__ __device__ type operator+=(Index idx_up_diff)
|
||||
__host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); }
|
||||
|
||||
__host__ __device__ constexpr const index_t& GetOffset() const
|
||||
{
|
||||
return GetLowerCoordinate().GetOffset();
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
|
||||
{
|
||||
// For transformation of multi-index difference, not all transformation functions need to
|
||||
// know the old lower-index or the old upper-index. We pass both of them to the
|
||||
// transformation function. The transformation function itself decides to use them or not.
|
||||
mCoordLow +=
|
||||
tensor_desc_type::GetLowerIndexDiff(idx_up_diff, mIndexUp, mCoordLow.GetIndex());
|
||||
mCoordLow += tensor_desc_type::CalculateLowerIndexDiff(
|
||||
idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());
|
||||
|
||||
// mIndexUp is updated here, but some (or all) of its entries may never be used
|
||||
mIndexUp += idx_up_diff;
|
||||
@@ -119,11 +142,35 @@ struct TransformedTensorCoordinate
|
||||
return *this;
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr type operator+(Index idx_up_diff) const
|
||||
__host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
|
||||
{
|
||||
type coord = *this;
|
||||
coord += idx_diff;
|
||||
return coord;
|
||||
mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff(
|
||||
idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());
|
||||
|
||||
mIndexUp -= idx_up_diff;
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
|
||||
{
|
||||
UpperCoord coord_up = *this;
|
||||
coord_up += idx_up_diff;
|
||||
return coord_up;
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
|
||||
{
|
||||
UpperCoord coord_up = *this;
|
||||
coord_up -= idx_up_diff;
|
||||
return coord_up;
|
||||
}
|
||||
|
||||
// TODO: should this function be here? should it be specific for padding check?
|
||||
__host__ __device__ constexpr bool IsAnyLevelIndexInPaddingArea() const
|
||||
{
|
||||
return tensor_desc_type::IsUpperIndexInPaddingArea(GetIndex()) ||
|
||||
mCoordLow.IsAnyLevelIndexInPaddingArea();
|
||||
}
|
||||
|
||||
private:
|
||||
@@ -131,22 +178,22 @@ struct TransformedTensorCoordinate
|
||||
// never be used. Compiler should be able to remove these entries as well as its calculation
|
||||
// as dead code.
|
||||
// TODO: make sure compiler indeed remove these dead code
|
||||
Index mIndexUp;
|
||||
lower_coordinate_type mCoordLow;
|
||||
UpperIndex mIndexUp;
|
||||
LowerCoord mCoordLow;
|
||||
};
|
||||
|
||||
template <class TensorDesc>
|
||||
template <typename TensorDesc>
|
||||
struct TensorCoordinate_v2
|
||||
{
|
||||
private:
|
||||
template <class... Ts>
|
||||
template <typename... Ts>
|
||||
__host__ __device__ static constexpr auto
|
||||
MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
|
||||
{
|
||||
return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>();
|
||||
}
|
||||
|
||||
template <class... Ts>
|
||||
template <typename... Ts>
|
||||
__host__ __device__ static constexpr auto
|
||||
MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
|
||||
{
|
||||
@@ -156,5 +203,12 @@ struct TensorCoordinate_v2
|
||||
public:
|
||||
using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
|
||||
};
|
||||
|
||||
template <typename TensorDesc>
|
||||
__host__ __device__ constexpr auto
|
||||
make_tensor_coordinate_v2(TensorDesc, MultiIndex<TensorDesc::GetNumOfDimension()> idx)
|
||||
{
|
||||
return typename TensorCoordinate_v2<TensorDesc>::type(idx);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -64,6 +64,18 @@ struct NativeTensorDescriptor
|
||||
return GetStrides(typename arithmetic_sequence_gen<0, nDim, 1>::type{});
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr index_t GetElementSize()
|
||||
{
|
||||
return accumulate_on_sequence(GetLengths(), math::multiplies<index_t>{}, Number<1>{});
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr index_t GetElementSpace()
|
||||
{
|
||||
return accumulate_on_sequence(
|
||||
(GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
|
||||
}
|
||||
|
||||
// TODO: this cannot return constepxr because of use of lambda
|
||||
__host__ __device__ static constexpr index_t CalculateOffset(const Index& idx)
|
||||
{
|
||||
index_t offset = 0;
|
||||
@@ -73,6 +85,12 @@ struct NativeTensorDescriptor
|
||||
return offset;
|
||||
}
|
||||
|
||||
// TODO: remove this
|
||||
__host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const Index& idx)
|
||||
{
|
||||
return CalculateOffset(idx);
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
|
||||
{
|
||||
index_t offset_diff = 0;
|
||||
@@ -100,6 +118,12 @@ struct NativeTensorDescriptor
|
||||
{
|
||||
return Tuple<>{};
|
||||
}
|
||||
|
||||
// TODO: should this function be here? should it be specific for padding check?
|
||||
__host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const Index& /* idx */)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
// LowerTensorDescriptor
|
||||
@@ -248,6 +272,17 @@ struct TransformedTensorDescriptor
|
||||
return GetLengths(Sequence<IDim, IDims...>{});
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr index_t GetElementSize()
|
||||
{
|
||||
return accumulate_on_sequence(GetLengths(), math::multiplies<index_t>{}, Number<1>{});
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr index_t GetElementSpace()
|
||||
{
|
||||
// TODO: Is this the correct definition for transformed tensor?
|
||||
return GetLowerTensorDescriptor().GetElementSpace();
|
||||
}
|
||||
|
||||
// TODO: right now return value is constexpr because use of non-constepxr lambda
|
||||
__host__ __device__ static constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up)
|
||||
{
|
||||
@@ -256,8 +291,8 @@ struct TransformedTensorDescriptor
|
||||
static_for<0, nTransform, 1>{}([&](auto itran) {
|
||||
constexpr auto tran = Transforms{}.At(itran);
|
||||
|
||||
auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));
|
||||
const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
|
||||
auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));
|
||||
|
||||
// this assume each lower (single) index is only assocaited with one transformation,
|
||||
// which is required for index transformation, and has been checked during constructor
|
||||
@@ -269,26 +304,29 @@ struct TransformedTensorDescriptor
|
||||
}
|
||||
|
||||
// TODO: right now return value is constexpr because use of non-constepxr lambda
|
||||
__host__ __device__ static constexpr LowerIndex
|
||||
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, const LowerIndex& idx_low_old)
|
||||
__host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
|
||||
const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
|
||||
{
|
||||
LowerIndex idx_low_diff;
|
||||
|
||||
static_for<0, nTransform, 1>{}([&](auto itran) {
|
||||
constexpr auto tran = Transforms::At(itran);
|
||||
constexpr auto tran = Transforms{}.At(itran);
|
||||
|
||||
const auto idx_up_diff_part =
|
||||
pick_array_element(idx_up_diff, UpDimensionIds::At(itran));
|
||||
pick_array_element(idx_up_diff, UpDimensionIds{}.At(itran));
|
||||
|
||||
auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds::At(itran));
|
||||
const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensionIds{}.At(itran));
|
||||
|
||||
const auto idx_low_old_part =
|
||||
pick_array_element(idx_low_old, LowDimensionIds::At(itran));
|
||||
pick_array_element(idx_low_old, LowDimensionIds{}.At(itran));
|
||||
|
||||
auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds{}.At(itran));
|
||||
|
||||
// this assume each lower (single) index is associated with only one transformation,
|
||||
// which is required for index transformation, and has been checked during constructor
|
||||
// of TransformedTensorDescriptor
|
||||
idx_low_diff_part = tran.CalculateLowerIndex(idx_up_diff_part, idx_low_old_part);
|
||||
idx_low_diff_part = tran.CalculateLowerIndexDiff(
|
||||
to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
|
||||
});
|
||||
|
||||
return idx_low_diff;
|
||||
@@ -299,6 +337,12 @@ struct TransformedTensorDescriptor
|
||||
return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
|
||||
}
|
||||
|
||||
// TODO: remove this
|
||||
__host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const UpperIndex& idx_up)
|
||||
{
|
||||
return CalculateOffset(idx_up);
|
||||
}
|
||||
|
||||
#if 0
|
||||
template <index_t IDim>
|
||||
__host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)
|
||||
@@ -321,6 +365,22 @@ struct TransformedTensorDescriptor
|
||||
// not implemented
|
||||
}
|
||||
#endif
|
||||
|
||||
// TODO: should this function be here? should it be specific for padding check?
|
||||
__host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up)
|
||||
{
|
||||
bool flag = false;
|
||||
|
||||
static_for<0, nTransform, 1>{}([&](auto itran) {
|
||||
constexpr auto tran = Transforms{}.At(itran);
|
||||
|
||||
const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
|
||||
|
||||
flag = flag || tran.IsUpperIndexInPaddingArea(to_array(idx_up_part));
|
||||
});
|
||||
|
||||
return flag;
|
||||
}
|
||||
};
|
||||
|
||||
template <index_t... Lengths, index_t... Strides>
|
||||
@@ -337,7 +397,7 @@ __host__ __device__ constexpr auto make_native_tensor_descriptor_packed(Lengths)
|
||||
Lengths::PopFront(), math::multiplies<index_t>{}, Number<1>{})
|
||||
.PushBack(Number<1>{});
|
||||
|
||||
return make_NativeTensorDescriptor(Lengths{}, strides);
|
||||
return make_native_tensor_descriptor(Lengths{}, strides);
|
||||
}
|
||||
|
||||
template <typename LowTensorDescriptor,
|
||||
|
||||
@@ -7,6 +7,8 @@
|
||||
#include "tensor_coordinate.hpp"
|
||||
#include "tensor_view.hpp"
|
||||
#include "threadwise_generic_tensor_slice_copy.hpp"
|
||||
#include "tensor_descriptor.hpp"
|
||||
#include "tensor_coordinate_v2.hpp"
|
||||
|
||||
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
|
||||
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
|
||||
@@ -418,6 +420,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
|
||||
}
|
||||
};
|
||||
|
||||
// This version use TensorCoordiante
|
||||
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
|
||||
// memory layout (ordering of dimensions) can be different between src and dst.
|
||||
template <index_t BlockSize,
|
||||
@@ -518,7 +521,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
|
||||
}
|
||||
|
||||
private:
|
||||
using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
|
||||
using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{}));
|
||||
|
||||
using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
|
||||
RegisterBufferDesc,
|
||||
@@ -544,6 +547,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
|
||||
ThreadwiseStore mThreadwiseStore;
|
||||
};
|
||||
|
||||
// this version use TensorView and TensorCoordinate
|
||||
template <index_t BlockSize,
|
||||
class SrcTensor,
|
||||
class DstTensor,
|
||||
@@ -639,25 +643,25 @@ struct BlockwiseGenericTensorSliceCopy_v3
|
||||
using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
|
||||
using ThreadBufferTensor = NormalTensorView<ThreadBufferDesc, data_type>;
|
||||
|
||||
using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3<SrcTensor,
|
||||
ThreadBufferTensor,
|
||||
SubLengths,
|
||||
SrcDimAccessOrder,
|
||||
SrcDimAccessOrder,
|
||||
SrcVectorAccessDim,
|
||||
SrcVectorAccessDim,
|
||||
SrcDataPerAccess,
|
||||
1>;
|
||||
using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3r1<SrcTensor,
|
||||
ThreadBufferTensor,
|
||||
SubLengths,
|
||||
SrcDimAccessOrder,
|
||||
SrcDimAccessOrder,
|
||||
SrcVectorAccessDim,
|
||||
SrcVectorAccessDim,
|
||||
SrcDataPerAccess,
|
||||
1>;
|
||||
|
||||
using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3<ThreadBufferTensor,
|
||||
DstTensor,
|
||||
SubLengths,
|
||||
DstDimAccessOrder,
|
||||
DstDimAccessOrder,
|
||||
DstVectorAccessDim,
|
||||
DstVectorAccessDim,
|
||||
1,
|
||||
DstDataPerAccess>;
|
||||
using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3r1<ThreadBufferTensor,
|
||||
DstTensor,
|
||||
SubLengths,
|
||||
DstDimAccessOrder,
|
||||
DstDimAccessOrder,
|
||||
DstVectorAccessDim,
|
||||
DstVectorAccessDim,
|
||||
1,
|
||||
DstDataPerAccess>;
|
||||
|
||||
data_type mpBuffer[ThreadBufferDesc::GetElementSpace()];
|
||||
|
||||
@@ -667,6 +671,125 @@ struct BlockwiseGenericTensorSliceCopy_v3
|
||||
ThreadwiseStore mThreadwiseStore;
|
||||
};
|
||||
|
||||
template <index_t BlockSize,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class SliceLengths,
|
||||
class SubLengths,
|
||||
class ThreadClusterLengths,
|
||||
class ThreadClusterArrangeOrder,
|
||||
class SrcDimAccessOrder,
|
||||
class DstDimAccessOrder,
|
||||
index_t SrcVectorAccessDim,
|
||||
index_t DstVectorAccessDim,
|
||||
index_t SrcDataPerAccess,
|
||||
index_t DstDataPerAccess>
|
||||
struct BlockwiseGenericTensorSliceCopy_v4
|
||||
{
|
||||
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
|
||||
|
||||
using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
|
||||
using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;
|
||||
|
||||
__device__ constexpr BlockwiseGenericTensorSliceCopy_v4(SrcCoord src_block_slice_origin,
|
||||
DstCoord dst_block_slice_origin)
|
||||
{
|
||||
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
|
||||
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
|
||||
nDim == SubLengths::Size() && nDim == ThreadClusterLengths::Size() &&
|
||||
nDim == ThreadClusterArrangeOrder::Size() &&
|
||||
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
|
||||
"wrong! nDim not consistent");
|
||||
|
||||
static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
|
||||
"wrong! threads should be mapped to cover entire slicing window");
|
||||
|
||||
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
|
||||
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
|
||||
|
||||
static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
|
||||
"wrong! BlockSize not consistent with ThreadClusterLengths");
|
||||
|
||||
const auto thread_cluster_id =
|
||||
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
|
||||
|
||||
const auto data_cluster_id =
|
||||
reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
|
||||
|
||||
const auto thread_data_id_begin = data_cluster_id * SubLengths{};
|
||||
|
||||
mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
|
||||
mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
|
||||
|
||||
mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
|
||||
mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
|
||||
}
|
||||
|
||||
__device__ static constexpr index_t GetRegisterBufferSize()
|
||||
{
|
||||
return RegisterBufferDesc::GetElementSpace();
|
||||
}
|
||||
|
||||
template <class TData>
|
||||
__device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
|
||||
{
|
||||
mThreadwiseLoad.Run(p_src, p_buffer);
|
||||
}
|
||||
|
||||
template <class TData>
|
||||
__device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
|
||||
{
|
||||
mThreadwiseStore.Run(p_buffer, p_dst);
|
||||
}
|
||||
|
||||
template <class TData>
|
||||
__device__ void Run(const TData* p_src, TData* p_dst) const
|
||||
{
|
||||
TData p_buffer[GetRegisterBufferSize()];
|
||||
|
||||
mThreadwiseLoad.Run(p_src, p_buffer);
|
||||
mThreadwiseStore.Run(p_buffer, p_dst);
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void
|
||||
MoveSrcSlicingWindow(T step_sizes,
|
||||
integral_constant<bool, PositiveDirection> positive_direction)
|
||||
{
|
||||
mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void
|
||||
MoveDstSlicingWindow(T step_sizes,
|
||||
integral_constant<bool, PositiveDirection> positive_direction)
|
||||
{
|
||||
mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction);
|
||||
}
|
||||
|
||||
private:
|
||||
using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{}));
|
||||
|
||||
using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<SrcDesc,
|
||||
RegisterBufferDesc,
|
||||
SubLengths,
|
||||
SrcDimAccessOrder,
|
||||
SrcVectorAccessDim,
|
||||
SrcDataPerAccess,
|
||||
1>;
|
||||
|
||||
using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<RegisterBufferDesc,
|
||||
DstDesc,
|
||||
SubLengths,
|
||||
DstDimAccessOrder,
|
||||
DstVectorAccessDim,
|
||||
1,
|
||||
DstDataPerAccess>;
|
||||
|
||||
ThreadwiseLoad mThreadwiseLoad;
|
||||
ThreadwiseStore mThreadwiseStore;
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
|
||||
@@ -6,6 +6,8 @@
|
||||
#include "ConstantMergedTensorDescriptor.hpp"
|
||||
#include "tensor_coordinate.hpp"
|
||||
#include "tensor_view.hpp"
|
||||
#include "tensor_descriptor.hpp"
|
||||
#include "tensor_coordinate_v2.hpp"
|
||||
|
||||
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
|
||||
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
|
||||
@@ -427,6 +429,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
|
||||
Array<index_t, nDim> mDstSliceOrigin;
|
||||
};
|
||||
|
||||
// This version use TensorCoordinate
|
||||
// This threadwise copy allow vector access of src and dst.
|
||||
// It allows the dimensions of vector access to be different on src and dst.
|
||||
// It also allows the vector size to be different on src and dst.
|
||||
@@ -774,6 +777,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
|
||||
DstCoordinate mDstSliceOrigin;
|
||||
};
|
||||
|
||||
// this version use TensorView and TensorCoordinate
|
||||
template <class SrcTensor,
|
||||
class DstTensor,
|
||||
class SliceLengths,
|
||||
@@ -783,7 +787,7 @@ template <class SrcTensor,
|
||||
index_t DstVectorAccessDim,
|
||||
index_t SrcDataPerAccess,
|
||||
index_t DstDataPerAccess>
|
||||
struct ThreadwiseGenericTensorSliceCopy_v3
|
||||
struct ThreadwiseGenericTensorSliceCopy_v3r1
|
||||
{
|
||||
static constexpr index_t nDim = SrcTensor::GetNumOfDimension();
|
||||
using data_type = remove_cv_t<typename SrcTensor::data_type>;
|
||||
@@ -791,10 +795,10 @@ struct ThreadwiseGenericTensorSliceCopy_v3
|
||||
using SrcCoordinate = typename SrcTensor::coordinate_type;
|
||||
using DstCoordinate = typename DstTensor::coordinate_type;
|
||||
|
||||
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v3(SrcTensor src,
|
||||
SrcCoordinate src_slice_origin,
|
||||
DstTensor dst,
|
||||
DstCoordinate dst_slice_origin)
|
||||
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1(SrcTensor src,
|
||||
SrcCoordinate src_slice_origin,
|
||||
DstTensor dst,
|
||||
DstCoordinate dst_slice_origin)
|
||||
: mSrc{src},
|
||||
mDst{dst},
|
||||
mSrcSlice{src.Slice(src_slice_origin, SliceLengths{})},
|
||||
@@ -821,8 +825,8 @@ struct ThreadwiseGenericTensorSliceCopy_v3
|
||||
"wrong! vectorized access is not allowed");
|
||||
}
|
||||
|
||||
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v3()
|
||||
: ThreadwiseGenericTensorSliceCopy_v3(
|
||||
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1()
|
||||
: ThreadwiseGenericTensorSliceCopy_v3r1(
|
||||
SrcTensor{}, SrcCoordinate{}, DstTensor{}, DstCoordinate{})
|
||||
{
|
||||
}
|
||||
@@ -940,5 +944,154 @@ struct ThreadwiseGenericTensorSliceCopy_v3
|
||||
DstSlice mDstSlice;
|
||||
};
|
||||
|
||||
// This version use multi-index transformation
|
||||
// This threadwise copy allow vector access of src and dst.
|
||||
// It allows the vector size to be different on src and dst.
|
||||
// The dimensions of vector access should be the same on src and dst.
|
||||
// The dimension access order should be the same on src and dst.
|
||||
// It is designed for cases, where one of src and dst is register, and
|
||||
// the other is device memory or LDS
|
||||
template <class SrcDesc,
|
||||
class DstDesc,
|
||||
class SliceLengths,
|
||||
class DimAccessOrder,
|
||||
index_t VectorAccessDim,
|
||||
index_t SrcDataPerAccess,
|
||||
index_t DstDataPerAccess>
|
||||
struct ThreadwiseGenericTensorSliceCopy_v4r2
|
||||
{
|
||||
static constexpr index_t nDim = SliceLengths::Size();
|
||||
using Index = MultiIndex<nDim>;
|
||||
|
||||
using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
|
||||
using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;
|
||||
|
||||
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(SrcCoord src_slice_origin,
|
||||
DstCoord dst_slice_origin)
|
||||
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
|
||||
{
|
||||
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
|
||||
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
|
||||
nDim == DimAccessOrder::Size(),
|
||||
"wrong! # of dimensions not the same");
|
||||
|
||||
static_assert(is_valid_sequence_map<DimAccessOrder>{}, "wrong! map is not valid");
|
||||
|
||||
static_assert(
|
||||
SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
|
||||
"wrong! cannot evenly divide");
|
||||
|
||||
// TODO:: sanity-check if vectorized memory access is allowed on src and dst
|
||||
}
|
||||
|
||||
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2()
|
||||
: ThreadwiseGenericTensorSliceCopy_v4r2(make_zero_array<index_t, nDim>(),
|
||||
make_zero_array<index_t, nDim>())
|
||||
{
|
||||
}
|
||||
|
||||
__device__ void SetSrcSliceOrigin(SrcCoord src_slice_origin)
|
||||
{
|
||||
mSrcSliceOrigin = src_slice_origin;
|
||||
}
|
||||
|
||||
__device__ void SetDstSliceOrigin(DstCoord dst_slice_origin)
|
||||
{
|
||||
mDstSliceOrigin = dst_slice_origin;
|
||||
}
|
||||
|
||||
template <class TData>
|
||||
__device__ void Run(const TData* p_src, TData* p_dst) const
|
||||
{
|
||||
using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
|
||||
using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
|
||||
|
||||
constexpr auto vector_access_dim = Number<VectorAccessDim>{};
|
||||
|
||||
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
|
||||
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
|
||||
|
||||
constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
|
||||
|
||||
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
|
||||
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
|
||||
|
||||
ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
|
||||
auto long_vector_access_id) {
|
||||
|
||||
// data id w.r.t slicing-window
|
||||
auto long_vector_data_begin_id = long_vector_access_id;
|
||||
long_vector_data_begin_id(vector_access_dim) =
|
||||
long_vector_size * long_vector_access_id[vector_access_dim];
|
||||
|
||||
// buffer to hold a long-vector
|
||||
TData p_long_vector[long_vector_size];
|
||||
|
||||
// set 0
|
||||
for(index_t i = 0; i < long_vector_size; ++i)
|
||||
{
|
||||
p_long_vector[i] = 0;
|
||||
}
|
||||
|
||||
// load data from src to the long-vector buffer
|
||||
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
|
||||
{
|
||||
auto scalar_id = make_zero_array<index_t, nDim>();
|
||||
scalar_id(vector_access_dim) = i * src_data_per_access;
|
||||
|
||||
const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
|
||||
|
||||
// check for padding
|
||||
// TODO: still kind of messy
|
||||
if(!src_coord.IsAnyLevelIndexInPaddingArea())
|
||||
{
|
||||
const index_t src_offset =
|
||||
(mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();
|
||||
|
||||
const index_t buffer_offset = i * src_data_per_access;
|
||||
|
||||
*reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
|
||||
*reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
|
||||
}
|
||||
}
|
||||
|
||||
// store data from the long-vector buffer to dst
|
||||
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
|
||||
{
|
||||
auto scalar_id = make_zero_array<index_t, nDim>();
|
||||
scalar_id(vector_access_dim) = i * dst_data_per_access;
|
||||
|
||||
const index_t buffer_offset = i * dst_data_per_access;
|
||||
|
||||
const index_t dst_offset =
|
||||
(mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();
|
||||
|
||||
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
|
||||
*reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
|
||||
{
|
||||
static_if<PositiveDirection>{}([&](auto) {
|
||||
mSrcSliceOrigin += step_sizes;
|
||||
}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
|
||||
{
|
||||
static_if<PositiveDirection>{}([&](auto) {
|
||||
mDstSliceOrigin += step_sizes;
|
||||
}).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
|
||||
}
|
||||
|
||||
private:
|
||||
SrcCoord mSrcSliceOrigin;
|
||||
DstCoord mDstSliceOrigin;
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
@@ -23,20 +23,9 @@ struct Array
|
||||
static_assert(sizeof...(Xs) + 1 == NSize, "wrong! size");
|
||||
}
|
||||
|
||||
#if 0
|
||||
template <typename T>
|
||||
__host__ __device__ explicit constexpr Array(const T& x)
|
||||
{
|
||||
static_assert(T::Size() == NSize, "wrong! size");
|
||||
|
||||
static_for<0, NSize, 1>{}([&](auto i){
|
||||
mData[i] = x.At(i);
|
||||
})
|
||||
}
|
||||
#endif
|
||||
|
||||
__host__ __device__ static constexpr index_t Size() { return NSize; }
|
||||
|
||||
// TODO: remove
|
||||
__host__ __device__ static constexpr index_t GetSize() { return Size(); }
|
||||
|
||||
template <index_t I>
|
||||
@@ -265,8 +254,8 @@ __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_
|
||||
return new_array;
|
||||
}
|
||||
|
||||
template <typename F, typename X, typename Y, typename Z> // emulate constepxr lambda for array
|
||||
// math
|
||||
// emulate constepxr lambda for array
|
||||
template <typename F, typename X, typename Y, typename Z>
|
||||
struct lambda_array_math
|
||||
{
|
||||
const F& f;
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <typename T, index_t NSize>
|
||||
__host__ __device__ void print_array(const char* s, Array<T, NSize> a)
|
||||
template <index_t NSize>
|
||||
__host__ __device__ void print_array(const char* s, Array<unsigned_t, NSize> a)
|
||||
{
|
||||
constexpr index_t nsize = a.GetSize();
|
||||
|
||||
@@ -89,5 +89,89 @@ __host__ __device__ void print_array(const char* s, Array<T, NSize> a)
|
||||
});
|
||||
}
|
||||
|
||||
template <index_t NSize>
|
||||
__host__ __device__ void print_array(const char* s, Array<signed_t, NSize> a)
|
||||
{
|
||||
constexpr index_t nsize = a.GetSize();
|
||||
|
||||
static_assert(nsize > 0 && nsize <= 10, "wrong!");
|
||||
|
||||
static_if<nsize == 1>{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); });
|
||||
|
||||
static_if<nsize == 2>{}([&](auto) { printf("%s size %d, {%d %d}\n", s, nsize, a[0], a[1]); });
|
||||
|
||||
static_if<nsize == 3>{}(
|
||||
[&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); });
|
||||
|
||||
static_if<nsize == 4>{}(
|
||||
[&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); });
|
||||
|
||||
static_if<nsize == 5>{}([&](auto) {
|
||||
printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
|
||||
});
|
||||
|
||||
static_if<nsize == 6>{}([&](auto) {
|
||||
printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
|
||||
});
|
||||
|
||||
static_if<nsize == 7>{}([&](auto) {
|
||||
printf("%s size %d, {%d %d %d %d %d %d %d}\n",
|
||||
s,
|
||||
nsize,
|
||||
a[0],
|
||||
a[1],
|
||||
a[2],
|
||||
a[3],
|
||||
a[4],
|
||||
a[5],
|
||||
a[6]);
|
||||
});
|
||||
|
||||
static_if<nsize == 8>{}([&](auto) {
|
||||
printf("%s size %d, {%d %d %d %d %d %d %d %d}\n",
|
||||
s,
|
||||
nsize,
|
||||
a[0],
|
||||
a[1],
|
||||
a[2],
|
||||
a[3],
|
||||
a[4],
|
||||
a[5],
|
||||
a[6],
|
||||
a[7]);
|
||||
});
|
||||
|
||||
static_if<nsize == 9>{}([&](auto) {
|
||||
printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n",
|
||||
s,
|
||||
nsize,
|
||||
a[0],
|
||||
a[1],
|
||||
a[2],
|
||||
a[3],
|
||||
a[4],
|
||||
a[5],
|
||||
a[6],
|
||||
a[7],
|
||||
a[8]);
|
||||
});
|
||||
|
||||
static_if<nsize == 10>{}([&](auto) {
|
||||
printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n",
|
||||
s,
|
||||
nsize,
|
||||
a[0],
|
||||
a[1],
|
||||
a[2],
|
||||
a[3],
|
||||
a[4],
|
||||
a[5],
|
||||
a[6],
|
||||
a[7],
|
||||
a[8],
|
||||
a[9]);
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
@@ -15,6 +15,15 @@
|
||||
|
||||
namespace ck {
|
||||
|
||||
using unsigned_t = uint32_t;
|
||||
using signed_t = int;
|
||||
|
||||
#if 0 // debug
|
||||
using index_t = unsigned_t;
|
||||
#else
|
||||
using index_t = signed_t;
|
||||
#endif
|
||||
|
||||
// For some reason, CUDA need this definition, otherwise
|
||||
// compiler won't generate optimal load and store instruction, and
|
||||
// kernel would produce wrong result, indicating the compiler fail to generate correct
|
||||
@@ -22,8 +31,6 @@ namespace ck {
|
||||
using float2_t = float2;
|
||||
using float4_t = float4;
|
||||
|
||||
using index_t = uint32_t;
|
||||
|
||||
template <class T>
|
||||
__device__ void fused_multiply_accumulate(T& d, const T& s0, const T& s1)
|
||||
{
|
||||
|
||||
@@ -537,11 +537,9 @@ struct sequence_unique_sort
|
||||
};
|
||||
|
||||
template <typename SeqMap>
|
||||
struct is_valid_sequence_map
|
||||
struct is_valid_sequence_map : is_same<typename arithmetic_sequence_gen<0, SeqMap::Size(), 1>::type,
|
||||
typename sequence_sort<SeqMap, math::less<index_t>>::type>
|
||||
{
|
||||
static constexpr bool value =
|
||||
is_same<typename arithmetic_sequence_gen<0, SeqMap::Size(), 1>::type,
|
||||
typename sequence_sort<SeqMap, math::less<index_t>>::type>{};
|
||||
};
|
||||
|
||||
template <typename SeqMap>
|
||||
|
||||
@@ -115,7 +115,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc,
|
||||
constexpr index_t OutThreadCopyDataPerAccess_N = 4;
|
||||
#endif
|
||||
|
||||
#if 0 // debug
|
||||
#if 1 // debug
|
||||
constexpr index_t GridSize =
|
||||
(N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);
|
||||
#else
|
||||
|
||||
@@ -73,25 +73,10 @@ int main(int argc, char* argv[])
|
||||
using namespace ck;
|
||||
|
||||
#if 1
|
||||
constexpr index_t N = 10;
|
||||
constexpr index_t C = 10;
|
||||
constexpr index_t HI = 10;
|
||||
constexpr index_t WI = 10;
|
||||
constexpr index_t K = 10;
|
||||
constexpr index_t Y = 1;
|
||||
constexpr index_t X = 1;
|
||||
|
||||
using ConvStrides = Sequence<1, 1>;
|
||||
using ConvDilations = Sequence<1, 1>;
|
||||
|
||||
constexpr index_t HPad = 3;
|
||||
constexpr index_t WPad = 3;
|
||||
#elif 1
|
||||
// 3x3, 34x34
|
||||
constexpr index_t N = 64;
|
||||
constexpr index_t C = 256;
|
||||
constexpr index_t HI = 34;
|
||||
constexpr index_t WI = 34;
|
||||
constexpr index_t N = 32;
|
||||
constexpr index_t C = 8;
|
||||
constexpr index_t HI = 2;
|
||||
constexpr index_t WI = 2;
|
||||
constexpr index_t K = 128;
|
||||
constexpr index_t Y = 3;
|
||||
constexpr index_t X = 3;
|
||||
@@ -99,8 +84,23 @@ int main(int argc, char* argv[])
|
||||
using ConvStrides = Sequence<1, 1>;
|
||||
using ConvDilations = Sequence<1, 1>;
|
||||
|
||||
constexpr index_t HPad = 0;
|
||||
constexpr index_t WPad = 0;
|
||||
constexpr index_t HPad = 1;
|
||||
constexpr index_t WPad = 1;
|
||||
#elif 1
|
||||
// 3x3, 34x34
|
||||
constexpr index_t N = 64;
|
||||
constexpr index_t C = 256;
|
||||
constexpr index_t HI = 32;
|
||||
constexpr index_t WI = 32;
|
||||
constexpr index_t K = 128;
|
||||
constexpr index_t Y = 3;
|
||||
constexpr index_t X = 3;
|
||||
|
||||
using ConvStrides = Sequence<1, 1>;
|
||||
using ConvDilations = Sequence<1, 1>;
|
||||
|
||||
constexpr index_t HPad = 1;
|
||||
constexpr index_t WPad = 1;
|
||||
#elif 0
|
||||
// 1x1 filter, 8x8 image
|
||||
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
|
||||
@@ -434,7 +434,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
#if 1
|
||||
#if 0
|
||||
if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
|
||||
ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user