mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 09:16:52 +00:00
padding for chwn is functional
This commit is contained in:
@@ -47,27 +47,18 @@ template <index_t GridSize,
|
||||
index_t OutThreadCopyDataPerAccess_N>
|
||||
struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
{
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
static constexpr auto I2 = Number<2>{};
|
||||
static constexpr auto I3 = Number<3>{};
|
||||
static constexpr auto I4 = Number<4>{};
|
||||
static constexpr auto I5 = Number<5>{};
|
||||
static constexpr auto I6 = Number<6>{};
|
||||
static constexpr auto I7 = Number<7>{};
|
||||
static constexpr auto I8 = Number<8>{};
|
||||
static constexpr auto I9 = Number<9>{};
|
||||
static constexpr auto I10 = Number<10>{};
|
||||
static constexpr auto I11 = Number<11>{};
|
||||
|
||||
static constexpr auto True = integral_constant<bool, true>{};
|
||||
static constexpr auto False = integral_constant<bool, false>{};
|
||||
|
||||
#if 1
|
||||
__device__ void Run(const Float* const __restrict__ p_in_global,
|
||||
const Float* const __restrict__ p_wei_global,
|
||||
Float* const __restrict__ p_out_global) const
|
||||
{
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
static constexpr auto I2 = Number<2>{};
|
||||
static constexpr auto I3 = Number<3>{};
|
||||
|
||||
static constexpr auto True = integral_constant<bool, true>{};
|
||||
static constexpr auto False = integral_constant<bool, false>{};
|
||||
|
||||
// be careful of this assertion
|
||||
static_assert(
|
||||
NPerBlock % NPerThread == 0 &&
|
||||
@@ -122,8 +113,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
const index_t wo_block_data_begin = block_work_multi_id[2] * WoPerBlock;
|
||||
const index_t n_block_data_begin = block_work_multi_id[3] * NPerBlock;
|
||||
|
||||
const index_t hi_block_data_begin = ho_block_data_begin - LeftPads{}[0];
|
||||
const index_t wi_block_data_begin = wo_block_data_begin - LeftPads{}[1];
|
||||
const index_t hp_block_data_begin = ho_block_data_begin;
|
||||
const index_t wp_block_data_begin = wo_block_data_begin;
|
||||
|
||||
// input global tensor view
|
||||
constexpr auto in_c_hp_wp_n_global_desc = transform_tensor_descriptor(
|
||||
@@ -133,12 +124,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
|
||||
|
||||
// global tensor view
|
||||
constexpr auto wei_c_k_global_desc_old = wei_c_y_x_k_global_desc_old.Extract(I0, I3);
|
||||
|
||||
constexpr auto wei_c_k_global_desc = make_native_tensor_descriptor(
|
||||
wei_c_k_global_desc_old.GetLengths(), wei_c_k_global_desc_old.GetStrides());
|
||||
|
||||
// LDS tensor view
|
||||
// be careful of alignment
|
||||
constexpr index_t max_align = math::lcm(InBlockCopyDataPerAccess_N,
|
||||
@@ -158,15 +143,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
|
||||
"GemmDataPerReadB alignment requirement is not meet");
|
||||
|
||||
constexpr auto wei_c_k_block_desc_old = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<CPerBlock, KPerBlock>{}, Number<max_align>{});
|
||||
constexpr auto wei_c_1_1_k_block_desc_old = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<CPerBlock, 1, 1, KPerBlock>{}, Number<max_align>{});
|
||||
|
||||
constexpr auto wei_c_k_block_desc = make_native_tensor_descriptor(
|
||||
wei_c_k_block_desc_old.GetLengths(), wei_c_k_block_desc_old.GetStrides());
|
||||
constexpr auto wei_c_1_1_k_block_desc = make_native_tensor_descriptor(
|
||||
wei_c_1_1_k_block_desc_old.GetLengths(), wei_c_1_1_k_block_desc_old.GetStrides());
|
||||
|
||||
// LDS: be careful of alignment
|
||||
constexpr index_t in_block_space = in_c_h_w_n_block_desc_old.GetElementSpace();
|
||||
constexpr index_t wei_block_space = wei_c_k_block_desc_old.GetElementSpace();
|
||||
constexpr index_t wei_block_space = wei_c_1_1_k_block_desc_old.GetElementSpace();
|
||||
|
||||
__shared__ Float p_in_block[in_block_space];
|
||||
__shared__ Float p_wei_block[wei_block_space];
|
||||
@@ -181,46 +166,45 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
// blockwise input copy
|
||||
// format is [C, Hi, Wi, N]
|
||||
auto blockwise_in_copy =
|
||||
#if 0
|
||||
BlockwiseGenericTensorSliceCopy_v2
|
||||
#else
|
||||
BlockwiseGenericTensorSliceCopy_v4
|
||||
#endif
|
||||
<BlockSize,
|
||||
decltype(in_c_hp_wp_n_global_desc),
|
||||
decltype(in_c_h_w_n_block_desc),
|
||||
decltype(in_c_h_w_n_block_desc.GetLengths()),
|
||||
InBlockCopySubLengths_CHWN,
|
||||
InBlockCopyClusterLengths_CHWN,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
3,
|
||||
3,
|
||||
InBlockCopyDataPerAccess_N,
|
||||
InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0});
|
||||
BlockwiseGenericTensorSliceCopy_v4<BlockSize,
|
||||
decltype(in_c_hp_wp_n_global_desc),
|
||||
decltype(in_c_h_w_n_block_desc),
|
||||
decltype(in_c_h_w_n_block_desc.GetLengths()),
|
||||
InBlockCopySubLengths_CHWN,
|
||||
InBlockCopyClusterLengths_CHWN,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
3,
|
||||
3,
|
||||
InBlockCopyDataPerAccess_N,
|
||||
InBlockCopyDataPerAccess_N>(
|
||||
{0, hp_block_data_begin, wp_block_data_begin, n_block_data_begin}, {0, 0, 0, 0});
|
||||
|
||||
// blockwise wei copy
|
||||
// format is [CPerBlock, KPerBlock]
|
||||
const auto blockwise_wei_copy =
|
||||
#if 0
|
||||
BlockwiseGenericTensorSliceCopy_v2
|
||||
#else
|
||||
BlockwiseGenericTensorSliceCopy_v4
|
||||
#endif
|
||||
<BlockSize,
|
||||
decltype(wei_c_k_global_desc),
|
||||
decltype(wei_c_k_block_desc),
|
||||
decltype(wei_c_k_block_desc.GetLengths()),
|
||||
WeiBlockCopySubLengths_CK,
|
||||
WeiBlockCopyClusterLengths_CK,
|
||||
Sequence<0, 1>,
|
||||
Sequence<0, 1>,
|
||||
Sequence<0, 1>,
|
||||
1,
|
||||
1,
|
||||
WeiBlockCopyDataPerAccess_K,
|
||||
WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0});
|
||||
using WeiBlockCopySubLengths_CYXK =
|
||||
Sequence<WeiBlockCopySubLengths_CK::At(0), 1, 1, WeiBlockCopySubLengths_CK::At(1)>;
|
||||
using WeiBlockCopyClusterLengths_CYXK = Sequence<WeiBlockCopyClusterLengths_CK::At(0),
|
||||
1,
|
||||
1,
|
||||
WeiBlockCopyClusterLengths_CK::At(1)>;
|
||||
|
||||
auto blockwise_wei_copy =
|
||||
BlockwiseGenericTensorSliceCopy_v4<BlockSize,
|
||||
decltype(wei_c_y_x_k_global_desc),
|
||||
decltype(wei_c_1_1_k_block_desc),
|
||||
decltype(wei_c_1_1_k_block_desc.GetLengths()),
|
||||
WeiBlockCopySubLengths_CYXK,
|
||||
WeiBlockCopyClusterLengths_CYXK,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
Sequence<0, 1, 2, 3>,
|
||||
3,
|
||||
3,
|
||||
WeiBlockCopyDataPerAccess_K,
|
||||
WeiBlockCopyDataPerAccess_K>(
|
||||
{0, 0, 0, k_block_data_begin}, {0, 0, 0, 0});
|
||||
|
||||
// a series of blockwise batched GEMM
|
||||
// C_matrix += transpose(A_matrix) * B_matrix
|
||||
@@ -228,8 +212,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
// A_matrix[C,K] is a sub-matrix of wei_block[C,K]
|
||||
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
|
||||
// C_matrix[K,Wo*N] is a sub-matrix of out_block[K,Ho,Wo,N]
|
||||
constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor(
|
||||
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_c_k_block_desc.GetStride(I0)>{});
|
||||
constexpr auto a_c_k_block_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
|
||||
Number<KPerBlock>{},
|
||||
Number<wei_c_1_1_k_block_desc.GetStride(I0)>{});
|
||||
|
||||
constexpr auto b_c_wn_block_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
|
||||
@@ -270,39 +256,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
// set threadwise output tensor to 0
|
||||
threadwise_matrix_set_zero(c_k_wn_thread_mtx_desc, p_out_thread);
|
||||
|
||||
#if 1
|
||||
for(index_t y = 0; y < Y; ++y)
|
||||
{
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_c_h_w_n_global_desc.CalculateOffset(
|
||||
{0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin});
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.CalculateOffset({0, y, x, k_block_data_begin});
|
||||
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C;
|
||||
c_block_data_begin += CPerBlock,
|
||||
p_in_global_block_offset +=
|
||||
CPerBlock * in_c_h_w_n_global_desc.GetStride(I0),
|
||||
p_wei_global_block_offset +=
|
||||
CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0))
|
||||
{
|
||||
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block);
|
||||
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
for(index_t y = 0; y < Y; ++y)
|
||||
{
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
@@ -310,8 +263,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C;
|
||||
c_block_data_begin += CPerBlock)
|
||||
{
|
||||
blockwise_in_copy.Run();
|
||||
blockwise_wei_copy.Run();
|
||||
blockwise_in_copy.Run(p_in_global, p_in_block);
|
||||
blockwise_wei_copy.Run(p_wei_global, p_wei_block);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@@ -320,28 +273,29 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
__syncthreads();
|
||||
|
||||
// move along C
|
||||
blockwise_in_copy.MoveSrcSliceWindow(Sequence<CPerBlock, 0, 0, 0>{}, True);
|
||||
blockwise_wei_copy.MoveSrcSliceWindow(Sequence<CPerBlock, 0, 0, 0>{}, True);
|
||||
blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(CPerBlock, 0, 0, 0),
|
||||
True);
|
||||
blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(CPerBlock, 0, 0, 0),
|
||||
True);
|
||||
}
|
||||
|
||||
// reset C
|
||||
blockwise_in_copy.MoveSrcSliceWindow(Sequence<C, 0, 0, 0>{}, False);
|
||||
blockwise_wei_copy.MoveSrcSliceWindow(Sequence<C, 0, 0, 0>{}, False);
|
||||
blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(C, 0, 0, 0), False);
|
||||
blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(C, 0, 0, 0), False);
|
||||
|
||||
// move aling X
|
||||
blockwise_in_copy.MoveSrcSliceWindow(Sequence<0, 0, 1, 0>{}, True);
|
||||
blockwise_wei_copy.MoveSrcSliceWindow(Sequence<0, 0, 1, 0>{}, True);
|
||||
blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(0, 0, 1, 0), True);
|
||||
blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(0, 0, 1, 0), True);
|
||||
}
|
||||
|
||||
// reset X
|
||||
blockwise_in_copy.MoveSrcSliceWindow(Sequence<0, 0, X, 0>{}, False);
|
||||
blockwise_wei_copy.MoveSrcSliceWindow(Sequence<0, 0, X, 0>{}, False);
|
||||
blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(0, 0, X, 0), False);
|
||||
blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(0, 0, X, 0), False);
|
||||
|
||||
// move along Y
|
||||
blockwise_in_copy.MoveSrcSliceWindow(Sequence<0, 1, 0, 0>{}, False);
|
||||
blockwise_wei_copy.MoveSrcSliceWindow(Sequence<0, 1, 0, 0>{}, False);
|
||||
blockwise_in_copy.MoveSrcSliceWindow(make_multi_index(0, 1, 0, 0), True);
|
||||
blockwise_wei_copy.MoveSrcSliceWindow(make_multi_index(0, 1, 0, 0), True);
|
||||
}
|
||||
#endif
|
||||
|
||||
// output: register to global mem
|
||||
const auto c_thread_mtx_begin =
|
||||
@@ -454,110 +408,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
|
||||
.Run(p_out_thread, p_out_thread_on_global);
|
||||
});
|
||||
}
|
||||
#elif 0
|
||||
__device__ void Run(const Float* const __restrict__ p_in_global,
|
||||
const Float* const __restrict__ p_wei_global,
|
||||
Float* const __restrict__ p_out_global) const
|
||||
{
|
||||
// create a native tensor descriptor
|
||||
constexpr auto in_c_h_w_n_global_desc =
|
||||
make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides());
|
||||
|
||||
constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0);
|
||||
constexpr index_t Hi = in_c_h_w_n_global_desc.GetLength(I1);
|
||||
constexpr index_t Wi = in_c_h_w_n_global_desc.GetLength(I2);
|
||||
constexpr index_t N = in_c_h_w_n_global_desc.GetLength(I3);
|
||||
|
||||
// transformation: {c, h, w, n} --> {n, c, hp, wp}
|
||||
// {h, w} --> {hp, wp}, {c} --> {c}, {n} --> {n}
|
||||
constexpr auto in_n_c_hp_wp_global_desc = transform_tensor_descriptor(
|
||||
in_c_h_w_n_global_desc,
|
||||
make_tuple(
|
||||
Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}, PassThrough<C>{}, PassThrough<N>{}),
|
||||
make_tuple(Sequence<1, 2>{}, Sequence<0>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<2, 3>{}, Sequence<1>{}, Sequence<0>{}));
|
||||
|
||||
// transformation: {n, c, hp, wp} --> {c, b}
|
||||
// {n, hp, wp} --> {b}, {c} --> {c}
|
||||
constexpr auto in_c_b_global_desc = transform_tensor_descriptor(
|
||||
in_n_c_hp_wp_global_desc,
|
||||
make_tuple(Merge<decltype(in_n_c_hp_wp_global_desc.GetLengths(I0, I2, I3))>{},
|
||||
PassThrough<in_n_c_hp_wp_global_desc.GetLength(I1)>{}),
|
||||
make_tuple(Sequence<0, 2, 3>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<1>{}, Sequence<0>{}));
|
||||
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
{
|
||||
// 0
|
||||
print_tensor_descriptor("in_c_h_w_n_global_desc", in_c_h_w_n_global_desc);
|
||||
|
||||
// 1
|
||||
print_tensor_descriptor("in_n_c_hp_wp_global_desc", in_n_c_hp_wp_global_desc);
|
||||
|
||||
// 2
|
||||
print_tensor_descriptor("in_c_b_global_desc", in_c_b_global_desc);
|
||||
|
||||
constexpr auto idx2 = MultiIndex<2>{1, 4 * (16 * 16) + 5 * 16 + 6};
|
||||
auto idx1 = in_c_b_global_desc.CalculateLowerIndex(idx2);
|
||||
auto idx0 = in_c_b_global_desc.GetLowerTensorDescriptor().CalculateLowerIndex(idx1);
|
||||
|
||||
print_array("idx2: ", idx2);
|
||||
print_array("idx1: ", idx1);
|
||||
print_array("idx0: ", idx0);
|
||||
|
||||
printf("in_c_b_global_desc offset: %lu\n", in_c_b_global_desc.CalculateOffset(idx2));
|
||||
}
|
||||
}
|
||||
#else
|
||||
__device__ void Run(const Float* const __restrict__ p_in_global,
|
||||
const Float* const __restrict__ p_wei_global,
|
||||
Float* const __restrict__ p_out_global) const
|
||||
{
|
||||
// create a native tensor descriptor
|
||||
constexpr auto in_c_h_w_n_global_desc =
|
||||
make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides());
|
||||
|
||||
constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0);
|
||||
constexpr index_t Hi = in_c_h_w_n_global_desc.GetLength(I1);
|
||||
constexpr index_t Wi = in_c_h_w_n_global_desc.GetLength(I2);
|
||||
constexpr index_t N = in_c_h_w_n_global_desc.GetLength(I3);
|
||||
|
||||
// transformation: {c, h, w, n} --> {n, c, hp, wp}
|
||||
// {h, w} --> {hp, wp}, {c} --> {c}, {n} --> {n}
|
||||
constexpr auto in_c_hp_wp_n_global_desc = transform_tensor_descriptor(
|
||||
in_c_h_w_n_global_desc,
|
||||
make_tuple(
|
||||
PassThrough<C>{}, Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}, PassThrough<N>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
|
||||
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
{
|
||||
// 0
|
||||
print_tensor_descriptor("in_c_h_w_n_global_desc", in_c_h_w_n_global_desc);
|
||||
|
||||
// 1
|
||||
print_tensor_descriptor("in_c_hp_wp_n_global_desc", in_c_hp_wp_n_global_desc);
|
||||
|
||||
constexpr auto idx1 = MultiIndex<4>{1, 2, 3, 4};
|
||||
auto idx0 = in_c_hp_wp_n_global_desc.CalculateLowerIndex(idx1);
|
||||
|
||||
print_array("idx1: ", idx1);
|
||||
print_array("idx0: ", idx0);
|
||||
|
||||
auto coord1 = make_tensor_coordinate_v2(in_c_hp_wp_n_global_desc, idx1);
|
||||
|
||||
print_array("1: ", coord1.GetIndex());
|
||||
print_array("0: ", coord1.GetLowerCoordinate().GetIndex());
|
||||
|
||||
printf("in_c_hp_wp_n_global_desc is_in_pad: %d\n",
|
||||
coord1.IsAnyLevelIndexInPaddingArea());
|
||||
|
||||
printf("in_c_hp_wp_n_global_desc offset: %lu\n",
|
||||
in_c_hp_wp_n_global_desc.CalculateOffset(idx1));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
|
||||
@@ -304,8 +304,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
|
||||
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
|
||||
blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
|
||||
#else
|
||||
blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
|
||||
blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
|
||||
blockwise_in_copy.MoveSrcSliceWindow({EPerBlock, 0, 0, 0}, true);
|
||||
blockwise_wei_copy.MoveSrcSliceWindow({EPerBlock, 0}, true);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -303,7 +303,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
|
||||
Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
|
||||
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
|
||||
|
||||
blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
|
||||
blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
|
||||
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
|
||||
|
||||
__syncthreads();
|
||||
@@ -328,7 +328,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
|
||||
Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
|
||||
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
|
||||
|
||||
blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
|
||||
blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
|
||||
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@@ -240,8 +240,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
|
||||
|
||||
__syncthreads();
|
||||
|
||||
blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
|
||||
blockwise_wei_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
|
||||
blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
|
||||
blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
|
||||
}
|
||||
|
||||
// copy output: register to global memory
|
||||
@@ -297,9 +297,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
|
||||
{
|
||||
threadwise_out_copy.Run(p_out_thread, p_out_global);
|
||||
|
||||
threadwise_out_copy.MoveSrcSlicingWindow(Sequence<0, 0, GemmNPerThreadSubC>{},
|
||||
True);
|
||||
threadwise_out_copy.MoveDstSlicingWindow(Sequence<0, 0, B1>{}, True);
|
||||
threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True);
|
||||
threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -269,7 +269,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
|
||||
Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
|
||||
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
|
||||
|
||||
blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
|
||||
blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
|
||||
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0];
|
||||
|
||||
__syncthreads();
|
||||
@@ -294,7 +294,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
|
||||
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
|
||||
|
||||
// even iteration
|
||||
blockwise_in_copy.MoveSrcSlicingWindow(Sequence<EPerBlock, 0>{}, True);
|
||||
blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
|
||||
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0];
|
||||
|
||||
__syncthreads();
|
||||
@@ -379,9 +379,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
|
||||
{
|
||||
threadwise_out_copy.Run(p_out_thread, p_out_global);
|
||||
|
||||
threadwise_out_copy.MoveSrcSlicingWindow(Sequence<0, 0, GemmNPerThreadSubC>{},
|
||||
True);
|
||||
threadwise_out_copy.MoveDstSlicingWindow(Sequence<0, 0, B1>{}, True);
|
||||
threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True);
|
||||
threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,12 @@ namespace ck {
|
||||
template <index_t N>
|
||||
using MultiIndex = Array<index_t, N>;
|
||||
|
||||
template <typename... Xs>
|
||||
__host__ __device__ constexpr auto make_multi_index(Xs... xs)
|
||||
{
|
||||
return MultiIndex<sizeof...(Xs)>(xs...);
|
||||
}
|
||||
|
||||
template <index_t Length>
|
||||
struct PassThrough
|
||||
{
|
||||
|
||||
@@ -408,8 +408,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void
|
||||
MoveSrcSlicingWindow(T step_sizes,
|
||||
integral_constant<bool, PositiveDirection> positive_direction)
|
||||
MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
|
||||
{
|
||||
static_for<0, nDim, 1>{}([&](auto idim) {
|
||||
if(step_sizes[idim] != 0)
|
||||
@@ -506,18 +505,16 @@ struct BlockwiseGenericTensorSliceCopy_v2
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void
|
||||
MoveSrcSlicingWindow(T step_sizes,
|
||||
integral_constant<bool, PositiveDirection> positive_direction)
|
||||
MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
|
||||
{
|
||||
mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
|
||||
mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void
|
||||
MoveDstSlicingWindow(T step_sizes,
|
||||
integral_constant<bool, PositiveDirection> positive_direction)
|
||||
MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
|
||||
{
|
||||
mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction);
|
||||
mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
|
||||
}
|
||||
|
||||
private:
|
||||
@@ -753,18 +750,16 @@ struct BlockwiseGenericTensorSliceCopy_v4
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void
|
||||
MoveSrcSlicingWindow(T step_sizes,
|
||||
integral_constant<bool, PositiveDirection> positive_direction)
|
||||
MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
|
||||
{
|
||||
mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
|
||||
mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void
|
||||
MoveDstSlicingWindow(T step_sizes,
|
||||
integral_constant<bool, PositiveDirection> positive_direction)
|
||||
MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
|
||||
{
|
||||
mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction);
|
||||
mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
@@ -757,7 +757,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
|
||||
|
||||
// T can be Sequence or Array
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
|
||||
__device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
|
||||
{
|
||||
static_if<PositiveDirection>{}([&](auto) {
|
||||
mSrcSliceOrigin += step_sizes;
|
||||
@@ -765,7 +765,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
|
||||
__device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
|
||||
{
|
||||
static_if<PositiveDirection>{}([&](auto) {
|
||||
mDstSliceOrigin += step_sizes;
|
||||
@@ -1045,8 +1045,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
|
||||
// TODO: still kind of messy
|
||||
if(!src_coord.IsAnyLevelIndexInPaddingArea())
|
||||
{
|
||||
const index_t src_offset =
|
||||
(mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();
|
||||
const index_t src_offset = src_coord.GetOffset();
|
||||
|
||||
const index_t buffer_offset = i * src_data_per_access;
|
||||
|
||||
@@ -1073,7 +1072,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
|
||||
__device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
|
||||
{
|
||||
static_if<PositiveDirection>{}([&](auto) {
|
||||
mSrcSliceOrigin += step_sizes;
|
||||
@@ -1081,7 +1080,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
|
||||
__device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
|
||||
{
|
||||
static_if<PositiveDirection>{}([&](auto) {
|
||||
mDstSliceOrigin += step_sizes;
|
||||
|
||||
@@ -72,20 +72,20 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
#if 1
|
||||
#if 0
|
||||
constexpr index_t N = 32;
|
||||
constexpr index_t C = 8;
|
||||
constexpr index_t HI = 2;
|
||||
constexpr index_t WI = 2;
|
||||
constexpr index_t HI = 1;
|
||||
constexpr index_t WI = 1;
|
||||
constexpr index_t K = 128;
|
||||
constexpr index_t Y = 3;
|
||||
constexpr index_t X = 3;
|
||||
constexpr index_t Y = 1;
|
||||
constexpr index_t X = 1;
|
||||
|
||||
using ConvStrides = Sequence<1, 1>;
|
||||
using ConvDilations = Sequence<1, 1>;
|
||||
|
||||
constexpr index_t HPad = 1;
|
||||
constexpr index_t WPad = 1;
|
||||
using LeftPads = Sequence<1, 1>;
|
||||
using RightPads = Sequence<0, 0>;
|
||||
#elif 1
|
||||
// 3x3, 34x34
|
||||
constexpr index_t N = 64;
|
||||
@@ -99,8 +99,8 @@ int main(int argc, char* argv[])
|
||||
using ConvStrides = Sequence<1, 1>;
|
||||
using ConvDilations = Sequence<1, 1>;
|
||||
|
||||
constexpr index_t HPad = 1;
|
||||
constexpr index_t WPad = 1;
|
||||
using LeftPads = Sequence<1, 1>;
|
||||
using RightPads = Sequence<1, 1>;
|
||||
#elif 0
|
||||
// 1x1 filter, 8x8 image
|
||||
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
|
||||
@@ -311,13 +311,10 @@ int main(int argc, char* argv[])
|
||||
constexpr index_t WPad = 0;
|
||||
#endif
|
||||
|
||||
auto lower_pads = Sequence<HPad, WPad>{};
|
||||
auto upper_pads = Sequence<HPad, WPad>{};
|
||||
|
||||
auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence<N, C, HI, WI>{});
|
||||
auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
|
||||
auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
|
||||
in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, lower_pads, upper_pads);
|
||||
in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
|
||||
|
||||
ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
|
||||
ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
|
||||
@@ -378,8 +375,8 @@ int main(int argc, char* argv[])
|
||||
wei_kcyx,
|
||||
out_nkhw_desc,
|
||||
out_nkhw_device,
|
||||
lower_pads,
|
||||
upper_pads,
|
||||
LeftPads{},
|
||||
RightPads{},
|
||||
nrepeat);
|
||||
#elif 0
|
||||
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(
|
||||
@@ -434,11 +431,12 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
#if 0
|
||||
#if 1
|
||||
if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
|
||||
ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
|
||||
{
|
||||
host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
|
||||
host_winograd_3x3_convolution(
|
||||
in_nchw, wei_kcyx, out_nkhw_host, LeftPads{}, RightPads{});
|
||||
}
|
||||
else
|
||||
#endif
|
||||
@@ -448,8 +446,8 @@ int main(int argc, char* argv[])
|
||||
out_nkhw_host,
|
||||
ConvStrides{},
|
||||
ConvDilations{},
|
||||
lower_pads,
|
||||
upper_pads);
|
||||
LeftPads{},
|
||||
RightPads{});
|
||||
}
|
||||
check_error(out_nkhw_host, out_nkhw_device);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user