From df29a7e09769ddb78eef4e8a1197b120a8175a66 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 24 Jun 2019 11:20:19 -0500 Subject: [PATCH] enabling vector load on merged dim --- .../blockwise_generic_tensor_slice_copy.hpp | 58 ++++++++++--------- ...lution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp | 36 +++++++++++- driver/src/driver.cpp | 9 ++- 3 files changed, 71 insertions(+), 32 deletions(-) diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index e7fb7e2a71..981e0ad8db 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -13,8 +13,10 @@ namespace ck { // slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor -// memory layout (ordering of dimensions) can be different between src and dst -// For now, only support SubLengths[...] == 1 on a merged dimension +// memory layout (ordering of dimensions) can be different between src and dst. +// on a merged dimension that constains multiple original dimensions, +// its sub-length need to evenly divide the length of the last original dimension +// so each thread is effectively reading a normal (not merged) tensor template {}([&](auto IDim) { - static_assert(SubLengths::Get(IDim) == 1 || - (!SrcDesc::ContainMultipleOriginalDimensions(IDim) && - !DstDesc::ContainMultipleOriginalDimensions(IDim)), - "wrong! only surpport Sub-Length == 1 on a merged dimension"); + constexpr auto sub_length = SubLengths::Get(IDim); + + constexpr auto idim_original_src = SrcDesc::GetContainedOriginalDimensions(IDim).Back(); + static_assert(SrcDesc::GetOriginalTensorDescriptor().GetLength(idim_original_src) % + sub_length == + 0, + "wrong!"); + + constexpr auto idim_original_dst = DstDesc::GetContainedOriginalDimensions(IDim).Back(); + static_assert(DstDesc::GetOriginalTensorDescriptor().GetLength(idim_original_dst) % + sub_length == + 0, + "wrong!"); }); // calculate mThreadSrcOffset, mThreadDstOffset @@ -118,28 +130,24 @@ struct BlockwiseGenericTensorSliceCopy_v1 // partial offset on each dimension static_for<0, nDim, 1>{}([&](auto IDim) { - constexpr index_t idim = IDim; - constexpr auto src_partial_original_dims = SrcDesc::GetContainedOriginalDimensions(IDim); constexpr auto src_partial_original_desc = SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims); - mThreadSrcPartialOffsets(idim) = src_partial_original_desc.GetOffsetFromMultiIndex( + mThreadSrcPartialOffsets(IDim) = src_partial_original_desc.GetOffsetFromMultiIndex( extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims)); }); static_for<0, nDim, 1>{}([&](auto IDim) { - constexpr index_t idim = IDim; - constexpr auto dst_partial_original_dims = DstDesc::GetContainedOriginalDimensions(IDim); constexpr auto dst_partial_original_desc = DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims); - mThreadDstPartialOffsets(idim) = dst_partial_original_desc.GetOffsetFromMultiIndex( + mThreadDstPartialOffsets(IDim) = dst_partial_original_desc.GetOffsetFromMultiIndex( extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims)); }); @@ -173,10 +181,8 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); - static_ford{}([&](auto repeat_multi_id_) { #if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 - constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; - + static_ford{}([&](auto repeat_multi_id) { constexpr auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; @@ -189,14 +195,13 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); #else - constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); - + ford{}([&](auto repeat_multi_id) { const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; const index_t src_offset = - SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin); + SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin); const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); @@ -233,10 +238,8 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); - static_ford{}([&](auto repeat_multi_id_) { #if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 - constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; - + static_ford{}([&](auto repeat_multi_id) { constexpr auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; @@ -246,10 +249,9 @@ struct BlockwiseGenericTensorSliceCopy_v1 thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); constexpr index_t dst_offset = - DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin); + DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin); #else - constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); - + ford{}([&](auto repeat_multi_id) { const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; @@ -257,7 +259,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); - const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin); + const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin); #endif // By position the origin of the per-thread window at the point, where multi-index diff --git a/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp index 3d12acd24a..94d8f203f1 100644 --- a/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp @@ -59,7 +59,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc, constexpr index_t B = (N * Ho * Wo) / (N1 * N2); -#if 1 +#if 0 constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 16; @@ -91,6 +91,40 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc, using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; +#elif 1 + constexpr index_t BlockSize = 256; + + constexpr index_t BPerBlock = 16; + constexpr index_t KPerBlock = 128; + constexpr index_t CPerBlock = 8; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 2, 2>; + using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 8, 2>; + using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] + using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] + using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2] + + constexpr index_t InBlockCopySrcDataPerRead_B = 2; + constexpr index_t InBlockCopyDstDataPerWrite_N2 = 2; + + using WeiBlockCopySubLengths_E_K = Sequence<4, 1>; + using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>; + using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #endif diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index bfca88bc53..8826b62ab3 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -454,7 +454,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 1 +#elif 0 // 3x3 filter, 28x28 image constexpr index_t N = 128; constexpr index_t C = 256; @@ -464,12 +464,12 @@ int main(int argc, char* argv[]) constexpr index_t Y = 3; constexpr index_t X = 3; - using ConvStrides = Sequence<2, 2>; + using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 0 +#elif 1 // 1x1 filter, 28x28 image constexpr index_t N = 128; constexpr index_t C = 512; @@ -479,6 +479,9 @@ int main(int argc, char* argv[]) constexpr index_t Y = 1; constexpr index_t X = 1; + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + constexpr index_t HPad = 0; constexpr index_t WPad = 0; #elif 0