From e87aa851eabd7d4e6733e77be744d2ca39caa1c8 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 13 Jul 2019 17:44:13 -0500 Subject: [PATCH] adding implcit GEMM v4r2 --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 11 +- ..._v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp | 165 +++-- .../ConstantMatrixDescriptor.hpp | 12 +- .../ConstantTensorDescriptor.hpp | 22 +- ...tion_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp | 55 +- driver/src/driver.cpp | 32 +- driver/src/driver.cu | 581 +----------------- 7 files changed, 199 insertions(+), 679 deletions(-) mode change 100644 => 120000 driver/src/driver.cu diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 07ac2d8a88..82b097c5e6 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -208,13 +208,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // b_mtx[EPerBlocl, N1 * BPerBlock * N2] is in LDS // c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in // register - constexpr auto a_e_k_block_mtx_desc = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); + constexpr auto a_e_k_block_mtx_desc = + make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor(wei_e_k_block_desc); constexpr auto b_e_n1bn2_block_mtx_desc = - make_ConstantMatrixDescriptor(Number{}, - Number{}, - Number{}); + make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor( + in_e_n1_b_n2_block_desc.Unfold(I1, I3)); // sanity check static_assert(KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == @@ -226,7 +225,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // c_thread_mtx definition: this is a mess // TODO:: more elegent way of defining c_thread_mtx - constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor( + constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( Number{}, Number{}); const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp index 13a523b521..499cc1f0df 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -20,15 +20,18 @@ template {}; constexpr auto I3 = Number<3>{}; constexpr auto I5 = Number<5>{}; + constexpr auto I7 = Number<7>{}; constexpr auto True = integral_constant{}; @@ -96,13 +99,16 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer constexpr index_t E = C * Y * X; - constexpr index_t N1 = N / (N0 * N2); - constexpr index_t Ho1 = Ho / (Ho0 * Ho2); - constexpr index_t Wo1 = Wo / (Wo0 * Wo2); + constexpr index_t B = N1 * Ho1 * Wo1; - constexpr index_t B1 = N1 * Ho1 * Wo1; + static_assert(N % (N1 * N2) == 0 && Ho % (Ho1 * Ho2) == 0 && Wo % (Wo1 * Wo2) == 0, + "wrong!"); - static_assert((X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0), + constexpr index_t N0 = N / (N1 * N2); + constexpr index_t Ho0 = Ho / (Ho1 * Ho2); + constexpr index_t Wo0 = Wo / (Wo1 * Wo2); + + static_assert((X == 1 || ConvDilationW % InBlockCopyDataPerAccess_W2 == 0), "wrong! aligment requirement for vectorized global load of input tensor will " "be violated"); @@ -110,17 +116,17 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % (2 * EPerBlock) == 0, "wrong! cannot divide work evenly among block"); - constexpr index_t KBlockWork = K / KPerBlock; - constexpr index_t B1BlockWork = B1 / B1PerBlock; + constexpr index_t KBlockWork = K / KPerBlock; + constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_ConstantTensorDescriptor_packed(Sequence{}); + make_ConstantTensorDescriptor_packed(Sequence{}); const auto block_work_multi_id = block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); - const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock; - const index_t b1_block_data_on_global = block_work_multi_id[1] * B1PerBlock; + const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock; + const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock; // input tensor // tensor descriptor in device memory [N0, N1, N2, Ho0, Ho1, Ho2, Wo0, Wo1, Wo2] @@ -143,7 +149,7 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer .Extract(Sequence<1, 2, 3>{}); // merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy - constexpr auto in_e_n0_ho0_wo0_b1_n2_ho2_wo2_global_merged_desc = + constexpr auto in_e_n0_ho0_wo0_b_n2_ho2_wo2_global_merged_desc = make_ConstantMergedTensorDescriptor( in_c_y_x_global_desc.Embed(in_n0_ho0_wo0_n1_ho1_wo1_n2_ho2_wo2_global_desc), Sequence<0, 1, 2>{}, @@ -157,8 +163,15 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy // be careful of LDS alignment - constexpr auto in_e_n0_ho0_wo0_b1_n2_ho2_wo2_block_desc = - in_e_n0_ho0_wo0_b1_n2_ho2_wo2_global_merged_desc.Pack(); + constexpr auto in_e_n0_ho0_wo0_b_n2_ho2_wo2_block_desc = + make_ConstantTensorDescriptor_packed(Sequence{}); // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor @@ -166,17 +179,17 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1< BlockSize, Float, - decltype(in_e_n0_ho0_wo0_b1_n2_ho2_wo2_global_merged_desc), - decltype(in_e_n0_ho0_wo0_b1_n2_ho2_wo2_block_desc), - decltype(in_e_n0_ho0_wo0_b1_n2_ho2_wo2_block_desc.GetLengths()), - InBlockCopySubLengths_E_N0_Ho0_Wo0_B1_N2_Ho2_Wo2, - InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B1_N2_Ho2_Wo2, + decltype(in_e_n0_ho0_wo0_b_n2_ho2_wo2_global_merged_desc), + decltype(in_e_n0_ho0_wo0_b_n2_ho2_wo2_block_desc), + decltype(in_e_n0_ho0_wo0_b_n2_ho2_wo2_block_desc.GetLengths()), + InBlockCopySubLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2, + InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2, InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder, - InBlockCopyDataPerAccess_Wo2, - InBlockCopyDataPerAccess_Wo2>({0, 0, 0, 0, b1_block_data_on_global, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0}); + InBlockCopyDataPerAccess_W2, + InBlockCopyDataPerAccess_W2>({0, 0, 0, 0, b_block_data_on_global, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0}); // weight tensor // tensor descriptor in device memory, src of blockwise copy @@ -219,13 +232,13 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer // this check is ad-hoc // TODO: need to properly implement tensor descriptor with multiple alignment // requirements - static_assert(in_e_n0_ho0_wo0_b1_n2_ho2_wo2_block_desc.GetStrides()[3] % GemmDataPerReadB == + static_assert(in_e_n0_ho0_wo0_b_n2_ho2_wo2_block_desc.GetStrides()[3] % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not satisfied"); - constexpr auto b_e_n0ho0wo0b1n2ho2wo2_block_mtx_desc = + constexpr auto b_e_n0ho0wo0bn2ho2wo2_block_mtx_desc = make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor( - in_e_n0_ho0_wo0_b1_n2_ho2_wo2_block_desc.Unfold(I1, I7)); + in_e_n0_ho0_wo0_b_n2_ho2_wo2_block_desc.Unfold(I1, I7)); // sanity check static_assert(KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == @@ -237,13 +250,15 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer // c_thread_mtx definition: this is a mess // TODO:: more elegent way of defining c_thread_mtx - constexpr auto c_k0k2_n0ho0wo0n2ho2wo2_thread_mtx_desc = make_ConstantMatrixDescriptor( - Number{}, Number{}); + constexpr auto c_k0k2_n0ho0wo0n2ho2wo2_thread_mtx_desc = + make_ConstantMatrixDescriptor_packed( + Number{}, + Number{}); const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< BlockSize, decltype(a_e_k_block_mtx_desc), - decltype(b_e_n0ho0wo0b1n2ho2wo2_block_mtx_desc), + decltype(b_e_n0ho0wo0bn2ho2wo2_block_mtx_desc), decltype(c_k0k2_n0ho0wo0n2ho2wo2_thread_mtx_desc), GemmMPerThreadSubC, GemmNPerThreadSubC, @@ -256,13 +271,13 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer GemmDataPerReadB>{}; // LDS allocation for input and weight: be careful of alignment - constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2, + constexpr index_t max_align = math::lcm(InBlockCopyDataPerAccess_W2, WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA, GemmDataPerReadB); constexpr index_t in_block_space = math::integer_least_multiple( - in_e_n0_ho0_wo0_b1_n2_ho2_wo2_block_desc.GetElementSpace(), max_align); + in_e_n0_ho0_wo0_b_n2_ho2_wo2_block_desc.GetElementSpace(), max_align); constexpr index_t wei_block_space = math::integer_least_multiple(wei_e_k_block_desc.GetElementSpace(), max_align); @@ -369,8 +384,18 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer // define tensor descriptor for threadwise copy // output memory layout descriptor in register constexpr auto out_k0_k1_k2_n0_ho0_wo0_n1_ho1_wo1_n2_ho2_wo2_thread_mem_desc = - make_ConstantTensorDescriptor_packed( - Sequence{}); + make_ConstantTensorDescriptor_packed(Sequence{}); // output tensor descriptor in register, src of threadwise copy constexpr auto out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc = @@ -378,7 +403,7 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer Sequence<3, 6, 9, 0, 1, 2, 4, 7, 10, 5, 8, 11>{}); // output memory layout descriptor in device memory, dst of threadwise copy - constexpr auto out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_global_desc = + constexpr auto out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_global_mem_desc = out_n_k_h_w_global_desc.Fold(I3, Sequence{}) .Fold(I2, Sequence{}) .Fold(I1, Sequence{}) @@ -393,33 +418,55 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer k_block_data_on_global + c_thread_mtx_on_block.row; const index_t b_thread_data_on_global = - b_block_data_on_global + c_thread_mtx_on_block.col / N2; + b_block_data_on_global + c_thread_mtx_on_block.col / (N2 * Ho2 * Wo2); // output merged global tensor descriptor, for calculating origin of thread tensor // in global memory - constexpr auto out_k_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor( - out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.Unfold(I3, I5), - Sequence<3>{}, - Sequence<1>{}, - Sequence<0, 4, 5>{}, - Sequence<2>{}); + constexpr auto out_k_n0_ho0_wo0_b_n2_ho2_wo2_global_merged_desc = + make_ConstantMergedTensorDescriptor( + out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_global_mem_desc.Unfold(I3, I5), + Sequence<3>{}, + Sequence<0>{}, + Sequence<4>{}, + Sequence<7>{}, + Sequence<1, 5, 8>{}, + Sequence<2>{}, + Sequence<6>{}, + Sequence<9>{}); // origin of dst in device memory Float* p_out_thread_on_global = p_out_global + - out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex( - k_thread_data_on_global, 0, b_thread_data_on_global, 0); + out_k_n0_ho0_wo0_b_n2_ho2_wo2_global_merged_desc.GetOffsetFromMultiIndex( + k_thread_data_on_global, 0, 0, 0, b_thread_data_on_global, 0, 0, 0); +#if 1 threadwise_generic_tensor_slice_copy_v1( - out_n0_n1_n2_k0_k1_k2_h_w_thread_desc, + out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc, p_out_thread, - {0, 0, 0, 0, 0, 0, 0, 0}, - out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_global_mem_desc, p_out_thread_on_global, - {0, 0, 0, 0, 0, 0, 0, 0}, - out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(), - arithmetic_sequence_gen<0, 8, 1>::type{}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc.GetLengths(), + arithmetic_sequence_gen<0, 12, 1>::type{}, Number<1>{}); +#else + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor( + "out thread: ", out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc); + printf("size: %d\n", + out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc.GetElementSize()); + + for(index_t i = 0; + i < out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc.GetElementSize(); + ++i) + { + p_out_global[0] = p_out_thread[i]; + } + } +#endif } } }; diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp index 5084e6f959..ef3676a64f 100644 --- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp @@ -52,11 +52,15 @@ __host__ __device__ constexpr auto return ConstantMatrixDescriptor{}; } -template -__host__ __device__ constexpr auto make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor( - ConstantTensorDescriptor, Sequence>) +template +__host__ __device__ constexpr auto + make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor(TDesc) { - return ConstantMatrixDescriptor{}; + static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); + static_assert(TDesc::GetStrides()[1] == 1, "wrong"); + return ConstantMatrixDescriptor{}; } template diff --git a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp index f95ceeea7f..9833ef3200 100644 --- a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp @@ -456,7 +456,7 @@ print_ConstantTensorDescriptor(const char* s, { constexpr index_t ndim = sizeof...(Lengths); - static_assert(ndim > 0 && ndim <= 10, "wrong!"); + static_assert(ndim > 0 && ndim <= 12, "wrong!"); static_if{}([&](auto) { printf("%s dim %u, lengths {%u}, strides {%u}\n", s, ndim, Lengths..., Strides...); @@ -528,6 +528,26 @@ print_ConstantTensorDescriptor(const char* s, Lengths..., Strides...); }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u " + "%u %u " + "%u %u %u}\n", + s, + ndim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u " + "%u %u %u %u " + "%u %u %u}\n", + s, + ndim, + Lengths..., + Strides...); + }); } } // namespace ck diff --git a/driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp index 5f6c2ec3f2..ab41e325b3 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp @@ -53,18 +53,27 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc, wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); - constexpr index_t N1 = 2; - constexpr index_t N2 = 4; - - constexpr index_t B = (N * Ho * Wo) / (N1 * N2); - #if 1 + // 1x1 filter, 8x8 image + constexpr index_t N1 = 2; + constexpr index_t N2 = 1; + + constexpr index_t Ho1 = 8; + constexpr index_t Ho2 = 1; + + constexpr index_t Wo1 = 1; + constexpr index_t Wo2 = 4; + constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 16; constexpr index_t KPerBlock = 128; constexpr index_t EPerBlock = 8; + constexpr index_t N0PerBlock = 1; + constexpr index_t Ho0PerBlock = 1; + constexpr index_t Wo0PerBlock = 2; + constexpr index_t GemmMPerThreadSubC = 4; constexpr index_t GemmNPerThreadSubC = 4; constexpr index_t GemmMLevel0Cluster = 4; @@ -75,14 +84,16 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc, constexpr index_t GemmDataPerReadA = 4; constexpr index_t GemmDataPerReadB = 4; - using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 1, 4>; - using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 16, 1>; - using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] - using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] - using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2] + using InBlockCopySubLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2 = Sequence<1, 1, 1, 1, 1, 1, 1, 4>; + using InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2 = Sequence<8, 1, 1, 2, 16, 1, 1, 1>; + using InBlockCopyThreadClusterArrangeOrder = + Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2] + using InBlockCopySrcAccessOrder = + Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2] + using InBlockCopyDstAccessOrder = + Sequence<0, 1, 2, 3, 4, 5, 6, 7>; // [E, N0, Ho0, Wo0, B, N2, Ho2, Wo2] - constexpr index_t InBlockCopySrcDataPerRead_B = 1; - constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4; + constexpr index_t InBlockCopyDataPerAccess_W2 = 4; using WeiBlockCopySubLengths_E_K = Sequence<4, 1>; using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>; @@ -94,6 +105,8 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc, constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #endif + constexpr index_t B = N1 * Ho1 * Wo1; + constexpr index_t GridSize = ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock); @@ -111,11 +124,18 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc, decltype(out_nkhw_desc), ConvStrides, ConvDilations, + N1, + N2, + Ho1, + Ho2, + Wo1, + Wo2, BPerBlock, KPerBlock, EPerBlock, - N1, - N2, + N0PerBlock, + Ho0PerBlock, + Wo0PerBlock, GemmMPerThreadSubC, GemmNPerThreadSubC, GemmMLevel0Cluster, @@ -125,13 +145,12 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc, GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB, - InBlockCopySubLengths_E_N1_B_N2, - InBlockCopyClusterLengths_E_N1_B_N2, + InBlockCopySubLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2, + InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2, InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder, InBlockCopyDstAccessOrder, - InBlockCopySrcDataPerRead_B, - InBlockCopyDstDataPerWrite_N2, + InBlockCopyDataPerAccess_W2, WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K, WeiBlockCopyThreadClusterArrangeOrder, diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index cd9b4c7260..24728a7857 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -14,7 +14,7 @@ #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" #include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp" -//#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp" +#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp" struct GeneratorTensor_1 { @@ -524,19 +524,29 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v2_chwn_cyxk_khwn in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); #elif 0 - device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(in_nchw_desc, - in_nchw, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw_device, - ConvStrides{}, - ConvDilations{}, - nrepeat); + device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, + in_nchw, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw_device, + ConvStrides{}, + ConvDilations{}, + nrepeat); +#elif 1 + device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(in_nchw_desc, + in_nchw, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw_device, + ConvStrides{}, + ConvDilations{}, + nrepeat); #elif 0 device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(in_nchw_desc, in_nchw, diff --git a/driver/src/driver.cu b/driver/src/driver.cu deleted file mode 100644 index cd9b4c7260..0000000000 --- a/driver/src/driver.cu +++ /dev/null @@ -1,580 +0,0 @@ -#include -#include -#include -#include -#include -#include "config.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "device.hpp" -#include "conv_common.hpp" -#include "host_conv.hpp" -#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp" -#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp" -#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp" -#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" -#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" -#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp" -//#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp" - -struct GeneratorTensor_1 -{ - template - double operator()(Is... is) - { - return 1; - } -}; - -struct GeneratorTensor_2 -{ - int min_value = 0; - int max_value = 1; - - template - double operator()(Is...) - { - return (std::rand() % (max_value - min_value)) + min_value; - } -}; - -struct GeneratorTensor_3 -{ - template - double operator()(Is... is) - { - std::array dims = {{static_cast(is)...}}; - - auto f_acc = [](auto a, auto b) { return 100 * a + b; }; - - return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc); - } -}; - -struct GeneratorTensor_Checkboard -{ - template - double operator()(Ts... Xs) const - { - std::array dims = {{Xs...}}; - return std::accumulate(dims.begin(), - dims.end(), - true, - [](bool init, index_t x) -> int { return init != (x % 2); }) - ? 1 - : -1; - } -}; - -int main(int argc, char* argv[]) -{ - using namespace ck; - -#if 0 - constexpr index_t N = 8; - constexpr index_t C = 16; - constexpr index_t HI = 3; - constexpr index_t WI = 18; - constexpr index_t K = 128; - constexpr index_t Y = 3; - constexpr index_t X = 3; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 3x3, 34x34 - constexpr index_t N = 128; - constexpr index_t C = 256; - constexpr index_t HI = 34; - constexpr index_t WI = 34; - constexpr index_t K = 128; - constexpr index_t Y = 3; - constexpr index_t X = 3; - - using ConvStrides = Sequence<2, 2>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 3x3, 56x56 - constexpr index_t N = 64; - constexpr index_t C = 64; - constexpr index_t HI = 56; - constexpr index_t WI = 56; - constexpr index_t K = 128; - constexpr index_t Y = 3; - constexpr index_t X = 3; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 3x3 filter, 28x28 image - constexpr index_t N = 128; - constexpr index_t C = 256; - constexpr index_t HI = 28; - constexpr index_t WI = 28; - constexpr index_t K = 128; - constexpr index_t Y = 3; - constexpr index_t X = 3; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 28x28 image - constexpr index_t N = 128; - constexpr index_t C = 512; - constexpr index_t HI = 28; - constexpr index_t WI = 28; - constexpr index_t K = 512; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 3x3 filter, 20x84 image, 1x1 padding - constexpr index_t N = 16; - constexpr index_t C = 256; - constexpr index_t HI = 20; - constexpr index_t WI = 84; - constexpr index_t K = 256; - constexpr index_t Y = 3; - constexpr index_t X = 3; - - constexpr index_t HPad = 1; - constexpr index_t WPad = 1; -#elif 0 - // 3x3 filter, 112x112 image, 1x1 padding - constexpr index_t N = 16; - constexpr index_t C = 64; - constexpr index_t HI = 112; - constexpr index_t WI = 112; - constexpr index_t K = 128; - constexpr index_t Y = 3; - constexpr index_t X = 3; - - constexpr index_t HPad = 1; - constexpr index_t WPad = 1; -#elif 0 - // 5x5 filter, 20x86 image - constexpr index_t N = 16; - constexpr index_t C = 256; - constexpr index_t HI = 20; - constexpr index_t WI = 86; - constexpr index_t K = 512; - constexpr index_t Y = 5; - constexpr index_t X = 5; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 5x5 filter, 20x86 image, 1x1 padding - constexpr index_t N = 16; - constexpr index_t C = 256; - constexpr index_t HI = 20; - constexpr index_t WI = 86; - constexpr index_t K = 512; - constexpr index_t Y = 5; - constexpr index_t X = 5; - - constexpr index_t HPad = 1; - constexpr index_t WPad = 1; -#elif 0 - // 5x5 filter, 28x28 image, 2x2 padding - constexpr index_t N = 16; - constexpr index_t C = 192; - constexpr index_t HI = 28; - constexpr index_t WI = 28; - constexpr index_t K = 32; - constexpr index_t Y = 5; - constexpr index_t X = 5; - - constexpr index_t HPad = 2; - constexpr index_t WPad = 2; -#elif 0 - // 3x3 filter, 14x14 image - constexpr index_t N = 128; - constexpr index_t C = 256; - constexpr index_t HI = 14; - constexpr index_t WI = 14; - constexpr index_t K = 128; - constexpr index_t Y = 3; - constexpr index_t X = 3; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 14x14 image - constexpr index_t N = 128; - constexpr index_t C = 512; - constexpr index_t HI = 14; - constexpr index_t WI = 14; - constexpr index_t K = 512; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 7x7 image - constexpr index_t N = 128; - constexpr index_t C = 512; - constexpr index_t HI = 7; - constexpr index_t WI = 7; - constexpr index_t K = 2048; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 73x73 image - constexpr index_t N = 128; - constexpr index_t C = 512; - constexpr index_t HI = 73; - constexpr index_t WI = 73; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 8x8 image - // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% - constexpr index_t N = 64; - constexpr index_t C = 1536; - constexpr index_t HI = 8; - constexpr index_t WI = 8; - constexpr index_t K = 256; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 1 - // 1x1 filter, 8x8 image - // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51% - constexpr index_t N = 128; - constexpr index_t C = 2048; - constexpr index_t HI = 8; - constexpr index_t WI = 8; - constexpr index_t K = 384; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 7x7 image - // cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64% - constexpr index_t N = 128; - constexpr index_t C = 832; - constexpr index_t HI = 7; - constexpr index_t WI = 7; - constexpr index_t K = 384; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 8x8 image - // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65% - constexpr index_t N = 128; - constexpr index_t C = 1280; - constexpr index_t HI = 8; - constexpr index_t WI = 8; - constexpr index_t K = 384; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 14x14 image - // cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50% - constexpr index_t N = 128; - constexpr index_t C = 512; - constexpr index_t HI = 14; - constexpr index_t WI = 14; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 8x8 image - // cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61% - constexpr index_t N = 64; - constexpr index_t C = 1536; - constexpr index_t HI = 8; - constexpr index_t WI = 8; - constexpr index_t K = 384; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 28x28 image - // cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69% - constexpr index_t N = 128; - constexpr index_t C = 256; - constexpr index_t HI = 28; - constexpr index_t WI = 28; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 7x7 image - // cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62% - constexpr index_t N = 128; - constexpr index_t C = 832; - constexpr index_t HI = 7; - constexpr index_t WI = 7; - constexpr index_t K = 256; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output - // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% - constexpr index_t N = 128; - constexpr index_t C = 288; - constexpr index_t HI = 35; - constexpr index_t WI = 35; - constexpr index_t K = 384; - constexpr index_t Y = 3; - constexpr index_t X = 3; - - using ConvStrides = Sequence<2, 2>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 17x17 input - // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76% - constexpr index_t N = 128; - constexpr index_t C = 768; - constexpr index_t HI = 17; - constexpr index_t WI = 17; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 14x14 image - // cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64% - constexpr index_t N = 128; - constexpr index_t C = 528; - constexpr index_t HI = 14; - constexpr index_t WI = 14; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 14x14 image - // cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75% - constexpr index_t N = 128; - constexpr index_t C = 528; - constexpr index_t HI = 14; - constexpr index_t WI = 14; - constexpr index_t K = 256; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#elif 0 - // 1x1 filter, 7x7 image - // cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52% - constexpr index_t N = 128; - constexpr index_t C = 832; - constexpr index_t HI = 7; - constexpr index_t WI = 7; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; -#endif - - auto lower_pads = Sequence{}; - auto upper_pads = Sequence{}; - - auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence{}); - auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence{}); - auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor( - in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, lower_pads, upper_pads); - - ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); - ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); - ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); - - using in_data_t = float; - using out_data_t = float; - Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); - Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); - Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); - Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); - - std::size_t num_thread = std::thread::hardware_concurrency(); - - if(argc != 3) - { - printf("arg1: do_verification, arg2: nrepeat\n"); - exit(1); - } - - bool do_verification = atoi(argv[1]); - index_t nrepeat = atoi(argv[2]); - - if(do_verification) - { -#if 0 - in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); -#elif 0 - in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); -#elif 0 - in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread); - wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); -#elif 1 - in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); -#elif 0 - in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); - - auto gen_wei = [](auto... is) { - return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...); - }; - wei_kcyx.GenerateTensorValue(gen_wei, num_thread); -#endif - } - -#if 0 - device_convolution_direct_v2_nchw_kcyx_nkhw - in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 - device_convolution_implicit_gemm_v1_chwn_cyxk_khwn - in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 - device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw - in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 - device_convolution_implicit_gemm_v2_chwn_cyxk_khwn - in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 1 - device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( - in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 - device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(in_nchw_desc, - in_nchw, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw_device, - ConvStrides{}, - ConvDilations{}, - nrepeat); -#elif 0 - device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(in_nchw_desc, - in_nchw, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw_device, - lower_pads, - upper_pads, - nrepeat); -#endif - - if(do_verification) - { -#if 1 - if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 && - ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1) - { - host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); - } - else -#endif - { - host_direct_convolution(in_nchw, - wei_kcyx, - out_nkhw_host, - ConvStrides{}, - ConvDilations{}, - lower_pads, - upper_pads); - } - check_error(out_nkhw_host, out_nkhw_device); - -#if 0 - LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl; - LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl; - LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl; - LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl; -#endif - } -} diff --git a/driver/src/driver.cu b/driver/src/driver.cu new file mode 120000 index 0000000000..1ca4fea9d7 --- /dev/null +++ b/driver/src/driver.cu @@ -0,0 +1 @@ +driver.cpp \ No newline at end of file