From 0979fb4af9d78b821334a8021e9b8bf0812d8ae6 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 15 Aug 2019 13:21:51 -0500 Subject: [PATCH] clean up --- ..._v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp | 68 ++++++++------ ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 94 +++++++------------ ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 37 -------- ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 2 +- driver/src/driver.cpp | 6 +- 5 files changed, 74 insertions(+), 133 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp index 4f297fac3d..3852780451 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp @@ -77,11 +77,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer constexpr index_t HiPerBlock = HoPerBlock + Y - 1; constexpr index_t WiPerBlock = WoPerBlock + X - 1; - // assert for LDS double buffer - static_assert(C % (2 * CPerBlock) == 0, "C cannot be evenly divided"); - // divide block work: [K, Ho, Wo, N] - static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % CPerBlock == 0 && + static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 && C % (2 * CPerBlock) == 0 && Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0, "wrong! cannot evenly divide work for workgroup "); @@ -132,37 +129,46 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer // blockwise copy // input: format is [C, Hi, Wi, N] auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v1, - Sequence<0, 1, 2, 3>, - Sequence<0, 1, 2, 3>, - 3, - 3, - InBlockCopyDataPerAccess_N, - InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, - {0, 0, 0, 0}); +#if 0 + BlockwiseGenericTensorSliceCopy_v1 +#else + BlockwiseGenericTensorSliceCopy_v2 +#endif + , + Sequence<0, 1, 2, 3>, + Sequence<0, 1, 2, 3>, + 3, + 3, + InBlockCopyDataPerAccess_N, + InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0}); // blockwise wei copy // format is [CPerBlock, X * KPerBlock] const auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v1, - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - WeiBlockCopyDataPerAccess_K, - WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0}); +#if 0 + BlockwiseGenericTensorSliceCopy_v1 +#else + BlockwiseGenericTensorSliceCopy_v2 +#endif + , + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0}); // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 3d67486a0f..a5ea753dd0 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -156,42 +156,28 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not satisfied"); -#if 0 // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v1( - {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); +#if 0 + BlockwiseGenericTensorSliceCopy_v1 #else - auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v2( - {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); + BlockwiseGenericTensorSliceCopy_v2 #endif + ({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); // weight tensor // tensor descriptor in device memory, src of blockwise copy @@ -204,42 +190,28 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer Sequence{}, Number{}); -#if 0 // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v1( - {0, k_block_data_on_global}, {0, 0}); +#if 0 + BlockwiseGenericTensorSliceCopy_v1 #else - auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v2( - {0, k_block_data_on_global}, {0, 0}); + BlockwiseGenericTensorSliceCopy_v2 #endif + ({0, k_block_data_on_global}, {0, 0}); // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx @@ -430,7 +402,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex( k_thread_data_on_global, 0, b_thread_data_on_global, 0); -#if 1 +#if 0 ThreadwiseGenericTensorSliceCopy_v1r2< decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc), decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc), diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index 96ec3ff3bc..69a82c1268 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -91,43 +91,6 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; -#elif 1 - // each thread hold 64 data - constexpr index_t BlockSize = 256; - - constexpr index_t BPerBlock = 16; - constexpr index_t KPerBlock = 128; - constexpr index_t EPerBlock = 8; - - constexpr index_t GemmNRepeat = 2; - - constexpr index_t GemmMPerThreadSubC = 4; - constexpr index_t GemmNPerThreadSubC = 4; - constexpr index_t GemmMLevel0Cluster = 4; - constexpr index_t GemmNLevel0Cluster = 4; - constexpr index_t GemmMLevel1Cluster = 4; - constexpr index_t GemmNLevel1Cluster = 4; - constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t GemmDataPerReadA = 4; - constexpr index_t GemmDataPerReadB = 4; - - using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 2, 2>; - using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 8, 2>; - using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] - using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] - using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2] - - constexpr index_t InBlockCopySrcDataPerRead_B = 2; - constexpr index_t InBlockCopyDstDataPerWrite_N2 = 2; - - using WeiBlockCopySubLengths_E_K = Sequence<2, 2>; - using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>; - using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] - using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] - using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] - - constexpr index_t WeiBlockCopySrcDataPerRead_E = 2; - constexpr index_t WeiBlockCopyDstDataPerWrite_K = 2; #elif 0 // each thread hold 32 data constexpr index_t BlockSize = 256; diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index 79b7da25f5..67d2bf7dcf 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -54,7 +54,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 0 +#if 1 constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 128; diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 7ea05e243e..f8192c4994 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -85,7 +85,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 1 +#elif 0 // 3x3, 34x34 constexpr index_t N = 64; constexpr index_t C = 256; @@ -367,7 +367,7 @@ int main(int argc, char* argv[]) #if 0 device_convolution_direct_v2_nchw_kcyx_nkhw (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v1_chwn_cyxk_khwn( in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); #elif 0 @@ -379,7 +379,7 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc,