From 9bbe9073ab90ba1ff4e680eaaa985d6d2ee81e92 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 4 Feb 2019 15:40:34 -0600 Subject: [PATCH] refactor --- driver/conv.cu | 63 ++++++++++++++++--- ...icit_gemm_convolution_1_chwn_csrk_khwn.cuh | 17 ++++- ...volution_1_chwn_csrk_khwn_with_padding.cuh | 49 ++++++++++++++- ...icit_gemm_convolution_2_cnhw_srck_knhw.cuh | 19 +++++- ...icit_gemm_convolution_2_cnhw_srck_knhw.cuh | 2 +- ...icit_gemm_convolution_3_cnhw_srck_knhw.cuh | 2 +- 6 files changed, 138 insertions(+), 14 deletions(-) diff --git a/driver/conv.cu b/driver/conv.cu index de5996c7aa..999c03f03b 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -453,7 +453,7 @@ int main() constexpr unsigned HPad = 0; constexpr unsigned WPad = 0; -#elif 1 +#elif 0 // 3x3 filter, 56x56 image, 1x1 padding constexpr unsigned N = 16; constexpr unsigned C = 128; @@ -477,6 +477,18 @@ int main() constexpr unsigned HPad = 1; constexpr unsigned WPad = 1; +#elif 1 + // 1x1 filter, 28x28 image + constexpr unsigned N = 16; + constexpr unsigned C = 256; + constexpr unsigned HI = 28; + constexpr unsigned WI = 28; + constexpr unsigned K = 512; + constexpr unsigned S = 1; + constexpr unsigned R = 1; + + constexpr unsigned HPad = 0; + constexpr unsigned WPad = 0; #elif 0 // 3x3 filter, 20x84 image, 1x1 padding constexpr unsigned N = 16; @@ -489,6 +501,42 @@ int main() constexpr unsigned HPad = 1; constexpr unsigned WPad = 1; +#elif 0 + // 3x3 filter, 112x112 image, 1x1 padding + constexpr unsigned N = 16; + constexpr unsigned C = 64; + constexpr unsigned HI = 112; + constexpr unsigned WI = 112; + constexpr unsigned K = 128; + constexpr unsigned S = 3; + constexpr unsigned R = 3; + + constexpr unsigned HPad = 1; + constexpr unsigned WPad = 1; +#elif 0 + // 5x5 filter, 20x86 image, 1x1 padding + constexpr unsigned N = 16; + constexpr unsigned C = 256; + constexpr unsigned HI = 20; + constexpr unsigned WI = 86; + constexpr unsigned K = 512; + constexpr unsigned S = 5; + constexpr unsigned R = 5; + + constexpr unsigned HPad = 1; + constexpr unsigned WPad = 1; +#elif 0 + // 5x5 filter, 28x28 image, 2x2 padding + constexpr unsigned N = 16; + constexpr unsigned C = 192; + constexpr unsigned HI = 28; + constexpr unsigned WI = 28; + constexpr unsigned K = 32; + constexpr unsigned S = 5; + constexpr unsigned R = 5; + + constexpr unsigned HPad = 2; + constexpr unsigned WPad = 2; #endif auto lower_pads = Sequence{}; @@ -510,7 +558,7 @@ int main() std::size_t num_thread = std::thread::hardware_concurrency(); -#if 1 +#if 0 in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread); #elif 1 @@ -518,9 +566,9 @@ int main() wei_kcsr.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); #endif - unsigned nrepeat = 50; + unsigned nrepeat = 100; -#if 0 +#if 1 #if 0 device_direct_convolution_1 #elif 0 @@ -531,15 +579,14 @@ int main() device_implicit_gemm_convolution_1_nchw_srck_nkhw #elif 0 device_implicit_gemm_convolution_1_chwn_csrk_khwn -#elif 0 +#elif 1 device_implicit_gemm_convolution_2_cnhw_srck_knhw #elif 0 device_winograd_convolution #endif (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat); -#endif -#if 1 +#elif 1 device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(in_nchw_desc, in_nchw, wei_kcsr_desc, @@ -551,7 +598,7 @@ int main() nrepeat); #endif -#if 1 +#if 0 if(S == 3 && R == 3) { host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads); diff --git a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh index c8996f1ad7..7d3435823d 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh +++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh @@ -102,7 +102,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, constexpr unsigned WoPerThread = 1; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 // 3x3 58x58, NKC = 16,256,128 constexpr unsigned NPerBlock = 8; constexpr unsigned KPerBlock = 64; @@ -161,6 +161,21 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, constexpr unsigned HoPerThread = 1; constexpr unsigned WoPerThread = 1; + constexpr unsigned BlockSize = 128; +#elif 1 + // for 1x1, 28x28 + constexpr unsigned NPerBlock = 16; + constexpr unsigned KPerBlock = 128; + constexpr unsigned CPerBlock = 8; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 2; + + constexpr unsigned NPerThread = 4; + constexpr unsigned KPerThread = 16; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 1; + constexpr unsigned WoPerThread = 1; + constexpr unsigned BlockSize = 128; #endif diff --git a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh index 94da496f5a..46aa80fbe2 100644 --- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh +++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh @@ -164,7 +164,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc, constexpr unsigned WoPerThread = 1; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 // 3x3 56x56, NKC = 16,256,128, with padding // 3x3 28x28, NKC = 16,512,256, with padding // 3x3 20x84, NKC = 16,256,256, with padding @@ -180,6 +180,51 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc, constexpr unsigned HoPerThread = 1; constexpr unsigned WoPerThread = 1; + constexpr unsigned BlockSize = 128; +#elif 0 + // for 5x5 filter, 20x84 image, 1x1 padding + constexpr unsigned NPerBlock = 16; + constexpr unsigned KPerBlock = 64; + constexpr unsigned CPerBlock = 1; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 4; + + constexpr unsigned NPerThread = 4; + constexpr unsigned KPerThread = 16; + constexpr unsigned CPerThread = 1; + constexpr unsigned HoPerThread = 1; + constexpr unsigned WoPerThread = 1; + + constexpr unsigned BlockSize = 128; +#elif 0 + // 5x5 filter, 28x28 image, 2x2 padding + constexpr unsigned NPerBlock = 16; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 2; + constexpr unsigned HoPerBlock = 4; + constexpr unsigned WoPerBlock = 4; + + constexpr unsigned NPerThread = 4; + constexpr unsigned KPerThread = 16; + constexpr unsigned CPerThread = 1; + constexpr unsigned HoPerThread = 1; + constexpr unsigned WoPerThread = 1; + + constexpr unsigned BlockSize = 128; +#elif 1 + // for 1x1, 28x28 + constexpr unsigned NPerBlock = 16; + constexpr unsigned KPerBlock = 128; + constexpr unsigned CPerBlock = 8; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 2; + + constexpr unsigned NPerThread = 4; + constexpr unsigned KPerThread = 16; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 1; + constexpr unsigned WoPerThread = 1; + constexpr unsigned BlockSize = 128; #endif @@ -229,7 +274,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc, cudaEventElapsedTime(&elapsedTime, start, stop); printf("Elapsed time : %f ms\n", elapsedTime); - usleep(10000); + usleep(elapsedTime * 1000); } checkCudaErrors(cudaGetLastError()); diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh index 18c639ea1b..f9b0394e03 100644 --- a/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh +++ b/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh @@ -93,7 +93,7 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, constexpr unsigned GemmThreadPerClusterColumn = 4; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 constexpr unsigned BPerBlock = 128; constexpr unsigned KPerBlock = 64; constexpr unsigned CPerBlock = 2; @@ -108,6 +108,23 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc, constexpr unsigned InBlockCopyThreadPerDim0 = 2; constexpr unsigned InBlockCopyThreadPerDim1 = 64; + constexpr unsigned BlockSize = 128; +#elif 1 + // 1x1, 28x28 + constexpr unsigned BPerBlock = 64; + constexpr unsigned KPerBlock = 128; + constexpr unsigned CPerBlock = 8; + + constexpr unsigned BPerThread = 4; + constexpr unsigned KPerThread = 16; + constexpr unsigned CPerThread = 2; + + constexpr unsigned GemmRowThreadPerCluster = 8; + constexpr unsigned GemmColumnThreadPerCluster = 8; + + constexpr unsigned InBlockCopyThreadPerDim0 = 2; + constexpr unsigned InBlockCopyThreadPerDim1 = 64; + constexpr unsigned BlockSize = 128; #endif diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh index 43d47c2ceb..4b62118323 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh @@ -7,7 +7,7 @@ #include "threadwise_2d_tensor_op.cuh" #include "gemm.cuh" -// define B = N*Hi*Wi +// define B = flatten(N, Hi, Wi) template {}; -#elif 0 +#elif 1 const auto blockwise_in_copy = blockwise_2d_tensor_copy_2