From 9bbe9073ab90ba1ff4e680eaaa985d6d2ee81e92 Mon Sep 17 00:00:00 2001
From: Chao Liu <lc.roy86@gmail.com>
Date: Mon, 4 Feb 2019 15:40:34 -0600
Subject: [PATCH] refactor

---
 driver/conv.cu                                | 63 ++++++++++++++++---
 ...icit_gemm_convolution_1_chwn_csrk_khwn.cuh | 17 ++++-
 ...volution_1_chwn_csrk_khwn_with_padding.cuh | 49 ++++++++++++++-
 ...icit_gemm_convolution_2_cnhw_srck_knhw.cuh | 19 +++++-
 ...icit_gemm_convolution_2_cnhw_srck_knhw.cuh |  2 +-
 ...icit_gemm_convolution_3_cnhw_srck_knhw.cuh |  2 +-
 6 files changed, 138 insertions(+), 14 deletions(-)

diff --git a/driver/conv.cu b/driver/conv.cu
index de5996c7aa..999c03f03b 100644
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -453,7 +453,7 @@ int main()
 
     constexpr unsigned HPad = 0;
     constexpr unsigned WPad = 0;
-#elif 1
+#elif 0
     // 3x3 filter, 56x56 image, 1x1 padding
     constexpr unsigned N  = 16;
     constexpr unsigned C  = 128;
@@ -477,6 +477,18 @@ int main()
 
     constexpr unsigned HPad = 1;
     constexpr unsigned WPad = 1;
+#elif 1
+    // 1x1 filter, 28x28 image
+    constexpr unsigned N  = 16;
+    constexpr unsigned C  = 256;
+    constexpr unsigned HI = 28;
+    constexpr unsigned WI = 28;
+    constexpr unsigned K  = 512;
+    constexpr unsigned S  = 1;
+    constexpr unsigned R  = 1;
+
+    constexpr unsigned HPad = 0;
+    constexpr unsigned WPad = 0;
 #elif 0
     // 3x3 filter, 20x84 image, 1x1 padding
     constexpr unsigned N  = 16;
@@ -489,6 +501,42 @@ int main()
 
     constexpr unsigned HPad = 1;
     constexpr unsigned WPad = 1;
+#elif 0
+    // 3x3 filter, 112x112 image, 1x1 padding
+    constexpr unsigned N  = 16;
+    constexpr unsigned C  = 64;
+    constexpr unsigned HI = 112;
+    constexpr unsigned WI = 112;
+    constexpr unsigned K  = 128;
+    constexpr unsigned S  = 3;
+    constexpr unsigned R  = 3;
+
+    constexpr unsigned HPad = 1;
+    constexpr unsigned WPad = 1;
+#elif 0
+    // 5x5 filter, 20x86 image, 1x1 padding
+    constexpr unsigned N  = 16;
+    constexpr unsigned C  = 256;
+    constexpr unsigned HI = 20;
+    constexpr unsigned WI = 86;
+    constexpr unsigned K  = 512;
+    constexpr unsigned S  = 5;
+    constexpr unsigned R  = 5;
+
+    constexpr unsigned HPad = 1;
+    constexpr unsigned WPad = 1;
+#elif 0
+    // 5x5 filter, 28x28 image, 2x2 padding
+    constexpr unsigned N  = 16;
+    constexpr unsigned C  = 192;
+    constexpr unsigned HI = 28;
+    constexpr unsigned WI = 28;
+    constexpr unsigned K  = 32;
+    constexpr unsigned S  = 5;
+    constexpr unsigned R  = 5;
+
+    constexpr unsigned HPad = 2;
+    constexpr unsigned WPad = 2;
 #endif
 
     auto lower_pads = Sequence<HPad, WPad>{};
@@ -510,7 +558,7 @@ int main()
 
     std::size_t num_thread = std::thread::hardware_concurrency();
 
-#if 1
+#if 0
     in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
     wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 1
@@ -518,9 +566,9 @@ int main()
     wei_kcsr.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #endif
 
-    unsigned nrepeat = 50;
+    unsigned nrepeat = 100;
 
-#if 0
+#if 1
 #if 0
     device_direct_convolution_1
 #elif 0
@@ -531,15 +579,14 @@ int main()
     device_implicit_gemm_convolution_1_nchw_srck_nkhw
 #elif 0
     device_implicit_gemm_convolution_1_chwn_csrk_khwn
-#elif 0
+#elif 1
     device_implicit_gemm_convolution_2_cnhw_srck_knhw
 #elif 0
     device_winograd_convolution
 #endif
     (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);
-#endif
 
-#if 1
+#elif 1
     device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(in_nchw_desc,
                                                                    in_nchw,
                                                                    wei_kcsr_desc,
@@ -551,7 +598,7 @@ int main()
                                                                    nrepeat);
 #endif
 
-#if 1
+#if 0
     if(S == 3 && R == 3)
     {
         host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
diff --git a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
index c8996f1ad7..7d3435823d 100644
--- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
+++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
@@ -102,7 +102,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
     constexpr unsigned WoPerThread = 1;
 
     constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
     // 3x3 58x58, NKC = 16,256,128
     constexpr unsigned NPerBlock  = 8;
     constexpr unsigned KPerBlock  = 64;
@@ -161,6 +161,21 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
     constexpr unsigned HoPerThread = 1;
     constexpr unsigned WoPerThread = 1;
 
+    constexpr unsigned BlockSize = 128;
+#elif 1
+    // for 1x1, 28x28
+    constexpr unsigned NPerBlock  = 16;
+    constexpr unsigned KPerBlock  = 128;
+    constexpr unsigned CPerBlock  = 8;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 2;
+
+    constexpr unsigned NPerThread  = 4;
+    constexpr unsigned KPerThread  = 16;
+    constexpr unsigned CPerThread  = 2;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 1;
+
     constexpr unsigned BlockSize = 128;
 #endif
 
diff --git a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh
index 94da496f5a..46aa80fbe2 100644
--- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh
+++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh
@@ -164,7 +164,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
     constexpr unsigned WoPerThread = 1;
 
     constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
     // 3x3 56x56, NKC = 16,256,128, with padding
     // 3x3 28x28, NKC = 16,512,256, with padding
     // 3x3 20x84, NKC = 16,256,256, with padding
@@ -180,6 +180,51 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
     constexpr unsigned HoPerThread = 1;
     constexpr unsigned WoPerThread = 1;
 
+    constexpr unsigned BlockSize = 128;
+#elif 0
+    // for 5x5 filter, 20x84 image, 1x1 padding
+    constexpr unsigned NPerBlock  = 16;
+    constexpr unsigned KPerBlock  = 64;
+    constexpr unsigned CPerBlock  = 1;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 4;
+
+    constexpr unsigned NPerThread  = 4;
+    constexpr unsigned KPerThread  = 16;
+    constexpr unsigned CPerThread  = 1;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 1;
+
+    constexpr unsigned BlockSize = 128;
+#elif 0
+    // 5x5 filter, 28x28 image, 2x2 padding
+    constexpr unsigned NPerBlock  = 16;
+    constexpr unsigned KPerBlock  = 32;
+    constexpr unsigned CPerBlock  = 2;
+    constexpr unsigned HoPerBlock = 4;
+    constexpr unsigned WoPerBlock = 4;
+
+    constexpr unsigned NPerThread  = 4;
+    constexpr unsigned KPerThread  = 16;
+    constexpr unsigned CPerThread  = 1;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 1;
+
+    constexpr unsigned BlockSize = 128;
+#elif 1
+    // for 1x1, 28x28
+    constexpr unsigned NPerBlock  = 16;
+    constexpr unsigned KPerBlock  = 128;
+    constexpr unsigned CPerBlock  = 8;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 2;
+
+    constexpr unsigned NPerThread  = 4;
+    constexpr unsigned KPerThread  = 16;
+    constexpr unsigned CPerThread  = 2;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 1;
+
     constexpr unsigned BlockSize = 128;
 #endif
 
@@ -229,7 +274,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
         cudaEventElapsedTime(&elapsedTime, start, stop);
         printf("Elapsed time : %f ms\n", elapsedTime);
 
-        usleep(10000);
+        usleep(elapsedTime * 1000);
     }
 
     checkCudaErrors(cudaGetLastError());
diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
index 18c639ea1b..f9b0394e03 100644
--- a/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
+++ b/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
@@ -93,7 +93,7 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
     constexpr unsigned GemmThreadPerClusterColumn = 4;
 
     constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
     constexpr unsigned BPerBlock = 128;
     constexpr unsigned KPerBlock = 64;
     constexpr unsigned CPerBlock = 2;
@@ -108,6 +108,23 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
     constexpr unsigned InBlockCopyThreadPerDim0 = 2;
     constexpr unsigned InBlockCopyThreadPerDim1 = 64;
 
+    constexpr unsigned BlockSize = 128;
+#elif 1
+    // 1x1, 28x28
+    constexpr unsigned BPerBlock = 64;
+    constexpr unsigned KPerBlock = 128;
+    constexpr unsigned CPerBlock = 8;
+
+    constexpr unsigned BPerThread = 4;
+    constexpr unsigned KPerThread = 16;
+    constexpr unsigned CPerThread = 2;
+
+    constexpr unsigned GemmRowThreadPerCluster    = 8;
+    constexpr unsigned GemmColumnThreadPerCluster = 8;
+
+    constexpr unsigned InBlockCopyThreadPerDim0 = 2;
+    constexpr unsigned InBlockCopyThreadPerDim1 = 64;
+
     constexpr unsigned BlockSize = 128;
 #endif
 
diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
index 43d47c2ceb..4b62118323 100644
--- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
@@ -7,7 +7,7 @@
 #include "threadwise_2d_tensor_op.cuh"
 #include "gemm.cuh"
 
-// define B = N*Hi*Wi
+// define B = flatten(N, Hi, Wi)
 template <unsigned GridSize,
           unsigned BlockSize,
           class Float,
diff --git a/src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh
index 0d1b0fca0b..f9acb31d8d 100644
--- a/src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh
@@ -115,7 +115,7 @@ gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw(InGlobalDesc,
                                    decltype(in_cb_global_desc),
                                    decltype(in_cb_block_desc),
                                    decltype(in_cb_block_desc.GetLengths())>{};
-#elif 0
+#elif 1
     const auto blockwise_in_copy =
         blockwise_2d_tensor_copy_2<BlockSize,
                                    Float,