refactor

2026-06-29 03:07:02 +00:00 · 2019-02-04 15:40:34 -06:00
parent 3439e4b5b7
commit 9bbe9073ab
6 changed files with 138 additions and 14 deletions
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -453,7 +453,7 @@ int main()

    constexpr unsigned HPad = 0;
    constexpr unsigned WPad = 0;
-#elif 1
+#elif 0
    // 3x3 filter, 56x56 image, 1x1 padding
    constexpr unsigned N  = 16;
    constexpr unsigned C  = 128;
@@ -477,6 +477,18 @@ int main()

    constexpr unsigned HPad = 1;
    constexpr unsigned WPad = 1;
+#elif 1
+    // 1x1 filter, 28x28 image
+    constexpr unsigned N  = 16;
+    constexpr unsigned C  = 256;
+    constexpr unsigned HI = 28;
+    constexpr unsigned WI = 28;
+    constexpr unsigned K  = 512;
+    constexpr unsigned S  = 1;
+    constexpr unsigned R  = 1;
+
+    constexpr unsigned HPad = 0;
+    constexpr unsigned WPad = 0;
 #elif 0
    // 3x3 filter, 20x84 image, 1x1 padding
    constexpr unsigned N  = 16;
@@ -489,6 +501,42 @@ int main()

    constexpr unsigned HPad = 1;
    constexpr unsigned WPad = 1;
+#elif 0
+    // 3x3 filter, 112x112 image, 1x1 padding
+    constexpr unsigned N  = 16;
+    constexpr unsigned C  = 64;
+    constexpr unsigned HI = 112;
+    constexpr unsigned WI = 112;
+    constexpr unsigned K  = 128;
+    constexpr unsigned S  = 3;
+    constexpr unsigned R  = 3;
+
+    constexpr unsigned HPad = 1;
+    constexpr unsigned WPad = 1;
+#elif 0
+    // 5x5 filter, 20x86 image, 1x1 padding
+    constexpr unsigned N  = 16;
+    constexpr unsigned C  = 256;
+    constexpr unsigned HI = 20;
+    constexpr unsigned WI = 86;
+    constexpr unsigned K  = 512;
+    constexpr unsigned S  = 5;
+    constexpr unsigned R  = 5;
+
+    constexpr unsigned HPad = 1;
+    constexpr unsigned WPad = 1;
+#elif 0
+    // 5x5 filter, 28x28 image, 2x2 padding
+    constexpr unsigned N  = 16;
+    constexpr unsigned C  = 192;
+    constexpr unsigned HI = 28;
+    constexpr unsigned WI = 28;
+    constexpr unsigned K  = 32;
+    constexpr unsigned S  = 5;
+    constexpr unsigned R  = 5;
+
+    constexpr unsigned HPad = 2;
+    constexpr unsigned WPad = 2;
 #endif

    auto lower_pads = Sequence<HPad, WPad>{};
@@ -510,7 +558,7 @@ int main()

    std::size_t num_thread = std::thread::hardware_concurrency();

-#if 1
+#if 0
    in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
    wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 1
@@ -518,9 +566,9 @@ int main()
    wei_kcsr.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #endif

-    unsigned nrepeat = 50;
+    unsigned nrepeat = 100;

-#if 0
+#if 1
 #if 0
    device_direct_convolution_1
 #elif 0
@@ -531,15 +579,14 @@ int main()
    device_implicit_gemm_convolution_1_nchw_srck_nkhw
 #elif 0
    device_implicit_gemm_convolution_1_chwn_csrk_khwn
-#elif 0
+#elif 1
    device_implicit_gemm_convolution_2_cnhw_srck_knhw
 #elif 0
    device_winograd_convolution
 #endif
    (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);
-#endif

-#if 1
+#elif 1
    device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(in_nchw_desc,
                                                                   in_nchw,
                                                                   wei_kcsr_desc,
@@ -551,7 +598,7 @@ int main()
                                                                   nrepeat);
 #endif

-#if 1
+#if 0
    if(S == 3 && R == 3)
    {
        host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
--- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
+++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
@@ -102,7 +102,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
    constexpr unsigned WoPerThread = 1;

    constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
    // 3x3 58x58, NKC = 16,256,128
    constexpr unsigned NPerBlock  = 8;
    constexpr unsigned KPerBlock  = 64;
@@ -161,6 +161,21 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
    constexpr unsigned HoPerThread = 1;
    constexpr unsigned WoPerThread = 1;

+    constexpr unsigned BlockSize = 128;
+#elif 1
+    // for 1x1, 28x28
+    constexpr unsigned NPerBlock  = 16;
+    constexpr unsigned KPerBlock  = 128;
+    constexpr unsigned CPerBlock  = 8;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 2;
+
+    constexpr unsigned NPerThread  = 4;
+    constexpr unsigned KPerThread  = 16;
+    constexpr unsigned CPerThread  = 2;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 1;
+
    constexpr unsigned BlockSize = 128;
 #endif

--- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh
+++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh
@@ -164,7 +164,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
    constexpr unsigned WoPerThread = 1;

    constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
    // 3x3 56x56, NKC = 16,256,128, with padding
    // 3x3 28x28, NKC = 16,512,256, with padding
    // 3x3 20x84, NKC = 16,256,256, with padding
@@ -180,6 +180,51 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
    constexpr unsigned HoPerThread = 1;
    constexpr unsigned WoPerThread = 1;

+    constexpr unsigned BlockSize = 128;
+#elif 0
+    // for 5x5 filter, 20x86 image, 1x1 padding
+    constexpr unsigned NPerBlock  = 16;
+    constexpr unsigned KPerBlock  = 64;
+    constexpr unsigned CPerBlock  = 1;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 4;
+
+    constexpr unsigned NPerThread  = 4;
+    constexpr unsigned KPerThread  = 16;
+    constexpr unsigned CPerThread  = 1;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 1;
+
+    constexpr unsigned BlockSize = 128;
+#elif 0
+    // 5x5 filter, 28x28 image, 2x2 padding
+    constexpr unsigned NPerBlock  = 16;
+    constexpr unsigned KPerBlock  = 32;
+    constexpr unsigned CPerBlock  = 2;
+    constexpr unsigned HoPerBlock = 4;
+    constexpr unsigned WoPerBlock = 4;
+
+    constexpr unsigned NPerThread  = 4;
+    constexpr unsigned KPerThread  = 16;
+    constexpr unsigned CPerThread  = 1;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 1;
+
+    constexpr unsigned BlockSize = 128;
+#elif 1
+    // for 1x1, 28x28
+    constexpr unsigned NPerBlock  = 16;
+    constexpr unsigned KPerBlock  = 128;
+    constexpr unsigned CPerBlock  = 8;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 2;
+
+    constexpr unsigned NPerThread  = 4;
+    constexpr unsigned KPerThread  = 16;
+    constexpr unsigned CPerThread  = 2;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 1;
+
    constexpr unsigned BlockSize = 128;
 #endif

@@ -229,7 +274,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
        cudaEventElapsedTime(&elapsedTime, start, stop);
        printf("Elapsed time : %f ms\n", elapsedTime);

-        usleep(10000);
+        usleep(elapsedTime * 1000);
    }

    checkCudaErrors(cudaGetLastError());
--- a/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
+++ b/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
@@ -93,7 +93,7 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
    constexpr unsigned GemmThreadPerClusterColumn = 4;

    constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
    constexpr unsigned BPerBlock = 128;
    constexpr unsigned KPerBlock = 64;
    constexpr unsigned CPerBlock = 2;
@@ -108,6 +108,23 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
    constexpr unsigned InBlockCopyThreadPerDim0 = 2;
    constexpr unsigned InBlockCopyThreadPerDim1 = 64;

+    constexpr unsigned BlockSize = 128;
+#elif 1
+    // 1x1, 28x28
+    constexpr unsigned BPerBlock = 64;
+    constexpr unsigned KPerBlock = 128;
+    constexpr unsigned CPerBlock = 8;
+
+    constexpr unsigned BPerThread = 4;
+    constexpr unsigned KPerThread = 16;
+    constexpr unsigned CPerThread = 2;
+
+    constexpr unsigned GemmRowThreadPerCluster    = 8;
+    constexpr unsigned GemmColumnThreadPerCluster = 8;
+
+    constexpr unsigned InBlockCopyThreadPerDim0 = 2;
+    constexpr unsigned InBlockCopyThreadPerDim1 = 64;
+
    constexpr unsigned BlockSize = 128;
 #endif

--- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
@@ -7,7 +7,7 @@
 #include "threadwise_2d_tensor_op.cuh"
 #include "gemm.cuh"

-// define B = N*Hi*Wi
+// define B = flatten(N, Hi, Wi)
 template <unsigned GridSize,
          unsigned BlockSize,
          class Float,
--- a/src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh
@@ -115,7 +115,7 @@ gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw(InGlobalDesc,
                                   decltype(in_cb_global_desc),
                                   decltype(in_cb_block_desc),
                                   decltype(in_cb_block_desc.GetLengths())>{};
-#elif 0
+#elif 1
    const auto blockwise_in_copy =
        blockwise_2d_tensor_copy_2<BlockSize,
                                   Float,