Add implicit GEMM convolution

2026-05-11 17:00:18 +00:00 · 2019-01-14 11:13:36 -06:00
parent dc60d16962
commit aa0199a31c
3 changed files with 70 additions and 109 deletions
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -8,7 +8,7 @@
 #include "conv_common.cuh"
 #include "device_direct_convolution_1.cuh"
 #include "device_direct_convolution_2.cuh"
-//#include "device_implicit_gemm_convolution.cuh"
+#include "device_implicit_gemm_convolution.cuh"
 //#include "device_winograd_convolution.cuh"

 struct GeneratorTensor_1
@@ -393,9 +393,9 @@ int main()
    {
 #if 0
        device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device);
-#elif 1
-        device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device);
 #elif 0
+        device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device);
+#elif 1
        device_implicit_gemm_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
 #elif 0
        device_winograd_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
--- a/driver/device_implicit_gemm_convolution.cuh
+++ b/driver/device_implicit_gemm_convolution.cuh
@@ -26,53 +26,24 @@ void device_implicit_gemm_convolution(
    constexpr auto out_desc = OutDesc{};

 #if 1
-    constexpr unsigned OutTileSizeH = 2;
-    constexpr unsigned OutTileSizeW = 2;
-    constexpr unsigned NPerBlock    = 2;
-    constexpr unsigned KPerBlock    = 32;
-    constexpr unsigned CPerBlock    = 4;
-    constexpr unsigned YPerBlock    = 1;
-    constexpr unsigned XPerBlock    = 16;
+    constexpr unsigned NPerBlock  = 2;
+    constexpr unsigned KPerBlock  = 128;
+    constexpr unsigned CPerBlock  = 4;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 32;

    constexpr unsigned NPerThread = 2;
-    constexpr unsigned KPerThread = 4;
-    constexpr unsigned CPerThread = 2;
-
-    constexpr unsigned BlockSize = 128;
-#elif 0
-    constexpr unsigned OutTileSizeH = 2;
-    constexpr unsigned OutTileSizeW = 2;
-    constexpr unsigned NPerBlock    = 2;
-    constexpr unsigned KPerBlock    = 32;
-    constexpr unsigned CPerBlock    = 4;
-    constexpr unsigned YPerBlock    = 1;
-    constexpr unsigned XPerBlock    = 27;
-
-    constexpr unsigned NPerThread = 2;
-    constexpr unsigned KPerThread = 4;
-    constexpr unsigned CPerThread = 2;
-
-    constexpr unsigned BlockSize = 216;
-#elif 0
-    constexpr unsigned OutTileSizeH = 2;
-    constexpr unsigned OutTileSizeW = 2;
-    constexpr unsigned NPerBlock    = 2;
-    constexpr unsigned KPerBlock    = 32;
-    constexpr unsigned CPerBlock    = 4;
-    constexpr unsigned YPerBlock    = 1;
-    constexpr unsigned XPerBlock    = 32;
-
-    constexpr unsigned NPerThread = 2;
-    constexpr unsigned KPerThread = 4;
+    constexpr unsigned KPerThread = 8;
    constexpr unsigned CPerThread = 2;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 4;

    constexpr unsigned BlockSize = 256;
 #endif

-    constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) *
-                                  (out_desc.GetLength(I1) / KPerBlock) *
-                                  (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) *
-                                  (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock));
+    constexpr unsigned GridSize =
+        (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
+        (out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock);

    dim3 block_dim(BlockSize);
    dim3 grid_dim(GridSize);
@@ -85,22 +56,21 @@ void device_implicit_gemm_convolution(
    cudaEventCreate(&start);
    cudaEventRecord(start, 0);

-    gridwise_implicit_gemm_convolution<T,
-                                       InDesc,
-                                       WeiDesc,
-                                       OutDesc,
-                                       OutTileSizeH,
-                                       OutTileSizeW,
-                                       NPerBlock,
-                                       KPerBlock,
-                                       CPerBlock,
-                                       YPerBlock,
-                                       XPerBlock,
-                                       NPerThread,
-                                       KPerThread,
-                                       CPerThread,
-                                       BlockSize,
-                                       GridSize>
+    gridwise_implicit_gemm_convolution_nchw_kcsr<GridSize,
+                                                 BlockSize,
+                                                 T,
+                                                 InDesc,
+                                                 WeiDesc,
+                                                 OutDesc,
+                                                 NPerBlock,
+                                                 KPerBlock,
+                                                 CPerBlock,
+                                                 HoPerBlock,
+                                                 WoPerBlock,
+                                                 KPerThread,
+                                                 CPerThread,
+                                                 HoPerThread,
+                                                 WoPerThread>
        <<<grid_dim, block_dim>>>(InDesc{},
                                  static_cast<T*>(in_device_buf.GetDeviceBuffer()),
                                  WeiDesc{},