device_implicit_gemm_convolution_1_chwn_csrk_khwn: use tensor copy (instead of pointwise) for writing output, 3x3 increased from 78% to 84%, 5x5 from 80% to 84%

[ROCm/composable_kernel commit: a65ef90308]
2026-05-19 12:30:16 +00:00 · 2019-02-19 11:47:46 -06:00
parent f6f3f52faa
commit 1c962a13ee
7 changed files with 795 additions and 60 deletions
--- a/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp
+++ b/driver/device_implicit_gemm_convolution_1_chwn_csrk_khwn.hpp
@@ -75,7 +75,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
    out_khwn_device_buf.ToDevice(out_khwn.mData.data());

 #if 1
-    // for 3x3, 34x34, try
+    // for 3x3, 34x34
    constexpr unsigned NPerBlock  = 16;
    constexpr unsigned KPerBlock  = 64;
    constexpr unsigned CPerBlock  = 4;
@@ -106,9 +106,46 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
    constexpr unsigned GemmNLevel1Cluster = 4;
    constexpr unsigned GemmKPerThreadLoop = 1;

+    constexpr unsigned OutThreadCopyDataPerWrite = 2;
+
    constexpr unsigned BlockSize = 128;
 #elif 0
-    // for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256
+    // for 5x5, 36x36
+    constexpr unsigned NPerBlock  = 16;
+    constexpr unsigned KPerBlock  = 64;
+    constexpr unsigned CPerBlock  = 2;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 4;
+
+    constexpr unsigned NPerThread  = 8;
+    constexpr unsigned KPerThread  = 8;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 1;
+
+    constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned WeiBlockCopyThreadPerDim1 = 32;
+
+    constexpr unsigned InBlockCopy_ThreadPerDimC = 2;
+    constexpr unsigned InBlockCopy_ThreadPerDimH = 2;
+    constexpr unsigned InBlockCopy_ThreadPerDimW = 4;
+    constexpr unsigned InBlockCopy_ThreadPerDimN = 4;
+    constexpr unsigned InBlockCopyDataPerRead    = 4;
+
+    constexpr unsigned WeiBlockCopyDataPerRead = 2;
+
+    constexpr unsigned GemmMPerThreadSubC = 4;
+    constexpr unsigned GemmNPerThreadSubC = 4;
+    constexpr unsigned GemmMLevel0Cluster = 4;
+    constexpr unsigned GemmNLevel0Cluster = 2;
+    constexpr unsigned GemmMLevel1Cluster = 2;
+    constexpr unsigned GemmNLevel1Cluster = 4;
+    constexpr unsigned GemmKPerThreadLoop = 1;
+
+    constexpr unsigned OutThreadCopyDataPerWrite = 2;
+
+    constexpr unsigned BlockSize = 128;
+#elif 0
+    // 3x3 58x58, NKC = 64, 64, 256
    constexpr unsigned NPerBlock  = 16;
    constexpr unsigned KPerBlock  = 64;
    constexpr unsigned CPerBlock  = 4;
@@ -142,27 +179,6 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
    constexpr unsigned HoPerThread = 1;
    constexpr unsigned WoPerThread = 1;

-    constexpr unsigned BlockSize = 128;
-#elif 0
-    // for 5x5, 36x36
-    constexpr unsigned NPerBlock  = 16;
-    constexpr unsigned KPerBlock  = 64;
-    constexpr unsigned CPerBlock  = 2;
-    constexpr unsigned HoPerBlock = 2;
-    constexpr unsigned WoPerBlock = 4;
-
-    constexpr unsigned NPerThread  = 4;
-    constexpr unsigned KPerThread  = 16;
-    constexpr unsigned CPerThread  = 1;
-    constexpr unsigned HoPerThread = 1;
-    constexpr unsigned WoPerThread = 1;
-
-    constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
-    constexpr unsigned WeiBlockCopyThreadPerDim1 = 32;
-
-    constexpr unsigned InBlockCopyDataPerRead  = 4; // not used, yet
-    constexpr unsigned WeiBlockCopyDataPerRead = 4;
-
    constexpr unsigned BlockSize = 128;
 #elif 0
    // for 7x7, 38x38
@@ -200,7 +216,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
    constexpr unsigned WoPerThread = 1;

    constexpr unsigned BlockSize = 128;
-#elif 0
+#elif 1
    // for 1x1, 28x28
    constexpr unsigned NPerBlock  = 16;
    constexpr unsigned KPerBlock  = 128;
@@ -210,7 +226,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,

    constexpr unsigned NPerThread  = 4;
    constexpr unsigned KPerThread  = 16;
-    constexpr unsigned CPerThread  = 2;
+    constexpr unsigned CPerThread  = 1;
    constexpr unsigned HoPerThread = 1;
    constexpr unsigned WoPerThread = 1;

@@ -225,6 +241,16 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,

    constexpr unsigned WeiBlockCopyDataPerRead = 4;

+    constexpr unsigned GemmMPerThreadSubC = 4;
+    constexpr unsigned GemmNPerThreadSubC = 4;
+    constexpr unsigned GemmMLevel0Cluster = 4;
+    constexpr unsigned GemmNLevel0Cluster = 2;
+    constexpr unsigned GemmMLevel1Cluster = 2;
+    constexpr unsigned GemmNLevel1Cluster = 4;
+    constexpr unsigned GemmKPerThreadLoop = 1;
+
+    constexpr unsigned OutThreadCopyDataPerWrite = 2;
+
    constexpr unsigned BlockSize = 128;
 #endif

@@ -266,7 +292,8 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
                                                                GemmNLevel0Cluster,
                                                                GemmMLevel1Cluster,
                                                                GemmNLevel1Cluster,
-                                                                GemmKPerThreadLoop>,
+                                                                GemmKPerThreadLoop,
+                                                                OutThreadCopyDataPerWrite>,
            dim3(GridSize),
            dim3(BlockSize),
            static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -571,16 +571,21 @@ int main()

    std::size_t num_thread = std::thread::hardware_concurrency();

+    bool do_verification = true;
+
+    if(do_verification)
+    {
 #if 0
-    in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-    wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 1
-    in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-    wei_kcsr.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        wei_kcsr.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #elif 1
-    in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread);
-    wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread);
+        wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #endif
+    }

    unsigned nrepeat = 200;

@@ -614,22 +619,23 @@ int main()
                                                             nrepeat);
 #endif

-#if 1
-    if(S == 3 && R == 3)
+    if(do_verification)
    {
-        host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
-    }
-    else
-    {
-        host_direct_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
-    }
-    check_error(out_nkhw_host, out_nkhw_device);
-#endif
+        if(S == 3 && R == 3)
+        {
+            host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
+        }
+        else
+        {
+            host_direct_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
+        }
+        check_error(out_nkhw_host, out_nkhw_device);

 #if 0
-    LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
-    LogRange(std::cout << "wei_kcsr: ", wei_kcsr.mData, ",") << std::endl;
-    LogRange(std::cout << "out_nkhw_host  : ", out_nkhw_host.mData, ",") << std::endl;
-    LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
+        LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
+        LogRange(std::cout << "wei_kcsr: ", wei_kcsr.mData, ",") << std::endl;
+        LogRange(std::cout << "out_nkhw_host  : ", out_nkhw_host.mData, ",") << std::endl;
+        LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
 #endif
+    }
 }