use ford/for instead of static_ford/static_for in threadwise copy, somehow register spill is greatly reduced on AMD

2026-05-13 09:45:56 +00:00 · 2019-08-07 19:09:13 -05:00
parent 5636576f9b
commit bc9ea646f8
7 changed files with 122 additions and 35 deletions
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -112,14 +112,14 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
    constexpr index_t GemmDataPerReadA   = 4;
    constexpr index_t GemmDataPerReadB   = 4;

-    using InBlockCopySubLengths_E_N1_B_N2      = Sequence<1, 1, 4, 1>;
-    using InBlockCopyClusterLengths_E_N1_B_N2  = Sequence<8, 2, 4, 4>;
+    using InBlockCopySubLengths_E_N1_B_N2      = Sequence<1, 1, 2, 2>;
+    using InBlockCopyClusterLengths_E_N1_B_N2  = Sequence<8, 2, 8, 2>;
    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]

-    constexpr index_t InBlockCopySrcDataPerRead_B   = 4;
-    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 1;
+    constexpr index_t InBlockCopySrcDataPerRead_B   = 2;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 2;

    using WeiBlockCopySubLengths_E_K            = Sequence<2, 2>;
    using WeiBlockCopyClusterLengths_E_K        = Sequence<4, 64>;
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -16,7 +16,7 @@
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"

 struct GeneratorTensor_1
 {
@@ -379,7 +379,7 @@ int main(int argc, char* argv[])
 #elif 0
    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
        (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 1
+#elif 0
    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
                                                         wei_kcyx_desc,