Use ford/for instead of static_ford/static_for in threadwise copy; for reasons not yet understood, this greatly reduces register spilling on AMD

2026-05-12 17:26:00 +00:00 · 2019-08-07 19:09:13 -05:00
parent 5636576f9b
commit bc9ea646f8
7 changed files with 122 additions and 35 deletions
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -16,7 +16,7 @@
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"

 struct GeneratorTensor_1
 {
@@ -379,7 +379,7 @@ int main(int argc, char* argv[])
 #elif 0
    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
        (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 1
+#elif 0
    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
                                                         wei_kcyx_desc,