From 3ce77700b62df4bb17832c048bbbcb965a457833 Mon Sep 17 00:00:00 2001
From: Chao Liu <lc.roy86@gmail.com>
Date: Fri, 26 Apr 2019 15:34:55 -0500
Subject: [PATCH] debugging ds_read asm

---
 ...lution_implicit_gemm_v1_chwn_cyxk_khwn.hpp |   2 +-
 ...lution_implicit_gemm_v1_nchw_cyxk_khwn.hpp | 109 ++++++++++++------
 driver/driver.hip.cpp                         |   2 +-
 ..._implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp |  27 +++--
 ...3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp |  47 +++-----
 ..._implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp |  15 ++-
 6 files changed, 118 insertions(+), 84 deletions(-)

diff --git a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
index abd771285d..938fba037a 100644
--- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
@@ -475,7 +475,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
             GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
 #elif 0
             GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
-#elif 1
+#elif 0
             GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #elif 1
             GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
index 6fe414b89d..22658d35ef 100644
--- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
@@ -65,6 +65,76 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
     out_khwn_device_buf.ToDevice(out_khwn.mData.data());
 
 #if 0
+    // for 3x3, 34x34, v1r3, Pascal
+    constexpr index_t BlockSize = 128;
+
+    constexpr index_t NPerBlock  = 2;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 16;
+
+    constexpr index_t NPerThread  = 2;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 4;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<2, 1, 2, 1>;
+    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<1, 8, 1, 16>;
+    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
+    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load input for NCHW
+    constexpr index_t InBlockReorderDataPerWrite_N = 2;
+
+    using WeiBlockCopyClusterLengths            = Sequence<0, 0>; // not used
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
+#elif 1
+    // for 3x3, 34x34, v1r3, Vega 20
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t NPerBlock  = 2;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 4;
+    constexpr index_t WoPerBlock = 16;
+
+    constexpr index_t NPerThread  = 2;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 4;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<2, 1, 2, 1>;
+    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<1, 8, 2, 16>;
+    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
+    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load input for NCHW
+    constexpr index_t InBlockReorderDataPerWrite_N = 2;
+
+    using WeiBlockCopyClusterLengths            = Sequence<0, 0>; // not used
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
+#elif 0
     // for 3x3, 28x28, v1r2, Pascal
     constexpr index_t BlockSize = 128;
 
@@ -133,41 +203,6 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
     using WeiBlockCopyClusterLengths            = Sequence<0, 0>; // not used
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;
 
-    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
-#elif 1
-    // for 3x3, 34x34, v1r3, Pascal
-    constexpr index_t BlockSize = 128;
-
-    constexpr index_t NPerBlock  = 2;
-    constexpr index_t KPerBlock  = 128;
-    constexpr index_t CPerBlock  = 8;
-    constexpr index_t HoPerBlock = 2;
-    constexpr index_t WoPerBlock = 16;
-
-    constexpr index_t NPerThread  = 2;
-    constexpr index_t KPerThread  = 8;
-    constexpr index_t HoPerThread = 1;
-    constexpr index_t WoPerThread = 4;
-
-    constexpr index_t GemmMPerThreadSubC = 4;
-    constexpr index_t GemmNPerThreadSubC = 4;
-    constexpr index_t GemmMLevel0Cluster = 4;
-    constexpr index_t GemmNLevel0Cluster = 2;
-    constexpr index_t GemmMLevel1Cluster = 4;
-    constexpr index_t GemmNLevel1Cluster = 2;
-    constexpr index_t GemmKPerThreadLoop = 1;
-    constexpr index_t GemmDataPerReadA   = 4;
-    constexpr index_t GemmDataPerReadB   = 4;
-
-    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<2, 1, 2, 1>;
-    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<1, 8, 1, 16>;
-    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
-    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load input for NCHW
-    constexpr index_t InBlockReorderDataPerWrite_N = 2;
-
-    using WeiBlockCopyClusterLengths            = Sequence<0, 0>; // not used
-    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
-
     constexpr index_t OutThreadCopyDataPerWrite_N = 2;
 #endif
 
@@ -182,9 +217,9 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
         constexpr auto gridwise_conv =
 #if 0
             GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
-#elif 0
-            GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
 #elif 1
+            GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
+#elif 0
             GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
 #endif
             <GridSize,
diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp
index ab08356262..09d0e61433 100644
--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -606,7 +606,7 @@ int main(int argc, char* argv[])
     device_direct_convolution_2_nchw_kcyx_nkhw
 #elif 0
     device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
 #elif 1
     device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
index 4a3459119d..ab67f3afff 100644
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
@@ -183,6 +183,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                 GemmDataPerReadA,
                 GemmDataPerReadB>{};
 
+        // choose GEMM implementation here
+        const auto run_blockwise_batch_gemm = [&](auto... Xs) {
+#if 1
+            return blockwise_batch_gemm.Run(Xs...);
+#elif 0
+            return blockwise_batch_gemm.Run_asm(Xs...);
+#else
+            return blockwise_batch_gemm.Run_asm_v2(Xs...);
+#endif
+        };
+
         // LDS: be careful of alignment
         // TODO:: need to properly implement tensor descriptor with alignment
         constexpr index_t in_block_space =
@@ -241,13 +252,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 
                     __syncthreads();
 
-#if 1
-                    blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
-#elif 0
-                    blockwise_batch_gemm.Run_asm(p_wei_block, p_in_block, p_out_thread);
-#elif 1
-                    blockwise_batch_gemm.Run_asm_v2(p_wei_block, p_in_block, p_out_thread);
-#endif
+                    run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);
 
                     __syncthreads();
                 }
@@ -279,13 +284,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 
                     __syncthreads();
 
-#if 1
-                    blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
-#elif 0
-                    blockwise_batch_gemm.Run_asm(p_wei_block, p_in_block, p_out_thread);
-#elif 1
-                    blockwise_batch_gemm.Run_asm_v2(p_wei_block, p_in_block, p_out_thread);
-#endif
+                    run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);
 
                     __syncthreads();
                 }
diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
index 5595d596e9..a266b2de1e 100644
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
@@ -199,6 +199,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
                 GemmDataPerReadA,
                 GemmDataPerReadB>{};
 
+        // choose GEMM implementation here
+        const auto run_blockwise_batch_gemm = [&](auto... Xs) {
+#if 0
+            return blockwise_batch_gemm.Run(Xs...);
+#elif 0
+            return blockwise_batch_gemm.Run_asm(Xs...);
+#else
+            return blockwise_batch_gemm.Run_asm_v2(Xs...);
+#endif
+        };
+
         // LDS: be careful of alignment
         constexpr index_t in_block_space =
             in_c_h_w_n_block_desc.GetElementSpace(Number<max_align>{});
@@ -293,15 +304,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
                         blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
                                                                     p_wei_register_clipboard);
 
-// LDS double buffer: GEMM on current data
-#if 1
-                        blockwise_batch_gemm.Run
-#elif 0
-                        blockwise_batch_gemm.Run_asm
-#else
-                        blockwise_batch_gemm.Run_asm_v2
-#endif
-                            (p_wei_block_now, p_in_block_now, p_out_thread);
+                        run_blockwise_batch_gemm(p_wei_block_now, p_in_block_now, p_out_thread);
 
                         // LDS double buffer: store next data to LDS
                         blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
@@ -328,15 +331,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
                     blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
                                                                 p_wei_register_clipboard);
 
-// LDS double buffer: GEMM on current data
-#if 1
-                    blockwise_batch_gemm.Run
-#elif 0
-                    blockwise_batch_gemm.Run_asm
-#else
-                    blockwise_batch_gemm.Run_asm_v2
-#endif
-                        (p_wei_block_double, p_in_block_double, p_out_thread);
+                    // LDS double buffer: GEMM on current data
+                    run_blockwise_batch_gemm(p_wei_block_double, p_in_block_double, p_out_thread);
 
                     // LDS double buffer: store next data to LDS
                     blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
@@ -347,17 +343,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
                     // odd iteration
                     __syncthreads();
 
-// LDS double buffer: GEMM on current data
-#if 1
-                    blockwise_batch_gemm.Run
-#elif 0
-                    blockwise_batch_gemm.Run_asm
-#else
-                    blockwise_batch_gemm.Run_asm_v2
-#endif
-                        (p_wei_block_double + wei_block_space,
-                         p_in_block_double + in_block_space,
-                         p_out_thread);
+                    // LDS double buffer: GEMM on current data
+                    run_blockwise_batch_gemm(p_wei_block_double + wei_block_space,
+                                             p_in_block_double + in_block_space,
+                                             p_out_thread);
                 }
             }
         }
diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp
index 062d3c4540..5e6fde6ef2 100644
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp
@@ -193,6 +193,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
                 GemmDataPerReadA,
                 GemmDataPerReadB>{};
 
+        // choose GEMM implementation here
+        const auto run_blockwise_batch_gemm = [&](auto... Xs) {
+#if 0
+            return blockwise_batch_gemm.Run(Xs...);
+#elif 0
+            return blockwise_batch_gemm.Run_asm(Xs...);
+#else
+            return blockwise_batch_gemm.Run_asm_v2(Xs...);
+#endif
+        };
+
         // LDS: be careful of alignment
         constexpr index_t in_block_space =
             in_c_h_w_n_block_desc.GetElementSpace(Number<max_align>{});
@@ -267,7 +278,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
 
                     __syncthreads();
 
-                    blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
+                    run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);
 
                     __syncthreads();
                 }
@@ -314,7 +325,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
 
                     __syncthreads();
 
-                    blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
+                    run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);
 
                     __syncthreads();
                 }