debugging ds_read asm

2026-07-17 17:19:12 +00:00 · 2019-04-26 15:34:55 -05:00
parent b93d2e1b57
commit 3ce77700b6
6 changed files with 118 additions and 84 deletions
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
@@ -183,6 +183,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                GemmDataPerReadA,
                GemmDataPerReadB>{};

+        // choose GEMM implementation here: only one branch of the #if chain below is compiled
+        const auto run_blockwise_batch_gemm = [&](auto... Xs) { // Xs are taken by value
+#if 1 // active: Run
+            return blockwise_batch_gemm.Run(Xs...);
+#elif 0 // disabled: Run_asm
+            return blockwise_batch_gemm.Run_asm(Xs...);
+#else // only reached if both conditions above are 0: Run_asm_v2
+            return blockwise_batch_gemm.Run_asm_v2(Xs...);
+#endif
+        };
+
        // LDS: be careful of alignment
        // TODO:: need to properly implement tensor descriptor with alignment
        constexpr index_t in_block_space =
@@ -241,13 +252,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn

                    __syncthreads();

-#if 1
-                    blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
-#elif 0
-                    blockwise_batch_gemm.Run_asm(p_wei_block, p_in_block, p_out_thread);
-#elif 1
-                    blockwise_batch_gemm.Run_asm_v2(p_wei_block, p_in_block, p_out_thread);
-#endif
+                    run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);

                    __syncthreads();
                }
@@ -279,13 +284,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn

                    __syncthreads();

-#if 1
-                    blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
-#elif 0
-                    blockwise_batch_gemm.Run_asm(p_wei_block, p_in_block, p_out_thread);
-#elif 1
-                    blockwise_batch_gemm.Run_asm_v2(p_wei_block, p_in_block, p_out_thread);
-#endif
+                    run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);

                    __syncthreads();
                }
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
@@ -199,6 +199,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
                GemmDataPerReadA,
                GemmDataPerReadB>{};

+        // choose GEMM implementation here: only one branch of the #if chain below is compiled
+        const auto run_blockwise_batch_gemm = [&](auto... Xs) { // Xs are taken by value
+#if 0 // disabled: Run
+            return blockwise_batch_gemm.Run(Xs...);
+#elif 0 // disabled: Run_asm
+            return blockwise_batch_gemm.Run_asm(Xs...);
+#else // active: Run_asm_v2, since both conditions above are 0
+            return blockwise_batch_gemm.Run_asm_v2(Xs...);
+#endif // NOTE(review): the replaced call sites selected Run -- confirm this switch is intended
+        };
+
        // LDS: be careful of alignment
        constexpr index_t in_block_space =
            in_c_h_w_n_block_desc.GetElementSpace(Number<max_align>{});
@@ -293,15 +304,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
                        blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
                                                                    p_wei_register_clipboard);

-// LDS double buffer: GEMM on current data
-#if 1
-                        blockwise_batch_gemm.Run
-#elif 0
-                        blockwise_batch_gemm.Run_asm
-#else
-                        blockwise_batch_gemm.Run_asm_v2
-#endif
-                            (p_wei_block_now, p_in_block_now, p_out_thread);
+                        run_blockwise_batch_gemm(p_wei_block_now, p_in_block_now, p_out_thread);

                        // LDS double buffer: store next data to LDS
                        blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
@@ -328,15 +331,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
                    blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
                                                                p_wei_register_clipboard);

-// LDS double buffer: GEMM on current data
-#if 1
-                    blockwise_batch_gemm.Run
-#elif 0
-                    blockwise_batch_gemm.Run_asm
-#else
-                    blockwise_batch_gemm.Run_asm_v2
-#endif
-                        (p_wei_block_double, p_in_block_double, p_out_thread);
+                    // LDS double buffer: GEMM on current data
+                    run_blockwise_batch_gemm(p_wei_block_double, p_in_block_double, p_out_thread);

                    // LDS double buffer: store next data to LDS
                    blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
@@ -347,17 +343,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
                    // odd iteration
                    __syncthreads();

-// LDS double buffer: GEMM on current data
-#if 1
-                    blockwise_batch_gemm.Run
-#elif 0
-                    blockwise_batch_gemm.Run_asm
-#else
-                    blockwise_batch_gemm.Run_asm_v2
-#endif
-                        (p_wei_block_double + wei_block_space,
-                         p_in_block_double + in_block_space,
-                         p_out_thread);
+                    // LDS double buffer: GEMM on current data
+                    run_blockwise_batch_gemm(p_wei_block_double + wei_block_space,
+                                             p_in_block_double + in_block_space,
+                                             p_out_thread);
                }
            }
        }
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp
@@ -193,6 +193,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
                GemmDataPerReadA,
                GemmDataPerReadB>{};

+        // choose GEMM implementation here: only one branch of the #if chain below is compiled
+        const auto run_blockwise_batch_gemm = [&](auto... Xs) { // Xs are taken by value
+#if 0 // disabled: Run
+            return blockwise_batch_gemm.Run(Xs...);
+#elif 0 // disabled: Run_asm
+            return blockwise_batch_gemm.Run_asm(Xs...);
+#else // active: Run_asm_v2, since both conditions above are 0
+            return blockwise_batch_gemm.Run_asm_v2(Xs...);
+#endif // NOTE(review): the replaced call sites called Run -- confirm this switch is intended
+        };
+
        // LDS: be careful of alignment
        constexpr index_t in_block_space =
            in_c_h_w_n_block_desc.GetElementSpace(Number<max_align>{});
@@ -267,7 +278,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn

                    __syncthreads();

-                    blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
+                    run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);

                    __syncthreads();
                }
@@ -314,7 +325,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn

                    __syncthreads();

-                    blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
+                    run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);

                    __syncthreads();
                }