From 686e38c5f0fb86113515cd57fcc7643fdf03920d Mon Sep 17 00:00:00 2001 From: rocking5566 Date: Mon, 22 Aug 2022 20:50:28 +0800 Subject: [PATCH] [What] Fix verification failure on the E matrix (#371) [Why] We need to sync LDS even in the first loop iteration because the GEMM also uses the same LDS. [ROCm/composable_kernel commit: c366de553ede7ccb931ad32b03db5dd1b8655201] --- example/16_gemm_multi_d_multi_reduces/CMakeLists.txt | 6 +----- example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp | 2 +- .../gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp | 3 +-- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt index 21897a2bcc..8f5d4eaa47 100644 --- a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt +++ b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt @@ -1,7 +1,3 @@ add_example_executable(example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp) add_example_executable(example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp) - -#exclude GEMM+max exampe from testing, since there is random failure on gfx908 -#https://github.com/ROCmSoftwarePlatform/composable_kernel/issues/358 -#TODO: fix the failure and re-enable this test -add_example_executable_no_testing(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp) +add_example_executable(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp) diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp index 870f4aece3..8119f7cb3b 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp @@ -211,7 +211,7 @@ int main() r0_device_buf.FromDevice(r0_m.mData.data()); pass = ck::utils::check_err( - e_m_n.mData, e_m_n_host.mData, "Error: Incorrect results c", 1e-2, 1e-2); + e_m_n.mData, e_m_n_host.mData, "Error: Incorrect 
results e", 1e-2, 1e-2); pass &= ck::utils::check_err( r0_m.mData, r0_m_host.mData, "Error: Incorrect results d0", 1e-2, 1e-2); } diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp index 744cf35dda..58cd1cce2f 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp @@ -776,8 +776,7 @@ struct GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1 static_for<0, num_access, 1>{}([&](auto access_id) { // make sure it's safe to read from LDS - if constexpr(access_id > 0) - block_sync_lds(); + block_sync_lds(); // each thread shuffle data from VGPR to LDS c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,