diff --git a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt index 21897a2bcc..8f5d4eaa47 100644 --- a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt +++ b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt @@ -1,7 +1,3 @@ add_example_executable(example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp) add_example_executable(example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp) - -#exclude GEMM+max exampe from testing, since there is random failure on gfx908 -#https://github.com/ROCmSoftwarePlatform/composable_kernel/issues/358 -#TODO: fix the failure and re-enable this test -add_example_executable_no_testing(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp) +add_example_executable(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp) diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp index 870f4aece3..8119f7cb3b 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp @@ -211,7 +211,7 @@ int main() r0_device_buf.FromDevice(r0_m.mData.data()); pass = ck::utils::check_err( - e_m_n.mData, e_m_n_host.mData, "Error: Incorrect results c", 1e-2, 1e-2); + e_m_n.mData, e_m_n_host.mData, "Error: Incorrect results e", 1e-2, 1e-2); pass &= ck::utils::check_err( r0_m.mData, r0_m_host.mData, "Error: Incorrect results d0", 1e-2, 1e-2); } diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp index 744cf35dda..58cd1cce2f 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp @@ -776,8 +776,7 @@ struct GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1 static_for<0, num_access, 1>{}([&](auto access_id) { // make sure it's safe to read from LDS - if constexpr(access_id > 0) - block_sync_lds(); + block_sync_lds(); // each thread shuffle data from VGPR to LDS c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,