diff --git a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
index 21897a2bcc..8f5d4eaa47 100644
--- a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
+++ b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
@@ -1,7 +1,3 @@
 add_example_executable(example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp)
 add_example_executable(example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp)
-
-#exclude GEMM+max exampe from testing, since there is random failure on gfx908
-#https://github.com/ROCmSoftwarePlatform/composable_kernel/issues/358
-#TODO: fix the failure and re-enable this test
-add_example_executable_no_testing(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp)
+add_example_executable(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp)
diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
index 870f4aece3..8119f7cb3b 100644
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
@@ -211,7 +211,7 @@ int main()
         r0_device_buf.FromDevice(r0_m.mData.data());
 
         pass = ck::utils::check_err(
-            e_m_n.mData, e_m_n_host.mData, "Error: Incorrect results c", 1e-2, 1e-2);
+            e_m_n.mData, e_m_n_host.mData, "Error: Incorrect results e", 1e-2, 1e-2);
         pass &= ck::utils::check_err(
             r0_m.mData, r0_m_host.mData, "Error: Incorrect results d0", 1e-2, 1e-2);
     }
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
index 744cf35dda..58cd1cce2f 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -776,8 +776,7 @@ struct GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 
             static_for<0, num_access, 1>{}([&](auto access_id) {
                 // make sure it's safe to read from LDS
-                if constexpr(access_id > 0)
-                    block_sync_lds();
+                block_sync_lds();
 
                 // each thread shuffle data from VGPR to LDS
                 c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,