diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
index 9dda0a7636..fbbff21cf4 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -720,6 +720,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
             static_for<0, acc_thread_buf.Size(), 1>{}(
                 [&](auto i) { acc_element_op(acc_thread_buf(i), acc_thread_buf[i]); });
 
+            block_sync_lds(); // wait for lds read in gemm0 blockwise gemm
+
             // softmax
             SoftmaxBuf& max = blockwise_softmax.max_value_buf;
             SoftmaxBuf& sum = blockwise_softmax.sum_value_buf;
diff --git a/profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp b/profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp
index 48f722830c..b2457ec919 100644
--- a/profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp
+++ b/profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp
@@ -142,6 +142,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
     std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl;
     std::cout << "c_g_m_o: " << c_g_m_o_host_result.mDesc << std::endl;
 
+    std::srand(1); // work around test flakiness
     switch(init_method)
     {
     case 0: break;