diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp index 9dda0a7636..fbbff21cf4 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp @@ -720,6 +720,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle static_for<0, acc_thread_buf.Size(), 1>{}( [&](auto i) { acc_element_op(acc_thread_buf(i), acc_thread_buf[i]); }); + block_sync_lds(); // wait for lds read in gemm0 blockwise gemm + // softmax SoftmaxBuf& max = blockwise_softmax.max_value_buf; SoftmaxBuf& sum = blockwise_softmax.sum_value_buf; diff --git a/profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp b/profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp index 48f722830c..b2457ec919 100644 --- a/profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp +++ b/profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp @@ -142,6 +142,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl; std::cout << "c_g_m_o: " << c_g_m_o_host_result.mDesc << std::endl; + std::srand(1); // work around test flakiness switch(init_method) { case 0: break;