diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp index 98b29ff82e..0344e68305 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp @@ -250,8 +250,10 @@ struct GridwiseSoftmax_mk_to_mk reducedTiles++; } while(reducedTiles < num_k_block_tile_iteration); - static_for<0, MThreadSliceSize, 1>{}( - [&](auto I) { BlockwiseMaxReduce::Reduce(reduce_work_buf, max_value_buf(I)); }); + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + BlockwiseMaxReduce::Reduce(reduce_work_buf, max_value_buf(I)); + block_sync_lds(); + }); threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_bwd_step); @@ -303,9 +305,10 @@ struct GridwiseSoftmax_mk_to_mk reducedTiles++; } while(reducedTiles < num_k_block_tile_iteration); + block_sync_lds(); // wait for reading being complete before writing to LDS static_for<0, MThreadSliceSize, 1>{}([&](auto I) { BlockwiseSumReduce::Reduce(reduce_work_buf, accu_value_buf(I)); - // block_sync_lds(); + block_sync_lds(); }); threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_fwd_step);