diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
index 98b29ff82e..0344e68305 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
@@ -250,8 +250,10 @@ struct GridwiseSoftmax_mk_to_mk
             reducedTiles++;
         } while(reducedTiles < num_k_block_tile_iteration);
 
-        static_for<0, MThreadSliceSize, 1>{}(
-            [&](auto I) { BlockwiseMaxReduce::Reduce(reduce_work_buf, max_value_buf(I)); });
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+            BlockwiseMaxReduce::Reduce(reduce_work_buf, max_value_buf(I));
+            block_sync_lds();
+        });
 
         threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_bwd_step);
 
@@ -303,9 +305,10 @@ struct GridwiseSoftmax_mk_to_mk
             reducedTiles++;
         } while(reducedTiles < num_k_block_tile_iteration);
 
+        block_sync_lds(); // wait for reading being complete before writing to LDS
         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
             BlockwiseSumReduce::Reduce(reduce_work_buf, accu_value_buf(I));
-            // block_sync_lds();
+            block_sync_lds();
         });
 
         threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_fwd_step);