mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-03 13:11:25 +00:00
Fix standalone softmax race condition around blockwise reduction (#323)
This commit is contained in:
@@ -250,8 +250,10 @@ struct GridwiseSoftmax_mk_to_mk
|
||||
reducedTiles++;
|
||||
} while(reducedTiles < num_k_block_tile_iteration);
|
||||
|
||||
static_for<0, MThreadSliceSize, 1>{}(
|
||||
[&](auto I) { BlockwiseMaxReduce::Reduce(reduce_work_buf, max_value_buf(I)); });
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
|
||||
BlockwiseMaxReduce::Reduce(reduce_work_buf, max_value_buf(I));
|
||||
block_sync_lds();
|
||||
});
|
||||
|
||||
threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_bwd_step);
|
||||
|
||||
@@ -303,9 +305,10 @@ struct GridwiseSoftmax_mk_to_mk
|
||||
reducedTiles++;
|
||||
} while(reducedTiles < num_k_block_tile_iteration);
|
||||
|
||||
block_sync_lds(); // wait for reads to complete before writing to LDS
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
|
||||
BlockwiseSumReduce::Reduce(reduce_work_buf, accu_value_buf(I));
|
||||
// block_sync_lds();
|
||||
block_sync_lds();
|
||||
});
|
||||
|
||||
threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_fwd_step);
|
||||
|
||||
Reference in New Issue
Block a user