mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-15 18:42:06 +00:00
Add __builtin_amdgcn_sched_barrier(0) to instruct the compiler to better isolate code sections
This commit is contained in:
@@ -139,11 +139,15 @@ struct BlockGemmARegBSmemCRegV2Hack_0
|
||||
{nIter * NPerBlockPerIter, 0 * KPerBlockPerIter});
|
||||
b_warp_tensors[I0] = load_tile(b_warp_windows(nIter)(I0));
|
||||
|
||||
__builtin_amdgcn_sched_barrier(0);
|
||||
|
||||
b_warp_windows(nIter)(I1) = b_warp_window_tmp;
|
||||
move_tile_window(b_warp_windows(nIter)(I1),
|
||||
{nIter * NPerBlockPerIter, 1 * KPerBlockPerIter});
|
||||
b_warp_tensors[I1] = load_tile(b_warp_windows(nIter)(I1));
|
||||
|
||||
__builtin_amdgcn_sched_barrier(0);
|
||||
|
||||
static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
|
||||
// read A warp tensor from A block tensor
|
||||
AWarpTensor a_warp_tensor;
|
||||
@@ -174,6 +178,8 @@ struct BlockGemmARegBSmemCRegV2Hack_0
|
||||
load_tile(b_warp_windows(nIter)(number<kIter + 1>{}));
|
||||
};
|
||||
|
||||
__builtin_amdgcn_sched_barrier(0);
|
||||
|
||||
static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
|
||||
// read A warp tensor from A block tensor
|
||||
AWarpTensor a_warp_tensor;
|
||||
|
||||
Reference in New Issue
Block a user