Add __builtin_amdgcn_sched_barrier(0) to instruct the compiler to keep code sections better isolated

This commit is contained in:
Qianfeng Zhang
2025-05-18 16:19:30 +00:00
parent e4e70f8b0a
commit 4e65469fe8

View File

@@ -139,11 +139,15 @@ struct BlockGemmARegBSmemCRegV2Hack_0
{nIter * NPerBlockPerIter, 0 * KPerBlockPerIter});
b_warp_tensors[I0] = load_tile(b_warp_windows(nIter)(I0));
__builtin_amdgcn_sched_barrier(0);
b_warp_windows(nIter)(I1) = b_warp_window_tmp;
move_tile_window(b_warp_windows(nIter)(I1),
{nIter * NPerBlockPerIter, 1 * KPerBlockPerIter});
b_warp_tensors[I1] = load_tile(b_warp_windows(nIter)(I1));
__builtin_amdgcn_sched_barrier(0);
static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
// read A warp tensor from A block tensor
AWarpTensor a_warp_tensor;
@@ -174,6 +178,8 @@ struct BlockGemmARegBSmemCRegV2Hack_0
load_tile(b_warp_windows(nIter)(number<kIter + 1>{}));
};
__builtin_amdgcn_sched_barrier(0);
static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
// read A warp tensor from A block tensor
AWarpTensor a_warp_tensor;