diff --git a/example/ck_tile/18_hstu_attention/block_gemm_areg_bsmem_creg_v2_hack_0.hpp b/example/ck_tile/18_hstu_attention/block_gemm_areg_bsmem_creg_v2_hack_0.hpp index 7a8df380a3..d33c1d2dc2 100644 --- a/example/ck_tile/18_hstu_attention/block_gemm_areg_bsmem_creg_v2_hack_0.hpp +++ b/example/ck_tile/18_hstu_attention/block_gemm_areg_bsmem_creg_v2_hack_0.hpp @@ -139,11 +139,15 @@ struct BlockGemmARegBSmemCRegV2Hack_0 {nIter * NPerBlockPerIter, 0 * KPerBlockPerIter}); b_warp_tensors[I0] = load_tile(b_warp_windows(nIter)(I0)); + __builtin_amdgcn_sched_barrier(0); + b_warp_windows(nIter)(I1) = b_warp_window_tmp; move_tile_window(b_warp_windows(nIter)(I1), {nIter * NPerBlockPerIter, 1 * KPerBlockPerIter}); b_warp_tensors[I1] = load_tile(b_warp_windows(nIter)(I1)); + __builtin_amdgcn_sched_barrier(0); + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { // read A warp tensor from A block tensor AWarpTensor a_warp_tensor; @@ -174,6 +178,8 @@ struct BlockGemmARegBSmemCRegV2Hack_0 load_tile(b_warp_windows(nIter)(number{})); }; + __builtin_amdgcn_sched_barrier(0); + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { // read A warp tensor from A block tensor AWarpTensor a_warp_tensor;