diff --git a/example/ck_tile/18_flatmm/run_flatmm_example.inc b/example/ck_tile/18_flatmm/run_flatmm_example.inc index b583612cfb..8f39b07be5 100644 --- a/example/ck_tile/18_flatmm/run_flatmm_example.inc +++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc @@ -18,7 +18,7 @@ constexpr const char* DataTypeToString() { return "bf8"; } - else if constexpr(std::is_same_v) + else if constexpr(std::is_same_v) { return "bf16"; } diff --git a/example/ck_tile/36_copy/test_copy.hpp b/example/ck_tile/36_copy/test_copy.hpp index 3c1c9bb9ed..7285b4a29f 100644 --- a/example/ck_tile/36_copy/test_copy.hpp +++ b/example/ck_tile/36_copy/test_copy.hpp @@ -130,6 +130,7 @@ struct TileCopy make_tuple(number{}, number{}), {iM, 0}, MakeDRAMDistribution()); + // There is no prefetch here, so wait for the data to come back immediately. constexpr auto async_copy_fence_cnt = 0; // Output tensor diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp index 8c0b9d56d7..33572b38c5 100644 --- a/include/ck_tile/core/arch/arch.hpp +++ b/include/ck_tile/core/arch/arch.hpp @@ -116,8 +116,10 @@ CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0) template CK_TILE_DEVICE void block_sync_lds_direct_load() { + // At most 63 instructions back can be tracked (vmcnt is encoded in 6 bits). + static_assert(vmcnt <= 63); // We don't sync the lds insts here. - constexpr auto s_waitcnt_imm = 3952 + ((vmcnt >> 4) << 14) + (vmcnt & 0xff); + constexpr auto s_waitcnt_imm = 3952 + (((vmcnt & 0xf0) << 10) | (vmcnt & 0xf)); __builtin_amdgcn_s_waitcnt(s_waitcnt_imm); __builtin_amdgcn_s_barrier(); }