fix the s_waitcnt_imm calculation

2026-06-08 15:30:23 +00:00 · 2025-07-16 05:39:50 +00:00
parent ec0a45b29f
commit c30f8b709b
3 changed files with 5 additions and 2 deletions
--- a/example/ck_tile/18_flatmm/run_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc
@@ -18,7 +18,7 @@ constexpr const char* DataTypeToString()
    {
        return "bf8";
    }
-    else if  constexpr(std::is_same_v<T, ck_tile::bf16_t>)
+    else if constexpr(std::is_same_v<T, ck_tile::bf16_t>)
    {
        return "bf16";
    }
--- a/example/ck_tile/36_copy/test_copy.hpp
+++ b/example/ck_tile/36_copy/test_copy.hpp
@@ -130,6 +130,7 @@ struct TileCopy
                             make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
                             {iM, 0},
                             MakeDRAMDistribution<Problem>());
+        // There is no prefetching here, so wait for the data to come back immediately.
        constexpr auto async_copy_fence_cnt = 0;

        // Output tensor
--- a/include/ck_tile/core/arch/arch.hpp
+++ b/include/ck_tile/core/arch/arch.hpp
@@ -116,8 +116,10 @@ CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0)
 template <index_t vmcnt>
 CK_TILE_DEVICE void block_sync_lds_direct_load()
 {
+    // vmcnt is split into imm[3:0] and imm[15:14], so only values in [0, 63] are encodable.
+    static_assert(0 <= vmcnt && vmcnt <= 63, "vmcnt must be in the range [0, 63]");
    // We don't sync the lds insts here.
-    constexpr auto s_waitcnt_imm = 3952 + ((vmcnt >> 4) << 14) + (vmcnt & 0xff);
+    constexpr auto s_waitcnt_imm = 3952 + (((vmcnt & 0xf0) << 10) | (vmcnt & 0xf));
    __builtin_amdgcn_s_waitcnt(s_waitcnt_imm);
    __builtin_amdgcn_s_barrier();
 }