Add block_sync_lds_direct_load utility

2026-06-29 11:16:59 +00:00 · 2025-07-16 03:54:33 +00:00
parent eea58629cf
commit e5cc4af808
2 changed files with 7 additions and 7 deletions
--- a/example/ck_tile/36_copy/test_copy.hpp
+++ b/example/ck_tile/36_copy/test_copy.hpp
@@ -130,6 +130,7 @@ struct TileCopy
                             make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
                             {iM, 0},
                             MakeDRAMDistribution<Problem>());
+        constexpr auto async_copy_fence_cnt = 0;

        // Output tensor
        const auto y_m = make_naive_tensor_view<address_space_enum::global>(
@@ -152,9 +153,8 @@ struct TileCopy
                    async_load_tile(x_block_lds_write_window, x_block_window);

                    // Wait all asyncload insts complete.
-                    __builtin_amdgcn_s_waitcnt(3952);
                    // Wait all waves synced
-                    __builtin_amdgcn_s_barrier();
+                    block_sync_lds_direct_load<async_copy_fence_cnt>();

                    auto lds_tile = load_tile(x_block_lds_read_window);

--- a/include/ck_tile/core/arch/arch.hpp
+++ b/include/ck_tile/core/arch/arch.hpp
@@ -113,13 +113,13 @@ CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0)
 #endif
 }

+template <index_t vmcnt>
 CK_TILE_DEVICE void block_sync_lds_direct_load()
 {
-    asm volatile("\
-    s_waitcnt vmcnt(0) \n \
-    s_waitcnt lgkmcnt(0) \n \
-    s_barrier \
-    " ::);
+    // Issue s_waitcnt with the `vmcnt` template value folded into the immediate, then a block barrier. We don't sync the lds insts here.
+    constexpr auto s_waitcnt_imm = 3952 + ((vmcnt >> 4) << 14) + (vmcnt & 0xf); // 3952 == 0xF70 leaves the vmcnt bits clear; assuming the gfx9 s_waitcnt layout (vmcnt[3:0] -> imm[3:0], vmcnt[5:4] -> imm[15:14]), only the low nibble may be added here, a wider mask would spill into imm[6:4]
+    __builtin_amdgcn_s_waitcnt(s_waitcnt_imm);
+    __builtin_amdgcn_s_barrier();
 }

 CK_TILE_DEVICE void s_nop(index_t cnt = 0)