From 41cb5058b76b470605335bdc63b4699cd2de97d6 Mon Sep 17 00:00:00 2001 From: Aaryaman Vasishta Date: Wed, 6 May 2026 02:46:41 +0900 Subject: [PATCH] Route gfx12 async tile loads through sync path --- .../ck_tile/core/arch/amd_buffer_addressing.hpp | 2 +- include/ck_tile/core/tensor/load_tile.hpp | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 6e2a453bf7..59879efe8c 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -1383,7 +1383,7 @@ CK_TILE_DEVICE void async_buffer_load_dwordxn_v(void* smem, index_t /*flag*/ = 0, bool_constant = {}) { -#if defined(__gfx103__) || defined(__gfx11__) || defined(__gfx12__) +#if defined(__gfx103__) || defined(__gfx11__) static_assert(num_dwords == 1 || num_dwords == 3 || num_dwords == 4, "wrong! not implemented data width"); diff --git a/include/ck_tile/core/tensor/load_tile.hpp b/include/ck_tile/core/tensor/load_tile.hpp index d1c06d4378..4cdad8bb9f 100644 --- a/include/ck_tile/core/tensor/load_tile.hpp +++ b/include/ck_tile/core/tensor/load_tile.hpp @@ -15,6 +15,7 @@ #include "ck_tile/core/tensor/tile_window_linear.hpp" #include "ck_tile/core/tensor/null_tile_window.hpp" #include "ck_tile/core/tensor/null_tensor.hpp" +#include "ck_tile/core/tensor/store_tile.hpp" namespace ck_tile { // Per-lane read-offset tweaks allow swizzling patterns not representable by tile_distribution. @@ -159,7 +160,17 @@ CK_TILE_DEVICE void async_load_tile_with_offset(LdsTileWindow_&& lds_tile, bool_constant occ = {}, bool_constant smy = {}) { +#if defined(__gfx12__) + // gfx12 does not support the direct global-to-LDS async buffer load used by + // tile_window::async_load*. Keep the architecture choice at the tile-load layer by using a + // regular buffer load followed by an explicit LDS store instead of hiding it in the primitive. + static_assert(!static_move_ys, + "gfx12 synchronous async_load_tile fallback does not support static_move_ys"); + auto tile = load_tile_with_offset(tile_window, offset, number{}, occ); + store_tile(lds_tile, tile); +#else tile_window.async_load_with_offset(offset, lds_tile, number{}, occ, smy); +#endif } template = {}, bool_constant = {}) { +#if defined(__gfx12__) + // See async_load_tile_with_offset: gfx12 uses regular buffer loads plus explicit LDS stores. + auto tile = load_tile(tile_window, number{}, bool_constant{}); + store_tile(lds_tile, tile); +#else tile_window.async_load_raw(lds_tile, number{}, bool_constant{}, bool_constant{}); +#endif } CK_TILE_DEVICE void async_load_fence(index_t cnt = 0)