From db5c12db899afc7cb7e973da65e3431a5566707c Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Sun, 21 Dec 2025 15:13:14 +0000 Subject: [PATCH] Update to the non-whole-k-prefetch path in the whole_k_prefetch pipeline --- .../block_fmha_pipeline_qr_ks_vs_whole_k_prefetch.hpp | 8 +++++--- ...ock_fmha_pipeline_qr_ks_vs_whole_k_prefetch_trload.hpp | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch.hpp index ba826196be..bcc85876a9 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch.hpp @@ -505,10 +505,12 @@ struct BlockFmhaPipelineQRKSVSWholeKPrefetch move_tile_window(k_dram_window, {kN0Sub, 0}); }; - if constexpr(i_n0 < NumPrefetchV) + if constexpr(i_n0 == n0_loops - 1) { - v_tiles[i_n0] = load_tile(v_dram_window); - move_tile_window(v_dram_window, {0, kK1}); + static_for<0, NumPrefetchV, 1>{}([&](auto i_k1) { + v_tiles[i_k1] = load_tile(v_dram_window); + move_tile_window(v_dram_window, {0, kK1}); + }); }; __builtin_amdgcn_sched_barrier(0x00000001); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch_trload.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch_trload.hpp index 65c1af72de..df3f567a7c 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch_trload.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch_trload.hpp @@ -509,10 +509,12 @@ struct BlockFmhaPipelineQRKSVSWholeKPrefetchTrLoad move_tile_window(k_dram_window, {kN0Sub, 0}); }; - if constexpr(i_n0 < NumPrefetchV) + if constexpr(i_n0 == n0_loops - 1) { - v_tiles[i_n0] = load_tile(v_dram_window); - move_tile_window(v_dram_window, {kK1, 0}); + 
static_for<0, NumPrefetchV, 1>{}([&](auto i_k1) { + v_tiles[i_k1] = load_tile(v_dram_window); + move_tile_window(v_dram_window, {kK1, 0}); + }); }; __builtin_amdgcn_sched_barrier(0x00000001);