From b77fdbf304f68571db65314719b7910443c54ed9 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Mon, 22 Dec 2025 15:34:10 +0000 Subject: [PATCH] Move the loading of k_tile for next iteration into the Gemm1 loop (non whole_k_prefetch path) --- ...mha_pipeline_qr_ks_vs_whole_k_prefetch.hpp | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch.hpp index a6324bf6ee..7ea53b9346 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_whole_k_prefetch.hpp @@ -677,17 +677,6 @@ struct BlockFmhaPipelineQRKSVSWholeKPrefetch __builtin_amdgcn_sched_barrier(0x00000001); - if constexpr(!kPreloadWholeNextIterationK) - { - if(seqlen_k_curr < seqlen_k_end) - { - k_tiles[I0] = load_tile(k_dram_window); - move_tile_window(k_dram_window, {kN0Sub, 0}); - }; - } - - __builtin_amdgcn_sched_barrier(0x00000001); - // STAGE 3, Gemm_1 ( O = P@V ) static_for<0, k1_loops, 1>{}([&](auto i_k1) { if constexpr(i_k1 < k1_loops - NumPrefetchV) @@ -696,6 +685,18 @@ struct BlockFmhaPipelineQRKSVSWholeKPrefetch move_tile_window(v_dram_window, {0, kK1}); }; + if constexpr(i_k1 == k1_loops - NumPrefetchV) + { + if constexpr(!kPreloadWholeNextIterationK) + { + if(seqlen_k_curr < seqlen_k_end) + { + k_tiles[I0] = load_tile(k_dram_window); + move_tile_window(k_dram_window, {kN0Sub, 0}); + }; + } + }; + block_sync_lds(); gemm_1( o_acc,