Move the load of the next iteration's k_tile into the Gemm1 loop (non-whole_k_prefetch path of the trload pipeline)

This commit is contained in:
Qianfeng Zhang
2025-12-23 07:07:15 +00:00
parent e7e6ebc91c
commit 6c91b0c407

View File

@@ -678,17 +678,6 @@ struct BlockFmhaPipelineQRKSVSWholeKPrefetchTrLoad
__builtin_amdgcn_sched_barrier(0x00000001);
if constexpr(!kPreloadWholeNextIterationK)
{
if(seqlen_k_curr < seqlen_k_end)
{
k_tiles[I0] = load_tile(k_dram_window);
move_tile_window(k_dram_window, {kN0Sub, 0});
};
}
__builtin_amdgcn_sched_barrier(0x00000001);
// STAGE 3, Gemm_1 ( O = P@V )
static_for<0, k1_loops, 1>{}([&](auto i_k1) {
if constexpr(i_k1 < k1_loops - NumPrefetchV)
@@ -697,6 +686,18 @@ struct BlockFmhaPipelineQRKSVSWholeKPrefetchTrLoad
move_tile_window(v_dram_window, {kK1, 0});
};
if constexpr(i_k1 == k1_loops - NumPrefetchV)
{
if constexpr(!kPreloadWholeNextIterationK)
{
if(seqlen_k_curr < seqlen_k_end)
{
k_tiles[I0] = load_tile(k_dram_window);
move_tile_window(k_dram_window, {kN0Sub, 0});
};
}
};
block_sync_lds();
gemm_1(
o_acc,