Mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-20 12:59:49 +00:00)
Move k_tile loading earlier in the loop
This commit is contained in:
@@ -337,6 +337,14 @@ struct HstuAttentionFwdPipelineQRKSVS
|
||||
tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, sacc_tile);
|
||||
}
|
||||
|
||||
__builtin_amdgcn_sched_barrier(0);
|
||||
|
||||
// for i_k1 = k1_loop-1, the loading is for next iteration
|
||||
k_tile = load_tile(k_dram_window);
|
||||
move_tile_window(k_dram_window, {kK1, 0});
|
||||
|
||||
__builtin_amdgcn_sched_barrier(0);
|
||||
|
||||
if constexpr(HstuMask::IsMasking)
|
||||
{
|
||||
if constexpr(HstuMask::kUseLocal)
|
||||
@@ -428,10 +436,6 @@ struct HstuAttentionFwdPipelineQRKSVS
|
||||
tile_elementwise_in(v_element_func, v_tile)); // store the prefetch
|
||||
};
|
||||
|
||||
// for i_k1 = k1_loop-1, the loading is for next iteration
|
||||
k_tile = load_tile(k_dram_window);
|
||||
move_tile_window(k_dram_window, {kK1, 0});
|
||||
|
||||
__builtin_amdgcn_sched_barrier(0);
|
||||
|
||||
tile_elementwise_inout(f_silu, pcomp_tile);
|
||||
|
||||
Reference in New Issue
Block a user