Remove additional barriers from double buffer implementation.

This commit is contained in:
Ville Pietilä
2026-02-16 09:53:43 -06:00
parent 5494425e7b
commit fb782ba133

View File

@@ -291,10 +291,7 @@ struct GridwiseGemmPipeline_v1<2, true, true>
index_t num_loop)
{
// We have two thread scratches indexed by I0, I1 for double buffering.
// Optimized with IGLP scheduling for better instruction-level parallelism
// Scheduling barrier to mark pipeline start
__builtin_amdgcn_sched_barrier(0);
// Optimized with IGLP scheduling for better instruction-level parallelism.
// Prologue - load data into buffer 0
// Read from global mem to registers (I0 scratch)
@@ -328,9 +325,7 @@ struct GridwiseGemmPipeline_v1<2, true, true>
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
// Sync LDS to ensure buffer 0 is ready
__builtin_amdgcn_sched_barrier(0);
block_sync_lds();
__builtin_amdgcn_sched_barrier(0);
// Run GEMM on buffer 0 while buffer 1 is loading
blockwise_gemm.Run(a_block_buf_0, b_block_buf_0, c_thread_buf);
@@ -348,9 +343,7 @@ struct GridwiseGemmPipeline_v1<2, true, true>
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
// Sync LDS to ensure buffer 1 is ready
__builtin_amdgcn_sched_barrier(0);
block_sync_lds();
__builtin_amdgcn_sched_barrier(0);
// Run GEMM on buffer 1 while buffer 0 is loading
blockwise_gemm.Run(a_block_buf_1, b_block_buf_1, c_thread_buf);
@@ -372,9 +365,7 @@ struct GridwiseGemmPipeline_v1<2, true, true>
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I1);
// Sync LDS to ensure buffer 0 is ready
__builtin_amdgcn_sched_barrier(0);
block_sync_lds();
__builtin_amdgcn_sched_barrier(0);
// Run GEMM on buffer 0
blockwise_gemm.Run(a_block_buf_0, b_block_buf_0, c_thread_buf);
@@ -384,9 +375,7 @@ struct GridwiseGemmPipeline_v1<2, true, true>
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf_1, I1);
// Sync LDS to ensure buffer 1 is ready
__builtin_amdgcn_sched_barrier(0);
block_sync_lds();
__builtin_amdgcn_sched_barrier(0);
// Run GEMM on buffer 1
blockwise_gemm.Run(a_block_buf_1, b_block_buf_1, c_thread_buf);
@@ -395,16 +384,12 @@ struct GridwiseGemmPipeline_v1<2, true, true>
{
// Odd number of loops: need to process 1 more iteration
// Sync LDS to ensure buffer 0 is ready
__builtin_amdgcn_sched_barrier(0);
block_sync_lds();
__builtin_amdgcn_sched_barrier(0);
// Run GEMM on buffer 0
blockwise_gemm.Run(a_block_buf_0, b_block_buf_0, c_thread_buf);
}
// Final barrier to complete pipeline
__builtin_amdgcn_sched_barrier(0);
}
};