WIP: fix assembly

This commit is contained in:
Enrico Degregori
2026-06-23 16:32:25 +00:00
parent 6670e53589
commit 9073fa6235
4 changed files with 11 additions and 7 deletions

View File

@@ -248,6 +248,14 @@ struct BlockGemmARegBRegCRegEightWavesV1
merge_sequences(c_iter_idx{}, c_warp_y_index_zeros),
merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
c_warp_tensor.get_thread_buffer());
__builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
if constexpr(nIter == 0 && mIter == MIterPerWarp - 1 && kIter == 0)
{
s_nop();
s_waitcnt_lgkm<4>();
__builtin_amdgcn_sched_group_barrier(0x004, 1, 0);
__builtin_amdgcn_sched_barrier(0);
}
});
});
}

View File

@@ -183,12 +183,6 @@ struct GemmPipelineAgBgCrCompAsyncEightWaves : public BaseGemmPipelineAgBgCrComp
// Hot loop scheduler
// ------------------
auto hot_loop_scheduler = [&]() {
__builtin_amdgcn_sched_group_barrier(0x008, MIterPerWarp, 0); // MFMA
s_waitcnt_lgkm<4>();
__builtin_amdgcn_sched_group_barrier(0x004, 1, 0); // lgkmcnt / SALU
static_for<0, MFMA_INST - MIterPerWarp, 1>{}([&](auto) {
__builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
});
__builtin_amdgcn_sched_barrier(0);
};

View File

@@ -383,7 +383,6 @@ struct GemmPipelineAgBgCrEightWavesImplBase : public GemmPipelineAgBgCrImplBase<
auto calc_gemm = [&](index_t i) {
__builtin_amdgcn_sched_barrier(0);
s_nop();
block_gemm(
c_block_tile, a_block_tile, b_block_tiles, aq_block_tile[i], bq_block_tile[i]);
scheduler_func();
@@ -392,6 +391,7 @@ struct GemmPipelineAgBgCrEightWavesImplBase : public GemmPipelineAgBgCrImplBase<
auto main_body = [&](auto tic, auto toc) {
__builtin_amdgcn_sched_barrier(0);
__builtin_amdgcn_s_setprio(1);
__builtin_amdgcn_sched_barrier(0);
calc_gemm(tic);

View File

@@ -255,6 +255,8 @@ struct ABQuantBlockUniversalGemmAsBsCrAsync : public BlockGemmQuantBase
"C block tensor data type!");
constexpr auto warp_size = get_warp_size();
s_nop();
auto q_block_tensor = aq_block_tensor;
if constexpr(Traits::NQPerBlock / NWarp == 1)
{