diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp index c44edc59e9..e8c368888a 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp @@ -186,19 +186,17 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1{}([&](auto i) { ignore = i; - if constexpr(num_mfma > num_ds_read_inst_a + num_buffer_load_inst_a + - num_buffer_load_inst_b * 3 / 2) + if constexpr(MPerBlock >= 128 && NPerBlock >= 128) { - __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA + __builtin_amdgcn_sched_group_barrier(0x008, 2 * mfma_interleave, 0); } else { - __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + __builtin_amdgcn_sched_group_barrier(0x008, mfma_interleave, 0); } __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read }); @@ -213,10 +211,10 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1{}([&](auto i) { + static_for<0, num_ds_read_inst_a / 2 * mfma_interleave, 1>{}([&](auto i) { ignore = i; __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read + __builtin_amdgcn_sched_group_barrier(0x100, 2 / mfma_interleave, 0); // DS read }); }