diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
index c44edc59e9..e8c368888a 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
@@ -186,19 +186,17 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
         constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
         constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
         constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num * MWaves;
-
-        constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num;
+        constexpr auto mfma_interleave = MPerXDL == 32 ? 1 : 2;
         // B global
         static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
             ignore = i;
-            if constexpr(num_mfma > num_ds_read_inst_a + num_buffer_load_inst_a +
-                                        num_buffer_load_inst_b * 3 / 2)
+            if constexpr(MPerBlock >= 128 && NPerBlock >= 128)
             {
-                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x008, 2 * mfma_interleave, 0);
             }
             else
             {
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x008, mfma_interleave, 0);
             }
             __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
         });
@@ -213,10 +211,10 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
         });
 
         // A local
-        static_for<0, num_ds_read_inst_a / 2, 1>{}([&](auto i) {
+        static_for<0, num_ds_read_inst_a / 2 * mfma_interleave, 1>{}([&](auto i) {
             ignore = i;
             __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x100, 2 / mfma_interleave, 0); // DS read
         });
     }