From d6e2dd92fea1e86ff0fd415cb7538a34fac53632 Mon Sep 17 00:00:00 2001
From: aska-0096 <haocwang@amd.com>
Date: Wed, 23 Apr 2025 15:53:15 +0000
Subject: [PATCH 1/8] Enable the new f8 MFMA for preshuffle GEMM. Found some
 vectorizer issues even with the SLP flag

---
 .../65_gemm_multiply_multiply/CMakeLists.txt  |   5 +
 ..._multiply_multiply_xdl_fp8_bpreshuffle.cpp |   4 +-
 ...e_gemm_pipeline_xdlops_b_preshuffle_v1.hpp |  50 ++---
 ...e_gemm_pipeline_xdlops_b_preshuffle_v2.hpp |  64 ++++---
 ...e_gemm_pipeline_xdlops_b_preshuffle_v3.hpp | 175 +++++++++++-------
 .../blockwise_gemm_pipeline_xdlops_base.hpp   |   5 +
 ...kwise_gemm_pipeline_xdlops_v1_ab_scale.hpp |   2 +-
 .../blockwise_gemm_pipeline_xdlops_v3.hpp     |   2 +-
 ...kwise_gemm_pipeline_xdlops_v3_ab_scale.hpp |   2 +-
 ...ckwise_gemm_pipeline_xdlops_v3_b_scale.hpp |   2 +-
 .../blockwise_gemm_pipeline_xdlops_v5.hpp     |   2 +-
 ...m_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp |  11 +-
 include/ck/utility/blkgemmpipe_scheduler.hpp  |  14 +-
 13 files changed, 212 insertions(+), 126 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt
index deca85ae64..1f407244fc 100644
--- a/example/65_gemm_multiply_multiply/CMakeLists.txt
+++ b/example/65_gemm_multiply_multiply/CMakeLists.txt
@@ -16,3 +16,8 @@ foreach(gpu IN LISTS GPU_TARGETS)
         set(target 1)
     endif()
 endforeach()
+
+set(GEMM_OPTIONS)
+list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-16")
+list(APPEND GEMM_OPTIONS -v --save-temps -Wno-gnu-line-marker)
+target_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS})
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
index e4e6a4f1a7..21fa0e8453 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
@@ -140,10 +140,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu
     // clang-format off
     <   Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
         AElementOp,  BElementOp, CDEElementOp, GemmSpec, 256,
-        128,   128,    128,
+        32,   128,    128,
         16,   16,
         32,   32,
-        2,    2,
+        1,    1,
         S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
         S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
         1,    1,   S<1, 32, 1, 8>, S<8, 8, 1>,
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
index d751543175..a003befc3c 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
@@ -122,6 +122,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
     using Base::B_K1;
     using Base::I0;
     using Base::I1;
+    using Base::KGroup;
     using Base::KRepeat;
     using Base::xdlops_gemm;
     using typename Base::HotLoopInstList;
@@ -153,9 +154,9 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
         constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
-        constexpr index_t K2 = KPack;
+        constexpr index_t K2 = KPack / KGroup;
         constexpr index_t K1 = 64 / NPerXDL;
-        constexpr index_t K0 = KRepeat;
+        constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
             TileDesc_M0_M1_M2_K{},
@@ -280,12 +281,14 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
         block_sync_lds();
         static_for<0, MRepeat, 1>{}([&](auto m0) {
             static_for<0, KRepeat, 1>{}([&](auto k0) {
-                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                   make_tuple(m0, I0, I0, k0, I0, I0),
-                                   a_block_buf,
-                                   a_thread_desc_,
-                                   make_tuple(m0, I0, I0, k0, I0, I0),
-                                   a_thread_buf);
+                static_for<0, KGroup, 1>{}([&](auto kg0) { // kg0: K-group index
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                       make_tuple(m0, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                       a_thread_buf);
+                });
             });
         });
 
@@ -348,12 +351,15 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
 
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         static_for<0, KRepeat, 1>{}([&](auto k0) {
-                            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                               make_tuple(m0, I0, I0, k0, I0, I0),
-                                               a_block_buf,
-                                               a_thread_desc_,
-                                               make_tuple(m0, I0, I0, k0, I0, I0),
-                                               a_thread_buf);
+                            static_for<0, KGroup, 1>{}([&](auto kg0) { // kg0: K-group index
+                                a_thread_copy_.Run(
+                                    a_block_desc_m0_m1_m2_k0_k1_k2,
+                                    make_tuple(m0, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                    a_block_buf,
+                                    a_thread_desc_,
+                                    make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                    a_thread_buf);
+                            });
                         });
                     });
 
@@ -411,12 +417,14 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
 
             static_for<0, MRepeat, 1>{}([&](auto m0) {
                 static_for<0, KRepeat, 1>{}([&](auto k0) {
-                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                       make_tuple(m0, I0, I0, k0, I0, I0),
-                                       a_block_buf,
-                                       a_thread_desc_,
-                                       make_tuple(m0, I0, I0, k0, I0, I0),
-                                       a_thread_buf);
+                    static_for<0, KGroup, 1>{}([&](auto kg0) { // kg0: K-group index
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                           make_tuple(m0, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                           a_block_buf,
+                                           a_thread_desc_,
+                                           make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                           a_thread_buf);
+                    });
                 });
             });
 
@@ -495,7 +503,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
                                                          ComputeDataType,
                                                          decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
                                                          decltype(a_thread_desc_),
-                                                         Sequence<1, 1, 1, 1, 1, KPack>,
+                                                         Sequence<1, 1, 1, 1, 1, KPack / KGroup>,
                                                          Sequence<0, 1, 2, 3, 4, 5>,
                                                          5,
                                                          A_K1,
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp
index 4c019a41a4..2d4ae048ac 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp
@@ -122,6 +122,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2<BlockGemmPipelineScheduler::I
     using Base::B_K1;
     using Base::I0;
     using Base::I1;
+    using Base::KGroup;
     using Base::KRepeat;
     using Base::xdlops_gemm;
     using typename Base::HotLoopInstList;
@@ -152,9 +153,9 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2<BlockGemmPipelineScheduler::I
         constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
-        constexpr index_t K2 = KPack;
+        constexpr index_t K2 = KPack / KGroup;
         constexpr index_t K1 = 64 / NPerXDL;
-        constexpr index_t K0 = KRepeat;
+        constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
             TileDesc_M0_M1_M2_K{},
@@ -281,12 +282,14 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2<BlockGemmPipelineScheduler::I
         block_sync_lds();
         static_for<0, MRepeat, 1>{}([&](auto m0) {
             static_for<0, KRepeat, 1>{}([&](auto k0) {
-                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                   make_tuple(m0, I0, I0, k0, I0, I0),
-                                   a_block_buf.At(I0),
-                                   a_thread_desc_,
-                                   make_tuple(m0, I0, I0, k0, I0, I0),
-                                   a_thread_bufs(I0));
+                static_for<0, KGroup, 1>{}([&](auto kg0) { // kg0: K-group index
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                       make_tuple(m0, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                       a_block_buf.At(I0),
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                       a_thread_bufs(I0));
+                });
             });
         });
 
@@ -320,12 +323,15 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2<BlockGemmPipelineScheduler::I
 
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         static_for<0, KRepeat, 1>{}([&](auto k0) {
-                            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                               make_tuple(m0, I0, I0, k0, I0, I0),
-                                               a_block_buf.At(local_read_buf),
-                                               a_thread_desc_,
-                                               make_tuple(m0, I0, I0, k0, I0, I0),
-                                               a_thread_bufs(local_read_buf));
+                            static_for<0, KGroup, 1>{}([&](auto kg0) { // kg0: K-group index
+                                a_thread_copy_.Run(
+                                    a_block_desc_m0_m1_m2_k0_k1_k2,
+                                    make_tuple(m0, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                    a_block_buf.At(local_read_buf),
+                                    a_thread_desc_,
+                                    make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                    a_thread_bufs(local_read_buf));
+                            });
                         });
                     });
 
@@ -391,12 +397,14 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2<BlockGemmPipelineScheduler::I
 
             static_for<0, MRepeat, 1>{}([&](auto m0) {
                 static_for<0, KRepeat, 1>{}([&](auto k0) {
-                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                       make_tuple(m0, I0, I0, k0, I0, I0),
-                                       a_block_buf.At(local_read_reg),
-                                       a_thread_desc_,
-                                       make_tuple(m0, I0, I0, k0, I0, I0),
-                                       a_thread_bufs(local_read_reg));
+                    static_for<0, KGroup, 1>{}([&](auto kg0) { // kg0: K-group index
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                           make_tuple(m0, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                           a_block_buf.At(local_read_reg),
+                                           a_thread_desc_,
+                                           make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                           a_thread_bufs(local_read_reg));
+                    });
                 });
             });
 
@@ -445,12 +453,14 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2<BlockGemmPipelineScheduler::I
 
             static_for<0, MRepeat, 1>{}([&](auto m0) {
                 static_for<0, KRepeat, 1>{}([&](auto k0) {
-                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                       make_tuple(m0, I0, I0, k0, I0, I0),
-                                       a_block_buf.At(local_read_reg),
-                                       a_thread_desc_,
-                                       make_tuple(m0, I0, I0, k0, I0, I0),
-                                       a_thread_bufs(local_read_reg));
+                    static_for<0, KGroup, 1>{}([&](auto kg0) { // kg0: K-group index
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                           make_tuple(m0, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                           a_block_buf.At(local_read_reg),
+                                           a_thread_desc_,
+                                           make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                           a_thread_bufs(local_read_reg));
+                    });
                 });
             });
 
@@ -539,7 +549,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2<BlockGemmPipelineScheduler::I
                                                          ComputeDataType,
                                                          decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
                                                          decltype(a_thread_desc_),
-                                                         Sequence<1, 1, 1, 1, 1, KPack>,
+                                                         Sequence<1, 1, 1, 1, 1, KPack / KGroup>,
                                                          Sequence<0, 1, 2, 3, 4, 5>,
                                                          5,
                                                          A_K1,
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
index 6d115e7620..da70929c46 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
@@ -123,6 +123,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
     using Base::I0;
     using Base::I1;
     using Base::I2;
+    using Base::KGroup;
     using Base::KRepeat;
     using Base::xdlops_gemm;
     using typename Base::HotLoopInstList;
@@ -156,9 +157,9 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
         constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
-        constexpr index_t K2 = KPack;
+        constexpr index_t K2 = KPack / KGroup;
         constexpr index_t K1 = 64 / NPerXDL;
-        constexpr index_t K0 = KRepeat;
+        constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
             TileDesc_M0_M1_M2_K{},
@@ -538,12 +539,14 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
         // Local prefetch A1
         block_sync_lds();
         static_for<0, KRepeat, 1>{}([&](auto k0) {
-            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                               make_tuple(I0, I0, I0, k0, I0, I0),
-                               a_block_buf.At(I0),
-                               a_thread_desc_,
-                               make_tuple(I0, I0, I0, k0, I0, I0),
-                               a_thread_buf);
+            static_for<0, KGroup, 1>{}([&](auto kg0) { // kg0: K-group index
+                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                   make_tuple(I0, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                   a_block_buf.At(I0),
+                                   a_thread_desc_,
+                                   make_tuple(I0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                   a_thread_buf);
+            });
         });
 
         // Initialize C
@@ -618,39 +621,53 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                             block_sync_lds();
 
                             static_for<0, KRepeat, 1>{}([&](auto k0) {
-                                a_thread_copy_.Run(
-                                    a_block_desc_m0_m1_m2_k0_k1_k2,
-                                    make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
-                                    a_block_buf.At(local_read_buf),
-                                    a_thread_desc_,
-                                    make_tuple(
-                                        Number<(m0 + 1 + HotloopLocalBufSwitch * mfma_reg_buf) %
-                                               2>{},
-                                        I0,
-                                        I0,
-                                        k0,
-                                        I0,
-                                        I0),
-                                    a_thread_buf);
+                                static_for<0, KGroup, 1>{}([&](auto kg0) { // kg0: K-group index
+                                    a_thread_copy_.Run(
+                                        a_block_desc_m0_m1_m2_k0_k1_k2,
+                                        make_tuple(Number<(m0 + 1) % MRepeat>{},
+                                                   I0,
+                                                   I0,
+                                                   Number<k0 * KGroup + kg0>{},
+                                                   I0,
+                                                   I0),
+                                        a_block_buf.At(local_read_buf),
+                                        a_thread_desc_,
+                                        make_tuple(
+                                            Number<(m0 + 1 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                                   2>{},
+                                            I0,
+                                            I0,
+                                            k0,
+                                            I0,
+                                            Number<kg0 * A_K1>{}),
+                                        a_thread_buf);
+                                });
                             });
                         }
                         else
                         {
                             static_for<0, KRepeat, 1>{}([&](auto k0) {
-                                a_thread_copy_.Run(
-                                    a_block_desc_m0_m1_m2_k0_k1_k2,
-                                    make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
-                                    a_block_buf.At(mfma_reg_buf),
-                                    a_thread_desc_,
-                                    make_tuple(
-                                        Number<(m0 + 1 + HotloopLocalBufSwitch * mfma_reg_buf) %
-                                               2>{},
-                                        I0,
-                                        I0,
-                                        k0,
-                                        I0,
-                                        I0),
-                                    a_thread_buf);
+                                static_for<0, KGroup, 1>{}([&](auto kg0) { // kg0: K-group index
+                                    a_thread_copy_.Run(
+                                        a_block_desc_m0_m1_m2_k0_k1_k2,
+                                        make_tuple(Number<(m0 + 1) % MRepeat>{},
+                                                   I0,
+                                                   I0,
+                                                   Number<k0 * KGroup + kg0>{},
+                                                   I0,
+                                                   I0),
+                                        a_block_buf.At(mfma_reg_buf),
+                                        a_thread_desc_,
+                                        make_tuple(
+                                            Number<(m0 + 1 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                                   2>{},
+                                            I0,
+                                            I0,
+                                            k0,
+                                            I0,
+                                            Number<kg0 * A_K1>{}),
+                                        a_thread_buf);
+                                });
                             });
                         }
 
@@ -712,25 +729,41 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                     block_sync_lds();
 
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
-                        a_thread_copy_.Run(
-                            a_block_desc_m0_m1_m2_k0_k1_k2,
-                            make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
-                            a_block_buf.At(I1),
-                            a_thread_desc_,
-                            make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0),
-                            a_thread_buf);
+                        static_for<0, KGroup, 1>{}([&](auto kg0) { // kg0: K-group index
+                            a_thread_copy_.Run(
+                                a_block_desc_m0_m1_m2_k0_k1_k2,
+                                make_tuple(Number<(m0 + 1) % MRepeat>{},
+                                           I0,
+                                           I0,
+                                           Number<k0 * KGroup + kg0>{},
+                                           I0,
+                                           I0),
+                                a_block_buf.At(I1),
+                                a_thread_desc_,
+                                make_tuple(
+                                    Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                a_thread_buf);
+                        });
                     });
                 }
                 else
                 {
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
-                        a_thread_copy_.Run(
-                            a_block_desc_m0_m1_m2_k0_k1_k2,
-                            make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
-                            a_block_buf.At(I0),
-                            a_thread_desc_,
-                            make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0),
-                            a_thread_buf);
+                        static_for<0, KGroup, 1>{}([&](auto kg0) { // kg0: K-group index
+                            a_thread_copy_.Run(
+                                a_block_desc_m0_m1_m2_k0_k1_k2,
+                                make_tuple(Number<(m0 + 1) % MRepeat>{},
+                                           I0,
+                                           I0,
+                                           Number<k0 * KGroup + kg0>{},
+                                           I0,
+                                           I0),
+                                a_block_buf.At(I0),
+                                a_thread_desc_,
+                                make_tuple(
+                                    Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                a_thread_buf);
+                        });
                     });
                 }
 
@@ -767,14 +800,21 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                 if constexpr(m0.value != (MRepeat - 1))
                 {
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
-                        a_thread_copy_.Run(
-                            a_block_desc_m0_m1_m2_k0_k1_k2,
-                            make_tuple(Number<m0 + 1>{}, I0, I0, k0, I0, I0),
-                            a_block_buf.At(I1),
-                            a_thread_desc_,
-                            make_tuple(
-                                Number<(m0 + 1 + HotloopLocalBufSwitch) % 2>{}, I0, I0, k0, I0, I0),
-                            a_thread_buf);
+                        static_for<0, KGroup, 1>{}([&](auto kg0) { // kg0: K-group index
+                            a_thread_copy_.Run(
+                                a_block_desc_m0_m1_m2_k0_k1_k2,
+                                make_tuple(
+                                    Number<m0 + 1>{}, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                a_block_buf.At(I1),
+                                a_thread_desc_,
+                                make_tuple(Number<(m0 + 1 + HotloopLocalBufSwitch) % 2>{},
+                                           I0,
+                                           I0,
+                                           k0,
+                                           I0,
+                                           Number<kg0 * A_K1>{}),
+                                a_thread_buf);
+                        });
                     });
 
                     EpilogueScheduler_2();
@@ -816,12 +856,17 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                 if constexpr(m0.value != (MRepeat - 1))
                 {
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
-                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                           make_tuple(Number<m0 + 1>{}, I0, I0, k0, I0, I0),
-                                           a_block_buf.At(I0),
-                                           a_thread_desc_,
-                                           make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0),
-                                           a_thread_buf);
+                        static_for<0, KGroup, 1>{}([&](auto kg0) { // kg0: K-group index
+                            a_thread_copy_.Run(
+                                a_block_desc_m0_m1_m2_k0_k1_k2,
+                                make_tuple(
+                                    Number<m0 + 1>{}, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                a_block_buf.At(I0),
+                                a_thread_desc_,
+                                make_tuple(
+                                    Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                a_thread_buf);
+                        });
                     });
 
                     EpilogueScheduler_2();
@@ -841,7 +886,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                                          ComputeDataType,
                                                          decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
                                                          decltype(a_thread_desc_),
-                                                         Sequence<1, 1, 1, 1, 1, KPack>,
+                                                         Sequence<1, 1, 1, 1, 1, KPack / KGroup>,
                                                          Sequence<0, 1, 2, 3, 4, 5>,
                                                          5,
                                                          A_K1,
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
index d7ba2559ea..f55a3b9e4b 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
@@ -57,6 +57,11 @@ struct BlockwiseGemmXdlops_pipeline_base
     static constexpr index_t KPerThread    = KPerBlock / xdlops_gemm.K0PerXdlops;
     static constexpr index_t KRepeat       = KPerThread / KPack;
     static constexpr index_t KPerInnerLoop = KPack;
+    static constexpr index_t KGroup =
+        ((MPerXDL == 16 && NPerXDL == 16 && xdlops_gemm.KPerXdlops == 128) ||
+         (MPerXDL == 32 && NPerXDL == 32 && xdlops_gemm.KPerXdlops == 64))
+            ? 2 // KPack is split into two K groups for these xdlops shapes
+            : 1;
 
     static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
     static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp
index 8375e81fa0..ea4f5e4a28 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp
@@ -205,7 +205,7 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale<BlockGemmPipelineScheduler::Intr
 
         constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num;
 
-        constexpr auto mfma_cycle = NPerXDL == 16 ? 16 : 32;
+        constexpr auto mfma_cycle = HotLoopInstList::C_MFMA_Inst_Cycle;
         constexpr auto ds_read_a_issue_cycle =
             HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
         constexpr auto ds_read_b_issue_cycle =
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp
index fe66d320e0..b5d6180ab3 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp
@@ -177,8 +177,8 @@ struct BlockwiseGemmXdlops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
         constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
 
         constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num;
+        constexpr auto mfma_cycle    = HotLoopInstList::C_MFMA_Inst_Cycle;
 
-        constexpr auto mfma_cycle = NPerXDL == 16 ? 32 : 64;
         constexpr auto ds_read_a_issue_cycle =
             HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
         constexpr auto ds_read_b_issue_cycle =
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp
index fc0075b196..a4038e9543 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp
@@ -179,7 +179,7 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale<BlockGemmPipelineScheduler::Intr
 
         constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num;
 
-        constexpr auto mfma_cycle = NPerXDL == 16 ? 16 : 32;
+        constexpr auto mfma_cycle = HotLoopInstList::C_MFMA_Inst_Cycle;
         constexpr auto ds_read_a_issue_cycle =
             HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
         constexpr auto ds_read_b_issue_cycle =
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp
index d1be88dd63..3179a90b7f 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_b_scale.hpp
@@ -178,7 +178,7 @@ struct BlockwiseGemmXdlops_pipeline_v3_b_scale<BlockGemmPipelineScheduler::Intra
 
         constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num;
 
-        constexpr auto mfma_cycle = NPerXDL == 16 ? 16 : 32;
+        constexpr auto mfma_cycle = HotLoopInstList::C_MFMA_Inst_Cycle;
         constexpr auto ds_read_a_issue_cycle =
             HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
         constexpr auto ds_read_b_issue_cycle =
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v5.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v5.hpp
index b6a4f05502..99934fa74e 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v5.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v5.hpp
@@ -188,7 +188,7 @@ struct BlockwiseGemmXdlops_pipeline_v5<BlockGemmPipelineScheduler::Intrawave,
 
         constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num;
 
-        constexpr auto mfma_cycle = NPerXDL == 16 ? 16 : 32;
+        constexpr auto mfma_cycle = HotLoopInstList::C_MFMA_Inst_Cycle;
         constexpr auto ds_read_a_issue_cycle =
             HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
         constexpr auto ds_read_b_issue_cycle =
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
index 238ab14606..0ce0a29f36 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
@@ -167,9 +167,10 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
     using mfma_selector = MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeB>;
     static constexpr index_t KPack =
         math::max(math::lcm(AK1Number, BK1Number), mfma_selector::selected_mfma.k_per_blk);
+    static constexpr index_t KGroup = mfma_selector::selected_mfma.k_per_blk == 32 ? 2 : 1;
     static constexpr index_t KLane =
         mfma_selector::GetKPerXdlops() / mfma_selector::GetK1PerXdlops();
-    static constexpr index_t KRepeat = KPerBlock / KLane / KPack;
+    static constexpr index_t KRepeat = KPerBlock / KLane / (KPack / KGroup);
     static constexpr index_t NLane   = NPerXdl;
     static constexpr index_t NWave   = NPerBlock / NPerXdl / NXdlPerWave;
 
@@ -209,7 +210,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
     }
     __host__ __device__ static auto CalculateBK0Shuffled(index_t K)
     {
-        return math::integer_divide_ceil(K, KLane * KPack);
+        return math::integer_divide_ceil(K, KLane * KPack / KGroup);
     }
 
     __host__ __device__ static auto CalculateKPadded(index_t K)
@@ -351,7 +352,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
 
     __host__ __device__ static auto MakeBGridDescriptor_Preshuffled(index_t N0, index_t K0)
     {
-        constexpr index_t NkSwizzleNumber = Number<warpSize * KPack>{};
+        constexpr index_t NkSwizzleNumber = Number<warpSize * KPack / KGroup>{};
         return make_naive_tensor_descriptor(
             make_tuple(N0 / NWave, NWave, K0, NkSwizzleNumber),
             make_tuple(NWave * K0 * NkSwizzleNumber, K0 * NkSwizzleNumber, NkSwizzleNumber, I1));
@@ -1228,7 +1229,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
                   make_multi_index(n_block_data_idx_on_grid,
                                    get_warp_local_1d_id() % NWave,
                                    0,
-                                   KPack * (get_thread_local_1d_id() % warpSize)));
+                                   KPack / KGroup * (get_thread_local_1d_id() % warpSize)));
 
         // LDS allocation for A and B: be careful of alignment
         // Cast after lds
@@ -1668,7 +1669,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
                   make_multi_index(n_block_data_idx_on_grid,
                                    get_warp_local_1d_id() % NWave,
                                    0,
-                                   KPack * (get_thread_local_1d_id() % warpSize)));
+                                   KPack / KGroup * (get_thread_local_1d_id() % warpSize)));
 
         // LDS allocation for A and B: be careful of alignment
         // Cast after lds
diff --git a/include/ck/utility/blkgemmpipe_scheduler.hpp b/include/ck/utility/blkgemmpipe_scheduler.hpp
index 39407cb8f6..34c874353d 100644
--- a/include/ck/utility/blkgemmpipe_scheduler.hpp
+++ b/include/ck/utility/blkgemmpipe_scheduler.hpp
@@ -90,6 +90,17 @@ struct BlockwiseGemmXdlops_pipeline_hotloop_inst
     static constexpr index_t C_MFMA_Inst_Num =
         MPerBlock * NPerBlock * KPerBlock / (BlockSize / WaveSize) / (MPerXDL * NPerXDL * KPerXDL);
 
+    static constexpr index_t C_MFMA_Inst_Cycle = []() {
+        if constexpr(NPerXDL == 16)
+        {
+            return KPerXDL == 128 ? 32 : 16;
+        }
+        else // NPerXDL == 32 and any other shape (keeps the old fallback)
+        {
+            return KPerXDL == 64 ? 64 : 32;
+        }
+    }();
+
     static constexpr auto Print()
     {
         printf(" Blk/Wave Size: %d, %d, M/N/K PerBlk: %d, %d, %d, M/N/K PerXdl: %d, %d, %d\n",
@@ -103,7 +114,7 @@ struct BlockwiseGemmXdlops_pipeline_hotloop_inst
                KPerXDL);
 
         printf(" A/B buffer load inst: %d, %d\n A/B LDS write inst: %d, %d\n A/B LDS read inst: "
-               "%d, %d\n C MFMA inst: %d\n"
+               "%d, %d\n C MFMA inst: %d C MFMA cycle: %d\n"
                "A/B LDS read width: %d, %d, A/B LDS write width: %d, %d, A/B buffer load width: "
                "%d/ %d\n",
                A_Buffer_Load_Inst_Num,
@@ -113,6 +124,7 @@ struct BlockwiseGemmXdlops_pipeline_hotloop_inst
                A_LDS_Read_Inst_Num,
                B_LDS_Read_Inst_Num,
                C_MFMA_Inst_Num,
+               C_MFMA_Inst_Cycle,
                A_LDS_Read_Width,
                B_LDS_Read_Width,
                ALDSWriteWidth,

From abd40d35691745dcc70dca778ad140b997f10ba7 Mon Sep 17 00:00:00 2001
From: aska-0096 <haocwang@amd.com>
Date: Wed, 23 Apr 2025 15:55:59 +0000
Subject: [PATCH 2/8] found a case that seems to have a vectorizer issue

---
 .../gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
index 21fa0e8453..d5db3199ca 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
@@ -147,7 +147,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu
         S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
         S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
         1,    1,   S<1, 32, 1, 8>, S<8, 8, 1>,
-        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, FP8>;
+        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v2, FP8>;
 // clang-format on
 
 int main(int argc, char* argv[])

From 2054e165bc4905e8ad003db44e0c2cfc6463e222 Mon Sep 17 00:00:00 2001
From: coderfeli <coderfeli@163.com>
Date: Thu, 24 Apr 2025 06:26:12 +0000
Subject: [PATCH 3/8] fix moe pipeline and change to compute tile

---
 .../gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp   |  6 +++---
 .../65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp  |  8 ++++----
 .../tensor_operation/gpu/grid/gridwise_moe_gemm.hpp  | 12 +++++++-----
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
index d5db3199ca..bd9c358b57 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
@@ -140,14 +140,14 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu
     // clang-format off
     <   Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
         AElementOp,  BElementOp, CDEElementOp, GemmSpec, 256,
-        32,   128,    128,
+        128,   128,    128,
         16,   16,
         32,   32,
-        1,    1,
+        4,    1,
         S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
         S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
         1,    1,   S<1, 32, 1, 8>, S<8, 8, 1>,
-        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v2, FP8>;
+        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, FP8>;
 // clang-format on
 
 int main(int argc, char* argv[])
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
index 04f10b53ae..12a6f4bcb2 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
@@ -185,8 +185,8 @@ int main(int argc, char* argv[])
     ck::index_t N               = 4096;
     ck::index_t K               = 4096;
     ck::index_t experts         = 8;
-    ck::index_t sorted_tile_num = 6;
-    ck::index_t valid_tile_num  = 6;
+    ck::index_t sorted_tile_num = 18;
+    ck::index_t valid_tile_num  = 16;
     ck::index_t sorted_size     = sorted_tile_num * MPerBlock;
     ck::index_t valid_size      = valid_tile_num * MPerBlock;
     ck::index_t tokens          = 128;
@@ -237,10 +237,10 @@ int main(int argc, char* argv[])
     // max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13};
     // int eids[]         = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 7, 3, 3, 3};
     max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8};
-    int eids[]         = {0, 1, 2, 3, 4, 5, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2}
+    // int eids[]         = {0, 1, 2, 3, 4, 5, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2}
     for(int i = 0; i < sorted_tile_num; i++)
     {
-        expert_ids.mData[i] = eids[i];
+        expert_ids.mData[i] = i / ((valid_tile_num + experts - 1) / experts);
     }
     if(tokens * topk > valid_size)
     {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
index a2d1114bbe..fba46d4ac6 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
@@ -186,7 +186,9 @@ struct GridwiseMoeGemm
         math::max(math::lcm(AK1Number, BK1Number), mfma_selector::selected_mfma.k_per_blk);
     static constexpr index_t KLane =
         mfma_selector::GetKPerXdlops() / mfma_selector::GetK1PerXdlops();
-    static constexpr index_t KRepeat = KPerBlock / KLane / KPack;
+
+    static constexpr index_t KGroup = mfma_selector::selected_mfma.k_per_blk == 32 ? 2 : 1;
+    static constexpr index_t KRepeat = KPerBlock / KLane / (KPack / KGroup);
     static constexpr index_t NLane   = NPerXdl;
     static constexpr index_t NWave   = NPerBlock / NPerXdl / NXdlPerWave;
     // static constexpr index_t NumTokens = 1;
@@ -246,7 +248,7 @@ struct GridwiseMoeGemm
     }
     __host__ __device__ static auto CalculateBK0Shuffled(index_t K)
     {
-        return math::integer_divide_ceil(K, KLane * KPack);
+        return math::integer_divide_ceil(K, KLane * KPack / KGroup);
     }
 
     __host__ __device__ static auto CalculateKPadded(index_t K)
@@ -388,7 +390,7 @@ struct GridwiseMoeGemm
 
     __host__ __device__ static auto MakeBGridDescriptor_Preshuffled(index_t N0, index_t K0)
     {
-        constexpr index_t NkSwizzleNumber = Number<warpSize * KPack>{};
+        constexpr index_t NkSwizzleNumber = Number<warpSize * KPack / KGroup>{};
         return make_naive_tensor_descriptor(
             make_tuple(N0 / NWave, NWave, K0, NkSwizzleNumber),
             make_tuple(NWave * K0 * NkSwizzleNumber, K0 * NkSwizzleNumber, NkSwizzleNumber, I1));
@@ -1297,7 +1299,7 @@ struct GridwiseMoeGemm
                   make_multi_index(n_block_data_idx_on_grid,
                                    get_warp_local_1d_id() % NWave,
                                    0,
-                                   KPack * (get_thread_local_1d_id() % warpSize)));
+                                   KPack / KGroup * (get_thread_local_1d_id() % warpSize)));
 
         // LDS allocation for A and B: be careful of alignment
         // Cast after lds
@@ -1804,7 +1806,7 @@ struct GridwiseMoeGemm
                   make_multi_index(n_block_data_idx_on_grid,
                                    get_warp_local_1d_id() % NWave,
                                    0,
-                                   KPack * (get_thread_local_1d_id() % warpSize)));
+                                   KPack / KGroup * (get_thread_local_1d_id() % warpSize)));
 
         // LDS allocation for A and B: be careful of alignment
         // Cast after lds

From f9c29b5ec796efe04840d583a2d807bfc185676f Mon Sep 17 00:00:00 2001
From: coderfeli <coderfeli@163.com>
Date: Fri, 25 Apr 2025 03:09:53 +0000
Subject: [PATCH 4/8] set 16x16

---
 .../moe_gemm1_xdl_fp8.cpp                     | 23 ++++++++--------
 .../gpu/grid/gridwise_moe_gemm.hpp            | 26 +++++++++----------
 2 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
index f594080755..ec33dbc24f 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
@@ -155,13 +155,13 @@ using BElementOp = PassThrough;
 
 static constexpr auto GemmSpec         = ck::tensor_operation::device::GemmSpecialization::Default;
 static constexpr ck::index_t MPerBlock = 128;
-static constexpr ck::index_t MXDLPerWave = 2;
-static constexpr ck::index_t NXDLPerWave = 2;
+static constexpr ck::index_t MXDLPerWave = 4;
+static constexpr ck::index_t NXDLPerWave = 4;
 static constexpr ck::index_t BLOCKSIZE   = 256;
 static constexpr ck::index_t NPerBlock   = 128;
-static constexpr ck::index_t MNPerXDL    = 32;
+static constexpr ck::index_t MNPerXDL    = 16;
 static constexpr ck::index_t KPerBlock   = 128 / sizeof(A0DataType);
-static constexpr ck::index_t Nswizzle    = true;
+static constexpr ck::index_t Nswizzle    = false;
 static constexpr bool MulRoutedWeight    = false;
 static constexpr ck::index_t AK1         = 16 / sizeof(A0DataType);
 static constexpr ck::index_t BK1         = 16 / sizeof(B0DataType);
@@ -188,7 +188,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm
                //    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
                //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
                 //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-                2,    1,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec, D2Vec>,
+                2,    2,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec, D2Vec>,
                ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, Nswizzle, true, MulRoutedWeight, A0DataType>;
 
 // clang-format on
@@ -201,11 +201,11 @@ int main(int argc, char* argv[])
 
     // GEMM shape
     ck::index_t N               = 4096;
-    ck::index_t K               = 4096;
+    ck::index_t K               = 6144;
     ck::index_t experts         = 8;
-    ck::index_t sorted_tile_num = 8;
-    ck::index_t valid_tile_num  = 8;
-    ck::index_t tokens          = 128;
+    ck::index_t sorted_tile_num = 133;
+    ck::index_t valid_tile_num  = 128;
+    ck::index_t tokens          = 8192;
     ck::index_t topk            = 2;
 
     // ck::index_t tokens = batch * topk;
@@ -268,11 +268,10 @@ int main(int argc, char* argv[])
     // int eids[] = {0, 0,1, 2,3, 3, 4,4, 5, 5, 6, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2}
     // max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13};
     // int eids[] = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2}
-    max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8};
-    int eids[]         = {0, 1, 2, 3, 4, 5, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2}
+    max_token_id.mData = {valid_size};
     for(int i = 0; i < sorted_tile_num; i++)
     {
-        expert_ids.mData[i] = eids[i];
+        expert_ids.mData[i] = i / (valid_tile_num / experts);
     }
     int token_per_tile = (tokens * topk + valid_tile_num - 1) / valid_tile_num;
     int tokenid        = 0;
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
index fba46d4ac6..7b399c6daa 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
@@ -1681,7 +1681,8 @@ struct GridwiseMoeGemm
         const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
             MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                 c_grid_desc_m_n, problem.MBlock, problem.NBlock);
-        const index_t max_token_id    = __builtin_amdgcn_readfirstlane(p_max_token_id[0]);
+        const index_t max_token_id = __builtin_amdgcn_readfirstlane(p_max_token_id[0]);
+        // static_assert(NSwizzle == false, "to do fix: need another pr in sorting merged");
         const index_t expert_block_id = NSwizzle ? blockIdx.x / problem.NBlock : blockIdx.y;
         if(expert_block_id * MPerBlock >= max_token_id)
             return;
@@ -1690,12 +1691,13 @@ struct GridwiseMoeGemm
         const auto block_mn = [&]() -> std::pair<int, int> {
             if constexpr(NSwizzle)
             {
-                const index_t ecnt_prefix    = p_max_token_id[1 + expert_id];
-                const index_t prefix_block   = ecnt_prefix * problem.NBlock;
-                const index_t ecnt           = p_max_token_id[2 + expert_id] - ecnt_prefix;
-                const index_t expert_swizzle = ecnt > 0 ? ecnt : 1;
-                const index_t bid_new        = blockIdx.x - prefix_block;
-                const index_t nid            = __builtin_amdgcn_readfirstlane(
+                const index_t ecnt_prefix  = p_max_token_id[1 + expert_id];
+                const index_t prefix_block = ecnt_prefix * problem.NBlock;
+                const index_t ecnt         = p_max_token_id[2 + expert_id] - ecnt_prefix;
+                const index_t expert_swizzle =
+                    ecnt > 0 ? ecnt : 1; // p_max_token_id[expert_id + 1]; // 2
+                const index_t bid_new = blockIdx.x - prefix_block;
+                const index_t nid     = __builtin_amdgcn_readfirstlane(
                     bid_new % 8 + bid_new / (8 * expert_swizzle) * 8);
                 const index_t mid =
                     __builtin_amdgcn_readfirstlane(ecnt_prefix + bid_new / 8 % expert_swizzle);
@@ -1708,7 +1710,6 @@ struct GridwiseMoeGemm
         }();
         const index_t block_n_id = block_mn.first;
         const index_t block_m_id = block_mn.second;
-
         const index_t token0 =
             __builtin_amdgcn_readfirstlane(p_sorted_token_ids[block_m_id * MPerBlock] & 0xffffff);
 
@@ -1720,11 +1721,9 @@ struct GridwiseMoeGemm
         constexpr auto AMRepeats  = MPerBlock / AMThreads;
         const index_t token_pos   = block_m_id * MPerBlock + threadIdx.x / AKThreads * AMRepeats;
 
-        if(token_pos >= max_token_id || expert_block_id * MPerBlock >= max_token_id ||
-           token0 >= problem.NumTokens)
+        if(token_pos >= max_token_id || token0 >= problem.NumTokens)
             return;
-        StaticallyIndexedArray<index_t, AMRepeats>
-            gather_offsets; //= p_sorted_token_ids[token_pos];
+        StaticallyIndexedArray<index_t, AMRepeats> gather_offsets;
         static_for<0, AMRepeats, 1>{}([&](auto m0) {
             const index_t fused_token = p_sorted_token_ids[token_pos + m0];
             index_t token_offset      = fused_token & 0xffffff;
@@ -2083,8 +2082,7 @@ struct GridwiseMoeGemm
             const float* p_sorted_weights_0 = p_ds_grid[I0];
             static_for<0, num_access, 1>{}([&](auto access_id) {
                 // make sure it's safe to write to LDS
-                StaticallyIndexedArray<index_t, EMRepeats>
-                    scatter_offsets; //= p_sorted_token_ids[c_token_pos];
+                StaticallyIndexedArray<index_t, EMRepeats> scatter_offsets;
                 StaticallyIndexedArray<float, EMRepeats> scatter_weights; //= for topk
 
                 auto dstidx = sfc_cde_block.GetIndex(access_id);

From e07ed1eda8439508ac415fbf949eb2b1bc3ef703 Mon Sep 17 00:00:00 2001
From: coderfeli <coderfeli@163.com>
Date: Fri, 25 Apr 2025 03:15:48 +0000
Subject: [PATCH 5/8] use v3

---
 example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
index ec33dbc24f..ec85afaec4 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
@@ -155,8 +155,8 @@ using BElementOp = PassThrough;
 
 static constexpr auto GemmSpec         = ck::tensor_operation::device::GemmSpecialization::Default;
 static constexpr ck::index_t MPerBlock = 128;
-static constexpr ck::index_t MXDLPerWave = 4;
-static constexpr ck::index_t NXDLPerWave = 4;
+static constexpr ck::index_t MXDLPerWave = 8;
+static constexpr ck::index_t NXDLPerWave = 2;
 static constexpr ck::index_t BLOCKSIZE   = 256;
 static constexpr ck::index_t NPerBlock   = 128;
 static constexpr ck::index_t MNPerXDL    = 16;
@@ -189,7 +189,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm
                //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
                 //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
                 2,    2,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec, D2Vec>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, Nswizzle, true, MulRoutedWeight, A0DataType>;
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, Nswizzle, true, MulRoutedWeight, A0DataType>;
 
 // clang-format on
 

From 946a2119cde03b2e7254c0a0cc0c9ab53ef9bfdc Mon Sep 17 00:00:00 2001
From: aska-0096 <haocwang@amd.com>
Date: Fri, 25 Apr 2025 05:12:47 +0000
Subject: [PATCH 6/8] temp save

---
 .../65_gemm_multiply_multiply/CMakeLists.txt  |   3 +-
 ..._multiply_multiply_xdl_fp8_bpreshuffle.cpp |  10 +-
 .../moe_gemm1_xdl_fp8.cpp                     |  26 +-
 ...e_gemm_pipeline_xdlops_b_preshuffle_v3.hpp | 625 +++++++++++++-----
 4 files changed, 467 insertions(+), 197 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt
index 1f407244fc..4f20431ac8 100644
--- a/example/65_gemm_multiply_multiply/CMakeLists.txt
+++ b/example/65_gemm_multiply_multiply/CMakeLists.txt
@@ -18,6 +18,7 @@ foreach(gpu IN LISTS GPU_TARGETS)
 endforeach()
 
 set(GEMM_OPTIONS)
-list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-16")
+list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32")
 list(APPEND GEMM_OPTIONS -v --save-temps -Wno-gnu-line-marker)
 target_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS})
+target_compile_options(example_moe_gemm1_xdl_fp8 PRIVATE ${GEMM_OPTIONS})
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
index d5db3199ca..c20dcaf31b 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
@@ -140,14 +140,14 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu
     // clang-format off
     <   Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
         AElementOp,  BElementOp, CDEElementOp, GemmSpec, 256,
-        32,   128,    128,
+        256,   256,    128,
         16,   16,
-        32,   32,
-        1,    1,
+        16,   16,
+        16,    4,
         S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
         S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
-        1,    1,   S<1, 32, 1, 8>, S<8, 8, 1>,
-        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v2, FP8>;
+        2,    1,   S<1, 32, 1, 8>, S<8, 8, 1>,
+        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, FP8>;
 // clang-format on
 
 int main(int argc, char* argv[])
diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
index f594080755..c29a63dfec 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
@@ -154,12 +154,12 @@ using AElementOp = PassThrough;
 using BElementOp = PassThrough;
 
 static constexpr auto GemmSpec         = ck::tensor_operation::device::GemmSpecialization::Default;
-static constexpr ck::index_t MPerBlock = 128;
-static constexpr ck::index_t MXDLPerWave = 2;
-static constexpr ck::index_t NXDLPerWave = 2;
+static constexpr ck::index_t MPerBlock = 256;
+static constexpr ck::index_t MXDLPerWave = 16;
+static constexpr ck::index_t NXDLPerWave = 4;
 static constexpr ck::index_t BLOCKSIZE   = 256;
-static constexpr ck::index_t NPerBlock   = 128;
-static constexpr ck::index_t MNPerXDL    = 32;
+static constexpr ck::index_t NPerBlock   = 256;
+static constexpr ck::index_t MNPerXDL    = 16;
 static constexpr ck::index_t KPerBlock   = 128 / sizeof(A0DataType);
 static constexpr ck::index_t Nswizzle    = true;
 static constexpr bool MulRoutedWeight    = false;
@@ -189,7 +189,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm
                //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
                 //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
                 2,    1,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec, D2Vec>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, Nswizzle, true, MulRoutedWeight, A0DataType>;
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, Nswizzle, true, MulRoutedWeight, A0DataType>;
 
 // clang-format on
 
@@ -201,11 +201,11 @@ int main(int argc, char* argv[])
 
     // GEMM shape
     ck::index_t N               = 4096;
-    ck::index_t K               = 4096;
+    ck::index_t K               = 6144;
     ck::index_t experts         = 8;
-    ck::index_t sorted_tile_num = 8;
-    ck::index_t valid_tile_num  = 8;
-    ck::index_t tokens          = 128;
+    ck::index_t sorted_tile_num = 133;
+    ck::index_t valid_tile_num  = 128;
+    ck::index_t tokens          = 8192;
     ck::index_t topk            = 2;
 
     // ck::index_t tokens = batch * topk;
@@ -269,11 +269,13 @@ int main(int argc, char* argv[])
     // max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13};
     // int eids[] = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2}
     max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8};
-    int eids[]         = {0, 1, 2, 3, 4, 5, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2}
+    // int eids[]         = {0, 1, 2, 3, 4, 5, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2}
+    
     for(int i = 0; i < sorted_tile_num; i++)
     {
-        expert_ids.mData[i] = eids[i];
+        expert_ids.mData[i] = i / (valid_tile_num / experts);
     }
+ 
     int token_per_tile = (tokens * topk + valid_tile_num - 1) / valid_tile_num;
     int tokenid        = 0;
     // sorted_token_ids.mData[0] = 0;
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
index da70929c46..ad071c210f 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
@@ -185,156 +185,374 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
         return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd;
     }
 
-    template <typename Stage>
-    __device__ static constexpr auto HotLoopScheduler(Stage stage)
+    struct HotLoopScheduler
     {
-        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
-        constexpr auto num_ds_write_inst_a    = HotLoopInstList::A_LDS_Write_Inst_Num;
-        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
-        constexpr auto num_buffer_load_inst_b = MWaves * HotLoopInstList::B_Buffer_Load_Inst_Num;
+        // MRepeat=16
+        // ds_read 256x128/64/16=32
+        // buffer_load_b 256x128/256/16 = 16
+        // 8
+        // ds_write_a = 8
+        // buffer_load_a 256x128/256/16 = 16
+        // mfma 256x256x128/4/16/16/128 = 64
+        // ds_write: 8 mfma, 4 repeat
+        // 6, 6,
+        // 15, 
+        
+        // 
 
-        constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num;
+        // Per repeat: 4 mfma
+        // ds_read 16x128/64/16=2
+        // 4 mfma are associated with 2 ds_read
+        // Calculate the stage that issues each instruction.
+        
+        // GMEM, SMEM and MFMA instructions
+        static constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
+        static constexpr auto num_ds_write_inst_a    = HotLoopInstList::A_LDS_Write_Inst_Num;
+        static constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        static constexpr auto num_buffer_load_inst_b =
+            MWaves * HotLoopInstList::B_Buffer_Load_Inst_Num;
 
-        constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
-        constexpr auto staged_num_mfma           = num_mfma / MRepeat;
+        static constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num;
 
-        constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a;
+        // We reuse the registers between adjacent MRepeat stages.
+        // staged_num_ds_read_inst_a and staged_num_mfma should be associated in a group
+        // so that the register resources are released and can be reused in the next MRepeat stage.
+        static constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
+        static constexpr auto staged_num_mfma           = num_mfma / MRepeat;
+        
+        static constexpr auto buffer_load_b_stage =0;
+        static constexpr auto ds_write_a_stage =4;
+        static constexpr auto buffer_load_a_stage =8;
+        static constexpr auto buffer_load_a_stage_end =12;
 
-        if constexpr(stage.value == 0)
+        // template <typename Stage>
+        __device__ constexpr auto operator()()
         {
-            constexpr auto staged_num_buffer_load_b_per_ds_read_a =
-                num_buffer_load_inst_b / staged_num_ds_read_inst_a;
-            constexpr auto staged_num_mfma_per_buffer_load_b =
-                staged_num_mfma / num_buffer_load_inst_b;
-            // B global
-            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
-                ignore = i_inst;
-
-                static_for<0, staged_num_buffer_load_b_per_ds_read_a - 1, 1>{}([&](auto ibuf_inst) {
-                    ignore = ibuf_inst;
-                    __builtin_amdgcn_sched_group_barrier(
-                        0x008, staged_num_mfma_per_buffer_load_b, 0);  // MFMA
+#if 0
+            if constexpr(stage.value >= buffer_load_b_stage && stage.value < ds_write_a_stage)
+            {
+                static_for<0, 4, 1>{}([&](auto i) {
+                    ignore = i;
                     __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                    __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                    __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                });
+#if 0 
+                constexpr auto staged_num_buffer_load_b_per_ds_read_a =
+                    num_buffer_load_inst_b / staged_num_ds_read_inst_a;
+                constexpr auto staged_num_mfma_per_buffer_load_b =
+                    staged_num_mfma / num_buffer_load_inst_b;
+                // B global
+                static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                    ignore = i_inst;
+
+                    static_for<0, staged_num_buffer_load_b_per_ds_read_a - 1, 1>{}(
+                        [&](auto ibuf_inst) {
+                            ignore = ibuf_inst;
+                            __builtin_amdgcn_sched_group_barrier(
+                                0x008, staged_num_mfma_per_buffer_load_b, 0);  // MFMA
+                            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                        });
+
+                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    __builtin_amdgcn_sched_group_barrier(
+                        0x008, staged_num_mfma_per_buffer_load_b - 1, 0); // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+                });
+#endif
+                // __builtin_amdgcn_sched_barrier(0);
+            }
+            else if constexpr(stage.value >= ds_write_a_stage && stage.value< buffer_load_a_stage)
+            {
+                static_for<0, 4, 1>{}([&](auto i) {
+                    ignore = i;
+                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
+                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
+                });
+#if 0
+                constexpr auto staged_num_mfma_per_ds_write_a =
+                    math::integer_divide_ceil(staged_num_mfma, num_ds_write_inst_a);
+
+                constexpr auto stage_more_mfma =
+                    staged_num_mfma - (staged_num_mfma_per_ds_write_a - 1) * num_ds_write_inst_a;
+
+                // A local write
+                static_for<0, num_ds_write_inst_a, 1>{}([&](auto i_inst) {
+                    if constexpr(i_inst.value < stage_more_mfma)
+                    {
+                        if(i_inst.value < staged_num_ds_read_inst_a)
+                        {
+                            __builtin_amdgcn_sched_group_barrier(
+                                0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                        }
+                        else
+                        {
+                            __builtin_amdgcn_sched_group_barrier(
+                                0x008, staged_num_mfma_per_ds_write_a, 0);     // MFMA
+                            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                        }
+                    }
+                    else
+                    {
+                        if(i_inst.value < staged_num_ds_read_inst_a)
+                        {
+                            __builtin_amdgcn_sched_group_barrier(
+                                0x008, staged_num_mfma_per_ds_write_a - 2, 0); // MFMA
+                            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                        }
+                        else
+                        {
+                            __builtin_amdgcn_sched_group_barrier(
+                                0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                        }
+                    }
+                });
+#endif
+                // __builtin_amdgcn_sched_barrier(0);
+            }
+            else if constexpr(stage.value >= buffer_load_a_stage && stage.value <buffer_load_a_stage_end)
+            {
+                static_for<0, 4, 1>{}([&](auto i) {
+                    ignore = i;
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                    __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                    __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                });
+#if 0 
+                constexpr auto staged_num_mfma_per_buffer_load_a =
+                    math::integer_divide_ceil(staged_num_mfma, num_buffer_load_inst_a);
+
+                constexpr auto stage_more_mfma =
+                    staged_num_mfma -
+                    (staged_num_mfma_per_buffer_load_a - 1) * num_buffer_load_inst_a;
+
+                // A global
+                static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i_inst) {
+                    if constexpr(i_inst.value < stage_more_mfma)
+                    {
+                        if(i_inst.value < staged_num_ds_read_inst_a)
+                        {
+                            __builtin_amdgcn_sched_group_barrier(
+                                0x008, staged_num_mfma_per_buffer_load_a - 1, 0); // MFMA
+                            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+                            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);    // MFMA
+                            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);    // DS read
+                        }
+                        else
+                        {
+                            __builtin_amdgcn_sched_group_barrier(
+                                0x008, staged_num_mfma_per_buffer_load_a, 0);  // MFMA
+                            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                        }
+                    }
+                    else
+                    {
+                        if(i_inst.value < staged_num_ds_read_inst_a)
+                        {
+                            __builtin_amdgcn_sched_group_barrier(
+                                0x008, staged_num_mfma_per_buffer_load_a - 2, 0); // MFMA
+                            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+                            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);    // MFMA
+                            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);    // DS read
+                        }
+                        else
+                        {
+                            __builtin_amdgcn_sched_group_barrier(
+                                0x008, staged_num_mfma_per_buffer_load_a - 1, 0); // MFMA
+                            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+                        }
+                    }
                 });
 
+                __builtin_amdgcn_sched_barrier(0);
+#endif
+            }
+            else
+            {
+                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+#if 0
+                // A local Read
+                static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                    ignore = i_inst;
+                    __builtin_amdgcn_sched_group_barrier(
+                        0x008, staged_num_mfma_per_ds_read_a, 0);      // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                });
+
+                __builtin_amdgcn_sched_barrier(0);
+#endif
+            }
+#endif
+#if 0
+            static_for<0, 4, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            });
+
+            static_for<0, 4, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
                 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
                 __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                __builtin_amdgcn_sched_group_barrier(
-                    0x008, staged_num_mfma_per_buffer_load_b - 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
-            });
-
-            __builtin_amdgcn_sched_barrier(0);
-        }
-        else if constexpr(stage.value == 1)
-        {
-            constexpr auto staged_num_mfma_per_ds_write_a =
-                math::integer_divide_ceil(staged_num_mfma, num_ds_write_inst_a);
-
-            constexpr auto stage_more_mfma =
-                staged_num_mfma - (staged_num_mfma_per_ds_write_a - 1) * num_ds_write_inst_a;
-
-            // A local write
-            static_for<0, num_ds_write_inst_a, 1>{}([&](auto i_inst) {
-                if constexpr(i_inst.value < stage_more_mfma)
-                {
-                    if(i_inst.value < staged_num_ds_read_inst_a)
-                    {
-                        __builtin_amdgcn_sched_group_barrier(
-                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    }
-                    else
-                    {
-                        __builtin_amdgcn_sched_group_barrier(
-                            0x008, staged_num_mfma_per_ds_write_a, 0);     // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                    }
-                }
-                else
-                {
-                    if(i_inst.value < staged_num_ds_read_inst_a)
-                    {
-                        __builtin_amdgcn_sched_group_barrier(
-                            0x008, staged_num_mfma_per_ds_write_a - 2, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    }
-                    else
-                    {
-                        __builtin_amdgcn_sched_group_barrier(
-                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                    }
-                }
-            });
-
-            __builtin_amdgcn_sched_barrier(0);
-        }
-        else if constexpr(stage.value == 2)
-        {
-            constexpr auto staged_num_mfma_per_buffer_load_a =
-                math::integer_divide_ceil(staged_num_mfma, num_buffer_load_inst_a);
-
-            constexpr auto stage_more_mfma =
-                staged_num_mfma - (staged_num_mfma_per_buffer_load_a - 1) * num_buffer_load_inst_a;
-
-            // A global
-            static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i_inst) {
-                if constexpr(i_inst.value < stage_more_mfma)
-                {
-                    if(i_inst.value < staged_num_ds_read_inst_a)
-                    {
-                        __builtin_amdgcn_sched_group_barrier(
-                            0x008, staged_num_mfma_per_buffer_load_a - 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);    // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);    // DS read
-                    }
-                    else
-                    {
-                        __builtin_amdgcn_sched_group_barrier(
-                            0x008, staged_num_mfma_per_buffer_load_a, 0);  // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                    }
-                }
-                else
-                {
-                    if(i_inst.value < staged_num_ds_read_inst_a)
-                    {
-                        __builtin_amdgcn_sched_group_barrier(
-                            0x008, staged_num_mfma_per_buffer_load_a - 2, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);    // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);    // DS read
-                    }
-                    else
-                    {
-                        __builtin_amdgcn_sched_group_barrier(
-                            0x008, staged_num_mfma_per_buffer_load_a - 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
-                    }
-                }
-            });
-
-            __builtin_amdgcn_sched_barrier(0);
-        }
-        else
-        {
-            // A local Read
-            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
-                ignore = i_inst;
-                __builtin_amdgcn_sched_group_barrier(
-                    0x008, staged_num_mfma_per_ds_read_a, 0);      // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
                 __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
             });
 
-            __builtin_amdgcn_sched_barrier(0);
+            static_for<0, 4, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            });
+
+            static_for<0, 4, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            });
+#endif
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+
+            static_for<0, 4, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            });
         }
-    }
+    };
 
     template <typename Stage>
     __device__ static constexpr auto EpilogueScheduler_1(Stage stage)
@@ -538,14 +756,16 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
 
         // Local prefetch A1
         block_sync_lds();
-        static_for<0, KRepeat, 1>{}([&](auto k0) {
-            static_for<0, KGroup, 1>{}([&](auto kg0) {
-                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                   make_tuple(I0, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
-                                   a_block_buf.At(I0),
-                                   a_thread_desc_,
-                                   make_tuple(I0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
-                                   a_thread_buf);
+        static_for<0, 2, 1>{}([&](auto m0) {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, KGroup, 1>{}([&](auto kg0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                       make_tuple(m0, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
+                                       a_block_buf.At(I0),
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                       a_thread_buf);
+                });
             });
         });
 
@@ -557,12 +777,25 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
         // main body
         if constexpr(HasMainLoop)
         {
+            HotLoopScheduler scheduler;
             index_t i = 0;
             do
             {
                 auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) {
+                    b_blockwise_copy.Run(b_grid_desc,
+                        b_grid_buf,
+                        b_block_desc_n0_n1_k0_k1,
+                        b_block_origin_idx,
+                        b_thread_bufs(local_read_buf));
+                    b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                    a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(local_read_buf));
+                    a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+                    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
-                        if constexpr(m0.value == 0)
+#if 0
+                        if constexpr(m0.value == scheduler.buffer_load_b_stage)
                         {
                             b_blockwise_copy.Run(b_grid_desc,
                                                  b_grid_buf,
@@ -571,16 +804,16 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                                  b_thread_bufs(local_read_buf));
                             b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
                         }
-                        else if constexpr(m0.value == 1)
+                        else if constexpr(m0.value == scheduler.ds_write_a_stage)
                         {
                             a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(local_read_buf));
                         }
-                        else if constexpr(m0.value == 2)
+                        else if constexpr(m0.value == scheduler.buffer_load_a_stage)
                         {
                             a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
                             a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
                         }
-
+#endif
                         static_for<0, KRepeat, 1>{}([&](auto k0) {
                             static_for<0, NRepeat, 1>{}([&](auto n0) {
                                 vector_type<ComputeDataType, KPack> a_thread_vec;
@@ -589,13 +822,8 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                 static_for<0, KPack, 1>{}([&](auto ik) {
                                     a_thread_vec.template AsType<ComputeDataType>()(ik) =
                                         a_thread_buf[Number<a_thread_desc_.CalculateOffset(
-                                            make_tuple((m0 + HotloopLocalBufSwitch * mfma_reg_buf) %
-                                                           2,
-                                                       I0,
-                                                       I0,
-                                                       k0,
-                                                       I0,
-                                                       ik))>{}];
+                                            make_tuple((m0 + HotloopLocalBufSwitch * mfma_reg_buf) % 2,
+                                                       I0, I0, k0, I0, ik))>{}];
                                     b_thread_vec.template AsType<ComputeDataType>()(ik) =
                                         b_thread_bufs[mfma_reg_buf]
                                                      [Number<b_thread_desc_.CalculateOffset(
@@ -616,7 +844,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                             });
                         });
 
-                        if constexpr(m0.value == MRepeat - 1)
+                        if constexpr(m0.value == (MRepeat - 2))
                         {
                             block_sync_lds();
 
@@ -624,7 +852,24 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                 static_for<0, KGroup, 1>{}([&](auto kg0) {
                                     a_thread_copy_.Run(
                                         a_block_desc_m0_m1_m2_k0_k1_k2,
-                                        make_tuple(Number<(m0 + 1) % MRepeat>{},
+                                        make_tuple(Number<(m0 + 2) % MRepeat>{}, I0, I0,
+                                                   Number<k0 * 2 + kg0>{}, I0, I0),
+                                        a_block_buf.At(local_read_buf),
+                                        a_thread_desc_,
+                                        make_tuple(
+                                            Number<(m0 + 2 + HotloopLocalBufSwitch * mfma_reg_buf) % 2>{},
+                                            I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                        a_thread_buf);
+                                });
+                            });
+                        }
+                        else if constexpr(m0.value == (MRepeat - 1))
+                        {
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                static_for<0, KGroup, 1>{}([&](auto kg0) {
+                                    a_thread_copy_.Run(
+                                        a_block_desc_m0_m1_m2_k0_k1_k2,
+                                        make_tuple(Number<(m0 + 2) % MRepeat>{},
                                                    I0,
                                                    I0,
                                                    Number<k0 * 2 + kg0>{},
@@ -633,7 +878,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                         a_block_buf.At(local_read_buf),
                                         a_thread_desc_,
                                         make_tuple(
-                                            Number<(m0 + 1 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                            Number<(m0 + 2 + HotloopLocalBufSwitch * mfma_reg_buf) %
                                                    2>{},
                                             I0,
                                             I0,
@@ -650,7 +895,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                 static_for<0, KGroup, 1>{}([&](auto kg0) {
                                     a_thread_copy_.Run(
                                         a_block_desc_m0_m1_m2_k0_k1_k2,
-                                        make_tuple(Number<(m0 + 1) % MRepeat>{},
+                                        make_tuple(Number<(m0 + 2) % MRepeat>{},
                                                    I0,
                                                    I0,
                                                    Number<k0 * 2 + kg0>{},
@@ -659,7 +904,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                         a_block_buf.At(mfma_reg_buf),
                                         a_thread_desc_,
                                         make_tuple(
-                                            Number<(m0 + 1 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                            Number<(m0 + 2 + HotloopLocalBufSwitch * mfma_reg_buf) %
                                                    2>{},
                                             I0,
                                             I0,
@@ -671,8 +916,8 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                             });
                         }
 
-                        HotLoopScheduler(m0);
                     });
+                    scheduler();
                 };
 
                 LoopFunc(I0, I1);
@@ -684,7 +929,15 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
         // tail
         if constexpr(TailNum == TailNumber::Even)
         {
+            b_blockwise_copy.Run(b_grid_desc,
+                                 b_grid_buf,
+                                 b_block_desc_n0_n1_k0_k1,
+                                 b_block_origin_idx,
+                                 b_thread_bufs(I1));
+            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1));
+
             static_for<0, MRepeat, 1>{}([&](auto m0) {
+#if 0
                 if constexpr(m0.value == 0)
                 {
                     b_blockwise_copy.Run(b_grid_desc,
@@ -693,11 +946,11 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                          b_block_origin_idx,
                                          b_thread_bufs(I1));
                 }
-                else if constexpr(m0.value == MRepeat - 1)
+                else if constexpr(m0.value == (MRepeat - 2))
                 {
                     a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1));
                 }
-
+#endif
                 static_for<0, KRepeat, 1>{}([&](auto k0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
                         vector_type<ComputeDataType, KPack> a_thread_vec;
@@ -724,7 +977,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                     });
                 });
 
-                if constexpr(m0.value == MRepeat - 1)
+                if constexpr(m0.value == (MRepeat - 2))
                 {
                     block_sync_lds();
 
@@ -732,7 +985,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                         static_for<0, KGroup, 1>{}([&](auto kg0) {
                             a_thread_copy_.Run(
                                 a_block_desc_m0_m1_m2_k0_k1_k2,
-                                make_tuple(Number<(m0 + 1) % MRepeat>{},
+                                make_tuple(Number<(m0 + 2) % MRepeat>{},
                                            I0,
                                            I0,
                                            Number<k0 * 2 + kg0>{},
@@ -741,7 +994,27 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                 a_block_buf.At(I1),
                                 a_thread_desc_,
                                 make_tuple(
-                                    Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                    Number<(m0 + 2) % 2>{}, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                a_thread_buf);
+                        });
+                    });
+                }
+                else if constexpr(m0.value == (MRepeat - 1))
+                {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        static_for<0, KGroup, 1>{}([&](auto kg0) {
+                            a_thread_copy_.Run(
+                                a_block_desc_m0_m1_m2_k0_k1_k2,
+                                make_tuple(Number<(m0 + 2) % MRepeat>{},
+                                           I0,
+                                           I0,
+                                           Number<k0 * 2 + kg0>{},
+                                           I0,
+                                           I0),
+                                a_block_buf.At(I1),
+                                a_thread_desc_,
+                                make_tuple(
+                                    Number<(m0 + 2) % 2>{}, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
                                 a_thread_buf);
                         });
                     });
@@ -752,7 +1025,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                         static_for<0, KGroup, 1>{}([&](auto kg0) {
                             a_thread_copy_.Run(
                                 a_block_desc_m0_m1_m2_k0_k1_k2,
-                                make_tuple(Number<(m0 + 1) % MRepeat>{},
+                                make_tuple(Number<(m0 + 2) % MRepeat>{},
                                            I0,
                                            I0,
                                            Number<k0 * 2 + kg0>{},
@@ -761,13 +1034,11 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                 a_block_buf.At(I0),
                                 a_thread_desc_,
                                 make_tuple(
-                                    Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                    Number<(m0 + 2) % 2>{}, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
                                 a_thread_buf);
                         });
                     });
                 }
-
-                EpilogueScheduler_1(m0);
             });
 
             static_for<0, MRepeat, 1>{}([&](auto m0) {
@@ -797,17 +1068,17 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                     });
                 });
 
-                if constexpr(m0.value != (MRepeat - 1))
+                if constexpr(m0.value < (MRepeat - 2))
                 {
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
                         static_for<0, KGroup, 1>{}([&](auto kg0) {
                             a_thread_copy_.Run(
                                 a_block_desc_m0_m1_m2_k0_k1_k2,
                                 make_tuple(
-                                    Number<m0 + 1>{}, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
+                                    Number<m0 + 2>{}, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
                                 a_block_buf.At(I1),
                                 a_thread_desc_,
-                                make_tuple(Number<(m0 + 1 + HotloopLocalBufSwitch) % 2>{},
+                                make_tuple(Number<(m0 + 2 + HotloopLocalBufSwitch) % 2>{},
                                            I0,
                                            I0,
                                            k0,
@@ -817,12 +1088,10 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                         });
                     });
 
-                    EpilogueScheduler_2();
                 }
             });
             // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle
             // latency
-            // __builtin_amdgcn_sched_barrier(0);
         }
         else if constexpr(TailNum == TailNumber::Odd)
         {
@@ -853,23 +1122,21 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                     });
                 });
 
-                if constexpr(m0.value != (MRepeat - 1))
+                if constexpr(m0.value < (MRepeat - 2))
                 {
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
                         static_for<0, KGroup, 1>{}([&](auto kg0) {
                             a_thread_copy_.Run(
                                 a_block_desc_m0_m1_m2_k0_k1_k2,
                                 make_tuple(
-                                    Number<m0 + 1>{}, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
+                                    Number<m0 + 2>{}, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
                                 a_block_buf.At(I0),
                                 a_thread_desc_,
                                 make_tuple(
-                                    Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                    Number<(m0 + 2) % 2>{}, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
                                 a_thread_buf);
                         });
                     });
-
-                    EpilogueScheduler_2();
                 }
             });
         }

From 49338edb1b3977411a4523bdeade44142ed9b848 Mon Sep 17 00:00:00 2001
From: aska-0096 <haocwang@amd.com>
Date: Sun, 27 Apr 2025 08:05:20 +0000
Subject: [PATCH 7/8] tempsave

---
 .../65_gemm_multiply_multiply/CMakeLists.txt  |   1 +
 .../moe_gemm1_xdl_fp8.cpp                     |   4 +-
 .../moe_gemm2_xdl_fp8.cpp                     |  18 +-
 ...e_gemm_pipeline_xdlops_b_preshuffle_v3.hpp | 651 +++---------------
 .../gpu/grid/gridwise_moe_gemm.hpp            |   2 +-
 5 files changed, 118 insertions(+), 558 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt
index 4f20431ac8..01ac4a3179 100644
--- a/example/65_gemm_multiply_multiply/CMakeLists.txt
+++ b/example/65_gemm_multiply_multiply/CMakeLists.txt
@@ -22,3 +22,4 @@ list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllv
 list(APPEND GEMM_OPTIONS -v --save-temps -Wno-gnu-line-marker)
 target_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS})
 target_compile_options(example_moe_gemm1_xdl_fp8 PRIVATE ${GEMM_OPTIONS})
+target_compile_options(example_moe_gemm2_xdl_fp8 PRIVATE ${GEMM_OPTIONS})
diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
index 9387c022da..6227f727a2 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
@@ -181,7 +181,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm
                // mn_perxdl
                MNPerXDL,   MNPerXDL,
                // mn_xdlperwave 
-               MXDLPerWave,    NXDLPerWave,
+               MXDLPerWave,  NXDLPerWave,
                // a,b: loadtranfer cluster, cluster order, srcorder,VECDIM, srcpervec, dstpervec, lds_extra
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 0,
@@ -273,7 +273,7 @@ int main(int argc, char* argv[])
     {
         expert_ids.mData[i] = i / (valid_tile_num / experts);
     }
- 
+
     int token_per_tile = (tokens * topk + valid_tile_num - 1) / valid_tile_num;
     int tokenid        = 0;
     // sorted_token_ids.mData[0] = 0;
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
index 12a6f4bcb2..f7935f9d65 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
@@ -117,12 +117,12 @@ using BElementOp   = PassThrough;
 using CDEElementOp = MulABScaleExpertWeight;
 
 static constexpr auto GemmSpec         = ck::tensor_operation::device::GemmSpecialization::Default;
-static constexpr ck::index_t MPerBlock = 128;
+static constexpr ck::index_t MPerBlock = 256;
 static constexpr ck::index_t BLOCKSIZE = 256;
-static constexpr ck::index_t MXDLPerWave = 2;
-static constexpr ck::index_t NXDLPerWave = 2;
-static constexpr ck::index_t NPerBlock   = 128;
-static constexpr ck::index_t MNPerXDL    = 32;
+static constexpr ck::index_t MXDLPerWave = 16;
+static constexpr ck::index_t NXDLPerWave = 4;
+static constexpr ck::index_t NPerBlock   = 256;
+static constexpr ck::index_t MNPerXDL    = 16;
 static constexpr ck::index_t KPerBlock   = 128 / sizeof(A0DataType);
 
 // static constexpr ck::index_t MXDLPerWave = MPerBlock / 32; //todo fix this constraint
@@ -160,12 +160,12 @@ using DeviceOpInstance                     = ck::tensor_operation::device::Devic
             //    S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
             //    S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
-               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 0,
                //    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
                //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
                 //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
                2,        1,         S<1, CShuffleMLane, 1, CShuffleNLane>, S<EVec, D0Vec, D1Vec, D2Vec>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, false, false, MulRoutedWeight, A0DataType>;
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, false, false, MulRoutedWeight, A0DataType>;
         // kernel 2: 128->32x128x128
         //  <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   128,   32,   128,    128,  16,  16,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,     S<8, 16, 1>,    S<1, 0, 2>,     S<1, 0, 2>,             2,              16,             16,          0,          1,           1,               S<1, 16, 1, 8>,      S<8, 8, 1>,  ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, EDataType>;
 
@@ -185,8 +185,8 @@ int main(int argc, char* argv[])
     ck::index_t N               = 4096;
     ck::index_t K               = 4096;
     ck::index_t experts         = 8;
-    ck::index_t sorted_tile_num = 18;
-    ck::index_t valid_tile_num  = 16;
+    ck::index_t sorted_tile_num = 133;
+    ck::index_t valid_tile_num  = 128;
     ck::index_t sorted_size     = sorted_tile_num * MPerBlock;
     ck::index_t valid_size      = valid_tile_num * MPerBlock;
     ck::index_t tokens          = 128;
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
index ad071c210f..2f1616a866 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
@@ -185,516 +185,94 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
         return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd;
     }
 
-    struct HotLoopScheduler
+    __device__ static constexpr auto HotLoopScheduler()
     {
-        // MRepeat=16
-        // ds_read 256x128/64/16=32
-        // buffer_load_b 256x128/256/16 = 16
-        // 8
-        // ds_write_a = 8
-        // buffer_load_a 256x128/256/16 = 16
-        // mfma 256x256x128/4/16/16/128 = 64
-        // ds_write: 8 mfma, 4 repeat
-        // 6, 6,
-        // 15, 
-        
-        // 
+        // A/B split schedule
+        // compiler is likely to use ds_read2 when instruction width is smaller than 16 bytes
+        constexpr auto num_ds_read_inst_a =
+            HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16
+                ? HotLoopInstList::A_LDS_Read_Inst_Num
+                : HotLoopInstList::A_LDS_Read_Inst_Num / 2;
 
-        // Per repeat: 4 mfma
-        // ds_read 16x128/64/16=2
-        // 4 mfma assioate with 2 dsread
-        // calculate the stage that issue each instructions.
-        
-        // GMEM, SMEM and MFMA instructions
-        static constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
-        static constexpr auto num_ds_write_inst_a    = HotLoopInstList::A_LDS_Write_Inst_Num;
-        static constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
-        static constexpr auto num_buffer_load_inst_b =
-            MWaves * HotLoopInstList::B_Buffer_Load_Inst_Num;
+        constexpr auto num_ds_write_inst_a = HotLoopInstList::A_LDS_Write_Inst_Num;
 
-        static constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num;
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
 
-        // We reuse the register between adjacent MRepeat stages.
-        // staged_num_ds_read_inst_a staged_num_mfma should be associated in a group
-        // So that the register resource released and could be reused in next iMRepeat stage
-        static constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
-        static constexpr auto staged_num_mfma           = num_mfma / MRepeat;
-        
-        static constexpr auto buffer_load_b_stage =0;
-        static constexpr auto ds_write_a_stage =4;
-        static constexpr auto buffer_load_a_stage =8;
-        static constexpr auto buffer_load_a_stage_end =12;
+        constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num;
+        constexpr auto mfma_cycle    = HotLoopInstList::C_MFMA_Inst_Cycle;
 
-        // template <typename Stage>
-        __device__ constexpr auto operator()()
-        {
-#if 0
-            if constexpr(stage.value >= buffer_load_b_stage && stage.value < ds_write_a_stage)
+        constexpr auto ds_read_a_issue_cycle =
+            HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
+        constexpr auto ds_read_a_mfma_rate =
+            math::integer_divide_ceil(mfma_cycle - 4, 2 * ds_read_a_issue_cycle);
+
+        // constexpr auto num_dsread_a_mfma =
+        //     (num_ds_read_inst_a + ds_read_a_mfma_rate - 1) / ds_read_a_mfma_rate;
+
+        constexpr auto num_stages = MRepeat;
+
+        // Group num_mfma_perstage num_ds_read_a_perstage
+        // since we want to reuse a local register buffer
+        constexpr auto num_mfma_perstage      = num_mfma_inst / num_stages;
+        constexpr auto num_ds_read_a_perstage = num_ds_read_inst_a / num_stages;
+
+        constexpr auto num_ds_read_a_mfma_perstage =
+            math::integer_divide_ceil(num_ds_read_a_perstage, ds_read_a_mfma_rate);
+
+        constexpr auto num_mfma_per_issue_more = math::integer_divide_ceil(
+            num_mfma_inst, num_buffer_load_inst_a + num_buffer_load_inst_b);
+        constexpr auto num_mfma_per_issue_less = math::integer_divide_floor(
+            num_mfma_inst, num_buffer_load_inst_a + num_buffer_load_inst_b);
+        // Insert more mfmas between bufferloads
+        constexpr auto num_stage1_bufferloads =
+            num_mfma_inst -
+            (num_buffer_load_inst_a + num_buffer_load_inst_b) * num_mfma_per_issue_less;
+        constexpr auto num_stage1_mfma = num_mfma_per_issue_more * num_stage1_bufferloads;
+        // Insert less mfmas between bufferloads
+        // constexpr auto num_stage2_mfma = num_mfma_inst - num_stage1_mfma;
+
+        constexpr auto buffer_load_issue_point     = 0;
+        constexpr auto ds_write_issue_point_stage1 = num_mfma_per_issue_more >= 3 ? 1 : 0;
+        constexpr auto ds_write_issue_point_stage2 = num_mfma_per_issue_less >= 3 ? 1 : 0;
+
+        static_for<0, num_mfma_inst, 1>{}([&](auto i) {
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+
+            // Group num_mfma_perstage num_ds_read_a_perstage
+            // Hide A lds rd issue latency at beginning of each stage
+            if constexpr((i % num_mfma_perstage) >=
+                         (num_mfma_perstage - num_ds_read_a_mfma_perstage))
             {
-                static_for<0, 4, 1>{}([&](auto i) {
-                    ignore = i;
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                });
-#if 0 
-                constexpr auto staged_num_buffer_load_b_per_ds_read_a =
-                    num_buffer_load_inst_b / staged_num_ds_read_inst_a;
-                constexpr auto staged_num_mfma_per_buffer_load_b =
-                    staged_num_mfma / num_buffer_load_inst_b;
-                // B global
-                static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
-                    ignore = i_inst;
-
-                    static_for<0, staged_num_buffer_load_b_per_ds_read_a - 1, 1>{}(
-                        [&](auto ibuf_inst) {
-                            ignore = ibuf_inst;
-                            __builtin_amdgcn_sched_group_barrier(
-                                0x008, staged_num_mfma_per_buffer_load_b, 0);  // MFMA
-                            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                        });
-
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    __builtin_amdgcn_sched_group_barrier(
-                        0x008, staged_num_mfma_per_buffer_load_b - 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
-                });
-#endif
-                // __builtin_amdgcn_sched_barrier(0);
+                __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
             }
-            else if constexpr(stage.value >= ds_write_a_stage && stage.value< buffer_load_a_stage)
+
+            // Schedule VMEM access instruction distributed evenly in the loop
+            // Hide B/A global rd issue latency
+            if constexpr(((i < num_stage1_mfma) &&
+                          (i % num_mfma_per_issue_more == buffer_load_issue_point)) ||
+                         ((i >= num_stage1_mfma) &&
+                          ((i - num_stage1_mfma) % num_mfma_per_issue_less ==
+                           buffer_load_issue_point)))
             {
-                static_for<0, 4, 1>{}([&](auto i) {
-                    ignore = i;
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
-                });
-#if 0
-                constexpr auto staged_num_mfma_per_ds_write_a =
-                    math::integer_divide_ceil(staged_num_mfma, num_ds_write_inst_a);
-
-                constexpr auto stage_more_mfma =
-                    staged_num_mfma - (staged_num_mfma_per_ds_write_a - 1) * num_ds_write_inst_a;
-
-                // A local write
-                static_for<0, num_ds_write_inst_a, 1>{}([&](auto i_inst) {
-                    if constexpr(i_inst.value < stage_more_mfma)
-                    {
-                        if(i_inst.value < staged_num_ds_read_inst_a)
-                        {
-                            __builtin_amdgcn_sched_group_barrier(
-                                0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
-                            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                        }
-                        else
-                        {
-                            __builtin_amdgcn_sched_group_barrier(
-                                0x008, staged_num_mfma_per_ds_write_a, 0);     // MFMA
-                            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                        }
-                    }
-                    else
-                    {
-                        if(i_inst.value < staged_num_ds_read_inst_a)
-                        {
-                            __builtin_amdgcn_sched_group_barrier(
-                                0x008, staged_num_mfma_per_ds_write_a - 2, 0); // MFMA
-                            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                        }
-                        else
-                        {
-                            __builtin_amdgcn_sched_group_barrier(
-                                0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
-                            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                        }
-                    }
-                });
-#endif
-                // __builtin_amdgcn_sched_barrier(0);
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
             }
-            else if constexpr(stage.value >= buffer_load_a_stage && stage.value <buffer_load_a_stage_end)
+
+            // Hide A lds wr issue latency
+            if constexpr((((i < num_stage1_mfma) &&
+                           (i % num_mfma_per_issue_more == ds_write_issue_point_stage1)) ||
+                          ((i >= num_stage1_mfma) &&
+                           ((i - num_stage1_mfma) % num_mfma_per_issue_less ==
+                            ds_write_issue_point_stage2))) &&
+                         (((i < num_stage1_mfma) &&
+                           ((i / num_mfma_per_issue_more) < num_ds_write_inst_a)) ||
+                          ((i >= num_stage1_mfma) &&
+                           ((i - num_stage1_mfma) / num_mfma_per_issue_less +
+                            num_stage1_bufferloads) < num_ds_write_inst_a)))
             {
-                static_for<0, 4, 1>{}([&](auto i) {
-                    ignore = i;
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                    __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                });
-#if 0 
-                constexpr auto staged_num_mfma_per_buffer_load_a =
-                    math::integer_divide_ceil(staged_num_mfma, num_buffer_load_inst_a);
-
-                constexpr auto stage_more_mfma =
-                    staged_num_mfma -
-                    (staged_num_mfma_per_buffer_load_a - 1) * num_buffer_load_inst_a;
-
-                // A global
-                static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i_inst) {
-                    if constexpr(i_inst.value < stage_more_mfma)
-                    {
-                        if(i_inst.value < staged_num_ds_read_inst_a)
-                        {
-                            __builtin_amdgcn_sched_group_barrier(
-                                0x008, staged_num_mfma_per_buffer_load_a - 1, 0); // MFMA
-                            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
-                            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);    // MFMA
-                            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);    // DS read
-                        }
-                        else
-                        {
-                            __builtin_amdgcn_sched_group_barrier(
-                                0x008, staged_num_mfma_per_buffer_load_a, 0);  // MFMA
-                            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                        }
-                    }
-                    else
-                    {
-                        if(i_inst.value < staged_num_ds_read_inst_a)
-                        {
-                            __builtin_amdgcn_sched_group_barrier(
-                                0x008, staged_num_mfma_per_buffer_load_a - 2, 0); // MFMA
-                            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
-                            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);    // MFMA
-                            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);    // DS read
-                        }
-                        else
-                        {
-                            __builtin_amdgcn_sched_group_barrier(
-                                0x008, staged_num_mfma_per_buffer_load_a - 1, 0); // MFMA
-                            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
-                        }
-                    }
-                });
-
-                __builtin_amdgcn_sched_barrier(0);
-#endif
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
             }
-            else
-            {
-                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-#if 0
-                // A local Read
-                static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
-                    ignore = i_inst;
-                    __builtin_amdgcn_sched_group_barrier(
-                        0x008, staged_num_mfma_per_ds_read_a, 0);      // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                });
-
-                __builtin_amdgcn_sched_barrier(0);
-#endif
-            }
-#endif
-#if 0
-            static_for<0, 4, 1>{}([&](auto i) {
-                ignore = i;
-                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            });
-
-            static_for<0, 4, 1>{}([&](auto i) {
-                ignore = i;
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            });
-
-            static_for<0, 4, 1>{}([&](auto i) {
-                ignore = i;
-                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            });
-
-            static_for<0, 4, 1>{}([&](auto i) {
-                ignore = i;
-                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            });
-#endif
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-
-            static_for<0, 4, 1>{}([&](auto i) {
-                ignore = i;
-                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            });
-        }
-    };
-
-    template <typename Stage>
-    __device__ static constexpr auto EpilogueScheduler_1(Stage stage)
-    {
-        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
-        constexpr auto num_ds_write_inst_a    = HotLoopInstList::A_LDS_Write_Inst_Num;
-        constexpr auto num_buffer_load_inst_b = MWaves * HotLoopInstList::B_Buffer_Load_Inst_Num;
-
-        constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num;
-
-        constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
-        constexpr auto staged_num_mfma           = num_mfma / MRepeat;
-
-        constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a;
-
-        if constexpr(stage.value == 0)
-        {
-            constexpr auto staged_num_buffer_load_b_per_ds_read_a =
-                num_buffer_load_inst_b / staged_num_ds_read_inst_a;
-            constexpr auto staged_num_mfma_per_buffer_load_b =
-                staged_num_mfma / num_buffer_load_inst_b;
-            // B global
-            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
-                ignore = i_inst;
-
-                static_for<0, staged_num_buffer_load_b_per_ds_read_a, 1>{}([&](auto ibuf_inst) {
-                    ignore = ibuf_inst;
-                    __builtin_amdgcn_sched_group_barrier(
-                        0x008, staged_num_mfma_per_buffer_load_b, 0);  // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                });
-
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                __builtin_amdgcn_sched_group_barrier(
-                    0x008, staged_num_mfma_per_buffer_load_b - 1, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
-            });
-
-            __builtin_amdgcn_sched_barrier(0);
-        }
-        else if constexpr(stage.value == 1)
-        {
-#if 0
-            constexpr auto staged_num_ds_write_a_per_ds_read_a =
-                num_ds_write_inst_a / staged_num_ds_read_inst_a;
-            constexpr auto staged_num_mfma_per_ds_write_a = staged_num_mfma / num_ds_write_inst_a;
-            // A local write
-            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
-                ignore = i_inst;
-
-                static_for<0, staged_num_ds_write_a_per_ds_read_a, 1>{}([&](auto idswrite_inst) {
-                    ignore = idswrite_inst;
-                    __builtin_amdgcn_sched_group_barrier(
-                        0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
-                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                });
-
-                __builtin_amdgcn_sched_group_barrier(
-                    0x008, staged_num_ds_write_a_per_ds_read_a, 0); // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);  // DS read
-            });
-#elif 1
-            constexpr auto staged_num_mfma_per_ds_write_a =
-                math::integer_divide_ceil(staged_num_mfma, num_ds_write_inst_a);
-
-            constexpr auto stage_more_mfma =
-                staged_num_mfma - (staged_num_mfma_per_ds_write_a - 1) * num_ds_write_inst_a;
-
-            // A local write
-            static_for<0, num_ds_write_inst_a, 1>{}([&](auto i_inst) {
-                if constexpr(i_inst.value < stage_more_mfma)
-                {
-                    if(i_inst.value < staged_num_ds_read_inst_a)
-                    {
-                        __builtin_amdgcn_sched_group_barrier(
-                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    }
-                    else
-                    {
-                        __builtin_amdgcn_sched_group_barrier(
-                            0x008, staged_num_mfma_per_ds_write_a, 0);     // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                    }
-                }
-                else
-                {
-                    if(i_inst.value < staged_num_ds_read_inst_a)
-                    {
-                        __builtin_amdgcn_sched_group_barrier(
-                            0x008, staged_num_mfma_per_ds_write_a - 2, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-                    }
-                    else
-                    {
-                        __builtin_amdgcn_sched_group_barrier(
-                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
-                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
-                    }
-                }
-            });
-#endif
-            __builtin_amdgcn_sched_barrier(0);
-        }
-        else
-        {
-            // A local Read
-            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
-                ignore = i_inst;
-                __builtin_amdgcn_sched_group_barrier(
-                    0x008, staged_num_mfma_per_ds_read_a, 0);      // MFMA
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            });
-
-            __builtin_amdgcn_sched_barrier(0);
-        }
-    }
-
-    __device__ static constexpr auto EpilogueScheduler_2()
-    {
-        constexpr auto num_ds_read_inst_a = HotLoopInstList::A_LDS_Read_Inst_Num;
-
-        constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num;
-
-        constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
-        constexpr auto staged_num_mfma           = num_mfma / MRepeat;
-
-        constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a;
-
-        // A local Read
-        static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
-            ignore = i_inst;
-            __builtin_amdgcn_sched_group_barrier(0x008, staged_num_mfma_per_ds_read_a, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
         });
-
-        __builtin_amdgcn_sched_barrier(0);
     }
 
     template <bool HasMainLoop,
@@ -777,16 +355,15 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
         // main body
         if constexpr(HasMainLoop)
         {
-            HotLoopScheduler scheduler;
             index_t i = 0;
             do
             {
                 auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) {
                     b_blockwise_copy.Run(b_grid_desc,
-                        b_grid_buf,
-                        b_block_desc_n0_n1_k0_k1,
-                        b_block_origin_idx,
-                        b_thread_bufs(local_read_buf));
+                                         b_grid_buf,
+                                         b_block_desc_n0_n1_k0_k1,
+                                         b_block_origin_idx,
+                                         b_thread_bufs(local_read_buf));
                     b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
 
                     a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(local_read_buf));
@@ -794,26 +371,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                     a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
 
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
-#if 0
-                        if constexpr(m0.value == scheduler.buffer_load_b_stage)
-                        {
-                            b_blockwise_copy.Run(b_grid_desc,
-                                                 b_grid_buf,
-                                                 b_block_desc_n0_n1_k0_k1,
-                                                 b_block_origin_idx,
-                                                 b_thread_bufs(local_read_buf));
-                            b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
-                        }
-                        else if constexpr(m0.value == scheduler.ds_write_a_stage)
-                        {
-                            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(local_read_buf));
-                        }
-                        else if constexpr(m0.value == scheduler.buffer_load_a_stage)
-                        {
-                            a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
-                            a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
-                        }
-#endif
                         static_for<0, KRepeat, 1>{}([&](auto k0) {
                             static_for<0, NRepeat, 1>{}([&](auto n0) {
                                 vector_type<ComputeDataType, KPack> a_thread_vec;
@@ -822,8 +379,13 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                 static_for<0, KPack, 1>{}([&](auto ik) {
                                     a_thread_vec.template AsType<ComputeDataType>()(ik) =
                                         a_thread_buf[Number<a_thread_desc_.CalculateOffset(
-                                            make_tuple((m0 + HotloopLocalBufSwitch * mfma_reg_buf) % 2,
-                                                       I0, I0, k0, I0, ik))>{}];
+                                            make_tuple((m0 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                                           2,
+                                                       I0,
+                                                       I0,
+                                                       k0,
+                                                       I0,
+                                                       ik))>{}];
                                     b_thread_vec.template AsType<ComputeDataType>()(ik) =
                                         b_thread_bufs[mfma_reg_buf]
                                                      [Number<b_thread_desc_.CalculateOffset(
@@ -852,13 +414,22 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                 static_for<0, KGroup, 1>{}([&](auto kg0) {
                                     a_thread_copy_.Run(
                                         a_block_desc_m0_m1_m2_k0_k1_k2,
-                                        make_tuple(Number<(m0 + 2) % MRepeat>{}, I0, I0,
-                                                   Number<k0 * 2 + kg0>{}, I0, I0),
+                                        make_tuple(Number<(m0 + 2) % MRepeat>{},
+                                                   I0,
+                                                   I0,
+                                                   Number<k0 * 2 + kg0>{},
+                                                   I0,
+                                                   I0),
                                         a_block_buf.At(local_read_buf),
                                         a_thread_desc_,
                                         make_tuple(
-                                            Number<(m0 + 2 + HotloopLocalBufSwitch * mfma_reg_buf) % 2>{},
-                                            I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                            Number<(m0 + 2 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                                   2>{},
+                                            I0,
+                                            I0,
+                                            k0,
+                                            I0,
+                                            Number<kg0 * A_K1>{}),
                                         a_thread_buf);
                                 });
                             });
@@ -915,9 +486,8 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                 });
                             });
                         }
-
                     });
-                    scheduler();
+                    HotLoopScheduler();
                 };
 
                 LoopFunc(I0, I1);
@@ -937,20 +507,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
             a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1));
 
             static_for<0, MRepeat, 1>{}([&](auto m0) {
-#if 0
-                if constexpr(m0.value == 0)
-                {
-                    b_blockwise_copy.Run(b_grid_desc,
-                                         b_grid_buf,
-                                         b_block_desc_n0_n1_k0_k1,
-                                         b_block_origin_idx,
-                                         b_thread_bufs(I1));
-                }
-                else if constexpr(m0.value == (MRepeat - 2))
-                {
-                    a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1));
-                }
-#endif
                 static_for<0, KRepeat, 1>{}([&](auto k0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
                         vector_type<ComputeDataType, KPack> a_thread_vec;
@@ -1041,6 +597,8 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                 }
             });
 
+            HotLoopScheduler();
+
             static_for<0, MRepeat, 1>{}([&](auto m0) {
                 static_for<0, KRepeat, 1>{}([&](auto k0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
@@ -1087,9 +645,10 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                 a_thread_buf);
                         });
                     });
-
                 }
             });
+
+            HotLoopScheduler();
             // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle
             // latency
         }
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
index 7b399c6daa..bf7c79e2f3 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
@@ -187,7 +187,7 @@ struct GridwiseMoeGemm
     static constexpr index_t KLane =
         mfma_selector::GetKPerXdlops() / mfma_selector::GetK1PerXdlops();
 
-    static constexpr index_t KGroup = mfma_selector::selected_mfma.k_per_blk == 32 ? 2 : 1;
+    static constexpr index_t KGroup  = mfma_selector::selected_mfma.k_per_blk == 32 ? 2 : 1;
     static constexpr index_t KRepeat = KPerBlock / KLane / (KPack / KGroup);
     static constexpr index_t NLane   = NPerXdl;
     static constexpr index_t NWave   = NPerBlock / NPerXdl / NXdlPerWave;

From bc9c819aa41c97b2e49cf8534e1646bb675ee24d Mon Sep 17 00:00:00 2001
From: aska-0096 <haocwang@amd.com>
Date: Sun, 27 Apr 2025 11:50:30 +0000
Subject: [PATCH 8/8] generalized bpreshuffle pipeline optimization

---
 .../moe_gemm2_xdl_fp8.cpp                     |   1 +
 ...e_gemm_pipeline_xdlops_b_preshuffle_v3.hpp | 158 ++++++++++++++++--
 2 files changed, 148 insertions(+), 11 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
index f7935f9d65..6dbee579c2 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
@@ -132,6 +132,7 @@ static constexpr ck::index_t CShuffleMLane = BLOCKSIZE / CShuffleNLane;
 static constexpr ck::index_t AK1           = 16 / sizeof(A0DataType);
 static constexpr ck::index_t BK1           = 16 / sizeof(B0DataType);
 static constexpr ck::index_t EVec          = 2;
+// TODO: Epilogue performance issue. AtomicAdd loses 15~20% performance compared with Set.
 static constexpr ck::index_t D0Vec         = 1;
 static constexpr ck::index_t D1Vec         = 1;
 static constexpr ck::index_t D2Vec         = 1;
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
index 2f1616a866..6af20af484 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
@@ -187,6 +187,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
 
     __device__ static constexpr auto HotLoopScheduler()
     {
+#if 0
         // A/B split schedule
         // compiler is likely to use ds_read2 when instruction width smaller than 16bytes
         constexpr auto num_ds_read_inst_a =
@@ -237,6 +238,10 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
         constexpr auto ds_write_issue_point_stage2 = num_mfma_per_issue_less >= 3 ? 1 : 0;
 
         static_for<0, num_mfma_inst, 1>{}([&](auto i) {
+            constexpr auto current_buffer_load_issue =
+                i < num_stage1_mfma
+                    ? (i / num_mfma_per_issue_more)
+                    : (num_stage1_bufferloads + (i - num_stage1_mfma) / num_mfma_per_issue_less);
             __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
 
             // Group num_mfma_perstage num_ds_read_a_perstage
@@ -258,21 +263,152 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
             }
 
-            // Hide B lds wr issue latency
-            if constexpr((((i < num_stage1_mfma) &&
-                           (i % num_mfma_per_issue_more == ds_write_issue_point_stage1)) ||
-                          ((i >= num_stage1_mfma) &&
-                           ((i - num_stage1_mfma) % num_mfma_per_issue_less ==
-                            ds_write_issue_point_stage2))) &&
-                         (((i < num_stage1_mfma) &&
-                           ((i / num_mfma_per_issue_more) < num_ds_write_inst_a)) ||
-                          ((i >= num_stage1_mfma) &&
-                           ((i - num_stage1_mfma) / num_mfma_per_issue_less +
-                            num_stage1_bufferloads) < num_ds_write_inst_a)))
+            // Hide A lds wr issue latency
+            if constexpr((current_buffer_load_issue >= num_buffer_load_inst_b) &&
+                         ((((i < num_stage1_mfma) &&
+                            (i % num_mfma_per_issue_more == ds_write_issue_point_stage1)) ||
+                           ((i >= num_stage1_mfma) &&
+                            ((i - num_stage1_mfma) % num_mfma_per_issue_less ==
+                             ds_write_issue_point_stage2))) &&
+                          (((i < num_stage1_mfma) &&
+                            ((i / num_mfma_per_issue_more - num_buffer_load_inst_b) < num_ds_write_inst_a)) ||
+                           ((i >= num_stage1_mfma) &&
+                            ((i - num_stage1_mfma) / num_mfma_per_issue_less +
+                             num_stage1_bufferloads - num_buffer_load_inst_b) < num_ds_write_inst_a))))
             {
                 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
             }
         });
+#elif 1
+        // A/B split schedule
+        // compiler is likely to use ds_read2 when the instruction width is smaller than 16 bytes
+        constexpr auto num_ds_read_inst_a =
+            HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16
+                ? HotLoopInstList::A_LDS_Read_Inst_Num
+                : HotLoopInstList::A_LDS_Read_Inst_Num / 2;
+
+        constexpr auto num_ds_write_inst_a = HotLoopInstList::A_LDS_Write_Inst_Num;
+
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
+
+        static_assert(num_buffer_load_inst_a == num_ds_write_inst_a);
+
+        constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num;
+        constexpr auto mfma_cycle    = HotLoopInstList::C_MFMA_Inst_Cycle;
+
+        constexpr auto ds_read_a_issue_cycle =
+            HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
+        constexpr auto ds_read_a_mfma_rate =
+            math::integer_divide_ceil(mfma_cycle - 4, 2 * ds_read_a_issue_cycle);
+
+        // constexpr auto num_dsread_a_mfma =
+        //     (num_ds_read_inst_a + ds_read_a_mfma_rate - 1) / ds_read_a_mfma_rate;
+
+        constexpr auto num_total_stages = MRepeat;
+
+        // Group num_mfma_perstage num_ds_read_a_perstage
+        // since we want to reuse a local register buffer
+        constexpr auto num_mfma_perstage      = num_mfma_inst / num_total_stages;
+        constexpr auto num_ds_read_a_perstage = num_ds_read_inst_a / num_total_stages;
+
+        constexpr auto num_ds_read_a_mfma_perstage =
+            math::integer_divide_ceil(num_ds_read_a_perstage, ds_read_a_mfma_rate);
+
+        constexpr auto num_ds_read_a_prefetch_stages = 2;
+
+        constexpr auto buffer_load_perstage_more = math::integer_divide_ceil(
+            (num_buffer_load_inst_a + num_buffer_load_inst_b), (num_total_stages - 2));
+        constexpr auto buffer_load_perstage_less = math::integer_divide_floor(
+            (num_buffer_load_inst_a + num_buffer_load_inst_b), (num_total_stages - 2));
+
+        constexpr auto buffer_load_stages_more =
+            (num_buffer_load_inst_a + num_buffer_load_inst_b) -
+            math::integer_divide_floor((num_buffer_load_inst_a + num_buffer_load_inst_b),
+                                       (num_total_stages - 2)) *
+                ((num_total_stages - 2));
+
+        constexpr auto buffer_load_b_stages =
+            buffer_load_perstage_more * buffer_load_stages_more > num_buffer_load_inst_b
+                ? num_buffer_load_inst_b / buffer_load_perstage_more
+                : (buffer_load_stages_more +
+                   (num_buffer_load_inst_b - buffer_load_perstage_more * buffer_load_stages_more) /
+                       buffer_load_perstage_less);
+
+        constexpr auto buffer_load_a_stages =
+            num_total_stages - num_ds_read_a_prefetch_stages - buffer_load_b_stages;
+
+        constexpr auto buffer_load_issue_point_b = 0;
+        constexpr auto buffer_load_issue_point_interval_more =
+            num_mfma_perstage / buffer_load_perstage_more;
+        constexpr auto buffer_load_issue_point_interval_less =
+            num_mfma_perstage / buffer_load_perstage_less;
+        constexpr auto ds_write_issue_point      = 0;
+        constexpr auto buffer_load_issue_point_a = num_mfma_perstage >= 3 ? 1 : 0;
+
+        // B global read
+        static_for<0, buffer_load_b_stages, 1>{}([&](auto i) {
+            static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+
+                if constexpr(((i < buffer_load_stages_more) &&
+                              (imfma % buffer_load_issue_point_interval_more ==
+                               buffer_load_issue_point_b)) ||
+                             ((i >= buffer_load_stages_more) &&
+                              (imfma % buffer_load_issue_point_interval_less ==
+                               buffer_load_issue_point_b)))
+                {
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                }
+
+                if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
+                {
+                    __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
+                }
+            });
+        });
+
+        // A global read + A local write
+        static_for<0, buffer_load_a_stages, 1>{}([&](auto i) {
+            static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                if constexpr((((i + buffer_load_b_stages) < buffer_load_stages_more) &&
+                              (imfma % buffer_load_issue_point_interval_more ==
+                               ds_write_issue_point)) ||
+                             (((i + buffer_load_b_stages) >= buffer_load_stages_more) &&
+                              (imfma % buffer_load_issue_point_interval_less ==
+                               ds_write_issue_point)))
+                {
+                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                }
+                if constexpr((((i + buffer_load_b_stages) < buffer_load_stages_more) &&
+                              (imfma % buffer_load_issue_point_interval_more ==
+                               buffer_load_issue_point_a)) ||
+                             (((i + buffer_load_b_stages) >= buffer_load_stages_more) &&
+                              (imfma % buffer_load_issue_point_interval_less ==
+                               buffer_load_issue_point_a)))
+                {
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                }
+                if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
+                {
+                    __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
+                }
+            });
+        });
+
+        // lds synchronization, prefetch next loop local A
+        static_for<0, num_ds_read_a_prefetch_stages, 1>{}([&](auto i) {
+            ignore = i;
+            static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
+                {
+                    __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
+                }
+            });
+        });
+#endif
     }
 
     template <bool HasMainLoop,