From ceaa5a984b058bf67453ba5d58d4462879157a24 Mon Sep 17 00:00:00 2001
From: coderfeli <coderfeli@163.com>
Date: Thu, 24 Apr 2025 11:05:08 +0000
Subject: [PATCH 01/18] gu fusion v3

---
 .../moe_gemm1_xdl_fp8.cpp                     |   8 +-
 ..._pipeline_xdlops_b_preshuffle_selector.hpp |  69 ++--
 .../gpu/grid/gridwise_moe_gemm.hpp            | 301 +++++++++++++++---
 3 files changed, 313 insertions(+), 65 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
index 3b31460953..001a30370b 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
@@ -158,8 +158,8 @@ using BElementOp = PassThrough;
 
 static constexpr auto GemmSpec         = ck::tensor_operation::device::GemmSpecialization::Default;
 static constexpr ck::index_t MPerBlock = 128;
-static constexpr ck::index_t MXDLPerWave = 4;
-static constexpr ck::index_t NXDLPerWave = 2;
+static constexpr ck::index_t MXDLPerWave = 8;
+static constexpr ck::index_t NXDLPerWave = 1;
 static constexpr ck::index_t BLOCKSIZE   = 256;
 static constexpr ck::index_t NPerBlock   = 64;
 static constexpr ck::index_t MNPerXDL    = 16;
@@ -190,8 +190,8 @@ using DeviceOpInstance                   = ck::tensor_operation::device::DeviceM
                //    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
                //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
                 //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-                2,    2,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, ActOP, Nswizzle, true, MulRoutedWeight, true, int32_t, A0DataType>;
+                2,    1,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec>,
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, ActOP, Nswizzle, true, MulRoutedWeight, true, int32_t, A0DataType>;
 
 // clang-format on
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
index 074b5873ee..df59513a88 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
@@ -8,6 +8,7 @@
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp"
@@ -171,26 +172,54 @@ constexpr auto BlockGemmBPreshufflePipeline_Selector()
         static_assert(MRepeat >= 4, "MRepeat should at least be 4 in BlockGemmPipelineVersion::v3");
         if constexpr(std::is_same<ADataType, BDataType>::value)
         {
-            return BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlkGemmPipeSche,
-                                                               BlockSize,
-                                                               ADataType,
-                                                               BDataType,
-                                                               ComputeDataType,
-                                                               AccDataType,
-                                                               ATileDesc,
-                                                               BTileDesc,
-                                                               AMmaTileDesc,
-                                                               BMmaTileDesc,
-                                                               ABlockTransferSrcScalarPerVector,
-                                                               BBlockTransferSrcScalarPerVector,
-                                                               MPerBlock,
-                                                               NPerBlock,
-                                                               KPerBlock,
-                                                               MPerXDL,
-                                                               NPerXDL,
-                                                               MRepeat,
-                                                               NRepeat,
-                                                               KPack>{};
+            if constexpr(GUFusion)
+            {
+                return BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<
+                    BlkGemmPipeSche,
+                    BlockSize,
+                    ADataType,
+                    BDataType,
+                    ComputeDataType,
+                    AccDataType,
+                    ATileDesc,
+                    BTileDesc,
+                    AMmaTileDesc,
+                    BMmaTileDesc,
+                    ABlockTransferSrcScalarPerVector,
+                    BBlockTransferSrcScalarPerVector,
+                    MPerBlock,
+                    NPerBlock,
+                    KPerBlock,
+                    MPerXDL,
+                    NPerXDL,
+                    MRepeat,
+                    NRepeat,
+                    KPack>{};
+            }
+            else 
+            {
+                    
+                return BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlkGemmPipeSche,
+                                                                BlockSize,
+                                                                ADataType,
+                                                                BDataType,
+                                                                ComputeDataType,
+                                                                AccDataType,
+                                                                ATileDesc,
+                                                                BTileDesc,
+                                                                AMmaTileDesc,
+                                                                BMmaTileDesc,
+                                                                ABlockTransferSrcScalarPerVector,
+                                                                BBlockTransferSrcScalarPerVector,
+                                                                MPerBlock,
+                                                                NPerBlock,
+                                                                KPerBlock,
+                                                                MPerXDL,
+                                                                NPerXDL,
+                                                                MRepeat,
+                                                                NRepeat,
+                                                                KPack>{};
+            }
         }
         else
         {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
index 255fb8cff4..ead4399d81 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
@@ -1886,7 +1886,8 @@ struct GridwiseMoeGemm
         const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
             MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                 c_grid_desc_m_n, problem.MBlock, problem.NBlock);
-        const index_t max_token_id    = __builtin_amdgcn_readfirstlane(p_max_token_id[0]);
+        const index_t max_token_id = __builtin_amdgcn_readfirstlane(p_max_token_id[0]);
+        // static_assert(NSwizzle == false, "to do fix: need another pr in sorting merged");
         const index_t expert_block_id = NSwizzle ? blockIdx.x / problem.NBlock : blockIdx.y;
         if(expert_block_id * MPerBlock >= max_token_id)
             return;
@@ -1895,12 +1896,13 @@ struct GridwiseMoeGemm
         const auto block_mn = [&]() -> std::pair<int, int> {
             if constexpr(NSwizzle)
             {
-                const index_t ecnt_prefix    = p_max_token_id[1 + expert_id];
-                const index_t prefix_block   = ecnt_prefix * problem.NBlock;
-                const index_t ecnt           = p_max_token_id[2 + expert_id] - ecnt_prefix;
-                const index_t expert_swizzle = ecnt > 0 ? ecnt : 1;
-                const index_t bid_new        = blockIdx.x - prefix_block;
-                const index_t nid            = __builtin_amdgcn_readfirstlane(
+                const index_t ecnt_prefix  = p_max_token_id[1 + expert_id];
+                const index_t prefix_block = ecnt_prefix * problem.NBlock;
+                const index_t ecnt         = p_max_token_id[2 + expert_id] - ecnt_prefix;
+                const index_t expert_swizzle =
+                    ecnt > 0 ? ecnt : 1; // p_max_token_id[expert_id + 1]; // 2
+                const index_t bid_new = blockIdx.x - prefix_block;
+                const index_t nid     = __builtin_amdgcn_readfirstlane(
                     bid_new % 8 + bid_new / (8 * expert_swizzle) * 8);
                 const index_t mid =
                     __builtin_amdgcn_readfirstlane(ecnt_prefix + bid_new / 8 % expert_swizzle);
@@ -1911,9 +1913,9 @@ struct GridwiseMoeGemm
                 return {blockIdx.x, blockIdx.y};
             }
         }();
+
         const index_t block_n_id = block_mn.first;
         const index_t block_m_id = block_mn.second;
-
         const index_t token0 =
             __builtin_amdgcn_readfirstlane(p_sorted_token_ids[block_m_id * MPerBlock] & 0xffffff);
 
@@ -1925,11 +1927,9 @@ struct GridwiseMoeGemm
         constexpr auto AMRepeats  = MPerBlock / AMThreads;
         const index_t token_pos   = block_m_id * MPerBlock + threadIdx.x / AKThreads * AMRepeats;
 
-        if(token_pos >= max_token_id || expert_block_id * MPerBlock >= max_token_id ||
-           token0 >= problem.NumTokens)
+        if(token_pos >= max_token_id || token0 >= problem.NumTokens)
             return;
-        StaticallyIndexedArray<IndexType, AMRepeats>
-            gather_offsets; //= p_sorted_token_ids[token_pos];
+        StaticallyIndexedArray<IndexType, AMRepeats> gather_offsets;
         static_for<0, AMRepeats, 1>{}([&](auto m0) {
             const index_t fused_token = p_sorted_token_ids[token_pos + m0];
             index_t token_offset      = fused_token & 0xffffff;
@@ -1939,7 +1939,8 @@ struct GridwiseMoeGemm
             }
             gather_offsets(m0) = static_cast<IndexType>(token_offset) * problem.K;
         });
-        const index_t expert_stride = __builtin_amdgcn_readfirstlane(problem.N * problem.K);
+        const index_t expert_stride =
+            __builtin_amdgcn_readfirstlane(problem.N * problem.K * (IsInputGemm ? 2 : 1));
 
         // N0, K0, Blocksize*KPack
         const index_t n_block_data_idx_on_grid =
@@ -1950,7 +1951,6 @@ struct GridwiseMoeGemm
         const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_b_grid + expert_id * expert_stride / BPackedSize,
             b_grid_desc_bpreshuffled.GetElementSpaceSize());
-
         // A matrix in LDS memory, dst of blockwise copy
         constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
 
@@ -2029,24 +2029,75 @@ struct GridwiseMoeGemm
         static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
         auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
         auto c_thread_buf            = blockwise_gemm_pipeline.GetCThreadBuffer();
+        decltype(c_thread_buf) c_thread_buf_up;
+
+        StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
+                                  float,
+                                  c_thread_buf.num_of_v_,
+                                  c_thread_buf.s_per_v,
+                                  true>
+            c_thread_buf_fp32;
 
         const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
             (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
             KPerBlock);
 
-        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(a_grid_desc_ak0_m_ak1,
-                                                                         a_block_desc_ak0_m_ak1,
-                                                                         a_blockwise_copy,
-                                                                         a_grid_buf,
-                                                                         a_block_bufs,
-                                                                         a_block_slice_copy_step,
-                                                                         b_grid_desc_bpreshuffled,
-                                                                         b_blockwise_copy,
-                                                                         b_grid_buf,
-                                                                         b_block_bufs,
-                                                                         b_block_slice_copy_step,
-                                                                         c_thread_buf,
-                                                                         num_k_block_main_loop);
+        if constexpr(IsInputGemm)
+        {
+            const BDataType* p_b_grid_up = p_b_grid + expert_stride / 2 / BPackedSize;
+            const auto b_grid_buf_up     = make_dynamic_buffer<AddressSpaceEnum::Global>(
+                p_b_grid_up + expert_id * expert_stride / BPackedSize,
+                b_grid_desc_bpreshuffled.GetElementSpaceSize());
+            auto b_blockwise_copy_up = ThreadwiseTensorSliceTransfer_v2<
+                BDataType,
+                BDataType,
+                decltype(b_grid_desc_bpreshuffled),
+                decltype(b_block_desc_bk0_n_bk1),
+                Sequence<Number<NXdlPerWave>{}, I1, Number<KRepeat>{}, Number<BK1Value>{}>,
+                Sequence<1, 2, 0, 3>,
+                3,
+                BBlockTransferSrcScalarPerVector,
+                BThreadTransferSrcResetCoordinateAfterRun,
+                true>(b_grid_desc_bpreshuffled,
+                      make_multi_index(n_block_data_idx_on_grid,
+                                       get_warp_local_1d_id() % NWave,
+                                       0,
+                                       KPack * (get_thread_local_1d_id() % warpSize)));
+            blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
+                a_grid_desc_ak0_m_ak1,
+                a_block_desc_ak0_m_ak1,
+                a_blockwise_copy,
+                a_grid_buf,
+                a_block_bufs,
+                a_block_slice_copy_step,
+                b_grid_desc_bpreshuffled,
+                b_blockwise_copy,
+                b_blockwise_copy_up,
+                b_grid_buf,
+                b_grid_buf_up,
+                b_block_bufs,
+                b_block_slice_copy_step,
+                c_thread_buf,
+                c_thread_buf_up,
+                num_k_block_main_loop);
+        }
+	else 
+	{
+
+	        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(a_grid_desc_ak0_m_ak1,
+	                                                                         a_block_desc_ak0_m_ak1,
+	                                                                         a_blockwise_copy,
+	                                                                         a_grid_buf,
+	                                                                         a_block_bufs,
+	                                                                         a_block_slice_copy_step,
+	                                                                         b_grid_desc_bpreshuffled,
+	                                                                         b_blockwise_copy,
+	                                                                         b_grid_buf,
+	                                                                         b_block_bufs,
+	                                                                         b_block_slice_copy_step,
+	                                                                         c_thread_buf,
+	                                                                         num_k_block_main_loop);
+										 }
 
         // shuffle C and write out
         {
@@ -2074,6 +2125,185 @@ struct GridwiseMoeGemm
             constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
             constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
 
+            // mul scales
+            const float* p_sorted_weights_0 = p_ds_grid[I0];
+            const float* p_scale_b          = p_ds_grid[I1];
+
+            static_assert(M0 * M1 * M2 * M3 * M4 == MPerBlock);
+            static_assert(M4 == 4);
+            const index_t m1 = get_warp_local_1d_id() / NWave;
+            const index_t m3 = threadIdx.x % get_warp_size() / MPerXdl;
+
+            if(p_sorted_weights_0 != nullptr && p_scale_b != nullptr)
+            {
+                if constexpr(PerTokenQuant)
+                {
+                    constexpr index_t scale_stride = (IsInputGemm ? 2 : 1);
+                    p_scale_b += expert_id * problem.N * scale_stride + block_n_id * NPerBlock +
+                                 get_warp_local_1d_id() % NWave * NPerXdl + threadIdx.x % NPerXdl;
+                }
+                else
+                {
+                    p_scale_b += expert_id;
+                }
+
+                vector_type<int32_t, 4> scale_token_ids;
+                vector_type<float, 4> topk_weights;
+                static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
+                    const float scale_b = p_scale_b[n0 * NWave * NPerXdl * PerTokenQuant];
+                    static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
+                        static_for<0, M2, 1>{}([&](auto m2) {      // m_inst_num_groups_per_blk
+                            const index_t m_pos = block_m_id * MPerBlock + m0 * M1 * M2 * M3 * M4 +
+                                                  m1 * M2 * M3 * M4 + m2 * M3 * M4 + m3 * M4;
+                            if constexpr(PerTokenQuant)
+                            {
+                                scale_token_ids =
+                                    *c_style_pointer_cast<const vector_type<int32_t, M4>*>(
+                                        p_sorted_token_ids + m_pos);
+                            }
+                            if constexpr(MulRoutedWeight)
+                            {
+                                topk_weights = *c_style_pointer_cast<const vector_type<float, M4>*>(
+                                    p_ds_grid[I2] + m_pos);
+                            }
+                            static_for<0, M4, 1>{}([&](auto m4) { // m_inst_group_size
+                                float scale_a = [&]() {
+                                    if constexpr(PerTokenQuant)
+                                    {
+                                        index_t fused_token = scale_token_ids.AsType<index_t>()[m4];
+                                        const index_t token_offset = fused_token & 0xffffff;
+                                        return token_offset < problem.NumTokens
+                                                   ? p_sorted_weights_0[token_offset]
+                                                   : 0.0;
+                                    }
+                                    else
+                                    {
+                                        return p_sorted_weights_0[0];
+                                    }
+                                }();
+                                constexpr index_t c_offset =
+                                    blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset(
+                                        make_tuple(m0, n0, m2 * M4 + m4));
+                                constexpr auto cidx = Number<c_offset>{};
+                                if constexpr(IsInputGemm) // gu fusion
+                                {
+                                    if constexpr(ActivationOperation == Activation::silu_and_mul)
+                                    {
+                                        const float scale_up =
+                                            p_scale_b[(n0 * NWave * NPerXdl + problem.N) *
+                                                      PerTokenQuant];
+                                        float gate = scale_a * scale_b * c_thread_buf[cidx];
+                                        float up   = scale_a * scale_up * c_thread_buf_up[cidx];
+                                        if constexpr(MulRoutedWeight)
+                                        {
+                                            gate = gate * topk_weights.AsType<float>()[m4];
+                                            up   = up * topk_weights.AsType<float>()[m4];
+                                        }
+                                        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+                                        {
+                                            gate *= 16;
+                                            up *= 16;
+                                        }
+                                        tensor_operation::element_wise::Silu{}(gate, gate);
+                                        c_thread_buf_fp32(cidx) = gate * up;
+                                    }
+                                    else if(ActivationOperation == Activation::gelu_and_mul)
+                                    {
+                                        const float scale_up =
+                                            p_scale_b[(n0 * NWave * NPerXdl + problem.N) *
+                                                      PerTokenQuant];
+                                        float gate = scale_a * scale_b * c_thread_buf[cidx];
+                                        float up   = scale_a * scale_up * c_thread_buf_up[cidx];
+                                        if constexpr(MulRoutedWeight)
+                                        {
+                                            gate = gate * topk_weights.AsType<float>()[m4];
+                                            up   = up * topk_weights.AsType<float>()[m4];
+                                        }
+                                        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+                                        {
+                                            gate *= 16;
+                                            up *= 16;
+                                        }
+                                        tensor_operation::element_wise::Gelu{}(gate, gate);
+                                        c_thread_buf_fp32(cidx) = gate * up;
+                                    }
+                                }
+                                else
+                                {
+                                    c_thread_buf_fp32(cidx) =
+                                        scale_a * scale_b * c_thread_buf[cidx];
+                                    if constexpr(MulRoutedWeight)
+                                    {
+                                        c_thread_buf_fp32(cidx) = c_thread_buf_fp32(cidx) *
+                                                                  topk_weights.AsType<float>()[m4];
+                                    }
+                                }
+                            });
+                        });
+                    });
+                });
+            }
+            else
+            {
+                vector_type<float, 4> topk_weights; // for gemm2 only
+                static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
+                    static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
+                        static_for<0, M2, 1>{}([&](auto m2) {      // m_inst_num_groups_per_blk
+                            const index_t m_pos = block_m_id * MPerBlock + m0 * M1 * M2 * M3 * M4 +
+                                                  m1 * M2 * M3 * M4 + m2 * M3 * M4 + m3 * M4;
+                            if constexpr(MulRoutedWeight)
+                            {
+                                topk_weights = *c_style_pointer_cast<const vector_type<float, M4>*>(
+                                    p_ds_grid[I2] + m_pos);
+                            }
+                            static_for<0, M4, 1>{}([&](auto m4) { // m_inst_group_size
+                                constexpr index_t c_offset =
+                                    blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset(
+                                        make_tuple(m0, n0, m2 * M4 + m4));
+                                constexpr auto cidx = Number<c_offset>{};
+
+                                if constexpr(IsInputGemm) // gu fusion
+                                {
+                                    if constexpr(ActivationOperation == Activation::silu_and_mul)
+                                    {
+                                        float gate = c_thread_buf[cidx];
+                                        float up   = c_thread_buf_up[cidx];
+                                        if constexpr(MulRoutedWeight)
+                                        {
+                                            gate = gate * topk_weights.AsType<float>()[m4];
+                                            up   = up * topk_weights.AsType<float>()[m4];
+                                        }
+                                        tensor_operation::element_wise::Silu{}(gate, gate);
+                                        c_thread_buf_fp32(cidx) = gate * up;
+                                    }
+                                    else if(ActivationOperation == Activation::gelu_and_mul)
+                                    {
+                                        float gate = c_thread_buf[cidx];
+                                        float up   = c_thread_buf_up[cidx];
+                                        if constexpr(MulRoutedWeight)
+                                        {
+                                            gate = gate * topk_weights.AsType<float>()[m4];
+                                            up   = up * topk_weights.AsType<float>()[m4];
+                                        }
+                                        tensor_operation::element_wise::Gelu{}(gate, gate);
+                                        c_thread_buf_fp32(cidx) = gate * up;
+                                    }
+                                }
+                                else
+                                {
+                                    c_thread_buf_fp32(cidx) = c_thread_buf[cidx];
+                                    if constexpr(MulRoutedWeight)
+                                    {
+                                        c_thread_buf_fp32(cidx) = topk_weights.AsType<float>()[m4] *
+                                                                  c_thread_buf_fp32[cidx];
+                                    }
+                                }
+                            });
+                        });
+                    });
+                });
+            }
+
             constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
                 GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
 
@@ -2171,18 +2401,8 @@ struct GridwiseMoeGemm
 
             const auto ds_grid_buf = generate_tuple(
                 [&](auto i) {
-                    using DDataType       = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
-                    const DDataType* ptr_ = p_ds_grid[i];
-                    // hack logic here to support different kind of strides. todo fix it.
-                    // ascale t, 1; bscale E, N, 1, move ptr to E
-                    // if(i.value == 1)
-                    // {
-                    //     ptr_ +=
-                    //         expert_id * (problem.StrideDs[1] ? problem.StrideDs[1] * problem.N :
-                    //         1);
-                    // }
                     return make_dynamic_buffer<AddressSpaceEnum::Global>(
-                        ptr_, ds_grid_desc_m_n[i].GetElementSpaceSize());
+                        p_ds_grid[i], ds_grid_desc_m_n[i].GetElementSpaceSize());
                 },
                 Number<NumDTensor>{});
 
@@ -2258,7 +2478,6 @@ struct GridwiseMoeGemm
 
             auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                 p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
-            // space filling curve for threadwise C in VGPR
             constexpr auto sfc_c_vgpr =
                 SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
@@ -2297,7 +2516,7 @@ struct GridwiseMoeGemm
                     block_m_id * MPerBlock + threadIdx.x / ENThreads * EMRepeats + dstidx(I1);
                 static_for<0, EMRepeats, 1>{}([&](auto m0) {
                     const index_t fused_token = p_sorted_token_ids[c_token_pos + m0];
-                    index_t token_offset      = fused_token & 0xffffff;
+                    IndexType token_offset    = fused_token & 0xffffff;
                     if constexpr(IsInputGemm)
                     {
                         token_offset = token_offset * problem.TopK + (fused_token >> 24);
@@ -2310,7 +2529,7 @@ struct GridwiseMoeGemm
                 // each thread write its data from VGPR to LDS
                 c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
                                               sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
-                                              c_thread_buf,
+                                              c_thread_buf_fp32,
                                               c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
                                               c_shuffle_block_buf);
 

From c3c4a1e25273a850b7dd16edb6f759df450a46d2 Mon Sep 17 00:00:00 2001
From: coderfeli <coderfeli@163.com>
Date: Thu, 24 Apr 2025 11:09:41 +0000
Subject: [PATCH 02/18] change test

---
 .../65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp    | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
index 001a30370b..dc56a044b3 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
@@ -205,9 +205,9 @@ int main(int argc, char* argv[])
     ck::index_t N               = 4096;
     ck::index_t K               = 6144;
     ck::index_t experts         = 8;
-    ck::index_t sorted_tile_num = 16;
-    ck::index_t valid_tile_num  = 13;
-    ck::index_t tokens          = 64;
+    ck::index_t sorted_tile_num = 133;
+    ck::index_t valid_tile_num  = 128;
+    ck::index_t tokens          = 8192;
     ck::index_t topk            = 2;
 
     if(argc == 1)
@@ -263,10 +263,10 @@ int main(int argc, char* argv[])
     Tensor<ck::index_t> sorted_token_ids(HostTensorDescriptor({sorted_size}, {1}));
     Tensor<ck::index_t> max_token_id(HostTensorDescriptor({1 + sorted_tile_num}));
     max_token_id.mData = {valid_size};
-    int eids[]         = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 3, 3, 3};
+    // int eids[]         = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 3, 3, 3};
     for(int i = 0; i < sorted_tile_num; i++)
     {
-        expert_ids.mData[i] = eids[i];
+        expert_ids.mData[i] = i / (valid_tile_num / experts);
     }
     int token_per_tile = (tokens * topk + valid_tile_num - 1) / valid_tile_num;
     int tokenid        = 0;

From ddb5f36eebe3e4ed0b2286815e62bf6e6a72615f Mon Sep 17 00:00:00 2001
From: coderfeli <coderfeli@163.com>
Date: Thu, 24 Apr 2025 11:10:31 +0000
Subject: [PATCH 03/18] add missing file

---
 ...peline_xdlops_b_preshuffle_gufusion_v3.hpp | 919 ++++++++++++++++++
 1 file changed, 919 insertions(+)
 create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp

diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
new file mode 100644
index 0000000000..60be7e2d07
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
@@ -0,0 +1,919 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Compute optimized pipeline
+// GlobalPrefetchStages: 2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 1
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPacks>
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineScheduler::Intrawave,
+                                                   BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>
+    : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                        ADataType,
+                                        BDataType,
+                                        ComputeDataType,
+                                        AccDataType,
+                                        ATileDesc,
+                                        BTileDesc,
+                                        AMmaTileDesc,
+                                        BMmaTileDesc,
+                                        ABlockTransferSrcScalarPerVector,
+                                        BBlockTransferSrcScalarPerVector,
+                                        MPerBlock,
+                                        NPerBlock,
+                                        KPerBlock,
+                                        MPerXDL,
+                                        NPerXDL,
+                                        MRepeat,
+                                        NRepeat,
+                                        KPack>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>;
+    using Base::A_K1;
+    using Base::B_K1;
+    using Base::I0;
+    using Base::I1;
+    using Base::I2;
+    using Base::KRepeat;
+    using Base::xdlops_gemm;
+    using typename Base::HotLoopInstList;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::AMmaKStride;
+    using Base::BMmaKStride;
+    using Base::c_thread_desc_;
+
+    using Base::MWaves;
+
+    static constexpr index_t PrefetchStages        = 2;
+    static constexpr index_t PrefillStages         = 1;
+    static constexpr index_t GlobalBufferNum       = 1;
+    static constexpr index_t HotloopLocalBufSwitch = MRepeat % 2 == 0 ? 0 : 1;
+
+    template <typename TileDesc_M0_M1_M2_K>
+    __host__ __device__ static constexpr auto MakeAGemmMmaTileDescriptor(const TileDesc_M0_M1_M2_K&)
+    {
+        constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
+        constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
+        constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
+        constexpr index_t K2 = KPack;
+        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K0 = KRepeat;
+
+        return transform_tensor_descriptor(
+            TileDesc_M0_M1_M2_K{},
+            make_tuple(
+                make_pass_through_transform(Number<M0>{}),
+                make_pass_through_transform(Number<M1>{}),
+                make_pass_through_transform(Number<M2>{}),
+                make_unmerge_transform(make_tuple(Number<K0>{}, Number<K1>{}, Number<K2>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}));
+    }
+
+    static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2 =
+        MakeAGemmMmaTileDescriptor(a_block_desc_m0_m1_m2_k);
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd;
+    }
+
+    template <typename Stage>
+    __device__ static constexpr auto HotLoopScheduler(Stage stage)
+    {
+        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
+        constexpr auto num_ds_write_inst_a    = HotLoopInstList::A_LDS_Write_Inst_Num;
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = MWaves * HotLoopInstList::B_Buffer_Load_Inst_Num * 2;
+
+        constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num * 2;
+
+        constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
+        constexpr auto staged_num_mfma           = num_mfma / MRepeat;
+
+        constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a;
+
+        if constexpr(stage.value == 0)
+        {
+            constexpr auto staged_num_buffer_load_b_per_ds_read_a =
+                num_buffer_load_inst_b / staged_num_ds_read_inst_a;
+            constexpr auto staged_num_mfma_per_buffer_load_b =
+                staged_num_mfma / num_buffer_load_inst_b;
+            // B global
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+
+                static_for<0, staged_num_buffer_load_b_per_ds_read_a - 1, 1>{}([&](auto ibuf_inst) {
+                    ignore = ibuf_inst;
+                    __builtin_amdgcn_sched_group_barrier(
+                        0x008, staged_num_mfma_per_buffer_load_b, 0);  // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                });
+
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_mfma_per_buffer_load_b - 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else if constexpr(stage.value == 1)
+        {
+            constexpr auto staged_num_mfma_per_ds_write_a =
+                math::integer_divide_ceil(staged_num_mfma, num_ds_write_inst_a);
+
+            constexpr auto stage_more_mfma =
+                staged_num_mfma - (staged_num_mfma_per_ds_write_a - 1) * num_ds_write_inst_a;
+
+            // A local write
+            static_for<0, num_ds_write_inst_a, 1>{}([&](auto i_inst) {
+                if constexpr(i_inst.value < stage_more_mfma)
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a, 0);     // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                    }
+                }
+                else
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 2, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                    }
+                }
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else if constexpr(stage.value == 2)
+        {
+            constexpr auto staged_num_mfma_per_buffer_load_a =
+                math::integer_divide_ceil(staged_num_mfma, num_buffer_load_inst_a);
+
+            constexpr auto stage_more_mfma =
+                staged_num_mfma - (staged_num_mfma_per_buffer_load_a - 1) * num_buffer_load_inst_a;
+
+            // A global
+            static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i_inst) {
+                if constexpr(i_inst.value < stage_more_mfma)
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_buffer_load_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);    // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);    // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_buffer_load_a, 0);  // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                    }
+                }
+                else
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_buffer_load_a - 2, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);    // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);    // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_buffer_load_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+                    }
+                }
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else
+        {
+            // A local Read
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_mfma_per_ds_read_a, 0);      // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+    }
+
+    template <typename Stage>
+    __device__ static constexpr auto EpilogueScheduler_1(Stage stage)
+    {
+        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
+        constexpr auto num_ds_write_inst_a    = HotLoopInstList::A_LDS_Write_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = MWaves * HotLoopInstList::B_Buffer_Load_Inst_Num * 2;
+
+        constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num * 2;
+
+        constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
+        constexpr auto staged_num_mfma           = num_mfma / MRepeat;
+
+        constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a;
+
+        if constexpr(stage.value == 0)
+        {
+            constexpr auto staged_num_buffer_load_b_per_ds_read_a =
+                num_buffer_load_inst_b / staged_num_ds_read_inst_a;
+            constexpr auto staged_num_mfma_per_buffer_load_b =
+                staged_num_mfma / num_buffer_load_inst_b;
+            // B global
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+
+                static_for<0, staged_num_buffer_load_b_per_ds_read_a, 1>{}([&](auto ibuf_inst) {
+                    ignore = ibuf_inst;
+                    __builtin_amdgcn_sched_group_barrier(
+                        0x008, staged_num_mfma_per_buffer_load_b, 0);  // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                });
+
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_mfma_per_buffer_load_b - 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else if constexpr(stage.value == 1)
+        {
+#if 0
+            constexpr auto staged_num_ds_write_a_per_ds_read_a =
+                num_ds_write_inst_a / staged_num_ds_read_inst_a;
+            constexpr auto staged_num_mfma_per_ds_write_a = staged_num_mfma / num_ds_write_inst_a;
+            // A local write
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+
+                static_for<0, staged_num_ds_write_a_per_ds_read_a, 1>{}([&](auto idswrite_inst) {
+                    ignore = idswrite_inst;
+                    __builtin_amdgcn_sched_group_barrier(
+                        0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                });
+
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_ds_write_a_per_ds_read_a, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);  // DS read
+            });
+#elif 1
+            constexpr auto staged_num_mfma_per_ds_write_a =
+                math::integer_divide_ceil(staged_num_mfma, num_ds_write_inst_a);
+
+            constexpr auto stage_more_mfma =
+                staged_num_mfma - (staged_num_mfma_per_ds_write_a - 1) * num_ds_write_inst_a;
+
+            // A local write
+            static_for<0, num_ds_write_inst_a, 1>{}([&](auto i_inst) {
+                if constexpr(i_inst.value < stage_more_mfma)
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a, 0);     // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                    }
+                }
+                else
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 2, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                    }
+                }
+            });
+#endif
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else
+        {
+            // A local Read
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_mfma_per_ds_read_a, 0);      // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+    }
+
+    __device__ static constexpr auto EpilogueScheduler_2()
+    {
+        constexpr auto num_ds_read_inst_a = HotLoopInstList::A_LDS_Read_Inst_Num;
+
+        constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num * 2;
+
+        constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
+        constexpr auto staged_num_mfma           = num_mfma / MRepeat;
+
+        constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a;
+
+        // A local Read
+        static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+            ignore = i_inst;
+            __builtin_amdgcn_sched_group_barrier(0x008, staged_num_mfma_per_ds_read_a, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+        });
+
+        __builtin_amdgcn_sched_barrier(0);
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer>
+    __device__ void Run(const AGridDesc& a_grid_desc,
+                        const ABlockDesc& a_block_desc,
+                        ABlockTransfer& a_blockwise_copy,
+                        const AGridBuffer& a_grid_buf,
+                        ABlockBuffer& a_block_buf,
+                        const ABlockTransferStep& a_block_copy_step,
+                        const BGridDesc& b_grid_desc,
+                        BBlockTransfer& b_blockwise_copy,
+                        BBlockTransfer& b_blockwise_copy_up,
+                        const BGridBuffer& b_grid_buf,
+                        const BGridBuffer& b_grid_buf_up,
+                        BBlockBuffer& b_block_buf,
+                        const BBlockTransferStep& b_block_copy_step,
+                        CThreadBuffer& c_thread_buf,
+                        CThreadBuffer& c_thread_buf_up,
+                        index_t num_loop) const
+    {
+        ignore = b_block_buf;
+        __builtin_amdgcn_sched_barrier(0);
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs_up;
+        constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0);
+
+        // Global prefetch A1 B1
+        b_blockwise_copy.Run(b_grid_desc,
+                             b_grid_buf,
+                             b_block_desc_n0_n1_k0_k1,
+                             b_block_origin_idx,
+                             b_thread_bufs(I0));
+                             
+        b_blockwise_copy_up.Run(b_grid_desc,
+                             b_grid_buf_up,
+                             b_block_desc_n0_n1_k0_k1,
+                             b_block_origin_idx,
+                             b_thread_bufs_up(I0));
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+        b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        __builtin_amdgcn_sched_barrier(0);
+
+        // // Local prefill A1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I0));
+
+        // // Global prefetch A2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+        // Local prefetch A1
+        block_sync_lds();
+        static_for<0, KRepeat, 1>{}([&](auto k0) {
+            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                               make_tuple(I0, I0, I0, k0, I0, I0),
+                               a_block_buf.At(I0),
+                               a_thread_desc_,
+                               make_tuple(I0, I0, I0, k0, I0, I0),
+                               a_thread_buf);
+        });
+
+        // Initialize C
+        c_thread_buf.Clear();
+        c_thread_buf_up.Clear();
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) {
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        if constexpr(m0.value == 0)
+                        {
+                            b_blockwise_copy.Run(b_grid_desc,
+                                                 b_grid_buf,
+                                                 b_block_desc_n0_n1_k0_k1,
+                                                 b_block_origin_idx,
+                                                 b_thread_bufs(local_read_buf));
+                            b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+                            b_blockwise_copy_up.Run(b_grid_desc,
+                                                 b_grid_buf_up,
+                                                 b_block_desc_n0_n1_k0_k1,
+                                                 b_block_origin_idx,
+                                                 b_thread_bufs_up(local_read_buf));
+                            b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+                        }
+                        else if constexpr(m0.value == 1)
+                        {
+                            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(local_read_buf));
+                        }
+                        else if constexpr(m0.value == 2)
+                        {
+                            a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+                            a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                        }
+
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                                vector_type<ComputeDataType, KPack> a_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                                static_for<0, KPack, 1>{}([&](auto ik) {
+                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                            make_tuple((m0 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                                           2,
+                                                       I0,
+                                                       I0,
+                                                       k0,
+                                                       I0,
+                                                       ik))>{}];
+                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_bufs[mfma_reg_buf]
+                                                     [Number<b_thread_desc_.CalculateOffset(
+                                                         make_tuple(n0, I0, k0, ik))>{}];
+                                                         
+                                    b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_bufs_up[mfma_reg_buf]
+                                                     [Number<b_thread_desc_.CalculateOffset(
+                                                         make_tuple(n0, I0, k0, ik))>{}];
+                                });
+
+                                using mfma_input_type =
+                                    typename vector_type<ComputeDataType,
+                                                         xdlops_gemm.K1PerXdlops>::type;
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                                xdlops_gemm.Run(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                                    
+                                xdlops_gemm.Run(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec_up.template AsType<mfma_input_type>(),
+                                    c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{}));
+                            });
+                        });
+
+                        if constexpr(m0.value == MRepeat - 1)
+                        {
+                            block_sync_lds();
+
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                a_thread_copy_.Run(
+                                    a_block_desc_m0_m1_m2_k0_k1_k2,
+                                    make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
+                                    a_block_buf.At(local_read_buf),
+                                    a_thread_desc_,
+                                    make_tuple(
+                                        Number<(m0 + 1 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                               2>{},
+                                        I0,
+                                        I0,
+                                        k0,
+                                        I0,
+                                        I0),
+                                    a_thread_buf);
+                            });
+                        }
+                        else
+                        {
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                a_thread_copy_.Run(
+                                    a_block_desc_m0_m1_m2_k0_k1_k2,
+                                    make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
+                                    a_block_buf.At(mfma_reg_buf),
+                                    a_thread_desc_,
+                                    make_tuple(
+                                        Number<(m0 + 1 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                               2>{},
+                                        I0,
+                                        I0,
+                                        k0,
+                                        I0,
+                                        I0),
+                                    a_thread_buf);
+                            });
+                        }
+
+                        HotLoopScheduler(m0);
+                    });
+                };
+
+                LoopFunc(I0, I1);
+                LoopFunc(I1, I0);
+
+                i += 2;
+            } while(i < (num_loop - 2));
+        }
+        // tail
+        if constexpr(TailNum == TailNumber::Even)
+        {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                if constexpr(m0.value == 0)
+                {
+                    b_blockwise_copy.Run(b_grid_desc,
+                                         b_grid_buf,
+                                         b_block_desc_n0_n1_k0_k1,
+                                         b_block_origin_idx,
+                                         b_thread_bufs(I1));
+                                         
+                    b_blockwise_copy_up.Run(b_grid_desc,
+                                         b_grid_buf_up,
+                                         b_block_desc_n0_n1_k0_k1,
+                                         b_block_origin_idx,
+                                         b_thread_bufs_up(I1));
+                }
+                else if constexpr(m0.value == MRepeat - 1)
+                {
+                    a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1));
+                }
+
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0 % 2, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                                    
+                            b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs_up[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                                        
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec_up.template AsType<mfma_input_type>(),
+                                        c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+
+                if constexpr(m0.value == MRepeat - 1)
+                {
+                    block_sync_lds();
+
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        a_thread_copy_.Run(
+                            a_block_desc_m0_m1_m2_k0_k1_k2,
+                            make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
+                            a_block_buf.At(I1),
+                            a_thread_desc_,
+                            make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0),
+                            a_thread_buf);
+                    });
+                }
+                else
+                {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        a_thread_copy_.Run(
+                            a_block_desc_m0_m1_m2_k0_k1_k2,
+                            make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
+                            a_block_buf.At(I0),
+                            a_thread_desc_,
+                            make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0),
+                            a_thread_buf);
+                    });
+                }
+
+                EpilogueScheduler_1(m0);
+            });
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(make_tuple(
+                                    (m0 + HotloopLocalBufSwitch) % 2, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I1][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                            b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs_up[I1][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                                        
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec_up.template AsType<mfma_input_type>(),
+                                        c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+
+                if constexpr(m0.value != (MRepeat - 1))
+                {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        a_thread_copy_.Run(
+                            a_block_desc_m0_m1_m2_k0_k1_k2,
+                            make_tuple(Number<m0 + 1>{}, I0, I0, k0, I0, I0),
+                            a_block_buf.At(I1),
+                            a_thread_desc_,
+                            make_tuple(
+                                Number<(m0 + 1 + HotloopLocalBufSwitch) % 2>{}, I0, I0, k0, I0, I0),
+                            a_thread_buf);
+                    });
+
+                    EpilogueScheduler_2();
+                }
+            });
+            // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle
+            // latency
+            // __builtin_amdgcn_sched_barrier(0);
+        }
+        else if constexpr(TailNum == TailNumber::Odd)
+        {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0 % 2, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                            b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs_up[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec_up.template AsType<mfma_input_type>(),
+                                        c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+
+                if constexpr(m0.value != (MRepeat - 1))
+                {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                           make_tuple(Number<m0 + 1>{}, I0, I0, k0, I0, I0),
+                                           a_block_buf.At(I0),
+                                           a_thread_desc_,
+                                           make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0),
+                                           a_thread_buf);
+                    });
+
+                    EpilogueScheduler_2();
+                }
+            });
+        }
+    }
+
+    protected:
+    // MRepeat MWave MLane KRepeat KLane KPack
+    // KRepeat -> MRepeat-> Mwave->KLane->MLane->KPack
+    // Reduce the vgpr usage here.
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(I2, I1, I1, Number<KRepeat>{}, I1, Number<KPack>{}));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
+                                                         ComputeDataType,
+                                                         decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<1, 1, 1, 1, 1, KPack>,
+                                                         Sequence<0, 1, 2, 3, 4, 5>,
+                                                         5,
+                                                         A_K1,
+                                                         A_K1>;
+
+    AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex6D()};
+
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}));
+
+    static constexpr BTileDesc b_block_desc_n0_n1_k0_k1;
+
+};
+
+} // namespace ck

From 9c06c3817a2fd24176e5a4198697efe4ecf36aad Mon Sep 17 00:00:00 2001
From: lalala-sh <Jiaxing.Wen@amd.com>
Date: Wed, 30 Apr 2025 02:27:39 +0000
Subject: [PATCH 04/18] [fix] align v3 gufusion pipeline

---
 ...peline_xdlops_b_preshuffle_gufusion_v3.hpp | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
index b63c813955..df577b11fc 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
@@ -143,7 +143,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
 
     using Base::AMmaKStride;
     using Base::BMmaKStride;
-    using Base::c_thread_desc_;
 
     using Base::MWaves;
 
@@ -532,13 +531,17 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
 
         // Local prefetch A1
         block_sync_lds();
-        static_for<0, KRepeat, 1>{}([&](auto k0) {
-            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                               make_tuple(I0, I0, I0, k0, I0, I0),
-                               a_block_buf.At(I0),
-                               a_thread_desc_,
-                               make_tuple(I0, I0, I0, k0, I0, I0),
-                               a_thread_buf);
+        static_for<0, 2, 1>{}([&](auto m0) {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, KGroup, 1>{}([&](auto kg0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                       make_tuple(m0, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
+                                       a_block_buf.At(I0),
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                       a_thread_buf);
+                });
+            });
         });
 
         // Initialize C
@@ -858,7 +861,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
                     });
                 });
 
-                if constexpr(m0.value != (MRepeat - 1))
+                if constexpr(m0.value < (MRepeat - 2))
                 {
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
                         static_for<0, KGroup, 1>{}([&](auto kg0) {
@@ -951,7 +954,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
                                                          ComputeDataType,
                                                          decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
                                                          decltype(a_thread_desc_),
-                                                         Sequence<1, 1, 1, 1, 1, KPack>,
+                                                         Sequence<1, 1, 1, 1, 1, KPack / KGroup>,
                                                          Sequence<0, 1, 2, 3, 4, 5>,
                                                          5,
                                                          A_K1,
@@ -963,6 +966,8 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
         make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}));
 
     static constexpr BTileDesc b_block_desc_n0_n1_k0_k1;
+
+    using Base::c_thread_desc_;
 };
 
 } // namespace ck

From 0ab978584d99c693f4b02fe870cd1ce75fdf2a5d Mon Sep 17 00:00:00 2001
From: lalala-sh <Jiaxing.Wen@amd.com>
Date: Tue, 6 May 2025 07:36:59 +0000
Subject: [PATCH 05/18] fix bugs

---
 ...peline_xdlops_b_preshuffle_gufusion_v3.hpp | 20 +++++++++----------
 .../gpu/grid/gridwise_moe_gemm.hpp            |  5 +++--
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
index df577b11fc..c2fad6132f 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
@@ -535,7 +535,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, KGroup, 1>{}([&](auto kg0) {
                     a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                       make_tuple(m0, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
+                                       make_tuple(m0, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
                                        a_block_buf.At(I0),
                                        a_thread_desc_,
                                        make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
@@ -631,7 +631,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
                                         make_tuple(Number<(m0 + 2) % MRepeat>{},
                                                    I0,
                                                    I0,
-                                                   Number<k0 * 2 + kg0>{},
+                                                   Number<k0 * KGroup + kg0>{},
                                                    I0,
                                                    I0),
                                         a_block_buf.At(local_read_buf),
@@ -657,7 +657,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
                                         make_tuple(Number<(m0 + 2) % MRepeat>{},
                                                    I0,
                                                    I0,
-                                                   Number<k0 * 2 + kg0>{},
+                                                   Number<k0 * KGroup + kg0>{},
                                                    I0,
                                                    I0),
                                         a_block_buf.At(local_read_buf),
@@ -683,7 +683,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
                                         make_tuple(Number<(m0 + 2) % MRepeat>{},
                                                    I0,
                                                    I0,
-                                                   Number<k0 * 2 + kg0>{},
+                                                   Number<k0 * KGroup + kg0>{},
                                                    I0,
                                                    I0),
                                         a_block_buf.At(mfma_reg_buf),
@@ -700,8 +700,8 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
                                 });
                             });
                         }
-                        HotLoopScheduler();
                     });
+                    HotLoopScheduler();
                 };
 
                 LoopFunc(I0, I1);
@@ -771,7 +771,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
                                 make_tuple(Number<(m0 + 2) % MRepeat>{},
                                            I0,
                                            I0,
-                                           Number<k0 * 2 + kg0>{},
+                                           Number<k0 * KGroup + kg0>{},
                                            I0,
                                            I0),
                                 a_block_buf.At(I1),
@@ -791,7 +791,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
                                 make_tuple(Number<(m0 + 2) % MRepeat>{},
                                            I0,
                                            I0,
-                                           Number<k0 * 2 + kg0>{},
+                                           Number<k0 * KGroup + kg0>{},
                                            I0,
                                            I0),
                                 a_block_buf.At(I1),
@@ -811,7 +811,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
                                 make_tuple(Number<(m0 + 2) % MRepeat>{},
                                            I0,
                                            I0,
-                                           Number<k0 * 2 + kg0>{},
+                                           Number<k0 * KGroup + kg0>{},
                                            I0,
                                            I0),
                                 a_block_buf.At(I0),
@@ -868,7 +868,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
                             a_thread_copy_.Run(
                                 a_block_desc_m0_m1_m2_k0_k1_k2,
                                 make_tuple(
-                                    Number<m0 + 2>{}, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
+                                    Number<m0 + 2>{}, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
                                 a_block_buf.At(I1),
                                 a_thread_desc_,
                                 make_tuple(Number<(m0 + 2 + HotloopLocalBufSwitch) % 2>{},
@@ -930,7 +930,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
                             a_thread_copy_.Run(
                                 a_block_desc_m0_m1_m2_k0_k1_k2,
                                 make_tuple(
-                                    Number<m0 + 2>{}, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
+                                    Number<m0 + 2>{}, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
                                 a_block_buf.At(I0),
                                 a_thread_desc_,
                                 make_tuple(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
index d963980f26..7b350952a6 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
@@ -190,6 +190,7 @@ struct GridwiseMoeGemm
         mfma_selector::GetKPerXdlops() / mfma_selector::GetK1PerXdlops();
 
     static constexpr index_t KGroup  = mfma_selector::selected_mfma.k_per_blk == 32 ? 2 : 1;
+    // static_assert(KGroup == 2, "");
     static constexpr index_t KRepeat = KPerBlock / KLane / (KPack / KGroup);
     static constexpr index_t NLane   = NPerXdl;
     static constexpr index_t NWave   = NPerBlock / NPerXdl / NXdlPerWave;
@@ -1349,7 +1350,7 @@ struct GridwiseMoeGemm
                       make_multi_index(n_block_data_idx_on_grid,
                                        get_warp_local_1d_id() % NWave,
                                        0,
-                                       KPack * (get_thread_local_1d_id() % warpSize)));
+                                       KPack / KGroup * (get_thread_local_1d_id() % warpSize)));
             blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
                 a_grid_desc_ak0_m_ak1,
                 a_block_desc_ak0_m_ak1,
@@ -2064,7 +2065,7 @@ struct GridwiseMoeGemm
                       make_multi_index(n_block_data_idx_on_grid,
                                        get_warp_local_1d_id() % NWave,
                                        0,
-                                       KPack * (get_thread_local_1d_id() % warpSize)));
+                                       KPack / KGroup * (get_thread_local_1d_id() % warpSize)));
             blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
                 a_grid_desc_ak0_m_ak1,
                 a_block_desc_ak0_m_ak1,

From abff33eaab3a772b7513c362c2cb86e0b3a171e2 Mon Sep 17 00:00:00 2001
From: lalala-sh <Jiaxing.Wen@amd.com>
Date: Tue, 6 May 2025 08:46:38 +0000
Subject: [PATCH 06/18] tune fp8 example

---
 .../moe_gemm1_xdl_fp8.cpp                         | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
index c8ee8fc79b..a05234ad3c 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
@@ -157,12 +157,15 @@ using AElementOp = PassThrough;
 using BElementOp = PassThrough;
 
 static constexpr auto GemmSpec         = ck::tensor_operation::device::GemmSpecialization::Default;
-static constexpr ck::index_t MPerBlock = 256;
-static constexpr ck::index_t MXDLPerWave = 16;
-static constexpr ck::index_t NXDLPerWave = 4;
-static constexpr ck::index_t BLOCKSIZE   = 256;
+static constexpr ck::index_t MPerBlock = 128;
 static constexpr ck::index_t NPerBlock   = 256;
 static constexpr ck::index_t MNPerXDL    = 16;
+static constexpr ck::index_t MXDLPerWave = MPerBlock / (MNPerXDL * 1);
+static constexpr ck::index_t NXDLPerWave = NPerBlock / (MNPerXDL * 4);
+static constexpr ck::index_t CShuffleMXDLPerWave = MXDLPerWave;
+static constexpr ck::index_t CShuffleNXDLPerWave = NXDLPerWave;
+static constexpr ck::index_t BLOCKSIZE   = 256;
+
 static constexpr ck::index_t KPerBlock   = 128 / sizeof(A0DataType);
 static constexpr ck::index_t Nswizzle    = false;
 static constexpr ck::index_t AK1         = 16 / sizeof(A0DataType);
@@ -190,7 +193,7 @@ using DeviceOpInstance                   = ck::tensor_operation::device::DeviceM
                //    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
                //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
                 //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-                2,    2,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec>,
+                CShuffleMXDLPerWave,    CShuffleNXDLPerWave,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec, 1>,
                ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, ActOP, Nswizzle, true, MulRoutedWeight, true, int32_t, A0DataType>;
 
 // clang-format on
@@ -308,7 +311,7 @@ int main(int argc, char* argv[])
     case 0: break;
     case 1:
         a0_t_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
-        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.1, 0.1});
         d0_t_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
         d1_e_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
         d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});

From 68c7fe843dd2d4eb98a55359ed2572b4894b115f Mon Sep 17 00:00:00 2001
From: aska-0096 <haocwang@amd.com>
Date: Thu, 8 May 2025 07:44:06 +0000
Subject: [PATCH 07/18] temp save, a performant version.

---
 .../65_gemm_multiply_multiply/CMakeLists.txt  |   4 +-
 ...ultiply_xdl_fp8_blockscale_bpreshuffle.cpp |  20 +-
 ...line_xdlops_blockscale_b_preshuffle_v3.hpp | 240 ++++++++++++------
 ...xdl_cshuffle_v3_blockscale_bpreshuffle.hpp |  10 -
 ...fle_v3_multi_d_blockscale_b_preshuffle.hpp |  12 +-
 5 files changed, 171 insertions(+), 115 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt
index 1fa773f71b..f38ac6fe4a 100644
--- a/example/65_gemm_multiply_multiply/CMakeLists.txt
+++ b/example/65_gemm_multiply_multiply/CMakeLists.txt
@@ -31,8 +31,8 @@ set(GEMM_OPTIONS)
 list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32")
 list(APPEND GEMM_OPTIONS -v --save-temps -Wno-gnu-line-marker)
 set(BLOCKSCALE_GEMM_OPTIONS)
-# list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --schedmodel=0 -mllvm --amdgpu-sched-strategy=gcn-iterative-max-occupancy-experimental -mllvm --misched-bottomup=1")
-list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32")
+list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --disable-schedmodel-in-sched-mi=1 -mllvm --amdgpu-sched-strategy=gcn-iterative-max-occupancy-experimental -mllvm --misched-bottomup=1")
+# list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32")
 list(APPEND BLOCKSCALE_GEMM_OPTIONS -v --save-temps -Wno-gnu-line-marker)
 target_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS})
 target_compile_options(example_moe_gemm1_xdl_fp8 PRIVATE ${GEMM_OPTIONS})
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_blockscale_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_blockscale_bpreshuffle.cpp
index 8cf533b4f6..40a8a01b24 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_blockscale_bpreshuffle.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_blockscale_bpreshuffle.cpp
@@ -41,6 +41,7 @@ using DsDataType       = ck::Tuple<>;
 using EDataType        = BF16;
 
 using A0Layout = Row;
+using A1Layout = Col;
 using B0Layout = Col;
 using D0Layout = Row;
 using D1Layout = Col;
@@ -158,7 +159,8 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
-    ck::index_t Scale_Stride_AM = (K + Scale_Block_K - 1) / Scale_Block_K;
+    // Transpose the AScale tensor for better performance
+    ck::index_t Scale_Stride_AK = (M + Scale_Block_M - 1) / Scale_Block_M;
     ck::index_t Scale_Stride_BN = (K + Scale_Block_K - 1) / Scale_Block_K;
 
     auto f_host_tensor_descriptor =
@@ -178,8 +180,8 @@ int main(int argc, char* argv[])
     Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
     Tensor<A1DataType> a1_m_k(f_host_tensor_descriptor((M + Scale_Block_M - 1) / Scale_Block_M,
                                                        (K + Scale_Block_K - 1) / Scale_Block_K,
-                                                       Scale_Stride_AM,
-                                                       A0Layout{}));
+                                                       Scale_Stride_AK,
+                                                       A1Layout{}));
     Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
     Tensor<B0DataType> b0_preshuffled(
         f_host_tensor_descriptor(K, N, StrideB, B0Layout{})); // use laout only for size
@@ -196,7 +198,6 @@ int main(int argc, char* argv[])
     std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl;
     std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
 
-#if 1
     switch(init_method)
     {
     case 0: break;
@@ -236,17 +237,6 @@ int main(int argc, char* argv[])
         a1_m_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0, 1.0});
         b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
     }
-#endif
-#if 0
-    for(int im =0; im< (M + Scale_Block_M - 1) / Scale_Block_M; im++){
-        float row_sum = .0;
-        for(int ik =0; ik< (K + Scale_Block_K - 1) / Scale_Block_K; ik++){
-            printf("%lf ",a1_m_k(im, ik));
-            row_sum += a1_m_k(im, ik);
-        }
-        printf("sum: %lf\n", row_sum * 128);
-    }
-#endif
 
     DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
     DeviceMem a1_device_buf(sizeof(A1DataType) * a1_m_k.mDesc.GetElementSpaceSize());
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v3.hpp
index 508d86b2e5..4b7e255934 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v3.hpp
@@ -262,13 +262,6 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
 
         // B global read
         static_for<0, buffer_load_b_stages, 1>{}([&](auto i) {
-            // Scale load, 1B
-            if constexpr (i.value==0){
-                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            }
-            // Scale load, 1A
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-
             static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
                 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
 
@@ -282,19 +275,26 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
                     __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
                 }
 
-                if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
+                if constexpr((imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage - 1)) &&
+                             (imfma < (num_mfma_perstage - 1)))
                 {
                     __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
                 }
                 __builtin_amdgcn_sched_group_barrier(0x800, 2, 0); // v_pk_fma
+                // __builtin_amdgcn_sched_group_barrier(0x1000, 4, 0); // v_fmac
             });
+            // Scale load, 1B
+            if constexpr(i.value == 0)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            }
+            // Scale load, 1A
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
             // __builtin_amdgcn_sched_barrier(0);
         });
 
         // A global read + A local write
         static_for<0, buffer_load_a_stages, 1>{}([&](auto i) {
-            // Scale load, 1A
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
             static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
                 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
                 if constexpr((((i + buffer_load_b_stages) < buffer_load_stages_more) &&
@@ -315,12 +315,16 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
                 {
                     __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
                 }
-                if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
+                if constexpr((imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage - 1)) &&
+                             (imfma < (num_mfma_perstage - 1)))
                 {
                     __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
                 }
                 __builtin_amdgcn_sched_group_barrier(0x800, 2, 0); // v_pk_fma
+                // __builtin_amdgcn_sched_group_barrier(0x1000, 4, 0); // v_fmac
             });
+            // Scale load, 1A
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
             // __builtin_amdgcn_sched_barrier(0);
         });
 
@@ -329,17 +333,17 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
             ignore = i;
             static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
                 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                // Scale load, 1A
-                if constexpr(imfma == 0){
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                }
 
-                if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
+                if constexpr((imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage - 1)) &&
+                             (imfma < (num_mfma_perstage - 1)))
                 {
                     __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
                 }
                 __builtin_amdgcn_sched_group_barrier(0x800, 2, 0); // v_pk_fma
+                // __builtin_amdgcn_sched_group_barrier(0x1000, 4, 0); // v_fmac
             });
+            // Scale load, 1A
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
             // __builtin_amdgcn_sched_barrier(0);
         });
     }
@@ -420,7 +424,7 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
         StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
         constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0);
 
-        auto a_scale_thread_buf           = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
+        auto a_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
             a_scale_thread_desc.GetElementSpaceSize());
         auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
             b_scale_thread_desc.GetElementSpaceSize());
@@ -521,7 +525,7 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
         // 2. Decouple the dependency between mfma instruction and scale-fma instruction following.
         StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
                                   AccDataType,
-                                  1,
+                                  2,
                                   xdlops_gemm.GetRegSizePerXdlops(),
                                   true>
             c_thread_buf_per_scale;
@@ -541,7 +545,7 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
             });
         });
 
-#if 0
+#if 1
         static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
             c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
                 .template AsType<AccDataType>()(Number<t>{}) = 0;
@@ -586,7 +590,7 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
                     a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(local_read_buf));
                     a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
                     a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
-                    
+
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         a_scale_thread_copy.Run(a_scale_grid_desc,
                                                 a_scale_grid_buf,
@@ -625,10 +629,25 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
                             c_scale_thread_buf[m0];
 
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            constexpr auto mfma_buf_offset =
+                                ((m0 * NRepeat + n0 + 1) % 2) * xdlops_gemm.GetRegSizePerXdlops();
+                            constexpr auto scale_buf_offset =
+                                ((m0 * NRepeat + n0) % 2) * xdlops_gemm.GetRegSizePerXdlops();
+
+                            constexpr auto a_local_buf_offset =
+                                ((m0 * NRepeat + n0 + 1) % (MRepeat * NRepeat)) / NRepeat;
+                            constexpr auto b_local_buf_offset =
+                                ((m0 * NRepeat + n0 + 1) % (MRepeat * NRepeat)) % NRepeat;
+                            constexpr auto b_local_buf_id =
+                                Number<mfma_reg_buf ^
+                                       ((m0 * NRepeat + n0 + 1) / (MRepeat * NRepeat))>{};
+
                             static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
-                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                c_thread_buf_per_scale
+                                    .GetVectorTypeReference(Number<mfma_buf_offset>{})
                                     .template AsType<AccDataType>()(Number<t>{}) = 0;
                             });
+
                             static_for<0, KRepeat, 1>{}([&](auto k0) {
                                 vector_type<ComputeDataType, KPack> a_thread_vec;
                                 vector_type<ComputeDataType, KPack> b_thread_vec;
@@ -636,7 +655,8 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
                                 static_for<0, KPack, 1>{}([&](auto ik) {
                                     a_thread_vec.template AsType<ComputeDataType>()(ik) =
                                         a_thread_buf[Number<a_thread_desc_.CalculateOffset(
-                                            make_tuple((m0 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                            make_tuple((a_local_buf_offset +
+                                                        HotloopLocalBufSwitch * mfma_reg_buf) %
                                                            2,
                                                        I0,
                                                        I0,
@@ -644,9 +664,9 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
                                                        I0,
                                                        ik))>{}];
                                     b_thread_vec.template AsType<ComputeDataType>()(ik) =
-                                        b_thread_bufs[mfma_reg_buf]
-                                                     [Number<b_thread_desc_.CalculateOffset(
-                                                         make_tuple(n0, I0, k0, ik))>{}];
+                                        b_thread_bufs
+                                            [b_local_buf_id][Number<b_thread_desc_.CalculateOffset(
+                                                make_tuple(b_local_buf_offset, I0, k0, ik))>{}];
                                 });
 
                                 using mfma_input_type =
@@ -656,7 +676,8 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
                                 xdlops_gemm.template Run<>(
                                     a_thread_vec.template AsType<mfma_input_type>(),
                                     b_thread_vec.template AsType<mfma_input_type>(),
-                                    c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
+                                    c_thread_buf_per_scale.GetVectorTypeReference(
+                                        Number<mfma_buf_offset>{}));
                             });
 
                             constexpr index_t c_offset =
@@ -667,7 +688,8 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
 
                                 c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
                                     .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
-                                    c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                    c_thread_buf_per_scale
+                                        .GetVectorTypeReference(Number<scale_buf_offset>{})
                                         .template AsType<pk_fma_type>()[t],
                                     c_scale_thread_vec.template AsType<pk_fma_type>()[Number<0>{}],
                                     c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
@@ -758,7 +780,8 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
                     });
 
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
-                        c_scale_thread_buf(m0) = a_scale_thread_bufs[mfma_reg_buf][m0] * b_scale_thread_bufs[mfma_reg_buf][I0];
+                        c_scale_thread_buf(m0) = a_scale_thread_bufs[mfma_reg_buf][m0] *
+                                                 b_scale_thread_bufs[mfma_reg_buf][I0];
                     });
 
                     HotLoopScheduler();
@@ -782,7 +805,7 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
                                  b_thread_bufs(I1));
             a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1));
 
-            static_for<0, MRepeat, 1>{}([&](auto m0) {                                                       
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
                 vector_type<AccDataType, 2> c_scale_thread_vec;
                 c_scale_thread_vec.template AsType<AccDataType>()(Number<0>{}) =
                     c_scale_thread_buf[m0];
@@ -790,8 +813,21 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
                     c_scale_thread_buf[m0];
 
                 static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    constexpr auto mfma_buf_offset =
+                        ((m0 * NRepeat + n0 + 1) % 2) * xdlops_gemm.GetRegSizePerXdlops();
+                    constexpr auto scale_buf_offset =
+                        ((m0 * NRepeat + n0) % 2) * xdlops_gemm.GetRegSizePerXdlops();
+
+                    constexpr auto a_local_buf_offset =
+                        ((m0 * NRepeat + n0 + 1) % (MRepeat * NRepeat)) / NRepeat;
+                    constexpr auto b_local_buf_offset =
+                        ((m0 * NRepeat + n0 + 1) % (MRepeat * NRepeat)) % NRepeat;
+
+                    constexpr auto b_local_buf_id =
+                        Number<0 ^ ((m0 * NRepeat + n0 + 1) / (MRepeat * NRepeat))>{};
+
                     static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
-                        c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                        c_thread_buf_per_scale.GetVectorTypeReference(Number<mfma_buf_offset>{})
                             .template AsType<AccDataType>()(Number<t>{}) = 0;
                     });
                     static_for<0, KRepeat, 1>{}([&](auto k0) {
@@ -801,19 +837,19 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
                         static_for<0, KPack, 1>{}([&](auto ik) {
                             a_thread_vec.template AsType<ComputeDataType>()(ik) =
                                 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
-                                    make_tuple(m0 % 2, I0, I0, k0, I0, ik))>{}];
+                                    make_tuple(a_local_buf_offset % 2, I0, I0, k0, I0, ik))>{}];
                             b_thread_vec.template AsType<ComputeDataType>()(ik) =
-                                b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
-                                    make_tuple(n0, I0, k0, ik))>{}];
+                                b_thread_bufs[b_local_buf_id][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(b_local_buf_offset, I0, k0, ik))>{}];
                         });
 
                         using mfma_input_type =
                             typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
 
-                        xdlops_gemm.template Run<>(
-                            a_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
-                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
+                        xdlops_gemm.template Run<>(a_thread_vec.template AsType<mfma_input_type>(),
+                                                   b_thread_vec.template AsType<mfma_input_type>(),
+                                                   c_thread_buf_per_scale.GetVectorTypeReference(
+                                                       Number<mfma_buf_offset>{}));
                     });
 
                     constexpr index_t c_offset =
@@ -824,7 +860,8 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
 
                         c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
                             .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
-                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                            c_thread_buf_per_scale
+                                .GetVectorTypeReference(Number<scale_buf_offset>{})
                                 .template AsType<pk_fma_type>()[t],
                             c_scale_thread_vec.template AsType<pk_fma_type>()[Number<0>{}],
                             c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
@@ -910,31 +947,52 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
                     c_scale_thread_buf[m0];
 
                 static_for<0, NRepeat, 1>{}([&](auto n0) {
-                    static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
-                        c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
-                            .template AsType<AccDataType>()(Number<t>{}) = 0;
-                    });
-                    static_for<0, KRepeat, 1>{}([&](auto k0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                    constexpr auto mfma_buf_offset =
+                        ((m0 * NRepeat + n0 + 1) % 2) * xdlops_gemm.GetRegSizePerXdlops();
+                    constexpr auto scale_buf_offset =
+                        ((m0 * NRepeat + n0) % 2) * xdlops_gemm.GetRegSizePerXdlops();
 
-                        static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
-                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(make_tuple(
-                                    (m0 + HotloopLocalBufSwitch) % 2, I0, I0, k0, I0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
-                                b_thread_bufs[I1][Number<b_thread_desc_.CalculateOffset(
-                                    make_tuple(n0, I0, k0, ik))>{}];
+                    constexpr auto a_local_buf_offset =
+                        ((m0 * NRepeat + n0 + 1) % (MRepeat * NRepeat)) / NRepeat;
+                    constexpr auto b_local_buf_offset =
+                        ((m0 * NRepeat + n0 + 1) % (MRepeat * NRepeat)) % NRepeat;
+
+                    if constexpr(!((m0 == (MRepeat - 1)) && (n0 == (NRepeat - 1))))
+                    {
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<mfma_buf_offset>{})
+                                .template AsType<AccDataType>()(Number<t>{}) = 0;
                         });
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            vector_type<ComputeDataType, KPack> a_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec;
 
-                        using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            static_for<0, KPack, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple((a_local_buf_offset + HotloopLocalBufSwitch) % 2,
+                                                   I0,
+                                                   I0,
+                                                   k0,
+                                                   I0,
+                                                   ik))>{}];
+                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_bufs[I1][Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(b_local_buf_offset, I0, k0, ik))>{}];
+                            });
+
+                            using mfma_input_type =
+                                typename vector_type<ComputeDataType,
+                                                     xdlops_gemm.K1PerXdlops>::type;
+
+                            xdlops_gemm.template Run<>(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec.template AsType<mfma_input_type>(),
+                                c_thread_buf_per_scale.GetVectorTypeReference(
+                                    Number<mfma_buf_offset>{}));
+                        });
+                    }
 
-                        xdlops_gemm.template Run<>(
-                            a_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
-                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
-                    });
                     constexpr index_t c_offset =
                         c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
 
@@ -943,7 +1001,8 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
 
                         c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
                             .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
-                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                            c_thread_buf_per_scale
+                                .GetVectorTypeReference(Number<scale_buf_offset>{})
                                 .template AsType<pk_fma_type>()[t],
                             c_scale_thread_vec.template AsType<pk_fma_type>()[Number<0>{}],
                             c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
@@ -986,31 +1045,47 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
                     c_scale_thread_buf[m0];
 
                 static_for<0, NRepeat, 1>{}([&](auto n0) {
-                    static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
-                        c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
-                            .template AsType<AccDataType>()(Number<t>{}) = 0;
-                    });
-                    static_for<0, KRepeat, 1>{}([&](auto k0) {
-                        vector_type<ComputeDataType, KPack> a_thread_vec;
-                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                    constexpr auto mfma_buf_offset =
+                        ((m0 * NRepeat + n0 + 1) % 2) * xdlops_gemm.GetRegSizePerXdlops();
+                    constexpr auto scale_buf_offset =
+                        ((m0 * NRepeat + n0) % 2) * xdlops_gemm.GetRegSizePerXdlops();
 
-                        static_for<0, KPack, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
-                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
-                                    make_tuple(m0 % 2, I0, I0, k0, I0, ik))>{}];
-                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
-                                b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
-                                    make_tuple(n0, I0, k0, ik))>{}];
+                    constexpr auto a_local_buf_offset =
+                        ((m0 * NRepeat + n0 + 1) % (MRepeat * NRepeat)) / NRepeat;
+                    constexpr auto b_local_buf_offset =
+                        ((m0 * NRepeat + n0 + 1) % (MRepeat * NRepeat)) % NRepeat;
+
+                    if constexpr(!((m0 == (MRepeat - 1)) && (n0 == (NRepeat - 1))))
+                    {
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<mfma_buf_offset>{})
+                                .template AsType<AccDataType>()(Number<t>{}) = 0;
                         });
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            vector_type<ComputeDataType, KPack> a_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec;
 
-                        using mfma_input_type =
-                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+                            static_for<0, KPack, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(a_local_buf_offset % 2, I0, I0, k0, I0, ik))>{}];
+                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(b_local_buf_offset, I0, k0, ik))>{}];
+                            });
+
+                            using mfma_input_type =
+                                typename vector_type<ComputeDataType,
+                                                     xdlops_gemm.K1PerXdlops>::type;
+
+                            xdlops_gemm.template Run<>(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec.template AsType<mfma_input_type>(),
+                                c_thread_buf_per_scale.GetVectorTypeReference(
+                                    Number<mfma_buf_offset>{}));
+                        });
+                    }
 
-                        xdlops_gemm.template Run<>(
-                            a_thread_vec.template AsType<mfma_input_type>(),
-                            b_thread_vec.template AsType<mfma_input_type>(),
-                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
-                    });
                     constexpr index_t c_offset =
                         c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
 
@@ -1019,7 +1094,8 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3<BlockGemmPipelineS
 
                         c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
                             .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
-                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                            c_thread_buf_per_scale
+                                .GetVectorTypeReference(Number<scale_buf_offset>{})
                                 .template AsType<pk_fma_type>()[t],
                             c_scale_thread_vec.template AsType<pk_fma_type>()[Number<0>{}],
                             c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_blockscale_bpreshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_blockscale_bpreshuffle.hpp
index 79a42856f6..dd6d6ba316 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_blockscale_bpreshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_blockscale_bpreshuffle.hpp
@@ -266,7 +266,6 @@ struct DeviceGemmMultiD_BlockScale_Xdl_CShuffle_V3_BPreshuffle
                 }
                 else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
                 {
-                    #if 0
                     if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
                     {
                         const auto kernel =
@@ -289,15 +288,6 @@ struct DeviceGemmMultiD_BlockScale_Xdl_CShuffle_V3_BPreshuffle
                                 TailNumber::Even>;
                         Run(kernel);
                     }
-                    #endif
-                    const auto kernel =
-                            kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle_2lds<
-                                GridwiseGemm,
-                                true,
-                                InMemoryDataOperationEnum::Set,
-                                minimum_occupancy,
-                                TailNumber::Even>;
-                    Run(kernel);
                 }
             }
             else
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle.hpp
index 9484361d3e..9e1ae0ead7 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle.hpp
@@ -1134,7 +1134,7 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
         const auto a_scale_grid_desc_am_ak = make_naive_tensor_descriptor(
             make_tuple(math::integer_divide_ceil(problem.M, ScaleBlockM),
                        math::integer_divide_ceil(problem.K, ScaleBlockK)),
-            make_tuple(math::integer_divide_ceil(problem.K, ScaleBlockK), 1));
+            make_tuple(1, math::integer_divide_ceil(problem.M, ScaleBlockM)));
         const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor(
             make_tuple(math::integer_divide_ceil(problem.N, ScaleBlockN),
                        math::integer_divide_ceil(problem.K, ScaleBlockK)),
@@ -1282,9 +1282,9 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
                                              decltype(a_scale_grid_desc_am_ak),
                                              decltype(a_scale_thread_desc),
                                              Sequence<1, ScaleSliceSizeK>,
-                                             Sequence<0, 1>,
+                                             Sequence<1, 0>,
+                                             0,
                                              1,
-                                             ScaleSliceSizeK,
                                              1,
                                              false>(
                 a_scale_grid_desc_am_ak,
@@ -1630,7 +1630,7 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
         const auto a_scale_grid_desc_am_ak = make_naive_tensor_descriptor(
             make_tuple(math::integer_divide_ceil(problem.M, ScaleBlockM),
                        math::integer_divide_ceil(problem.K, ScaleBlockK)),
-            make_tuple(math::integer_divide_ceil(problem.K, ScaleBlockK), 1));
+            make_tuple(1, math::integer_divide_ceil(problem.M, ScaleBlockM)));
         const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor(
             make_tuple(math::integer_divide_ceil(problem.N, ScaleBlockN),
                        math::integer_divide_ceil(problem.K, ScaleBlockK)),
@@ -1784,9 +1784,9 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
                                              decltype(a_scale_grid_desc_am_ak),
                                              decltype(a_scale_thread_desc),
                                              Sequence<1, ScaleSliceSizeK>,
-                                             Sequence<0, 1>,
+                                             Sequence<1, 0>,
+                                             0,
                                              1,
-                                             ScaleSliceSizeK,
                                              1,
                                              false>(
                 a_scale_grid_desc_am_ak,

From 960b2bce1ca879ee8b7d95a41b3dc35e573a315b Mon Sep 17 00:00:00 2001
From: lalala-sh <Jiaxing.Wen@amd.com>
Date: Thu, 8 May 2025 09:48:23 +0000
Subject: [PATCH 08/18] update

---
 ...e_gemm_pipeline_xdlops_b_preshuffle_v3.hpp | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
index 6af20af484..02da036d10 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -474,7 +474,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, KGroup, 1>{}([&](auto kg0) {
                     a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                       make_tuple(m0, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
+                                       make_tuple(m0, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
                                        a_block_buf.At(I0),
                                        a_thread_desc_,
                                        make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
@@ -553,7 +553,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                         make_tuple(Number<(m0 + 2) % MRepeat>{},
                                                    I0,
                                                    I0,
-                                                   Number<k0 * 2 + kg0>{},
+                                                   Number<k0 * KGroup + kg0>{},
                                                    I0,
                                                    I0),
                                         a_block_buf.At(local_read_buf),
@@ -579,7 +579,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                         make_tuple(Number<(m0 + 2) % MRepeat>{},
                                                    I0,
                                                    I0,
-                                                   Number<k0 * 2 + kg0>{},
+                                                   Number<k0 * KGroup + kg0>{},
                                                    I0,
                                                    I0),
                                         a_block_buf.At(local_read_buf),
@@ -605,7 +605,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                         make_tuple(Number<(m0 + 2) % MRepeat>{},
                                                    I0,
                                                    I0,
-                                                   Number<k0 * 2 + kg0>{},
+                                                   Number<k0 * KGroup + kg0>{},
                                                    I0,
                                                    I0),
                                         a_block_buf.At(mfma_reg_buf),
@@ -680,7 +680,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                 make_tuple(Number<(m0 + 2) % MRepeat>{},
                                            I0,
                                            I0,
-                                           Number<k0 * 2 + kg0>{},
+                                           Number<k0 * KGroup + kg0>{},
                                            I0,
                                            I0),
                                 a_block_buf.At(I1),
@@ -700,7 +700,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                 make_tuple(Number<(m0 + 2) % MRepeat>{},
                                            I0,
                                            I0,
-                                           Number<k0 * 2 + kg0>{},
+                                           Number<k0 * KGroup + kg0>{},
                                            I0,
                                            I0),
                                 a_block_buf.At(I1),
@@ -720,7 +720,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                                 make_tuple(Number<(m0 + 2) % MRepeat>{},
                                            I0,
                                            I0,
-                                           Number<k0 * 2 + kg0>{},
+                                           Number<k0 * KGroup + kg0>{},
                                            I0,
                                            I0),
                                 a_block_buf.At(I0),
@@ -769,7 +769,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                             a_thread_copy_.Run(
                                 a_block_desc_m0_m1_m2_k0_k1_k2,
                                 make_tuple(
-                                    Number<m0 + 2>{}, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
+                                    Number<m0 + 2>{}, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
                                 a_block_buf.At(I1),
                                 a_thread_desc_,
                                 make_tuple(Number<(m0 + 2 + HotloopLocalBufSwitch) % 2>{},
@@ -824,7 +824,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
                             a_thread_copy_.Run(
                                 a_block_desc_m0_m1_m2_k0_k1_k2,
                                 make_tuple(
-                                    Number<m0 + 2>{}, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
+                                    Number<m0 + 2>{}, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
                                 a_block_buf.At(I0),
                                 a_thread_desc_,
                                 make_tuple(

From def952a178bbb73e0940cf6a3cf69802e38b4dd7 Mon Sep 17 00:00:00 2001
From: lalala-sh <Jiaxing.Wen@amd.com>
Date: Thu, 8 May 2025 09:49:16 +0000
Subject: [PATCH 09/18] use mem_op::set when topk=1

---
 .../gpu/device/impl/device_moe_gemm.hpp       | 175 +++++++++++++-----
 1 file changed, 125 insertions(+), 50 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp
index 08d177035e..a6110d2bfc 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp
@@ -264,77 +264,152 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
 
             constexpr index_t minimum_occupancy = (estimated_reg_total >= 256) ? 1 : 2;
 
-            constexpr auto MemoryDataOp =
-                IsInputGemm ? InMemoryDataOperationEnum::Set : InMemoryDataOperationEnum::AtomicAdd;
-            if(has_main_k_block_loop)
+            if(IsInputGemm || arg.TopK == 1)
             {
-                // Tail number always full
-                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                constexpr auto MemoryDataOp = InMemoryDataOperationEnum::Set;
+
+                if(has_main_k_block_loop)
                 {
+                    // Tail number always full
+                    if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                    {
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                            {
+                                const auto kernel = kernel_moe_gemm<GridwiseGemm,
+                                                                    true,
+                                                                    MemoryDataOp,
+                                                                    minimum_occupancy,
+                                                                    TailNumber::Odd>;
+                                RunKernel(kernel);
+                            }
+                            else
+                            {
+                                const auto kernel = kernel_moe_gemm<GridwiseGemm,
+                                                                    true,
+                                                                    MemoryDataOp,
+                                                                    minimum_occupancy,
+                                                                    TailNumber::Even>;
+                                RunKernel(kernel);
+                            }
+                        }
+                    }
+                    else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2 ||
+                                      BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
                     {
                         if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
                         {
-                            const auto kernel = kernel_moe_gemm<GridwiseGemm,
-                                                                true,
-                                                                MemoryDataOp,
-                                                                minimum_occupancy,
-                                                                TailNumber::Odd>;
+                            const auto kernel = kernel_moe_gemm_2lds<GridwiseGemm,
+                                                                     true,
+                                                                     MemoryDataOp,
+                                                                     minimum_occupancy,
+                                                                     TailNumber::Odd>;
                             RunKernel(kernel);
                         }
                         else
                         {
-                            const auto kernel = kernel_moe_gemm<GridwiseGemm,
-                                                                true,
-                                                                MemoryDataOp,
-                                                                minimum_occupancy,
-                                                                TailNumber::Even>;
+                            const auto kernel = kernel_moe_gemm_2lds<GridwiseGemm,
+                                                                     true,
+                                                                     MemoryDataOp,
+                                                                     minimum_occupancy,
+                                                                     TailNumber::Even>;
                             RunKernel(kernel);
                         }
                     }
-                }
-                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2 ||
-                                  BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
-                {
-                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                    else
                     {
-                        const auto kernel = kernel_moe_gemm_2lds<GridwiseGemm,
-                                                                 true,
-                                                                 MemoryDataOp,
-                                                                 minimum_occupancy,
-                                                                 TailNumber::Odd>;
+                        throw std::runtime_error("todo: only v1 & v2 support now");
+                    }
+                }
+#if 1
+                else
+                {
+                    // Tail number always 1
+                    if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                    {
+                        const auto kernel = kernel_moe_gemm<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Odd>;
                         RunKernel(kernel);
                     }
+                }
+#endif
+            }
+            else
+            {
+                constexpr auto MemoryDataOp = InMemoryDataOperationEnum::AtomicAdd;
+
+                if(has_main_k_block_loop)
+                {
+                    // Tail number always full
+                    if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                    {
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                            {
+                                const auto kernel = kernel_moe_gemm<GridwiseGemm,
+                                                                    true,
+                                                                    MemoryDataOp,
+                                                                    minimum_occupancy,
+                                                                    TailNumber::Odd>;
+                                RunKernel(kernel);
+                            }
+                            else
+                            {
+                                const auto kernel = kernel_moe_gemm<GridwiseGemm,
+                                                                    true,
+                                                                    MemoryDataOp,
+                                                                    minimum_occupancy,
+                                                                    TailNumber::Even>;
+                                RunKernel(kernel);
+                            }
+                        }
+                    }
+                    else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2 ||
+                                      BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_moe_gemm_2lds<GridwiseGemm,
+                                                                     true,
+                                                                     MemoryDataOp,
+                                                                     minimum_occupancy,
+                                                                     TailNumber::Odd>;
+                            RunKernel(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_moe_gemm_2lds<GridwiseGemm,
+                                                                     true,
+                                                                     MemoryDataOp,
+                                                                     minimum_occupancy,
+                                                                     TailNumber::Even>;
+                            RunKernel(kernel);
+                        }
+                    }
                     else
                     {
-                        const auto kernel = kernel_moe_gemm_2lds<GridwiseGemm,
-                                                                 true,
-                                                                 MemoryDataOp,
-                                                                 minimum_occupancy,
-                                                                 TailNumber::Even>;
+                        throw std::runtime_error("todo: only v1 & v2 support now");
+                    }
+                }
+#if 1
+                else
+                {
+                    // Tail number always 1
+                    if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                    {
+                        const auto kernel = kernel_moe_gemm<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Odd>;
                         RunKernel(kernel);
                     }
                 }
-                else
-                {
-                    throw std::runtime_error("todo: only v1 & v2 support now");
-                }
-            }
-#if 1
-            else
-            {
-                // Tail number always 1
-                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
-                {
-                    const auto kernel = kernel_moe_gemm<GridwiseGemm,
-                                                        true,
-                                                        InMemoryDataOperationEnum::Set,
-                                                        minimum_occupancy,
-                                                        TailNumber::Odd>;
-                    RunKernel(kernel);
-                }
-            }
 #endif
-
+            }
             return ave_time;
         }
 

From 7729fdb34942030a61b0ae72b0c594ce82b13014 Mon Sep 17 00:00:00 2001
From: OscarXu <huaiguxu@amd.com>
Date: Mon, 12 May 2025 09:26:44 +0800
Subject: [PATCH 10/18] temp save for gemm1. Function not ready

---
 .../65_gemm_multiply_multiply/CMakeLists.txt  |   14 +-
 .../moe_gemm1_xdl_fp8_blockscale.cpp          |  533 ++++++++
 .../moe_gemm2_xdl_fp8_blockscale.cpp          |   24 +-
 include/ck/ck.hpp                             |    2 +-
 ...peline_xdlops_b_preshuffle_gufusion_v3.hpp |    2 +-
 ...oe_blockscale_b_preshuffle_gufusion_v3.hpp | 1193 +++++++++++++++++
 ...s_moe_blockscale_b_preshuffle_selector.hpp |  117 +-
 .../impl/device_moe_gemm_blockscale.hpp       |   39 +-
 .../gpu/grid/gridwise_moe_gemm_blockscale.hpp |  449 +++++--
 .../cpu/reference_moe_gemm1_blockscale.hpp    |  281 ++++
 10 files changed, 2509 insertions(+), 145 deletions(-)
 create mode 100644 example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
 create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v3.hpp
 create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp

diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt
index a1915d388b..4953bc515b 100644
--- a/example/65_gemm_multiply_multiply/CMakeLists.txt
+++ b/example/65_gemm_multiply_multiply/CMakeLists.txt
@@ -17,6 +17,7 @@ target_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVAT
 add_example_executable(example_moe_gemm1_xdl_fp8 moe_gemm1_xdl_fp8.cpp)
 add_example_executable(example_moe_gemm2_xdl_fp8 moe_gemm2_xdl_fp8.cpp)
 add_example_executable(example_moe_gemm2_xdl_fp8_blockscale moe_gemm2_xdl_fp8_blockscale.cpp)
+add_example_executable(example_moe_gemm1_xdl_fp8_blockscale moe_gemm1_xdl_fp8_blockscale.cpp)
 
 list(APPEND gpu_list gfx942)
 set(target 0)
@@ -41,7 +42,7 @@ set(GEMM_OPTIONS)
 list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32")
 list(APPEND GEMM_OPTIONS -v --save-temps -Wno-gnu-line-marker)
 set(BLOCKSCALE_GEMM_OPTIONS)
-list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --disable-schedmodel-in-sched-mi=1 -mllvm --amdgpu-sched-strategy=gcn-iterative-max-occupancy-experimental -mllvm --misched-bottomup=1")
+list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --disable-schedmodel-in-sched-mi=1 -mllvm --misched=gcn-iterative-max-occupancy-experimental -mllvm --misched-bottomup=1")
 # list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32")
 list(APPEND BLOCKSCALE_GEMM_OPTIONS -v --save-temps -Wno-gnu-line-marker)
 target_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS})
@@ -51,14 +52,7 @@ target_compile_options(example_gemm_multiply_multiply_xdl_fp8_ab_scale PRIVATE $
 target_compile_options(example_gemm_multiply_multiply_xdl_fp8_blockscale_bpreshuffle PRIVATE ${BLOCKSCALE_GEMM_OPTIONS})
 
 target_compile_options(example_moe_gemm2_xdl_fp8_blockscale PRIVATE ${BLOCKSCALE_GEMM_OPTIONS})
+target_compile_options(example_moe_gemm1_xdl_fp8_blockscale PRIVATE ${BLOCKSCALE_GEMM_OPTIONS})
 
 #hacky fix for bs_moe_stage2 with rocm < 6.4
-add_custom_command(
-    TARGET example_moe_gemm2_xdl_fp8_blockscale
-    PRE_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy_directory
-    ${CMAKE_CURRENT_SOURCE_DIR}/hsa/
-    ${CMAKE_CURRENT_BINARY_DIR}/hsa/
-)
-
-target_compile_definitions(example_moe_gemm2_xdl_fp8_blockscale PRIVATE MOE_STAGE2_ASM_DIR="${CMAKE_CURRENT_BINARY_DIR}/hsa/")
\ No newline at end of file
+target_compile_definitions(example_moe_gemm2_xdl_fp8_blockscale PRIVATE MOE_STAGE2_ASM_DIR="${CMAKE_CURRENT_SOURCE_DIR}/hsa/")
\ No newline at end of file
diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
new file mode 100644
index 0000000000..dea651b1ba
--- /dev/null
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
@@ -0,0 +1,533 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F8  = ck::f8_t;
+using F32 = float;
+using I64 = int64_t;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using A0DataType       = F8;
+using A1DataType       = F32;
+using B0DataType       = F8;
+using B1DataType       = F32;
+using EDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using D2DataType       = F32;
+using DsDataType       = ck::Tuple<D2DataType>;
+
+using A0Layout = Row;
+using B0Layout = Col;
+using ELayout  = Row;
+using D0Layout = Row;
+using D1Layout = Col;
+using D2Layout = ELayout;
+using DsLayout = ck::Tuple<D2Layout>;
+
+
+struct MulABScaleExpertWeight
+{
+    template <typename E, typename C, typename D2>
+    __host__ __device__ constexpr void operator()(E& e, const C& c, const D2& d2) const;
+    // for real kernel use
+    template <>
+    __host__ __device__ constexpr void
+    operator()<EDataType, float, float>(EDataType& e, const float& c, const float& d2) const
+    {
+        // for real kernel use
+        e = ck::type_convert<EDataType>(c * d2);
+    }
+
+    // for reference cpu
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float, float, float>(float& e, const float& c, const float& d2) const
+    {
+        // for reference cpu
+        e = ck::type_convert<EDataType>(c * d2);
+    }
+};
+
+void preShuffleBuffer(const B0DataType* src, B0DataType* dst, int N, int K, int NXdl)
+{
+    int KPack = 16 / sizeof(B0DataType);
+    int NLane = NXdl;
+    int KLane = 64 / NLane;
+
+    int K0 = K / (KLane * KPack);
+    // K -> K0 KLane KPack
+    // N -> N0 NLane
+    // N, K -> N0 K0 KLane NLane KPack
+    int tempk;
+    for(I64 n = 0; n < N; ++n)
+    {
+        for(I64 k = 0; k < K; ++k)
+        {
+            I64 n0 = n / NLane;
+            I64 n1 = n % NLane;
+
+            I64 k0 = k / (KLane * KPack);
+            tempk  = k % (KLane * KPack);
+            I64 k1 = tempk / KPack;
+            I64 k2 = tempk % KPack;
+
+            I64 outputIndex = n0 * KPack * NLane * KLane * K0 + k0 * KPack * NLane * KLane +
+                              k1 * KPack * NLane + n1 * KPack + k2;
+
+            dst[outputIndex] = src[n * static_cast<I64>(K) + k];
+        }
+    }
+}
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+
+static constexpr ck::index_t Scale_Block_M = 1;
+static constexpr ck::index_t Scale_Block_N = 128;
+static constexpr ck::index_t Scale_Block_K = 128;
+#if 1
+static constexpr ck::index_t MPerBlock = 128;
+static constexpr ck::index_t NPerBlock   = 256;
+static constexpr ck::index_t MNPerXDL    = 16;
+static constexpr ck::index_t MXDLPerWave = MPerBlock / (MNPerXDL * 1);
+static constexpr ck::index_t NXDLPerWave = NPerBlock / (MNPerXDL * 4);
+static constexpr ck::index_t CShuffleMXDLPerWave = MXDLPerWave;
+static constexpr ck::index_t CShuffleNXDLPerWave = NXDLPerWave;
+static constexpr ck::index_t BLOCKSIZE   = 256;
+
+static constexpr ck::index_t KPerBlock   = 128 / sizeof(A0DataType);
+static constexpr ck::index_t Nswizzle    = false;
+static constexpr ck::index_t AK1         = 16 / sizeof(A0DataType);
+static constexpr ck::index_t BK1         = 16 / sizeof(B0DataType);
+static constexpr ck::index_t EVec        = 16 / sizeof(EDataType);
+static constexpr ck::index_t D0Vec       = 1;
+static constexpr ck::index_t D1Vec       = 1;
+static constexpr ck::index_t ActOP       = 0; // 0: gelu_and_mul, 1: silu_and_mul
+static constexpr bool MulRoutedWeight    = false;
+using DeviceOpInstance                   = ck::tensor_operation::device::DeviceMoeGemmBlockScale
+    // clang-format off
+        <      Row, Col, DsLayout, ELayout,
+               A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
+               AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   
+               //threadnum, mblock, nblock, kblock
+               BLOCKSIZE, Scale_Block_M, Scale_Block_N, Scale_Block_K,
+               MPerBlock,   NPerBlock,    KPerBlock,
+               // ak1, bk1
+               AK1,   BK1,
+               // mn_perxdl
+               MNPerXDL,   MNPerXDL,
+               // mn_xdlperwave 
+               MXDLPerWave,  NXDLPerWave,
+               // a,b: loadtranfer cluster, cluster order, srcorder,VECDIM, srcpervec, dstpervec, lds_extra
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 0,
+               //    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+               //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+                //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+                CShuffleMXDLPerWave,    CShuffleNXDLPerWave,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec, 1>,
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, ActOP, Nswizzle, true, MulRoutedWeight, int32_t, A0DataType>;
+#else
+static constexpr ck::index_t MPerBlock = 128; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmBlockScale<
+               Row, Col, DsLayout, ELayout,
+               A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
+               AElementOp,  BElementOp, CDEElementOp,   GemmSpec,   
+               256,  Scale_Block_M, Scale_Block_N, Scale_Block_K,
+               MPerBlock,   128,    128,
+               16,   16,
+               16,   16,
+               4,    4,
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
+               2,    2,   S<1, 32, 1, 8>, S<2, 1, 1, 1>,
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, ActOP, Nswizzle, true, MulRoutedWeight, int32_t, A0DataType>;
+#endif
+// clang-format on
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = true;
+#if 1
+    // GEMM shape
+    ck::index_t N               = 4096;
+    ck::index_t K               = 6144;
+    ck::index_t experts         = 8;
+    ck::index_t topk            = 2;
+    // ck::index_t sorted_tile_num = 133;
+    // ck::index_t valid_tile_num  = 128;
+    // ck::index_t tokens          = 8192;
+    ck::index_t sorted_tile_num = 15;
+    ck::index_t valid_tile_num  = 13;
+    ck::index_t tokens          = 832;
+    #else
+    //deepseek
+    ck::index_t N               = 2048;
+    ck::index_t K               = 7160;
+    ck::index_t experts         = 256;
+    ck::index_t topk            = 8;
+    ck::index_t tokens          = 4096;
+    ck::index_t sorted_tile_num = 261;
+    ck::index_t valid_tile_num  = 256;
+#endif
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 3)
+    {
+        // use default case
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 7)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        N               = std::stoi(argv[4]);
+        K               = std::stoi(argv[5]);
+        tokens          = std::stoi(argv[6]);
+    }
+    else if(argc == 9)
+    {
+
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        N               = std::stoi(argv[4]);
+        K               = std::stoi(argv[5]);
+        tokens          = std::stoi(argv[6]);
+        sorted_tile_num = std::stoi(argv[7]);
+        valid_tile_num  = std::stoi(argv[8]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 6: N, K, tokens\n");
+        exit(0);
+    }
+
+    ck::index_t sorted_size = sorted_tile_num * MPerBlock;
+    ck::index_t valid_size  = valid_tile_num * MPerBlock;
+    if(tokens * topk > valid_size)
+    {
+        printf("err config, tokens * topk > valid_size\n");
+        exit(-1);
+    }
+    ck::index_t StrideA              = K;
+    ck::index_t StrideB              = K;
+    ck::index_t StrideE              = N;
+    constexpr ck::index_t NumDTensor = DsDataType::Size();
+    constexpr auto StrideDs          = std::array<ck::index_t, NumDTensor>{0};
+    ck::index_t Scale_Stride_AM      = (K + Scale_Block_K - 1) / Scale_Block_K;
+    ck::index_t Scale_Stride_BN      = (K + Scale_Block_K - 1) / Scale_Block_K;
+    ck::index_t Scale_Stride_B       = (N * 2 + Scale_Block_N - 1) / Scale_Block_N;
+
+    ck::index_t KBatch = 1;
+
+    Tensor<ck::index_t> expert_ids(HostTensorDescriptor({sorted_tile_num}, {1}));
+    Tensor<ck::index_t> sorted_token_ids(HostTensorDescriptor({sorted_size}, {1}));
+    Tensor<ck::index_t> max_token_id(HostTensorDescriptor({1 + sorted_tile_num}));
+    max_token_id.mData = {valid_size};
+    // int eids[]         = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 3, 3, 3};
+    for(int i = 0; i < sorted_tile_num; i++)
+    {
+        expert_ids.mData[i] = i / (valid_tile_num / experts);
+    }
+
+    int token_per_tile = (tokens * topk + valid_tile_num - 1) / valid_tile_num;
+    int tokenid        = 0;
+
+    for(int i = 0; i < sorted_size; i++)
+    {
+        int tile_off = i % MPerBlock;
+        if(tile_off < token_per_tile && tokenid < tokens * topk)
+        {
+            sorted_token_ids.mData[i] = (tokenid % tokens) | ((tokenid / tokens) << 24);
+            tokenid++;
+        }
+        else
+        {
+            sorted_token_ids.mData[i] = tokens;
+        }
+    }
+    Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
+    Tensor<A1DataType> a1_t_k(HostTensorDescriptor({tokens, (K + Scale_Block_K - 1) / Scale_Block_K},
+                                                   {Scale_Stride_AM, 1}));
+    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
+    Tensor<B1DataType> b1_e_n_k(HostTensorDescriptor(
+        {experts, (K + Scale_Block_K - 1) / Scale_Block_K, (N * 2 + Scale_Block_N - 1) / Scale_Block_N},
+        {(Scale_Stride_B * Scale_Stride_BN), 1, Scale_Stride_BN}));
+    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+    Tensor<EDataType> e_t_n_device_result(
+        HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
+    e_t_n_device_result.SetZero();
+    std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
+    std::cout << "a1_t_k: " << a1_t_k.mDesc << std::endl;
+    std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl;
+    std::cout << "b1_e_n_k: " << b1_e_n_k.mDesc << std::endl;
+    std::cout << "d2_e_n: " << d2_e_n.mDesc << std::endl;
+    std::cout << "e_t_n: " << e_t_n_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_t_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{-0.5, 0.5});
+        a1_t_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        b1_e_n_k.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+        break;
+    case 2:
+        a0_t_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
+        a1_t_k.GenerateTensorValue(GeneratorTensor_1<A1DataType>{});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        b1_e_n_k.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_1<D2DataType>{});
+        break;
+    case 3:
+        a0_t_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
+        a1_t_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        b1_e_n_k.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+        break;
+    case 4:
+        a0_t_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{-0.5, 0.5});
+        a1_t_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        b1_e_n_k.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+        break;
+    case 5:
+        a0_t_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{-0.5, 0.5});
+        a1_t_k.GenerateTensorValue(GeneratorTensor_1<A1DataType>{});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        b1_e_n_k.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+        break;
+    case 6:
+        a0_t_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{-0.5, 0.5});
+        a1_t_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        b1_e_n_k.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+        break;
+    default:
+        a0_t_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{-0.5, 0.5});
+        a1_t_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        b1_e_n_k.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+    }
+    DeviceMem sorted_token_ids_dev(sizeof(ck::index_t) *
+                                   sorted_token_ids.mDesc.GetElementSpaceSize());
+    DeviceMem expert_ids_dev(sizeof(ck::index_t) * expert_ids.mDesc.GetElementSpaceSize());
+    DeviceMem max_token_id_dev(sizeof(ck::index_t) * max_token_id.mDesc.GetElementSpaceSize());
+    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_t_k.mDesc.GetElementSpaceSize());
+    DeviceMem a1_device_buf(sizeof(A1DataType) * a1_t_k.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_e_n_k.mDesc.GetElementSpaceSize());
+    DeviceMem b1_device_buf(sizeof(B1DataType) * b1_e_n_k.mDesc.GetElementSpaceSize());
+    DeviceMem d2_device_buf(sizeof(D2DataType) * d2_e_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_t_n_device_result.mDesc.GetElementSpaceSize());
+    // a0_t_k.savetxt("a.txt");
+    // expert_ids.savetxt("expert_ids.txt", "int");
+    // sorted_token_ids.savetxt("sorted_token_ids.txt", "int");
+    // d2_e_n.savetxt("d2_e_n.txt", "int");
+    sorted_token_ids_dev.ToDevice(sorted_token_ids.mData.data());
+    expert_ids_dev.ToDevice(expert_ids.mData.data());
+    max_token_id_dev.ToDevice(max_token_id.mData.data());
+    a0_device_buf.ToDevice(a0_t_k.mData.data());
+    a1_device_buf.ToDevice(a1_t_k.mData.data());
+    b1_device_buf.ToDevice(b1_e_n_k.mData.data());
+    d2_device_buf.ToDevice(d2_e_n.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+
+    int NPerXdl = device_op.GetPreShuffleParameters();
+
+    preShuffleBuffer(
+        b0_e_n_k.mData.data(), b0_preshuffled.mData.data(), N * 2 * experts, K, NPerXdl);
+
+    b0_device_buf.ToDevice(b0_preshuffled.mData.data());
+
+    auto invoker = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(sorted_token_ids_dev.GetDeviceBuffer(),
+                               expert_ids_dev.GetDeviceBuffer(),
+                               max_token_id_dev.GetDeviceBuffer(),
+                               a0_device_buf.GetDeviceBuffer(),
+                               b0_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, NumDTensor>{d2_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               tokens,
+                               topk,
+                               sorted_size,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               StrideDs,
+                               StrideE,
+                               a1_device_buf.GetDeviceBuffer(),
+                               b1_device_buf.GetDeviceBuffer(),
+                               KBatch,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+    if(time_kernel)
+    {
+        float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+        std::size_t flop      = std::size_t(2) * tokens * topk * N * 2 * K;
+        std::size_t num_btype = sizeof(A0DataType) * valid_tile_num * K +
+                                sizeof(B0DataType) * K * N * 2 * experts +
+                                sizeof(EDataType) * valid_tile_num * N;
+
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s.\n"  << device_op.GetTypeString() << std::endl;
+    }
+
+    if(do_verification)
+    {
+        invoker.Run(argument, StreamConfig{nullptr, false, 0, 0, 1});
+
+        Tensor<float> a_t_k({tokens, K});
+        Tensor<float> b_e_n_k({experts, K, N * 2});
+        e_device_buf.FromDevice(e_t_n_device_result.mData.data());
+
+        Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
+
+        //handle scale before ref.
+        for(int t = 0; t < tokens; ++t)
+        {
+            for(int k = 0; k < K; ++k)
+            {
+                a_t_k(t, k) = ck::type_convert<float>(a0_t_k(t, k)) *
+                                    a1_t_k(t, k / Scale_Block_K);
+            }
+        }
+
+        for(int e = 0; e < experts; ++e)
+        {
+            for(int k = 0; k < K; ++k)
+            {
+                for(int n = 0; n < N * 2; ++n)
+                {
+                    b_e_n_k(e, k, n) = ck::type_convert<float>(b0_e_n_k(e, k, n)) *
+                                       b1_e_n_k(e, k / Scale_Block_K, n / Scale_Block_N);
+                }
+            }
+        }
+        using ReferenceGemmInstance =
+            ck::tensor_operation::host::ReferenceMoeGemm1BlockScale<float,
+                                                                    float,
+                                                                    CShuffleDataType,
+                                                                    D2DataType,
+                                                                    AccDataType,
+                                                                    PassThrough,
+                                                                    PassThrough,
+                                                                    PassThrough,
+                                                                    ActOP,
+                                                                    MulRoutedWeight>;
+        auto ref_moe_gemm           = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_moe_gemm.MakeInvoker();
+
+        auto ref_argument = ref_moe_gemm.MakeArgument(sorted_token_ids,
+                                                      expert_ids,
+                                                      max_token_id,
+                                                      MPerBlock,
+                                                      a0_t_k,
+                                                      b0_e_n_k,
+                                                      c_t_k_n,
+                                                      d2_e_n,
+                                                      PassThrough{},
+                                                      PassThrough{},
+                                                      PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+        for(int m = 0; m < valid_size; ++m)
+        {
+
+            const int fuse_t  = sorted_token_ids.mData[m];
+            const int t       = fuse_t & 0xffffff;
+            const int topk_id = (fuse_t & 0xff000000) >> 24;
+
+            if(t >= tokens)
+            {
+                continue;
+            }
+            const int e = expert_ids(m / MPerBlock);
+            for(int n = 0; n < N; ++n)
+            {
+                e_t_n_host_result(t, topk_id, n) = ck::type_convert<EDataType>(c_t_k_n(t, topk_id, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_t_n_device_result.mData.data());
+
+        auto status = ck::utils::check_err(
+                   e_t_n_device_result, e_t_n_host_result, "Error: Incorrect results!", 1e-3, 5e-1)
+                   ? 0
+                   : 1;
+        if (status == 0){
+            printf("Validation Pass.\n");
+        }
+        return status;
+    }
+
+    return 0;
+}
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
index 7d34749a1a..b1e74fe00d 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
@@ -121,13 +121,13 @@ static constexpr ck::index_t Scale_Block_K = 128;
 #if 0
 static constexpr ck::index_t MPerBlock = 32;
 static constexpr ck::index_t BLOCKSIZE = 256;
-static constexpr ck::index_t MXDLPerWave = 1;
-static constexpr ck::index_t NXDLPerWave = 1;
+static constexpr ck::index_t MXDLPerWave = 2;
+static constexpr ck::index_t NXDLPerWave = 2;
 static constexpr ck::index_t NPerBlock   = 128;
-static constexpr ck::index_t MNPerXDL    = 32;
+static constexpr ck::index_t MNPerXDL    = 16;
 static constexpr ck::index_t KPerBlock   = 256 / sizeof(A0DataType);
 
-static constexpr ck::index_t CShuffleNLane = 32;
+static constexpr ck::index_t CShuffleNLane = 16;
 static constexpr ck::index_t CShuffleMLane = BLOCKSIZE / CShuffleNLane;
 static constexpr ck::index_t AK1           = 16 / sizeof(A0DataType);
 static constexpr ck::index_t BK1           = 16 / sizeof(B0DataType);
@@ -147,10 +147,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmBlockScale<
                AK1,   BK1,
                MNPerXDL,   MNPerXDL,
                MXDLPerWave,  NXDLPerWave,
-               S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
-               S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
-               1,        1,         S<1, CShuffleMLane, 1, CShuffleNLane>, S<EVec, D0Vec, D1Vec, D2Vec>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, false, false, A0DataType>;
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
+               2,        2,         S<1, CShuffleMLane, 1, CShuffleNLane>, S<EVec, D0Vec, D1Vec, D2Vec>,
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, 0, false, false, false, int32_t, A0DataType>;
 
 #else
 static constexpr ck::index_t MPerBlock = 128; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmBlockScale<
@@ -160,12 +160,12 @@ static constexpr ck::index_t MPerBlock = 128; using DeviceOpInstance = ck::tenso
                256,  Scale_Block_M, Scale_Block_N, Scale_Block_K,
                MPerBlock,   128,    128,
                16,   16,
-               32,   32,
-               2,    2,
+               16,   16,
+               4,    4,
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
-               1,    1,   S<1, 32, 1, 8>, S<2, 1, 1, 1>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, false, false, A0DataType>;
+               2,    2,   S<1, 32, 1, 8>, S<2, 1, 1, 1>,
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, 0, false, false, false, int32_t, A0DataType>;
 #endif
 // clang-format on
 
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 6428c9779c..105cd65a33 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -170,7 +170,7 @@
 #define CK_USE_PK4_LAYOUT_SHUFFLE 1
 
 // using .co compiled shader for moe_stage2_blockscale
-#define CK_USE_ASM_MOE_STAGE2_BLOCKSCALE 1
+#define CK_USE_ASM_MOE_STAGE2_BLOCKSCALE 0
 
 // block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
 #define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
index c2fad6132f..9e9978330b 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
@@ -601,7 +601,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
                                                             make_tuple(n0, I0, k0, ik))>{}];
                                 });
 
-                                using mfma_input_type =
+                                using mfma_input_type =xdlops_gemm
                                     typename vector_type<ComputeDataType,
                                                          xdlops_gemm.K1PerXdlops>::type;
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v3.hpp
new file mode 100644
index 0000000000..d1aa51bbc5
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v3.hpp
@@ -0,0 +1,1193 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Compute optimized pipeline
+// GlobalPrefetchStages: 2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 1
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MScaleBlock,
+          index_t NScaleBlock,
+          index_t KScaleBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPacks>
+struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v3
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MScaleBlock,
+          index_t NScaleBlock,
+          index_t KScaleBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v3<
+    BlockGemmPipelineScheduler::Intrawave,
+    BlockSize,
+    ADataType,
+    BDataType,
+    ComputeDataType,
+    AccDataType,
+    ATileDesc,
+    BTileDesc,
+    AMmaTileDesc,
+    BMmaTileDesc,
+    ABlockTransferSrcScalarPerVector,
+    BBlockTransferSrcScalarPerVector,
+    MPerBlock,
+    NPerBlock,
+    KPerBlock,
+    MScaleBlock,
+    NScaleBlock,
+    KScaleBlock,
+    MPerXDL,
+    NPerXDL,
+    MRepeat,
+    NRepeat,
+    KPack> : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                               ADataType,
+                                               BDataType,
+                                               ComputeDataType,
+                                               AccDataType,
+                                               ATileDesc,
+                                               BTileDesc,
+                                               AMmaTileDesc,
+                                               BMmaTileDesc,
+                                               ABlockTransferSrcScalarPerVector,
+                                               BBlockTransferSrcScalarPerVector,
+                                               MPerBlock,
+                                               NPerBlock,
+                                               KPerBlock,
+                                               MPerXDL,
+                                               NPerXDL,
+                                               MRepeat,
+                                               NRepeat,
+                                               KPack,
+                                               true>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack,
+                                                   true>;
+    using Base::A_K1;
+    using Base::B_K1;
+    using Base::I0;
+    using Base::I1;
+    using Base::I2;
+    using Base::KGroup;
+    using Base::KRepeat;
+    using Base::xdlops_gemm;
+    using typename Base::HotLoopInstList;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MWaves;
+
+    static constexpr index_t PrefetchStages        = 2;
+    static constexpr index_t PrefillStages         = 1;
+    static constexpr index_t GlobalBufferNum       = 1;
+    static constexpr index_t HotloopLocalBufSwitch = MRepeat % 2 == 0 ? 0 : 1;
+
+    template <typename TileDesc_M0_M1_M2_K>
+    __host__ __device__ static constexpr auto MakeAGemmMmaTileDescriptor(const TileDesc_M0_M1_M2_K&)
+    {
+        constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
+        constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
+        constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
+        constexpr index_t K2 = KPack / KGroup;
+        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K0 = KRepeat * KGroup;
+
+        return transform_tensor_descriptor(
+            TileDesc_M0_M1_M2_K{},
+            make_tuple(
+                make_pass_through_transform(Number<M0>{}),
+                make_pass_through_transform(Number<M1>{}),
+                make_pass_through_transform(Number<M2>{}),
+                make_unmerge_transform(make_tuple(Number<K0>{}, Number<K1>{}, Number<K2>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}));
+    }
+
+    static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2 =
+        MakeAGemmMmaTileDescriptor(a_block_desc_m0_m1_m2_k);
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd;
+    }
+
+    __device__ static constexpr auto HotLoopScheduler()
+    {
+        // A/B split schedule
+        // compiler is likely to use ds_read2 when instruction width smaller than 16bytes
+        constexpr auto num_ds_read_inst_a =
+            HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16
+                ? HotLoopInstList::A_LDS_Read_Inst_Num
+                : HotLoopInstList::A_LDS_Read_Inst_Num / 2;
+
+        constexpr auto num_ds_write_inst_a = HotLoopInstList::A_LDS_Write_Inst_Num;
+
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num * 2;
+
+        static_assert(num_buffer_load_inst_a == num_ds_write_inst_a);
+
+        constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num * 2;
+        constexpr auto mfma_cycle    = HotLoopInstList::C_MFMA_Inst_Cycle;
+
+        constexpr auto ds_read_a_issue_cycle =
+            HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
+        constexpr auto ds_read_a_mfma_rate =
+            math::integer_divide_ceil(mfma_cycle - 4, 2 * ds_read_a_issue_cycle);
+
+        // constexpr auto num_dsread_a_mfma =
+        //     (num_ds_read_inst_a + ds_read_a_mfma_rate - 1) / ds_read_a_mfma_rate;
+
+        constexpr auto num_total_stages = MRepeat;
+
+        // Group num_mfma_perstage num_ds_read_a_perstage
+        // since we want to reuse a local register buffer
+        constexpr auto num_mfma_perstage      = num_mfma_inst / num_total_stages;
+        constexpr auto num_ds_read_a_perstage = num_ds_read_inst_a / num_total_stages;
+
+        constexpr auto num_ds_read_a_mfma_perstage =
+            math::integer_divide_ceil(num_ds_read_a_perstage, ds_read_a_mfma_rate);
+
+        constexpr auto num_ds_read_a_prefetch_stages = 2;
+
+        constexpr auto buffer_load_perstage_more = math::integer_divide_ceil(
+            (num_buffer_load_inst_a + num_buffer_load_inst_b), (num_total_stages - 2));
+        constexpr auto buffer_load_perstage_less = math::integer_divide_floor(
+            (num_buffer_load_inst_a + num_buffer_load_inst_b), (num_total_stages - 2));
+
+        constexpr auto buffer_load_stages_more =
+            (num_buffer_load_inst_a + num_buffer_load_inst_b) -
+            math::integer_divide_floor((num_buffer_load_inst_a + num_buffer_load_inst_b),
+                                       (num_total_stages - 2)) *
+                ((num_total_stages - 2));
+
+        constexpr auto buffer_load_b_stages =
+            buffer_load_perstage_more * buffer_load_stages_more > num_buffer_load_inst_b
+                ? num_buffer_load_inst_b / buffer_load_perstage_more
+                : (buffer_load_stages_more +
+                   (num_buffer_load_inst_b - buffer_load_perstage_more * buffer_load_stages_more) /
+                       buffer_load_perstage_less);
+
+        constexpr auto buffer_load_a_stages =
+            num_total_stages - num_ds_read_a_prefetch_stages - buffer_load_b_stages;
+
+        constexpr auto buffer_load_issue_point_b = 0;
+        constexpr auto buffer_load_issue_point_interval_more =
+            num_mfma_perstage / buffer_load_perstage_more;
+        constexpr auto buffer_load_issue_point_interval_less =
+            num_mfma_perstage / buffer_load_perstage_less;
+        constexpr auto ds_write_issue_point      = 0;
+        constexpr auto buffer_load_issue_point_a = num_mfma_perstage >= 3 ? 1 : 0;
+
+        // B global read
+        static_for<0, buffer_load_b_stages, 1>{}([&](auto i) {
+            // Scale load, 1B
+            if constexpr (i.value==0){
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            }
+            // Scale load, 1A
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+
+            static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+
+                if constexpr(((i < buffer_load_stages_more) &&
+                              (imfma % buffer_load_issue_point_interval_more ==
+                               buffer_load_issue_point_b)) ||
+                             ((i >= buffer_load_stages_more) &&
+                              (imfma % buffer_load_issue_point_interval_less ==
+                               buffer_load_issue_point_b)))
+                {
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                }
+
+                if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
+                {
+                    __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
+                }
+                __builtin_amdgcn_sched_group_barrier(0x800, 2, 0); // v_pk_fma
+            });
+            // __builtin_amdgcn_sched_barrier(0);
+        });
+
+        // A global read + A local write
+        static_for<0, buffer_load_a_stages, 1>{}([&](auto i) {
+            // Scale load, 1A
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+            static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                if constexpr((((i + buffer_load_b_stages) < buffer_load_stages_more) &&
+                              (imfma % buffer_load_issue_point_interval_more ==
+                               ds_write_issue_point)) ||
+                             (((i + buffer_load_b_stages) >= buffer_load_stages_more) &&
+                              (imfma % buffer_load_issue_point_interval_less ==
+                               ds_write_issue_point)))
+                {
+                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                }
+                if constexpr((((i + buffer_load_b_stages) < buffer_load_stages_more) &&
+                              (imfma % buffer_load_issue_point_interval_more ==
+                               buffer_load_issue_point_a)) ||
+                             (((i + buffer_load_b_stages) >= buffer_load_stages_more) &&
+                              (imfma % buffer_load_issue_point_interval_less ==
+                               buffer_load_issue_point_a)))
+                {
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                }
+                if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
+                {
+                    __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
+                }
+                __builtin_amdgcn_sched_group_barrier(0x800, 2, 0); // v_pk_fma
+            });
+            // __builtin_amdgcn_sched_barrier(0);
+        });
+
+        // lds synchronization, prefetch next loop local A
+        static_for<0, num_ds_read_a_prefetch_stages, 1>{}([&](auto i) {
+            ignore = i;
+            static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                // Scale load, 1A
+                if constexpr(imfma == 0){
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                }
+
+                if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
+                {
+                    __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
+                }
+                __builtin_amdgcn_sched_group_barrier(0x800, 2, 0); // v_pk_fma
+            });
+            // __builtin_amdgcn_sched_barrier(0);
+        });
+    }
+
+    template <bool HasMainLoop,
+              int NumKBlockPerScale,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CScaleThreadDesc,
+              typename CThreadBuffer,
+              typename AScaleGridBuffer,
+              typename AScaleGridDesc,
+              typename AScaleThreadDesc,
+              typename AScaleThreadTransfer,
+              typename AScaleThreadTransferStep,
+              typename BScaleGridBuffer,
+              typename BScaleGridDesc,
+              typename BScaleThreadDesc,
+              typename BScaleThreadTransfer,
+              typename BScaleThreadTransferStep>
+    __device__ void Run(
+        // ABlockCopy
+        const AGridDesc& a_grid_desc,
+        const ABlockDesc& a_block_desc,
+        ABlockTransfer& a_blockwise_copy,
+        const AGridBuffer& a_grid_buf,
+        ABlockBuffer& a_block_buf,
+        const ABlockTransferStep& a_block_copy_step,
+        // BBlockCopy
+        const BGridDesc& b_grid_desc,
+        const BBlockDesc& b_block_desc,
+        BBlockTransfer& b_blockwise_copy,
+        BBlockTransfer& b_blockwise_copy_up,
+        const BGridBuffer& b_grid_buf,
+        const BGridBuffer& b_grid_buf_up,
+        BBlockBuffer& b_block_buf,
+        const BBlockTransferStep& b_block_copy_step,
+        // CThread
+        const CScaleThreadDesc& c_scale_thread_desc,
+        CThreadBuffer& c_thread_buf,
+        CThreadBuffer& c_thread_buf_up,
+        // AScaleThreadCopy
+        const AScaleGridDesc& a_scale_grid_desc,
+        const AScaleThreadDesc& a_scale_thread_desc,
+        AScaleThreadTransfer& a_scale_thread_copy,
+        const AScaleGridBuffer& a_scale_grid_buf,
+        const AScaleThreadTransferStep& a_scale_thread_copy_step,
+        // BScaleThreadCopy
+        const BScaleGridDesc& b_scale_grid_desc,
+        const BScaleThreadDesc& b_scale_thread_desc,
+        BScaleThreadTransfer& b_scale_thread_copy,
+        BScaleThreadTransfer& b_scale_thread_copy_up,
+        const BScaleGridBuffer& b_scale_grid_buf,
+        const BScaleGridBuffer& b_scale_grid_buf_up,
+        const BScaleThreadTransferStep& b_scale_thread_copy_step,
+        // num_loop
+        index_t num_loop) const
+    {
+        ignore = b_block_desc;
+        ignore = b_block_buf;
+        __builtin_amdgcn_sched_barrier(0);
+        static_assert(CScaleThreadDesc{}.GetLength(Number<0>{}) == 1,
+                      "Pipeline v3 only support scaleblocksliceK=1");
+        static_assert(CScaleThreadDesc{}.GetLength(Number<2>{}) == 1,
+                      "Pipeline v3 only support scaleblocksliceN=1");
+        // assume kperblock = scaleblockk
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs_up;
+        constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0);
+
+        auto a_scale_thread_buf           = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
+            a_scale_thread_desc.GetElementSpaceSize());
+        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
+            b_scale_thread_desc.GetElementSpaceSize());
+        auto c_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
+            c_scale_thread_desc.GetElementSpaceSize());
+        auto c_scale_thread_buf_up = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
+            c_scale_thread_desc.GetElementSpaceSize());
+
+        StaticallyIndexedArray<decltype(a_scale_thread_buf), Number<2>{}> a_scale_thread_bufs;
+        StaticallyIndexedArray<decltype(b_scale_thread_buf), Number<2>{}> b_scale_thread_bufs;
+        StaticallyIndexedArray<decltype(b_scale_thread_buf), Number<2>{}> b_scale_thread_bufs_up;
+        // StaticallyIndexedArray<decltype(c_scale_thread_buf), Number<2>{}> c_scale_thread_bufs;
+
+        // Global prefetch A1 B1, AScale1 BScale1
+        b_blockwise_copy.Run(b_grid_desc,
+                             b_grid_buf,
+                             b_block_desc_n0_n1_k0_k1,
+                             b_block_origin_idx,
+                             b_thread_bufs(I0));
+
+        b_blockwise_copy_up.Run(b_grid_desc,
+                                b_grid_buf_up,
+                                b_block_desc_n0_n1_k0_k1,
+                                b_block_origin_idx,
+                                b_thread_bufs_up(I0));
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+        b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        __builtin_amdgcn_sched_barrier(0);
+
+        a_scale_thread_copy.Run(a_scale_grid_desc,
+                                a_scale_grid_buf,
+                                a_scale_thread_desc,
+                                make_tuple(I0, I0),
+                                a_scale_thread_bufs(I0));
+
+        if constexpr(NumKBlockPerScale == 1)
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<1>{}));
+        }
+        else
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<0>{}));
+        }
+
+        b_scale_thread_copy.Run(b_scale_grid_desc,
+                                b_scale_grid_buf,
+                                b_scale_thread_desc,
+                                make_tuple(I0, I0),
+                                b_scale_thread_bufs(I0));
+
+        b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
+        
+        b_scale_thread_copy_up.Run(b_scale_grid_desc,
+                                b_scale_grid_buf_up,
+                                b_scale_thread_desc,
+                                make_tuple(I0, I0),
+                                b_scale_thread_bufs_up(I0));
+
+        b_scale_thread_copy_up.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
+
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            c_scale_thread_buf(m0) = a_scale_thread_bufs[I0][m0] * b_scale_thread_bufs[I0][I0];
+        });
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            c_scale_thread_buf_up(m0) = a_scale_thread_bufs[I0][m0] * b_scale_thread_bufs_up[I0][I0];
+        });
+
+        // Local prefill A1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I0));
+
+        // Global prefetch A2, AScale2 BScale2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+        a_scale_thread_copy.Run(a_scale_grid_desc,
+                                a_scale_grid_buf,
+                                a_scale_thread_desc,
+                                make_tuple(I0, I0),
+                                a_scale_thread_bufs(I0));
+
+        if constexpr(NumKBlockPerScale == 1)
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<1>{}));
+        }
+        else
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<0>{}));
+        }
+
+        b_scale_thread_copy.Run(b_scale_grid_desc,
+                                b_scale_grid_buf,
+                                b_scale_thread_desc,
+                                make_tuple(I0, I0),
+                                b_scale_thread_bufs(I0));
+
+        b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
+
+        b_scale_thread_copy_up.Run(b_scale_grid_desc,
+                                b_scale_grid_buf_up,
+                                b_scale_thread_desc,
+                                make_tuple(I0, I0),
+                                b_scale_thread_bufs_up(I0));
+
+        b_scale_thread_copy_up.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
+
+        // Initialize C
+        c_thread_buf.Clear();
+        c_thread_buf_up.Clear();
+
+        // Double register buffer for non-scaled gemm computation
+        // 1. Reduce register pressure
+        // 2. Decouple the dependency between mfma instruction and scale-fma instruction following.
+        StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
+                                  AccDataType,
+                                  1,
+                                  xdlops_gemm.GetRegSizePerXdlops(),
+                                  true>
+            c_thread_buf_per_scale;
+        StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
+                                  AccDataType,
+                                  1,
+                                  xdlops_gemm.GetRegSizePerXdlops(),
+                                  true>
+            c_thread_buf_per_scale_up;
+
+        // Local prefetch A1
+        block_sync_lds();
+        static_for<0, 2, 1>{}([&](auto m0) {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, KGroup, 1>{}([&](auto kg0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                       make_tuple(m0, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                       a_block_buf.At(I0),
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                       a_thread_buf);
+                });
+            });
+        });
+
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) {
+                    b_blockwise_copy.Run(b_grid_desc,
+                                         b_grid_buf,
+                                         b_block_desc_n0_n1_k0_k1,
+                                         b_block_origin_idx,
+                                         b_thread_bufs(local_read_buf));
+                    b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+                    b_blockwise_copy_up.Run(b_grid_desc,
+                                            b_grid_buf_up,
+                                            b_block_desc_n0_n1_k0_k1,
+                                            b_block_origin_idx,
+                                            b_thread_bufs_up(local_read_buf));
+                    b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                    a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(local_read_buf));
+                    a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+                    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+                    a_scale_thread_copy.Run(a_scale_grid_desc,
+                                            a_scale_grid_buf,
+                                            a_scale_thread_desc,
+                                            make_tuple(I0, I0),
+                                            a_scale_thread_bufs(local_read_buf));
+
+                    if constexpr(NumKBlockPerScale == 1)
+                    {
+                        a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                            a_scale_thread_copy_step.At(Number<1>{}));
+                    }
+                    else
+                    {
+                        a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                            a_scale_thread_copy_step.At(Number<0>{}));
+                    }
+                    b_scale_thread_copy.Run(b_scale_grid_desc,
+                                            b_scale_grid_buf,
+                                            b_scale_thread_desc,
+                                            make_tuple(I0, I0),
+                                            b_scale_thread_bufs(local_read_buf));
+
+                    b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                           b_scale_thread_copy_step);
+
+                    b_scale_thread_copy_up.Run(b_scale_grid_desc,
+                                            b_scale_grid_buf_up,
+                                            b_scale_thread_desc,
+                                            make_tuple(I0, I0),
+                                            b_scale_thread_bufs_up(local_read_buf));
+
+                    b_scale_thread_copy_up.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                           b_scale_thread_copy_step);
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        vector_type<AccDataType, 2> c_scale_thread_vec;
+                        c_scale_thread_vec.template AsType<AccDataType>()(Number<0>{}) =
+                            c_scale_thread_buf[m0];
+                        c_scale_thread_vec.template AsType<AccDataType>()(Number<1>{}) =
+                            c_scale_thread_buf[m0];
+                        vector_type<AccDataType, 2> c_scale_thread_vec_up;
+                        c_scale_thread_vec_up.template AsType<AccDataType>()(Number<0>{}) =
+                            c_scale_thread_buf_up[m0];
+                        c_scale_thread_vec_up.template AsType<AccDataType>()(Number<1>{}) =
+                            c_scale_thread_buf_up[m0];
+
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                    .template AsType<AccDataType>()(Number<t>{}) = 0;
+                                c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                                    .template AsType<AccDataType>()(Number<t>{}) = 0;
+                            });
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                vector_type<ComputeDataType, KPack> a_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                                static_for<0, KPack, 1>{}([&](auto ik) {
+                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                            make_tuple((m0 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                                           2,
+                                                       I0,
+                                                       I0,
+                                                       k0,
+                                                       I0,
+                                                       ik))>{}];
+                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_bufs[mfma_reg_buf]
+                                                     [Number<b_thread_desc_.CalculateOffset(
+                                                         make_tuple(n0, I0, k0, ik))>{}];
+
+                                    b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_bufs_up[mfma_reg_buf]
+                                                     [Number<b_thread_desc_.CalculateOffset(
+                                                         make_tuple(n0, I0, k0, ik))>{}];
+                                });
+
+                                using mfma_input_type =
+                                    typename vector_type<ComputeDataType,
+                                                         xdlops_gemm.K1PerXdlops>::type;
+
+                                xdlops_gemm.template Run<>(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
+                                xdlops_gemm.template Run<>(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec_up.template AsType<mfma_input_type>(),
+                                    c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{}));
+                            });
+
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                            static_for<0, xdlops_gemm.GetRegSizePerXdlops() / 2, 1>{}([&](auto t) {
+                                using pk_fma_type = typename vector_type<AccDataType, 2>::type;
+
+                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                                    .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
+                                    c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                        .template AsType<pk_fma_type>()[t],
+                                    c_scale_thread_vec.template AsType<pk_fma_type>()[Number<0>{}],
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                                        .template AsType<pk_fma_type>()[t]);
+                                c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                                    .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
+                                    c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                                        .template AsType<pk_fma_type>()[t],
+                                    c_scale_thread_vec_up.template AsType<pk_fma_type>()[Number<0>{}],
+                                    c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                                        .template AsType<pk_fma_type>()[t]);
+                            });
+                        });
+
+                        if constexpr(m0.value == (MRepeat - 2))
+                        {
+                            block_sync_lds();
+
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                static_for<0, KGroup, 1>{}([&](auto kg0) {
+                                    a_thread_copy_.Run(
+                                        a_block_desc_m0_m1_m2_k0_k1_k2,
+                                        make_tuple(Number<(m0 + 2) % MRepeat>{},
+                                                   I0,
+                                                   I0,
+                                                   Number<k0 * KGroup + kg0>{},
+                                                   I0,
+                                                   I0),
+                                        a_block_buf.At(local_read_buf),
+                                        a_thread_desc_,
+                                        make_tuple(
+                                            Number<(m0 + 2 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                                   2>{},
+                                            I0,
+                                            I0,
+                                            k0,
+                                            I0,
+                                            Number<kg0 * A_K1>{}),
+                                        a_thread_buf);
+                                });
+                            });
+                        }
+                        else if constexpr(m0.value == (MRepeat - 1))
+                        {
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                static_for<0, KGroup, 1>{}([&](auto kg0) {
+                                    a_thread_copy_.Run(
+                                        a_block_desc_m0_m1_m2_k0_k1_k2,
+                                        make_tuple(Number<(m0 + 2) % MRepeat>{},
+                                                   I0,
+                                                   I0,
+                                                   Number<k0 * KGroup + kg0>{},
+                                                   I0,
+                                                   I0),
+                                        a_block_buf.At(local_read_buf),
+                                        a_thread_desc_,
+                                        make_tuple(
+                                            Number<(m0 + 2 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                                   2>{},
+                                            I0,
+                                            I0,
+                                            k0,
+                                            I0,
+                                            Number<kg0 * A_K1>{}),
+                                        a_thread_buf);
+                                });
+                            });
+                        }
+                        else
+                        {
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                static_for<0, KGroup, 1>{}([&](auto kg0) {
+                                    a_thread_copy_.Run(
+                                        a_block_desc_m0_m1_m2_k0_k1_k2,
+                                        make_tuple(Number<(m0 + 2) % MRepeat>{},
+                                                   I0,
+                                                   I0,
+                                                   Number<k0 * KGroup + kg0>{},
+                                                   I0,
+                                                   I0),
+                                        a_block_buf.At(mfma_reg_buf),
+                                        a_thread_desc_,
+                                        make_tuple(
+                                            Number<(m0 + 2 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                                   2>{},
+                                            I0,
+                                            I0,
+                                            k0,
+                                            I0,
+                                            Number<kg0 * A_K1>{}),
+                                        a_thread_buf);
+                                });
+                            });
+                        }
+                    });
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        c_scale_thread_buf(m0) = a_scale_thread_bufs[mfma_reg_buf][m0] * b_scale_thread_bufs[mfma_reg_buf][I0];
+                        c_scale_thread_buf_up(m0) = a_scale_thread_bufs[mfma_reg_buf][m0] * b_scale_thread_bufs_up[mfma_reg_buf][I0];
+                    });
+
+                    HotLoopScheduler();
+                    __builtin_amdgcn_sched_barrier(0);
+                };
+
+                LoopFunc(I0, I1);
+                LoopFunc(I1, I0);
+
+                i += 2;
+            } while(i < (num_loop - 2));
+        }
+
+        // tail
+        if constexpr(TailNum == TailNumber::Even)
+        {
+            b_blockwise_copy.Run(b_grid_desc,
+                                 b_grid_buf,
+                                 b_block_desc_n0_n1_k0_k1,
+                                 b_block_origin_idx,
+                                 b_thread_bufs(I1));
+            b_blockwise_copy_up.Run(b_grid_desc,
+                                 b_grid_buf_up,
+                                 b_block_desc_n0_n1_k0_k1,
+                                 b_block_origin_idx,
+                                 b_thread_bufs_up(I1));
+            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1));
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {                                                       
+                vector_type<AccDataType, 2> c_scale_thread_vec;
+                c_scale_thread_vec.template AsType<AccDataType>()(Number<0>{}) =
+                    c_scale_thread_buf[m0];
+                c_scale_thread_vec.template AsType<AccDataType>()(Number<1>{}) =
+                    c_scale_thread_buf[m0];
+                vector_type<AccDataType, 2> c_scale_thread_vec_up;
+                c_scale_thread_vec_up.template AsType<AccDataType>()(Number<0>{}) =
+                    c_scale_thread_buf_up[m0];
+                c_scale_thread_vec_up.template AsType<AccDataType>()(Number<1>{}) =
+                    c_scale_thread_buf_up[m0];
+
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                        c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                            .template AsType<AccDataType>()(Number<t>{}) = 0;
+                        c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                            .template AsType<AccDataType>()(Number<t>{}) = 0;
+                    });
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0 % 2, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                            b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs_up[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        xdlops_gemm.template Run<>(
+                            a_thread_vec.template AsType<mfma_input_type>(),
+                            b_thread_vec.template AsType<mfma_input_type>(),
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
+                        xdlops_gemm.template Run<>(
+                            a_thread_vec.template AsType<mfma_input_type>(),
+                            b_thread_vec_up.template AsType<mfma_input_type>(),
+                            c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{}));
+                    });
+
+                    constexpr index_t c_offset =
+                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                    static_for<0, xdlops_gemm.GetRegSizePerXdlops() / 2, 1>{}([&](auto t) {
+                        using pk_fma_type = typename vector_type<AccDataType, 2>::type;
+
+                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                            .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                .template AsType<pk_fma_type>()[t],
+                            c_scale_thread_vec.template AsType<pk_fma_type>()[Number<0>{}],
+                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                                .template AsType<pk_fma_type>()[t]);
+                        c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                            .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
+                            c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                                .template AsType<pk_fma_type>()[t],
+                            c_scale_thread_vec_up.template AsType<pk_fma_type>()[Number<0>{}],
+                            c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                                .template AsType<pk_fma_type>()[t]);
+                    });
+                });
+
+                if constexpr(m0.value == (MRepeat - 2))
+                {
+                    block_sync_lds();
+
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        static_for<0, KGroup, 1>{}([&](auto kg0) {
+                            a_thread_copy_.Run(
+                                a_block_desc_m0_m1_m2_k0_k1_k2,
+                                make_tuple(Number<(m0 + 2) % MRepeat>{},
+                                           I0,
+                                           I0,
+                                           Number<k0 * KGroup + kg0>{},
+                                           I0,
+                                           I0),
+                                a_block_buf.At(I1),
+                                a_thread_desc_,
+                                make_tuple(
+                                    Number<(m0 + 2) % 2>{}, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                a_thread_buf);
+                        });
+                    });
+                }
+                else if constexpr(m0.value == (MRepeat - 1))
+                {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        static_for<0, KGroup, 1>{}([&](auto kg0) {
+                            a_thread_copy_.Run(
+                                a_block_desc_m0_m1_m2_k0_k1_k2,
+                                make_tuple(Number<(m0 + 2) % MRepeat>{},
+                                           I0,
+                                           I0,
+                                           Number<k0 * KGroup + kg0>{},
+                                           I0,
+                                           I0),
+                                a_block_buf.At(I1),
+                                a_thread_desc_,
+                                make_tuple(
+                                    Number<(m0 + 2) % 2>{}, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                a_thread_buf);
+                        });
+                    });
+                }
+                else
+                {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        static_for<0, KGroup, 1>{}([&](auto kg0) {
+                            a_thread_copy_.Run(
+                                a_block_desc_m0_m1_m2_k0_k1_k2,
+                                make_tuple(Number<(m0 + 2) % MRepeat>{},
+                                           I0,
+                                           I0,
+                                           Number<k0 * KGroup + kg0>{},
+                                           I0,
+                                           I0),
+                                a_block_buf.At(I0),
+                                a_thread_desc_,
+                                make_tuple(
+                                    Number<(m0 + 2) % 2>{}, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                a_thread_buf);
+                        });
+                    });
+                }
+            });
+
+            HotLoopScheduler();
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                c_scale_thread_buf(m0) = a_scale_thread_bufs[I0][m0] * b_scale_thread_bufs[I0][I0];
+                c_scale_thread_buf_up(m0) = a_scale_thread_bufs[I0][m0] * b_scale_thread_bufs_up[I0][I0];
+            });
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                vector_type<AccDataType, 2> c_scale_thread_vec;
+                c_scale_thread_vec.template AsType<AccDataType>()(Number<0>{}) =
+                    c_scale_thread_buf[m0];
+                c_scale_thread_vec.template AsType<AccDataType>()(Number<1>{}) =
+                    c_scale_thread_buf[m0];
+                vector_type<AccDataType, 2> c_scale_thread_vec_up;
+                c_scale_thread_vec_up.template AsType<AccDataType>()(Number<0>{}) =
+                    c_scale_thread_buf_up[m0];
+                c_scale_thread_vec_up.template AsType<AccDataType>()(Number<1>{}) =
+                    c_scale_thread_buf_up[m0];
+
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                        c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                            .template AsType<AccDataType>()(Number<t>{}) = 0;
+                        c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                            .template AsType<AccDataType>()(Number<t>{}) = 0;
+                    });
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(make_tuple(
+                                    (m0 + HotloopLocalBufSwitch) % 2, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I1][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                            b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs_up[I1][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        xdlops_gemm.template Run<>(
+                            a_thread_vec.template AsType<mfma_input_type>(),
+                            b_thread_vec.template AsType<mfma_input_type>(),
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
+                        xdlops_gemm.template Run<>(
+                            a_thread_vec.template AsType<mfma_input_type>(),
+                            b_thread_vec_up.template AsType<mfma_input_type>(),
+                            c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{}));
+                    });
+                    constexpr index_t c_offset =
+                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                    static_for<0, xdlops_gemm.GetRegSizePerXdlops() / 2, 1>{}([&](auto t) {
+                        using pk_fma_type = typename vector_type<AccDataType, 2>::type;
+
+                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                            .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                .template AsType<pk_fma_type>()[t],
+                            c_scale_thread_vec.template AsType<pk_fma_type>()[Number<0>{}],
+                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                                .template AsType<pk_fma_type>()[t]);
+
+                        c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                            .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
+                            c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                                .template AsType<pk_fma_type>()[t],
+                            c_scale_thread_vec_up.template AsType<pk_fma_type>()[Number<0>{}],
+                            c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                                .template AsType<pk_fma_type>()[t]);
+                    });
+                });
+
+                if constexpr(m0.value < (MRepeat - 2))
+                {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        static_for<0, KGroup, 1>{}([&](auto kg0) {
+                            a_thread_copy_.Run(
+                                a_block_desc_m0_m1_m2_k0_k1_k2,
+                                make_tuple(
+                                    Number<m0 + 2>{}, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                a_block_buf.At(I1),
+                                a_thread_desc_,
+                                make_tuple(Number<(m0 + 2 + HotloopLocalBufSwitch) % 2>{},
+                                           I0,
+                                           I0,
+                                           k0,
+                                           I0,
+                                           Number<kg0 * A_K1>{}),
+                                a_thread_buf);
+                        });
+                    });
+                }
+            });
+            // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle
+            // latency
+            // // __builtin_amdgcn_sched_barrier(0);
+        }
+        else
+        {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                vector_type<AccDataType, 2> c_scale_thread_vec;
+                c_scale_thread_vec.template AsType<AccDataType>()(Number<0>{}) =
+                    c_scale_thread_buf[m0];
+                c_scale_thread_vec.template AsType<AccDataType>()(Number<1>{}) =
+                    c_scale_thread_buf[m0];
+                vector_type<AccDataType, 2> c_scale_thread_vec_up;
+                c_scale_thread_vec_up.template AsType<AccDataType>()(Number<0>{}) =
+                    c_scale_thread_buf_up[m0];
+                c_scale_thread_vec_up.template AsType<AccDataType>()(Number<1>{}) =
+                    c_scale_thread_buf_up[m0];
+
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                        c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                            .template AsType<AccDataType>()(Number<t>{}) = 0;
+                        c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                            .template AsType<AccDataType>()(Number<t>{}) = 0;
+                    });
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0 % 2, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                            b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs_up[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        xdlops_gemm.template Run<>(
+                            a_thread_vec.template AsType<mfma_input_type>(),
+                            b_thread_vec.template AsType<mfma_input_type>(),
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
+                        xdlops_gemm.template Run<>(
+                            a_thread_vec.template AsType<mfma_input_type>(),
+                            b_thread_vec_up.template AsType<mfma_input_type>(),
+                            c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{}));
+                    });
+                    constexpr index_t c_offset =
+                        c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                    static_for<0, xdlops_gemm.GetRegSizePerXdlops() / 2, 1>{}([&](auto t) {
+                        using pk_fma_type = typename vector_type<AccDataType, 2>::type;
+
+                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                            .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                .template AsType<pk_fma_type>()[t],
+                            c_scale_thread_vec.template AsType<pk_fma_type>()[Number<0>{}],
+                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                                .template AsType<pk_fma_type>()[t]);
+                        c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                            .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
+                            c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                                .template AsType<pk_fma_type>()[t],
+                            c_scale_thread_vec_up.template AsType<pk_fma_type>()[Number<0>{}],
+                            c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                                .template AsType<pk_fma_type>()[t]);
+                    });
+                });
+
+                if constexpr(m0.value < (MRepeat - 2))
+                {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        static_for<0, KGroup, 1>{}([&](auto kg0) {
+                            a_thread_copy_.Run(
+                                a_block_desc_m0_m1_m2_k0_k1_k2,
+                                make_tuple(
+                                    Number<m0 + 2>{}, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                a_block_buf.At(I0),
+                                a_thread_desc_,
+                                make_tuple(
+                                    Number<(m0 + 2) % 2>{}, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                a_thread_buf);
+                        });
+                    });
+                }
+            });
+        }
+    }
+
+    protected:
+    // MRepeat MWave MLane KRepeat KLane KPack
+    // KRepeat -> MRepeat-> Mwave->KLane->MLane->KPack
+    // Reduce the vgpr usage here.
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(I2, I1, I1, Number<KRepeat>{}, I1, Number<KPack>{}));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
+                                                         ComputeDataType,
+                                                         decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<1, 1, 1, 1, 1, KPack / KGroup>,
+                                                         Sequence<0, 1, 2, 3, 4, 5>,
+                                                         5,
+                                                         A_K1,
+                                                         A_K1>;
+
+    AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex6D()};
+
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}));
+
+    static constexpr BTileDesc b_block_desc_n0_n1_k0_k1;
+
+    using Base::c_thread_desc_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp
index 01d5ab7e73..6bff0373f4 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp
@@ -3,9 +3,11 @@
 
 #pragma once
 
+// #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v1.hpp"
 // #include
 // "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v2.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v3.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v3.hpp"
 namespace ck {
 
@@ -32,12 +34,16 @@ template <BlockGemmPipelineVersion BlkGemmPipelineVer,
           index_t NPerXDL,
           index_t MRepeat,
           index_t NRepeat,
-          index_t KPack>
+          index_t KPack,
+          bool GUFusion = false>
 constexpr auto BlockGemmBlockMoeScaleBPreshufflePipeline_Selector()
 {
     if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
     {
-        return BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v1<
+        #if 0
+        if constexpr(GUFusion)
+        {
+            return BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1<
             BlkGemmPipeSche,
             BlockSize,
             ADataType,
@@ -61,6 +67,35 @@ constexpr auto BlockGemmBlockMoeScaleBPreshufflePipeline_Selector()
             MRepeat,
             NRepeat,
             KPack>{};
+        }
+        else
+        #endif
+        {
+            return BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v1<
+            BlkGemmPipeSche,
+            BlockSize,
+            ADataType,
+            BDataType,
+            ComputeDataType,
+            AccDataType,
+            ATileDesc,
+            BTileDesc,
+            AMmaTileDesc,
+            BMmaTileDesc,
+            ABlockTransferSrcScalarPerVector,
+            BBlockTransferSrcScalarPerVector,
+            MPerBlock,
+            NPerBlock,
+            KPerBlock,
+            MScaleBlock,
+            NScaleBlock,
+            KScaleBlock,
+            MPerXDL,
+            NPerXDL,
+            MRepeat,
+            NRepeat,
+            KPack>{};
+        }
     }
 #if 0
     else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
@@ -91,30 +126,60 @@ constexpr auto BlockGemmBlockMoeScaleBPreshufflePipeline_Selector()
     else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
     {
         static_assert(MRepeat >= 4, "MRepeat should at least be 4 in BlockGemmPipelineVersion::v3");
-        return BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v3<
-            BlkGemmPipeSche,
-            BlockSize,
-            ADataType,
-            BDataType,
-            ComputeDataType,
-            AccDataType,
-            ATileDesc,
-            BTileDesc,
-            AMmaTileDesc,
-            BMmaTileDesc,
-            ABlockTransferSrcScalarPerVector,
-            BBlockTransferSrcScalarPerVector,
-            MPerBlock,
-            NPerBlock,
-            KPerBlock,
-            MScaleBlock,
-            NScaleBlock,
-            KScaleBlock,
-            MPerXDL,
-            NPerXDL,
-            MRepeat,
-            NRepeat,
-            KPack>{};
+        if constexpr(GUFusion)
+        {
+            return BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v3<
+                BlkGemmPipeSche,
+                BlockSize,
+                ADataType,
+                BDataType,
+                ComputeDataType,
+                AccDataType,
+                ATileDesc,
+                BTileDesc,
+                AMmaTileDesc,
+                BMmaTileDesc,
+                ABlockTransferSrcScalarPerVector,
+                BBlockTransferSrcScalarPerVector,
+                MPerBlock,
+                NPerBlock,
+                KPerBlock,
+                MScaleBlock,
+                NScaleBlock,
+                KScaleBlock,
+                MPerXDL,
+                NPerXDL,
+                MRepeat,
+                NRepeat,
+                KPack>{};
+        }
+        else
+        {
+            return BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v3<
+                BlkGemmPipeSche,
+                BlockSize,
+                ADataType,
+                BDataType,
+                ComputeDataType,
+                AccDataType,
+                ATileDesc,
+                BTileDesc,
+                AMmaTileDesc,
+                BMmaTileDesc,
+                ABlockTransferSrcScalarPerVector,
+                BBlockTransferSrcScalarPerVector,
+                MPerBlock,
+                NPerBlock,
+                KPerBlock,
+                MScaleBlock,
+                NScaleBlock,
+                KScaleBlock,
+                MPerXDL,
+                NPerXDL,
+                MRepeat,
+                NRepeat,
+                KPack>{};
+        }
     }
     else
     {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp
index af5f9c49ad..719375c114 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp
@@ -71,8 +71,11 @@ template <typename ALayout,
           typename CDEShuffleBlockTransferScalarPerVectors,
           BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
           BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          index_t ActivationOP                        = 0,
           bool NSwizzle                               = false,
           bool IsInputGemm                            = true,
+          bool MulRoutedWeight                        = false,
+          typename IndexType                          = index_t,
           typename ComputeTypeA                       = CDataType,
           typename ComputeTypeB                       = ComputeTypeA,
           typename LDSTypeA                           = ComputeTypeA,
@@ -146,7 +149,10 @@ struct DeviceMoeGemmBlockScale
                          CDEShuffleBlockTransferScalarPerVectors,
                          BlkGemmPipeSched,
                          BlkGemmPipelineVer,
+                         ActivationOP,
                          NSwizzle,
+                         IsInputGemm,
+                         MulRoutedWeight,
                          ComputeTypeA,
                          ComputeTypeB,
                          LDSTypeA,
@@ -154,6 +160,20 @@ struct DeviceMoeGemmBlockScale
 
     using Argument = typename GridwiseGemm::Argument;
 
+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+
     int GetPreShuffleParameters() override { return NPerXDL; }
 
     // Invoker
@@ -349,10 +369,10 @@ struct DeviceMoeGemmBlockScale
                     const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
                         arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
 
-                    auto size_a_buffer =
-                        a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType);
-                    auto size_b_buffer =
-                        b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
+                    auto size_a_buffer = a_grid_desc_ak0_m_ak1.GetElementSpaceSize() *
+                                         sizeof(ADataType) / APackedSize;
+                    auto size_b_buffer = b_grid_desc_bk0_n_bk1.GetElementSpaceSize() *
+                                         sizeof(BDataType) / BPackedSize;
 
                     const auto ds_grid_desc_m_n = GridwiseGemm::MakeDsGridDescriptor_M_N(
                         arg_.M, arg_.MPadded, arg_.N, arg_.NPadded, arg_.StrideDs);
@@ -412,8 +432,7 @@ struct DeviceMoeGemmBlockScale
 
             constexpr index_t minimum_occupancy = 2;
 
-            constexpr auto MemoryDataOp =
-                IsInputGemm ? InMemoryDataOperationEnum::Set : InMemoryDataOperationEnum::AtomicAdd;
+            constexpr auto MemoryDataOp = IsInputGemm  ? InMemoryDataOperationEnum::Set : InMemoryDataOperationEnum::AtomicAdd;
 
             #if CK_USE_ASM_MOE_STAGE2_BLOCKSCALE
             (void)minimum_occupancy;
@@ -486,7 +505,6 @@ struct DeviceMoeGemmBlockScale
                                                                 true,
                                                                 MemoryDataOp,
                                                                 minimum_occupancy,
-                                                                IsInputGemm,
                                                                 TailNumber::Odd>;
                             RunKernel(kernel);
                         }
@@ -496,7 +514,6 @@ struct DeviceMoeGemmBlockScale
                                                                 true,
                                                                 MemoryDataOp,
                                                                 minimum_occupancy,
-                                                                IsInputGemm,
                                                                 TailNumber::Even>;
                             RunKernel(kernel);
                         }
@@ -511,7 +528,6 @@ struct DeviceMoeGemmBlockScale
                                                                  true,
                                                                  MemoryDataOp,
                                                                  minimum_occupancy,
-                                                                 IsInputGemm,
                                                                  TailNumber::Odd>;
                         RunKernel(kernel);
                     }
@@ -521,7 +537,6 @@ struct DeviceMoeGemmBlockScale
                                                                  true,
                                                                  MemoryDataOp,
                                                                  minimum_occupancy,
-                                                                 IsInputGemm,
                                                                  TailNumber::Even>;
                         RunKernel(kernel);
                     }
@@ -543,7 +558,6 @@ struct DeviceMoeGemmBlockScale
                                                             false,
                                                             MemoryDataOp,
                                                             minimum_occupancy,
-                                                            IsInputGemm,
                                                             TailNumber::Odd>;
                         RunKernel(kernel);
                     }
@@ -553,7 +567,6 @@ struct DeviceMoeGemmBlockScale
                                                             false,
                                                             MemoryDataOp,
                                                             minimum_occupancy,
-                                                            IsInputGemm,
                                                             TailNumber::Even>;
                         RunKernel(kernel);
                     }
@@ -567,7 +580,6 @@ struct DeviceMoeGemmBlockScale
                                                                  false,
                                                                  MemoryDataOp,
                                                                  minimum_occupancy,
-                                                                 IsInputGemm,
                                                                  TailNumber::Odd>;
                         RunKernel(kernel);
                     }
@@ -577,7 +589,6 @@ struct DeviceMoeGemmBlockScale
                                                                  false,
                                                                  MemoryDataOp,
                                                                  minimum_occupancy,
-                                                                 IsInputGemm,
                                                                  TailNumber::Even>;
                         RunKernel(kernel);
                     }
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
index 3b2c59ba72..0455258668 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -26,12 +26,18 @@ namespace ck {
 // two lds chunks.
 // 2. Occupied __shared__ won't release until whole shader end, a.k.a AB and C may not use same lds
 // buffer when we declare __shared__ inside blkgemmpipe
+
+enum Activation
+{
+    gelu_and_mul = 0,
+    silu_and_mul = 1
+};
+
 template <typename GridwiseGemm,
           bool HasMainKBlockLoop,
           InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           index_t MinimumOccupancy = 1,
-          bool IsInputGemm         = false,
-          TailNumber TailNum       = TailNumber::Full>
+          TailNumber TailNum       = TailNumber::Even>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
@@ -44,7 +50,7 @@ __global__ void
 
     auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, IsInputGemm, TailNum>(
+    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
         karg.p_sorted_token_ids,
         karg.p_sorted_expert_ids,
         karg.p_max_token_id,
@@ -68,7 +74,6 @@ template <typename GridwiseGemm,
           bool HasMainKBlockLoop,
           InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           index_t MinimumOccupancy = 1,
-          bool IsInputGemm         = false,
           TailNumber TailNum       = TailNumber::Even>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -83,8 +88,7 @@ __global__ void
 
     auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-    GridwiseGemm::
-        template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, IsInputGemm, TailNum>(
+    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
             karg.p_sorted_token_ids,
             karg.p_sorted_expert_ids,
             karg.p_max_token_id,
@@ -154,7 +158,11 @@ template <typename ALayout,
           typename CDEShuffleBlockTransferScalarPerVectors,
           BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
           BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          index_t ActivationOperation                 = 0,
           bool NSwizzle                               = false,
+          bool IsInputGemm                            = true,
+          bool MulRoutedWeight                        = true,
+          typename IndexType                          = index_t,
           typename ComputeTypeA                       = CDataType,
           typename ComputeTypeB                       = ComputeTypeA,
           typename LDSTypeA                           = ADataType,
@@ -309,7 +317,7 @@ struct GridwiseMoeGemmBlockScale
     }
 
     __host__ __device__ static auto MakeAGridDescriptor_AK0_M_AK1(
-        index_t M, index_t MPad, index_t K, index_t KPad, index_t StrideA, index_t AK0)
+        IndexType M, IndexType MPad, IndexType K, IndexType KPad, IndexType StrideA, IndexType AK0)
     {
         const auto a_grid_desc_mraw_kraw = [&]() {
             if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
@@ -501,8 +509,8 @@ struct GridwiseMoeGemmBlockScale
     }
 
     template <typename ELayout>
-    __host__ __device__ static auto
-    MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC)
+    __host__ __device__ static auto MakeCGridDescriptor_M_N(
+        IndexType M, IndexType MPad, IndexType N, IndexType NPad, IndexType StrideC)
     {
         const auto c_grid_desc_mraw_nraw = [&]() {
             if constexpr(is_same<tensor_layout::gemm::RowMajor, ELayout>::value)
@@ -925,7 +933,8 @@ struct GridwiseMoeGemmBlockScale
                                 NPerXdl,
                                 MXdlPerWave,
                                 NXdlPerWave,
-                                KPack>())>;
+                                KPack,
+                                IsInputGemm>())>;
 
     __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
@@ -1157,8 +1166,7 @@ struct GridwiseMoeGemmBlockScale
 
     template <bool HasMainKBlockLoop,
               InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-              bool IsInputGemm   = true,
-              TailNumber TailNum = TailNumber::Odd>
+                            TailNumber TailNum = TailNumber::Odd>
     __device__ static void Run(const index_t* p_sorted_token_ids,
                                const index_t* p_sorted_expert_ids,
                                const index_t* p_max_token_id,
@@ -1198,7 +1206,7 @@ struct GridwiseMoeGemmBlockScale
                        math::integer_divide_ceil(problem.K, ScaleBlockK)),
             make_tuple(math::integer_divide_ceil(problem.K, ScaleBlockK), 1));
         const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor(
-            make_tuple(math::integer_divide_ceil(problem.N, ScaleBlockN),
+            make_tuple(math::integer_divide_ceil(problem.N * (IsInputGemm ? 2 : 1), ScaleBlockN),
                        math::integer_divide_ceil(problem.K, ScaleBlockK)),
             make_tuple(math::integer_divide_ceil(problem.K, ScaleBlockK), 1));
 
@@ -1247,7 +1255,7 @@ struct GridwiseMoeGemmBlockScale
 
         if(token_pos >= max_token_id || token0 >= problem.NumTokens)
             return;
-        StaticallyIndexedArray<index_t, AMRepeats> gather_offsets;
+        StaticallyIndexedArray<IndexType, AMRepeats> gather_offsets;
         static_for<0, AMRepeats, 1>{}([&](auto m0) {
             const index_t fused_token = p_sorted_token_ids[token_pos + m0];
             index_t token_offset      = fused_token & 0xffffff;
@@ -1255,11 +1263,11 @@ struct GridwiseMoeGemmBlockScale
             {
                 token_offset = token_offset * problem.TopK + (fused_token >> 24);
             }
-            gather_offsets(m0) = token_offset * problem.K;
+            gather_offsets(m0) = static_cast<IndexType>(token_offset) * problem.K;
         });
-        const index_t expert_stride = __builtin_amdgcn_readfirstlane(problem.N * problem.K);
+        const index_t expert_stride = __builtin_amdgcn_readfirstlane(problem.N * problem.K * (IsInputGemm ? 2 : 1));
         const index_t expert_scale_stride =
-            __builtin_amdgcn_readfirstlane(math::integer_divide_ceil(problem.N, ScaleBlockN) *
+            __builtin_amdgcn_readfirstlane(math::integer_divide_ceil(problem.N * (IsInputGemm ? 2 : 1), ScaleBlockN) *
                                            math::integer_divide_ceil(problem.K, ScaleBlockK));
 
         // N0, K0, Blocksize*KPack
@@ -1307,6 +1315,7 @@ struct GridwiseMoeGemmBlockScale
             1,
             AThreadTransferSrcResetCoordinateAfterRun,
             true,
+            IndexType,
             1,
             BlockwiseGemmPipe::GlobalBufferNum>(a_grid_desc_ak0_m_ak1,
                                                 make_multi_index(0, 0, 0),
@@ -1349,6 +1358,7 @@ struct GridwiseMoeGemmBlockScale
         static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
         auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
         auto c_thread_buf            = blockwise_gemm_pipeline.GetCThreadBuffer();
+        decltype(c_thread_buf) c_thread_buf_up;
 
         const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
             (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
@@ -1430,38 +1440,115 @@ struct GridwiseMoeGemmBlockScale
         constexpr auto b_scale_thread_slice_copy_step = make_multi_index(0, ScaleSliceSizeK);
 
         constexpr auto NumKBlockPerScale = math::integer_divide_ceil(ScaleBlockK, KPerBlock);
+        if constexpr(IsInputGemm)
+        {
+            const BDataType* p_b_grid_up = p_b_grid + expert_stride / 2 / BPackedSize;
+            const auto b_grid_buf_up     = make_dynamic_buffer<AddressSpaceEnum::Global>(
+                p_b_grid_up + expert_id * static_cast<long_index_t>(expert_stride) / BPackedSize,
+                b_grid_desc_bpreshuffled.GetElementSpaceSize());
+            auto b_blockwise_copy_up = ThreadwiseTensorSliceTransfer_v2<
+                BDataType,
+                BDataType,
+                decltype(b_grid_desc_bpreshuffled),
+                decltype(b_block_desc_bk0_n_bk1),
+                Sequence<Number<NXdlPerWave>{}, I1, Number<KRepeat>{}, Number<BK1Value>{}>,
+                Sequence<1, 2, 0, 3>,
+                3,
+                BBlockTransferSrcScalarPerVector,
+                BThreadTransferSrcResetCoordinateAfterRun,
+                true>(b_grid_desc_bpreshuffled,
+                      make_multi_index(n_block_data_idx_on_grid,
+                                       get_warp_local_1d_id() % NWave,
+                                       0,
+                                       KPack / KGroup * (get_thread_local_1d_id() % warpSize)));
+            const BDataType* p_b_scale_grid_up = p_b_scale_grid + expert_scale_stride / 2 / BPackedSize;
+            const auto b_scale_grid_buf_up = make_dynamic_buffer<AddressSpaceEnum::Global>(
+                p_b_scale_grid_up + expert_id * expert_scale_stride,
+                b_scale_grid_desc_bn_ak.GetElementSpaceSize());
+            auto b_scale_thread_copy_up =
+                ThreadwiseTensorSliceTransfer_v2<BScaleType,
+                                                BScaleType,
+                                                decltype(b_scale_grid_desc_bn_ak),
+                                                decltype(b_scale_thread_desc),
+                                                Sequence<ScaleSliceSizeN, ScaleSliceSizeK>,
+                                                Sequence<0, 1>,
+                                                1,
+                                                ScaleSliceSizeK,
+                                                1,
+                                                false>(
+                    b_scale_grid_desc_bn_ak, make_multi_index(block_n_id * NPerBlock / ScaleBlockN, 0));
 
-        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, NumKBlockPerScale, TailNum>(
-            a_grid_desc_ak0_m_ak1,
-            a_block_desc_ak0_m_ak1,
-            a_blockwise_copy,
-            a_grid_buf,
-            a_block_buf,
-            a_block_slice_copy_step,
+            blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
+                a_grid_desc_ak0_m_ak1,
+                a_block_desc_ak0_m_ak1,
+                a_blockwise_copy,
+                a_grid_buf,
+                a_block_buf,
+                a_block_slice_copy_step,
 
-            b_grid_desc_bpreshuffled,
-            b_block_desc_bk0_n_bk1,
-            b_blockwise_copy,
-            b_grid_buf,
-            b_block_buf,
-            b_block_slice_copy_step,
+                b_grid_desc_bpreshuffled,
+                b_block_desc_bk0_n_bk1,
+                b_blockwise_copy,
+                b_blockwise_copy_up,
+                b_grid_buf,
+                b_grid_buf_up,
+                b_block_buf,
+                b_block_slice_copy_step,
 
-            c_scale_thread_desc,
-            c_thread_buf,
+                c_scale_thread_desc,
+                c_thread_buf,
+                c_thread_buf_up,
 
-            a_scale_grid_desc_am_ak,
-            a_scale_thread_desc,
-            a_scale_thread_copy,
-            a_scale_grid_buf,
-            a_scale_thread_slice_copy_step,
+                a_scale_grid_desc_am_ak,
+                a_scale_thread_desc,
+                a_scale_thread_copy,
+                a_scale_grid_buf,
+                a_scale_thread_slice_copy_step,
 
-            b_scale_grid_desc_bn_ak,
-            b_scale_thread_desc,
-            b_scale_thread_copy,
-            b_scale_grid_buf,
-            b_scale_thread_slice_copy_step,
+                b_scale_grid_desc_bn_ak,
+                b_scale_thread_desc,
+                b_scale_thread_copy,
+                b_scale_thread_copy_up,
+                b_scale_grid_buf,
+                b_scale_grid_buf_up,
+                b_scale_thread_slice_copy_step,
 
-            num_k_block_main_loop);
+                num_k_block_main_loop);
+        }
+        else
+        {
+            blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, NumKBlockPerScale, TailNum>(
+                a_grid_desc_ak0_m_ak1,
+                a_block_desc_ak0_m_ak1,
+                a_blockwise_copy,
+                a_grid_buf,
+                a_block_buf,
+                a_block_slice_copy_step,
+
+                b_grid_desc_bpreshuffled,
+                b_block_desc_bk0_n_bk1,
+                b_blockwise_copy,
+                b_grid_buf,
+                b_block_buf,
+                b_block_slice_copy_step,
+
+                c_scale_thread_desc,
+                c_thread_buf,
+
+                a_scale_grid_desc_am_ak,
+                a_scale_thread_desc,
+                a_scale_thread_copy,
+                a_scale_grid_buf,
+                a_scale_thread_slice_copy_step,
+
+                b_scale_grid_desc_bn_ak,
+                b_scale_thread_desc,
+                b_scale_thread_copy,
+                b_scale_grid_buf,
+                b_scale_thread_slice_copy_step,
+
+                num_k_block_main_loop);
+        }
 
         // shuffle C and write out
         {
@@ -1477,7 +1564,7 @@ struct GridwiseMoeGemmBlockScale
                 blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4();
 
             // TODO: hacky, fix it!
-            // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
+            // c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp is only used to get lengths
             constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp =
                 blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4();
 
@@ -1489,6 +1576,69 @@ struct GridwiseMoeGemmBlockScale
             constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I5);
             constexpr auto N3 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I6);
             constexpr auto N4 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I7);
+            // if constexpr(IsInputGemm) // gu fusion
+            // {
+            //     static_assert(N0 * N1 * N2 * N3 * N4 == NPerBlock);
+            //     static_assert(N4 == 4);
+            //     const index_t n1 = get_warp_local_1d_id() / MWave;
+            //     const index_t n3 = threadIdx.y % get_warp_size() / NPerXdl;
+            //     vector_type<float, 4> topk_weights;
+            //     static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
+            //         static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
+            //             static_for<0, N2, 1>{}([&](auto n2) {      // m_inst_num_groups_per_blk
+            //                 const index_t m_pos = block_m_id * MPerBlock + n0 * N1 * N2 * N3 * N4 +
+            //                                         n1 * N2 * N3 * N4 + n2 * N3 * N4 + n3 * N4;
+            //                 if constexpr(MulRoutedWeight)
+            //                 {
+            //                     topk_weights = *c_style_pointer_cast<const vector_type<float, N4>*>(
+            //                         p_ds_grid[I0] + m_pos);
+            //                 }
+            //                 static_for<0, N4, 1>{}([&](auto n4) { // m_inst_group_size
+            //                         constexpr index_t c_offset =
+            //                             blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset(
+            //                                 make_tuple(m0, n0, n2 * N4 + n4));
+            //                         constexpr auto cidx = Number<c_offset>{};
+
+            //                         if constexpr(ActivationOperation == Activation::silu_and_mul)
+            //                         {
+            //                             float gate = c_thread_buf[cidx];
+            //                             float up   = c_thread_buf_up[cidx];
+            //                             if constexpr(MulRoutedWeight)
+            //                             {
+            //                                 gate = gate * topk_weights.AsType<float>()[n4];
+            //                                 up   = up * topk_weights.AsType<float>()[n4];
+            //                             }
+            //                             if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            //                             {
+            //                                 gate *= 16;
+            //                                 up *= 16;
+            //                             }
+            //                             tensor_operation::element_wise::Silu{}(gate, gate);
+            //                             c_thread_buf(cidx) = gate * up;
+            //                         }
+            //                         else if(ActivationOperation == Activation::gelu_and_mul)
+            //                         {
+            //                             float gate = c_thread_buf[cidx];
+            //                             float up   = c_thread_buf_up[cidx];
+            //                             if constexpr(MulRoutedWeight)
+            //                             {
+            //                                 gate = gate * topk_weights.AsType<float>()[n3];
+            //                                 up   = up * topk_weights.AsType<float>()[n3];
+            //                             }
+            //                             if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            //                             {
+            //                                 gate *= 16;
+            //                                 up *= 16;
+            //                             }
+            //                             tensor_operation::element_wise::Gelu{}(gate, gate);
+            //                             c_thread_buf(cidx) = gate * up;
+            //                         }
+            //                     });
+            //                 });
+            //             });
+            //         });
+            //     });
+            // }
 
             constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
                 GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
@@ -1655,7 +1805,8 @@ struct GridwiseMoeGemmBlockScale
                     Sequence<true>,
                     uniform_sequence_gen_t<NumDTensor,
                                            false>>, // ThreadTransferSrcResetCoordinateAfterRunFlags
-                Sequence<false>,   // ThreadTransferDstResetCoordinateAfterRunFlags
+                Sequence<false>, // ThreadTransferDstResetCoordinateAfterRunFlags
+                IndexType,
                 1,                 // ScatterDim
                 true,              // OutputScatter: false, only use scatter weights
                 scatter_weight_idx // ScatterWeightIdx: ascale
@@ -1699,7 +1850,7 @@ struct GridwiseMoeGemmBlockScale
                 CDEBlockTransferCluster{}.At(I2) * CDEBlockTransferCluster{}.At(I3);
             static_for<0, num_access, 1>{}([&](auto access_id) {
                 // make sure it's safe to write to LDS
-                StaticallyIndexedArray<index_t, EMRepeats> scatter_offsets;
+                StaticallyIndexedArray<IndexType, EMRepeats> scatter_offsets;
                 StaticallyIndexedArray<float, EMRepeats> scatter_weights; //= for topk
 
                 auto dstidx = sfc_cde_block.GetIndex(access_id);
@@ -1766,7 +1917,6 @@ struct GridwiseMoeGemmBlockScale
 
     template <bool HasMainKBlockLoop,
               InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-              bool IsInputGemm   = true,
               TailNumber TailNum = TailNumber::Odd>
     __device__ static void Run_2Lds(const index_t* p_sorted_token_ids,
                                     const index_t* p_sorted_expert_ids,
@@ -1808,7 +1958,7 @@ struct GridwiseMoeGemmBlockScale
                        math::integer_divide_ceil(problem.K, ScaleBlockK)),
             make_tuple(math::integer_divide_ceil(problem.K, ScaleBlockK), 1));
         const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor(
-            make_tuple(math::integer_divide_ceil(problem.N, ScaleBlockN),
+            make_tuple(math::integer_divide_ceil(problem.N * (IsInputGemm ? 2 : 1), ScaleBlockN),
                        math::integer_divide_ceil(problem.K, ScaleBlockK)),
             make_tuple(math::integer_divide_ceil(problem.K, ScaleBlockK), 1));
         const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
@@ -1856,7 +2006,7 @@ struct GridwiseMoeGemmBlockScale
         if(token_pos >= max_token_id || expert_block_id * MPerBlock >= max_token_id ||
            token0 >= problem.NumTokens)
             return;
-        StaticallyIndexedArray<index_t, AMRepeats>
+        StaticallyIndexedArray<IndexType, AMRepeats>
             gather_offsets; //= p_sorted_token_ids[token_pos];
         static_for<0, AMRepeats, 1>{}([&](auto m0) {
             const index_t fused_token = p_sorted_token_ids[token_pos + m0];
@@ -1865,11 +2015,11 @@ struct GridwiseMoeGemmBlockScale
             {
                 token_offset = token_offset * problem.TopK + (fused_token >> 24);
             }
-            gather_offsets(m0) = token_offset * problem.K;
+            gather_offsets(m0) = static_cast<IndexType>(token_offset) * problem.K;
         });
-        const index_t expert_stride = __builtin_amdgcn_readfirstlane(problem.N * problem.K);
+        const index_t expert_stride = __builtin_amdgcn_readfirstlane(problem.N * problem.K * (IsInputGemm ? 2 : 1));
         const index_t expert_scale_stride =
-            __builtin_amdgcn_readfirstlane(math::integer_divide_ceil(problem.N, ScaleBlockN) *
+            __builtin_amdgcn_readfirstlane(math::integer_divide_ceil(problem.N  * (IsInputGemm ? 2 : 1), ScaleBlockN) *
                                            math::integer_divide_ceil(problem.K, ScaleBlockK));
         // N0, K0, Blocksize*KPack
         const index_t n_block_data_idx_on_grid =
@@ -1916,6 +2066,7 @@ struct GridwiseMoeGemmBlockScale
             1,
             AThreadTransferSrcResetCoordinateAfterRun,
             true,
+            IndexType,
             1,
             BlockwiseGemmPipe::GlobalBufferNum>(a_grid_desc_ak0_m_ak1,
                make_multi_index(0, 0, 0),
@@ -1964,6 +2115,7 @@ struct GridwiseMoeGemmBlockScale
         static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
         auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
         auto c_thread_buf            = blockwise_gemm_pipeline.GetCThreadBuffer();
+        decltype(c_thread_buf) c_thread_buf_up;
 
         const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
             (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
@@ -2047,33 +2199,105 @@ struct GridwiseMoeGemmBlockScale
         constexpr auto b_scale_thread_slice_copy_step = make_multi_index(0, ScaleSliceSizeK);
 
         constexpr auto NumKBlockPerScale = math::integer_divide_ceil(ScaleBlockK, KPerBlock);
+        if constexpr(IsInputGemm)
+        {
+            const BDataType* p_b_grid_up = p_b_grid + expert_stride / 2 / BPackedSize;
+            const auto b_grid_buf_up     = make_dynamic_buffer<AddressSpaceEnum::Global>(
+                p_b_grid_up + expert_id * static_cast<long_index_t>(expert_stride) / BPackedSize,
+                b_grid_desc_bpreshuffled.GetElementSpaceSize());
+            auto b_blockwise_copy_up = ThreadwiseTensorSliceTransfer_v2<
+                BDataType,
+                BDataType,
+                decltype(b_grid_desc_bpreshuffled),
+                decltype(b_block_desc_bk0_n_bk1),
+                Sequence<Number<NXdlPerWave>{}, I1, Number<KRepeat>{}, Number<BK1Value>{}>,
+                Sequence<1, 2, 0, 3>,
+                3,
+                BBlockTransferSrcScalarPerVector,
+                BThreadTransferSrcResetCoordinateAfterRun,
+                true>(b_grid_desc_bpreshuffled,
+                      make_multi_index(n_block_data_idx_on_grid,
+                                       get_warp_local_1d_id() % NWave,
+                                       0,
+                                       KPack / KGroup * (get_thread_local_1d_id() % warpSize)));
+            const BDataType* p_b_scale_grid_up = p_b_scale_grid + expert_scale_stride / 2 / BPackedSize;
+            const auto b_scale_grid_buf_up = make_dynamic_buffer<AddressSpaceEnum::Global>(
+                p_b_scale_grid_up + expert_id * expert_scale_stride,
+                b_scale_grid_desc_bn_ak.GetElementSpaceSize());
+            auto b_scale_thread_copy_up =
+                ThreadwiseTensorSliceTransfer_v2<BScaleType,
+                                                BScaleType,
+                                                decltype(b_scale_grid_desc_bn_ak),
+                                                decltype(b_scale_thread_desc),
+                                                Sequence<ScaleSliceSizeN, ScaleSliceSizeK>,
+                                                Sequence<0, 1>,
+                                                1,
+                                                ScaleSliceSizeK,
+                                                1,
+                                                false>(
+                    b_scale_grid_desc_bn_ak, make_multi_index(block_n_id * NPerBlock / ScaleBlockN, 0));
 
-        blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, NumKBlockPerScale, TailNum>(
-            a_grid_desc_ak0_m_ak1,
-            a_block_desc_ak0_m_ak1,
-            a_blockwise_copy,
-            a_grid_buf,
-            a_block_bufs,
-            a_block_slice_copy_step,
-            b_grid_desc_bpreshuffled,
-            b_block_desc_bk0_n_bk1,
-            b_blockwise_copy,
-            b_grid_buf,
-            b_block_bufs,
-            b_block_slice_copy_step,
-            c_scale_thread_desc,
-            c_thread_buf,
-            a_scale_grid_desc_am_ak,
-            a_scale_thread_desc,
-            a_scale_thread_copy,
-            a_scale_grid_buf,
-            a_scale_thread_slice_copy_step,
-            b_scale_grid_desc_bn_ak,
-            b_scale_thread_desc,
-            b_scale_thread_copy,
-            b_scale_grid_buf,
-            b_scale_thread_slice_copy_step,
-            num_k_block_main_loop);
+            blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, NumKBlockPerScale, TailNum>(
+                a_grid_desc_ak0_m_ak1,
+                a_block_desc_ak0_m_ak1,
+                a_blockwise_copy,
+                a_grid_buf,
+                a_block_bufs,
+                a_block_slice_copy_step,
+                b_grid_desc_bpreshuffled,
+                b_block_desc_bk0_n_bk1,
+                b_blockwise_copy,
+                b_blockwise_copy_up,
+                b_grid_buf,
+                b_grid_buf_up,
+                b_block_bufs,
+                b_block_slice_copy_step,
+                c_scale_thread_desc,
+                c_thread_buf,
+                c_thread_buf_up,
+                a_scale_grid_desc_am_ak,
+                a_scale_thread_desc,
+                a_scale_thread_copy,
+                a_scale_grid_buf,
+                a_scale_thread_slice_copy_step,
+                b_scale_grid_desc_bn_ak,
+                b_scale_thread_desc,
+                b_scale_thread_copy,
+                b_scale_thread_copy_up,
+                b_scale_grid_buf,
+                b_scale_grid_buf_up,
+                b_scale_thread_slice_copy_step,
+                num_k_block_main_loop);
+        }
+        else
+        {
+            blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, NumKBlockPerScale, TailNum>(
+                a_grid_desc_ak0_m_ak1,
+                a_block_desc_ak0_m_ak1,
+                a_blockwise_copy,
+                a_grid_buf,
+                a_block_bufs,
+                a_block_slice_copy_step,
+                b_grid_desc_bpreshuffled,
+                b_block_desc_bk0_n_bk1,
+                b_blockwise_copy,
+                b_grid_buf,
+                b_block_bufs,
+                b_block_slice_copy_step,
+                c_scale_thread_desc,
+                c_thread_buf,
+                a_scale_grid_desc_am_ak,
+                a_scale_thread_desc,
+                a_scale_thread_copy,
+                a_scale_grid_buf,
+                a_scale_thread_slice_copy_step,
+                b_scale_grid_desc_bn_ak,
+                b_scale_thread_desc,
+                b_scale_thread_copy,
+                b_scale_grid_buf,
+                b_scale_thread_slice_copy_step,
+                num_k_block_main_loop);
+        }
 
         // shuffle C and write out
         {
@@ -2103,6 +2327,68 @@ struct GridwiseMoeGemmBlockScale
             constexpr auto N3 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I6);
             constexpr auto N4 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I7);
 
+            // if constexpr(IsInputGemm) // gu fusion
+            // {
+                
+            //     vector_type<float, 4> topk_weights;
+            //     static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
+            //         static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
+            //             static_for<0, M2, 1>{}([&](auto m2) {      // m_inst_num_groups_per_blk
+            //                 static_for<0, N3, 1>{}([&](auto n3) { // m_inst_group_size
+            //                     const index_t m_pos = block_m_id * MPerBlock + n0 * N1 * N2 * N3 * N4 +
+            //                                             n1 * N2 * N3 * N4 + n2 * N3 * N4 + n3 * N4;
+            //                     if constexpr(MulRoutedWeight)
+            //                     {
+            //                         topk_weights = *c_style_pointer_cast<const vector_type<float, N3>*>(
+            //                             p_ds_grid[I0] + m_pos);
+            //                     }
+            //                     static_for<0, N4, 1>{}([&](auto n4) { // m_inst_group_size
+            //                         constexpr index_t c_offset =
+            //                             blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset(
+            //                                 make_tuple(m0, n0, m2 * N3 + n3));
+            //                         constexpr auto cidx = Number<c_offset>{};
+
+            //                         if constexpr(ActivationOperation == Activation::silu_and_mul)
+            //                         {
+            //                             float gate = c_thread_buf[cidx];
+            //                             float up   = c_thread_buf_up[cidx];
+            //                             if constexpr(MulRoutedWeight)
+            //                             {
+            //                                 gate = gate * topk_weights.AsType<float>()[n3];
+            //                                 up   = up * topk_weights.AsType<float>()[n3];
+            //                             }
+            //                             if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            //                             {
+            //                                 gate *= 16;
+            //                                 up *= 16;
+            //                             }
+            //                             tensor_operation::element_wise::Silu{}(gate, gate);
+            //                             c_thread_buf(cidx) = gate * up;
+            //                         }
+            //                         else if(ActivationOperation == Activation::gelu_and_mul)
+            //                         {
+            //                             float gate = c_thread_buf[cidx];
+            //                             float up   = c_thread_buf_up[cidx];
+            //                             if constexpr(MulRoutedWeight)
+            //                             {
+            //                                 gate = gate * topk_weights.AsType<float>()[n3];
+            //                                 up   = up * topk_weights.AsType<float>()[n3];
+            //                             }
+            //                             if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            //                             {
+            //                                 gate *= 16;
+            //                                 up *= 16;
+            //                             }
+            //                             tensor_operation::element_wise::Gelu{}(gate, gate);
+            //                             c_thread_buf(cidx) = gate * up;
+            //                         }
+            //                     });
+            //                 });
+            //             });
+            //         });
+            //     });
+            // }
+
             constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
                 GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
 
@@ -2267,7 +2553,8 @@ struct GridwiseMoeGemmBlockScale
                     Sequence<true>,
                     uniform_sequence_gen_t<NumDTensor,
                                            false>>, // ThreadTransferSrcResetCoordinateAfterRunFlags
-                Sequence<false>,   // ThreadTransferDstResetCoordinateAfterRunFlags
+                Sequence<false>, // ThreadTransferDstResetCoordinateAfterRunFlags
+                IndexType,
                 1,                 // ScatterDim
                 true,              // OutputScatter: false, only use scatter weights
                 scatter_weight_idx // ScatterWeightIdx: ascale
@@ -2311,7 +2598,7 @@ struct GridwiseMoeGemmBlockScale
                 CDEBlockTransferCluster{}.At(I2) * CDEBlockTransferCluster{}.At(I3);
             static_for<0, num_access, 1>{}([&](auto access_id) {
                 // make sure it's safe to write to LDS
-                StaticallyIndexedArray<index_t, EMRepeats>
+                StaticallyIndexedArray<IndexType, EMRepeats>
                     scatter_offsets; //= p_sorted_token_ids[c_token_pos];
                 StaticallyIndexedArray<float, EMRepeats> scatter_weights; //= for topk
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp
new file mode 100644
index 0000000000..f949762b09
--- /dev/null
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <unordered_map>
+
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename D2DataType,
+          typename AccDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          index_t ActivationType_ = 0,
+          bool MulRoutedWeight    = true,
+          typename ComputeTypeA   = CDataType,
+          typename ComputeTypeB   = ComputeTypeA>
+struct ReferenceMoeGemm1BlockScale : public device::BaseOperator
+{
+    // Argument
+    static constexpr auto ActivationType = ActivationType_;
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<ck::index_t>& sorted_token_ids,
+                 const Tensor<ck::index_t>& expert_ids,
+                 const Tensor<ck::index_t>& max_token_id,
+                 const index_t sorted_tile_size,
+                 const Tensor<ADataType>& a_t_k,
+                 const Tensor<BDataType>& b_e_n_k,
+                 Tensor<CDataType>& c_t_k_n,
+                 const Tensor<D2DataType>& d2,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CElementwiseOperation c_element_op)
+            : sorted_token_ids_{sorted_token_ids},
+              expert_ids_{expert_ids},
+              max_token_id_{max_token_id},
+              sorted_tile_size_{sorted_tile_size},
+              a_t_k_{a_t_k},
+              b_e_n_k_{b_e_n_k},
+              c_t_k_n_{c_t_k_n},
+              d2_{d2},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              c_element_op_{c_element_op}
+        {
+        }
+
+        const Tensor<ck::index_t>& sorted_token_ids_;
+        const Tensor<ck::index_t>& expert_ids_;
+        const Tensor<ck::index_t>& max_token_id_;
+        index_t sorted_tile_size_;
+        const Tensor<ADataType>& a_t_k_;
+        const Tensor<BDataType>& b_e_n_k_;
+        Tensor<CDataType>& c_t_k_n_;
+        const Tensor<D2DataType>& d2_;
+
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CElementwiseOperation c_element_op_;
+    };
+
+    // Invoker
+    struct Invoker : public device::BaseInvoker
+    {
+        using Argument = ReferenceMoeGemm1BlockScale::Argument;
+
+        float Run(const Argument& arg)
+        {
+            static_assert(ActivationType < 2, "Not supported activation type");
+            const int full_n = arg.c_t_k_n_.mDesc.GetLengths()[2];
+            auto f_mk_kn_mn  = [&](auto m, auto n) {
+                const int K = arg.a_t_k_.mDesc.GetLengths()[1];
+                AccDataType v_acc_up{0};
+                ComputeTypeB v_b_up{0};
+                AccDataType v_acc{0};
+
+                ComputeTypeA v_a{0};
+                ComputeTypeB v_b{0};
+
+                const int t         = arg.sorted_token_ids_(m) & 0xffffff;
+                const int topk_id   = (arg.sorted_token_ids_(m) & 0xff000000) >> 24;
+                const int e         = arg.expert_ids_(m / arg.sorted_tile_size_);
+                const int token_cnt = arg.a_t_k_.mDesc.GetLengths()[0];
+                D2DataType v_topk_w = arg.d2_(m, 0); // expert
+                if(t < token_cnt)
+                {
+                    for(int k = 0; k < K; ++k)
+                    {
+                        if constexpr(is_same_v<ADataType, pk_i4_t>)
+                        {
+                            uint8_t i4x2 = arg.a_t_k_(t, k).data;
+                            uint8_t i4   = 0;
+                            if(k % 2 == 1)
+                                i4 = (i4x2 >> 0) & 0xf;
+                            else
+                                i4 = (i4x2 >> 4) & 0xf;
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+                            v_a = i4_to_f32_gfx9(i4);
+#else
+                            v_a    = i4 - 8;
+#endif
+                        }
+                        else
+                        {
+                            arg.a_element_op_(v_a, arg.a_t_k_(t, k));
+                        }
+                        // same for B matrix
+                        if constexpr(is_same_v<BDataType, pk_i4_t>)
+                        {
+                            uint8_t i4x2    = arg.b_e_n_k_(e, k, n).data;
+                            uint8_t i4x2_up = arg.b_e_n_k_(e, k, n + full_n).data;
+                            uint8_t i4      = 0;
+                            uint8_t i4_up   = 0;
+                            if(k % 2 == 1)
+                            {
+                                i4    = (i4x2 >> 0) & 0xf;
+                                i4_up = (i4x2_up >> 0) & 0xf;
+                            }
+                            else
+                            {
+                                i4    = (i4x2 >> 4) & 0xf;
+                                i4_up = (i4x2_up >> 4) & 0xf;
+                            }
+#if CK_USE_PK4_LAYOUT_SHUFFLE
+                            v_b    = i4_to_f32_gfx9(i4);
+                            v_b_up = i4_to_f32_gfx9(i4_up);
+#else
+                            v_b    = i4 - 8;
+                            v_b_up = i4_up - 8;
+#endif
+                        }
+                        else
+                        {
+                            arg.b_element_op_(v_b, arg.b_e_n_k_(e, k, n));
+                            arg.b_element_op_(v_b_up, arg.b_e_n_k_(e, k, n + full_n));
+                        }
+
+                        v_acc +=
+                            ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
+                        v_acc_up += ck::type_convert<AccDataType>(v_a) *
+                                    ck::type_convert<AccDataType>(v_b_up);
+                    }
+                    CDataType v_c{0};
+                    CDataType v_c_up{0};
+                    if constexpr(MulRoutedWeight)
+                    {
+                        v_acc *= v_topk_w;
+                        v_acc_up *= v_topk_w;
+                    }
+
+                    arg.c_element_op_(v_c, v_acc);
+                    arg.c_element_op_(v_c_up, v_acc_up);
+
+                    if constexpr(ActivationType == 1)
+                    {
+                        if constexpr(is_same_v<BDataType, pk_i4_t>)
+                        {
+                            v_c_up *= 16;
+                            v_c *= 16;
+                        }
+                        tensor_operation::element_wise::Silu{}(v_c, v_c);
+                        arg.c_t_k_n_(t, topk_id, n) = v_c * v_c_up;
+                    }
+                    else if constexpr(ActivationType == 0)
+                    {
+                        if constexpr(is_same_v<BDataType, pk_i4_t>)
+                        {
+                            v_c_up *= 16;
+                            v_c *= 16;
+                        }
+                        tensor_operation::element_wise::Gelu{}(v_c, v_c);
+                        arg.c_t_k_n_(t, topk_id, n) = v_c * v_c_up;
+                    }
+                }
+            };
+
+            const ck::index_t max_token_id = arg.max_token_id_(0);
+            make_ParallelTensorFunctor(f_mk_kn_mn, max_token_id, full_n)(
+                std::thread::hardware_concurrency());
+
+            return 0;
+        }
+
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+
+    static auto MakeArgument(const Tensor<ck::index_t>& sorted_token_ids,
+                             const Tensor<ck::index_t>& expert_ids,
+                             const Tensor<ck::index_t>& max_token_id,
+                             const index_t sorted_tile_size,
+                             const Tensor<ADataType>& a_t_k,
+                             const Tensor<BDataType>& b_e_n_k,
+                             Tensor<CDataType>& c_t_k_n,
+                             const Tensor<D2DataType>& d2,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op)
+    {
+        return Argument{sorted_token_ids,
+                        expert_ids,
+                        max_token_id,
+                        sorted_tile_size,
+                        a_t_k,
+                        b_e_n_k,
+                        c_t_k_n,
+                        d2,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceMoeGemm1BlaockScale"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+
+    static float i4_to_f32_gfx9(uint8_t i4)
+    {
+        static std::unordered_map<uint8_t, float> u = {{0b1000, -0.5000f},
+                                                       {0b1001, -0.4375f},
+                                                       {0b1010, -0.3750f},
+                                                       {0b1011, -0.3125f},
+                                                       {0b1100, -0.2500f},
+                                                       {0b1101, -0.1875f},
+                                                       {0b1110, -0.1250f},
+                                                       {0b1111, -0.0625f},
+                                                       {0b0, +0.0000f},
+                                                       {0b1, +0.0625f},
+                                                       {0b10, +0.1250f},
+                                                       {0b11, +0.1875f},
+                                                       {0b100, +0.2500f},
+                                                       {0b101, +0.3125f},
+                                                       {0b110, +0.3750f},
+                                                       {0b111, +0.4375f}};
+
+        return u[i4];
+    }
+};
+
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck

From 3e5005a47bf725463a0f0baa346c84d8e28f6347 Mon Sep 17 00:00:00 2001
From: OscarXu <huaiguxu@amd.com>
Date: Mon, 12 May 2025 11:21:14 +0800
Subject: [PATCH 11/18] fix compile error. Gemm2 pass. Gemm1 WIP

---
 .../moe_gemm1_xdl_fp8_blockscale.cpp          | 23 ++++++++----
 .../moe_gemm2_xdl_fp8_blockscale.cpp          |  2 +-
 ...oup_tensor_slice_transfer_v7r3_scatter.hpp |  8 ++--
 .../impl/device_moe_gemm_blockscale.hpp       |  1 +
 .../gpu/grid/gridwise_moe_gemm_blockscale.hpp |  4 +-
 ...ise_tensor_slice_transfer_v7r3_scatter.hpp | 37 +++++++++++++++----
 .../cpu/reference_moe_gemm1_blockscale.hpp    | 10 ++---
 7 files changed, 60 insertions(+), 25 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
index dea651b1ba..427ed03fb2 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
@@ -61,19 +61,29 @@ struct MulABScaleExpertWeight
     operator()<EDataType, float, float>(EDataType& e, const float& c, const float& d2) const
     {
         // for real kernel use
-        e = ck::type_convert<EDataType>(c * d2);
+        (void)d2;
+        e = ck::type_convert<EDataType>(c);
+    }
+    template <>
+    __host__ __device__ constexpr void operator()<EDataType, EDataType, float>(
+        EDataType& e, const EDataType& c, const float& d2) const
+    {
+        (void)d2;
+        e = ck::type_convert<EDataType>(c);
     }
-
     // for reference cpu
     template <>
     __host__ __device__ constexpr void
     operator()<float, float, float>(float& e, const float& c, const float& d2) const
     {
         // for reference cpu
-        e = ck::type_convert<EDataType>(c * d2);
+        (void)d2;
+        e = ck::type_convert<EDataType>(c);
     }
 };
 
+using CDEElementOp = MulABScaleExpertWeight;
+
 void preShuffleBuffer(const B0DataType* src, B0DataType* dst, int N, int K, int NXdl)
 {
     int KPack = 16 / sizeof(B0DataType);
@@ -116,7 +126,7 @@ static constexpr ck::index_t Scale_Block_N = 128;
 static constexpr ck::index_t Scale_Block_K = 128;
 #if 1
 static constexpr ck::index_t MPerBlock = 128;
-static constexpr ck::index_t NPerBlock   = 256;
+static constexpr ck::index_t NPerBlock   = 128;
 static constexpr ck::index_t MNPerXDL    = 16;
 static constexpr ck::index_t MXDLPerWave = MPerBlock / (MNPerXDL * 1);
 static constexpr ck::index_t NXDLPerWave = NPerBlock / (MNPerXDL * 4);
@@ -475,8 +485,8 @@ int main(int argc, char* argv[])
         using ReferenceGemmInstance =
             ck::tensor_operation::host::ReferenceMoeGemm1BlockScale<float,
                                                                     float,
-                                                                    CShuffleDataType,
                                                                     D2DataType,
+                                                                    CShuffleDataType,
                                                                     AccDataType,
                                                                     PassThrough,
                                                                     PassThrough,
@@ -492,8 +502,8 @@ int main(int argc, char* argv[])
                                                       MPerBlock,
                                                       a0_t_k,
                                                       b0_e_n_k,
-                                                      c_t_k_n,
                                                       d2_e_n,
+                                                      c_t_k_n,
                                                       PassThrough{},
                                                       PassThrough{},
                                                       PassThrough{});
@@ -510,7 +520,6 @@ int main(int argc, char* argv[])
             {
                 continue;
             }
-            const int e = expert_ids(m / MPerBlock);
             for(int n = 0; n < N; ++n)
             {
                 e_t_n_host_result(t, topk_id, n) = ck::type_convert<EDataType>(c_t_k_n(t, topk_id, n));
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
index b1e74fe00d..64614a29e2 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
@@ -165,7 +165,7 @@ static constexpr ck::index_t MPerBlock = 128; using DeviceOpInstance = ck::tenso
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
                2,    2,   S<1, 32, 1, 8>, S<2, 1, 1, 1>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, 0, false, false, false, int32_t, A0DataType>;
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, 0, false, false, false, int32_t, A0DataType>;
 #endif
 // clang-format on
 
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp
index bee0b01a74..1cfe72981b 100644
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp
@@ -134,12 +134,13 @@ struct ThreadGroupTensorSliceTransfer_v7r3_scatter
     template <typename SrcBuffers, index_t ThreadScratchId = 0>
     __device__ void RunRead(const SrcDescs& src_descs,
                             const SrcBuffers& src_bufs,
+                            StaticallyIndexedArray<float, scatter_num>& scatter_weights,
                             Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
     {
         if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
            ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
         {
-            threadwise_transfer_.RunRead(src_descs, src_bufs, thread_scratch_id);
+            threadwise_transfer_.RunRead(src_descs, src_bufs, scatter_weights, thread_scratch_id);
         }
     }
 
@@ -169,9 +170,10 @@ struct ThreadGroupTensorSliceTransfer_v7r3_scatter
                         const SrcBuffers& src_bufs,
                         const DstDescs& dst_descs,
                         DstBuffers dst_bufs,
-                        StaticallyIndexedArray<IndexType, scatter_num>& scatter_offsets)
+                        StaticallyIndexedArray<index_t, scatter_num>& scatter_offsets,
+                        StaticallyIndexedArray<float, scatter_num>& scatter_weights)
     {
-        RunRead(src_descs, src_bufs);
+        RunRead(src_descs, src_bufs, scatter_weights);
         RunWrite(dst_descs, dst_bufs, scatter_offsets);
     }
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp
index 719375c114..442cf5958b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp
@@ -153,6 +153,7 @@ struct DeviceMoeGemmBlockScale
                          NSwizzle,
                          IsInputGemm,
                          MulRoutedWeight,
+                         IndexType,
                          ComputeTypeA,
                          ComputeTypeB,
                          LDSTypeA,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
index 0455258668..8e4f3b3ae8 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
@@ -1461,7 +1461,7 @@ struct GridwiseMoeGemmBlockScale
                                        get_warp_local_1d_id() % NWave,
                                        0,
                                        KPack / KGroup * (get_thread_local_1d_id() % warpSize)));
-            const BDataType* p_b_scale_grid_up = p_b_scale_grid + expert_scale_stride / 2 / BPackedSize;
+            const BScaleType* p_b_scale_grid_up = p_b_scale_grid + expert_scale_stride / 2 / BPackedSize;
             const auto b_scale_grid_buf_up = make_dynamic_buffer<AddressSpaceEnum::Global>(
                 p_b_scale_grid_up + expert_id * expert_scale_stride,
                 b_scale_grid_desc_bn_ak.GetElementSpaceSize());
@@ -2220,7 +2220,7 @@ struct GridwiseMoeGemmBlockScale
                                        get_warp_local_1d_id() % NWave,
                                        0,
                                        KPack / KGroup * (get_thread_local_1d_id() % warpSize)));
-            const BDataType* p_b_scale_grid_up = p_b_scale_grid + expert_scale_stride / 2 / BPackedSize;
+            const BScaleType* p_b_scale_grid_up = p_b_scale_grid + expert_scale_stride / 2 / BPackedSize;
             const auto b_scale_grid_buf_up = make_dynamic_buffer<AddressSpaceEnum::Global>(
                 p_b_scale_grid_up + expert_id * expert_scale_stride,
                 b_scale_grid_desc_bn_ak.GetElementSpaceSize());
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp
index 7cd0a0fc7f..ae76b8739f 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp
@@ -154,6 +154,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter
               enable_if_t<SrcDescs::Size() == SrcBuffers::Size(), bool> = false>
     __device__ void RunRead(const SrcDescs& src_descs,
                             const SrcBuffers& src_bufs,
+                            StaticallyIndexedArray<float, scatter_num>& scatter_weights,
                             Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
     {
         // loop over space-filling curve
@@ -172,8 +173,31 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter
                                                                                 src_coords_[i]);
 
                 oob_val = oob_val & is_src_valid;
-                src_vectors(i).template AsType<src_vector_t>()(I0) =
-                    src_bufs[i].template Get<src_vector_t>(src_coords_[i].GetOffset(), true);
+                if(i.value == ScatterWeightIdx)
+                {
+                    static_assert(SrcScalarPerVectors{}[Number<ScatterWeightIdx>{}] == 1,
+                                  "scatter weight dim, should only one vec");
+                    constexpr auto iScatter =
+                        SrcSpaceFillingCurve::GetIndex(iAccess)(Number<ScatterDim>{});
+                    static_for<0, SrcScalarPerVector, 1>{}([&](auto j) {
+                        src_vectors(i).template AsType<float>()(j) =
+                            scatter_weights(Number<iScatter>{});
+                    });
+                }
+                else if constexpr(SrcScalarPerVectors{}[i] == 1)
+                {
+                    auto data_types = SrcDatas{};
+                    using DataType  = remove_cvref_t<decltype(data_types[i])>;
+                    const auto tmp =
+                        src_bufs[i].template Get<DataType>(src_coords_[i].GetOffset(), true);
+                    static_for<0, SrcScalarPerVector, 1>{}(
+                        [&](auto j) { src_vectors(i).template AsType<DataType>()(j) = tmp; });
+                }
+                else
+                {
+                    src_vectors(i).template AsType<src_vector_t>()(I0) =
+                        src_bufs[i].template Get<src_vector_t>(src_coords_[i].GetOffset(), true);
+                }
             });
 
             constexpr auto get_elem_op_vec_len = []() {
@@ -389,7 +413,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter
               enable_if_t<DstDescs::Size() == 1 && DstBuffers::Size() == 1, bool> = false>
     __device__ void RunWrite(const DstDescs& dst_descs,
                              DstBuffers dst_bufs,
-                             StaticallyIndexedArray<IndexType, scatter_num>& scatter_offsets,
+                             StaticallyIndexedArray<index_t, scatter_num>& scatter_offsets,
                              Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
     {
         OOBCheck(thread_scratch_id);
@@ -410,8 +434,6 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter
                 using dst_vector_t      = typename remove_cvref_t<decltype(dst_vectors[i])>::type;
                 IndexType dst_offset    = scatter_offset + (dst_coords_[i].GetOffset());
                 const bool is_dst_valid = dst_offset < dst_descs[i].GetElementSpaceSize();
-                // coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_descs[i],
-                //                                                             dst_coords_[i]);
                 constexpr InMemoryDataOperationEnum DstInMemOp =
                     static_cast<InMemoryDataOperationEnum>(DstInMemOps::At(i.value));
                 dst_bufs(i).template Update<DstInMemOp, dst_vector_t>(
@@ -467,9 +489,10 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter
                         const SrcBuffers& src_bufs,
                         const DstDescs& dst_descs,
                         DstBuffers dst_bufs,
-                        StaticallyIndexedArray<IndexType, scatter_num>& scatter_offsets)
+                        StaticallyIndexedArray<IndexType, scatter_num>& scatter_offsets,
+                        StaticallyIndexedArray<float, scatter_num>& scatter_weights)
     {
-        RunRead(src_descs, src_bufs);
+        RunRead(src_descs, src_bufs, scatter_weights);
         RunWrite(dst_descs, dst_bufs, scatter_offsets);
     }
 
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp
index f949762b09..367f2ea586 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp
@@ -39,8 +39,8 @@ struct ReferenceMoeGemm1BlockScale : public device::BaseOperator
                  const index_t sorted_tile_size,
                  const Tensor<ADataType>& a_t_k,
                  const Tensor<BDataType>& b_e_n_k,
-                 Tensor<CDataType>& c_t_k_n,
                  const Tensor<D2DataType>& d2,
+                 Tensor<CDataType>& c_t_k_n,
                  AElementwiseOperation a_element_op,
                  BElementwiseOperation b_element_op,
                  CElementwiseOperation c_element_op)
@@ -50,8 +50,8 @@ struct ReferenceMoeGemm1BlockScale : public device::BaseOperator
               sorted_tile_size_{sorted_tile_size},
               a_t_k_{a_t_k},
               b_e_n_k_{b_e_n_k},
-              c_t_k_n_{c_t_k_n},
               d2_{d2},
+              c_t_k_n_{c_t_k_n},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
               c_element_op_{c_element_op}
@@ -64,8 +64,8 @@ struct ReferenceMoeGemm1BlockScale : public device::BaseOperator
         index_t sorted_tile_size_;
         const Tensor<ADataType>& a_t_k_;
         const Tensor<BDataType>& b_e_n_k_;
-        Tensor<CDataType>& c_t_k_n_;
         const Tensor<D2DataType>& d2_;
+        Tensor<CDataType>& c_t_k_n_;
 
         AElementwiseOperation a_element_op_;
         BElementwiseOperation b_element_op_;
@@ -215,8 +215,8 @@ struct ReferenceMoeGemm1BlockScale : public device::BaseOperator
                              const index_t sorted_tile_size,
                              const Tensor<ADataType>& a_t_k,
                              const Tensor<BDataType>& b_e_n_k,
-                             Tensor<CDataType>& c_t_k_n,
                              const Tensor<D2DataType>& d2,
+                             Tensor<CDataType>& c_t_k_n,
                              AElementwiseOperation a_element_op,
                              BElementwiseOperation b_element_op,
                              CElementwiseOperation c_element_op)
@@ -227,8 +227,8 @@ struct ReferenceMoeGemm1BlockScale : public device::BaseOperator
                         sorted_tile_size,
                         a_t_k,
                         b_e_n_k,
-                        c_t_k_n,
                         d2,
+                        c_t_k_n,
                         a_element_op,
                         b_element_op,
                         c_element_op};

From f1a534f6e7c7533a1a45723c1cfa117f4cf71988 Mon Sep 17 00:00:00 2001
From: OscarXu <huaiguxu@amd.com>
Date: Mon, 12 May 2025 13:19:24 +0800
Subject: [PATCH 12/18] Compile pass. Gemm1 function WIP

---
 .../moe_gemm1_xdl_fp8_blockscale.cpp                | 13 ++++++++-----
 .../gpu/grid/gridwise_moe_gemm_blockscale.hpp       |  4 ++--
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
index 427ed03fb2..d5942a506d 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
@@ -124,7 +124,12 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
 static constexpr ck::index_t Scale_Block_M = 1;
 static constexpr ck::index_t Scale_Block_N = 128;
 static constexpr ck::index_t Scale_Block_K = 128;
-#if 1
+
+static constexpr ck::index_t Nswizzle    = false;
+static constexpr ck::index_t ActOP       = 0; // 0: gelu_and_mul, 1: silu_and_mul
+static constexpr bool MulRoutedWeight    = false;
+
+#if 0
 static constexpr ck::index_t MPerBlock = 128;
 static constexpr ck::index_t NPerBlock   = 128;
 static constexpr ck::index_t MNPerXDL    = 16;
@@ -135,14 +140,12 @@ static constexpr ck::index_t CShuffleNXDLPerWave = NXDLPerWave;
 static constexpr ck::index_t BLOCKSIZE   = 256;
 
 static constexpr ck::index_t KPerBlock   = 128 / sizeof(A0DataType);
-static constexpr ck::index_t Nswizzle    = false;
 static constexpr ck::index_t AK1         = 16 / sizeof(A0DataType);
 static constexpr ck::index_t BK1         = 16 / sizeof(B0DataType);
 static constexpr ck::index_t EVec        = 16 / sizeof(EDataType);
 static constexpr ck::index_t D0Vec       = 1;
 static constexpr ck::index_t D1Vec       = 1;
-static constexpr ck::index_t ActOP       = 0; // 0: gelu_and_mul, 1: silu_and_mul
-static constexpr bool MulRoutedWeight    = false;
+
 using DeviceOpInstance                   = ck::tensor_operation::device::DeviceMoeGemmBlockScale
     // clang-format off
         <      Row, Col, DsLayout, ELayout,
@@ -178,7 +181,7 @@ static constexpr ck::index_t MPerBlock = 128; using DeviceOpInstance = ck::tenso
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
                2,    2,   S<1, 32, 1, 8>, S<2, 1, 1, 1>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, ActOP, Nswizzle, true, MulRoutedWeight, int32_t, A0DataType>;
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, ActOP, Nswizzle, true, MulRoutedWeight, int32_t, A0DataType>;
 #endif
 // clang-format on
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
index 8e4f3b3ae8..5240d29f1e 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
@@ -1166,7 +1166,7 @@ struct GridwiseMoeGemmBlockScale
 
     template <bool HasMainKBlockLoop,
               InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-                            TailNumber TailNum = TailNumber::Odd>
+              TailNumber TailNum = TailNumber::Odd>
     __device__ static void Run(const index_t* p_sorted_token_ids,
                                const index_t* p_sorted_expert_ids,
                                const index_t* p_max_token_id,
@@ -1478,7 +1478,7 @@ struct GridwiseMoeGemmBlockScale
                                                 false>(
                     b_scale_grid_desc_bn_ak, make_multi_index(block_n_id * NPerBlock / ScaleBlockN, 0));
 
-            blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
+            blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, NumKBlockPerScale, TailNum>(
                 a_grid_desc_ak0_m_ak1,
                 a_block_desc_ak0_m_ak1,
                 a_blockwise_copy,

From fe8bb251da64e173c4bbb366098a352cf1d258b0 Mon Sep 17 00:00:00 2001
From: OscarXu <huaiguxu@amd.com>
Date: Mon, 12 May 2025 21:08:42 +0800
Subject: [PATCH 13/18] gemm1 up-only pass. GU WIP

---
 .../moe_gemm1_xdl_fp8.cpp                     |   2 +-
 .../moe_gemm1_xdl_fp8_blockscale.cpp          |  14 +-
 ...oup_tensor_slice_transfer_v7r3_scatter.hpp |  25 +++-
 .../gpu/grid/gridwise_moe_gemm_blockscale.hpp |  14 +-
 ...ise_tensor_slice_transfer_v7r3_scatter.hpp | 130 +++++++++++++++++-
 .../cpu/reference_moe_gemm1_blockscale.hpp    |   4 +-
 6 files changed, 171 insertions(+), 18 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
index a05234ad3c..fa8f95db25 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
@@ -269,7 +269,7 @@ int main(int argc, char* argv[])
     // int eids[]         = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 3, 3, 3};
     for(int i = 0; i < sorted_tile_num; i++)
     {
-        expert_ids.mData[i] = i / (valid_tile_num / experts);
+        expert_ids.mData[i] = i / ck::math::integer_divide_ceil(valid_tile_num, experts);
     }
 
     int token_per_tile = (tokens * topk + valid_tile_num - 1) / valid_tile_num;
diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
index d5942a506d..155c052b4e 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
@@ -268,7 +268,7 @@ int main(int argc, char* argv[])
     constexpr auto StrideDs          = std::array<ck::index_t, NumDTensor>{0};
     ck::index_t Scale_Stride_AM      = (K + Scale_Block_K - 1) / Scale_Block_K;
     ck::index_t Scale_Stride_BN      = (K + Scale_Block_K - 1) / Scale_Block_K;
-    ck::index_t Scale_Stride_B       = (N * 2 + Scale_Block_N - 1) / Scale_Block_N;
+    ck::index_t Scale_Stride_B       = (N + Scale_Block_N - 1) / Scale_Block_N * 2;
 
     ck::index_t KBatch = 1;
 
@@ -279,7 +279,7 @@ int main(int argc, char* argv[])
     // int eids[]         = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 3, 3, 3};
     for(int i = 0; i < sorted_tile_num; i++)
     {
-        expert_ids.mData[i] = i / (valid_tile_num / experts);
+        expert_ids.mData[i] = (i / valid_tile_num) / experts;
     }
 
     int token_per_tile = (tokens * topk + valid_tile_num - 1) / valid_tile_num;
@@ -303,7 +303,7 @@ int main(int argc, char* argv[])
                                                    {Scale_Stride_AM, 1}));
     Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
     Tensor<B1DataType> b1_e_n_k(HostTensorDescriptor(
-        {experts, (K + Scale_Block_K - 1) / Scale_Block_K, (N * 2 + Scale_Block_N - 1) / Scale_Block_N},
+        {experts, (K + Scale_Block_K - 1) / Scale_Block_K, (N  + Scale_Block_N - 1) / Scale_Block_N * 2},
         {(Scale_Stride_B * Scale_Stride_BN), 1, Scale_Stride_BN}));
     Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
     Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
@@ -381,8 +381,8 @@ int main(int argc, char* argv[])
     DeviceMem d2_device_buf(sizeof(D2DataType) * d2_e_n.mDesc.GetElementSpaceSize());
     DeviceMem e_device_buf(sizeof(EDataType) * e_t_n_device_result.mDesc.GetElementSpaceSize());
     // a0_t_k.savetxt("a.txt");
-    // expert_ids.savetxt("expert_ids.txt", "int");
-    // sorted_token_ids.savetxt("sorted_token_ids.txt", "int");
+    expert_ids.savetxt("expert_ids.txt", "int");
+    sorted_token_ids.savetxt("sorted_token_ids.txt", "int");
     // d2_e_n.savetxt("d2_e_n.txt", "int");
     sorted_token_ids_dev.ToDevice(sorted_token_ids.mData.data());
     expert_ids_dev.ToDevice(expert_ids.mData.data());
@@ -503,8 +503,8 @@ int main(int argc, char* argv[])
                                                       expert_ids,
                                                       max_token_id,
                                                       MPerBlock,
-                                                      a0_t_k,
-                                                      b0_e_n_k,
+                                                      a_t_k,
+                                                      b_e_n_k,
                                                       d2_e_n,
                                                       c_t_k_n,
                                                       PassThrough{},
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp
index 1cfe72981b..b815fcf83c 100644
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp
@@ -131,6 +131,18 @@ struct ThreadGroupTensorSliceTransfer_v7r3_scatter
         }
     }
 
+    template <typename SrcBuffers, index_t ThreadScratchId = 0>
+    __device__ void RunRead(const SrcDescs& src_descs,
+                            const SrcBuffers& src_bufs,
+                            Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.RunRead(src_descs, src_bufs, thread_scratch_id);
+        }
+    }
+
     template <typename SrcBuffers, index_t ThreadScratchId = 0>
     __device__ void RunRead(const SrcDescs& src_descs,
                             const SrcBuffers& src_bufs,
@@ -170,7 +182,18 @@ struct ThreadGroupTensorSliceTransfer_v7r3_scatter
                         const SrcBuffers& src_bufs,
                         const DstDescs& dst_descs,
                         DstBuffers dst_bufs,
-                        StaticallyIndexedArray<index_t, scatter_num>& scatter_offsets,
+                        StaticallyIndexedArray<IndexType, scatter_num>& scatter_offsets)
+    {
+        RunRead(src_descs, src_bufs);
+        RunWrite(dst_descs, dst_bufs, scatter_offsets);
+    }
+
+    template <typename SrcBuffers, typename DstBuffers>
+    __device__ void Run(const SrcDescs& src_descs,
+                        const SrcBuffers& src_bufs,
+                        const DstDescs& dst_descs,
+                        DstBuffers dst_bufs,
+                        StaticallyIndexedArray<IndexType, scatter_num>& scatter_offsets,
                         StaticallyIndexedArray<float, scatter_num>& scatter_weights)
     {
         RunRead(src_descs, src_bufs, scatter_weights);
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
index 5240d29f1e..f5e4b759f0 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
@@ -1206,7 +1206,7 @@ struct GridwiseMoeGemmBlockScale
                        math::integer_divide_ceil(problem.K, ScaleBlockK)),
             make_tuple(math::integer_divide_ceil(problem.K, ScaleBlockK), 1));
         const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor(
-            make_tuple(math::integer_divide_ceil(problem.N * (IsInputGemm ? 2 : 1), ScaleBlockN),
+            make_tuple(math::integer_divide_ceil(problem.N , ScaleBlockN),
                        math::integer_divide_ceil(problem.K, ScaleBlockK)),
             make_tuple(math::integer_divide_ceil(problem.K, ScaleBlockK), 1));
 
@@ -1267,7 +1267,7 @@ struct GridwiseMoeGemmBlockScale
         });
         const index_t expert_stride = __builtin_amdgcn_readfirstlane(problem.N * problem.K * (IsInputGemm ? 2 : 1));
         const index_t expert_scale_stride =
-            __builtin_amdgcn_readfirstlane(math::integer_divide_ceil(problem.N * (IsInputGemm ? 2 : 1), ScaleBlockN) *
+            __builtin_amdgcn_readfirstlane(math::integer_divide_ceil(problem.N, ScaleBlockN) * (IsInputGemm ? 2 : 1) *
                                            math::integer_divide_ceil(problem.K, ScaleBlockK));
 
         // N0, K0, Blocksize*KPack
@@ -1958,7 +1958,7 @@ struct GridwiseMoeGemmBlockScale
                        math::integer_divide_ceil(problem.K, ScaleBlockK)),
             make_tuple(math::integer_divide_ceil(problem.K, ScaleBlockK), 1));
         const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor(
-            make_tuple(math::integer_divide_ceil(problem.N * (IsInputGemm ? 2 : 1), ScaleBlockN),
+            make_tuple(math::integer_divide_ceil(problem.N, ScaleBlockN),
                        math::integer_divide_ceil(problem.K, ScaleBlockK)),
             make_tuple(math::integer_divide_ceil(problem.K, ScaleBlockK), 1));
         const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
@@ -2019,7 +2019,7 @@ struct GridwiseMoeGemmBlockScale
         });
         const index_t expert_stride = __builtin_amdgcn_readfirstlane(problem.N * problem.K * (IsInputGemm ? 2 : 1));
         const index_t expert_scale_stride =
-            __builtin_amdgcn_readfirstlane(math::integer_divide_ceil(problem.N  * (IsInputGemm ? 2 : 1), ScaleBlockN) *
+            __builtin_amdgcn_readfirstlane(math::integer_divide_ceil(problem.N , ScaleBlockN)  * (IsInputGemm ? 2 : 1) * 
                                            math::integer_divide_ceil(problem.K, ScaleBlockK));
         // N0, K0, Blocksize*KPack
         const index_t n_block_data_idx_on_grid =
@@ -2158,8 +2158,8 @@ struct GridwiseMoeGemmBlockScale
             {
                 token_offset = token_offset * problem.TopK + (fused_token >> 24);
             }
-            scale_gather_offsets(m0) =
-                token_offset * math::integer_divide_ceil(problem.K, ScaleBlockK);
+            scale_gather_offsets(m0) = static_cast<IndexType>(token_offset) * 
+                                       math::integer_divide_ceil(problem.K, ScaleBlockK);
         });
 
         // printf("blkid: %d, tid:%d, a_thread_offset: %d, scale_gather_offsets: %d\n", block_m_id,
@@ -2222,7 +2222,7 @@ struct GridwiseMoeGemmBlockScale
                                        KPack / KGroup * (get_thread_local_1d_id() % warpSize)));
             const BScaleType* p_b_scale_grid_up = p_b_scale_grid + expert_scale_stride / 2 / BPackedSize;
             const auto b_scale_grid_buf_up = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                p_b_scale_grid_up + expert_id * expert_scale_stride,
+                p_b_scale_grid_up + expert_id * expert_scale_stride / BPackedSize,
                 b_scale_grid_desc_bn_ak.GetElementSpaceSize());
             auto b_scale_thread_copy_up =
                 ThreadwiseTensorSliceTransfer_v2<BScaleType,
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp
index ae76b8739f..21d4cf68b3 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp
@@ -149,6 +149,119 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter
 
     // SrcDescs: Tuple<const SrcDesc0&, const SrcDesc1&, ...>
     // SrcBuffers: Tuple<const SrcBuffer0&, const SrcBuffer1&, ...>
+    template <typename SrcBuffers,
+              index_t ThreadScratchId                                   = 0,
+              enable_if_t<SrcDescs::Size() == SrcBuffers::Size(), bool> = false>
+    __device__ void RunRead(const SrcDescs& src_descs,
+                            const SrcBuffers& src_bufs,
+                            Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        // loop over space-filling curve
+        static_for<0, src_num_access, 1>{}([&](auto iAccess) {
+            auto src_vectors = generate_vectors<SrcDatas, SrcScalarPerVector>();
+            auto elm_vectors = generate_vectors<DstDatas, SrcScalarPerVector>();
+
+            bool oob_val = true;
+
+            // copy data from src_bufs into src_vectors
+            static_for<0, nSrc, 1>{}([&](auto i) {
+                using src_vector_t = typename remove_cvref_t<decltype(src_vectors[i])>::type;
+
+                const bool is_src_valid =
+                    coordinate_has_valid_offset_assuming_visible_index_is_valid(src_descs[i],
+                                                                                src_coords_[i]);
+
+                oob_val = oob_val & is_src_valid;
+                src_vectors(i).template AsType<src_vector_t>()(I0) =
+                    src_bufs[i].template Get<src_vector_t>(src_coords_[i].GetOffset(), true);
+            });
+
+            constexpr auto get_elem_op_vec_len = []() {
+                if constexpr(is_detected<is_pack8_invocable_t, decltype(element_op_)>::value)
+                {
+                    if constexpr(decltype(element_op_)::is_pack8_invocable)
+                        return math::min(8, SrcScalarPerVector);
+                }
+                if constexpr(is_detected<is_pack4_invocable_t, decltype(element_op_)>::value)
+                {
+                    if constexpr(decltype(element_op_)::is_pack4_invocable)
+                        return math::min(4, SrcScalarPerVector);
+                }
+                if constexpr(is_detected<is_pack2_invocable_t, decltype(element_op_)>::value)
+                {
+                    if constexpr(decltype(element_op_)::is_pack2_invocable)
+                        return math::min(2, SrcScalarPerVector);
+                }
+                return 1;
+            };
+
+            constexpr index_t elem_op_vec_len = get_elem_op_vec_len();
+
+            // apply pointwise function
+            static_for<0, SrcScalarPerVector / elem_op_vec_len, 1>{}([&](auto i) {
+                // get reference to src data
+                const auto src_data_refs = generate_tie(
+                    // return type should be lvalue
+                    [&](auto iSrc) -> const auto& {
+                        using SrcData = remove_cvref_t<tuple_element_t<iSrc.value, SrcDatas>>;
+
+                        using elem_op_vec_t = typename vector_type<SrcData, elem_op_vec_len>::type;
+
+                        return src_vectors[iSrc].template AsType<elem_op_vec_t>()[i];
+                    },
+                    Number<nSrc>{});
+
+                // get reference to dst data
+                auto dst_data_refs = generate_tie(
+                    // return type should be lvalue
+                    [&](auto iDst) -> auto& {
+                        using DstData = remove_cvref_t<tuple_element_t<iDst.value, DstDatas>>;
+
+                        using elem_op_vec_t = typename vector_type<DstData, elem_op_vec_len>::type;
+
+                        return elm_vectors(iDst).template AsType<elem_op_vec_t>()(i);
+                    },
+                    Number<nDst>{});
+
+                // apply pointwise function
+                // pointwise function signature:
+                // element_op_(dst_data_refs[I0],
+                //             dst_data_refs[I1],
+                //             ...,
+                //             src_data_refs[I0],
+                //             src_data_refs[I1],
+                //             ...)
+                unpack2(element_op_, dst_data_refs, src_data_refs);
+            });
+
+            elm_vectors_tuple_(thread_scratch_id)(iAccess) = elm_vectors;
+            oob_vectors_tuple_(thread_scratch_id)(iAccess) = oob_val;
+
+            // move coordinate
+            if constexpr(iAccess.value != src_num_access - 1)
+            {
+                constexpr auto forward_step = SrcSpaceFillingCurve::GetForwardStep(iAccess);
+
+                static_for<0, nSrc, 1>{}([&](auto i) {
+                    move_tensor_coordinate(src_descs[i],
+                                           src_coords_(i),
+                                           make_tensor_coordinate_step(src_descs[i], forward_step));
+                });
+            }
+        });
+
+        // move coordinate back to slice origin (or not)
+        static_for<0, nSrc, 1>{}([&](auto i) {
+            if constexpr(SrcResetCoordinateAfterRunFlags::At(i))
+            {
+                const auto src_reset_step =
+                    make_tensor_coordinate_step(src_descs[i], GetSrcCoordinateResetStep());
+
+                move_tensor_coordinate(src_descs[i], src_coords_(i), src_reset_step);
+            }
+        });
+    }
+
     template <typename SrcBuffers,
               index_t ThreadScratchId                                   = 0,
               enable_if_t<SrcDescs::Size() == SrcBuffers::Size(), bool> = false>
@@ -413,7 +526,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter
               enable_if_t<DstDescs::Size() == 1 && DstBuffers::Size() == 1, bool> = false>
     __device__ void RunWrite(const DstDescs& dst_descs,
                              DstBuffers dst_bufs,
-                             StaticallyIndexedArray<index_t, scatter_num>& scatter_offsets,
+                             StaticallyIndexedArray<IndexType, scatter_num>& scatter_offsets,
                              Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
     {
         OOBCheck(thread_scratch_id);
@@ -485,6 +598,21 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter
               enable_if_t<SrcDescs::Size() == SrcBuffers::Size() &&
                               DstDescs::Size() == DstBuffers::Size(),
                           bool> = false>
+    __device__ void Run(const SrcDescs& src_descs,
+                        const SrcBuffers& src_bufs,
+                        const DstDescs& dst_descs,
+                        DstBuffers dst_bufs,
+                        StaticallyIndexedArray<IndexType, scatter_num>& scatter_offsets)
+    {
+        RunRead(src_descs, src_bufs);
+        RunWrite(dst_descs, dst_bufs, scatter_offsets);
+    }
+
+        template <typename SrcBuffers,
+              typename DstBuffers,
+              enable_if_t<SrcDescs::Size() == SrcBuffers::Size() &&
+                              DstDescs::Size() == DstBuffers::Size(),
+                          bool> = false>
     __device__ void Run(const SrcDescs& src_descs,
                         const SrcBuffers& src_bufs,
                         const DstDescs& dst_descs,
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp
index 367f2ea586..6abccc7bc6 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp
@@ -163,7 +163,7 @@ struct ReferenceMoeGemm1BlockScale : public device::BaseOperator
 
                     arg.c_element_op_(v_c, v_acc);
                     arg.c_element_op_(v_c_up, v_acc_up);
-
+                    #if 0
                     if constexpr(ActivationType == 1)
                     {
                         if constexpr(is_same_v<BDataType, pk_i4_t>)
@@ -184,6 +184,8 @@ struct ReferenceMoeGemm1BlockScale : public device::BaseOperator
                         tensor_operation::element_wise::Gelu{}(v_c, v_c);
                         arg.c_t_k_n_(t, topk_id, n) = v_c * v_c_up;
                     }
+                    #endif
+                    arg.c_t_k_n_(t, topk_id, n) = v_c;
                 }
             };
 

From f392874e3da586734b8e593c7b0b39742cf61477 Mon Sep 17 00:00:00 2001
From: OscarXu <huaiguxu@amd.com>
Date: Tue, 13 May 2025 09:51:08 +0800
Subject: [PATCH 14/18] revert unexpected file change

---
 example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp         | 2 +-
 .../blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
index fa8f95db25..a05234ad3c 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
@@ -269,7 +269,7 @@ int main(int argc, char* argv[])
     // int eids[]         = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 3, 3, 3};
     for(int i = 0; i < sorted_tile_num; i++)
     {
-        expert_ids.mData[i] = i / ck::math::integer_divide_ceil(valid_tile_num, experts);
+        expert_ids.mData[i] = i / (valid_tile_num / experts);
     }
 
     int token_per_tile = (tokens * topk + valid_tile_num - 1) / valid_tile_num;
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
index 9e9978330b..c2fad6132f 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp
@@ -601,7 +601,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3<BlockGemmPipelineSch
                                                             make_tuple(n0, I0, k0, ik))>{}];
                                 });
 
-                                using mfma_input_type =xdlops_gemm
+                                using mfma_input_type =
                                     typename vector_type<ComputeDataType,
                                                          xdlops_gemm.K1PerXdlops>::type;
 

From 9ead312164806a2831cb7ddec0d000adbf8a942c Mon Sep 17 00:00:00 2001
From: OscarXu <huaiguxu@amd.com>
Date: Tue, 13 May 2025 15:32:39 +0800
Subject: [PATCH 15/18] Gemm1 GUFusion function pass. Perf WIP

---
 .../moe_gemm1_xdl_fp8_blockscale.cpp          |   8 +-
 .../gpu/grid/gridwise_moe_gemm_blockscale.hpp | 249 +++++++++---------
 .../cpu/reference_moe_gemm1_blockscale.hpp    |   3 -
 3 files changed, 131 insertions(+), 129 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
index 155c052b4e..05de0c98dc 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
@@ -190,7 +190,7 @@ int main(int argc, char* argv[])
     bool do_verification = true;
     int init_method      = 1;
     bool time_kernel     = true;
-#if 1
+#if 0
     // GEMM shape
     ck::index_t N               = 4096;
     ck::index_t K               = 6144;
@@ -205,7 +205,7 @@ int main(int argc, char* argv[])
     #else
     //deepseek
     ck::index_t N               = 2048;
-    ck::index_t K               = 7160;
+    ck::index_t K               = 7168;
     ck::index_t experts         = 256;
     ck::index_t topk            = 8;
     ck::index_t tokens          = 4096;
@@ -217,7 +217,7 @@ int main(int argc, char* argv[])
     {
         // use default case
     }
-    else if(argc == 3)
+    else if(argc == 4)
     {
         // use default case
         do_verification = std::stoi(argv[1]);
@@ -279,7 +279,7 @@ int main(int argc, char* argv[])
     // int eids[]         = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 3, 3, 3};
     for(int i = 0; i < sorted_tile_num; i++)
     {
-        expert_ids.mData[i] = (i / valid_tile_num) / experts;
+        expert_ids.mData[i] = i / ck::math::integer_divide_ceil(valid_tile_num, experts);
     }
 
     int token_per_tile = (tokens * topk + valid_tile_num - 1) / valid_tile_num;
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
index f5e4b759f0..eb7031d2b1 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_blockscale.hpp
@@ -1576,69 +1576,71 @@ struct GridwiseMoeGemmBlockScale
             constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I5);
             constexpr auto N3 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I6);
             constexpr auto N4 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I7);
-            // if constexpr(IsInputGemm) // gu fusion
-            // {
-            //     static_assert(N0 * N1 * N2 * N3 * N4 == NPerBlock);
-            //     static_assert(N4 == 4);
-            //     const index_t n1 = get_warp_local_1d_id() / MWave;
-            //     const index_t n3 = threadIdx.y % get_warp_size() / NPerXdl;
-            //     vector_type<float, 4> topk_weights;
-            //     static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
-            //         static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
-            //             static_for<0, N2, 1>{}([&](auto n2) {      // m_inst_num_groups_per_blk
-            //                 const index_t m_pos = block_m_id * MPerBlock + n0 * N1 * N2 * N3 * N4 +
-            //                                         n1 * N2 * N3 * N4 + n2 * N3 * N4 + n3 * N4;
-            //                 if constexpr(MulRoutedWeight)
-            //                 {
-            //                     topk_weights = *c_style_pointer_cast<const vector_type<float, N4>*>(
-            //                         p_ds_grid[I0] + m_pos);
-            //                 }
-            //                 static_for<0, N4, 1>{}([&](auto n4) { // m_inst_group_size
-            //                         constexpr index_t c_offset =
-            //                             blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset(
-            //                                 make_tuple(m0, n0, n2 * N4 + n4));
-            //                         constexpr auto cidx = Number<c_offset>{};
 
-            //                         if constexpr(ActivationOperation == Activation::silu_and_mul)
-            //                         {
-            //                             float gate = c_thread_buf[cidx];
-            //                             float up   = c_thread_buf_up[cidx];
-            //                             if constexpr(MulRoutedWeight)
-            //                             {
-            //                                 gate = gate * topk_weights.AsType<float>()[n4];
-            //                                 up   = up * topk_weights.AsType<float>()[n4];
-            //                             }
-            //                             if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
-            //                             {
-            //                                 gate *= 16;
-            //                                 up *= 16;
-            //                             }
-            //                             tensor_operation::element_wise::Silu{}(gate, gate);
-            //                             c_thread_buf(cidx) = gate * up;
-            //                         }
-            //                         else if(ActivationOperation == Activation::gelu_and_mul)
-            //                         {
-            //                             float gate = c_thread_buf[cidx];
-            //                             float up   = c_thread_buf_up[cidx];
-            //                             if constexpr(MulRoutedWeight)
-            //                             {
-            //                                 gate = gate * topk_weights.AsType<float>()[n3];
-            //                                 up   = up * topk_weights.AsType<float>()[n3];
-            //                             }
-            //                             if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
-            //                             {
-            //                                 gate *= 16;
-            //                                 up *= 16;
-            //                             }
-            //                             tensor_operation::element_wise::Gelu{}(gate, gate);
-            //                             c_thread_buf(cidx) = gate * up;
-            //                         }
-            //                     });
-            //                 });
-            //             });
-            //         });
-            //     });
-            // }
+            if constexpr(IsInputGemm) // gu fusion, elementwise
+            {
+                static_assert(N0 * N1 * N2 * N3 * N4 == NPerBlock);
+                static_assert(N4 == 4);
+                const index_t n1 = get_warp_local_1d_id() / MWave;
+                const index_t n3 = threadIdx.x % get_warp_size() / NPerXdl;
+
+                vector_type<float, 4> topk_weights;
+                static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
+                    static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
+                        static_for<0, N2, 1>{}([&](auto n2) {      // num_groups_per_blk
+                            const index_t n_pos = block_n_id * NPerBlock + n0 * N1 * N2 * N3 * N4 +
+                                                    n1 * N2 * N3 * N4 + n2 * N3 * N4 + n3 * N4;
+                            if constexpr(MulRoutedWeight)
+                            {
+                                topk_weights = *c_style_pointer_cast<const vector_type<float, N4>*>(
+                                                p_ds_grid[I0] + n_pos);
+                            }
+                            // if((blockIdx.x == 0) && (blockIdx.y == 0)){printf("m0:%d, n_pos:%d\n", static_cast<int>(m0), n_pos);}
+                            static_for<0, N4, 1>{}([&](auto n4) { // inst_group_size
+                                constexpr index_t c_offset =
+                                    blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset(
+                                        make_tuple(m0, n0, n2 * N4 + n4));
+                                constexpr auto cidx = Number<c_offset>{};
+
+                                if constexpr(ActivationOperation == Activation::silu_and_mul)
+                                {
+                                    float gate = c_thread_buf[cidx];
+                                    float up   = c_thread_buf_up[cidx];
+                                    if constexpr(MulRoutedWeight)
+                                    {
+                                        gate = gate * topk_weights.AsType<float>()[n4];
+                                        up   = up * topk_weights.AsType<float>()[n4];
+                                    }
+                                    if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+                                    {
+                                        gate *= 16;
+                                        up *= 16;
+                                    }
+                                    tensor_operation::element_wise::Silu{}(gate, gate);
+                                    c_thread_buf(cidx) = gate * up;
+                                }
+                                else if(ActivationOperation == Activation::gelu_and_mul)
+                                {
+                                    float gate = c_thread_buf[cidx];
+                                    float up   = c_thread_buf_up[cidx];
+                                    if constexpr(MulRoutedWeight)
+                                    {
+                                        gate = gate * topk_weights.AsType<float>()[n4];
+                                        up   = up * topk_weights.AsType<float>()[n4];
+                                    }
+                                    if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+                                    {
+                                        gate *= 16;
+                                        up *= 16;
+                                    }
+                                    tensor_operation::element_wise::Gelu{}(gate, gate);
+                                    c_thread_buf(cidx) = gate * up;
+                                }
+                            });
+                        });
+                    });
+                });
+            }
 
             constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
                 GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
@@ -2327,67 +2329,70 @@ struct GridwiseMoeGemmBlockScale
             constexpr auto N3 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I6);
             constexpr auto N4 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I7);
 
-            // if constexpr(IsInputGemm) // gu fusion
-            // {
-                
-            //     vector_type<float, 4> topk_weights;
-            //     static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
-            //         static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
-            //             static_for<0, M2, 1>{}([&](auto m2) {      // m_inst_num_groups_per_blk
-            //                 static_for<0, N3, 1>{}([&](auto n3) { // m_inst_group_size
-            //                     const index_t m_pos = block_m_id * MPerBlock + n0 * N1 * N2 * N3 * N4 +
-            //                                             n1 * N2 * N3 * N4 + n2 * N3 * N4 + n3 * N4;
-            //                     if constexpr(MulRoutedWeight)
-            //                     {
-            //                         topk_weights = *c_style_pointer_cast<const vector_type<float, N3>*>(
-            //                             p_ds_grid[I0] + m_pos);
-            //                     }
-            //                     static_for<0, N4, 1>{}([&](auto n4) { // m_inst_group_size
-            //                         constexpr index_t c_offset =
-            //                             blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset(
-            //                                 make_tuple(m0, n0, m2 * N3 + n3));
-            //                         constexpr auto cidx = Number<c_offset>{};
+            if constexpr(IsInputGemm) // gu fusion, elementwise
+            {
+                static_assert(N0 * N1 * N2 * N3 * N4 == NPerBlock);
+                static_assert(N4 == 4);
+                const index_t n1 = get_warp_local_1d_id() / MWave;
+                const index_t n3 = threadIdx.x % get_warp_size() / NPerXdl;
 
-            //                         if constexpr(ActivationOperation == Activation::silu_and_mul)
-            //                         {
-            //                             float gate = c_thread_buf[cidx];
-            //                             float up   = c_thread_buf_up[cidx];
-            //                             if constexpr(MulRoutedWeight)
-            //                             {
-            //                                 gate = gate * topk_weights.AsType<float>()[n3];
-            //                                 up   = up * topk_weights.AsType<float>()[n3];
-            //                             }
-            //                             if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
-            //                             {
-            //                                 gate *= 16;
-            //                                 up *= 16;
-            //                             }
-            //                             tensor_operation::element_wise::Silu{}(gate, gate);
-            //                             c_thread_buf(cidx) = gate * up;
-            //                         }
-            //                         else if(ActivationOperation == Activation::gelu_and_mul)
-            //                         {
-            //                             float gate = c_thread_buf[cidx];
-            //                             float up   = c_thread_buf_up[cidx];
-            //                             if constexpr(MulRoutedWeight)
-            //                             {
-            //                                 gate = gate * topk_weights.AsType<float>()[n3];
-            //                                 up   = up * topk_weights.AsType<float>()[n3];
-            //                             }
-            //                             if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
-            //                             {
-            //                                 gate *= 16;
-            //                                 up *= 16;
-            //                             }
-            //                             tensor_operation::element_wise::Gelu{}(gate, gate);
-            //                             c_thread_buf(cidx) = gate * up;
-            //                         }
-            //                     });
-            //                 });
-            //             });
-            //         });
-            //     });
-            // }
+                vector_type<float, 4> topk_weights;
+                static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
+                    static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
+                        static_for<0, N2, 1>{}([&](auto n2) {      // num_groups_per_blk
+                            const index_t n_pos = block_n_id * NPerBlock + n0 * N1 * N2 * N3 * N4 +
+                                                    n1 * N2 * N3 * N4 + n2 * N3 * N4 + n3 * N4;
+                            if constexpr(MulRoutedWeight)
+                            {
+                                topk_weights = *c_style_pointer_cast<const vector_type<float, N4>*>(
+                                                p_ds_grid[I0] + n_pos);
+                            }
+                            // if((blockIdx.x == 0) && (blockIdx.y == 0)){printf("m0:%d, n_pos:%d\n", static_cast<int>(m0), n_pos);}
+                            static_for<0, N4, 1>{}([&](auto n4) { // inst_group_size
+                                constexpr index_t c_offset =
+                                    blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset(
+                                        make_tuple(m0, n0, n2 * N4 + n4));
+                                constexpr auto cidx = Number<c_offset>{};
+
+                                if constexpr(ActivationOperation == Activation::silu_and_mul)
+                                {
+                                    float gate = c_thread_buf[cidx];
+                                    float up   = c_thread_buf_up[cidx];
+                                    if constexpr(MulRoutedWeight)
+                                    {
+                                        gate = gate * topk_weights.AsType<float>()[n4];
+                                        up   = up * topk_weights.AsType<float>()[n4];
+                                    }
+                                    if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+                                    {
+                                        gate *= 16;
+                                        up *= 16;
+                                    }
+                                    tensor_operation::element_wise::Silu{}(gate, gate);
+                                    c_thread_buf(cidx) = gate * up;
+                                }
+                                else if(ActivationOperation == Activation::gelu_and_mul)
+                                {
+                                    float gate = c_thread_buf[cidx];
+                                    float up   = c_thread_buf_up[cidx];
+                                    if constexpr(MulRoutedWeight)
+                                    {
+                                        gate = gate * topk_weights.AsType<float>()[n4];
+                                        up   = up * topk_weights.AsType<float>()[n4];
+                                    }
+                                    if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+                                    {
+                                        gate *= 16;
+                                        up *= 16;
+                                    }
+                                    tensor_operation::element_wise::Gelu{}(gate, gate);
+                                    c_thread_buf(cidx) = gate * up;
+                                }
+                            });
+                        });
+                    });
+                });
+            }
 
             constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
                 GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp
index 6abccc7bc6..8845e6b0a2 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp
@@ -163,7 +163,6 @@ struct ReferenceMoeGemm1BlockScale : public device::BaseOperator
 
                     arg.c_element_op_(v_c, v_acc);
                     arg.c_element_op_(v_c_up, v_acc_up);
-                    #if 0
                     if constexpr(ActivationType == 1)
                     {
                         if constexpr(is_same_v<BDataType, pk_i4_t>)
@@ -184,8 +183,6 @@ struct ReferenceMoeGemm1BlockScale : public device::BaseOperator
                         tensor_operation::element_wise::Gelu{}(v_c, v_c);
                         arg.c_t_k_n_(t, topk_id, n) = v_c * v_c_up;
                     }
-                    #endif
-                    arg.c_t_k_n_(t, topk_id, n) = v_c;
                 }
             };
 

From 50590b8f176218a5ced201b3532886baff75ef11 Mon Sep 17 00:00:00 2001
From: OscarXu <huaiguxu@amd.com>
Date: Tue, 13 May 2025 16:46:51 +0800
Subject: [PATCH 16/18] Add gemm1 v1 32x128x128

---
 .../moe_gemm1_xdl_fp8_blockscale.cpp          |   16 +-
 ...oe_blockscale_b_preshuffle_gufusion_v1.hpp | 1017 +++++++++++++++++
 ...s_moe_blockscale_b_preshuffle_selector.hpp |    4 +-
 3 files changed, 1027 insertions(+), 10 deletions(-)
 create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp

diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
index 05de0c98dc..3e93037496 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
@@ -129,8 +129,8 @@ static constexpr ck::index_t Nswizzle    = false;
 static constexpr ck::index_t ActOP       = 0; // 0: gelu_and_mul, 1: silu_and_mul
 static constexpr bool MulRoutedWeight    = false;
 
-#if 0
-static constexpr ck::index_t MPerBlock = 128;
+#if 1
+static constexpr ck::index_t MPerBlock = 32;
 static constexpr ck::index_t NPerBlock   = 128;
 static constexpr ck::index_t MNPerXDL    = 16;
 static constexpr ck::index_t MXDLPerWave = MPerBlock / (MNPerXDL * 1);
@@ -167,7 +167,7 @@ using DeviceOpInstance                   = ck::tensor_operation::device::DeviceM
                //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
                 //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
                 CShuffleMXDLPerWave,    CShuffleNXDLPerWave,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec, 1>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, ActOP, Nswizzle, true, MulRoutedWeight, int32_t, A0DataType>;
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, ActOP, Nswizzle, true, MulRoutedWeight, int32_t, A0DataType>;
 #else
 static constexpr ck::index_t MPerBlock = 128; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmBlockScale<
                Row, Col, DsLayout, ELayout,
@@ -190,7 +190,7 @@ int main(int argc, char* argv[])
     bool do_verification = true;
     int init_method      = 1;
     bool time_kernel     = true;
-#if 0
+#if 1
     // GEMM shape
     ck::index_t N               = 4096;
     ck::index_t K               = 6144;
@@ -199,10 +199,12 @@ int main(int argc, char* argv[])
     // ck::index_t sorted_tile_num = 133;
     // ck::index_t valid_tile_num  = 128;
     // ck::index_t tokens          = 8192;
-    ck::index_t sorted_tile_num = 15;
-    ck::index_t valid_tile_num  = 13;
+    // ck::index_t sorted_tile_num = 15;
+    // ck::index_t valid_tile_num  = 13;
+    ck::index_t sorted_tile_num = 55;
+    ck::index_t valid_tile_num  = 52;
     ck::index_t tokens          = 832;
-    #else
+#else
     //deepseek
     ck::index_t N               = 2048;
     ck::index_t K               = 7168;
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp
new file mode 100644
index 0000000000..2e8067b85d
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp
@@ -0,0 +1,1017 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Compute optimized pipeline
+// GlobalPrefetchStages: 2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 1
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MScaleBlock,
+          index_t NScaleBlock,
+          index_t KScaleBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPacks>
+struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MScaleBlock,
+          index_t NScaleBlock,
+          index_t KScaleBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1<
+    BlockGemmPipelineScheduler::Intrawave,
+    BlockSize,
+    ADataType,
+    BDataType,
+    ComputeDataType,
+    AccDataType,
+    ATileDesc,
+    BTileDesc,
+    AMmaTileDesc,
+    BMmaTileDesc,
+    ABlockTransferSrcScalarPerVector,
+    BBlockTransferSrcScalarPerVector,
+    MPerBlock,
+    NPerBlock,
+    KPerBlock,
+    MScaleBlock,
+    NScaleBlock,
+    KScaleBlock,
+    MPerXDL,
+    NPerXDL,
+    MRepeat,
+    NRepeat,
+    KPack> : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                               ADataType,
+                                               BDataType,
+                                               ComputeDataType,
+                                               AccDataType,
+                                               ATileDesc,
+                                               BTileDesc,
+                                               AMmaTileDesc,
+                                               BMmaTileDesc,
+                                               ABlockTransferSrcScalarPerVector,
+                                               BBlockTransferSrcScalarPerVector,
+                                               MPerBlock,
+                                               NPerBlock,
+                                               KPerBlock,
+                                               MPerXDL,
+                                               NPerXDL,
+                                               MRepeat,
+                                               NRepeat,
+                                               KPack,
+                                               true>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack,
+                                                   true>;
+    using Base::A_K1;
+    using Base::B_K1;
+    using Base::I0;
+    using Base::I1;
+    using Base::KRepeat;
+    using Base::xdlops_gemm;
+    using typename Base::HotLoopInstList;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::MWaves;
+    using Base::NWaves;
+
+    static constexpr index_t PrefetchStages  = 2;
+    static constexpr index_t PrefillStages   = 1;
+    static constexpr index_t GlobalBufferNum = 2;
+
+    template <typename TileDesc_M0_M1_M2_K>
+    __host__ __device__ static constexpr auto MakeAGemmMmaTileDescriptor(const TileDesc_M0_M1_M2_K&)
+    {
+        constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
+        constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
+        constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
+        constexpr index_t K2 = KPack;
+        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K0 = KRepeat;
+
+        return transform_tensor_descriptor(
+            TileDesc_M0_M1_M2_K{},
+            make_tuple(
+                make_pass_through_transform(Number<M0>{}),
+                make_pass_through_transform(Number<M1>{}),
+                make_pass_through_transform(Number<M2>{}),
+                make_unmerge_transform(make_tuple(Number<K0>{}, Number<K1>{}, Number<K2>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}));
+    }
+
+    static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2 =
+        MakeAGemmMmaTileDescriptor(a_block_desc_m0_m1_m2_k);
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd;
+    }
+
+    __device__ static constexpr auto HotLoopScheduler()
+    {
+        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num * MWaves * 2;
+
+        // B global
+        static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+        });
+
+        // A global
+        static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+        });
+
+        // A local
+        static_for<0, num_ds_read_inst_a / 2, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
+        });
+    }
+
+    template <bool HasMainLoop,
+              int NumKBlockPerScale,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CScaleThreadDesc,
+              typename CThreadBuffer,
+              typename AScaleGridBuffer,
+              typename AScaleGridDesc,
+              typename AScaleThreadDesc,
+              typename AScaleThreadTransfer,
+              typename AScaleThreadTransferStep,
+              typename BScaleGridBuffer,
+              typename BScaleGridDesc,
+              typename BScaleThreadDesc,
+              typename BScaleThreadTransfer,
+              typename BScaleThreadTransferStep>
+    __device__ void Run(
+        // ABlockCopy
+        const AGridDesc& a_grid_desc,
+        const ABlockDesc& a_block_desc,
+        ABlockTransfer& a_blockwise_copy,
+        const AGridBuffer& a_grid_buf,
+        ABlockBuffer& a_block_buf,
+        const ABlockTransferStep& a_block_copy_step,
+        // BBlockCopy
+        const BGridDesc& b_grid_desc,
+        const BBlockDesc& b_block_desc,
+        BBlockTransfer& b_blockwise_copy,
+        BBlockTransfer& b_blockwise_copy_up,
+        const BGridBuffer& b_grid_buf,
+        const BGridBuffer& b_grid_buf_up,
+        BBlockBuffer& b_block_buf,
+        const BBlockTransferStep& b_block_copy_step,
+        // CThread
+        const CScaleThreadDesc& c_scale_thread_desc,
+        CThreadBuffer& c_thread_buf,
+        CThreadBuffer& c_thread_buf_up,
+        // AScaleThreadCopy
+        const AScaleGridDesc& a_scale_grid_desc,
+        const AScaleThreadDesc& a_scale_thread_desc,
+        AScaleThreadTransfer& a_scale_thread_copy,
+        const AScaleGridBuffer& a_scale_grid_buf,
+        const AScaleThreadTransferStep& a_scale_thread_copy_step,
+        // BScaleThreadCopy
+        const BScaleGridDesc& b_scale_grid_desc,
+        const BScaleThreadDesc& b_scale_thread_desc,
+        BScaleThreadTransfer& b_scale_thread_copy,
+        BScaleThreadTransfer& b_scale_thread_copy_up,
+        const BScaleGridBuffer& b_scale_grid_buf,
+        const BScaleGridBuffer& b_scale_grid_buf_up,
+        const BScaleThreadTransferStep& b_scale_thread_copy_step,
+        // num_loop
+        index_t num_loop) const
+    {
+        ignore = b_block_desc;
+        ignore = b_block_buf;
+        // __builtin_amdgcn_sched_barrier(0);
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs_up;
+        constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0);
+
+        auto a_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
+            a_scale_thread_desc.GetElementSpaceSize());
+        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
+            b_scale_thread_desc.GetElementSpaceSize());
+        auto b_scale_thread_buf_up = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
+            b_scale_thread_desc.GetElementSpaceSize());
+        auto c_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
+            c_scale_thread_desc.GetElementSpaceSize());
+        auto c_scale_thread_buf_up = make_static_buffer<AddressSpaceEnum::Vgpr, AccDataType>(
+            c_scale_thread_desc.GetElementSpaceSize());
+
+        // Global prefetch A1 B1
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
+        b_blockwise_copy.Run(b_grid_desc,
+                             b_grid_buf,
+                             b_block_desc_n0_n1_k0_k1,
+                             b_block_origin_idx,
+                             b_thread_bufs(I0));
+        b_blockwise_copy_up.Run(b_grid_desc,
+                                b_grid_buf_up,
+                                b_block_desc_n0_n1_k0_k1,
+                                b_block_origin_idx,
+                                b_thread_bufs_up(I0));
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+        b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        a_scale_thread_copy.Run(a_scale_grid_desc,
+                                a_scale_grid_buf,
+                                a_scale_thread_desc,
+                                make_tuple(I0, I0),
+                                a_scale_thread_buf);
+
+        if constexpr(NumKBlockPerScale == 1)
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<1>{}));
+        }
+        else
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<0>{}));
+        }
+
+        b_scale_thread_copy.Run(b_scale_grid_desc,
+                                b_scale_grid_buf,
+                                b_scale_thread_desc,
+                                make_tuple(I0, I0),
+                                b_scale_thread_buf);
+
+        b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
+
+        b_scale_thread_copy_up.Run(b_scale_grid_desc,
+                                b_scale_grid_buf_up,
+                                b_scale_thread_desc,
+                                make_tuple(I0, I0),
+                                b_scale_thread_buf_up);
+
+        b_scale_thread_copy_up.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        constexpr auto num_scale_k_block = CScaleThreadDesc{}.GetLength(Number<0>{});
+        constexpr auto num_scale_m_block = CScaleThreadDesc{}.GetLength(Number<1>{});
+        constexpr auto num_scale_n_block = CScaleThreadDesc{}.GetLength(Number<2>{});
+
+        static_for<0, num_scale_m_block, 1>{}([&](auto m0) {
+            static_for<0, num_scale_n_block, 1>{}([&](auto n0) {
+                static_for<0, num_scale_k_block, 1>{}([&](auto k0) {
+                    constexpr index_t c_offset =
+                        CScaleThreadDesc{}.CalculateOffset(make_tuple(k0, m0, n0));
+                    constexpr index_t a_offset =
+                        AScaleThreadDesc{}.CalculateOffset(make_tuple(m0, k0));
+                    constexpr index_t b_offset =
+                        BScaleThreadDesc{}.CalculateOffset(make_tuple(n0, k0));
+
+                    c_scale_thread_buf(Number<c_offset>{}) =
+                        a_scale_thread_buf[Number<a_offset>{}] *
+                        b_scale_thread_buf[Number<b_offset>{}];
+                    c_scale_thread_buf_up(Number<c_offset>{}) =
+                        a_scale_thread_buf[Number<a_offset>{}] *
+                        b_scale_thread_buf_up[Number<b_offset>{}];
+                });
+            });
+        });
+
+        // Local prefill A1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
+
+        // Global prefetch A2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+        a_scale_thread_copy.Run(a_scale_grid_desc,
+                                a_scale_grid_buf,
+                                a_scale_thread_desc,
+                                make_tuple(I0, I0),
+                                a_scale_thread_buf);
+
+        if constexpr(NumKBlockPerScale == 1)
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<1>{}));
+        }
+        else
+        {
+            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                   a_scale_thread_copy_step.At(Number<0>{}));
+        }
+
+        b_scale_thread_copy.Run(b_scale_grid_desc,
+                                b_scale_grid_buf,
+                                b_scale_thread_desc,
+                                make_tuple(I0, I0),
+                                b_scale_thread_buf);
+
+        b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
+
+        b_scale_thread_copy_up.Run(b_scale_grid_desc,
+                                b_scale_grid_buf_up,
+                                b_scale_thread_desc,
+                                make_tuple(I0, I0),
+                                b_scale_thread_buf_up);
+
+        b_scale_thread_copy_up.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
+
+        StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
+                                  AccDataType,
+                                  1,
+                                  xdlops_gemm.GetRegSizePerXdlops(),
+                                  true>
+            c_thread_buf_per_scale;
+        StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
+                                  AccDataType,
+                                  1,
+                                  xdlops_gemm.GetRegSizePerXdlops(),
+                                  true>
+            c_thread_buf_per_scale_up;
+
+        // Local prefetch A1
+        block_sync_lds();
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                   make_tuple(m0, I0, I0, k0, I0, I0),
+                                   a_block_buf,
+                                   a_thread_desc_,
+                                   make_tuple(m0, I0, I0, k0, I0, I0),
+                                   a_thread_buf);
+            });
+        });
+
+        // Initialize C
+        c_thread_buf.Clear();
+        c_thread_buf_up.Clear();
+
+        // __builtin_amdgcn_sched_barrier(0);
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) {
+                    b_blockwise_copy.Run(b_grid_desc,
+                                         b_grid_buf,
+                                         b_block_desc_n0_n1_k0_k1,
+                                         b_block_origin_idx,
+                                         b_thread_bufs(local_read_buf));
+                    b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                    b_blockwise_copy_up.Run(b_grid_desc,
+                                            b_grid_buf_up,
+                                            b_block_desc_n0_n1_k0_k1,
+                                            b_block_origin_idx,
+                                            b_thread_bufs_up(local_read_buf));
+                    b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+                    block_sync_lds();
+                    a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, mfma_reg_buf);
+
+                    a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, local_read_buf);
+                    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            static_for<0, num_scale_k_block, 1>{}([&](auto kscale0) {
+                                static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                                    c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                        .template AsType<AccDataType>()(Number<t>{}) = 0;
+                                    c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                                        .template AsType<AccDataType>()(Number<t>{}) = 0;
+                                });
+                                vector_type<AccDataType, 2> c_scale_thread_vec;
+                                vector_type<AccDataType, 2> c_scale_thread_vec_up;
+                                constexpr index_t cscale_offset =
+                                    CScaleThreadDesc{}.CalculateOffset(
+                                        make_tuple(kscale0, m0, n0 * num_scale_n_block / NRepeat));
+
+                                c_scale_thread_vec.template AsType<AccDataType>()(Number<0>{}) =
+                                    c_scale_thread_buf[Number<cscale_offset>{}];
+                                c_scale_thread_vec.template AsType<AccDataType>()(Number<1>{}) =
+                                    c_scale_thread_buf[Number<cscale_offset>{}];
+                                c_scale_thread_vec_up.template AsType<AccDataType>()(Number<0>{}) =
+                                    c_scale_thread_buf_up[Number<cscale_offset>{}];
+                                c_scale_thread_vec_up.template AsType<AccDataType>()(Number<1>{}) =
+                                    c_scale_thread_buf_up[Number<cscale_offset>{}];
+
+                                static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) {
+                                    vector_type<ComputeDataType, KPack> a_thread_vec;
+                                    vector_type<ComputeDataType, KPack> b_thread_vec;
+                                    vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                                    static_for<0, KPack, 1>{}([&](auto ik) {
+                                        a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                            a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                                make_tuple(m0,
+                                                           I0,
+                                                           I0,
+                                                           kscale0 * KRepeat / num_scale_k_block +
+                                                               k0,
+                                                           I0,
+                                                           ik))>{}];
+                                        b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                            b_thread_bufs[mfma_reg_buf][Number<
+                                                b_thread_desc_.CalculateOffset(make_tuple(
+                                                    n0,
+                                                    I0,
+                                                    kscale0 * KRepeat / num_scale_k_block + k0,
+                                                    ik))>{}];
+                                        b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                            b_thread_bufs_up[mfma_reg_buf][Number<
+                                                b_thread_desc_.CalculateOffset(make_tuple(
+                                                    n0,
+                                                    I0,
+                                                    kscale0 * KRepeat / num_scale_k_block + k0,
+                                                    ik))>{}];
+                                    });
+
+                                    using mfma_input_type =
+                                        typename vector_type<ComputeDataType,
+                                                             xdlops_gemm.K1PerXdlops>::type;
+
+                                    xdlops_gemm.template Run<>(
+                                        a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
+                                    xdlops_gemm.template Run<>(
+                                        a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec_up.template AsType<mfma_input_type>(),
+                                        c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{}));
+                                });
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                                static_for<0, xdlops_gemm.GetRegSizePerXdlops() / 2, 1>{}(
+                                    [&](auto t) {
+                                        using pk_fma_type =
+                                            typename vector_type<AccDataType, 2>::type;
+
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                                            .template AsType<pk_fma_type>()(t) =
+                                            __builtin_elementwise_fma(
+                                                c_thread_buf_per_scale
+                                                    .GetVectorTypeReference(Number<0>{})
+                                                    .template AsType<pk_fma_type>()[t],
+                                                c_scale_thread_vec
+                                                    .template AsType<pk_fma_type>()[Number<0>{}],
+                                                c_thread_buf
+                                                    .GetVectorTypeReference(Number<c_offset>{})
+                                                    .template AsType<pk_fma_type>()[t]);
+                                        c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                                            .template AsType<pk_fma_type>()(t) =
+                                            __builtin_elementwise_fma(
+                                                c_thread_buf_per_scale_up
+                                                    .GetVectorTypeReference(Number<0>{})
+                                                    .template AsType<pk_fma_type>()[t],
+                                                c_scale_thread_vec_up
+                                                    .template AsType<pk_fma_type>()[Number<0>{}],
+                                                c_thread_buf_up
+                                                    .GetVectorTypeReference(Number<c_offset>{})
+                                                    .template AsType<pk_fma_type>()[t]);
+                                    });
+                            });
+                        });
+                    });
+
+                    block_sync_lds();
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                               make_tuple(m0, I0, I0, k0, I0, I0),
+                                               a_block_buf,
+                                               a_thread_desc_,
+                                               make_tuple(m0, I0, I0, k0, I0, I0),
+                                               a_thread_buf);
+                        });
+                    });
+
+                    HotLoopScheduler();
+                    __builtin_amdgcn_sched_barrier(0);
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, num_scale_n_block, 1>{}([&](auto n0) {
+                            static_for<0, num_scale_k_block, 1>{}([&](auto k0) {
+                                constexpr index_t c_offset =
+                                    CScaleThreadDesc{}.CalculateOffset(make_tuple(k0, m0, n0));
+                                constexpr index_t a_offset =
+                                    AScaleThreadDesc{}.CalculateOffset(make_tuple(m0, k0));
+                                constexpr index_t b_offset =
+                                    BScaleThreadDesc{}.CalculateOffset(make_tuple(n0, k0));
+
+                                c_scale_thread_buf(Number<c_offset>{}) =
+                                    a_scale_thread_buf[Number<a_offset>{}] *
+                                    b_scale_thread_buf[Number<b_offset>{}];
+                                c_scale_thread_buf_up(Number<c_offset>{}) =
+                                    a_scale_thread_buf[Number<a_offset>{}] *
+                                    b_scale_thread_buf_up[Number<b_offset>{}];
+                            });
+                        });
+                    });
+
+                    a_scale_thread_copy.Run(a_scale_grid_desc,
+                                            a_scale_grid_buf,
+                                            a_scale_thread_desc,
+                                            make_tuple(I0, I0),
+                                            a_scale_thread_buf);
+
+                    if constexpr(NumKBlockPerScale == 1)
+                    {
+                        a_scale_thread_copy.MoveSrcSliceWindow(
+                            a_scale_grid_desc, a_scale_thread_copy_step.At(Number<1>{}));
+                    }
+                    else
+                    {
+                        a_scale_thread_copy.MoveSrcSliceWindow(
+                            a_scale_grid_desc, a_scale_thread_copy_step.At(Number<0>{}));
+                    }
+
+                    b_scale_thread_copy.Run(b_scale_grid_desc,
+                                            b_scale_grid_buf,
+                                            b_scale_thread_desc,
+                                            make_tuple(I0, I0),
+                                            b_scale_thread_buf);
+
+                    b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                           b_scale_thread_copy_step);
+                    b_scale_thread_copy_up.Run(b_scale_grid_desc,
+                                            b_scale_grid_buf_up,
+                                            b_scale_thread_desc,
+                                            make_tuple(I0, I0),
+                                            b_scale_thread_buf_up);
+
+                    b_scale_thread_copy_up.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                           b_scale_thread_copy_step);
+                };
+
+                LoopFunc(I0, I1);
+                LoopFunc(I1, I0);
+
+                i += 2;
+            } while(i < (num_loop - 2));
+        }
+
+        // tail
+        if constexpr(TailNum == TailNumber::Even)
+        {
+            b_blockwise_copy.Run(b_grid_desc,
+                                 b_grid_buf,
+                                 b_block_desc_n0_n1_k0_k1,
+                                 b_block_origin_idx,
+                                 b_thread_bufs(I1));
+
+            b_blockwise_copy_up.Run(b_grid_desc,
+                                    b_grid_buf_up,
+                                    b_block_desc_n0_n1_k0_k1,
+                                    b_block_origin_idx,
+                                    b_thread_bufs_up(I1));
+            block_sync_lds();
+            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, num_scale_k_block, 1>{}([&](auto kscale0) {
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                .template AsType<AccDataType>()(Number<t>{}) = 0;
+                            c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                                .template AsType<AccDataType>()(Number<t>{}) = 0;
+                        });
+                        vector_type<AccDataType, 2> c_scale_thread_vec;
+                        vector_type<AccDataType, 2> c_scale_thread_vec_up;
+                        constexpr index_t cscale_offset = CScaleThreadDesc{}.CalculateOffset(
+                            make_tuple(kscale0, m0, n0 * num_scale_n_block / NRepeat));
+
+                        c_scale_thread_vec.template AsType<AccDataType>()(Number<0>{}) =
+                            c_scale_thread_buf[Number<cscale_offset>{}];
+                        c_scale_thread_vec.template AsType<AccDataType>()(Number<1>{}) =
+                            c_scale_thread_buf[Number<cscale_offset>{}];
+                        c_scale_thread_vec_up.template AsType<AccDataType>()(Number<0>{}) =
+                            c_scale_thread_buf_up[Number<cscale_offset>{}];
+                        c_scale_thread_vec_up.template AsType<AccDataType>()(Number<1>{}) =
+                            c_scale_thread_buf_up[Number<cscale_offset>{}];
+
+                        static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) {
+                            vector_type<ComputeDataType, KPack> a_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                            static_for<0, KPack, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(m0,
+                                                   I0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   I0,
+                                                   ik))>{}];
+                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   ik))>{}];
+                                b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_bufs_up[I0][Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   ik))>{}];
+                            });
+
+                            using mfma_input_type =
+                                typename vector_type<ComputeDataType,
+                                                     xdlops_gemm.K1PerXdlops>::type;
+
+                            xdlops_gemm.template Run<>(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec.template AsType<mfma_input_type>(),
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
+                            xdlops_gemm.template Run<>(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec_up.template AsType<mfma_input_type>(),
+                                c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{}));
+                        });
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops() / 2, 1>{}([&](auto t) {
+                            using pk_fma_type = typename vector_type<AccDataType, 2>::type;
+
+                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                                .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                    .template AsType<pk_fma_type>()[t],
+                                c_scale_thread_vec.template AsType<pk_fma_type>()[Number<0>{}],
+                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                                    .template AsType<pk_fma_type>()[t]);
+                            c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                                .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
+                                c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                                    .template AsType<pk_fma_type>()[t],
+                                c_scale_thread_vec_up.template AsType<pk_fma_type>()[Number<0>{}],
+                                c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                                    .template AsType<pk_fma_type>()[t]);
+                        });
+                    });
+                });
+            });
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, num_scale_n_block, 1>{}([&](auto n0) {
+                    static_for<0, num_scale_k_block, 1>{}([&](auto k0) {
+                        constexpr index_t c_offset =
+                            CScaleThreadDesc{}.CalculateOffset(make_tuple(k0, m0, n0));
+                        constexpr index_t a_offset =
+                            AScaleThreadDesc{}.CalculateOffset(make_tuple(m0, k0));
+                        constexpr index_t b_offset =
+                            BScaleThreadDesc{}.CalculateOffset(make_tuple(n0, k0));
+
+                        c_scale_thread_buf(Number<c_offset>{}) =
+                            a_scale_thread_buf[Number<a_offset>{}] *
+                            b_scale_thread_buf[Number<b_offset>{}];
+                        c_scale_thread_buf_up(Number<c_offset>{}) =
+                            a_scale_thread_buf[Number<a_offset>{}] *
+                            b_scale_thread_buf_up[Number<b_offset>{}];
+                    });
+                });
+            });
+
+            block_sync_lds();
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_thread_buf);
+                });
+            });
+
+            // __builtin_amdgcn_sched_barrier(0);
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, num_scale_k_block, 1>{}([&](auto kscale0) {
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                .template AsType<AccDataType>()(Number<t>{}) = 0;
+                            c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                                .template AsType<AccDataType>()(Number<t>{}) = 0;
+                        });
+                        vector_type<AccDataType, 2> c_scale_thread_vec;
+                        vector_type<AccDataType, 2> c_scale_thread_vec_up;
+                        constexpr index_t cscale_offset = CScaleThreadDesc{}.CalculateOffset(
+                            make_tuple(kscale0, m0, n0 * num_scale_n_block / NRepeat));
+
+                        c_scale_thread_vec.template AsType<AccDataType>()(Number<0>{}) =
+                            c_scale_thread_buf[Number<cscale_offset>{}];
+                        c_scale_thread_vec.template AsType<AccDataType>()(Number<1>{}) =
+                            c_scale_thread_buf[Number<cscale_offset>{}];
+                        c_scale_thread_vec_up.template AsType<AccDataType>()(Number<0>{}) =
+                            c_scale_thread_buf_up[Number<cscale_offset>{}];
+                        c_scale_thread_vec_up.template AsType<AccDataType>()(Number<1>{}) =
+                            c_scale_thread_buf_up[Number<cscale_offset>{}];
+
+                        static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) {
+                            vector_type<ComputeDataType, KPack> a_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                            static_for<0, KPack, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(m0,
+                                                   I0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   I0,
+                                                   ik))>{}];
+                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_bufs[I1][Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   ik))>{}];
+                                b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_bufs_up[I1][Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   ik))>{}];
+                            });
+
+                            using mfma_input_type =
+                                typename vector_type<ComputeDataType,
+                                                     xdlops_gemm.K1PerXdlops>::type;
+
+                            xdlops_gemm.template Run<>(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec.template AsType<mfma_input_type>(),
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
+                            xdlops_gemm.template Run<>(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec_up.template AsType<mfma_input_type>(),
+                                c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{}));
+                        });
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops() / 2, 1>{}([&](auto t) {
+                            using pk_fma_type = typename vector_type<AccDataType, 2>::type;
+
+                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                                .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                    .template AsType<pk_fma_type>()[t],
+                                c_scale_thread_vec.template AsType<pk_fma_type>()[Number<0>{}],
+                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                                    .template AsType<pk_fma_type>()[t]);
+                            c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                                .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
+                                c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                                    .template AsType<pk_fma_type>()[t],
+                                c_scale_thread_vec_up.template AsType<pk_fma_type>()[Number<0>{}],
+                                c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                                    .template AsType<pk_fma_type>()[t]);
+                        });
+                    });
+                });
+            });
+        }
+        else if constexpr(TailNum == TailNumber::Odd)
+        {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, num_scale_k_block, 1>{}([&](auto kscale0) {
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) {
+                            c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                .template AsType<AccDataType>()(Number<t>{}) = 0;
+                            c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                                .template AsType<AccDataType>()(Number<t>{}) = 0;
+                        });
+                        vector_type<AccDataType, 2> c_scale_thread_vec;
+                        vector_type<AccDataType, 2> c_scale_thread_vec_up;
+                        constexpr index_t cscale_offset = CScaleThreadDesc{}.CalculateOffset(
+                            make_tuple(kscale0, m0, n0 * num_scale_n_block / NRepeat));
+
+                        c_scale_thread_vec.template AsType<AccDataType>()(Number<0>{}) =
+                            c_scale_thread_buf[Number<cscale_offset>{}];
+                        c_scale_thread_vec.template AsType<AccDataType>()(Number<1>{}) =
+                            c_scale_thread_buf[Number<cscale_offset>{}];
+                        c_scale_thread_vec_up.template AsType<AccDataType>()(Number<0>{}) =
+                            c_scale_thread_buf_up[Number<cscale_offset>{}];
+                        c_scale_thread_vec_up.template AsType<AccDataType>()(Number<1>{}) =
+                            c_scale_thread_buf_up[Number<cscale_offset>{}];
+
+                        static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) {
+                            vector_type<ComputeDataType, KPack> a_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec_up;
+
+                            static_for<0, KPack, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(m0,
+                                                   I0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   I0,
+                                                   ik))>{}];
+                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   ik))>{}];
+                                b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_bufs_up[I0][Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0,
+                                                   I0,
+                                                   kscale0 * KRepeat / num_scale_k_block + k0,
+                                                   ik))>{}];
+                            });
+
+                            using mfma_input_type =
+                                typename vector_type<ComputeDataType,
+                                                     xdlops_gemm.K1PerXdlops>::type;
+
+                            xdlops_gemm.template Run<>(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec.template AsType<mfma_input_type>(),
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}));
+                            xdlops_gemm.template Run<>(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec_up.template AsType<mfma_input_type>(),
+                                c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{}));
+                        });
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        static_for<0, xdlops_gemm.GetRegSizePerXdlops() / 2, 1>{}([&](auto t) {
+                            using pk_fma_type = typename vector_type<AccDataType, 2>::type;
+
+                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                                .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
+                                c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})
+                                    .template AsType<pk_fma_type>()[t],
+                                c_scale_thread_vec.template AsType<pk_fma_type>()[Number<0>{}],
+                                c_thread_buf.GetVectorTypeReference(Number<c_offset>{})
+                                    .template AsType<pk_fma_type>()[t]);
+                            c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                                .template AsType<pk_fma_type>()(t) = __builtin_elementwise_fma(
+                                c_thread_buf_per_scale_up.GetVectorTypeReference(Number<0>{})
+                                    .template AsType<pk_fma_type>()[t],
+                                c_scale_thread_vec_up.template AsType<pk_fma_type>()[Number<0>{}],
+                                c_thread_buf_up.GetVectorTypeReference(Number<c_offset>{})
+                                    .template AsType<pk_fma_type>()[t]);
+                        });
+                    });
+                });
+            });
+        }
+    }
+
+    protected:
+    // MRepeat MWave MLane KRepeat KLane KPack
+    // KRepeat -> MRepeat-> Mwave->KLane->MLane->KPack
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{}, I1, I1, Number<KRepeat>{}, I1, Number<KPack>{}));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
+                                                         ComputeDataType,
+                                                         decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<1, 1, 1, 1, 1, KPack>,
+                                                         Sequence<0, 1, 2, 3, 4, 5>,
+                                                         5,
+                                                         A_K1,
+                                                         A_K1>;
+
+    AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex6D()};
+
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}));
+
+    static constexpr BTileDesc b_block_desc_n0_n1_k0_k1;
+
+    using Base::c_thread_desc_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp
index 6bff0373f4..a04563f458 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp
@@ -3,7 +3,7 @@
 
 #pragma once
 
-// #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_v1.hpp"
 // #include
 // "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v2.hpp"
@@ -40,7 +40,6 @@ constexpr auto BlockGemmBlockMoeScaleBPreshufflePipeline_Selector()
 {
     if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
     {
-        #if 0
         if constexpr(GUFusion)
         {
             return BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1<
@@ -69,7 +68,6 @@ constexpr auto BlockGemmBlockMoeScaleBPreshufflePipeline_Selector()
             KPack>{};
         }
         else
-        #endif
         {
             return BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v1<
             BlkGemmPipeSche,

From b2c0c3cfeb804df4051f46cac75081f66d5a25da Mon Sep 17 00:00:00 2001
From: OscarXu <huaiguxu@amd.com>
Date: Wed, 14 May 2025 10:07:11 +0800
Subject: [PATCH 17/18] gemm1 v3 64x128x128 debug

---
 .../moe_gemm1_xdl_fp8_blockscale.cpp          |  8 +++---
 ...oe_blockscale_b_preshuffle_gufusion_v1.hpp | 25 +++++++++++++------
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
index 3e93037496..4f77d6b9b5 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
@@ -129,7 +129,7 @@ static constexpr ck::index_t Nswizzle    = false;
 static constexpr ck::index_t ActOP       = 0; // 0: gelu_and_mul, 1: silu_and_mul
 static constexpr bool MulRoutedWeight    = false;
 
-#if 1
+#if 0
 static constexpr ck::index_t MPerBlock = 32;
 static constexpr ck::index_t NPerBlock   = 128;
 static constexpr ck::index_t MNPerXDL    = 16;
@@ -169,7 +169,7 @@ using DeviceOpInstance                   = ck::tensor_operation::device::DeviceM
                 CShuffleMXDLPerWave,    CShuffleNXDLPerWave,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec, 1>,
                ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, ActOP, Nswizzle, true, MulRoutedWeight, int32_t, A0DataType>;
 #else
-static constexpr ck::index_t MPerBlock = 128; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmBlockScale<
+static constexpr ck::index_t MPerBlock = 64; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmBlockScale<
                Row, Col, DsLayout, ELayout,
                A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
                AElementOp,  BElementOp, CDEElementOp,   GemmSpec,   
@@ -177,10 +177,10 @@ static constexpr ck::index_t MPerBlock = 128; using DeviceOpInstance = ck::tenso
                MPerBlock,   128,    128,
                16,   16,
                16,   16,
-               4,    4,
+               4,    2,
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
                S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
-               2,    2,   S<1, 32, 1, 8>, S<2, 1, 1, 1>,
+               4,    2,   S<1, 32, 1, 8>, S<2, 1, 1, 1>,
                ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, ActOP, Nswizzle, true, MulRoutedWeight, int32_t, A0DataType>;
 #endif
 // clang-format on
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp
index 2e8067b85d..7d7498c844 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp
@@ -195,12 +195,20 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1<
     {
         constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
         constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
-        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num * MWaves * 2;
-
+        constexpr auto num_buffer_load_inst_b =
+            HotLoopInstList::B_Buffer_Load_Inst_Num * MWaves * 2;
+        constexpr auto mfma_interleave = MPerXDL == 32 ? 1 : 2;
         // B global
         static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
             ignore = i;
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            if constexpr(MPerBlock >= 128 && NPerBlock >= 64)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x008, 2 * mfma_interleave, 0);
+            }
+            else
+            {
+                __builtin_amdgcn_sched_group_barrier(0x008, mfma_interleave, 0);
+            }
             __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
         });
 
@@ -214,11 +222,12 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1<
         });
 
         // A local
-        static_for<0, num_ds_read_inst_a / 2, 1>{}([&](auto i) {
-            ignore = i;
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
-        });
+        static_for<0, MPerXDL == 32 ? num_ds_read_inst_a / 2 : num_ds_read_inst_a, 1>{}(
+            [&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);                     // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, MPerXDL == 32 ? 2 : 1, 0); // DS read
+            });
     }
 
     template <bool HasMainLoop,

From c84ad839051608cbd8ad1fd70c4299033c9be8b3 Mon Sep 17 00:00:00 2001
From: OscarXu <huaiguxu@amd.com>
Date: Wed, 14 May 2025 08:05:17 -0500
Subject: [PATCH 18/18] v1 fix for mi350.

---
 .../65_gemm_multiply_multiply/CMakeLists.txt  |  2 +-
 ...oe_blockscale_b_preshuffle_gufusion_v1.hpp | 52 +++++++++++--------
 ...oe_blockscale_b_preshuffle_gufusion_v3.hpp |  4 +-
 3 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt
index 87f540c5d3..4f6b0478d9 100644
--- a/example/65_gemm_multiply_multiply/CMakeLists.txt
+++ b/example/65_gemm_multiply_multiply/CMakeLists.txt
@@ -42,7 +42,7 @@ set(GEMM_OPTIONS)
 list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32")
 list(APPEND GEMM_OPTIONS -v --save-temps -Wno-gnu-line-marker)
 set(BLOCKSCALE_GEMM_OPTIONS)
-list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --disable-schedmodel-in-sched-mi=1 -mllvm --misched=gcn-iterative-max-occupancy-experimental -mllvm --misched-bottomup=1")
+list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --disable-schedmodel-in-sched-mi=1 -mllvm --amdgpu-sched-strategy=gcn-iterative-max-occupancy-experimental -mllvm --misched-bottomup=1")
 # list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32")
 list(APPEND BLOCKSCALE_GEMM_OPTIONS -v --save-temps -Wno-gnu-line-marker)
 target_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS})
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp
index 7d7498c844..7f5dd86f62 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v1.hpp
@@ -133,6 +133,7 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1<
     using Base::B_K1;
     using Base::I0;
     using Base::I1;
+    using Base::KGroup;
     using Base::KRepeat;
     using Base::xdlops_gemm;
     using typename Base::HotLoopInstList;
@@ -163,9 +164,9 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1<
         constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
         constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
         constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
-        constexpr index_t K2 = KPack;
+        constexpr index_t K2 = KPack / KGroup;
         constexpr index_t K1 = 64 / NPerXDL;
-        constexpr index_t K0 = KRepeat;
+        constexpr index_t K0 = KRepeat * KGroup;
 
         return transform_tensor_descriptor(
             TileDesc_M0_M1_M2_K{},
@@ -368,12 +369,11 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1<
 
         b_scale_thread_copy_up.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step);
 
-        __builtin_amdgcn_sched_barrier(0);
+        // __builtin_amdgcn_sched_barrier(0);
 
         constexpr auto num_scale_k_block = CScaleThreadDesc{}.GetLength(Number<0>{});
         constexpr auto num_scale_m_block = CScaleThreadDesc{}.GetLength(Number<1>{});
         constexpr auto num_scale_n_block = CScaleThreadDesc{}.GetLength(Number<2>{});
-
         static_for<0, num_scale_m_block, 1>{}([&](auto m0) {
             static_for<0, num_scale_n_block, 1>{}([&](auto n0) {
                 static_for<0, num_scale_k_block, 1>{}([&](auto k0) {
@@ -451,12 +451,14 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1<
         block_sync_lds();
         static_for<0, MRepeat, 1>{}([&](auto m0) {
             static_for<0, KRepeat, 1>{}([&](auto k0) {
-                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                   make_tuple(m0, I0, I0, k0, I0, I0),
-                                   a_block_buf,
-                                   a_thread_desc_,
-                                   make_tuple(m0, I0, I0, k0, I0, I0),
-                                   a_thread_buf);
+                static_for<0, KGroup, 1>{}([&](auto kg0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                    make_tuple(m0, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                    a_block_buf,
+                                    a_thread_desc_,
+                                    make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                    a_thread_buf);
+                });
             });
         });
 
@@ -600,12 +602,14 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1<
 
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         static_for<0, KRepeat, 1>{}([&](auto k0) {
-                            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                               make_tuple(m0, I0, I0, k0, I0, I0),
-                                               a_block_buf,
-                                               a_thread_desc_,
-                                               make_tuple(m0, I0, I0, k0, I0, I0),
-                                               a_thread_buf);
+                            static_for<0, KGroup, 1>{}([&](auto kg0) {
+                                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                                make_tuple(m0, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                                a_block_buf,
+                                                a_thread_desc_,
+                                                make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                                a_thread_buf);
+                            });
                         });
                     });
 
@@ -804,12 +808,14 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1<
 
             static_for<0, MRepeat, 1>{}([&](auto m0) {
                 static_for<0, KRepeat, 1>{}([&](auto k0) {
-                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                       make_tuple(m0, I0, I0, k0, I0, I0),
-                                       a_block_buf,
-                                       a_thread_desc_,
-                                       make_tuple(m0, I0, I0, k0, I0, I0),
-                                       a_thread_buf);
+                    static_for<0, KGroup, 1>{}([&](auto kg0) {
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                        make_tuple(m0, I0, I0, Number<k0 * KGroup + kg0>{}, I0, I0),
+                                        a_block_buf,
+                                        a_thread_desc_,
+                                        make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
+                                        a_thread_buf);
+                    });
                 });
             });
 
@@ -1007,7 +1013,7 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1<
                                                          ComputeDataType,
                                                          decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
                                                          decltype(a_thread_desc_),
-                                                         Sequence<1, 1, 1, 1, 1, KPack>,
+                                                         Sequence<1, 1, 1, 1, 1, KPack / KGroup>,
                                                          Sequence<0, 1, 2, 3, 4, 5>,
                                                          5,
                                                          A_K1,
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v3.hpp
index d1aa51bbc5..8bb468d2e3 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_gufusion_v3.hpp
@@ -254,9 +254,9 @@ struct BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v3<
 
         constexpr auto buffer_load_issue_point_b = 0;
         constexpr auto buffer_load_issue_point_interval_more =
-            num_mfma_perstage / buffer_load_perstage_more;
+            num_mfma_perstage / buffer_load_perstage_more ? num_mfma_perstage / buffer_load_perstage_more : 1;
         constexpr auto buffer_load_issue_point_interval_less =
-            num_mfma_perstage / buffer_load_perstage_less;
+            num_mfma_perstage / buffer_load_perstage_less ? num_mfma_perstage / buffer_load_perstage_less : 1;
         constexpr auto ds_write_issue_point      = 0;
         constexpr auto buffer_load_issue_point_a = num_mfma_perstage >= 3 ? 1 : 0;