diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp
index f531daa24f..0881f74cc2 100644
--- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp
+++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp
@@ -147,8 +147,8 @@ constexpr ck::index_t ScaleBlockSize   = 32;                   // scaling block
 constexpr ck::index_t KPerBlock        = 256 / DataPackedSize; // 256 f4 = 128 fp4x2
 static constexpr ck::index_t Nswizzle  = false;
 static constexpr ck::index_t ActOP     = 0; // 0: gelu_and_mul, 1: silu_and_mul
-static constexpr ck::index_t MPerBlock = 64;
-static constexpr ck::index_t NPerBlock = 128;
+static constexpr ck::index_t MPerBlock = 128;
+static constexpr ck::index_t NPerBlock = 64;
 static constexpr ck::index_t BlockSize = 256;
 static constexpr bool MulRoutedWeight  = true;
 
@@ -161,7 +161,7 @@ using DeviceOpInstance                     = ck::tensor_operation::device::Devic
     MPerBlock,      NPerBlock,    KPerBlock,
     16,   16, 
     16,   16,
-    2,     4,
+    4,     2,
     S<8, 32, 1>, S<1, 0, 2>,     S<1, 0, 2>,    2, 16, 16, 0,
     S<8, 32, 1>, S<1, 0, 2>,     S<1, 0, 2>,    2, 16, 16, 0,
     2,    2,     S<1, 32, 1, 8>, S<8, 1, 1, 1>,