From 69bbe0480b0692b285de978b25bec570f008e96e Mon Sep 17 00:00:00 2001 From: xudoyuan Date: Fri, 24 Oct 2025 16:13:55 +0000 Subject: [PATCH] config block_m = 32 --- .../moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp index 1adf039b70..8a3d3c30f7 100644 --- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp +++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp @@ -178,10 +178,10 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio constexpr ck::index_t DataPackedSize = 2; // Packed representation of data constexpr ck::index_t ScaleBlockSize = 32; // scaling block size -constexpr ck::index_t KPerBlock = 256 / DataPackedSize; // 256 f4 = 128 fp4x2 +constexpr ck::index_t KPerBlock = 128 / DataPackedSize; // 128 f4 = 64 fp4x2 static constexpr ck::index_t Nswizzle = false; static constexpr ck::index_t ActOP = 0; // 0: gelu_and_mul, 1: silu_and_mul -static constexpr ck::index_t MPerBlock = 128; +static constexpr ck::index_t MPerBlock = 32; static constexpr bool MulRoutedWeight = true; // clang-format off @@ -190,10 +190,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmMXBPreShuffl A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, ScaleBlockSize, 256, - MPerBlock, 64, KPerBlock, + MPerBlock, 128, KPerBlock, 16, 16, 16, 16, - 4, 2, + 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 2, 2, S<1, 32, 1, 8>, S<8, 1, 1, 1>,