diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp index 8c8782a9d8..f531daa24f 100644 --- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp +++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp @@ -159,7 +159,7 @@ using DeviceOpInstance = ck::tensor_operation::device::Devic AElementOp, BElementOp, CDEElementOp, GemmSpec, ScaleBlockSize, BlockSize, MPerBlock, NPerBlock, KPerBlock, - 16, 16, + 16, 16, 16, 16, 2, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, @@ -237,6 +237,12 @@ int main(int argc, char* argv[]) Tensor max_token_id(HostTensorDescriptor({sorted_tile_num + 1})); max_token_id.mData[0] = valid_size; + if(tokens * topk > valid_size) + { + printf("err config, tokens * topk > valid_size\n"); + exit(-1); + } + for(int i = 0; i < sorted_tile_num; i++) { expert_ids.mData[i] = i / ck::math::integer_divide_ceil(valid_tile_num, experts); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_mx_moe_nbs_gufusion_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_mx_moe_nbs_gufusion_v3.hpp index bc4f40f208..2cd9ef2547 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_mx_moe_nbs_gufusion_v3.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_mx_moe_nbs_gufusion_v3.hpp @@ -201,18 +201,18 @@ struct BlockwiseGemmXdlops_pipeline_mx_moe_bns_gufusion_v3