diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp index ed98e49bd5..1b9af3d9f4 100644 --- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp +++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp @@ -434,11 +434,16 @@ int main(int argc, char* argv[]) float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = - std::size_t(2) * tokens * N * 2 * K + std::size_t(2) * tokens * N * K / ScaleBlockSize; - std::size_t num_btype = - sizeof(A0DataType) / 2 * tokens * K + sizeof(B0DataType) / 2 * K * N * 2 * experts + - sizeof(XDataType) * tokens * K + sizeof(XDataType) * K * N * 2 * experts + - sizeof(EDataType) * tokens * topk * N; + // FMA * tokens * N * (Gate+Up) * topk * K + + // FMA * tokens * N * (Gate+Up) * topk * (K/BlockScale) + std::size_t(2) * tokens * N * 2 * topk * K + + std::size_t(2) * tokens * N * 2 * topk * K / ScaleBlockSize; + + std::size_t num_btype = sizeof(A0DataType) / 2 * tokens * topk * K + + sizeof(B0DataType) / 2 * K * N * 2 * experts + + sizeof(XDataType) * tokens * topk * K / ScaleBlockSize + + sizeof(XDataType) * K / ScaleBlockSize * N * 2 * experts + + sizeof(EDataType) * tokens * topk * N; float tflops = static_cast(flop) / 1.E9 / ave_time; diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp index 08addd7862..8c8782a9d8 100644 --- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp +++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp @@ -561,12 +561,16 @@ int main(int argc, char* argv[]) float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = - std::size_t(2) * tokens * N * 2 * K + std::size_t(2) * tokens * N * K / ScaleBlockSize; + // FMA * tokens * N * (Gate+Up) * topk * K + + // FMA * tokens * N * (Gate+Up) * topk * (K/BlockScale) + std::size_t(2) * tokens * N * 2 * topk * K + + std::size_t(2) * tokens * N * 2 * topk * K / ScaleBlockSize; - std::size_t num_btype = - sizeof(A0DataType) / 2 * tokens * K + sizeof(B0DataType) / 2 * K * N * 2 * experts + - sizeof(XDataType) / 2 * tokens * K + sizeof(XDataType) / 2 * K * N * 2 * experts + - sizeof(EDataType) * tokens * N; + std::size_t num_btype = sizeof(A0DataType) / 2 * tokens * topk * K + + sizeof(B0DataType) / 2 * K * N * 2 * experts + + sizeof(XDataType) * tokens * topk * K / ScaleBlockSize + + sizeof(XDataType) * K / ScaleBlockSize * N * 2 * experts + + sizeof(EDataType) * tokens * topk * N; float tflops = static_cast(flop) / 1.E9 / ave_time; @@ -635,6 +639,7 @@ int main(int argc, char* argv[]) e_device_buf.FromDevice(e_t_k_n_device_result.mData.data()); #if 0 + e_t_k_n_device_result.savetxt("e_t_k_n_device_result.txt", "float"); printf("e_t_k_n_device_result:\n"); for(int t = 0; t < tokens; ++t) { diff --git a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp index 2a7b8d19da..f1b17f54be 100644 --- a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp +++ b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp @@ -621,11 +621,15 @@ int main(int argc, char* argv[]) // not result correct here because output buf not setzero float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); - std::size_t flop = std::size_t(2) * tokens * topk * N * 2 * K + + // FMA * tokens * N * topk * K + + // FMA * tokens * N * topk * (K/BlockScale) + std::size_t flop = std::size_t(2) * tokens * topk * N * K + std::size_t(2) * tokens * topk * N * K / ScaleBlockSize; - std::size_t num_btype = sizeof(A0DataType) / 2 * tokens * K * topk + - sizeof(B0DataType) / 2 * K * N * experts + - sizeof(EDataType) * tokens * N; + + std::size_t num_btype = + sizeof(A0DataType) / 2 * tokens * K * topk + sizeof(B0DataType) / 2 * K * N * experts + + sizeof(XDataType) * tokens * topk * K / ScaleBlockSize + + sizeof(XDataType) * K / ScaleBlockSize * N * experts + sizeof(EDataType) * tokens * N; float tflops = static_cast(flop) / 1.E9 / ave_time; diff --git a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp index 51d4c8785a..a47763abb1 100644 --- a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp +++ b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp @@ -558,12 +558,15 @@ int main(int argc, char* argv[]) // not result correct here because output buf not setzero float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + // FMA * tokens * N * topk * K + + // FMA * tokens * N * topk * (K/BlockScale) std::size_t flop = std::size_t(2) * tokens * topk * N * K + std::size_t(2) * tokens * topk * N * K / ScaleBlockSize; - std::size_t num_btype = sizeof(A0DataType) / 2 * tokens * K * topk + - sizeof(B0DataType) / 2 * K * N * experts + - sizeof(EDataType) * tokens * N; + std::size_t num_btype = + sizeof(A0DataType) / 2 * tokens * K * topk + sizeof(B0DataType) / 2 * K * N * experts + + sizeof(XDataType) * tokens * topk * K / ScaleBlockSize + + sizeof(XDataType) * K / ScaleBlockSize * N * experts + sizeof(EDataType) * tokens * N; float tflops = static_cast(flop) / 1.E9 / ave_time;