This commit is contained in:
feifei14119
2025-05-23 11:23:01 +00:00
parent 2216ff0521
commit 2e39bf06f7
4 changed files with 34 additions and 17 deletions

View File

@@ -434,11 +434,16 @@ int main(int argc, char* argv[])
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop =
std::size_t(2) * tokens * N * 2 * K + std::size_t(2) * tokens * N * K / ScaleBlockSize;
std::size_t num_btype =
sizeof(A0DataType) / 2 * tokens * K + sizeof(B0DataType) / 2 * K * N * 2 * experts +
sizeof(XDataType) * tokens * K + sizeof(XDataType) * K * N * 2 * experts +
sizeof(EDataType) * tokens * topk * N;
// FMA * tokens * N * (Gate+Up) * topk * K +
// FMA * tokens * N * (Gate+Up) * topk * (K/BlockScale)
std::size_t(2) * tokens * N * 2 * topk * K +
std::size_t(2) * tokens * N * 2 * topk * K / ScaleBlockSize;
std::size_t num_btype = sizeof(A0DataType) / 2 * tokens * topk * K +
sizeof(B0DataType) / 2 * K * N * 2 * experts +
sizeof(XDataType) * tokens * topk * K / ScaleBlockSize +
sizeof(XDataType) * K / ScaleBlockSize * N * 2 * experts +
sizeof(EDataType) * tokens * topk * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

View File

@@ -561,12 +561,16 @@ int main(int argc, char* argv[])
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop =
std::size_t(2) * tokens * N * 2 * K + std::size_t(2) * tokens * N * K / ScaleBlockSize;
// FMA * tokens * N * (Gate+Up) * topk * K +
// FMA * tokens * N * (Gate+Up) * topk * (K/BlockScale)
std::size_t(2) * tokens * N * 2 * topk * K +
std::size_t(2) * tokens * N * 2 * topk * K / ScaleBlockSize;
std::size_t num_btype =
sizeof(A0DataType) / 2 * tokens * K + sizeof(B0DataType) / 2 * K * N * 2 * experts +
sizeof(XDataType) / 2 * tokens * K + sizeof(XDataType) / 2 * K * N * 2 * experts +
sizeof(EDataType) * tokens * N;
std::size_t num_btype = sizeof(A0DataType) / 2 * tokens * topk * K +
sizeof(B0DataType) / 2 * K * N * 2 * experts +
sizeof(XDataType) * tokens * topk * K / ScaleBlockSize +
sizeof(XDataType) * K / ScaleBlockSize * N * 2 * experts +
sizeof(EDataType) * tokens * topk * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -635,6 +639,7 @@ int main(int argc, char* argv[])
e_device_buf.FromDevice(e_t_k_n_device_result.mData.data());
#if 0
e_t_k_n_device_result.savetxt("e_t_k_n_device_result.txt", "float");
printf("e_t_k_n_device_result:\n");
for(int t = 0; t < tokens; ++t)
{

View File

@@ -621,11 +621,15 @@ int main(int argc, char* argv[])
// not result correct here because output buf not setzero
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * tokens * topk * N * 2 * K +
// FMA * tokens * N * topk * K +
// FMA * tokens * N * topk * (K/BlockScale)
std::size_t flop = std::size_t(2) * tokens * topk * N * K +
std::size_t(2) * tokens * topk * N * K / ScaleBlockSize;
std::size_t num_btype = sizeof(A0DataType) / 2 * tokens * K * topk +
sizeof(B0DataType) / 2 * K * N * experts +
sizeof(EDataType) * tokens * N;
std::size_t num_btype =
sizeof(A0DataType) / 2 * tokens * K * topk + sizeof(B0DataType) / 2 * K * N * experts +
sizeof(XDataType) * tokens * topk * K / ScaleBlockSize +
sizeof(XDataType) * K / ScaleBlockSize * N * experts + sizeof(EDataType) * tokens * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

View File

@@ -558,12 +558,15 @@ int main(int argc, char* argv[])
// not result correct here because output buf not setzero
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
// FMA * tokens * N * topk * K +
// FMA * tokens * N * topk * (K/BlockScale)
std::size_t flop = std::size_t(2) * tokens * topk * N * K +
std::size_t(2) * tokens * topk * N * K / ScaleBlockSize;
std::size_t num_btype = sizeof(A0DataType) / 2 * tokens * K * topk +
sizeof(B0DataType) / 2 * K * N * experts +
sizeof(EDataType) * tokens * N;
std::size_t num_btype =
sizeof(A0DataType) / 2 * tokens * K * topk + sizeof(B0DataType) / 2 * K * N * experts +
sizeof(XDataType) * tokens * topk * K / ScaleBlockSize +
sizeof(XDataType) * K / ScaleBlockSize * N * experts + sizeof(EDataType) * tokens * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;