[fix](kt-kernel): fix typo in moe-tp's forward time-profiling (#1720)

* [fix](kt-kernel): fix typo in moe-tp's forward time-profiling

* [fix](kt-kernel): fix the experts count in profiling

---------

Co-authored-by: KMSorSMS <yzwliam@126.com>
This commit is contained in:
SCDESPERTATE
2025-12-17 12:06:33 +08:00
committed by GitHub
parent 661e19a8e5
commit 6fc4080a7d

View File

@@ -184,17 +184,25 @@ class TP_MOE_Common : public MoE_Interface {
#ifdef FORWARD_TIME_REPORT
auto end = std::chrono::high_resolution_clock::now();
auto forward_time = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
auto band_width = (1.0 * config.routed_expert_num * config.hidden_size * config.intermediate_size * 3 / 1e9) /
(1.0 * forward_time / 1e6);
int unique_experts = 0;
{
std::unordered_set<int64_t> expert_set;
for (int i = 0; i < qlen * config.num_experts_per_tok; i++) {
expert_set.insert(expert_ids[i]);
}
unique_experts = expert_set.size();
}
auto band_width =
(1.0 * unique_experts * config.hidden_size * config.intermediate_size * 3 / 1e9) / (1.0 * forward_time / 1e6);
auto GFLOPS =
(1.0 * config.hidden_size * config.intermediate_size * qlen * 3 * config.routed_expert_num * 2 / 1e9) /
(1.0 * config.hidden_size * config.intermediate_size * qlen * 3 * config.num_experts_per_tok * 2 / 1e9) /
(1.0 * forward_time / 1e6);
if (qlen <= 10) {
forward_time_sum_ns += forward_time;
forward_count++;
}
auto average_bandwidth =
(1.0 * forward_count * config.routed_expert_num * config.hidden_size * config.intermediate_size * 3 / 1e9) /
(1.0 * forward_count * unique_experts * config.hidden_size * config.intermediate_size * 3 / 1e9) /
(1.0 * forward_time_sum_ns / 1e6);
printf(
"forward time %ld, time stamp:%ld, band width %f GElement/s, ave bandwidth %f GElement/s (only "