From 63afb25ab369f87a07782e84e532bf823f32f910 Mon Sep 17 00:00:00 2001
From: Qinghua Zhou <qinghuazhou@microsoft.com>
Date: Thu, 23 Apr 2026 21:53:34 +0000
Subject: [PATCH] =?UTF-8?q?tests/ep:=20LL=20bench=20combine=20uses=20recv?=
 =?UTF-8?q?=5Ftokens=C3=97hidden=20for=20payload=20bytes?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During combine, each local expert sends one copy per dispatched token
back to the token's owning rank, so the bytes actually on the wire match
dispatch. The previous num_tokens×hidden estimate under-counted the
combine payload by ~num_topk×, making combine bandwidth look
artificially low next to dispatch.
---
 test/python/ext/ep/test_low_latency_multirank.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/python/ext/ep/test_low_latency_multirank.py b/test/python/ext/ep/test_low_latency_multirank.py
index 1e9f6ba5..d82ea309 100644
--- a/test/python/ext/ep/test_low_latency_multirank.py
+++ b/test/python/ext/ep/test_low_latency_multirank.py
@@ -256,10 +256,12 @@ def main():
     comb_us = start_ev.elapsed_time(end_ev) * 1e3 / iters
 
     # Dispatch payload: recv_tokens × hidden × bf16 (received on this rank).
-    # Combine payload: num_tokens × hidden × bf16 (sent from each local expert
-    # back to the owning rank; one token's worth of bytes per reduction).
+    # Combine payload: recv_tokens × hidden × bf16 as well -- each local expert
+    # sends one copy per dispatched token back to its owner, so the bytes on
+    # the wire match dispatch. Using num_tokens × hidden here would under-count
+    # the actual send payload by ~num_topk×.
     disp_bytes = recv_tokens * hidden * 2
-    comb_bytes = num_tokens * hidden * 2
+    comb_bytes = recv_tokens * hidden * 2
     disp_bw = disp_bytes / (disp_us * 1e-6) / 1e9
     comb_bw = comb_bytes / (comb_us * 1e-6) / 1e9