From 25297748681703b888381cb5fdb1dc5e678d7b89 Mon Sep 17 00:00:00 2001 From: Qinghua Zhou Date: Wed, 29 Apr 2026 23:31:47 +0000 Subject: [PATCH] tests/ep: intranode send-side counts unique (token, dst_node) to match NCCL-EP Previously total_send_tokens was the sum over dst_rank of num_tokens_per_rank, which over-counts intra-node fan-out (a token sent to several ranks on the same node is counted once per rank). NCCL-EP's ep_bench collapses multiple destinations on the same node into one count; on a single-node run that means total_send_tokens = number of tokens with at least one valid expert. Switching to is_token_in_rank.any(dim=1).sum() makes the send-side bandwidth comparable to NCCL-EP's send: total_bw / nvl_bw line. --- test/python/ext/ep/test_intranode_multirank.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/python/ext/ep/test_intranode_multirank.py b/test/python/ext/ep/test_intranode_multirank.py index 54d6b689..4dcfad9f 100644 --- a/test/python/ext/ep/test_intranode_multirank.py +++ b/test/python/ext/ep/test_intranode_multirank.py @@ -297,8 +297,14 @@ def main(): # NCCL-EP `ep_bench` six-metric breakdown # (intranode -> single node, so rdma_*=0; nvl_*=total_*). + # + # Send side follows NCCL-EP: count unique (token, dst_node) pairs. With a + # single node every selected destination collapses to that node, so a + # token with at least one valid expert contributes exactly one to + # `total_send_tokens`. Recv side counts unique (src_rank, token) pairs + # landing on this rank. bytes_per_token = bench_hidden * x_b.element_size() - total_send_tokens_local = int(num_tokens_per_rank_b.sum().item()) + total_send_tokens_local = int(is_token_in_rank_b.any(dim=1).sum().item()) rdma_send_tokens_local = 0 # intranode: no remote nodes recv_from_src = torch.empty(num_ranks, dtype=torch.int64, device="cuda") dist.all_to_all_single(