From 36940dbacfa679a3dbd013b47289b207fea7d93e Mon Sep 17 00:00:00 2001
From: Qinghua Zhou
Date: Mon, 30 Mar 2026 03:40:05 +0000
Subject: [PATCH] Match the message size for EP bench HT of 16 GPUs in test 6

---
 python/test/test_alltoallv_mscclpp.py | 31 +++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/python/test/test_alltoallv_mscclpp.py b/python/test/test_alltoallv_mscclpp.py
index 6b95dd85..96695f94 100644
--- a/python/test/test_alltoallv_mscclpp.py
+++ b/python/test/test_alltoallv_mscclpp.py
@@ -665,10 +665,6 @@ def main():
     # Matches the data volume of:
     #   mpirun -np N ep_bench -a ht -t 4096 -d 7168
     #
-    # Target byte counts (per rank avg, 8 GPUs):
-    #   RDMA_send = 58.72 MB (4096 tokens × 7168 × 2 bytes)
-    #   total_recv = 469.76 MB (32768 tokens = 8 peers × 4096 tokens each)
-    #
     # ep_bench config: 4096 tokens/rank, 256 experts, top_k=8,
     # hidden=7168, bf16. Each token is dispatched to top_k=8 experts,
     # so each rank receives ~4096 token-expert pairs from each peer.
@@ -677,12 +673,21 @@ def main():
     # srand(rank + 42), for each of 4096 tokens pick a random first_expert
     # in [0, num_experts), then assign top_k=8 consecutive experts.
     # target_rank = expert_id // num_local_experts.
+    #
+    # Target send bytes vary by GPU count (to match ep_bench reports):
+    #   8 GPUs:  4096 tokens/rank → 58.72 MB (no cross-boundary inflation)
+    #   16 GPUs: 4317 tokens/rank → 61.88 MB (matches ep_bench RDMA_send)
     EP_NUM_TOKENS = 4096  # tokens per rank (input)
     EP_NUM_EXPERTS = 256
     EP_TOP_K = 8
     EP_HIDDEN = 7168  # bf16 elements per token
 
+    # Target send tokens per rank, keyed by world_size.
+    #   8 GPUs:  top_k=8 ≤ num_local_experts=32, so no boundary-crossing → 4096
+    #   16 GPUs: num_local_experts=16, boundary crossing inflates to ~4317
+    EP_TARGET_TOKENS = {8: 4096, 16: 4317}
+
     if world_size >= 2:
         num_local_experts = EP_NUM_EXPERTS // world_size
@@ -705,9 +710,9 @@ def main():
             for tr in target_ranks_seen:
                 send_counts[tr] += 1
 
-        # Normalize send_counts so each rank sends exactly EP_NUM_TOKENS
-        # tokens total, ensuring total_send_bytes = 4096 × 7168 × 2 = 58,720,256 bytes.
-        TARGET_SEND_TOKENS = EP_NUM_TOKENS  # 4096
+        # Normalize send_counts to the target for this world_size.
+        # For unknown world_size, keep raw counts.
+        TARGET_SEND_TOKENS = EP_TARGET_TOKENS.get(world_size, sum(send_counts))
         raw_total = sum(send_counts)
         if raw_total > 0 and raw_total != TARGET_SEND_TOKENS:
             scaled = [int(c * TARGET_SEND_TOKENS / raw_total) for c in send_counts]
@@ -720,7 +725,7 @@ def main():
                 scaled[indices[i % world_size]] -= 1
             send_counts = scaled
 
-        # Gather 8×8 send matrix via allgather
+        # Gather send matrix via allgather
         send_tensor = torch.tensor(send_counts, dtype=torch.int32, device='cuda')
         all_sends = [torch.zeros(world_size, dtype=torch.int32, device='cuda')
                      for _ in range(world_size)]
@@ -739,6 +744,10 @@ def main():
         total_send_bytes = sum(in_splits) * 2
         total_recv_bytes = sum(out_splits) * 2
 
+        target_send_mb = TARGET_SEND_TOKENS * EP_HIDDEN * 2 / 1e6
+        target_recv_tokens = world_size * EP_NUM_TOKENS
+        target_recv_mb = target_recv_tokens * EP_HIDDEN * 2 / 1e6
+
         if rank == 0:
             print(f"\n[Test 6] NCCL EP HT-equivalent workload "
                   f"(tokens={EP_NUM_TOKENS}, experts={EP_NUM_EXPERTS}, "
@@ -747,8 +756,10 @@ def main():
             print(f"  Rank 0 recv tokens: {out_splits_tokens} (total {total_recv_tokens})")
             print(f"  Send {total_send_bytes / 1e6:.2f}MB, "
                   f"Recv {total_recv_bytes / 1e6:.2f}MB")
-            print(f"  Target: RDMA_send=58.72 MB, total_recv=469.76 MB (8 GPUs)")
-            # Show imbalance
+            print(f"  Target: RDMA_send={target_send_mb:.2f} MB "
+                  f"({TARGET_SEND_TOKENS} tokens), "
+                  f"total_recv={target_recv_mb:.2f} MB "
+                  f"({target_recv_tokens} tokens)")
             max_out = max(out_splits_tokens)
             min_out = min(out_splits_tokens)
             print(f"  Recv imbalance: {max_out/min_out:.2f}x "
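
Reviewer note: the 4096 vs. 4317 token targets can be sanity-checked on the
host, without GPUs or MPI. Below is a minimal sketch of the counting and
normalization scheme the comments describe. It is not the test's code:
Python's random.Random stands in for the C-style srand(rank + 42)/rand()
stream, expert runs are assumed to wrap modulo num_experts, and the rounding
top-up loop is a simplified stand-in for the test's indices-based adjustment,
so the raw (pre-normalization) per-peer counts will differ from ep_bench's.

import random

EP_NUM_TOKENS = 4096
EP_NUM_EXPERTS = 256
EP_TOP_K = 8
EP_HIDDEN = 7168  # bf16 elements per token
EP_TARGET_TOKENS = {8: 4096, 16: 4317}

def raw_send_counts(rank, world_size):
    # One send per *unique* target rank per token, as in the test.
    num_local_experts = EP_NUM_EXPERTS // world_size
    rng = random.Random(rank + 42)  # stand-in for srand(rank + 42)
    counts = [0] * world_size
    for _ in range(EP_NUM_TOKENS):
        first_expert = rng.randrange(EP_NUM_EXPERTS)
        experts = [(first_expert + k) % EP_NUM_EXPERTS for k in range(EP_TOP_K)]
        for tr in {e // num_local_experts for e in experts}:
            counts[tr] += 1
    return counts

def normalize(counts, world_size):
    # Scale to the per-world-size target; unknown world sizes keep raw counts.
    target = EP_TARGET_TOKENS.get(world_size, sum(counts))
    raw_total = sum(counts)
    if raw_total > 0 and raw_total != target:
        scaled = [int(c * target / raw_total) for c in counts]
        i = 0  # int() truncates, so top up the shortfall round-robin
        while sum(scaled) < target:
            scaled[i % world_size] += 1
            i += 1
        counts = scaled
    return counts

for ws in (8, 16):
    counts = normalize(raw_send_counts(rank=0, world_size=ws), ws)
    print(f"{ws} GPUs: send tokens={sum(counts)}, "
          f"send={sum(counts) * EP_HIDDEN * 2 / 1e6:.2f} MB")

Whatever the RNG stand-in produces, the normalization lands exactly on the
target total, so this prints 4096 tokens (58.72 MB) for 8 GPUs and 4317
tokens (61.89 MB) for 16; only the per-peer split depends on the random
stream.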
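
The figures on the new "Target:" line follow from the same two formulas the
patch adds (target_send_mb and target_recv_mb). A worked check using only the
patch's constants:

EP_NUM_TOKENS, EP_HIDDEN = 4096, 7168  # bf16, 2 bytes per element
for world_size, send_tokens in {8: 4096, 16: 4317}.items():
    send_mb = send_tokens * EP_HIDDEN * 2 / 1e6
    recv_tokens = world_size * EP_NUM_TOKENS
    recv_mb = recv_tokens * EP_HIDDEN * 2 / 1e6
    print(f"{world_size} GPUs: RDMA_send={send_mb:.2f} MB ({send_tokens} tokens), "
          f"total_recv={recv_mb:.2f} MB ({recv_tokens} tokens)")
# 8 GPUs: RDMA_send=58.72 MB (4096 tokens), total_recv=469.76 MB (32768 tokens)
# 16 GPUs: RDMA_send=61.89 MB (4317 tokens), total_recv=939.52 MB (65536 tokens)

One nit: 4317 × 7168 × 2 = 61,888,512 bytes, which :.2f renders as 61.89 MB,
one hundredth above the 61.88 MB quoted in the new comment.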