From 3f459a995d8beb3ee767ae2a8ecf4bdf50ee15d6 Mon Sep 17 00:00:00 2001 From: qinghuazhou Date: Tue, 12 May 2026 02:53:40 +0000 Subject: [PATCH] =?UTF-8?q?test/ext/ep:=20HT=20tests=20=E2=80=94=20env-dri?= =?UTF-8?q?ven=20cfg=20+=20allgather=20bookkeeping?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Internode HT test: accept MSCCLPP_EP_HT_{TOKENS,HIDDEN,TOPK,EXPERTS} env vars to override the functional-check problem size (was hardcoded to num_tokens=128, hidden=1024, num_topk=min(4,num_ranks), num_experts=num_ranks*4). - Both intranode + internode HT tests: replace dist.all_to_all_single bookkeeping (per-(src,dst) recv-count matrix used for the six-metric NVL/RDMA BW breakdown) with dist.all_gather_into_tensor + transpose. Functionally identical (gathered[:, rank] gives the same recv-from-src column) but works on socket-NCCL with NCCL_IB_DISABLE=1, which is required on rigs where NCCL IB cannot coexist with mscclpp RDMA. Each rank still contributes its num_ranks-element int64 row, but now gathers the full num_ranks^2 int64 matrix instead of receiving only num_ranks values — negligible (64 ints at 8 ranks).
--- .../python/ext/ep/test_internode_multirank.py | 24 +++++++++++-------- .../python/ext/ep/test_intranode_multirank.py | 13 +++++----- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/test/python/ext/ep/test_internode_multirank.py b/test/python/ext/ep/test_internode_multirank.py index c789cc03..901a67a7 100644 --- a/test/python/ext/ep/test_internode_multirank.py +++ b/test/python/ext/ep/test_internode_multirank.py @@ -79,10 +79,13 @@ def main(): num_local_ranks = NUM_MAX_NVL_PEERS # Small settings for functional check - num_tokens = 128 - hidden = 1024 - num_topk = min(4, num_ranks) - num_experts = num_ranks * 4 # multiple of num_ranks + import os as _os + num_tokens = int(_os.environ.get("MSCCLPP_EP_HT_TOKENS", "128")) + hidden = int(_os.environ.get("MSCCLPP_EP_HT_HIDDEN", "1024")) + num_topk = int(_os.environ.get("MSCCLPP_EP_HT_TOPK", str(min(4, num_ranks)))) + _experts_env = _os.environ.get("MSCCLPP_EP_HT_EXPERTS", "") + num_experts = int(_experts_env) if _experts_env else num_ranks * 4 + assert num_experts % num_ranks == 0 torch.manual_seed(0xA1B2 + rank) @@ -425,12 +428,13 @@ def main(): total_send_tokens_local = int(nodes_unique.sum().item()) nvl_send_tokens_local = int(nodes_unique[local_node].item()) rdma_send_tokens_local = total_send_tokens_local - nvl_send_tokens_local - recv_from_src = torch.empty(num_ranks, dtype=torch.int64, device="cuda") - dist.all_to_all_single( - recv_from_src, - num_tokens_per_rank_b.to(torch.int64), - group=group, - ) + # Replaced dist.all_to_all_single (NCCL socket transport fails with + # NCCL_IB_DISABLE=1 internode) with all_gather_into_tensor + transpose, + # which works on the same socket-NCCL setup the LL test uses. 
+ _send_row = num_tokens_per_rank_b.to(torch.int64).contiguous() + _gathered = torch.empty(num_ranks * num_ranks, dtype=torch.int64, device="cuda") + dist.all_gather_into_tensor(_gathered, _send_row, group=group) + recv_from_src = _gathered.view(num_ranks, num_ranks)[:, rank].contiguous() src_node = torch.arange(num_ranks, device="cuda") // num_local_ranks remote_mask = (src_node != local_node).to(torch.int64) total_recv_tokens_local = int(recv_from_src.sum().item()) diff --git a/test/python/ext/ep/test_intranode_multirank.py b/test/python/ext/ep/test_intranode_multirank.py index 08bf4355..22dd2da5 100644 --- a/test/python/ext/ep/test_intranode_multirank.py +++ b/test/python/ext/ep/test_intranode_multirank.py @@ -354,12 +354,13 @@ def main(): bytes_per_token = bench_hidden * x_b.element_size() total_send_tokens_local = int(is_token_in_rank_b.any(dim=1).sum().item()) rdma_send_tokens_local = 0 # intranode: no remote nodes - recv_from_src = torch.empty(num_ranks, dtype=torch.int64, device="cuda") - dist.all_to_all_single( - recv_from_src, - num_tokens_per_rank_b.to(torch.int64), - group=group, - ) + # Replaced dist.all_to_all_single (NCCL socket transport fails with + # NCCL_IB_DISABLE=1 internode) with all_gather_into_tensor + transpose, + # which works on the same socket-NCCL setup the LL test uses. + _send_row = num_tokens_per_rank_b.to(torch.int64).contiguous() + _gathered = torch.empty(num_ranks * num_ranks, dtype=torch.int64, device="cuda") + dist.all_gather_into_tensor(_gathered, _send_row, group=group) + recv_from_src = _gathered.view(num_ranks, num_ranks)[:, rank].contiguous() total_recv_tokens_local = int(recv_from_src.sum().item()) rdma_recv_tokens_local = 0 # intranode