From e5ccac520cdb06e3c4934b02d76dccb75832d729 Mon Sep 17 00:00:00 2001 From: Qinghua Zhou Date: Fri, 8 May 2026 21:56:31 +0000 Subject: [PATCH] ext/ep: env-tunable IBGDA channel count (MSCCLPP_EP_IBGDA_CHANNELS) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default is still 16. Lets us sweep QP count for diagnostic purposes. Sweep results at TOKENS=128/TOPK=8 BF16, 16 ranks across 2 nodes: channels=4 dispatch=37.16 combine=36.57 agg=1180 GB/s channels=8 dispatch=36.95 combine=36.64 agg=1177 GB/s channels=16 dispatch=36.18 combine=37.36 agg=1177 GB/s channels=32 dispatch=36.45 combine=37.67 agg=1186 GB/s Results are essentially flat across QP counts 4..32 (within ±2% measurement noise), confirming that SQ/doorbell coalescing is not the bottleneck. The ~200us 'wait' window in both dispatch and combine is genuine RDMA arrival jitter, not back-pressure from over-provisioned QPs. --- src/ext/ep/buffer.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/ext/ep/buffer.cc b/src/ext/ep/buffer.cc index 51cd40bb..fb8153a2 100644 --- a/src/ext/ep/buffer.cc +++ b/src/ext/ep/buffer.cc @@ -546,7 +546,12 @@ void Buffer::sync(const std::vector& device_ids, // outcomes must be identical. // ------------------------------------------------------------------ if (use_ibgda_path_ && num_rdma_bytes > 0 && num_rdma_ranks > 1) { - constexpr int kNumIbgdaChannels = 16; // mirrors num_port_channels_per_rank above + int num_ibgda_channels = 16; // mirrors num_port_channels_per_rank above + if (const char* e = std::getenv("MSCCLPP_EP_IBGDA_CHANNELS")) { + int v = std::atoi(e); + if (v > 0) num_ibgda_channels = v; + } + const int kNumIbgdaChannels = num_ibgda_channels; try { ibgda_setup_ = mscclpp::ep::build_ibgda_setup(rank, num_ranks, /*ib_transport_index=*/device_id, kNumIbgdaChannels, rdma_buffer_ptr,