mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 17:00:22 +00:00
ext/ep: env-tunable IBGDA channel count (MSCCLPP_EP_IBGDA_CHANNELS)
Default still 16. Lets us sweep QP count for diagnostic purposes.

Sweep results at TOKENS=128/TOPK=8 BF16, 16 ranks across 2 nodes:

  channels=4   dispatch=37.16  combine=36.57  agg=1180 GB/s
  channels=8   dispatch=36.95  combine=36.64  agg=1177 GB/s
  channels=16  dispatch=36.18  combine=37.36  agg=1177 GB/s
  channels=32  dispatch=36.45  combine=37.67  agg=1186 GB/s

Throughput is essentially flat as QP count goes from 4 to 32 (within ±2% measurement noise), confirming SQ/doorbell coalescing is not the bottleneck. The ~200us 'wait' window in both dispatch and combine is genuine RDMA arrival jitter, not back-pressure from over-provisioned QPs.
This commit is contained in:
@@ -546,7 +546,12 @@ void Buffer::sync(const std::vector<int>& device_ids,
|
||||
// outcomes must be identical.
|
||||
// ------------------------------------------------------------------
|
||||
if (use_ibgda_path_ && num_rdma_bytes > 0 && num_rdma_ranks > 1) {
|
||||
constexpr int kNumIbgdaChannels = 16; // mirrors num_port_channels_per_rank above
|
||||
int num_ibgda_channels = 16; // mirrors num_port_channels_per_rank above
|
||||
if (const char* e = std::getenv("MSCCLPP_EP_IBGDA_CHANNELS")) {
|
||||
int v = std::atoi(e);
|
||||
if (v > 0) num_ibgda_channels = v;
|
||||
}
|
||||
const int kNumIbgdaChannels = num_ibgda_channels;
|
||||
try {
|
||||
ibgda_setup_ = mscclpp::ep::build_ibgda_setup(rank, num_ranks, /*ib_transport_index=*/device_id,
|
||||
kNumIbgdaChannels, rdma_buffer_ptr,
|
||||
|
||||
Reference in New Issue
Block a user