Unique QP per channel and env-controlled GID index

- Change executor to create one connection (unique QP) per channel entry
  instead of sharing connections per peer. This is required for HostNoAtomic
  IB mode where each connection can only forward signals to one semaphore
  via setSignalForwardingDst.

- Add MSCCLPP_IB_GID_INDEX environment variable to override the GID index
  used for IB transport (default: 3). Set it to the desired non-negative
  GID index value, or leave it unset (or set a negative value such as -1)
  to keep the GID index from the endpoint configuration.
This commit is contained in:
Ubuntu
2026-03-09 20:27:28 +00:00
parent 3efb1fd0d3
commit 2478553b22
3 changed files with 13 additions and 1 deletions

View File

@@ -115,6 +115,10 @@ class Env {
/// Default is false.
const bool forceDisableGdr;
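/// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport.
/// A non-negative value overrides the GID index given in the endpoint configuration.
/// If unset or set to a negative value (e.g. -1), the configured GID index is used,
/// which defaults to `EndpointConfig::Ib::DefaultGidIndex` (3).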
const int ibGidIndex;
private:
Env();

View File

@@ -49,8 +49,14 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
int maxRecvWr = ibNoAtomic_ ? config_.ib.maxRecvWr : 0;
// Override GID index from environment variable if set
int gidIndex = config_.ib.gidIndex;
if (env()->ibGidIndex >= 0) {
gidIndex = env()->ibGidIndex;
}
ibQp_ = contextImpl.getIbContext(config_.transport)
->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
->createQp(config_.ib.port, gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend, ibNoAtomic_);
ibQpInfo_ = ibQp_->getInfo();

View File

@@ -67,6 +67,7 @@ Env::Env()
ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)) {}
ibGidIndex(readEnv<int>("MSCCLPP_IB_GID_INDEX", -1)) {}
std::shared_ptr<Env> env() {
static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
@@ -95,6 +96,7 @@ std::shared_ptr<Env> env() {
logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory);
logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex);
}
return globalEnv;
}