mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-20 14:59:29 +00:00
Unique QP per channel and env-controlled GID index
- Change the executor to create one connection (unique QP) per channel entry instead of sharing connections per peer. This is required for HostNoAtomic IB mode, where each connection can only forward signals to one semaphore via setSignalForwardingDst.
- Add the MSCCLPP_IB_GID_INDEX environment variable to override the default GID index (3) used for IB transport. Set it to the desired GID index value, or leave it unset (or set it to -1) to use the default.
This commit is contained in:
@@ -115,6 +115,10 @@ class Env {
|
||||
/// Default is false.
|
||||
const bool forceDisableGdr;
|
||||
|
||||
/// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport.
|
||||
/// If unset or negative (the default is -1), the endpoint's configured GID index is used instead,
/// which itself defaults to `EndpointConfig::Ib::DefaultGidIndex` (3).
|
||||
const int ibGidIndex;
|
||||
|
||||
private:
|
||||
Env();
|
||||
|
||||
|
||||
@@ -49,8 +49,14 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
|
||||
|
||||
int maxRecvWr = ibNoAtomic_ ? config_.ib.maxRecvWr : 0;
|
||||
|
||||
// Override GID index from environment variable if set
|
||||
int gidIndex = config_.ib.gidIndex;
|
||||
if (env()->ibGidIndex >= 0) {
|
||||
gidIndex = env()->ibGidIndex;
|
||||
}
|
||||
|
||||
ibQp_ = contextImpl.getIbContext(config_.transport)
|
||||
->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
|
||||
->createQp(config_.ib.port, gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
|
||||
config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend, ibNoAtomic_);
|
||||
ibQpInfo_ = ibQp_->getInfo();
|
||||
|
||||
|
||||
@@ -67,6 +67,7 @@ Env::Env()
|
||||
ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
|
||||
forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
|
||||
forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)) {}
|
||||
ibGidIndex(readEnv<int>("MSCCLPP_IB_GID_INDEX", -1)) {}
|
||||
|
||||
std::shared_ptr<Env> env() {
|
||||
static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
|
||||
@@ -95,6 +96,7 @@ std::shared_ptr<Env> env() {
|
||||
logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory);
|
||||
logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
|
||||
logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
|
||||
logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex);
|
||||
}
|
||||
return globalEnv;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user