From 5380a4ac6ef705f9f6e25141234dacaa6a95ffa0 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 13 Apr 2026 09:59:42 -0700 Subject: [PATCH] Add MSCCLPP_IB_GID_INDEX env (#780) Use MSCCLPP_IB_GID_INDEX to control ib gid index --------- Co-authored-by: Changho Hwang Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- include/mscclpp/core.hpp | 4 ++-- include/mscclpp/env.hpp | 4 ++++ python/csrc/env_py.cpp | 3 ++- src/core/endpoint.cc | 5 +++++ src/core/env.cpp | 4 +++- 5 files changed, 16 insertions(+), 4 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 37bdbd51..ca2fc34f 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -389,7 +389,7 @@ struct EndpointConfig { }; static constexpr int DefaultPort = -1; - static constexpr int DefaultGidIndex = 0; + static constexpr int DefaultGidIndex = -1; static constexpr int DefaultMaxCqSize = 1024; static constexpr int DefaultMaxCqPollNum = 1; static constexpr int DefaultMaxSendWr = 8192; @@ -418,7 +418,7 @@ struct EndpointConfig { /// Constructor. /// @param deviceIndex Device index. /// @param port Port number. - /// @param gidIndex GID index. + /// @param gidIndex GID index. If -1 (default), uses `MSCCLPP_IB_GID_INDEX` env variable. /// @param maxCqSize Maximum send completion queue size. /// @param maxCqPollNum Maximum send completion queue poll count. /// @param maxSendWr Maximum outstanding send work requests. diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp index fb1da22c..a6dd306b 100644 --- a/include/mscclpp/env.hpp +++ b/include/mscclpp/env.hpp @@ -115,6 +115,10 @@ class Env { /// Default is false. const bool forceDisableGdr; + /// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport. + /// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified). 
+ const int ibGidIndex; + private: Env(); diff --git a/python/csrc/env_py.cpp b/python/csrc/env_py.cpp index ce89fd3d..d4b2f5da 100644 --- a/python/csrc/env_py.cpp +++ b/python/csrc/env_py.cpp @@ -23,7 +23,8 @@ void register_env(nb::module_& m) { .def_ro("ibv_mode", &Env::ibvMode) .def_ro("cache_dir", &Env::cacheDir) .def_ro("npkit_dump_dir", &Env::npkitDumpDir) - .def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream); + .def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream) + .def_ro("ib_gid_index", &Env::ibGidIndex); m.def("env", &env); } diff --git a/src/core/endpoint.cc b/src/core/endpoint.cc index 5ab4bad0..fe51e348 100644 --- a/src/core/endpoint.cc +++ b/src/core/endpoint.cc @@ -47,6 +47,11 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl) } } + // Resolve GID index: explicit value (>= 0) takes priority, otherwise use env + if (config_.ib.gidIndex < 0) { + config_.ib.gidIndex = env()->ibGidIndex; + } + int maxRecvWr = ibNoAtomic_ ? 
 config_.ib.maxRecvWr : 0; ibQp_ = contextImpl.getIbContext(config_.transport) diff --git a/src/core/env.cpp b/src/core/env.cpp index 96f53492..7a42471b 100644 --- a/src/core/env.cpp +++ b/src/core/env.cpp @@ -66,7 +66,8 @@ Env::Env() forceNcclFallbackOperation(readEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")), ncclSymmetricMemory(readEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)), forceDisableNvls(readEnv("MSCCLPP_FORCE_DISABLE_NVLS", false)), - forceDisableGdr(readEnv("MSCCLPP_FORCE_DISABLE_GDR", false)) {} + forceDisableGdr(readEnv("MSCCLPP_FORCE_DISABLE_GDR", false)), + ibGidIndex(readEnv("MSCCLPP_IB_GID_INDEX", 0)) {} std::shared_ptr<Env> env() { static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env()); @@ -95,6 +96,7 @@ std::shared_ptr<Env> env() { logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory); logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls); logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr); + logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex); } return globalEnv; }