Adding necessary macros for enabling mrc support (#797)

This PR adds necessary macros and instructions for enabling mrc support with no atomic.
This commit is contained in:
Mahdieh Ghazi
2026-05-05 17:17:41 -04:00
committed by GitHub
parent 9ec26fa4d1
commit 822fbb2351
4 changed files with 80 additions and 0 deletions

View File

@@ -54,6 +54,7 @@ option(MSCCLPP_BUILD_EXT_COLLECTIVES "Build collective algorithms" ON)
option(MSCCLPP_USE_CUDA "Use NVIDIA/CUDA." OFF)
option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
option(MSCCLPP_USE_IB "Use InfiniBand." ON)
# When ON, the MSCCLPP_USE_MRC compile definition is added to the mscclpp_obj target.
option(MSCCLPP_USE_MRC "Enable MRC support" OFF)
option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF)

View File

@@ -126,6 +126,45 @@ $ python -m pip install ".[cuda12,benchmark]"
$ python -m pip install ".[cuda12,benchmark,test]"
```
(mrc-support)=
## MRC Support
MSCCL++ supports execution over **Multi-path Reliable Connection (MRC)**, which enables the use of multiple network paths to improve bandwidth utilization and resilience.
To enable MRC support, you must configure both the **build-time** and **runtime** environments as described below.
---
### 1. Install MRC Verbs Shim
MSCCL++ relies on a custom verbs shim library that intercepts standard `libibverbs` calls and redirects them to an MRC-enabled implementation.
- Install the [MRC verbs shim library](https://github.com/microsoft/mrc-verbs-shim-lib) on all nodes in the cluster.
- Ensure that the underlying system has MRC support enabled.
---
### 2. Build MSCCL++ with MRC Enabled
Enable MRC support during the build by adding the following CMake option:
```bash
-DMSCCLPP_USE_MRC=ON
```
This configures MSCCL++ to use the MRC-enabled verbs layer at runtime.
### 3. Configure Runtime Environment
At runtime, you must configure environment variables to override the default RDMA libraries and link against the MRC-enabled stack:
```bash
-x MSCCLPP_IBV_SO=$MRC-SHIM-HOME/libibverbs.so
-x LD_LIBRARY_PATH=$MRC-SHIM-HOME/mrc-header-lib:$LD_LIBRARY_PATH
-x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so
-x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1
```
(vscode-dev-container)=
## VSCode Dev Container

View File

@@ -59,6 +59,10 @@ if(MSCCLPP_NPKIT_FLAGS)
target_compile_definitions(mscclpp_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS})
endif()
# Pass the MSCCLPP_USE_MRC option through to the sources as a private compile definition,
# so code guarded by `#if (MSCCLPP_USE_MRC)` is only compiled when the option is enabled.
if(MSCCLPP_USE_MRC)
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_MRC)
endif()
# libmscclpp
add_library(mscclpp SHARED)
target_link_libraries(mscclpp PUBLIC mscclpp_obj)

View File

@@ -10,11 +10,28 @@
#include "logger.hpp"
// Adding MSCCLPP_USE_MRC macro for MRC enablement.
// Non-MRC environments will not be affected by this macro as long as VMRC_LIBIBVERBS_SO
// environment variable is not set.
#if (MSCCLPP_USE_MRC)
#include <cstdlib>
#include <set>
#endif // (MSCCLPP_USE_MRC)
namespace mscclpp {
// Handle to the libibverbs shared object that IBVerbs::dlsym resolves symbols from.
// The deleter is ::dlclose, so a non-null handle is closed when this object is destroyed.
static std::unique_ptr<void, int (*)(void*)> globalIBVerbsHandle(nullptr, &::dlclose);
#if (MSCCLPP_USE_MRC)
// Handle to the library named by the VMRC_LIBIBVERBS_SO environment variable. It is opened
// on first use inside IBVerbs::dlsym, and only for symbols that are not in that function's
// mrcSymbols set. Closed with ::dlclose on destruction, like the handle above.
static std::unique_ptr<void, int (*)(void*)> globalOrigIBVerbsHandle(nullptr, &::dlclose);
#endif // (MSCCLPP_USE_MRC)
void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
#if (MSCCLPP_USE_MRC)
static std::set<std::string> mrcSymbols = {
"ibv_get_device_list", "ibv_get_device_name", "ibv_open_device", "ibv_close_device", "ibv_query_qp",
"ibv_create_cq", "ibv_destroy_cq", "ibv_create_qp", "ibv_modify_qp", "ibv_destroy_qp",
};
#endif // (MSCCLPP_USE_MRC)
if (!globalIBVerbsHandle) {
if (mscclpp::env()->ibvSo != "") {
void* handle = ::dlopen(mscclpp::env()->ibvSo.c_str(), RTLD_NOW);
@@ -38,7 +55,26 @@ void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
THROW(NET, SysError, errno, "Failed to open libibverbs: ", std::string(::dlerror()));
}
}
#if (MSCCLPP_USE_MRC)
// In MRC mode, `VMRC_LIBIBVERBS_SO` should be set.
char* vmrcLibibverbsSo = ::getenv("VMRC_LIBIBVERBS_SO");
void* ptr;
if (vmrcLibibverbsSo != nullptr && mrcSymbols.find(symbol) == mrcSymbols.end()) {
// If we are in MRC mode and the symbol is not in the table, get it from the original libibverbs.
if (!globalOrigIBVerbsHandle) {
void* handle = ::dlopen(vmrcLibibverbsSo, RTLD_NOW);
if (!handle) {
THROW(NET, SysError, errno, "Failed to open ", std::string(vmrcLibibverbsSo));
}
globalOrigIBVerbsHandle.reset(handle);
}
ptr = ::dlsym(globalOrigIBVerbsHandle.get(), symbol.c_str());
} else {
ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
}
#else // !(MSCCLPP_USE_MRC)
void* ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
#endif // !(MSCCLPP_USE_MRC)
if (!ptr && !allowReturnNull) {
THROW(NET, SysError, errno, "Failed to load libibverbs symbol: ", symbol);
}