From 822fbb235168d3bf85112bdf53a55cbfc86e4737 Mon Sep 17 00:00:00 2001 From: Mahdieh Ghazi Date: Tue, 5 May 2026 17:17:41 -0400 Subject: [PATCH] Adding necessary macros for enabling mrc support (#797) This PR adds necessary macros and instructions for enabling mrc support with no atomic. --- CMakeLists.txt | 1 + docs/quickstart.md | 39 +++++++++++++++++++++++++++++++++++++ src/core/CMakeLists.txt | 4 ++++ src/core/ibverbs_wrapper.cc | 36 ++++++++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ef8b785a..49154e0b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,7 @@ option(MSCCLPP_BUILD_EXT_COLLECTIVES "Build collective algorithms" ON) option(MSCCLPP_USE_CUDA "Use NVIDIA/CUDA." OFF) option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF) option(MSCCLPP_USE_IB "Use InfiniBand." ON) +option(MSCCLPP_USE_MRC "Enable MRC support" OFF) option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF) option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF) option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF) diff --git a/docs/quickstart.md b/docs/quickstart.md index 83a08d6a..716fcf61 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -126,6 +126,45 @@ $ python -m pip install ".[cuda12,benchmark]" $ python -m pip install ".[cuda12,benchmark,test]" ``` +(mrc-support)= +## MRC Support + +MSCCL++ supports execution over **Multi-path Reliable Connection (MRC)**, which enables the use of multiple network paths to improve bandwidth utilization and resilience. + +To enable MRC support, you must configure both the **build-time** and **runtime** environments as described below. + +--- + +### 1. Install MRC Verbs Shim + +MSCCL++ relies on a custom verbs shim library that intercepts standard `libibverbs` calls and redirects them to an MRC-enabled implementation. + +- Install the [MRC verbs shim library](https://github.com/microsoft/mrc-verbs-shim-lib) on all nodes in the cluster. 
+- Ensure that the underlying system has MRC support enabled. + +--- + +### 2. Build MSCCL++ with MRC Enabled + +Enable MRC support during the build by adding the following CMake option: + +```bash +-DMSCCLPP_USE_MRC=ON +``` + +This configures MSCCL++ to use the MRC-enabled verbs layer at runtime. + +### 3. Configure Runtime Environment + +At runtime, you must configure environment variables to override the default RDMA libraries and link against the MRC-enabled stack: + +```bash +-x MSCCLPP_IBV_SO=:$MRC-SHIM-HOME/libibverbs.so +-x LD_LIBRARY_PATH=$MRC-SHIM-HOME/mrc-header-lib:$LD_LIBRARY_PATH +-x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so +-x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 +``` + (vscode-dev-container)= ## VSCode Dev Container diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 9ca5fed3..5b89eedc 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -59,6 +59,10 @@ if(MSCCLPP_NPKIT_FLAGS) target_compile_definitions(mscclpp_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS}) endif() +if(MSCCLPP_USE_MRC) + target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_MRC) +endif() + # libmscclpp add_library(mscclpp SHARED) target_link_libraries(mscclpp PUBLIC mscclpp_obj) diff --git a/src/core/ibverbs_wrapper.cc b/src/core/ibverbs_wrapper.cc index 51f3f29c..60ee0694 100644 --- a/src/core/ibverbs_wrapper.cc +++ b/src/core/ibverbs_wrapper.cc @@ -10,11 +10,28 @@ #include "logger.hpp" +// Adding MSCCLPP_USE_MRC macro for MRC enablement. +// Non-MRC environments will not be affected by this macro as long as VMRC_LIBIBVERBS_SO +// environment variable is not set. 
+#if (MSCCLPP_USE_MRC) +#include +#include +#endif // (MSCCLPP_USE_MRC) + namespace mscclpp { static std::unique_ptr globalIBVerbsHandle(nullptr, &::dlclose); +#if (MSCCLPP_USE_MRC) +static std::unique_ptr globalOrigIBVerbsHandle(nullptr, &::dlclose); +#endif // (MSCCLPP_USE_MRC) void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) { +#if (MSCCLPP_USE_MRC) + static std::set mrcSymbols = { + "ibv_get_device_list", "ibv_get_device_name", "ibv_open_device", "ibv_close_device", "ibv_query_qp", + "ibv_create_cq", "ibv_destroy_cq", "ibv_create_qp", "ibv_modify_qp", "ibv_destroy_qp", + }; +#endif // (MSCCLPP_USE_MRC) if (!globalIBVerbsHandle) { if (mscclpp::env()->ibvSo != "") { void* handle = ::dlopen(mscclpp::env()->ibvSo.c_str(), RTLD_NOW); @@ -38,7 +55,26 @@ void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) { THROW(NET, SysError, errno, "Failed to open libibverbs: ", std::string(::dlerror())); } } +#if (MSCCLPP_USE_MRC) + // In MRC mode, `VMRC_LIBIBVERBS_SO` should be set. + char* vmrcLibibverbsSo = ::getenv("VMRC_LIBIBVERBS_SO"); + void* ptr; + if (vmrcLibibverbsSo != nullptr && mrcSymbols.find(symbol) == mrcSymbols.end()) { + // If we are in MRC mode and the symbol is not in the table, get it from the original libibverbs. + if (!globalOrigIBVerbsHandle) { + void* handle = ::dlopen(vmrcLibibverbsSo, RTLD_NOW); + if (!handle) { + THROW(NET, SysError, errno, "Failed to open ", std::string(vmrcLibibverbsSo)); + } + globalOrigIBVerbsHandle.reset(handle); + } + ptr = ::dlsym(globalOrigIBVerbsHandle.get(), symbol.c_str()); + } else { + ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str()); + } +#else // !(MSCCLPP_USE_MRC) void* ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str()); +#endif // !(MSCCLPP_USE_MRC) if (!ptr && !allowReturnNull) { THROW(NET, SysError, errno, "Failed to load libibverbs symbol: ", symbol); }