mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 17:00:22 +00:00
Adding necessary macros for enabling mrc support (#797)
This PR adds necessary macros and instructions for enabling mrc support with no atomic.
This commit is contained in:
@@ -54,6 +54,7 @@ option(MSCCLPP_BUILD_EXT_COLLECTIVES "Build collective algorithms" ON)
|
||||
option(MSCCLPP_USE_CUDA "Use NVIDIA/CUDA." OFF)
|
||||
option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
|
||||
option(MSCCLPP_USE_IB "Use InfiniBand." ON)
|
||||
option(MSCCLPP_USE_MRC "Enable MRC support" OFF)
|
||||
option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
|
||||
option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
|
||||
option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF)
|
||||
|
||||
@@ -126,6 +126,45 @@ $ python -m pip install ".[cuda12,benchmark]"
|
||||
$ python -m pip install ".[cuda12,benchmark,test]"
|
||||
```
|
||||
|
||||
(mrc-support)=
|
||||
## MRC Support
|
||||
|
||||
MSCCL++ supports execution over **Multi-path Reliable Connection (MRC)**, which enables the use of multiple network paths to improve bandwidth utilization and resilience.
|
||||
|
||||
To enable MRC support, you must configure both the **build-time** and **runtime** environments as described below.
|
||||
|
||||
---
|
||||
|
||||
### 1. Install MRC Verbs Shim
|
||||
|
||||
MSCCL++ relies on a custom verbs shim library that intercepts standard `libibverbs` calls and redirects them to an MRC-enabled implementation.
|
||||
|
||||
- Install the [MRC verbs shim library](https://github.com/microsoft/mrc-verbs-shim-lib) on all nodes in the cluster.
|
||||
- Ensure that the underlying system has MRC support enabled.
|
||||
|
||||
---
|
||||
|
||||
### 2. Build MSCCL++ with MRC Enabled
|
||||
|
||||
Enable MRC support during the build by adding the following CMake option:
|
||||
|
||||
```bash
|
||||
-DMSCCLPP_USE_MRC=ON
|
||||
```
|
||||
|
||||
This configures MSCCL++ to use the MRC-enabled verbs layer at runtime.
|
||||
|
||||
### 3. Configure Runtime Environment
|
||||
|
||||
At runtime, you must configure environment variables to override the default RDMA libraries and link against the MRC-enabled stack:
|
||||
|
||||
```bash
|
||||
-x MSCCLPP_IBV_SO=$MRC_SHIM_HOME/libibverbs.so
|
||||
-x LD_LIBRARY_PATH=$MRC_SHIM_HOME/mrc-header-lib:$LD_LIBRARY_PATH
|
||||
-x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so
|
||||
-x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1
|
||||
```
|
||||
|
||||
(vscode-dev-container)=
|
||||
## VSCode Dev Container
|
||||
|
||||
|
||||
@@ -59,6 +59,10 @@ if(MSCCLPP_NPKIT_FLAGS)
|
||||
target_compile_definitions(mscclpp_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS})
|
||||
endif()
|
||||
|
||||
if(MSCCLPP_USE_MRC)
|
||||
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_MRC)
|
||||
endif()
|
||||
|
||||
# libmscclpp
|
||||
add_library(mscclpp SHARED)
|
||||
target_link_libraries(mscclpp PUBLIC mscclpp_obj)
|
||||
|
||||
@@ -10,11 +10,28 @@
|
||||
|
||||
#include "logger.hpp"
|
||||
|
||||
// Adding MSCCLPP_USE_MRC macro for MRC enablement.
|
||||
// Non-MRC environments will not be affected by this macro as long as VMRC_LIBIBVERBS_SO
|
||||
// environment variable is not set.
|
||||
#if (MSCCLPP_USE_MRC)
|
||||
#include <cstdlib>
|
||||
#include <set>
|
||||
#endif // (MSCCLPP_USE_MRC)
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
static std::unique_ptr<void, int (*)(void*)> globalIBVerbsHandle(nullptr, &::dlclose);
|
||||
#if (MSCCLPP_USE_MRC)
|
||||
static std::unique_ptr<void, int (*)(void*)> globalOrigIBVerbsHandle(nullptr, &::dlclose);
|
||||
#endif // (MSCCLPP_USE_MRC)
|
||||
|
||||
void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
|
||||
#if (MSCCLPP_USE_MRC)
|
||||
static std::set<std::string> mrcSymbols = {
|
||||
"ibv_get_device_list", "ibv_get_device_name", "ibv_open_device", "ibv_close_device", "ibv_query_qp",
|
||||
"ibv_create_cq", "ibv_destroy_cq", "ibv_create_qp", "ibv_modify_qp", "ibv_destroy_qp",
|
||||
};
|
||||
#endif // (MSCCLPP_USE_MRC)
|
||||
if (!globalIBVerbsHandle) {
|
||||
if (mscclpp::env()->ibvSo != "") {
|
||||
void* handle = ::dlopen(mscclpp::env()->ibvSo.c_str(), RTLD_NOW);
|
||||
@@ -38,7 +55,26 @@ void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
|
||||
THROW(NET, SysError, errno, "Failed to open libibverbs: ", std::string(::dlerror()));
|
||||
}
|
||||
}
|
||||
#if (MSCCLPP_USE_MRC)
|
||||
// In MRC mode, `VMRC_LIBIBVERBS_SO` should be set.
|
||||
char* vmrcLibibverbsSo = ::getenv("VMRC_LIBIBVERBS_SO");
|
||||
void* ptr;
|
||||
if (vmrcLibibverbsSo != nullptr && mrcSymbols.find(symbol) == mrcSymbols.end()) {
|
||||
// If we are in MRC mode and the symbol is not in the table, get it from the original libibverbs.
|
||||
if (!globalOrigIBVerbsHandle) {
|
||||
void* handle = ::dlopen(vmrcLibibverbsSo, RTLD_NOW);
|
||||
if (!handle) {
|
||||
THROW(NET, SysError, errno, "Failed to open ", std::string(vmrcLibibverbsSo));
|
||||
}
|
||||
globalOrigIBVerbsHandle.reset(handle);
|
||||
}
|
||||
ptr = ::dlsym(globalOrigIBVerbsHandle.get(), symbol.c_str());
|
||||
} else {
|
||||
ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
|
||||
}
|
||||
#else // !(MSCCLPP_USE_MRC)
|
||||
void* ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
|
||||
#endif // !(MSCCLPP_USE_MRC)
|
||||
if (!ptr && !allowReturnNull) {
|
||||
THROW(NET, SysError, errno, "Failed to load libibverbs symbol: ", symbol);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user