Enhance cross-node CudaIpc availability check (#803)

This commit is contained in:
Changho Hwang
2026-05-14 14:06:12 -07:00
committed by GitHub
parent 40295df4c4
commit 5d608feaa5
5 changed files with 79 additions and 16 deletions

View File

@@ -46,6 +46,15 @@ std::string getIBDeviceName(Transport ibTransport);
/// @return The InfiniBand transport associated with the specified device name.
Transport getIBTransportByDeviceName(const std::string& ibDeviceName);
/// Check whether this process can allocate/import CUDA memory with NVIDIA fabric handles
/// (`CU_MEM_HANDLE_TYPE_FABRIC`). Fabric handles enable cross-node `Transport::CudaIpc` on
/// MNNVL systems (e.g., GB200 NVL72) when the IMEX service is running. Returns `false` on
/// hardware/software stacks without MNNVL+IMEX, in which case `Transport::CudaIpc` is
/// restricted to ranks within the same node.
///
/// @return `true` if fabric handles are usable from this process, `false` otherwise.
bool isFabricMemHandleAvailable();
} // namespace mscclpp
#endif // MSCCLPP_UTILS_HPP_

View File

@@ -82,6 +82,17 @@ MSCCLPP_API_CPP int Connection::getMaxWriteQueueSize() const { return impl_->get
CudaIpcConnection::CudaIpcConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint,
const Endpoint& remoteEndpoint)
: BaseConnection(context, localEndpoint) {
// Log fabric/MNNVL availability exactly once per process so any later cross-node CudaIpc failure
// is easy to triage. C++11 magic statics make this thread-safe without an explicit mutex.
// NOTE: assigning the message to a std::string first keeps the logger's pointer-formatting
// overload from kicking in on the const char* result of the ternary.
[[maybe_unused]] static const bool fabricAvailable_ = []() {
const bool avail = isFabricMemHandleAvailable();
const std::string status = avail ? "available (cross-node CudaIpc via MNNVL/IMEX is supported)"
: "NOT available (CudaIpc is restricted to intra-node ranks on this system)";
INFO(CONN, "CudaIpc transport selected: fabric handles ", status);
return avail;
}();
if (localEndpoint.transport() != Transport::CudaIpc || remoteEndpoint.transport() != Transport::CudaIpc) {
THROW(CONN, Error, ErrorCode::InternalError, "CudaIpc transport is required for CudaIpcConnection");
}

View File

@@ -4,12 +4,11 @@
#include "context.hpp"
#include <mscclpp/env.hpp>
#include <sstream>
#include "api.h"
#include "connection.hpp"
#include "debug.h"
#include "endpoint.hpp"
#include "logger.hpp"
#include "registered_memory.hpp"
namespace mscclpp {
@@ -78,19 +77,17 @@ MSCCLPP_API_CPP Endpoint Context::createEndpoint(EndpointConfig config) {
MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) {
if (localEndpoint.device().type == DeviceType::GPU && localEndpoint.device().id < 0) {
throw Error("No GPU device ID provided for local endpoint", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "No GPU device ID provided for local endpoint");
}
if (remoteEndpoint.device().type == DeviceType::GPU && remoteEndpoint.device().id < 0) {
throw Error("No GPU device ID provided for remote endpoint", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "No GPU device ID provided for remote endpoint");
}
auto localTransport = localEndpoint.transport();
auto remoteTransport = remoteEndpoint.transport();
if (localTransport != remoteTransport &&
!(AllIBTransports.has(localTransport) && AllIBTransports.has(remoteTransport))) {
std::stringstream ss;
ss << "Transport mismatch between local (" << localTransport << ") and remote (" << remoteEndpoint.transport()
<< ") endpoints";
throw Error(ss.str(), ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "Transport mismatch between local (", localTransport, ") and remote (",
remoteTransport, ") endpoints");
}
std::shared_ptr<BaseConnection> conn;
if (localTransport == Transport::CudaIpc) {
@@ -100,7 +97,9 @@ MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const
} else if (localTransport == Transport::Ethernet) {
conn = std::make_shared<EthernetConnection>(shared_from_this(), localEndpoint, remoteEndpoint);
} else {
throw Error("Unsupported transport", ErrorCode::InternalError);
THROW(CONN, Error, ErrorCode::InternalError, "Unsupported transport: ", localTransport,
" (this usually means EndpointConfig.transport was left at Transport::Unknown — "
"set it explicitly to CudaIpc, an IB transport, or Ethernet)");
}
return Connection(conn);
}

View File

@@ -7,6 +7,7 @@
#include <cstring>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/utils.hpp>
#include "logger.hpp"
#include "unix_socket.hpp"
@@ -35,7 +36,7 @@ std::ostream& operator<<(std::ostream& os, const GpuIpcMemHandle::TypeFlags& typ
return os;
}
[[maybe_unused]] static bool isFabricMemHandleAvailable() {
bool isFabricMemHandleAvailable() {
#if (CUDA_NVLS_API_AVAILABLE)
static int resultCache = -1; // -1: uninitialized, 0: not available, 1: available
if (resultCache != -1) {
@@ -283,11 +284,19 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle)
THROW(GPU, Error, ErrorCode::InvalidUsage, "GpuIpcMemHandle type is None, cannot create GpuIpcMem");
}
if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::Fabric)) {
if (cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC) ==
CUDA_SUCCESS) {
CUresult res =
cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC);
if (res == CUDA_SUCCESS) {
// Ignore allocHandle in the handle struct since it is process-local and not transferable across processes.
handle_.fabric.allocHandle = {};
type_ = GpuIpcMemHandle::Type::Fabric;
} else {
const char* errStr = nullptr;
(void)cuGetErrorString(res, &errStr);
const std::string errMsg = errStr ? std::string(errStr) : std::string("unknown CUDA error");
WARN(GPU, "Fabric IPC handle import failed (", errMsg,
"); cross-node CudaIpc requires NVIDIA MNNVL hardware and a running IMEX service. ",
"Falling back to other handle types if available.");
}
}
if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::PosixFd)) {
@@ -303,7 +312,17 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle)
type_ = GpuIpcMemHandle::Type::RuntimeIpc;
}
if (type_ == GpuIpcMemHandle::Type::None) {
THROW(GPU, Error, ErrorCode::Aborted, "Failed to open GpuIpcMemHandle (type: ", handle_.typeFlags, ")");
const bool fabricOnly = (handle_.typeFlags == GpuIpcMemHandle::Type::Fabric);
const std::string hint = fabricOnly
? std::string(
"The remote rank sent only a Fabric (MNNVL) handle, but this rank could not "
"import it. Check that the IMEX daemon is running on both nodes and that the "
"GPUs share an NVLink fabric.")
: std::string(
"All handle types failed to import; check IMEX service and POSIX FD socket "
"availability.");
THROW(GPU, Error, ErrorCode::Aborted, "Failed to open GpuIpcMemHandle (offered types: ", handle_.typeFlags, "). ",
hint);
}
}

View File

@@ -36,6 +36,18 @@ inline void requireGdrForIbMode(IbMode mode, mscclpp::Transport ibTransport) {
#define REQUIRE_GDR_FOR_IB_MODE(mode) // No extra requirements on non-CUDA platforms.
#endif
// Skip an IPC-only PortChannel test (useIPC=true, useIB=false, useEthernet=false) when CudaIpc
// cannot connect this rank pair. CudaIpc works intra-node always, and cross-node only on MNNVL
// systems (GB200 NVL72 + IMEX). The test is skipped only when BOTH conditions fail, i.e. there
// are fewer than 2 ranks per node AND fabric (MNNVL) handles are not usable on this system.
//
// The skip message is built from adjacent string literals rather than a backslash-newline inside
// a single literal: a line splice inside the quotes would embed the continuation line's leading
// indentation in the runtime message.
#define REQUIRE_CUDA_IPC_AVAILABLE                                                    \
  do {                                                                                \
    if (gEnv->nRanksPerNode < 2 && !mscclpp::isFabricMemHandleAvailable()) {          \
      SKIP_TEST() << "CudaIpc requires intra-node ranks (nRanksPerNode>=2) or MNNVL " \
                     "fabric handles, both unavailable here.";                        \
    }                                                                                 \
  } while (0)
void PortChannelOneToOneTest::SetUp() {
// Use only two ranks
setNumRanksToUse(2);
@@ -71,7 +83,10 @@ void PortChannelOneToOneTest::setupMeshConnections(std::vector<mscclpp::PortChan
continue;
}
mscclpp::EndpointConfig cfg;
if ((rankToNode(r) == rankToNode(gEnv->rank)) && useIPC) {
if (useIPC) {
// CudaIpc works intra-node always, and cross-node on MNNVL systems (GB200 NVL72 + IMEX)
// via fabric handles. Tests that exercise CudaIpc across nodes on non-MNNVL hardware should
// gate themselves with REQUIRE_CUDA_IPC_AVAILABLE; we always request CudaIpc here when asked.
cfg.transport = mscclpp::Transport::CudaIpc;
} else if (useIb) {
cfg.transport = ibTransport;
@@ -262,6 +277,7 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
}
// CudaIpc-only ping-pong (no IB, no Ethernet), without polling waits.
// Skipped via REQUIRE_CUDA_IPC_AVAILABLE when CudaIpc cannot connect the rank pair.
TEST(PortChannelOneToOneTest, PingPong) {
  REQUIRE_CUDA_IPC_AVAILABLE;
  const PingPongTestParams ipcOnlyParams{
      .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default};
  testPingPong(ipcOnlyParams);
}
@@ -279,6 +295,7 @@ TEST(PortChannelOneToOneTest, PingPongEthernet) {
}
// CudaIpc-only ping-pong (no IB, no Ethernet) with waitWithPoll enabled.
// Skipped via REQUIRE_CUDA_IPC_AVAILABLE when CudaIpc cannot connect the rank pair.
TEST(PortChannelOneToOneTest, PingPongWithPoll) {
  REQUIRE_CUDA_IPC_AVAILABLE;
  const PingPongTestParams ipcPollParams{
      .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Default};
  testPingPong(ipcPollParams);
}
@@ -291,6 +308,7 @@ TEST(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) {
}
// Performance variant of the CudaIpc-only ping-pong (no IB, no Ethernet, no polling waits).
// Skipped via REQUIRE_CUDA_IPC_AVAILABLE when CudaIpc cannot connect the rank pair.
PERF_TEST(PortChannelOneToOneTest, PingPongPerf) {
  REQUIRE_CUDA_IPC_AVAILABLE;
  const PingPongTestParams ipcOnlyParams{
      .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default};
  testPingPongPerf(ipcOnlyParams);
}
@@ -482,7 +500,10 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode)
proxyService->stopProxy();
}
TEST(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); }
TEST(PortChannelOneToOneTest, PacketPingPong) {
REQUIRE_CUDA_IPC_AVAILABLE;
testPacketPingPong(false, IbMode::Default);
}
TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
REQUIRE_IBVERBS;
@@ -490,7 +511,10 @@ TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
testPacketPingPong(true, IbMode::Host);
}
PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); }
PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerf) {
REQUIRE_CUDA_IPC_AVAILABLE;
testPacketPingPongPerf(false, IbMode::Default);
}
PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
REQUIRE_IBVERBS;
@@ -583,6 +607,7 @@ void PortChannelOneToOneTest::testBandwidth(PingPongTestParams params) {
}
// Bandwidth measurement over CudaIpc only (no IB, no Ethernet, no polling waits).
// Skipped via REQUIRE_CUDA_IPC_AVAILABLE when CudaIpc cannot connect the rank pair.
PERF_TEST(PortChannelOneToOneTest, Bandwidth) {
  REQUIRE_CUDA_IPC_AVAILABLE;
  const PingPongTestParams ipcOnlyParams{
      .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default};
  testBandwidth(ipcOnlyParams);
}