Support cross-node CudaIPC

This commit is contained in:
Qinghua Zhou
2026-03-21 10:41:32 +00:00
parent 9ef1fb7cee
commit 7e1cb7b8cf
5 changed files with 100 additions and 19 deletions

View File

@@ -62,6 +62,7 @@ void register_algorithm(nb::module_& m) {
.def_prop_ro("buffer_mode", &Algorithm::bufferMode)
.def_prop_ro("constraint", &Algorithm::constraint)
.def_prop_ro("type", &Algorithm::type)
.def("reset", &Algorithm::reset)
.def(
"execute",
[](Algorithm& self, std::shared_ptr<Communicator> comm, uintptr_t input, uintptr_t output,

View File

@@ -239,6 +239,10 @@ class MscclppAlltoAllV:
# Fast path: skip GPU copies + bootstrap exchange if split sizes unchanged
splits_key = (tuple(send_counts_bytes), tuple(recv_counts_bytes))
if splits_key != self._cached_splits_key:
# Clear cached contexts to free RegisteredMemory for old (possibly freed) tensors.
# Without this, stale CUDA IPC handles accumulate and eventually SIGSEGV.
if hasattr(self._algo, 'reset'):
self._algo.reset()
# Copy counts/displacements to GPU
self._d_send_counts.copy_(torch.tensor(send_counts_bytes, dtype=torch.int64))
self._d_send_displs.copy_(torch.tensor(send_displs_bytes, dtype=torch.int64))
@@ -259,16 +263,24 @@ class MscclppAlltoAllV:
stream = torch.cuda.current_stream()
cuda_stream = stream.cuda_stream
input_size = self._cached_input_size
output_size = self._cached_output_size
# Use the full underlying storage size (not just the view's active data)
# for the context key, so that reusing views of the same tensor with
# different split sizes doesn't create new contexts (which leak
# RegisteredMemory for stale buffers).
try:
input_alloc_size = input.untyped_storage().size()
output_alloc_size = output.untyped_storage().size()
except Exception:
input_alloc_size = input.nelement() * input.element_size()
output_alloc_size = output.nelement() * output.element_size()
# Execute the optimized kernel
result = self._algo.execute(
self._comm,
input.data_ptr(),
output.data_ptr(),
input_size,
output_size,
input_alloc_size,
output_alloc_size,
_torch_dtype_to_mscclpp(dtype),
ReduceOp.NOP,
cuda_stream,

View File

@@ -92,19 +92,31 @@ def main():
# otherwise gloo avoids IB configuration issues on some clusters.
# Set ALLTOALLV_BACKEND=nccl to enable torch baseline comparison.
backend = os.environ.get("ALLTOALLV_BACKEND", "gloo")
# For multi-node: detect a routable IP instead of 127.0.0.1
# For multi-node: MASTER_ADDR must be set to rank 0's routable IP.
# Single-node auto-detects; multi-node requires it from the launcher.
if "MASTER_ADDR" not in os.environ:
if rank == 0:
os.environ["MASTER_ADDR"] = _get_routable_ip()
else:
# Non-zero ranks: MASTER_ADDR must be set externally for multi-node
os.environ["MASTER_ADDR"] = "127.0.0.1"
# Check if we're single-node (all ranks on same host)
n_gpus = torch.cuda.device_count()
if world_size <= n_gpus:
# Likely single-node; 127.0.0.1 works
os.environ["MASTER_ADDR"] = "127.0.0.1"
else:
raise RuntimeError(
f"Rank {rank}: MASTER_ADDR not set for multi-node run "
f"(world_size={world_size} > local GPUs={n_gpus}). "
f"Set it in your launcher, e.g.:\n"
f" mpirun -x MASTER_ADDR=<node0_ip> -x MASTER_PORT=29500 ..."
)
os.environ.setdefault("MASTER_PORT", "29500")
os.environ["RANK"] = str(rank)
os.environ["WORLD_SIZE"] = str(world_size)
if backend == "nccl":
dist.init_process_group(backend="nccl", rank=rank, world_size=world_size,
device_id=torch.device(f"cuda:{local_rank}"))
# Don't use device_id= eager init — it triggers an immediate NCCL allreduce
# that fails on some platforms (e.g. GB200 with NCCL 2.28.9).
dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
else:
dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
@@ -341,6 +353,13 @@ def main():
msg_sizes = [1 << s for s in range(10, 28) if s % 2 == 0]
msg_sizes.append(128 * 1024 * 1024)
# Pre-compute max split sizes across all sweep iterations to allocate
# fixed-size tensors. Reusing the same tensors keeps the NativeAlgorithm
# context key stable (same ptrs + sizes) and avoids the context cache
# leak that causes SIGSEGV when stale RegisteredMemory accumulates.
max_in_elems = 0
max_out_elems = 0
sweep_params = [] # (avg_msg_size, in_splits, out_splits)
for avg_msg_size in msg_sizes:
random.seed(12345)
avg_elems = avg_msg_size // 4
@@ -348,19 +367,27 @@ def main():
for i in range(world_size):
row = [max(1, int(avg_elems * (0.5 + random.random()))) for _ in range(world_size)]
send_matrix.append(row)
in_splits = send_matrix[rank]
out_splits = [send_matrix[j][rank] for j in range(world_size)]
max_in_elems = max(max_in_elems, sum(in_splits))
max_out_elems = max(max_out_elems, sum(out_splits))
sweep_params.append((avg_msg_size, in_splits, out_splits))
inp = torch.randn(sum(in_splits), dtype=torch.float32, device='cuda')
out = torch.empty(sum(out_splits), dtype=torch.float32, device='cuda')
# Allocate once at max size
inp = torch.randn(max_in_elems, dtype=torch.float32, device='cuda')
out = torch.empty(max_out_elems, dtype=torch.float32, device='cuda')
for avg_msg_size, in_splits, out_splits in sweep_params:
n_warmup = 3 if avg_msg_size >= 16 * 1024 * 1024 else 5
n_iters = 5 if avg_msg_size >= 64 * 1024 * 1024 else (10 if avg_msg_size >= 4 * 1024 * 1024 else 20)
m_lat, m_bw = bench_alltoallv(mscclpp_fn, inp, out, in_splits, out_splits, n_warmup, n_iters)
# Use views into the fixed buffers (same data_ptr → same context key)
inp_view = inp[:sum(in_splits)]
out_view = out[:sum(out_splits)]
m_lat, m_bw = bench_alltoallv(mscclpp_fn, inp_view, out_view, in_splits, out_splits, n_warmup, n_iters)
if use_torch_baseline:
t_lat, t_bw = bench_alltoallv(torch_fn, inp, out, in_splits, out_splits, n_warmup, n_iters)
t_lat, t_bw = bench_alltoallv(torch_fn, inp_view, out_view, in_splits, out_splits, n_warmup, n_iters)
print_row(fmt_size(avg_msg_size), m_lat, m_bw, t_lat, t_bw)
else:
print_row(fmt_size(avg_msg_size), m_lat, m_bw)

View File

@@ -164,8 +164,46 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
this->remoteMemMap = gpuIpcMem->map();
this->data = this->remoteMemMap.get();
} else if (transports.has(Transport::CudaIpc) && getHostHash() != this->hostHash) {
WARN(GPU, "Skipping CudaIpc map for cross-node peer (local hostHash=", getHostHash(),
", remote hostHash=", this->hostHash, ")");
// Cross-node CudaIpc: try available handle types in order of preference.
// On GB200 NVSwitch, both Fabric and RuntimeIpc handles work cross-node.
// On H100 (no NVSwitch across nodes), none of these will work.
auto entry = getTransportInfo(Transport::CudaIpc);
bool mapped = false;
// 1) Try Fabric handle first (works on any NVSwitch-connected system)
if (!mapped && (entry.gpuIpcMemHandle.typeFlags & GpuIpcMemHandle::Type::Fabric)) {
GpuIpcMemHandle fabricOnlyHandle = entry.gpuIpcMemHandle;
fabricOnlyHandle.typeFlags = GpuIpcMemHandle::Type::Fabric;
try {
auto gpuIpcMem = GpuIpcMem::create(fabricOnlyHandle);
this->remoteMemMap = gpuIpcMem->map();
this->data = this->remoteMemMap.get();
mapped = true;
INFO(GPU, "Mapped cross-node CudaIpc memory via Fabric handle at pointer ", this->data);
} catch (const std::exception& e) {
INFO(GPU, "Fabric handle mapping failed (will try RuntimeIpc): ", e.what());
}
}
// 2) Try RuntimeIpc handle (cudaIpcOpenMemHandle — works on GB200 NVSwitch cross-node)
if (!mapped && (entry.gpuIpcMemHandle.typeFlags & GpuIpcMemHandle::Type::RuntimeIpc)) {
GpuIpcMemHandle runtimeOnlyHandle = entry.gpuIpcMemHandle;
runtimeOnlyHandle.typeFlags = GpuIpcMemHandle::Type::RuntimeIpc;
try {
auto gpuIpcMem = GpuIpcMem::create(runtimeOnlyHandle);
this->remoteMemMap = gpuIpcMem->map();
this->data = this->remoteMemMap.get();
mapped = true;
INFO(GPU, "Mapped cross-node CudaIpc memory via RuntimeIpc handle at pointer ", this->data);
} catch (const std::exception& e) {
INFO(GPU, "RuntimeIpc handle mapping failed for cross-node peer: ", e.what());
}
}
if (!mapped) {
WARN(GPU, "Skipping CudaIpc map for cross-node peer (all handle types failed, local hostHash=",
getHostHash(), ", remote hostHash=", this->hostHash, ")");
}
}
if (this->data != nullptr) {
INFO(GPU, "Opened CUDA IPC handle at pointer ", this->data);

View File

@@ -90,11 +90,14 @@ void AlltoallvFullmesh::initialize(std::shared_ptr<Communicator> comm) {
int nRanksPerNode = comm->bootstrap()->getNranksPerNode();
int localGpuIdx = rank % nRanksPerNode;
// Use hybrid connections: CudaIpc for intra-node, IB for inter-node
bool hasIB = getIBDeviceCount() > 0;
// Use hybrid connections: CudaIpc for intra-node, IB for inter-node.
// On systems where CudaIpc works across nodes (e.g. GB200 NVSwitch),
// set MSCCLPP_FORCE_CUDAIPC=1 to skip IB and use CudaIpc for all peers.
const char* forceCudaIpc = std::getenv("MSCCLPP_FORCE_CUDAIPC");
bool useIB = (getIBDeviceCount() > 0) && !(forceCudaIpc && std::string(forceCudaIpc) == "1");
bool isMultiNode = (worldSize_ > nRanksPerNode);
if (hasIB && isMultiNode) {
if (useIB && isMultiNode) {
this->conns_ = setupHybridConnections(comm, localGpuIdx);
// Check if any connections are actually inter-node
hasRemotePeers_ = false;