Fix use-after-free for fabric allocation handle in GpuIpcMemHandle (#764)

## Summary

Fix a use-after-free where the CUDA allocation handle
(`CUmemGenericAllocationHandle`) was released prematurely while the
exported fabric handle still referenced it.

## Problem

Unlike POSIX FD handles (where the kernel keeps the allocation alive via
the open file descriptor), fabric handles do not hold their own
reference to the underlying allocation. The original code called
`cuMemRelease(allocHandle)` immediately after exporting the fabric
handle, freeing the allocation. When a remote process later calls
`cuMemImportFromShareableHandle` with that fabric handle, it references
a freed allocation — a **use-after-free**.

This affected both code paths:

1. **`GpuIpcMemHandle::create()`**: The local `allocHandle` obtained via
`cuMemRetainAllocationHandle` was released right after fabric export,
leaving the fabric handle dangling.
2. **`GpuIpcMemHandle::createMulticast()`**: The `allocHandle` from
`cuMulticastCreate` was unconditionally released, even when it was the
only thing keeping the multicast object alive for the fabric handle.

## Fix

- **Added `allocHandle` field** to the `fabric` struct in
`GpuIpcMemHandle` to store the allocation handle and keep it alive for
the lifetime of the `GpuIpcMemHandle`.
- **`create()`**: Retain an additional reference via
`cuMemRetainAllocationHandle` and store it in `fabric.allocHandle` when
a fabric handle is successfully exported.
- **`createMulticast()`**: Store the `allocHandle` directly in
`fabric.allocHandle` instead of unconditionally releasing it. The local
`allocHandle` is now released only when no fabric handle was exported.
- **`deleter()`**: Release `fabric.allocHandle` via `cuMemRelease` when
the handle type includes `Fabric` and the stored handle is non-zero,
ensuring proper cleanup.
- **`GpuIpcMem` constructor (importer side)**: Clear
`fabric.allocHandle` after importing, since the importer gets its own
handle via `cuMemImportFromShareableHandle` and should not release the
exporter's allocation handle.

## Files Changed

- `src/core/include/gpu_ipc_mem.hpp` — Added
`CUmemGenericAllocationHandle allocHandle` to fabric struct.
- `src/core/gpu_ipc_mem.cc` — Retain/release allocation handle properly
across create, createMulticast, deleter, and importer paths.
This commit is contained in:
Binyang Li
2026-03-19 11:52:09 -07:00
committed by GitHub
parent bf946ea51e
commit 5d18835417
2 changed files with 16 additions and 3 deletions

View File

@@ -140,6 +140,11 @@ void GpuIpcMemHandle::deleter(GpuIpcMemHandle* handle) {
UnixSocketServer::instance().unregisterFd(handle->posixFd.fd);
::close(handle->posixFd.fd);
}
if (handle->typeFlags & GpuIpcMemHandle::Type::Fabric) {
if (handle->fabric.allocHandle != 0) {
cuMemRelease(handle->fabric.allocHandle);
}
}
delete handle;
}
}
@@ -148,6 +153,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) {
auto handle = UniqueGpuIpcMemHandle(new GpuIpcMemHandle(), &GpuIpcMemHandle::deleter);
handle->typeFlags = GpuIpcMemHandle::Type::None;
handle->posixFd.fd = -1;
handle->fabric.allocHandle = {};
CUdeviceptr basePtr;
size_t sz;
@@ -189,6 +195,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) {
// FABRIC handle
if (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle, CU_MEM_HANDLE_TYPE_FABRIC, 0) ==
CUDA_SUCCESS) {
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&(handle->fabric.allocHandle), (void*)basePtr));
handle->typeFlags |= GpuIpcMemHandle::Type::Fabric;
}
@@ -232,6 +239,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b
handle->offsetFromBase = 0;
handle->typeFlags = GpuIpcMemHandle::Type::None;
handle->posixFd.fd = -1;
handle->fabric.allocHandle = {};
// POSIX FD handle
int fileDesc;
@@ -246,6 +254,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b
if (isFabricAvailable && (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle,
CU_MEM_HANDLE_TYPE_FABRIC, 0) == CUDA_SUCCESS)) {
handle->typeFlags |= GpuIpcMemHandle::Type::Fabric;
handle->fabric.allocHandle = allocHandle;
}
if (handle->typeFlags == GpuIpcMemHandle::Type::None) {
@@ -253,9 +262,10 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b
THROW(GPU, Error, ErrorCode::SystemError, "createMulticast failed: neither POSIX FD nor FABRIC handle was created");
}
// Release the local allocation handle. The exported POSIX FD / Fabric handle keeps the
// multicast object alive. Each importer will get its own handle via cuMemImportFromShareableHandle.
MSCCLPP_CUTHROW(cuMemRelease(allocHandle));
// Only release allocHandle if it is not stored in fabric.allocHandle.
if (!(handle->typeFlags & GpuIpcMemHandle::Type::Fabric)) {
MSCCLPP_CUTHROW(cuMemRelease(allocHandle));
}
return handle;
#else // !(CUDA_NVLS_API_AVAILABLE)
THROW(GPU, Error, ErrorCode::InvalidUsage,
@@ -275,6 +285,8 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle)
if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::Fabric)) {
if (cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC) ==
CUDA_SUCCESS) {
// Ignore allocHandle in the handle struct since it is process-local and not transferable across processes.
handle_.fabric.allocHandle = {};
type_ = GpuIpcMemHandle::Type::Fabric;
}
}

View File

@@ -44,6 +44,7 @@ struct GpuIpcMemHandle {
struct {
char handle[64];
CUmemGenericAllocationHandle allocHandle;
} fabric;
static void deleter(GpuIpcMemHandle* handle);