Fix use-after-free for fabric allocation handle in GpuIpcMemHandle (#764)

## Summary

Fix a use-after-free where the CUDA allocation handle (`CUmemGenericAllocationHandle`) was released prematurely while the exported fabric handle still referenced it.

## Problem

Unlike POSIX FD handles (where the kernel keeps the allocation alive via the open file descriptor), fabric handles do not hold their own reference to the underlying allocation. The original code called `cuMemRelease(allocHandle)` immediately after exporting the fabric handle, freeing the allocation. When a remote process later tries to `cuMemImportFromShareableHandle` using that fabric handle, it references a freed allocation — a **use-after-free**.

This affected both code paths:

1. **`GpuIpcMemHandle::create()`**: The local `allocHandle` obtained via `cuMemRetainAllocationHandle` was released right after fabric export, leaving the fabric handle dangling.
2. **`GpuIpcMemHandle::createMulticast()`**: The `allocHandle` from `cuMulticastCreate` was unconditionally released, even when it was the only thing keeping the multicast object alive for the fabric handle.

## Fix

- **Added `allocHandle` field** to the `fabric` struct in `GpuIpcMemHandle` to store the allocation handle and keep it alive for the lifetime of the `GpuIpcMemHandle`.
- **`create()`**: Retain an additional reference via `cuMemRetainAllocationHandle` and store it in `fabric.allocHandle` when a fabric handle is successfully exported.
- **`createMulticast()`**: Store the `allocHandle` directly in `fabric.allocHandle` instead of unconditionally releasing it. Only release if fabric export was not used.
- **`deleter()`**: Release `fabric.allocHandle` via `cuMemRelease` when the handle type includes `Fabric`, ensuring proper cleanup.
- **`GpuIpcMem` constructor (importer side)**: Clear `fabric.allocHandle` after importing, since the importer gets its own handle via `cuMemImportFromShareableHandle` and should not release the exporter's allocation handle.

## Files Changed

- `src/core/include/gpu_ipc_mem.hpp` — Added `CUmemGenericAllocationHandle allocHandle` to fabric struct.
- `src/core/gpu_ipc_mem.cc` — Retain/release allocation handle properly across create, createMulticast, deleter, and importer paths.
2026-04-19 22:39:11 +00:00 · 2026-03-19 11:52:09 -07:00
parent bf946ea51e
commit 5d18835417
2 changed files with 16 additions and 3 deletions
--- a/src/core/gpu_ipc_mem.cc
+++ b/src/core/gpu_ipc_mem.cc
@@ -140,6 +140,11 @@ void GpuIpcMemHandle::deleter(GpuIpcMemHandle* handle) {
      UnixSocketServer::instance().unregisterFd(handle->posixFd.fd);
      ::close(handle->posixFd.fd);
    }
+    if (handle->typeFlags & GpuIpcMemHandle::Type::Fabric) {
+      if (handle->fabric.allocHandle != 0) {
+        cuMemRelease(handle->fabric.allocHandle);
+      }
+    }
    delete handle;
  }
 }
@@ -148,6 +153,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) {
  auto handle = UniqueGpuIpcMemHandle(new GpuIpcMemHandle(), &GpuIpcMemHandle::deleter);
  handle->typeFlags = GpuIpcMemHandle::Type::None;
  handle->posixFd.fd = -1;
+  handle->fabric.allocHandle = {};

  CUdeviceptr basePtr;
  size_t sz;
@@ -189,6 +195,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) {
  // FABRIC handle
  if (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle, CU_MEM_HANDLE_TYPE_FABRIC, 0) ==
      CUDA_SUCCESS) {
+    MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&(handle->fabric.allocHandle), (void*)basePtr));
    handle->typeFlags |= GpuIpcMemHandle::Type::Fabric;
  }

@@ -232,6 +239,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b
  handle->offsetFromBase = 0;
  handle->typeFlags = GpuIpcMemHandle::Type::None;
  handle->posixFd.fd = -1;
+  handle->fabric.allocHandle = {};

  // POSIX FD handle
  int fileDesc;
@@ -246,6 +254,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b
  if (isFabricAvailable && (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle,
                                                         CU_MEM_HANDLE_TYPE_FABRIC, 0) == CUDA_SUCCESS)) {
    handle->typeFlags |= GpuIpcMemHandle::Type::Fabric;
+    handle->fabric.allocHandle = allocHandle;
  }

  if (handle->typeFlags == GpuIpcMemHandle::Type::None) {
@@ -253,9 +262,10 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b
    THROW(GPU, Error, ErrorCode::SystemError, "createMulticast failed: neither POSIX FD nor FABRIC handle was created");
  }

-  // Release the local allocation handle. The exported POSIX FD / Fabric handle keeps the
-  // multicast object alive. Each importer will get its own handle via cuMemImportFromShareableHandle.
-  MSCCLPP_CUTHROW(cuMemRelease(allocHandle));
+  // Only release allocHandle if it is not stored in fabric.allocHandle.
+  if (!(handle->typeFlags & GpuIpcMemHandle::Type::Fabric)) {
+    MSCCLPP_CUTHROW(cuMemRelease(allocHandle));
+  }
  return handle;
 #else   // !(CUDA_NVLS_API_AVAILABLE)
  THROW(GPU, Error, ErrorCode::InvalidUsage,
@@ -275,6 +285,8 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle)
  if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::Fabric)) {
    if (cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC) ==
        CUDA_SUCCESS) {
+      // Ignore allocHandle in the handle struct since it is process-local and not transferable across processes.
+      handle_.fabric.allocHandle = {};
      type_ = GpuIpcMemHandle::Type::Fabric;
    }
  }
--- a/src/core/include/gpu_ipc_mem.hpp
+++ b/src/core/include/gpu_ipc_mem.hpp
@@ -44,6 +44,7 @@ struct GpuIpcMemHandle {

  struct {
    char handle[64];
+    CUmemGenericAllocationHandle allocHandle;
  } fabric;

  static void deleter(GpuIpcMemHandle* handle);