address flagBuffer ownership issue (#749)

This pull request updates the handling of the default flag buffer in the
C++ and Python bindings to ensure proper memory management when
interfacing with Python.

Ensure the buffer is not deallocated when transferring ownership
from C++ to Python
This commit is contained in:
Binyang Li
2026-02-20 13:42:29 -08:00
committed by GitHub
parent 4701ae3a95
commit 39865c218b
6 changed files with 39 additions and 22 deletions

View File

@@ -366,7 +366,7 @@ class AlgorithmCollection {
/// Get a default GPU flag buffer (allocated once and reused).
/// @return A pair of (shared_ptr to the flag buffer, size in bytes).
std::pair<std::shared_ptr<void>, size_t> getDefaultFlagBuffer();
std::pair<std::shared_ptr<void>, size_t> getFlagBuffer();
} // namespace mscclpp

View File

@@ -116,10 +116,15 @@ void register_algorithm(nb::module_& m) {
.def("buffer_mode", &CollectiveRequest::bufferMode);
m.def(
"cpp_get_default_flag_buffer",
"cpp_get_flag_buffer",
[]() {
auto [buffer, size] = getDefaultFlagBuffer();
return std::make_pair(reinterpret_cast<uintptr_t>(buffer.get()), size);
auto [buffer, size] = getFlagBuffer();
uintptr_t ptr = reinterpret_cast<uintptr_t>(buffer.get());
// Transfer shared_ptr ownership into a capsule so Python's GC manages the lifetime.
auto prevent = std::make_unique<std::shared_ptr<void>>(std::move(buffer));
nb::capsule owner(prevent.get(), [](void* p) noexcept { delete static_cast<std::shared_ptr<void>*>(p); });
prevent.release(); // capsule now owns the pointer
return nb::make_tuple(ptr, size, owner);
},
"Get the default flag buffer. Returns a tuple of (buffer_ptr, buffer_size).");
"Get the default flag buffer. Returns a tuple of (buffer_ptr, buffer_size, owner).");
}

View File

@@ -19,7 +19,7 @@ from mscclpp._mscclpp import (
CppReduceOp,
CppAlgorithmBuilder,
CppAlgorithmCollection,
cpp_get_default_flag_buffer,
cpp_get_flag_buffer,
)
__all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"]
@@ -241,15 +241,22 @@ class AlgorithmCollection:
self._algorithms.append(algorithm)
def get_default_flag_buffer() -> cp.ndarray:
_flag_buffer_cache = None
def get_flag_buffer() -> cp.ndarray:
"""Get the default flag buffer for algorithm selection.
This buffer is used internally by default algorithms to store selection flags.
It is allocated as a shared GPU buffer and can be accessed from Python.
The result is cached so all callers share the same buffer.
Returns:
A CuPy array representing the flag buffer on the GPU.
"""
buffer_ptr, buffer_size = cpp_get_default_flag_buffer()
memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer_ptr, buffer_size, None), 0)
return cp.ndarray((buffer_size // 4,), dtype=cp.uint32, memptr=memptr)
global _flag_buffer_cache
if _flag_buffer_cache is None:
buffer_ptr, buffer_size, owner = cpp_get_flag_buffer()
memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer_ptr, buffer_size, owner), 0)
_flag_buffer_cache = cp.ndarray((buffer_size // 4,), dtype=cp.uint32, memptr=memptr)
return _flag_buffer_cache

View File

@@ -3,7 +3,7 @@
from __future__ import annotations
from typing import Union
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection, get_default_flag_buffer
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection, get_flag_buffer
import atexit
from mscclpp._mscclpp import CppAlgorithmCollectionBuilder
@@ -58,7 +58,7 @@ class AlgorithmCollectionBuilder:
rank: int,
) -> AlgorithmCollection:
if self._flag_buffer is None:
self._flag_buffer = get_default_flag_buffer()
self._flag_buffer = get_flag_buffer()
native_collection = self._builder.build_default_algorithms(
int(scratch_buffer), scratch_buffer_size, self._flag_buffer.data.ptr, self._flag_buffer.nbytes, rank
)

View File

@@ -199,18 +199,23 @@ std::shared_ptr<Algorithm> DslAlgorithm::build() { return shared_from_this(); }
// TODO: implement this
void DslAlgorithm::reset() {}
static std::weak_ptr<uint32_t> gDefaultFlagBuffer;
static uint32_t* gDefaultFlagBuffer = nullptr;
static std::weak_ptr<void> gDefaultFlagBufferWeak;
static size_t gDefaultFlagCount = 128;
std::pair<std::shared_ptr<void>, size_t> getDefaultFlagBuffer() {
std::shared_ptr<uint32_t> flagBuffer = gDefaultFlagBuffer.lock();
if (!flagBuffer) {
flagBuffer = mscclpp::detail::gpuCallocShared<uint32_t>(gDefaultFlagCount);
std::vector<uint32_t> initFlags(gDefaultFlagCount, 1);
mscclpp::gpuMemcpy(flagBuffer.get(), initFlags.data(), gDefaultFlagCount, cudaMemcpyHostToDevice);
gDefaultFlagBuffer = flagBuffer;
std::pair<std::shared_ptr<void>, size_t> getFlagBuffer() {
auto ptr = gDefaultFlagBufferWeak.lock();
if (!ptr) {
if (!gDefaultFlagBuffer) {
// Intentionally never freed — CUDA driver reclaims GPU memory at process exit.
gDefaultFlagBuffer = static_cast<uint32_t*>(mscclpp::detail::gpuCalloc(gDefaultFlagCount * sizeof(uint32_t)));
std::vector<uint32_t> initFlags(gDefaultFlagCount, 1);
mscclpp::gpuMemcpy(gDefaultFlagBuffer, initFlags.data(), gDefaultFlagCount, cudaMemcpyHostToDevice);
}
ptr = std::shared_ptr<void>(gDefaultFlagBuffer, [](void*) {});
gDefaultFlagBufferWeak = ptr;
}
return {flagBuffer, gDefaultFlagCount * sizeof(uint32_t)};
return {ptr, gDefaultFlagCount * sizeof(uint32_t)};
}
} // namespace mscclpp

View File

@@ -294,7 +294,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
commPtr->scratchBuffer_ = mscclpp::GpuBuffer<char>(commPtr->scratchBufferSize_).memory();
commPtr->executor = std::make_shared<mscclpp::Executor>(mscclppComm, commPtr->scratchBuffer_);
auto [buffer, size] = mscclpp::getDefaultFlagBuffer();
auto [buffer, size] = mscclpp::getFlagBuffer();
commPtr->flagBuffer_ = buffer;
commPtr->flagBufferSize_ = size;