Expose NVLS multicast granularity option for GpuBuffer (#815)

Add a public GpuBufferGranularity enum (MultiCastMinimum, MultiCastRecommended) and let GpuBuffer choose the NVLS multicast allocation granularity via a constructor argument. The argument defaults to MultiCastMinimum to minimize memory usage. Expose the same option through the C++ and Python (nanobind) APIs.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-29 02:47:23 +00:00 · 2026-06-04 13:16:18 -07:00
parent c9f8be64bb
commit 7c390fffd6
5 changed files with 39 additions and 7 deletions
--- a/python/csrc/gpu_utils_py.cpp
+++ b/python/csrc/gpu_utils_py.cpp
@@ -114,8 +114,13 @@ static nb::capsule toDlpack(GpuBuffer<char> buffer, std::string dataType, std::v
 void register_gpu_utils(nb::module_& m) {
  m.def("is_nvls_supported", &isNvlsSupported);

+  nb::enum_<GpuBufferGranularity>(m, "CppGpuBufferGranularity")
+      .value("MultiCastMinimum", GpuBufferGranularity::MultiCastMinimum)
+      .value("MultiCastRecommended", GpuBufferGranularity::MultiCastRecommended);
+
  nb::class_<GpuBuffer<char>>(m, "CppRawGpuBuffer")
-      .def(nb::init<size_t>(), nb::arg("nelems"))
+      .def(nb::init<size_t, GpuBufferGranularity>(), nb::arg("nelems"),
+           nb::arg("granularity") = GpuBufferGranularity::MultiCastMinimum)
      .def("nelems", &GpuBuffer<char>::nelems)
      .def("bytes", &GpuBuffer<char>::bytes)
      .def("data", [](GpuBuffer<char>& self) { return reinterpret_cast<uintptr_t>(self.data()); })
--- a/python/mscclpp/__init__.py
+++ b/python/mscclpp/__init__.py
@@ -100,6 +100,7 @@ __all__ = [
    "AlgorithmCollection",
    "CommGroup",
    "GpuBuffer",
+    "GpuBufferGranularity",
 ]


--- a/python/mscclpp/_core/buffer.py
+++ b/python/mscclpp/_core/buffer.py
@@ -6,14 +6,21 @@ from typing import Union, Tuple

 import cupy as cp
 import numpy as np
-from mscclpp._mscclpp import CppRawGpuBuffer
+from mscclpp._mscclpp import CppRawGpuBuffer, CppGpuBufferGranularity

-__all__ = ["GpuBuffer"]
+__all__ = ["GpuBuffer", "GpuBufferGranularity"]
+
+GpuBufferGranularity = CppGpuBufferGranularity


 class GpuBuffer(cp.ndarray):
    def __new__(
-        cls, shape: Union[int, Tuple[int]], dtype: cp.dtype = float, strides: Tuple[int] = None, order: str = "C"
+        cls,
+        shape: Union[int, Tuple[int]],
+        dtype: cp.dtype = float,
+        strides: Tuple[int] = None,
+        order: str = "C",
+        granularity: CppGpuBufferGranularity = CppGpuBufferGranularity.MultiCastMinimum,
    ):
        # Check if `shape` is valid
        if isinstance(shape, int):
@@ -25,6 +32,6 @@ class GpuBuffer(cp.ndarray):
        if any(s <= 0 for s in shape):
            raise ValueError("Shape must be positive.")
        # Create the buffer
-        buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
+        buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize, granularity)
        memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0)
        return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)