From 7c390fffd607bcd11d822428afcd2f992c017ca1 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 4 Jun 2026 13:16:18 -0700 Subject: [PATCH] Expose NVLS multicast granularity option for GpuBuffer (#815) Add a public Granularity enum (MultiCastMinimum, MultiCastRecommended) and let GpuBuffer choose the NVLS multicast allocation granularity via a constructor argument, defaulting to MultiCastMinimum to minimize memory usage. Expose the same option through the C++ and Python (nanobind) APIs. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/cpp_api.rst | 2 ++ include/mscclpp/gpu_utils.hpp | 21 +++++++++++++++++++-- python/csrc/gpu_utils_py.cpp | 7 ++++++- python/mscclpp/__init__.py | 1 + python/mscclpp/_core/buffer.py | 15 +++++++++++---- 5 files changed, 39 insertions(+), 7 deletions(-) diff --git a/docs/cpp_api.rst b/docs/cpp_api.rst index 6b5f24c0..a7ebaaf9 100644 --- a/docs/cpp_api.rst +++ b/docs/cpp_api.rst @@ -128,6 +128,8 @@ Utilities .. doxygenclass:: mscclpp::GpuBuffer :members: +.. doxygenenum:: mscclpp::GpuBufferGranularity + .. doxygenclass:: mscclpp::GpuStream :members: diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index b079e0fd..82fa3ec0 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -317,6 +317,16 @@ bool isNvlsSupported(); /// @return True if the pointer is allocated by cuMemMap, false otherwise. bool isCuMemMapAllocated(void* ptr); +/// Granularity used to size a `GpuBuffer` allocation so that it is compatible with the multicast (NVLS) API. +enum class GpuBufferGranularity { + /// Minimum multicast granularity. Rounds the allocation up to the minimum granularity required for multicast + /// compatibility, minimizing memory footprint. This is the default. + MultiCastMinimum, + /// Recommended multicast granularity. Rounds the allocation up to the granularity recommended by the driver, + /// which may be larger than the minimum but can yield better performance. 
+ MultiCastRecommended, +}; + /// Allocates a GPU memory space specialized for communication. The memory is zeroed out. Get the device pointer by /// `GpuBuffer::data()`. /// @@ -334,7 +344,11 @@ class GpuBuffer { public: /// Constructs a GpuBuffer with the specified number of elements. /// @param nelems Number of elements to allocate. If it is zero, `data()` will return a null pointer. - GpuBuffer(size_t nelems) : nelems_(nelems) { + /// @param granularity Granularity used to size the allocation for multicast (NVLS) compatibility. Defaults to + /// `GpuBufferGranularity::MultiCastMinimum`, which minimizes memory usage. This is ignored when the buffer is not + /// allocated through the multicast-compatible path. + GpuBuffer(size_t nelems, [[maybe_unused]] GpuBufferGranularity granularity = GpuBufferGranularity::MultiCastMinimum) + : nelems_(nelems) { if (nelems == 0) { bytes_ = 0; return; @@ -342,7 +356,10 @@ class GpuBuffer { MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_)); #if (CUDA_NVLS_API_AVAILABLE) if (isNvlsSupported()) { - size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED); + CUmulticastGranularity_flags granFlag = (granularity == GpuBufferGranularity::MultiCastRecommended) + ? 
+ CU_MULTICAST_GRANULARITY_RECOMMENDED + : CU_MULTICAST_GRANULARITY_MINIMUM; + size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), granFlag); bytes_ = (nelems * sizeof(T) + gran - 1) / gran * gran / sizeof(T) * sizeof(T); memory_ = detail::gpuCallocPhysicalShared<T>(nelems, gran); return; diff --git a/python/csrc/gpu_utils_py.cpp b/python/csrc/gpu_utils_py.cpp index 60880456..d6527502 100644 --- a/python/csrc/gpu_utils_py.cpp +++ b/python/csrc/gpu_utils_py.cpp @@ -114,8 +114,13 @@ static nb::capsule toDlpack(GpuBuffer<char> buffer, std::string dataType, std::v void register_gpu_utils(nb::module_& m) { m.def("is_nvls_supported", &isNvlsSupported); + nb::enum_<GpuBufferGranularity>(m, "CppGpuBufferGranularity") + .value("MultiCastMinimum", GpuBufferGranularity::MultiCastMinimum) + .value("MultiCastRecommended", GpuBufferGranularity::MultiCastRecommended); + nb::class_<GpuBuffer<char>>(m, "CppRawGpuBuffer") - .def(nb::init<size_t>(), nb::arg("nelems")) + .def(nb::init<size_t, GpuBufferGranularity>(), nb::arg("nelems"), + nb::arg("granularity") = GpuBufferGranularity::MultiCastMinimum) .def("nelems", &GpuBuffer<char>::nelems) .def("bytes", &GpuBuffer<char>::bytes) .def("data", [](GpuBuffer<char>& self) { return reinterpret_cast<uintptr_t>(self.data()); }) diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 5f3a2302..09408171 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -100,6 +100,7 @@ __all__ = [ "AlgorithmCollection", "CommGroup", "GpuBuffer", + "GpuBufferGranularity", ] diff --git a/python/mscclpp/_core/buffer.py b/python/mscclpp/_core/buffer.py index 0575ca68..e07424f5 100644 --- a/python/mscclpp/_core/buffer.py +++ b/python/mscclpp/_core/buffer.py @@ -6,14 +6,21 @@ from typing import Union, Tuple import cupy as cp import numpy as np -from mscclpp._mscclpp import CppRawGpuBuffer +from mscclpp._mscclpp import CppRawGpuBuffer, CppGpuBufferGranularity -__all__ = ["GpuBuffer"] +__all__ = ["GpuBuffer", "GpuBufferGranularity"] + +GpuBufferGranularity = CppGpuBufferGranularity class GpuBuffer(cp.ndarray): def __new__( 
- cls, shape: Union[int, Tuple[int]], dtype: cp.dtype = float, strides: Tuple[int] = None, order: str = "C" + cls, + shape: Union[int, Tuple[int]], + dtype: cp.dtype = float, + strides: Tuple[int] = None, + order: str = "C", + granularity: CppGpuBufferGranularity = CppGpuBufferGranularity.MultiCastMinimum, ): # Check if `shape` is valid if isinstance(shape, int): @@ -25,6 +32,6 @@ class GpuBuffer(cp.ndarray): if any(s <= 0 for s in shape): raise ValueError("Shape must be positive.") # Create the buffer - buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize) + buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize, granularity) memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0) return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)