mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-06-29 02:47:23 +00:00
Expose NVLS multicast granularity option for GpuBuffer (#815)
Add a public GpuBufferGranularity enum (MultiCastMinimum, MultiCastRecommended) and let GpuBuffer choose the NVLS multicast allocation granularity via a constructor argument, defaulting to MultiCastMinimum to minimize memory usage. Expose the same option through the C++ and Python (nanobind) APIs.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -114,8 +114,13 @@ static nb::capsule toDlpack(GpuBuffer<char> buffer, std::string dataType, std::v
|
||||
void register_gpu_utils(nb::module_& m) {
|
||||
m.def("is_nvls_supported", &isNvlsSupported);
|
||||
|
||||
nb::enum_<GpuBufferGranularity>(m, "CppGpuBufferGranularity")
|
||||
.value("MultiCastMinimum", GpuBufferGranularity::MultiCastMinimum)
|
||||
.value("MultiCastRecommended", GpuBufferGranularity::MultiCastRecommended);
|
||||
|
||||
nb::class_<GpuBuffer<char>>(m, "CppRawGpuBuffer")
|
||||
.def(nb::init<size_t>(), nb::arg("nelems"))
|
||||
.def(nb::init<size_t, GpuBufferGranularity>(), nb::arg("nelems"),
|
||||
nb::arg("granularity") = GpuBufferGranularity::MultiCastMinimum)
|
||||
.def("nelems", &GpuBuffer<char>::nelems)
|
||||
.def("bytes", &GpuBuffer<char>::bytes)
|
||||
.def("data", [](GpuBuffer<char>& self) { return reinterpret_cast<uintptr_t>(self.data()); })
|
||||
|
||||
@@ -100,6 +100,7 @@ __all__ = [
|
||||
"AlgorithmCollection",
|
||||
"CommGroup",
|
||||
"GpuBuffer",
|
||||
"GpuBufferGranularity",
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -6,14 +6,21 @@ from typing import Union, Tuple
|
||||
|
||||
import cupy as cp
|
||||
import numpy as np
|
||||
from mscclpp._mscclpp import CppRawGpuBuffer
|
||||
from mscclpp._mscclpp import CppRawGpuBuffer, CppGpuBufferGranularity
|
||||
|
||||
__all__ = ["GpuBuffer"]
|
||||
__all__ = ["GpuBuffer", "GpuBufferGranularity"]
|
||||
|
||||
GpuBufferGranularity = CppGpuBufferGranularity
|
||||
|
||||
|
||||
class GpuBuffer(cp.ndarray):
|
||||
def __new__(
|
||||
cls, shape: Union[int, Tuple[int]], dtype: cp.dtype = float, strides: Tuple[int] = None, order: str = "C"
|
||||
cls,
|
||||
shape: Union[int, Tuple[int]],
|
||||
dtype: cp.dtype = float,
|
||||
strides: Tuple[int] = None,
|
||||
order: str = "C",
|
||||
granularity: CppGpuBufferGranularity = CppGpuBufferGranularity.MultiCastMinimum,
|
||||
):
|
||||
# Check if `shape` is valid
|
||||
if isinstance(shape, int):
|
||||
@@ -25,6 +32,6 @@ class GpuBuffer(cp.ndarray):
|
||||
if any(s <= 0 for s in shape):
|
||||
raise ValueError("Shape must be positive.")
|
||||
# Create the buffer
|
||||
buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
|
||||
buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize, granularity)
|
||||
memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0)
|
||||
return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)
|
||||
|
||||
Reference in New Issue
Block a user