Add GpuBuffer class (#423)

* Renamed memory allocation functions and moved them into the `mscclpp::detail::`
namespace (now `mscclpp::detail::gpuCalloc*<T>()`)
* Deprecated the constructor-calling memory allocation functions
(`mscclpp::makeShared*<T>()` and `mscclpp::makeUnique*<T>()`)
* Added a new `mscclpp::GpuBuffer<T>` class that should be used in
general for allocating communication buffers (see the usage sketch after this list)
* Added a new `mscclpp.utils.GpuBuffer` Python class that inherits from
`cupy.ndarray` and allocates using `mscclpp::gpuMemAlloc`
* Renamed the `mscclpp::memcpyCuda*<T>()` functions to
`mscclpp::gpuMemcpy*<T>()` for naming consistency
* Fixed a few issues in NVLS memory allocation
* Fixed minor compiler warnings
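
For reference, a minimal sketch of the new Python class in use, based on the test added in the diff below; `GpuBuffer(nelem, dtype=...)` is a drop-in replacement for `cp.zeros(...)` wherever the array backs a communication buffer:

```python
import cupy as cp

from mscclpp.utils import GpuBuffer

# GpuBuffer subclasses cupy.ndarray, so it behaves like a regular CuPy array
# while the underlying memory comes from mscclpp's GPU allocator.
memory = GpuBuffer(1024, dtype=cp.int32)
memory[:] = 0  # normal CuPy indexing and kernels work as usual

assert memory.shape == (1024,)
assert memory.data.ptr != 0  # device pointer backing the buffer
```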
Changho Hwang committed 2025-01-07 18:40:01 -08:00 (committed by GitHub)
parent 6d26b92665 · commit 34945fb107
38 changed files with 527 additions and 555 deletions

@@ -27,7 +27,7 @@ from mscclpp import (
     npkit,
 )
 import mscclpp.comm as mscclpp_comm
-from mscclpp.utils import KernelBuilder, pack
+from mscclpp.utils import KernelBuilder, GpuBuffer, pack
 from ._cpp import _ext
 from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
@@ -156,12 +156,26 @@ def test_group_with_connections(mpi_group: MpiGroup, transport: str):
     create_group_and_connection(mpi_group, transport)
 
 
+@parametrize_mpi_groups(1)
+@pytest.mark.parametrize("nelem", [2**i for i in [0, 10, 15, 20]])
+@pytest.mark.parametrize("dtype", [cp.float32, cp.float16])
+def test_gpu_buffer(mpi_group: MpiGroup, nelem: int, dtype: cp.dtype):
+    memory = GpuBuffer(nelem, dtype=dtype)
+    assert memory.shape == (nelem,)
+    assert memory.dtype == dtype
+    assert memory.itemsize == cp.dtype(dtype).itemsize
+    assert memory.nbytes == nelem * cp.dtype(dtype).itemsize
+    assert memory.data.ptr != 0
+    assert memory.data.mem.ptr != 0
+    assert memory.data.mem.size >= nelem * cp.dtype(dtype).itemsize
+
+
 @parametrize_mpi_groups(2, 4, 8, 16)
 @pytest.mark.parametrize("transport", ["IB", "NVLink"])
 @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
 def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int):
     group, connections = create_group_and_connection(mpi_group, transport)
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
     nelemPerRank = nelem // group.nranks
     sizePerRank = nelemPerRank * memory.itemsize
     memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
@@ -436,13 +450,12 @@ def test_d2d_semaphores(mpi_group: MpiGroup):
 def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool):
     group, connections = create_group_and_connection(mpi_group, "NVLink")
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
     if use_packet:
-        scratch = cp.zeros(nelem * 2, dtype=cp.int32)
+        scratch = GpuBuffer(nelem * 2, dtype=cp.int32)
     else:
         scratch = None
     nelemPerRank = nelem // group.nranks
-    nelemPerRank * memory.itemsize
     memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
     memory_expected = cp.zeros_like(memory)
     for rank in range(group.nranks):
@@ -484,7 +497,7 @@ def test_fifo(
 def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
     group, connections = create_group_and_connection(mpi_group, transport)
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
     nelemPerRank = nelem // group.nranks
     nelemPerRank * memory.itemsize
     memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
@@ -534,11 +547,11 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
 def test_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool):
     group, connections = create_group_and_connection(mpi_group, transport)
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
     if use_packet:
-        scratch = cp.zeros(nelem * 2, dtype=cp.int32)
+        scratch = GpuBuffer(nelem * 2, dtype=cp.int32)
     else:
-        scratch = cp.zeros(1, dtype=cp.int32)  # just so that we can pass a valid ptr
+        scratch = GpuBuffer(1, dtype=cp.int32)  # just so that we can pass a valid ptr
     nelemPerRank = nelem // group.nranks
     nelemPerRank * memory.itemsize
     memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1