Add GpuBuffer class (#423)

* Renamed and moved mem alloc functions into the `mscclpp::detail::`
namespace (now `mscclpp::detail::gpuCalloc*<T>()`)
* Deprecated constructor-calling mem alloc functions
(`mscclpp::makeShared*<T>()` and `mscclpp::makeUnique*<T>()`)
* Added a new `mscclpp::GpuBuffer<T>()` class that should be used in
general for allocating communication buffers
* Added a new `mscclpp.utils.GpuBuffer` Python class that inherits
`cupy.ndarray` and allocates using `mscclpp::gpuMemAlloc`
* Renamed `mscclpp::memcpyCuda*<T>()` functions into
`mscclpp::gpuMemcpy*<T>()` for name consistency
* A few fixes in NVLS memory allocation
* Tackled minor compiler warnings
This commit is contained in:
Changho Hwang
2025-01-07 18:40:01 -08:00
committed by GitHub
parent 6d26b92665
commit 34945fb107
38 changed files with 527 additions and 555 deletions

View File

@@ -8,11 +8,9 @@ from mscclpp import (
ExecutionPlan,
PacketType,
npkit,
alloc_shared_physical_cuda,
is_nvls_supported,
)
import mscclpp.comm as mscclpp_comm
from mscclpp.utils import KernelBuilder, pack
from mscclpp.utils import KernelBuilder, GpuBuffer, pack
import os
import struct
@@ -129,18 +127,6 @@ def dtype_to_mscclpp_dtype(dtype):
raise ValueError(f"Unknown data type: {dtype}")
def allocate_buffer(nelems, dtype):
    """Return a device buffer holding `nelems` elements of `dtype`.

    When NVLS is supported, the memory comes from
    `alloc_shared_physical_cuda` and is wrapped zero-copy in a cupy
    ndarray; otherwise a plain zero-initialized cupy array is returned.
    NOTE(review): the NVLS path does not zero the memory — presumably
    callers overwrite it before reading; confirm against call sites.
    """
    if not is_nvls_supported():
        return cp.zeros(nelems, dtype=dtype)
    nbytes = nelems * cp.dtype(dtype).itemsize
    raw = alloc_shared_physical_cuda(nbytes)
    # Pass `raw` as the owner so the allocation stays alive as long as
    # the ndarray view does.
    unowned = cp.cuda.UnownedMemory(raw.get_ptr(), raw.size(), raw)
    memptr = cp.cuda.MemoryPointer(unowned, 0)
    return cp.ndarray(nelems, dtype=dtype, memptr=memptr)
def build_bufs(
collective: str,
size: int,
@@ -160,14 +146,14 @@ def build_bufs(
nelems_input = nelems
nelems_output = nelems
result_buf = allocate_buffer(nelems_output, dtype=dtype)
result_buf = GpuBuffer(nelems_output, dtype=dtype)
if in_place:
if "allgather" in collective:
input_buf = cp.split(result_buf, num_ranks)[rank]
else:
input_buf = result_buf
else:
input_buf = allocate_buffer(nelems_input, dtype=dtype)
input_buf = GpuBuffer(nelems_input, dtype=dtype)
test_buf = cp.zeros(nelems_output, dtype=dtype)
return input_buf, result_buf, test_buf