Add GpuBuffer class (#423)

* Renamed and moved mem alloc functions into the `mscclpp::detail::` namespace (now `mscclpp::detail::gpuCalloc*<T>()`) * Deprecated constructor-calling mem alloc functions (`mscclpp::makeShared*<T>()` and `mscclpp::makeUnique*<T>()`) * Added a new `mscclpp::GpuBuffer<T>()` class that should be used in general for allocating communication buffers * Added a new `mscclpp.utils.GpuBuffer` Python class that inherits `cupy.ndarray` and allocates using `mscclpp::gpuMemAlloc` * Renamed `mscclpp::memcpyCuda*<T>()` functions into `mscclpp::gpuMemcpy*<T>()` for name consistency * A few fixes in NVLS memory allocation * Tackled minor compiler warnings
2026-05-03 05:01:28 +00:00 · 2025-01-07 18:40:01 -08:00
parent 6d26b92665
commit 34945fb107
38 changed files with 527 additions and 555 deletions
--- a/docs/getting-started/tutorials/python-api.md
+++ b/docs/getting-started/tutorials/python-api.md
@@ -14,6 +14,7 @@ from mscclpp import (
    ProxyService,
    Transport,
 )
+from mscclpp.utils import GpuBuffer
 import mscclpp.comm as mscclpp_comm

 def create_connection(group: mscclpp_comm.CommGroup, transport: str):
@@ -32,7 +33,7 @@ if __name__ == "__main__":
    mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
    connections = create_connection(mscclpp_group, "NVLink")
    nelems = 1024
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
    proxy_service = ProxyService()
    simple_channels = group.make_proxy_channels(proxy_service, memory, connections)
    proxy_service.start_proxy()