Add GpuBuffer class (#423)

* Renamed memory allocation functions and moved them into the `mscclpp::detail::`
namespace (now `mscclpp::detail::gpuCalloc*<T>()`)
* Deprecated the constructor-calling memory allocation functions
(`mscclpp::makeShared*<T>()` and `mscclpp::makeUnique*<T>()`)
* Added a new `mscclpp::GpuBuffer<T>` class that should be used in
general for allocating communication buffers (see the usage sketch after this list)
* Added a new `mscclpp.utils.GpuBuffer` Python class that inherits from
`cupy.ndarray` and allocates using `mscclpp::gpuMemAlloc`
* Renamed the `mscclpp::memcpyCuda*<T>()` functions to
`mscclpp::gpuMemcpy*<T>()` for naming consistency
* Fixed a few issues in NVLS memory allocation
* Fixed minor compiler warnings
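
For reference, a minimal sketch of the new Python class in use, based on the test added in the diff below; `GpuBuffer(nelem, dtype=...)` is a drop-in replacement for `cp.zeros(...)` wherever the array backs a communication buffer:

```python
import cupy as cp

from mscclpp.utils import GpuBuffer

# GpuBuffer subclasses cupy.ndarray, so it behaves like a regular CuPy array
# while the underlying memory comes from mscclpp's GPU allocator.
memory = GpuBuffer(1024, dtype=cp.int32)
memory[:] = 0  # normal CuPy indexing and kernels work as usual

assert memory.shape == (1024,)
assert memory.data.ptr != 0  # device pointer backing the buffer
```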
Changho Hwang committed 2025-01-07 18:40:01 -08:00 (committed by GitHub)
parent 6d26b92665 · commit 34945fb107
38 changed files with 527 additions and 555 deletions

@@ -27,7 +27,7 @@ from mscclpp import (
     npkit,
 )
 import mscclpp.comm as mscclpp_comm
-from mscclpp.utils import KernelBuilder, pack
+from mscclpp.utils import KernelBuilder, GpuBuffer, pack
 from ._cpp import _ext
 from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
@@ -156,12 +156,26 @@ def test_group_with_connections(mpi_group: MpiGroup, transport: str):
     create_group_and_connection(mpi_group, transport)
 
 
+@parametrize_mpi_groups(1)
+@pytest.mark.parametrize("nelem", [2**i for i in [0, 10, 15, 20]])
+@pytest.mark.parametrize("dtype", [cp.float32, cp.float16])
+def test_gpu_buffer(mpi_group: MpiGroup, nelem: int, dtype: cp.dtype):
+    memory = GpuBuffer(nelem, dtype=dtype)
+    assert memory.shape == (nelem,)
+    assert memory.dtype == dtype
+    assert memory.itemsize == cp.dtype(dtype).itemsize
+    assert memory.nbytes == nelem * cp.dtype(dtype).itemsize
+    assert memory.data.ptr != 0
+    assert memory.data.mem.ptr != 0
+    assert memory.data.mem.size >= nelem * cp.dtype(dtype).itemsize
+
+
 @parametrize_mpi_groups(2, 4, 8, 16)
 @pytest.mark.parametrize("transport", ["IB", "NVLink"])
 @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
 def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int):
     group, connections = create_group_and_connection(mpi_group, transport)
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
     nelemPerRank = nelem // group.nranks
     sizePerRank = nelemPerRank * memory.itemsize
     memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
@@ -436,13 +450,12 @@ def test_d2d_semaphores(mpi_group: MpiGroup):
 def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool):
     group, connections = create_group_and_connection(mpi_group, "NVLink")
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
     if use_packet:
-        scratch = cp.zeros(nelem * 2, dtype=cp.int32)
+        scratch = GpuBuffer(nelem * 2, dtype=cp.int32)
     else:
         scratch = None
     nelemPerRank = nelem // group.nranks
-    nelemPerRank * memory.itemsize
     memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
     memory_expected = cp.zeros_like(memory)
     for rank in range(group.nranks):
@@ -484,7 +497,7 @@ def test_fifo(
 def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
     group, connections = create_group_and_connection(mpi_group, transport)
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
     nelemPerRank = nelem // group.nranks
     nelemPerRank * memory.itemsize
     memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
@@ -534,11 +547,11 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
 def test_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool):
     group, connections = create_group_and_connection(mpi_group, transport)
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
     if use_packet:
-        scratch = cp.zeros(nelem * 2, dtype=cp.int32)
+        scratch = GpuBuffer(nelem * 2, dtype=cp.int32)
     else:
-        scratch = cp.zeros(1, dtype=cp.int32)  # just so that we can pass a valid ptr
+        scratch = GpuBuffer(1, dtype=cp.int32)  # just so that we can pass a valid ptr
     nelemPerRank = nelem // group.nranks
     nelemPerRank * memory.itemsize
     memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1