Add GpuBuffer class (#423)
* Renamed and moved the memory allocation functions into the `mscclpp::detail::` namespace (now `mscclpp::detail::gpuCalloc*<T>()`)
* Deprecated the constructor-calling memory allocation functions (`mscclpp::makeShared*<T>()` and `mscclpp::makeUnique*<T>()`)
* Added a new `mscclpp::GpuBuffer<T>` class that should be used in general for allocating communication buffers
* Added a new `mscclpp.utils.GpuBuffer` Python class that inherits from `cupy.ndarray` and allocates using `mscclpp::gpuMemAlloc`
* Renamed the `mscclpp::memcpyCuda*<T>()` functions to `mscclpp::gpuMemcpy*<T>()` for naming consistency
* A few fixes in NVLS memory allocation
* Addressed minor compiler warnings
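For orientation, here is a minimal sketch of how the new Python-side class is meant to be used, inferred from the test changes below. Treat it as illustrative rather than authoritative: only the `GpuBuffer(nelem, dtype=...)` constructor form is actually exercised in this diff.

    import cupy as cp
    from mscclpp.utils import GpuBuffer

    # GpuBuffer inherits from cupy.ndarray, so it behaves like a regular
    # cupy array, while its backing memory comes from mscclpp::gpuMemAlloc
    # and is therefore suitable as a communication buffer.
    memory = GpuBuffer(1024, dtype=cp.float16)

    memory[:] = 0                # ordinary cupy operations work on it
    assert memory.shape == (1024,)
    assert memory.data.ptr != 0  # backed by a valid device pointer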
@@ -27,7 +27,7 @@ from mscclpp import (
     npkit,
 )
 import mscclpp.comm as mscclpp_comm
-from mscclpp.utils import KernelBuilder, pack
+from mscclpp.utils import KernelBuilder, GpuBuffer, pack
 from ._cpp import _ext
 from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
 
@@ -156,12 +156,26 @@ def test_group_with_connections(mpi_group: MpiGroup, transport: str):
     create_group_and_connection(mpi_group, transport)
 
 
+@parametrize_mpi_groups(1)
+@pytest.mark.parametrize("nelem", [2**i for i in [0, 10, 15, 20]])
+@pytest.mark.parametrize("dtype", [cp.float32, cp.float16])
+def test_gpu_buffer(mpi_group: MpiGroup, nelem: int, dtype: cp.dtype):
+    memory = GpuBuffer(nelem, dtype=dtype)
+    assert memory.shape == (nelem,)
+    assert memory.dtype == dtype
+    assert memory.itemsize == cp.dtype(dtype).itemsize
+    assert memory.nbytes == nelem * cp.dtype(dtype).itemsize
+    assert memory.data.ptr != 0
+    assert memory.data.mem.ptr != 0
+    assert memory.data.mem.size >= nelem * cp.dtype(dtype).itemsize
+
+
 @parametrize_mpi_groups(2, 4, 8, 16)
 @pytest.mark.parametrize("transport", ["IB", "NVLink"])
 @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
 def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int):
     group, connections = create_group_and_connection(mpi_group, transport)
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
     nelemPerRank = nelem // group.nranks
     sizePerRank = nelemPerRank * memory.itemsize
     memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
@@ -436,13 +450,12 @@ def test_d2d_semaphores(mpi_group: MpiGroup):
 def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool):
     group, connections = create_group_and_connection(mpi_group, "NVLink")
 
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
     if use_packet:
-        scratch = cp.zeros(nelem * 2, dtype=cp.int32)
+        scratch = GpuBuffer(nelem * 2, dtype=cp.int32)
     else:
         scratch = None
     nelemPerRank = nelem // group.nranks
     nelemPerRank * memory.itemsize
     memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
     memory_expected = cp.zeros_like(memory)
     for rank in range(group.nranks):
@@ -484,7 +497,7 @@ def test_fifo(
 def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
     group, connections = create_group_and_connection(mpi_group, transport)
 
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
     nelemPerRank = nelem // group.nranks
     nelemPerRank * memory.itemsize
     memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
@@ -534,11 +547,11 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
 def test_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool):
     group, connections = create_group_and_connection(mpi_group, transport)
 
-    memory = cp.zeros(nelem, dtype=cp.int32)
+    memory = GpuBuffer(nelem, dtype=cp.int32)
     if use_packet:
-        scratch = cp.zeros(nelem * 2, dtype=cp.int32)
+        scratch = GpuBuffer(nelem * 2, dtype=cp.int32)
     else:
-        scratch = cp.zeros(1, dtype=cp.int32)  # just so that we can pass a valid ptr
+        scratch = GpuBuffer(1, dtype=cp.int32)  # just so that we can pass a valid ptr
     nelemPerRank = nelem // group.nranks
     nelemPerRank * memory.itemsize
     memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
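Taken together, the hunks above follow one pattern: every buffer that participates in communication moves from cp.zeros(...) to GpuBuffer(...) with the same shape and dtype. One caveat worth flagging: cp.zeros guarantees zero-initialized memory, and this diff does not show whether GpuBuffer does the same, so the explicit clear in the sketch below is a conservative assumption rather than a documented requirement.

    import cupy as cp
    from mscclpp.utils import GpuBuffer

    nelem = 2**20

    # Before: memory = cp.zeros(nelem, dtype=cp.int32)
    # After: same shape and dtype, but backed by mscclpp's allocator.
    memory = GpuBuffer(nelem, dtype=cp.int32)
    memory[:] = 0  # conservative zero-fill; cp.zeros did this implicitly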