mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-06-08 15:30:41 +00:00
Add executor to execute schedule-plan file (#283)
Add executor to execute the JSON schedule file generated by msccl-tools --------- Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
This commit is contained in:
@@ -19,6 +19,10 @@ from ._mscclpp import (
|
||||
TcpBootstrap,
|
||||
Transport,
|
||||
TransportFlags,
|
||||
DataType,
|
||||
Executor,
|
||||
ExecutionPlan,
|
||||
PacketType,
|
||||
version,
|
||||
is_nvls_supported,
|
||||
)
|
||||
|
||||
@@ -51,6 +51,7 @@ class CommGroup:
|
||||
self.communicator = Communicator(self.bootstrap)
|
||||
self.my_rank = self.bootstrap.get_rank()
|
||||
self.nranks = self.bootstrap.get_n_ranks()
|
||||
self.nranks_per_node = self.bootstrap.get_n_ranks_per_node()
|
||||
|
||||
def barrier(self):
|
||||
self.bootstrap.barrier()
|
||||
|
||||
@@ -20,6 +20,7 @@ extern void register_fifo(nb::module_& m);
|
||||
extern void register_semaphore(nb::module_& m);
|
||||
extern void register_utils(nb::module_& m);
|
||||
extern void register_numa(nb::module_& m);
|
||||
extern void register_executor(nb::module_& m);
|
||||
|
||||
template <typename T>
|
||||
void def_nonblocking_future(nb::handle& m, const std::string& typestr) {
|
||||
@@ -35,6 +36,7 @@ void register_core(nb::module_& m) {
|
||||
nb::class_<Bootstrap>(m, "Bootstrap")
|
||||
.def("get_rank", &Bootstrap::getRank)
|
||||
.def("get_n_ranks", &Bootstrap::getNranks)
|
||||
.def("get_n_ranks_per_node", &Bootstrap::getNranksPerNode)
|
||||
.def(
|
||||
"send",
|
||||
[](Bootstrap* self, uintptr_t ptr, size_t size, int peer, int tag) {
|
||||
@@ -204,4 +206,5 @@ NB_MODULE(_mscclpp, m) {
|
||||
register_utils(m);
|
||||
register_core(m);
|
||||
register_numa(m);
|
||||
register_executor(m);
|
||||
}
|
||||
|
||||
@@ -16,7 +16,8 @@ void register_error(nb::module_& m) {
|
||||
.value("RemoteError", ErrorCode::RemoteError)
|
||||
.value("InvalidUsage", ErrorCode::InvalidUsage)
|
||||
.value("Timeout", ErrorCode::Timeout)
|
||||
.value("Aborted", ErrorCode::Aborted);
|
||||
.value("Aborted", ErrorCode::Aborted)
|
||||
.value("ExecutorError", ErrorCode::ExecutorError);
|
||||
|
||||
nb::class_<BaseError>(m, "BaseError")
|
||||
.def(nb::init<std::string&, int>(), nb::arg("message"), nb::arg("errorCode"))
|
||||
|
||||
38
python/mscclpp/executor.cpp
Normal file
38
python/mscclpp/executor.cpp
Normal file
@@ -0,0 +1,38 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <nanobind/nanobind.h>
|
||||
#include <nanobind/stl/shared_ptr.h>
|
||||
#include <nanobind/stl/string.h>
|
||||
|
||||
#include <mscclpp/executor.hpp>
|
||||
#include <mscclpp/gpu.hpp>
|
||||
|
||||
namespace nb = nanobind;
|
||||
using namespace mscclpp;
|
||||
|
||||
void register_executor(nb::module_& m) {
|
||||
nb::enum_<DataType>(m, "DataType")
|
||||
.value("int32", DataType::INT32)
|
||||
.value("uint32", DataType::UINT32)
|
||||
.value("float16", DataType::FLOAT16)
|
||||
.value("float32", DataType::FLOAT32);
|
||||
|
||||
nb::enum_<PacketType>(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16);
|
||||
|
||||
nb::class_<ExecutionPlan>(m, "ExecutionPlan")
|
||||
.def(nb::init<const std::string, const std::string>(), nb::arg("name"), nb::arg("planPath"));
|
||||
|
||||
nb::class_<Executor>(m, "Executor")
|
||||
.def(nb::init<std::shared_ptr<Communicator>>(), nb::arg("comm"))
|
||||
.def(
|
||||
"execute",
|
||||
[](Executor* self, int rank, uintptr_t sendbuff, uintptr_t recvBuff, size_t sendBuffSize, size_t recvBuffSize,
|
||||
DataType dataType, int nthreads, const ExecutionPlan& plan, uintptr_t stream, PacketType packetType) {
|
||||
self->execute(rank, reinterpret_cast<void*>(sendbuff), reinterpret_cast<void*>(recvBuff), sendBuffSize,
|
||||
recvBuffSize, dataType, nthreads, plan, (cudaStream_t)stream, packetType);
|
||||
},
|
||||
nb::arg("rank"), nb::arg("sendbuff"), nb::arg("recvBuff"), nb::arg("sendBuffSize"), nb::arg("recvBuffSize"),
|
||||
nb::arg("dataType"), nb::arg("nthreads"), nb::arg("plan"), nb::arg("stream"),
|
||||
nb::arg("packetType") = PacketType::LL16);
|
||||
}
|
||||
75
python/test/executor_test.py
Normal file
75
python/test/executor_test.py
Normal file
@@ -0,0 +1,75 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from os import path
|
||||
from mscclpp import (
|
||||
DataType,
|
||||
Executor,
|
||||
ExecutionPlan,
|
||||
)
|
||||
import mscclpp.comm as mscclpp_comm
|
||||
|
||||
import cupy as cp
|
||||
from mpi4py import MPI
|
||||
|
||||
MSCCLPP_ROOT_PATH = "/root/mscclpp"
|
||||
|
||||
|
||||
def bench_time(niters: int, ngraphIters: int, func):
|
||||
# capture cuda graph for niters of the kernel launch
|
||||
stream = cp.cuda.Stream(non_blocking=True)
|
||||
with stream:
|
||||
stream.begin_capture()
|
||||
for i in range(niters):
|
||||
func(stream)
|
||||
graph = stream.end_capture()
|
||||
|
||||
# now run a warm up round
|
||||
graph.launch(stream)
|
||||
|
||||
# now run the benchmark and measure time
|
||||
start = cp.cuda.Event()
|
||||
end = cp.cuda.Event()
|
||||
|
||||
start.record(stream)
|
||||
for _ in range(ngraphIters):
|
||||
graph.launch(stream)
|
||||
end.record(stream)
|
||||
end.synchronize()
|
||||
|
||||
return cp.cuda.get_elapsed_time(start, end) / niters * 1000.0 / ngraphIters
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
|
||||
cp.cuda.Device(MPI.COMM_WORLD.rank % mscclpp_group.nranks_per_node).use()
|
||||
executor = Executor(mscclpp_group.communicator)
|
||||
execution_plan = ExecutionPlan(
|
||||
"allreduce_pairs", path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json")
|
||||
)
|
||||
|
||||
nelems = 1024 * 1024
|
||||
cp.random.seed(42)
|
||||
buffer = cp.random.random(nelems).astype(cp.float16)
|
||||
sub_arrays = cp.split(buffer, MPI.COMM_WORLD.size)
|
||||
sendbuf = sub_arrays[MPI.COMM_WORLD.rank]
|
||||
mscclpp_group.barrier()
|
||||
|
||||
execution_time = bench_time(
|
||||
100,
|
||||
10,
|
||||
lambda stream: executor.execute(
|
||||
MPI.COMM_WORLD.rank,
|
||||
sendbuf.data.ptr,
|
||||
sendbuf.data.ptr,
|
||||
sendbuf.nbytes,
|
||||
sendbuf.nbytes,
|
||||
DataType.float16,
|
||||
512,
|
||||
execution_plan,
|
||||
stream.ptr,
|
||||
),
|
||||
)
|
||||
print(f"Rank: {MPI.COMM_WORLD.rank} Execution time: {execution_time} us, data size: {sendbuf.nbytes} bytes")
|
||||
executor = None
|
||||
mscclpp_group = None
|
||||
@@ -12,7 +12,10 @@ import netifaces as ni
|
||||
import pytest
|
||||
|
||||
from mscclpp import (
|
||||
DataType,
|
||||
EndpointConfig,
|
||||
ExecutionPlan,
|
||||
Executor,
|
||||
Fifo,
|
||||
Host2DeviceSemaphore,
|
||||
Host2HostSemaphore,
|
||||
@@ -590,3 +593,39 @@ def test_nvls(mpi_group: MpiGroup):
|
||||
kernel()
|
||||
cp.cuda.runtime.deviceSynchronize()
|
||||
group.barrier()
|
||||
|
||||
|
||||
@parametrize_mpi_groups(2)
|
||||
@pytest.mark.parametrize("filename", ["allreduce.json", "allreduce_packet.json"])
|
||||
def test_executor(mpi_group: MpiGroup, filename: str):
|
||||
if all_ranks_on_the_same_node(mpi_group) is False:
|
||||
pytest.skip("algo not support cross node")
|
||||
project_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
mscclpp_group = mscclpp_comm.CommGroup(mpi_group.comm)
|
||||
executor = Executor(mscclpp_group.communicator)
|
||||
execution_plan = ExecutionPlan("allreduce_pairs", os.path.join(project_dir, "test", "execution-files", filename))
|
||||
|
||||
nelems = 1024 * 1024
|
||||
cp.random.seed(42)
|
||||
buffer = cp.random.random(nelems).astype(cp.float16)
|
||||
sub_arrays = cp.split(buffer, mpi_group.comm.size)
|
||||
sendbuf = sub_arrays[mpi_group.comm.rank]
|
||||
expected = cp.zeros_like(sendbuf)
|
||||
for i in range(mpi_group.comm.size):
|
||||
expected += sub_arrays[i]
|
||||
mscclpp_group.barrier()
|
||||
|
||||
stream = cp.cuda.Stream(non_blocking=True)
|
||||
executor.execute(
|
||||
mpi_group.comm.rank,
|
||||
sendbuf.data.ptr,
|
||||
sendbuf.data.ptr,
|
||||
sendbuf.nbytes,
|
||||
sendbuf.nbytes,
|
||||
DataType.float16,
|
||||
512,
|
||||
execution_plan,
|
||||
stream.ptr,
|
||||
)
|
||||
stream.synchronize()
|
||||
assert cp.allclose(sendbuf, expected, atol=1e-3 * mpi_group.comm.size)
|
||||
|
||||
Reference in New Issue
Block a user