Add executor to execute schedule-plan file (#283)

Add executor to execute the JSON schedule file generated by msccl-tools --------- Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
2026-06-08 15:30:41 +00:00 · 2024-04-19 03:10:41 +08:00
parent 9406123711
commit 64d837f9ab
27 changed files with 2857 additions and 3 deletions
--- a/python/mscclpp/init.py
+++ b/python/mscclpp/init.py
@@ -19,6 +19,10 @@ from ._mscclpp import (
    TcpBootstrap,
    Transport,
    TransportFlags,
+    DataType,
+    Executor,
+    ExecutionPlan,
+    PacketType,
    version,
    is_nvls_supported,
 )
--- a/python/mscclpp/comm.py
+++ b/python/mscclpp/comm.py
@@ -51,6 +51,7 @@ class CommGroup:
        self.communicator = Communicator(self.bootstrap)
        self.my_rank = self.bootstrap.get_rank()
        self.nranks = self.bootstrap.get_n_ranks()
+        self.nranks_per_node = self.bootstrap.get_n_ranks_per_node()

    def barrier(self):
        self.bootstrap.barrier()
--- a/python/mscclpp/core_py.cpp
+++ b/python/mscclpp/core_py.cpp
@@ -20,6 +20,7 @@ extern void register_fifo(nb::module_& m);
 extern void register_semaphore(nb::module_& m);
 extern void register_utils(nb::module_& m);
 extern void register_numa(nb::module_& m);
+extern void register_executor(nb::module_& m);

 template <typename T>
 void def_nonblocking_future(nb::handle& m, const std::string& typestr) {
@@ -35,6 +36,7 @@ void register_core(nb::module_& m) {
  nb::class_<Bootstrap>(m, "Bootstrap")
      .def("get_rank", &Bootstrap::getRank)
      .def("get_n_ranks", &Bootstrap::getNranks)
+      .def("get_n_ranks_per_node", &Bootstrap::getNranksPerNode)
      .def(
          "send",
          [](Bootstrap* self, uintptr_t ptr, size_t size, int peer, int tag) {
@@ -204,4 +206,5 @@ NB_MODULE(_mscclpp, m) {
  register_utils(m);
  register_core(m);
  register_numa(m);
+  register_executor(m);
 }
--- a/python/mscclpp/error_py.cpp
+++ b/python/mscclpp/error_py.cpp
@@ -16,7 +16,8 @@ void register_error(nb::module_& m) {
      .value("RemoteError", ErrorCode::RemoteError)
      .value("InvalidUsage", ErrorCode::InvalidUsage)
      .value("Timeout", ErrorCode::Timeout)
-      .value("Aborted", ErrorCode::Aborted);
+      .value("Aborted", ErrorCode::Aborted)
+      .value("ExecutorError", ErrorCode::ExecutorError);

  nb::class_<BaseError>(m, "BaseError")
      .def(nb::init<std::string&, int>(), nb::arg("message"), nb::arg("errorCode"))
--- a/python/mscclpp/executor.cpp
+++ b/python/mscclpp/executor.cpp
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/string.h>
+
+#include <mscclpp/executor.hpp>
+#include <mscclpp/gpu.hpp>
+
+namespace nb = nanobind;
+using namespace mscclpp;
+
+void register_executor(nb::module_& m) {
+  nb::enum_<DataType>(m, "DataType")
+      .value("int32", DataType::INT32)
+      .value("uint32", DataType::UINT32)
+      .value("float16", DataType::FLOAT16)
+      .value("float32", DataType::FLOAT32);
+
+  nb::enum_<PacketType>(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16);
+
+  nb::class_<ExecutionPlan>(m, "ExecutionPlan")
+      .def(nb::init<const std::string, const std::string>(), nb::arg("name"), nb::arg("planPath"));
+
+  nb::class_<Executor>(m, "Executor")
+      .def(nb::init<std::shared_ptr<Communicator>>(), nb::arg("comm"))
+      .def(
+          "execute",
+          [](Executor* self, int rank, uintptr_t sendbuff, uintptr_t recvBuff, size_t sendBuffSize, size_t recvBuffSize,
+             DataType dataType, int nthreads, const ExecutionPlan& plan, uintptr_t stream, PacketType packetType) {
+            self->execute(rank, reinterpret_cast<void*>(sendbuff), reinterpret_cast<void*>(recvBuff), sendBuffSize,
+                          recvBuffSize, dataType, nthreads, plan, (cudaStream_t)stream, packetType);
+          },
+          nb::arg("rank"), nb::arg("sendbuff"), nb::arg("recvBuff"), nb::arg("sendBuffSize"), nb::arg("recvBuffSize"),
+          nb::arg("dataType"), nb::arg("nthreads"), nb::arg("plan"), nb::arg("stream"),
+          nb::arg("packetType") = PacketType::LL16);
+}
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -0,0 +1,75 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from os import path
+from mscclpp import (
+    DataType,
+    Executor,
+    ExecutionPlan,
+)
+import mscclpp.comm as mscclpp_comm
+
+import cupy as cp
+from mpi4py import MPI
+
+MSCCLPP_ROOT_PATH = "/root/mscclpp"
+
+
+def bench_time(niters: int, ngraphIters: int, func):
+    # capture cuda graph for niters of the kernel launch
+    stream = cp.cuda.Stream(non_blocking=True)
+    with stream:
+        stream.begin_capture()
+        for i in range(niters):
+            func(stream)
+        graph = stream.end_capture()
+
+    # now run a warm up round
+    graph.launch(stream)
+
+    # now run the benchmark and measure time
+    start = cp.cuda.Event()
+    end = cp.cuda.Event()
+
+    start.record(stream)
+    for _ in range(ngraphIters):
+        graph.launch(stream)
+    end.record(stream)
+    end.synchronize()
+
+    return cp.cuda.get_elapsed_time(start, end) / niters * 1000.0 / ngraphIters
+
+
+if __name__ == "__main__":
+    mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
+    cp.cuda.Device(MPI.COMM_WORLD.rank % mscclpp_group.nranks_per_node).use()
+    executor = Executor(mscclpp_group.communicator)
+    execution_plan = ExecutionPlan(
+        "allreduce_pairs", path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json")
+    )
+
+    nelems = 1024 * 1024
+    cp.random.seed(42)
+    buffer = cp.random.random(nelems).astype(cp.float16)
+    sub_arrays = cp.split(buffer, MPI.COMM_WORLD.size)
+    sendbuf = sub_arrays[MPI.COMM_WORLD.rank]
+    mscclpp_group.barrier()
+
+    execution_time = bench_time(
+        100,
+        10,
+        lambda stream: executor.execute(
+            MPI.COMM_WORLD.rank,
+            sendbuf.data.ptr,
+            sendbuf.data.ptr,
+            sendbuf.nbytes,
+            sendbuf.nbytes,
+            DataType.float16,
+            512,
+            execution_plan,
+            stream.ptr,
+        ),
+    )
+    print(f"Rank: {MPI.COMM_WORLD.rank} Execution time: {execution_time} us, data size: {sendbuf.nbytes} bytes")
+    executor = None
+    mscclpp_group = None
--- a/python/test/test_mscclpp.py
+++ b/python/test/test_mscclpp.py
@@ -12,7 +12,10 @@ import netifaces as ni
 import pytest

 from mscclpp import (
+    DataType,
    EndpointConfig,
+    ExecutionPlan,
+    Executor,
    Fifo,
    Host2DeviceSemaphore,
    Host2HostSemaphore,
@@ -590,3 +593,39 @@ def test_nvls(mpi_group: MpiGroup):
    kernel()
    cp.cuda.runtime.deviceSynchronize()
    group.barrier()
+
+
+@parametrize_mpi_groups(2)
+@pytest.mark.parametrize("filename", ["allreduce.json", "allreduce_packet.json"])
+def test_executor(mpi_group: MpiGroup, filename: str):
+    if all_ranks_on_the_same_node(mpi_group) is False:
+        pytest.skip("algo not support cross node")
+    project_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    mscclpp_group = mscclpp_comm.CommGroup(mpi_group.comm)
+    executor = Executor(mscclpp_group.communicator)
+    execution_plan = ExecutionPlan("allreduce_pairs", os.path.join(project_dir, "test", "execution-files", filename))
+
+    nelems = 1024 * 1024
+    cp.random.seed(42)
+    buffer = cp.random.random(nelems).astype(cp.float16)
+    sub_arrays = cp.split(buffer, mpi_group.comm.size)
+    sendbuf = sub_arrays[mpi_group.comm.rank]
+    expected = cp.zeros_like(sendbuf)
+    for i in range(mpi_group.comm.size):
+        expected += sub_arrays[i]
+    mscclpp_group.barrier()
+
+    stream = cp.cuda.Stream(non_blocking=True)
+    executor.execute(
+        mpi_group.comm.rank,
+        sendbuf.data.ptr,
+        sendbuf.data.ptr,
+        sendbuf.nbytes,
+        sendbuf.nbytes,
+        DataType.float16,
+        512,
+        execution_plan,
+        stream.ptr,
+    )
+    stream.synchronize()
+    assert cp.allclose(sendbuf, expected, atol=1e-3 * mpi_group.comm.size)