Add executor to execute schedule-plan file (#283)

Add executor to execute the JSON schedule file generated by msccl-tools --------- Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
2026-05-04 21:51:32 +00:00 · 2024-04-19 03:10:41 +08:00
parent 9406123711
commit 64d837f9ab
27 changed files with 2857 additions and 3 deletions
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -0,0 +1,75 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from os import path
+from mscclpp import (
+    DataType,
+    Executor,
+    ExecutionPlan,
+)
+import mscclpp.comm as mscclpp_comm
+
+import cupy as cp
+from mpi4py import MPI
+
+# Hard-coded base directory under which the execution-plan JSON is looked up
+# (joined with "test/execution-files/allreduce.json" in the __main__ section).
+MSCCLPP_ROOT_PATH = "/root/mscclpp"
+
+
+def bench_time(niters: int, ngraphIters: int, func):
+    """Time ``func`` by replaying a captured CUDA graph.
+
+    ``func(stream)`` is invoked ``niters`` times while a non-blocking stream
+    is capturing. The resulting graph is launched once untimed and then
+    ``ngraphIters`` times between a pair of CUDA events.
+
+    Returns the event-measured elapsed time divided by ``niters`` and by
+    ``ngraphIters`` and multiplied by 1000, i.e. microseconds per ``func``
+    call given that CuPy reports elapsed time in milliseconds.
+    """
+    capture_stream = cp.cuda.Stream(non_blocking=True)
+    with capture_stream:
+        capture_stream.begin_capture()
+        for _ in range(niters):
+            func(capture_stream)
+        captured_graph = capture_stream.end_capture()
+
+    # Warm-up launch: enqueued on the stream before the start event is
+    # recorded, so it is not part of the measured interval.
+    captured_graph.launch(capture_stream)
+
+    tic = cp.cuda.Event()
+    toc = cp.cuda.Event()
+
+    tic.record(capture_stream)
+    for _ in range(ngraphIters):
+        captured_graph.launch(capture_stream)
+    toc.record(capture_stream)
+    # Block the host until the end event has completed on the device.
+    toc.synchronize()
+
+    elapsed_ms = cp.cuda.get_elapsed_time(tic, toc)
+    return elapsed_ms / niters * 1000.0 / ngraphIters
+
+
+if __name__ == "__main__":
+    world = MPI.COMM_WORLD
+    rank = world.rank
+
+    # Build the comm group from MPI's world communicator and select this
+    # process's device as (rank modulo ranks per node).
+    mscclpp_group = mscclpp_comm.CommGroup(world)
+    cp.cuda.Device(rank % mscclpp_group.nranks_per_node).use()
+    executor = Executor(mscclpp_group.communicator)
+
+    plan_file = path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json")
+    execution_plan = ExecutionPlan("allreduce_pairs", plan_file)
+
+    # Each rank seeds the generator identically, builds the same float16
+    # array and keeps the slice at its own rank index.
+    nelems = 1024 * 1024
+    cp.random.seed(42)
+    buffer = cp.random.random(nelems).astype(cp.float16)
+    sendbuf = cp.split(buffer, world.size)[rank]
+    mscclpp_group.barrier()
+
+    def run_plan(stream):
+        # The same device pointer and byte count are passed twice, so the
+        # buffer is used for both pointer arguments of execute().
+        # NOTE(review): 512 is presumably a threads-per-block setting -
+        # confirm against Executor.execute.
+        return executor.execute(
+            rank,
+            sendbuf.data.ptr,
+            sendbuf.data.ptr,
+            sendbuf.nbytes,
+            sendbuf.nbytes,
+            DataType.float16,
+            512,
+            execution_plan,
+            stream.ptr,
+        )
+
+    execution_time = bench_time(100, 10, run_plan)
+    print(f"Rank: {MPI.COMM_WORLD.rank} Execution time: {execution_time} us, data size: {sendbuf.nbytes} bytes")
+
+    # Drop the references explicitly, executor first.
+    # NOTE(review): presumably so the executor is released before the group
+    # it was created from - confirm.
+    executor = None
+    mscclpp_group = None