Add NPKit GPU event support (#310)

This commit is contained in:
Ziyue Yang
2024-06-13 13:59:50 +08:00
committed by GitHub
parent 80aefe55bc
commit 76328fe623
21 changed files with 500 additions and 165 deletions

View File

@@ -7,6 +7,7 @@ from mscclpp import (
Executor,
ExecutionPlan,
PacketType,
npkit,
)
import mscclpp.comm as mscclpp_comm
@@ -87,6 +88,9 @@ def main(
mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
cp.cuda.Device(mscclpp_group.my_rank % mscclpp_group.nranks_per_node).use()
executor = Executor(mscclpp_group.communicator)
npkit_dump_dir = os.getenv("NPKIT_DUMP_DIR")
if npkit_dump_dir is not None:
npkit.init(mscclpp_group.my_rank)
execution_plan = ExecutionPlan(execution_paln_name, execution_plan_path)
cp.random.seed(seed)
@@ -119,6 +123,9 @@ def main(
mscclpp_group.barrier()
execution_time = bench_time(100, 10, executor_func)
if npkit_dump_dir is not None:
npkit.dump(npkit_dump_dir)
npkit.shutdown()
print(
f"Rank: {MPI.COMM_WORLD.rank} Execution time: {execution_time} us, "
f"data size: {sendbuf.nbytes} bytes data type: {dtype().dtype.name} "

View File

@@ -24,6 +24,7 @@ from mscclpp import (
TcpBootstrap,
Transport,
is_nvls_supported,
npkit,
)
import mscclpp.comm as mscclpp_comm
from mscclpp.utils import KernelBuilder, pack
@@ -603,6 +604,9 @@ def test_executor(mpi_group: MpiGroup, filename: str):
project_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
mscclpp_group = mscclpp_comm.CommGroup(mpi_group.comm)
executor = Executor(mscclpp_group.communicator)
npkit_dump_dir = os.getenv("NPKIT_DUMP_DIR")
if npkit_dump_dir is not None:
npkit.init(mscclpp_group.my_rank)
execution_plan = ExecutionPlan("allreduce_pairs", os.path.join(project_dir, "test", "execution-files", filename))
nelems = 1024 * 1024
@@ -629,3 +633,6 @@ def test_executor(mpi_group: MpiGroup, filename: str):
)
stream.synchronize()
assert cp.allclose(sendbuf, expected, atol=1e-3 * mpi_group.comm.size)
if npkit_dump_dir is not None:
npkit.dump(npkit_dump_dir)
npkit.shutdown()