mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
Manage runtime environments (#452)
* Add `Env` class that manages all runtime environment variables. * Rename `NPKIT_DUMP_DIR` to `MSCCLPP_NPKIT_DUMP_DIR`.
This commit is contained in:
@@ -4,6 +4,7 @@
|
||||
import os as _os
|
||||
|
||||
from ._mscclpp import (
|
||||
Env,
|
||||
ErrorCode,
|
||||
BaseError,
|
||||
Error,
|
||||
@@ -32,6 +33,7 @@ from ._mscclpp import (
|
||||
ExecutionPlan,
|
||||
PacketType,
|
||||
RawGpuBuffer,
|
||||
env,
|
||||
version,
|
||||
is_nvls_supported,
|
||||
npkit,
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
namespace nb = nanobind;
|
||||
using namespace mscclpp;
|
||||
|
||||
extern void register_env(nb::module_& m);
|
||||
extern void register_error(nb::module_& m);
|
||||
extern void register_proxy_channel(nb::module_& m);
|
||||
extern void register_sm_channel(nb::module_& m);
|
||||
@@ -184,6 +185,7 @@ void register_core(nb::module_& m) {
|
||||
}
|
||||
|
||||
NB_MODULE(_mscclpp, m) {
|
||||
register_env(m);
|
||||
register_error(m);
|
||||
register_proxy_channel(m);
|
||||
register_sm_channel(m);
|
||||
|
||||
28
python/mscclpp/env_py.cpp
Normal file
28
python/mscclpp/env_py.cpp
Normal file
@@ -0,0 +1,28 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <nanobind/nanobind.h>
|
||||
#include <nanobind/stl/shared_ptr.h>
|
||||
#include <nanobind/stl/string.h>
|
||||
|
||||
#include <mscclpp/env.hpp>
|
||||
|
||||
namespace nb = nanobind;
|
||||
using namespace mscclpp;
|
||||
|
||||
// Registers the `Env` class and the module-level `env()` function on the
// given Python module.
void register_env(nb::module_& m) {
  // Every field is bound with def_ro, so it is readable from Python but
  // cannot be assigned.
  nb::class_<Env> envClass(m, "Env");
  envClass.def_ro("debug", &Env::debug);
  envClass.def_ro("debug_subsys", &Env::debugSubsys);
  envClass.def_ro("debug_file", &Env::debugFile);
  envClass.def_ro("hca_devices", &Env::hcaDevices);
  envClass.def_ro("hostid", &Env::hostid);
  envClass.def_ro("socket_family", &Env::socketFamily);
  envClass.def_ro("socket_ifname", &Env::socketIfname);
  envClass.def_ro("comm_id", &Env::commId);
  envClass.def_ro("execution_plan_dir", &Env::executionPlanDir);
  envClass.def_ro("npkit_dump_dir", &Env::npkitDumpDir);
  envClass.def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream);

  // Expose the mscclpp::env() accessor under the Python name `env`.
  m.def("env", &env);
}
|
||||
@@ -8,6 +8,7 @@ from mscclpp import (
|
||||
ExecutionPlan,
|
||||
PacketType,
|
||||
npkit,
|
||||
env,
|
||||
)
|
||||
import mscclpp.comm as mscclpp_comm
|
||||
from mscclpp.utils import KernelBuilder, GpuBuffer, pack
|
||||
@@ -171,8 +172,8 @@ def main(
|
||||
mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
|
||||
cp.cuda.Device(mscclpp_group.my_rank % mscclpp_group.nranks_per_node).use()
|
||||
executor = Executor(mscclpp_group.communicator)
|
||||
npkit_dump_dir = os.getenv("NPKIT_DUMP_DIR")
|
||||
if npkit_dump_dir is not None:
|
||||
npkit_dump_dir = env().npkit_dump_dir
|
||||
if npkit_dump_dir != "":
|
||||
npkit.init(mscclpp_group.my_rank)
|
||||
execution_plan = ExecutionPlan(execution_plan_path)
|
||||
collective = execution_plan.collective()
|
||||
|
||||
@@ -27,6 +27,7 @@ from mscclpp import (
|
||||
Transport,
|
||||
is_nvls_supported,
|
||||
npkit,
|
||||
env,
|
||||
)
|
||||
import mscclpp.comm as mscclpp_comm
|
||||
from mscclpp.utils import KernelBuilder, GpuBuffer, pack
|
||||
@@ -36,6 +37,19 @@ from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
|
||||
ethernet_interface_name = "eth0"
|
||||
|
||||
|
||||
@parametrize_mpi_groups(1)
def test_env(mpi_group: MpiGroup):
    """Check that `env()` exposes read-only attributes and returns one shared object."""
    e = env()
    assert isinstance(e.debug, str)
    with pytest.raises(AttributeError):
        # all attributes should be read-only
        e.debug = "INFO"

    # should be the same object: compare identity explicitly so the check
    # stays strict even if `Env` ever defines value-based equality
    e2 = env()
    assert e is e2
|
||||
|
||||
|
||||
def all_ranks_on_the_same_node(mpi_group: MpiGroup):
|
||||
if (ethernet_interface_name in ni.interfaces()) is False:
|
||||
pytest.skip(f"{ethernet_interface_name} is not an interface to use on this node")
|
||||
@@ -624,8 +638,8 @@ def test_executor(mpi_group: MpiGroup, filename: str):
|
||||
project_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
mscclpp_group = mscclpp_comm.CommGroup(mpi_group.comm)
|
||||
executor = Executor(mscclpp_group.communicator)
|
||||
npkit_dump_dir = os.getenv("NPKIT_DUMP_DIR")
|
||||
if npkit_dump_dir is not None:
|
||||
npkit_dump_dir = env().npkit_dump_dir
|
||||
if npkit_dump_dir != "":
|
||||
npkit.init(mscclpp_group.my_rank)
|
||||
execution_plan = ExecutionPlan(os.path.join(project_dir, "test", "execution-files", filename))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user