mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-06-07 08:14:51 +00:00
Manage runtime environments (#452)
* Add `Env` class that manages all runtime environments. * Changed `NPKIT_DUMP_DIR` to `MSCCLPP_NPKIT_DUMP_DIR`.
This commit is contained in:
@@ -241,7 +241,7 @@ jobs:
|
||||
cd /root/mscclpp; \
|
||||
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export NPKIT_DUMP_DIR=./npkit_dump; \
|
||||
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \
|
||||
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
|
||||
@@ -270,7 +270,7 @@ jobs:
|
||||
cd /root/mscclpp; \
|
||||
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export NPKIT_DUMP_DIR=./npkit_dump; \
|
||||
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build:\$LD_LIBRARY_PATH; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \
|
||||
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#include <filesystem>
|
||||
#include <mscclpp/concurrency_device.hpp>
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <mscclpp/env.hpp>
|
||||
#include <mscclpp/executor.hpp>
|
||||
#include <mscclpp/sm_channel.hpp>
|
||||
#include <mscclpp/sm_channel_device.hpp>
|
||||
@@ -414,8 +415,8 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
|
||||
if (mscclppComm->bootstrap()->getNranks() == mscclppComm->bootstrap()->getNranksPerNode())
|
||||
ncclCommInitRankFallbackSingleNode(commPtr, mscclppComm, rank);
|
||||
|
||||
if (getenv("MSCCLPP_EXECUTION_PLAN_DIR")) {
|
||||
std::string collectiveDir = getenv("MSCCLPP_EXECUTION_PLAN_DIR");
|
||||
const std::string& collectiveDir = mscclpp::env()->executionPlanDir;
|
||||
if (collectiveDir != "") {
|
||||
if (!std::filesystem::is_directory(collectiveDir)) {
|
||||
WARN("The value of the environment variable %s is not a directory", collectiveDir.c_str());
|
||||
return ncclInvalidArgument;
|
||||
@@ -430,8 +431,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
|
||||
|
||||
*comm = commPtr;
|
||||
#if defined(ENABLE_NPKIT)
|
||||
const char* npkitDumpDir = getenv("NPKIT_DUMP_DIR");
|
||||
if (npkitDumpDir != nullptr) {
|
||||
if (mscclpp::env()->npkitDumpDir != "") {
|
||||
NpKit::Init(rank);
|
||||
}
|
||||
#endif
|
||||
@@ -455,8 +455,8 @@ NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) {
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
#if defined(ENABLE_NPKIT)
|
||||
const char* npkitDumpDir = getenv("NPKIT_DUMP_DIR");
|
||||
if (npkitDumpDir != nullptr) {
|
||||
const std::string& npkitDumpDir = mscclpp::env()->npkitDumpDir;
|
||||
if (npkitDumpDir != "") {
|
||||
NpKit::Dump(npkitDumpDir);
|
||||
NpKit::Shutdown();
|
||||
}
|
||||
|
||||
42
include/mscclpp/env.hpp
Normal file
42
include/mscclpp/env.hpp
Normal file
@@ -0,0 +1,42 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#ifndef MSCCLPP_ENV_HPP_
|
||||
#define MSCCLPP_ENV_HPP_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
class Env;
|
||||
|
||||
/// Get the MSCCL++ environment.
|
||||
/// @return A shared pointer to the global environment object.
|
||||
std::shared_ptr<Env> env();
|
||||
|
||||
/// The MSCCL++ environment. The constructor reads environment variables and sets the corresponding fields.
|
||||
/// Use the @ref env() function to get the environment object.
///
/// Every field is `const` and is filled in by the private constructor. String fields are empty when the
/// corresponding environment variable is not set.
|
||||
class Env {
|
||||
public:
|
||||
/// Value of `MSCCLPP_DEBUG`.
const std::string debug;
|
||||
/// Value of `MSCCLPP_DEBUG_SUBSYS`.
const std::string debugSubsys;
|
||||
/// Value of `MSCCLPP_DEBUG_FILE`.
const std::string debugFile;
|
||||
/// Value of `MSCCLPP_HCA_DEVICES`.
const std::string hcaDevices;
|
||||
/// Value of `MSCCLPP_HOSTID`.
const std::string hostid;
|
||||
/// Value of `MSCCLPP_SOCKET_FAMILY`.
const std::string socketFamily;
|
||||
/// Value of `MSCCLPP_SOCKET_IFNAME`.
const std::string socketIfname;
|
||||
/// Value of `MSCCLPP_COMM_ID`.
const std::string commId;
|
||||
/// Value of `MSCCLPP_EXECUTION_PLAN_DIR`.
const std::string executionPlanDir;
|
||||
/// Value of `MSCCLPP_NPKIT_DUMP_DIR`.
const std::string npkitDumpDir;
|
||||
/// Parsed from `MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM`: false when unset, otherwise true unless the value is "0".
const bool cudaIpcUseDefaultStream;
|
||||
|
||||
private:
|
||||
// Private so that instances are only created through env(), which is declared a friend below.
Env();
|
||||
|
||||
friend std::shared_ptr<Env> env();
|
||||
};
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
#endif // MSCCLPP_ENV_HPP_
|
||||
@@ -11,6 +11,7 @@ version = "0.6.0"
|
||||
|
||||
[tool.scikit-build]
|
||||
cmake.version = ">=3.25.0"
|
||||
cmake.build-type = "Release"
|
||||
build-dir = "build/{wheel_tag}"
|
||||
wheel.packages = ["python/mscclpp", "python/mscclpp_benchmark"]
|
||||
wheel.install-dir = "mscclpp"
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
import os as _os
|
||||
|
||||
from ._mscclpp import (
|
||||
Env,
|
||||
ErrorCode,
|
||||
BaseError,
|
||||
Error,
|
||||
@@ -32,6 +33,7 @@ from ._mscclpp import (
|
||||
ExecutionPlan,
|
||||
PacketType,
|
||||
RawGpuBuffer,
|
||||
env,
|
||||
version,
|
||||
is_nvls_supported,
|
||||
npkit,
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
namespace nb = nanobind;
|
||||
using namespace mscclpp;
|
||||
|
||||
extern void register_env(nb::module_& m);
|
||||
extern void register_error(nb::module_& m);
|
||||
extern void register_proxy_channel(nb::module_& m);
|
||||
extern void register_sm_channel(nb::module_& m);
|
||||
@@ -184,6 +185,7 @@ void register_core(nb::module_& m) {
|
||||
}
|
||||
|
||||
NB_MODULE(_mscclpp, m) {
|
||||
register_env(m);
|
||||
register_error(m);
|
||||
register_proxy_channel(m);
|
||||
register_sm_channel(m);
|
||||
|
||||
28
python/mscclpp/env_py.cpp
Normal file
28
python/mscclpp/env_py.cpp
Normal file
@@ -0,0 +1,28 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <nanobind/nanobind.h>
|
||||
#include <nanobind/stl/shared_ptr.h>
|
||||
#include <nanobind/stl/string.h>
|
||||
|
||||
#include <mscclpp/env.hpp>
|
||||
|
||||
namespace nb = nanobind;
|
||||
using namespace mscclpp;
|
||||
|
||||
/// Register the `Env` class and the `env()` accessor on the given nanobind module.
/// Every `Env` field is exposed as a read-only attribute under its snake_case name.
void register_env(nb::module_& m) {
  nb::class_<Env> envClass(m, "Env");
  envClass.def_ro("debug", &Env::debug);
  envClass.def_ro("debug_subsys", &Env::debugSubsys);
  envClass.def_ro("debug_file", &Env::debugFile);
  envClass.def_ro("hca_devices", &Env::hcaDevices);
  envClass.def_ro("hostid", &Env::hostid);
  envClass.def_ro("socket_family", &Env::socketFamily);
  envClass.def_ro("socket_ifname", &Env::socketIfname);
  envClass.def_ro("comm_id", &Env::commId);
  envClass.def_ro("execution_plan_dir", &Env::executionPlanDir);
  envClass.def_ro("npkit_dump_dir", &Env::npkitDumpDir);
  envClass.def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream);

  // Module-level accessor returning the shared environment object.
  m.def("env", &env);
}
|
||||
@@ -8,6 +8,7 @@ from mscclpp import (
|
||||
ExecutionPlan,
|
||||
PacketType,
|
||||
npkit,
|
||||
env,
|
||||
)
|
||||
import mscclpp.comm as mscclpp_comm
|
||||
from mscclpp.utils import KernelBuilder, GpuBuffer, pack
|
||||
@@ -171,8 +172,8 @@ def main(
|
||||
mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
|
||||
cp.cuda.Device(mscclpp_group.my_rank % mscclpp_group.nranks_per_node).use()
|
||||
executor = Executor(mscclpp_group.communicator)
|
||||
npkit_dump_dir = os.getenv("NPKIT_DUMP_DIR")
|
||||
if npkit_dump_dir is not None:
|
||||
npkit_dump_dir = env().npkit_dump_dir
|
||||
if npkit_dump_dir != "":
|
||||
npkit.init(mscclpp_group.my_rank)
|
||||
execution_plan = ExecutionPlan(execution_plan_path)
|
||||
collective = execution_plan.collective()
|
||||
|
||||
@@ -27,6 +27,7 @@ from mscclpp import (
|
||||
Transport,
|
||||
is_nvls_supported,
|
||||
npkit,
|
||||
env,
|
||||
)
|
||||
import mscclpp.comm as mscclpp_comm
|
||||
from mscclpp.utils import KernelBuilder, GpuBuffer, pack
|
||||
@@ -36,6 +37,19 @@ from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
|
||||
ethernet_interface_name = "eth0"
|
||||
|
||||
|
||||
@parametrize_mpi_groups(1)
def test_env(mpi_group: MpiGroup):
    """Check that `env()` exposes string fields, rejects assignment, and returns an equal object each call."""
    first = env()
    assert isinstance(first.debug, str)

    # all attributes should be read-only
    with pytest.raises(AttributeError):
        first.debug = "INFO"

    # should be the same object
    second = env()
    assert first == second
|
||||
|
||||
|
||||
def all_ranks_on_the_same_node(mpi_group: MpiGroup):
|
||||
if (ethernet_interface_name in ni.interfaces()) is False:
|
||||
pytest.skip(f"{ethernet_interface_name} is not an interface to use on this node")
|
||||
@@ -624,8 +638,8 @@ def test_executor(mpi_group: MpiGroup, filename: str):
|
||||
project_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
mscclpp_group = mscclpp_comm.CommGroup(mpi_group.comm)
|
||||
executor = Executor(mscclpp_group.communicator)
|
||||
npkit_dump_dir = os.getenv("NPKIT_DUMP_DIR")
|
||||
if npkit_dump_dir is not None:
|
||||
npkit_dump_dir = env().npkit_dump_dir
|
||||
if npkit_dump_dir != "":
|
||||
npkit.init(mscclpp_group.my_rank)
|
||||
execution_plan = ExecutionPlan(os.path.join(project_dir, "test", "execution-files", filename))
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cu)
|
||||
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cpp *.cu)
|
||||
target_sources(mscclpp_obj PRIVATE ${SOURCES})
|
||||
target_include_directories(mscclpp_obj PRIVATE include)
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include <unistd.h>
|
||||
|
||||
#include <fstream>
|
||||
#include <mscclpp/env.hpp>
|
||||
#include <mscclpp/errors.hpp>
|
||||
#include <mscclpp/utils.hpp>
|
||||
#include <sstream>
|
||||
@@ -64,14 +65,12 @@ static uint16_t socketToPort(union SocketAddress* addr) {
|
||||
/* Allow the user to force the IPv4/IPv6 interface selection */
|
||||
static int envSocketFamily(void) {
|
||||
int family = -1; // Family selection is not forced, will use first one found
|
||||
char* env = getenv("MSCCLPP_SOCKET_FAMILY");
|
||||
if (env == NULL) return family;
|
||||
const std::string& socketFamily = env()->socketFamily;
|
||||
if (socketFamily == "") return family;
|
||||
|
||||
INFO(MSCCLPP_ENV, "MSCCLPP_SOCKET_FAMILY set by environment to %s", env);
|
||||
|
||||
if (strcmp(env, "AF_INET") == 0)
|
||||
if (socketFamily == "AF_INET")
|
||||
family = AF_INET; // IPv4
|
||||
else if (strcmp(env, "AF_INET6") == 0)
|
||||
else if (socketFamily == "AF_INET6")
|
||||
family = AF_INET6; // IPv6
|
||||
return family;
|
||||
}
|
||||
@@ -306,27 +305,25 @@ int FindInterfaces(char* ifNames, union SocketAddress* ifAddrs, int ifNameMaxSiz
|
||||
// Allow user to force the INET socket family selection
|
||||
int sock_family = envSocketFamily();
|
||||
// User specified interface
|
||||
char* env = getenv("MSCCLPP_SOCKET_IFNAME");
|
||||
const std::string& socketIfname = env()->socketIfname;
|
||||
if (inputIfName) {
|
||||
INFO(MSCCLPP_NET, "using iterface %s", inputIfName);
|
||||
nIfs = findInterfaces(inputIfName, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
} else if (env && strlen(env) > 1) {
|
||||
INFO(MSCCLPP_ENV, "MSCCLPP_SOCKET_IFNAME set by environment to %s", env);
|
||||
} else if (socketIfname != "") {
|
||||
// Specified by user : find or fail
|
||||
if (shownIfName++ == 0) INFO(MSCCLPP_NET, "MSCCLPP_SOCKET_IFNAME set to %s", env);
|
||||
nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
if (shownIfName++ == 0) INFO(MSCCLPP_NET, "MSCCLPP_SOCKET_IFNAME set to %s", socketIfname.c_str());
|
||||
nIfs = findInterfaces(socketIfname.c_str(), ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
} else {
|
||||
// Try to automatically pick the right one
|
||||
// Start with IB
|
||||
nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
// else see if we can get some hint from COMM ID
|
||||
if (nIfs == 0) {
|
||||
char* commId = getenv("MSCCLPP_COMM_ID");
|
||||
if (commId && strlen(commId) > 1) {
|
||||
INFO(MSCCLPP_ENV, "MSCCLPP_COMM_ID set by environment to %s", commId);
|
||||
const std::string& commId = env()->commId;
|
||||
if (commId != "") {
|
||||
// Try to find interface that is in the same subnet as the IP in comm id
|
||||
union SocketAddress idAddr;
|
||||
SocketGetAddrFromString(&idAddr, commId);
|
||||
SocketGetAddrFromString(&idAddr, commId.c_str());
|
||||
nIfs = FindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
|
||||
}
|
||||
}
|
||||
|
||||
10
src/debug.cc
10
src/debug.cc
@@ -9,6 +9,7 @@
|
||||
#include <sys/syscall.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <mscclpp/env.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
#include <mscclpp/utils.hpp>
|
||||
#include <string>
|
||||
@@ -34,7 +35,7 @@ void mscclppDebugInit() {
|
||||
pthread_mutex_unlock(&mscclppDebugLock);
|
||||
return;
|
||||
}
|
||||
const char* mscclpp_debug = getenv("MSCCLPP_DEBUG");
|
||||
const char* mscclpp_debug = mscclpp::env()->debug.c_str();
|
||||
int tempNcclDebugLevel = -1;
|
||||
if (mscclpp_debug == NULL) {
|
||||
tempNcclDebugLevel = MSCCLPP_LOG_NONE;
|
||||
@@ -54,8 +55,9 @@ void mscclppDebugInit() {
|
||||
* This can be a comma separated list such as INIT,COLL
|
||||
* or ^INIT,COLL etc
|
||||
*/
|
||||
char* mscclppDebugSubsysEnv = getenv("MSCCLPP_DEBUG_SUBSYS");
|
||||
if (mscclppDebugSubsysEnv != NULL) {
|
||||
std::string mscclppDebugSubsysStr = mscclpp::env()->debugSubsys;
|
||||
const char* mscclppDebugSubsysEnv = mscclppDebugSubsysStr.c_str();
|
||||
if (mscclppDebugSubsysStr != "") {
|
||||
int invert = 0;
|
||||
if (mscclppDebugSubsysEnv[0] == '^') {
|
||||
invert = 1;
|
||||
@@ -108,7 +110,7 @@ void mscclppDebugInit() {
|
||||
* then create the debug file. But don't bother unless the
|
||||
* MSCCLPP_DEBUG level is > VERSION
|
||||
*/
|
||||
const char* mscclppDebugFileEnv = getenv("MSCCLPP_DEBUG_FILE");
|
||||
const char* mscclppDebugFileEnv = mscclpp::env()->debugFile.c_str();
|
||||
if (tempNcclDebugLevel > MSCCLPP_LOG_VERSION && mscclppDebugFileEnv != NULL) {
|
||||
int c = 0;
|
||||
char debugFn[PATH_MAX + 1] = "";
|
||||
|
||||
87
src/env.cpp
Normal file
87
src/env.cpp
Normal file
@@ -0,0 +1,87 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <type_traits>
|
||||
|
||||
// clang-format off
|
||||
#include <mscclpp/env.hpp>
|
||||
#include <mscclpp/errors.hpp>
|
||||
// clang-format on
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
/// Read an environment variable and convert it to `T`.
///
/// @tparam T Result type. `int` is parsed with `atoi`; `bool` is true unless the value is exactly "0";
///           any other type is constructed directly from the raw C string.
/// @param envName Name of the environment variable to read.
/// @param defaultValue Value returned when the variable is not set.
/// @return The converted value, or @p defaultValue when the variable is not set.
template <typename T>
T readEnv(const std::string &envName, const T &defaultValue) {
  const char *envCstr = getenv(envName.c_str());
  if (envCstr == nullptr) return defaultValue;
  if constexpr (std::is_same_v<T, int>) {
    return atoi(envCstr);
  } else if constexpr (std::is_same_v<T, bool>) {
    return (std::string(envCstr) != "0");
  } else {
    // Keep this inside the `else` branch: as an unconditional trailing statement it is also
    // instantiated for `T = int`, where `int(const char *)` is a pointer-to-integer cast that
    // does not compile on targets where pointers are wider than `int`.
    return T(envCstr);
  }
}
|
||||
|
||||
/// Overwrite `env` with the value of the environment variable `envName`, if that variable is set.
///
/// @tparam T Type of the destination. `int` is parsed with `atoi`; `bool` becomes true unless the value
///           is exactly "0"; any other type is assigned a `std::string` built from the raw value.
/// @param envName Name of the environment variable to read.
/// @param env Destination; left untouched when the variable is not set.
template <typename T>
void readAndSetEnv(const std::string &envName, T &env) {
  if (const char *rawValue = getenv(envName.c_str()); rawValue != nullptr) {
    if constexpr (std::is_same_v<T, bool>) {
      env = (std::string(rawValue) != "0");
    } else if constexpr (std::is_same_v<T, int>) {
      env = atoi(rawValue);
    } else {
      env = std::string(rawValue);
    }
  }
}
|
||||
|
||||
template <typename T>
|
||||
void logEnv(const std::string &envName, const T &env) {
|
||||
if (!getenv(envName.c_str())) return;
|
||||
INFO(MSCCLPP_ENV, "%s=%d", envName.c_str(), env);
|
||||
}
|
||||
|
||||
template <>
|
||||
void logEnv(const std::string &envName, const std::string &env) {
|
||||
if (!getenv(envName.c_str())) return;
|
||||
INFO(MSCCLPP_ENV, "%s=%s", envName.c_str(), env.c_str());
|
||||
}
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
Env::Env()
|
||||
: debug(readEnv<std::string>("MSCCLPP_DEBUG", "")),
|
||||
debugSubsys(readEnv<std::string>("MSCCLPP_DEBUG_SUBSYS", "")),
|
||||
debugFile(readEnv<std::string>("MSCCLPP_DEBUG_FILE", "")),
|
||||
hcaDevices(readEnv<std::string>("MSCCLPP_HCA_DEVICES", "")),
|
||||
hostid(readEnv<std::string>("MSCCLPP_HOSTID", "")),
|
||||
socketFamily(readEnv<std::string>("MSCCLPP_SOCKET_FAMILY", "")),
|
||||
socketIfname(readEnv<std::string>("MSCCLPP_SOCKET_IFNAME", "")),
|
||||
commId(readEnv<std::string>("MSCCLPP_COMM_ID", "")),
|
||||
executionPlanDir(readEnv<std::string>("MSCCLPP_EXECUTION_PLAN_DIR", "")),
|
||||
npkitDumpDir(readEnv<std::string>("MSCCLPP_NPKIT_DUMP_DIR", "")),
|
||||
cudaIpcUseDefaultStream(readEnv<bool>("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", false)) {}
|
||||
|
||||
std::shared_ptr<Env> env() {
|
||||
static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
|
||||
static bool logged = false;
|
||||
if (!logged) {
|
||||
logged = true;
|
||||
// cannot log inside the constructor because of circular dependency
|
||||
logEnv("MSCCLPP_DEBUG", globalEnv->debug);
|
||||
logEnv("MSCCLPP_DEBUG_SUBSYS", globalEnv->debugSubsys);
|
||||
logEnv("MSCCLPP_DEBUG_FILE", globalEnv->debugFile);
|
||||
logEnv("MSCCLPP_HCA_DEVICES", globalEnv->hcaDevices);
|
||||
logEnv("MSCCLPP_HOSTID", globalEnv->hostid);
|
||||
logEnv("MSCCLPP_SOCKET_FAMILY", globalEnv->socketFamily);
|
||||
logEnv("MSCCLPP_SOCKET_IFNAME", globalEnv->socketIfname);
|
||||
logEnv("MSCCLPP_COMM_ID", globalEnv->commId);
|
||||
logEnv("MSCCLPP_EXECUTION_PLAN_DIR", globalEnv->executionPlanDir);
|
||||
logEnv("MSCCLPP_NPKIT_DUMP_DIR", globalEnv->npkitDumpDir);
|
||||
logEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", globalEnv->cudaIpcUseDefaultStream);
|
||||
}
|
||||
return globalEnv;
|
||||
}
|
||||
|
||||
} // namespace mscclpp
|
||||
18
src/ib.cc
18
src/ib.cc
@@ -9,6 +9,8 @@
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <mscclpp/env.hpp>
|
||||
#include <mscclpp/errors.hpp>
|
||||
#include <mscclpp/fifo.hpp>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
@@ -388,18 +390,16 @@ MSCCLPP_API_CPP int getIBDeviceCount() {
|
||||
}
|
||||
|
||||
std::string getHcaDevices(int deviceIndex) {
|
||||
const char* envValue = std::getenv("MSCCLPP_HCA_DEVICES");
|
||||
if (envValue) {
|
||||
std::string envStr = env()->hcaDevices;
|
||||
if (envStr != "") {
|
||||
std::vector<std::string> devices;
|
||||
std::string envStr(envValue);
|
||||
std::stringstream ss(envStr);
|
||||
std::string device;
|
||||
while (std::getline(ss, device, ',')) {
|
||||
devices.push_back(device);
|
||||
}
|
||||
if (deviceIndex >= (int)devices.size()) {
|
||||
throw std::invalid_argument("Not enough HCA devices are defined with MSCCLPP_HCA_DEVICES: " +
|
||||
std::string(envValue));
|
||||
throw Error("Not enough HCA devices are defined with MSCCLPP_HCA_DEVICES: " + envStr, ErrorCode::InvalidUsage);
|
||||
}
|
||||
return devices[deviceIndex];
|
||||
}
|
||||
@@ -434,7 +434,7 @@ MSCCLPP_API_CPP std::string getIBDeviceName(Transport ibTransport) {
|
||||
ibTransportIndex = 7;
|
||||
break;
|
||||
default:
|
||||
throw std::invalid_argument("Not an IB transport");
|
||||
throw Error("Not an IB transport", ErrorCode::InvalidUsage);
|
||||
}
|
||||
std::string userHcaDevice = getHcaDevices(ibTransportIndex);
|
||||
if (!userHcaDevice.empty()) {
|
||||
@@ -446,7 +446,7 @@ MSCCLPP_API_CPP std::string getIBDeviceName(Transport ibTransport) {
|
||||
if (ibTransportIndex >= num) {
|
||||
std::stringstream ss;
|
||||
ss << "IB transport out of range: " << ibTransportIndex << " >= " << num;
|
||||
throw std::out_of_range(ss.str());
|
||||
throw Error(ss.str(), ErrorCode::InvalidUsage);
|
||||
}
|
||||
return devices[ibTransportIndex]->name;
|
||||
}
|
||||
@@ -474,11 +474,11 @@ MSCCLPP_API_CPP Transport getIBTransportByDeviceName(const std::string& ibDevice
|
||||
case 7:
|
||||
return Transport::IB7;
|
||||
default:
|
||||
throw std::out_of_range("IB device index out of range");
|
||||
throw Error("IB device index out of range", ErrorCode::InvalidUsage);
|
||||
}
|
||||
}
|
||||
}
|
||||
throw std::invalid_argument("IB device not found");
|
||||
throw Error("IB device not found", ErrorCode::InvalidUsage);
|
||||
}
|
||||
|
||||
#else // !defined(USE_IBVERBS)
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <memory>
|
||||
#include <mscclpp/env.hpp>
|
||||
#include <mscclpp/errors.hpp>
|
||||
#include <string>
|
||||
|
||||
@@ -77,10 +78,9 @@ uint64_t computeHostHash(void) {
|
||||
std::string hostName = getHostName(hashLen, '\0');
|
||||
strncpy(hostHash, hostName.c_str(), hostName.size());
|
||||
|
||||
char* hostId;
|
||||
if ((hostId = getenv("MSCCLPP_HOSTID")) != NULL) {
|
||||
INFO(MSCCLPP_ENV, "MSCCLPP_HOSTID set by environment to %s", hostId);
|
||||
strncpy(hostHash, hostId, hashLen);
|
||||
std::string hostid = env()->hostid;
|
||||
if (hostid != "") {
|
||||
strncpy(hostHash, hostid.c_str(), hashLen);
|
||||
} else if (hostName.size() < hashLen) {
|
||||
std::ifstream file(HOSTID_FILE, std::ios::binary);
|
||||
if (file.is_open()) {
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#include <unistd.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <mscclpp/env.hpp>
|
||||
#include <mscclpp/executor.hpp>
|
||||
#include <mscclpp/npkit/npkit.hpp>
|
||||
#include <mscclpp/utils.hpp>
|
||||
@@ -109,7 +110,7 @@ int main(int argc, char* argv[]) {
|
||||
const std::string executionPlanPath = argv[2];
|
||||
const int niters = std::stoi(argv[3]);
|
||||
const int ngraphIters = std::stoi(argv[4]);
|
||||
const char* npkitDumpDir = getenv("NPKIT_DUMP_DIR");
|
||||
const char* npkitDumpDir = mscclpp::env()->npkitDumpDir.c_str();
|
||||
mscclpp::PacketType packetType = mscclpp::PacketType::LL16;
|
||||
if (argc == 6) {
|
||||
packetType = parsePacketType(argv[5]);
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include <mpi.h>
|
||||
|
||||
#include <filesystem>
|
||||
#include <mscclpp/env.hpp>
|
||||
#include <mscclpp/npkit/npkit.hpp>
|
||||
|
||||
#include "mp_unit_tests.hpp"
|
||||
@@ -31,14 +32,14 @@ void ExecutorTest::SetUp() {
|
||||
bootstrap->initialize(id);
|
||||
std::shared_ptr<mscclpp::Communicator> communicator = std::make_shared<mscclpp::Communicator>(bootstrap);
|
||||
executor = std::make_shared<mscclpp::Executor>(communicator);
|
||||
npkitDumpDir = getenv("NPKIT_DUMP_DIR");
|
||||
if (npkitDumpDir != nullptr) {
|
||||
npkitDumpDir = mscclpp::env()->npkitDumpDir;
|
||||
if (npkitDumpDir != "") {
|
||||
NpKit::Init(gEnv->rank);
|
||||
}
|
||||
}
|
||||
|
||||
void ExecutorTest::TearDown() {
|
||||
if (npkitDumpDir != nullptr) {
|
||||
if (npkitDumpDir != "") {
|
||||
NpKit::Dump(npkitDumpDir);
|
||||
NpKit::Shutdown();
|
||||
}
|
||||
|
||||
@@ -170,6 +170,6 @@ class ExecutorTest : public MultiProcessTest {
|
||||
void TearDown() override;
|
||||
|
||||
std::shared_ptr<mscclpp::Executor> executor;
|
||||
const char* npkitDumpDir;
|
||||
std::string npkitDumpDir;
|
||||
};
|
||||
#endif // MSCCLPP_MP_UNIT_TESTS_HPP_
|
||||
|
||||
@@ -17,6 +17,6 @@ parallel-ssh -h ${HOSTFILE} "mkdir -p ${NPKIT_RUN_DIR}/npkit_dump"
|
||||
parallel-ssh -h ${HOSTFILE} "mkdir -p ${NPKIT_RUN_DIR}/npkit_trace"
|
||||
|
||||
# --bind-to numa is required because hardware timer from different cores (or core groups) can be non-synchronized.
|
||||
mpirun --allow-run-as-root -hostfile ${HOSTFILE} -map-by ppr:8:node --bind-to numa -x LD_PRELOAD=${NPKIT_RUN_DIR}/mscclpp/build/lib/libmscclpp.so -x MSCCLPP_DEBUG=WARN -x NPKIT_DUMP_DIR=${NPKIT_RUN_DIR}/npkit_dump ${NPKIT_RUN_DIR}/mscclpp/build/bin/tests/allgather_test -ip_port ${LEADER_IP_PORT} -kernel 0
|
||||
mpirun --allow-run-as-root -hostfile ${HOSTFILE} -map-by ppr:8:node --bind-to numa -x LD_PRELOAD=${NPKIT_RUN_DIR}/mscclpp/build/lib/libmscclpp.so -x MSCCLPP_DEBUG=WARN -x MSCCLPP_NPKIT_DUMP_DIR=${NPKIT_RUN_DIR}/npkit_dump ${NPKIT_RUN_DIR}/mscclpp/build/bin/tests/allgather_test -ip_port ${LEADER_IP_PORT} -kernel 0
|
||||
|
||||
parallel-ssh -h ${HOSTFILE} "cd ${NPKIT_RUN_DIR}/mscclpp/tools/npkit && python npkit_trace_generator.py --npkit_dump_dir ${NPKIT_RUN_DIR}/npkit_dump --npkit_event_header_path ${NPKIT_RUN_DIR}/mscclpp/src/include/npkit/npkit_event.h --output_dir ${NPKIT_RUN_DIR}/npkit_trace"
|
||||
|
||||
Reference in New Issue
Block a user