Manage runtime environments (#452)

* Add `Env` class that manages all runtime environments.
* Rename `NPKIT_DUMP_DIR` to `MSCCLPP_NPKIT_DUMP_DIR`.
This commit is contained in:
Changho Hwang
2025-01-15 09:44:52 -08:00
committed by GitHub
parent 8ac50dc85d
commit 869cdba00c
19 changed files with 229 additions and 51 deletions

View File

@@ -1,6 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cu)
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cpp *.cu)
target_sources(mscclpp_obj PRIVATE ${SOURCES})
target_include_directories(mscclpp_obj PRIVATE include)

View File

@@ -12,6 +12,7 @@
#include <unistd.h>
#include <fstream>
#include <mscclpp/env.hpp>
#include <mscclpp/errors.hpp>
#include <mscclpp/utils.hpp>
#include <sstream>
@@ -64,14 +65,12 @@ static uint16_t socketToPort(union SocketAddress* addr) {
/* Allow the user to force the IPv4/IPv6 interface selection */
// Maps the MSCCLPP_SOCKET_FAMILY setting to a socket address family:
//   "AF_INET"  -> AF_INET, "AF_INET6" -> AF_INET6,
//   unset/empty or any other value -> -1 (no family is forced).
// NOTE(review): this span is a diff hunk rendered without +/- markers. The getenv()/strcmp() lines and the
// env()->socketFamily lines look like the before/after versions of the same statements — confirm against
// the actual file before relying on either variant.
static int envSocketFamily(void) {
int family = -1; // Family selection is not forced, will use first one found
char* env = getenv("MSCCLPP_SOCKET_FAMILY");
if (env == NULL) return family;
const std::string& socketFamily = env()->socketFamily;
// An empty string is treated the same as "not set".
if (socketFamily == "") return family;
INFO(MSCCLPP_ENV, "MSCCLPP_SOCKET_FAMILY set by environment to %s", env);
if (strcmp(env, "AF_INET") == 0)
if (socketFamily == "AF_INET")
family = AF_INET; // IPv4
else if (strcmp(env, "AF_INET6") == 0)
else if (socketFamily == "AF_INET6")
family = AF_INET6; // IPv6
return family;
}
@@ -306,27 +305,25 @@ int FindInterfaces(char* ifNames, union SocketAddress* ifAddrs, int ifNameMaxSiz
// Allow user to force the INET socket family selection
int sock_family = envSocketFamily();
// User specified interface
char* env = getenv("MSCCLPP_SOCKET_IFNAME");
const std::string& socketIfname = env()->socketIfname;
if (inputIfName) {
INFO(MSCCLPP_NET, "using iterface %s", inputIfName);
nIfs = findInterfaces(inputIfName, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
} else if (env && strlen(env) > 1) {
INFO(MSCCLPP_ENV, "MSCCLPP_SOCKET_IFNAME set by environment to %s", env);
} else if (socketIfname != "") {
// Specified by user : find or fail
if (shownIfName++ == 0) INFO(MSCCLPP_NET, "MSCCLPP_SOCKET_IFNAME set to %s", env);
nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
if (shownIfName++ == 0) INFO(MSCCLPP_NET, "MSCCLPP_SOCKET_IFNAME set to %s", socketIfname.c_str());
nIfs = findInterfaces(socketIfname.c_str(), ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
} else {
// Try to automatically pick the right one
// Start with IB
nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
// else see if we can get some hint from COMM ID
if (nIfs == 0) {
char* commId = getenv("MSCCLPP_COMM_ID");
if (commId && strlen(commId) > 1) {
INFO(MSCCLPP_ENV, "MSCCLPP_COMM_ID set by environment to %s", commId);
const std::string& commId = env()->commId;
if (commId != "") {
// Try to find interface that is in the same subnet as the IP in comm id
union SocketAddress idAddr;
SocketGetAddrFromString(&idAddr, commId);
SocketGetAddrFromString(&idAddr, commId.c_str());
nIfs = FindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
}
}

View File

@@ -9,6 +9,7 @@
#include <sys/syscall.h>
#include <unistd.h>
#include <mscclpp/env.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/utils.hpp>
#include <string>
@@ -34,7 +35,7 @@ void mscclppDebugInit() {
pthread_mutex_unlock(&mscclppDebugLock);
return;
}
const char* mscclpp_debug = getenv("MSCCLPP_DEBUG");
const char* mscclpp_debug = mscclpp::env()->debug.c_str();
int tempNcclDebugLevel = -1;
if (mscclpp_debug == NULL) {
tempNcclDebugLevel = MSCCLPP_LOG_NONE;
@@ -54,8 +55,9 @@ void mscclppDebugInit() {
* This can be a comma separated list such as INIT,COLL
* or ^INIT,COLL etc
*/
char* mscclppDebugSubsysEnv = getenv("MSCCLPP_DEBUG_SUBSYS");
if (mscclppDebugSubsysEnv != NULL) {
std::string mscclppDebugSubsysStr = mscclpp::env()->debugSubsys;
const char* mscclppDebugSubsysEnv = mscclppDebugSubsysStr.c_str();
if (mscclppDebugSubsysStr != "") {
int invert = 0;
if (mscclppDebugSubsysEnv[0] == '^') {
invert = 1;
@@ -108,7 +110,7 @@ void mscclppDebugInit() {
* then create the debug file. But don't bother unless the
* MSCCLPP_DEBUG level is > VERSION
*/
const char* mscclppDebugFileEnv = getenv("MSCCLPP_DEBUG_FILE");
const char* mscclppDebugFileEnv = mscclpp::env()->debugFile.c_str();
if (tempNcclDebugLevel > MSCCLPP_LOG_VERSION && mscclppDebugFileEnv != NULL) {
int c = 0;
char debugFn[PATH_MAX + 1] = "";

87
src/env.cpp Normal file
View File

@@ -0,0 +1,87 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <cstdlib>
#include <type_traits>
// clang-format off
#include <mscclpp/env.hpp>
#include <mscclpp/errors.hpp>
// clang-format on
#include "debug.h"
/// Read an environment variable and convert it to `T`.
///
/// @tparam T Target type. `int` values are parsed with `atoi`; `bool` is true unless the value is
///           exactly "0"; any other type is constructed directly from the C string.
/// @param envName Name of the environment variable to look up.
/// @param defaultValue Value returned when the variable is not set.
/// @return The converted value, or `defaultValue` if the variable is not set.
template <typename T>
T readEnv(const std::string &envName, const T &defaultValue) {
  const char *envCstr = std::getenv(envName.c_str());
  if (envCstr == nullptr) return defaultValue;
  if constexpr (std::is_same_v<T, int>) {
    return std::atoi(envCstr);
  } else if constexpr (std::is_same_v<T, bool>) {
    return (std::string(envCstr) != "0");
  } else {
    // Keep this in the final `else` so it is discarded for int/bool. Outside the chain it would be
    // instantiated for every T, and `int(const char*)` is an ill-formed pointer-to-int cast.
    return T(envCstr);
  }
}
/// Overwrite `env` with the value of an environment variable, if that variable is set.
///
/// @tparam T Type of the destination. `int` is parsed with `atoi`; `bool` becomes true unless the
///           value is exactly "0"; every other type is assigned a `std::string` holding the raw value.
/// @param envName Name of the environment variable to look up.
/// @param env Destination; left untouched when the variable is not set.
template <typename T>
void readAndSetEnv(const std::string &envName, T &env) {
  if (const char *raw = getenv(envName.c_str()); raw != nullptr) {
    if constexpr (std::is_same_v<T, bool>) {
      env = (std::string(raw) != "0");
    } else if constexpr (std::is_same_v<T, int>) {
      env = atoi(raw);
    } else {
      env = std::string(raw);
    }
  }
}
/// Log `NAME=value` for an environment variable, but only when that variable is set in the process
/// environment. The value is formatted with `%d`, so `T` must be something that can be passed for `%d`.
///
/// @param envName Name of the environment variable.
/// @param env Value to print.
template <typename T>
void logEnv(const std::string &envName, const T &env) {
  const bool isSet = (getenv(envName.c_str()) != nullptr);
  if (isSet) {
    INFO(MSCCLPP_ENV, "%s=%d", envName.c_str(), env);
  }
}
/// String specialization of `logEnv`: logs `NAME=value` with `%s`, and only when the variable is set
/// in the process environment.
///
/// @param envName Name of the environment variable.
/// @param env String value to print.
template <>
void logEnv(const std::string &envName, const std::string &env) {
  const bool isSet = (getenv(envName.c_str()) != nullptr);
  if (isSet) {
    INFO(MSCCLPP_ENV, "%s=%s", envName.c_str(), env.c_str());
  }
}
namespace mscclpp {
// Reads every supported MSCCLPP_* environment variable once, at construction time.
// Each string option falls back to "" and the boolean option to false when its variable is not set.
// Nothing is logged here; see the note in env() about the circular dependency.
Env::Env()
: debug(readEnv<std::string>("MSCCLPP_DEBUG", "")),
debugSubsys(readEnv<std::string>("MSCCLPP_DEBUG_SUBSYS", "")),
debugFile(readEnv<std::string>("MSCCLPP_DEBUG_FILE", "")),
hcaDevices(readEnv<std::string>("MSCCLPP_HCA_DEVICES", "")),
hostid(readEnv<std::string>("MSCCLPP_HOSTID", "")),
socketFamily(readEnv<std::string>("MSCCLPP_SOCKET_FAMILY", "")),
socketIfname(readEnv<std::string>("MSCCLPP_SOCKET_IFNAME", "")),
commId(readEnv<std::string>("MSCCLPP_COMM_ID", "")),
executionPlanDir(readEnv<std::string>("MSCCLPP_EXECUTION_PLAN_DIR", "")),
npkitDumpDir(readEnv<std::string>("MSCCLPP_NPKIT_DUMP_DIR", "")),
// readEnv<bool> yields true for any set value other than exactly "0".
cudaIpcUseDefaultStream(readEnv<bool>("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", false)) {}
// Returns the shared Env instance, creating it on the first call. The first call also logs every
// MSCCLPP_* variable that is set in the process environment.
std::shared_ptr<Env> env() {
  static std::shared_ptr<Env> instance(new Env());
  static bool reported = false;
  if (reported) return instance;

  // Set the flag before logging anything. Logging cannot happen inside the Env constructor because
  // of a circular dependency; presumably the logging path calls back into env(), so the flag must
  // already be set by then — TODO confirm.
  reported = true;
  logEnv("MSCCLPP_DEBUG", instance->debug);
  logEnv("MSCCLPP_DEBUG_SUBSYS", instance->debugSubsys);
  logEnv("MSCCLPP_DEBUG_FILE", instance->debugFile);
  logEnv("MSCCLPP_HCA_DEVICES", instance->hcaDevices);
  logEnv("MSCCLPP_HOSTID", instance->hostid);
  logEnv("MSCCLPP_SOCKET_FAMILY", instance->socketFamily);
  logEnv("MSCCLPP_SOCKET_IFNAME", instance->socketIfname);
  logEnv("MSCCLPP_COMM_ID", instance->commId);
  logEnv("MSCCLPP_EXECUTION_PLAN_DIR", instance->executionPlanDir);
  logEnv("MSCCLPP_NPKIT_DUMP_DIR", instance->npkitDumpDir);
  logEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", instance->cudaIpcUseDefaultStream);
  return instance;
}
} // namespace mscclpp

View File

@@ -9,6 +9,8 @@
#include <cstring>
#include <fstream>
#include <mscclpp/core.hpp>
#include <mscclpp/env.hpp>
#include <mscclpp/errors.hpp>
#include <mscclpp/fifo.hpp>
#include <sstream>
#include <string>
@@ -388,18 +390,16 @@ MSCCLPP_API_CPP int getIBDeviceCount() {
}
std::string getHcaDevices(int deviceIndex) {
const char* envValue = std::getenv("MSCCLPP_HCA_DEVICES");
if (envValue) {
std::string envStr = env()->hcaDevices;
if (envStr != "") {
std::vector<std::string> devices;
std::string envStr(envValue);
std::stringstream ss(envStr);
std::string device;
while (std::getline(ss, device, ',')) {
devices.push_back(device);
}
if (deviceIndex >= (int)devices.size()) {
throw std::invalid_argument("Not enough HCA devices are defined with MSCCLPP_HCA_DEVICES: " +
std::string(envValue));
throw Error("Not enough HCA devices are defined with MSCCLPP_HCA_DEVICES: " + envStr, ErrorCode::InvalidUsage);
}
return devices[deviceIndex];
}
@@ -434,7 +434,7 @@ MSCCLPP_API_CPP std::string getIBDeviceName(Transport ibTransport) {
ibTransportIndex = 7;
break;
default:
throw std::invalid_argument("Not an IB transport");
throw Error("Not an IB transport", ErrorCode::InvalidUsage);
}
std::string userHcaDevice = getHcaDevices(ibTransportIndex);
if (!userHcaDevice.empty()) {
@@ -446,7 +446,7 @@ MSCCLPP_API_CPP std::string getIBDeviceName(Transport ibTransport) {
if (ibTransportIndex >= num) {
std::stringstream ss;
ss << "IB transport out of range: " << ibTransportIndex << " >= " << num;
throw std::out_of_range(ss.str());
throw Error(ss.str(), ErrorCode::InvalidUsage);
}
return devices[ibTransportIndex]->name;
}
@@ -474,11 +474,11 @@ MSCCLPP_API_CPP Transport getIBTransportByDeviceName(const std::string& ibDevice
case 7:
return Transport::IB7;
default:
throw std::out_of_range("IB device index out of range");
throw Error("IB device index out of range", ErrorCode::InvalidUsage);
}
}
}
throw std::invalid_argument("IB device not found");
throw Error("IB device not found", ErrorCode::InvalidUsage);
}
#else // !defined(USE_IBVERBS)

View File

@@ -8,6 +8,7 @@
#include <cstring>
#include <fstream>
#include <memory>
#include <mscclpp/env.hpp>
#include <mscclpp/errors.hpp>
#include <string>
@@ -77,10 +78,9 @@ uint64_t computeHostHash(void) {
std::string hostName = getHostName(hashLen, '\0');
strncpy(hostHash, hostName.c_str(), hostName.size());
char* hostId;
if ((hostId = getenv("MSCCLPP_HOSTID")) != NULL) {
INFO(MSCCLPP_ENV, "MSCCLPP_HOSTID set by environment to %s", hostId);
strncpy(hostHash, hostId, hashLen);
std::string hostid = env()->hostid;
if (hostid != "") {
strncpy(hostHash, hostid.c_str(), hashLen);
} else if (hostName.size() < hashLen) {
std::ifstream file(HOSTID_FILE, std::ios::binary);
if (file.is_open()) {