mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-19 22:39:11 +00:00
link format correction
This commit is contained in:
@@ -12,8 +12,8 @@ namespace nb = nanobind;
|
||||
using namespace nb::literals;
|
||||
|
||||
// This is a poorman's substitute for std::format, which is a C++20 feature.
|
||||
template <typename... Args>
|
||||
std::string string_format(const std::string &format, Args... args) {
|
||||
template <typename... Args> std::string string_format(const std::string& format, Args... args)
|
||||
{
|
||||
// Shutup format warning.
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wformat-security"
|
||||
@@ -40,46 +40,50 @@ std::string string_format(const std::string &format, Args... args) {
|
||||
}
|
||||
|
||||
// Maybe return the value, maybe throw an exception.
|
||||
template <typename... Args>
|
||||
void checkResult(
|
||||
mscclppResult_t status, const std::string &format, Args... args) {
|
||||
template <typename... Args> void checkResult(mscclppResult_t status, const std::string& format, Args... args)
|
||||
{
|
||||
switch (status) {
|
||||
case mscclppSuccess:
|
||||
return;
|
||||
case mscclppSuccess:
|
||||
return;
|
||||
|
||||
case mscclppUnhandledCudaError:
|
||||
case mscclppSystemError:
|
||||
case mscclppInternalError:
|
||||
case mscclppRemoteError:
|
||||
case mscclppInProgress:
|
||||
case mscclppNumResults:
|
||||
throw std::runtime_error(string_format(format, args...));
|
||||
case mscclppUnhandledCudaError:
|
||||
case mscclppSystemError:
|
||||
case mscclppInternalError:
|
||||
case mscclppRemoteError:
|
||||
case mscclppInProgress:
|
||||
case mscclppNumResults:
|
||||
throw std::runtime_error(string_format(format, args...));
|
||||
|
||||
case mscclppInvalidArgument:
|
||||
case mscclppInvalidUsage:
|
||||
default:
|
||||
throw std::invalid_argument(string_format(format, args...));
|
||||
case mscclppInvalidArgument:
|
||||
case mscclppInvalidUsage:
|
||||
default:
|
||||
throw std::invalid_argument(string_format(format, args...));
|
||||
}
|
||||
}
|
||||
|
||||
// Maybe return the value, maybe throw an exception.
|
||||
template <typename Val, typename... Args>
|
||||
Val maybe(
|
||||
mscclppResult_t status, Val val, const std::string &format, Args... args) {
|
||||
Val maybe(mscclppResult_t status, Val val, const std::string& format, Args... args)
|
||||
{
|
||||
checkResult(status, format, args...);
|
||||
return val;
|
||||
}
|
||||
|
||||
// Wrapper around connection state.
|
||||
struct MscclppComm {
|
||||
struct MscclppComm
|
||||
{
|
||||
mscclppComm_t _handle;
|
||||
bool _is_open = false;
|
||||
|
||||
public:
|
||||
~MscclppComm() { close(); }
|
||||
public:
|
||||
~MscclppComm()
|
||||
{
|
||||
close();
|
||||
}
|
||||
|
||||
// Close should be safe to call on a closed handle.
|
||||
void close() {
|
||||
void close()
|
||||
{
|
||||
if (_is_open) {
|
||||
checkResult(mscclppCommDestroy(_handle), "Failed to close comm channel");
|
||||
_handle = 0;
|
||||
@@ -87,176 +91,116 @@ struct MscclppComm {
|
||||
}
|
||||
}
|
||||
|
||||
void check_open() {
|
||||
void check_open()
|
||||
{
|
||||
if (!_is_open) {
|
||||
throw std::invalid_argument("MscclppComm is not open");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static const std::string DOC_MscclppUniqueId =
|
||||
"MSCCLPP Unique Id; used by the MPI Interface";
|
||||
static const std::string DOC_MscclppUniqueId = "MSCCLPP Unique Id; used by the MPI Interface";
|
||||
|
||||
static const std::string DOC_MscclppComm = "MSCCLPP Communications Handle";
|
||||
|
||||
|
||||
NB_MODULE(_py_mscclpp, m) {
|
||||
NB_MODULE(_py_mscclpp, m)
|
||||
{
|
||||
m.doc() = "Python bindings for MSCCLPP: which is not NCCL";
|
||||
|
||||
m.attr("MSCCLPP_UNIQUE_ID_BYTES") = MSCCLPP_UNIQUE_ID_BYTES;
|
||||
|
||||
nb::class_<mscclppUniqueId>(m, "MscclppUniqueId")
|
||||
.def_ro_static("__doc__", &DOC_MscclppUniqueId)
|
||||
.def_static(
|
||||
"from_context",
|
||||
[]() {
|
||||
mscclppUniqueId uniqueId;
|
||||
return maybe(
|
||||
mscclppGetUniqueId(&uniqueId),
|
||||
uniqueId,
|
||||
"Failed to get MSCCLP Unique Id.");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>())
|
||||
.def_static(
|
||||
"from_bytes",
|
||||
[](nb::bytes source) {
|
||||
if (source.size() != MSCCLPP_UNIQUE_ID_BYTES) {
|
||||
throw std::invalid_argument(string_format(
|
||||
"Requires exactly %d bytes; found %d",
|
||||
MSCCLPP_UNIQUE_ID_BYTES,
|
||||
source.size()));
|
||||
}
|
||||
.def_ro_static("__doc__", &DOC_MscclppUniqueId)
|
||||
.def_static(
|
||||
"from_context",
|
||||
[]() {
|
||||
mscclppUniqueId uniqueId;
|
||||
return maybe(mscclppGetUniqueId(&uniqueId), uniqueId, "Failed to get MSCCLP Unique Id.");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>())
|
||||
.def_static("from_bytes",
|
||||
[](nb::bytes source) {
|
||||
if (source.size() != MSCCLPP_UNIQUE_ID_BYTES) {
|
||||
throw std::invalid_argument(
|
||||
string_format("Requires exactly %d bytes; found %d", MSCCLPP_UNIQUE_ID_BYTES, source.size()));
|
||||
}
|
||||
|
||||
mscclppUniqueId uniqueId;
|
||||
std::memcpy(
|
||||
uniqueId.internal, source.c_str(), sizeof(uniqueId.internal));
|
||||
return uniqueId;
|
||||
})
|
||||
.def("bytes", [](mscclppUniqueId id) {
|
||||
return nb::bytes(id.internal, sizeof(id.internal));
|
||||
});
|
||||
mscclppUniqueId uniqueId;
|
||||
std::memcpy(uniqueId.internal, source.c_str(), sizeof(uniqueId.internal));
|
||||
return uniqueId;
|
||||
})
|
||||
.def("bytes", [](mscclppUniqueId id) { return nb::bytes(id.internal, sizeof(id.internal)); });
|
||||
|
||||
nb::class_<MscclppComm>(m, "MscclppComm")
|
||||
.def_ro_static("__doc__", &DOC_MscclppComm)
|
||||
.def_static(
|
||||
"init_rank_from_address",
|
||||
[](const std::string &address, int rank, int world_size) {
|
||||
MscclppComm comm = {0};
|
||||
comm._is_open = true;
|
||||
return maybe(
|
||||
mscclppCommInitRank(
|
||||
&comm._handle, world_size, address.c_str(), rank),
|
||||
comm,
|
||||
"Failed to initialize comms: %s rank=%d world_size=%d",
|
||||
address,
|
||||
rank,
|
||||
world_size);
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(),
|
||||
"address"_a,
|
||||
"rank"_a,
|
||||
"world_size"_a,
|
||||
"Initialize comms given an IP address, rank, and world_size")
|
||||
.def_static(
|
||||
"init_rank_from_id",
|
||||
[](const mscclppUniqueId &id, int rank, int world_size) {
|
||||
MscclppComm comm = {0};
|
||||
comm._is_open = true;
|
||||
return maybe(
|
||||
mscclppCommInitRankFromId(&comm._handle, world_size, id, rank),
|
||||
comm,
|
||||
"Failed to initialize comms: %02X%s rank=%d world_size=%d",
|
||||
id.internal,
|
||||
rank,
|
||||
world_size);
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(),
|
||||
"id"_a,
|
||||
"rank"_a,
|
||||
"world_size"_a,
|
||||
"Initialize comms given u UniqueID, rank, and world_size")
|
||||
.def(
|
||||
"opened",
|
||||
[](MscclppComm &comm) { return comm._is_open; },
|
||||
"Is this comm object opened?")
|
||||
.def(
|
||||
"closed",
|
||||
[](MscclppComm &comm) { return !comm._is_open; },
|
||||
"Is this comm object closed?")
|
||||
.def(
|
||||
"rank",
|
||||
[](MscclppComm &comm) {
|
||||
comm.check_open();
|
||||
int rank;
|
||||
return maybe(
|
||||
mscclppCommRank(comm._handle, &rank),
|
||||
rank,
|
||||
"Failed to retrieve MSCCLPP rank");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(),
|
||||
"The rank of this node.")
|
||||
.def(
|
||||
"size",
|
||||
[](MscclppComm &comm) {
|
||||
comm.check_open();
|
||||
int size;
|
||||
return maybe(
|
||||
mscclppCommSize(comm._handle, &size),
|
||||
size,
|
||||
"Failed to retrieve MSCCLPP world size");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(),
|
||||
"The world size of this node.")
|
||||
.def(
|
||||
"connection_setup",
|
||||
[](MscclppComm &comm) {
|
||||
comm.check_open();
|
||||
return maybe(
|
||||
mscclppConnectionSetup(comm._handle),
|
||||
true,
|
||||
"Failed to settup MSCCLPP connection");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(),
|
||||
"Run connection setup for MSCCLPP.")
|
||||
.def(
|
||||
"launch_proxy",
|
||||
[](MscclppComm &comm) {
|
||||
comm.check_open();
|
||||
return maybe(
|
||||
mscclppProxyLaunch(comm._handle),
|
||||
true,
|
||||
"Failed to launch MSCCLPP proxy");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(),
|
||||
"Start the MSCCLPP proxy.")
|
||||
.def(
|
||||
"stop_proxy",
|
||||
[](MscclppComm &comm) {
|
||||
comm.check_open();
|
||||
return maybe(
|
||||
mscclppProxyStop(comm._handle),
|
||||
true,
|
||||
"Failed to stop MSCCLPP proxy");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(),
|
||||
"Start the MSCCLPP proxy.")
|
||||
.def(
|
||||
"close",
|
||||
&MscclppComm::close,
|
||||
nb::call_guard<nb::gil_scoped_release>())
|
||||
.def(
|
||||
"__del__",
|
||||
&MscclppComm::close,
|
||||
nb::call_guard<nb::gil_scoped_release>())
|
||||
.def(
|
||||
"bootstrap_all_gather",
|
||||
[](MscclppComm &comm, void *data, int size) {
|
||||
comm.check_open();
|
||||
return maybe(
|
||||
mscclppBootstrapAllGather(comm._handle, data, size),
|
||||
true,
|
||||
"Failed to stop MSCCLPP proxy");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>());
|
||||
|
||||
.def_ro_static("__doc__", &DOC_MscclppComm)
|
||||
.def_static(
|
||||
"init_rank_from_address",
|
||||
[](const std::string& address, int rank, int world_size) {
|
||||
MscclppComm comm = {0};
|
||||
comm._is_open = true;
|
||||
return maybe(mscclppCommInitRank(&comm._handle, world_size, address.c_str(), rank), comm,
|
||||
"Failed to initialize comms: %s rank=%d world_size=%d", address, rank, world_size);
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(), "address"_a, "rank"_a, "world_size"_a,
|
||||
"Initialize comms given an IP address, rank, and world_size")
|
||||
.def_static(
|
||||
"init_rank_from_id",
|
||||
[](const mscclppUniqueId& id, int rank, int world_size) {
|
||||
MscclppComm comm = {0};
|
||||
comm._is_open = true;
|
||||
return maybe(mscclppCommInitRankFromId(&comm._handle, world_size, id, rank), comm,
|
||||
"Failed to initialize comms: %02X%s rank=%d world_size=%d", id.internal, rank, world_size);
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(), "id"_a, "rank"_a, "world_size"_a,
|
||||
"Initialize comms given u UniqueID, rank, and world_size")
|
||||
.def(
|
||||
"opened", [](MscclppComm& comm) { return comm._is_open; }, "Is this comm object opened?")
|
||||
.def(
|
||||
"closed", [](MscclppComm& comm) { return !comm._is_open; }, "Is this comm object closed?")
|
||||
.def(
|
||||
"rank",
|
||||
[](MscclppComm& comm) {
|
||||
comm.check_open();
|
||||
int rank;
|
||||
return maybe(mscclppCommRank(comm._handle, &rank), rank, "Failed to retrieve MSCCLPP rank");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(), "The rank of this node.")
|
||||
.def(
|
||||
"size",
|
||||
[](MscclppComm& comm) {
|
||||
comm.check_open();
|
||||
int size;
|
||||
return maybe(mscclppCommSize(comm._handle, &size), size, "Failed to retrieve MSCCLPP world size");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(), "The world size of this node.")
|
||||
.def(
|
||||
"connection_setup",
|
||||
[](MscclppComm& comm) {
|
||||
comm.check_open();
|
||||
return maybe(mscclppConnectionSetup(comm._handle), true, "Failed to settup MSCCLPP connection");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(), "Run connection setup for MSCCLPP.")
|
||||
.def(
|
||||
"launch_proxy",
|
||||
[](MscclppComm& comm) {
|
||||
comm.check_open();
|
||||
return maybe(mscclppProxyLaunch(comm._handle), true, "Failed to launch MSCCLPP proxy");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(), "Start the MSCCLPP proxy.")
|
||||
.def(
|
||||
"stop_proxy",
|
||||
[](MscclppComm& comm) {
|
||||
comm.check_open();
|
||||
return maybe(mscclppProxyStop(comm._handle), true, "Failed to stop MSCCLPP proxy");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(), "Start the MSCCLPP proxy.")
|
||||
.def("close", &MscclppComm::close, nb::call_guard<nb::gil_scoped_release>())
|
||||
.def("__del__", &MscclppComm::close, nb::call_guard<nb::gil_scoped_release>())
|
||||
.def(
|
||||
"bootstrap_all_gather",
|
||||
[](MscclppComm& comm, void* data, int size) {
|
||||
comm.check_open();
|
||||
return maybe(mscclppBootstrapAllGather(comm._handle, data, size), true, "Failed to stop MSCCLPP proxy");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>());
|
||||
}
|
||||
|
||||
@@ -4,25 +4,27 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "mscclpp.h"
|
||||
#include "core.h"
|
||||
#include "utils.h"
|
||||
#include "bootstrap.h"
|
||||
#include <unistd.h>
|
||||
#include "core.h"
|
||||
#include "mscclpp.h"
|
||||
#include "utils.h"
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
struct bootstrapRootArgs {
|
||||
struct bootstrapRootArgs
|
||||
{
|
||||
struct mscclppSocket* listenSock;
|
||||
uint64_t magic;
|
||||
};
|
||||
|
||||
/* Init functions */
|
||||
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
|
||||
static char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1];
|
||||
static union mscclppSocketAddress bootstrapNetIfAddr;
|
||||
static int bootstrapNetInitDone = 0;
|
||||
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
mscclppResult_t bootstrapNetInit(const char* ip_port_pair) {
|
||||
mscclppResult_t bootstrapNetInit(const char* ip_port_pair)
|
||||
{
|
||||
if (bootstrapNetInitDone == 0) {
|
||||
pthread_mutex_lock(&bootstrapNetLock);
|
||||
if (bootstrapNetInitDone == 0) {
|
||||
@@ -38,7 +40,8 @@ mscclppResult_t bootstrapNetInit(const char* ip_port_pair) {
|
||||
WARN("Invalid MSCCLPP_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
|
||||
return mscclppInvalidArgument;
|
||||
}
|
||||
if (mscclppFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
|
||||
if (mscclppFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE,
|
||||
1) <= 0) {
|
||||
WARN("NET/Socket : No usable listening interface found");
|
||||
return mscclppSystemError;
|
||||
}
|
||||
@@ -49,9 +52,9 @@ mscclppResult_t bootstrapNetInit(const char* ip_port_pair) {
|
||||
return mscclppInternalError;
|
||||
}
|
||||
}
|
||||
char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2];
|
||||
char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2];
|
||||
sprintf(line, " %s:", bootstrapNetIfName);
|
||||
mscclppSocketToString(&bootstrapNetIfAddr, line+strlen(line));
|
||||
mscclppSocketToString(&bootstrapNetIfAddr, line + strlen(line));
|
||||
INFO(MSCCLPP_INIT, "Bootstrap : Using%s", line);
|
||||
bootstrapNetInitDone = 1;
|
||||
}
|
||||
@@ -61,15 +64,21 @@ mscclppResult_t bootstrapNetInit(const char* ip_port_pair) {
|
||||
}
|
||||
|
||||
/* Socket Interface Selection type */
|
||||
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
|
||||
enum bootstrapInterface_t
|
||||
{
|
||||
findSubnetIf = -1,
|
||||
dontCareIf = -2
|
||||
};
|
||||
|
||||
// Additional sync functions
|
||||
static mscclppResult_t bootstrapNetSend(struct mscclppSocket* sock, void* data, int size) {
|
||||
static mscclppResult_t bootstrapNetSend(struct mscclppSocket* sock, void* data, int size)
|
||||
{
|
||||
MSCCLPPCHECK(mscclppSocketSend(sock, &size, sizeof(int)));
|
||||
MSCCLPPCHECK(mscclppSocketSend(sock, data, size));
|
||||
return mscclppSuccess;
|
||||
}
|
||||
static mscclppResult_t bootstrapNetRecv(struct mscclppSocket* sock, void* data, int size) {
|
||||
static mscclppResult_t bootstrapNetRecv(struct mscclppSocket* sock, void* data, int size)
|
||||
{
|
||||
int recvSize;
|
||||
MSCCLPPCHECK(mscclppSocketRecv(sock, &recvSize, sizeof(int)));
|
||||
if (recvSize > size) {
|
||||
@@ -80,7 +89,8 @@ static mscclppResult_t bootstrapNetRecv(struct mscclppSocket* sock, void* data,
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
struct extInfo {
|
||||
struct extInfo
|
||||
{
|
||||
int rank;
|
||||
int nranks;
|
||||
union mscclppSocketAddress extAddressListenRoot;
|
||||
@@ -89,7 +99,8 @@ struct extInfo {
|
||||
|
||||
#include <sys/resource.h>
|
||||
|
||||
static mscclppResult_t setFilesLimit() {
|
||||
static mscclppResult_t setFilesLimit()
|
||||
{
|
||||
struct rlimit filesLimit;
|
||||
SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit");
|
||||
filesLimit.rlim_cur = filesLimit.rlim_max;
|
||||
@@ -97,16 +108,17 @@ static mscclppResult_t setFilesLimit() {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
static void *bootstrapRoot(void* rargs) {
|
||||
static void* bootstrapRoot(void* rargs)
|
||||
{
|
||||
struct bootstrapRootArgs* args = (struct bootstrapRootArgs*)rargs;
|
||||
struct mscclppSocket* listenSock = args->listenSock;
|
||||
uint64_t magic = args->magic;
|
||||
mscclppResult_t res = mscclppSuccess;
|
||||
int nranks = 0, c = 0;
|
||||
struct extInfo info;
|
||||
union mscclppSocketAddress *rankAddresses = NULL;
|
||||
union mscclppSocketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
|
||||
union mscclppSocketAddress *zero = NULL;
|
||||
union mscclppSocketAddress* rankAddresses = NULL;
|
||||
union mscclppSocketAddress* rankAddressesRoot = NULL; // for initial rank <-> root information exchange
|
||||
union mscclppSocketAddress* zero = NULL;
|
||||
MSCCLPPCHECKGOTO(mscclppCalloc(&zero, 1), res, out);
|
||||
setFilesLimit();
|
||||
|
||||
@@ -136,21 +148,21 @@ static void *bootstrapRoot(void* rargs) {
|
||||
}
|
||||
|
||||
// Save the connection handle for that rank
|
||||
memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union mscclppSocketAddress));
|
||||
memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union mscclppSocketAddress));
|
||||
memcpy(rankAddressesRoot + info.rank, &info.extAddressListenRoot, sizeof(union mscclppSocketAddress));
|
||||
memcpy(rankAddresses + info.rank, &info.extAddressListen, sizeof(union mscclppSocketAddress));
|
||||
|
||||
++c;
|
||||
TRACE(MSCCLPP_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks);
|
||||
TRACE(MSCCLPP_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks);
|
||||
} while (c < nranks);
|
||||
TRACE(MSCCLPP_INIT, "COLLECTED ALL %d HANDLES", nranks);
|
||||
|
||||
// Send the connect handle for the next rank in the AllGather ring
|
||||
for (int r=0; r<nranks; ++r) {
|
||||
int next = (r+1) % nranks;
|
||||
for (int r = 0; r < nranks; ++r) {
|
||||
int next = (r + 1) % nranks;
|
||||
struct mscclppSocket sock;
|
||||
MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, rankAddressesRoot+r, magic, mscclppSocketTypeBootstrap), res, out);
|
||||
MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, rankAddressesRoot + r, magic, mscclppSocketTypeBootstrap), res, out);
|
||||
MSCCLPPCHECKGOTO(mscclppSocketConnect(&sock), res, out);
|
||||
MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union mscclppSocketAddress)), res, out);
|
||||
MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, rankAddresses + next, sizeof(union mscclppSocketAddress)), res, out);
|
||||
MSCCLPPCHECKGOTO(mscclppSocketClose(&sock), res, out);
|
||||
}
|
||||
TRACE(MSCCLPP_INIT, "SENT OUT ALL %d HANDLES", nranks);
|
||||
@@ -160,16 +172,20 @@ out:
|
||||
mscclppSocketClose(listenSock);
|
||||
free(listenSock);
|
||||
}
|
||||
if (rankAddresses) free(rankAddresses);
|
||||
if (rankAddressesRoot) free(rankAddressesRoot);
|
||||
if (zero) free(zero);
|
||||
if (rankAddresses)
|
||||
free(rankAddresses);
|
||||
if (rankAddressesRoot)
|
||||
free(rankAddressesRoot);
|
||||
if (zero)
|
||||
free(zero);
|
||||
free(rargs);
|
||||
|
||||
TRACE(MSCCLPP_INIT, "DONE");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle) {
|
||||
mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle)
|
||||
{
|
||||
struct mscclppSocket* listenSock;
|
||||
struct bootstrapRootArgs* args;
|
||||
pthread_t thread;
|
||||
@@ -191,7 +207,8 @@ mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle) {
|
||||
// #include <netinet/in.h>
|
||||
// #include <arpa/inet.h>
|
||||
|
||||
mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot, const char* ip_port_pair) {
|
||||
mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot, const char* ip_port_pair)
|
||||
{
|
||||
memset(handle, 0, sizeof(mscclppBootstrapHandle));
|
||||
const char* env = NULL;
|
||||
if (ip_port_pair) {
|
||||
@@ -220,14 +237,16 @@ mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
struct unexConn {
|
||||
struct unexConn
|
||||
{
|
||||
int peer;
|
||||
int tag;
|
||||
struct mscclppSocket sock;
|
||||
struct unexConn* next;
|
||||
};
|
||||
|
||||
struct bootstrapState {
|
||||
struct bootstrapState
|
||||
{
|
||||
struct mscclppSocket listenSock;
|
||||
struct mscclppSocket ringRecvSocket;
|
||||
struct mscclppSocket ringSendSocket;
|
||||
@@ -238,10 +257,11 @@ struct bootstrapState {
|
||||
int rank;
|
||||
int nranks;
|
||||
uint64_t magic;
|
||||
volatile uint32_t *abortFlag;
|
||||
volatile uint32_t* abortFlag;
|
||||
};
|
||||
|
||||
mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscclppComm* comm) {
|
||||
mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscclppComm* comm)
|
||||
{
|
||||
int rank = comm->rank;
|
||||
int nranks = comm->nRanks;
|
||||
struct bootstrapState* state;
|
||||
@@ -262,12 +282,14 @@ mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscc
|
||||
info.rank = rank;
|
||||
info.nranks = nranks;
|
||||
// Create socket for other ranks to contact me
|
||||
MSCCLPPCHECK(mscclppSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag));
|
||||
MSCCLPPCHECK(mscclppSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeBootstrap,
|
||||
comm->abortFlag));
|
||||
MSCCLPPCHECK(mscclppSocketListen(&state->listenSock));
|
||||
MSCCLPPCHECK(mscclppSocketGetAddr(&state->listenSock, &info.extAddressListen));
|
||||
|
||||
// Create socket for root to contact me
|
||||
MSCCLPPCHECK(mscclppSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag));
|
||||
MSCCLPPCHECK(
|
||||
mscclppSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag));
|
||||
MSCCLPPCHECK(mscclppSocketListen(&listenSockRoot));
|
||||
MSCCLPPCHECK(mscclppSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot));
|
||||
|
||||
@@ -278,7 +300,7 @@ mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscc
|
||||
tv.tv_sec = msec / 1000;
|
||||
tv.tv_nsec = 1000000 * (msec % 1000);
|
||||
TRACE(MSCCLPP_INIT, "rank %d delaying connection to root by %ld msec", rank, msec);
|
||||
(void) nanosleep(&tv, NULL);
|
||||
(void)nanosleep(&tv, NULL);
|
||||
}
|
||||
|
||||
// send info on my listening socket to root
|
||||
@@ -294,7 +316,8 @@ mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscc
|
||||
MSCCLPPCHECK(mscclppSocketClose(&sock));
|
||||
MSCCLPPCHECK(mscclppSocketClose(&listenSockRoot));
|
||||
|
||||
MSCCLPPCHECK(mscclppSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag));
|
||||
MSCCLPPCHECK(
|
||||
mscclppSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag));
|
||||
MSCCLPPCHECK(mscclppSocketConnect(&state->ringSendSocket));
|
||||
// Accept the connect request from the previous rank in the AllGather ring
|
||||
MSCCLPPCHECK(mscclppSocketInit(&state->ringRecvSocket));
|
||||
@@ -302,7 +325,7 @@ mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscc
|
||||
|
||||
// AllGather all listen handlers
|
||||
MSCCLPPCHECK(mscclppCalloc(&state->peerCommAddresses, nranks));
|
||||
MSCCLPPCHECK(mscclppSocketGetAddr(&state->listenSock, state->peerCommAddresses+rank));
|
||||
MSCCLPPCHECK(mscclppSocketGetAddr(&state->listenSock, state->peerCommAddresses + rank));
|
||||
MSCCLPPCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union mscclppSocketAddress)));
|
||||
|
||||
// Create the service proxy
|
||||
@@ -310,9 +333,10 @@ mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscc
|
||||
|
||||
// proxy is aborted through a message; don't set abortFlag
|
||||
MSCCLPPCHECK(mscclppCalloc(&proxySocket, 1));
|
||||
MSCCLPPCHECK(mscclppSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeProxy, comm->abortFlag));
|
||||
MSCCLPPCHECK(
|
||||
mscclppSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeProxy, comm->abortFlag));
|
||||
MSCCLPPCHECK(mscclppSocketListen(proxySocket));
|
||||
MSCCLPPCHECK(mscclppSocketGetAddr(proxySocket, state->peerProxyAddresses+rank));
|
||||
MSCCLPPCHECK(mscclppSocketGetAddr(proxySocket, state->peerProxyAddresses + rank));
|
||||
MSCCLPPCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union mscclppSocketAddress)));
|
||||
// MSCCLPPCHECK(mscclppProxyInit(comm, proxySocket, state->peerProxyAddresses));
|
||||
|
||||
@@ -321,7 +345,8 @@ mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscc
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size) {
|
||||
mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size)
|
||||
{
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
char* data = (char*)allData;
|
||||
int rank = state->rank;
|
||||
@@ -333,26 +358,29 @@ mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size) {
|
||||
* At each step i receive data from (rank-i-1) from left
|
||||
* and send previous step's data from (rank-i) to right
|
||||
*/
|
||||
for (int i=0; i<nranks-1; i++) {
|
||||
for (int i = 0; i < nranks - 1; i++) {
|
||||
size_t rslice = (rank - i - 1 + nranks) % nranks;
|
||||
size_t sslice = (rank - i + nranks) % nranks;
|
||||
|
||||
// Send slice to the right
|
||||
MSCCLPPCHECK(bootstrapNetSend(&state->ringSendSocket, data+sslice*size, size));
|
||||
MSCCLPPCHECK(bootstrapNetSend(&state->ringSendSocket, data + sslice * size, size));
|
||||
// Recv slice from the left
|
||||
MSCCLPPCHECK(bootstrapNetRecv(&state->ringRecvSocket, data+rslice*size, size));
|
||||
MSCCLPPCHECK(bootstrapNetRecv(&state->ringRecvSocket, data + rslice * size, size));
|
||||
}
|
||||
|
||||
TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
|
||||
mscclppResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size)
|
||||
{
|
||||
mscclppResult_t ret = mscclppSuccess;
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
struct mscclppSocket sock;
|
||||
|
||||
MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, state->peerCommAddresses+peer, state->magic, mscclppSocketTypeBootstrap, state->abortFlag), ret, fail);
|
||||
MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, state->peerCommAddresses + peer, state->magic, mscclppSocketTypeBootstrap,
|
||||
state->abortFlag),
|
||||
ret, fail);
|
||||
MSCCLPPCHECKGOTO(mscclppSocketConnect(&sock), ret, fail);
|
||||
MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail);
|
||||
MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail);
|
||||
@@ -365,8 +393,10 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
mscclppResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag) {
|
||||
if (nranks == 1) return mscclppSuccess;
|
||||
mscclppResult_t bootstrapBarrier(void* commState, int* ranks, int rank, int nranks, int tag)
|
||||
{
|
||||
if (nranks == 1)
|
||||
return mscclppSuccess;
|
||||
TRACE(MSCCLPP_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag);
|
||||
|
||||
/* Simple intra process barrier
|
||||
@@ -375,7 +405,7 @@ mscclppResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nran
|
||||
* "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988"
|
||||
*/
|
||||
int data[1];
|
||||
for (int mask=1; mask<nranks; mask<<=1) {
|
||||
for (int mask = 1; mask < nranks; mask <<= 1) {
|
||||
int src = (rank - mask + nranks) % nranks;
|
||||
int dst = (rank + mask) % nranks;
|
||||
MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], tag, data, sizeof(data)));
|
||||
@@ -386,23 +416,26 @@ mscclppResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nran
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size) {
|
||||
if (nranks == 1) return mscclppSuccess;
|
||||
mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int* ranks, int rank, int nranks, void* allData, int size)
|
||||
{
|
||||
if (nranks == 1)
|
||||
return mscclppSuccess;
|
||||
char* data = (char*)allData;
|
||||
TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - ENTER", rank, nranks, size);
|
||||
|
||||
for (int i=1; i<nranks; i++) {
|
||||
for (int i = 1; i < nranks; i++) {
|
||||
int src = (rank - i + nranks) % nranks;
|
||||
int dst = (rank + i) % nranks;
|
||||
MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], /*tag=*/i, data+rank*size, size));
|
||||
MSCCLPPCHECK(bootstrapRecv(commState, ranks[src], /*tag=*/i, data+src*size, size));
|
||||
MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], /*tag=*/i, data + rank * size, size));
|
||||
MSCCLPPCHECK(bootstrapRecv(commState, ranks[src], /*tag=*/i, data + src * size, size));
|
||||
}
|
||||
|
||||
TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock) {
|
||||
mscclppResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock)
|
||||
{
|
||||
// New unex
|
||||
struct unexConn* unex;
|
||||
MSCCLPPCHECK(mscclppCalloc(&unex, 1));
|
||||
@@ -416,12 +449,15 @@ mscclppResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int ta
|
||||
state->unexpectedConnections = unex;
|
||||
return mscclppSuccess;
|
||||
}
|
||||
while (list->next) list = list->next;
|
||||
while (list->next)
|
||||
list = list->next;
|
||||
list->next = unex;
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock, int* found) {
|
||||
mscclppResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock,
|
||||
int* found)
|
||||
{
|
||||
struct unexConn* elem = state->unexpectedConnections;
|
||||
struct unexConn* prev = NULL;
|
||||
*found = 0;
|
||||
@@ -443,7 +479,8 @@ mscclppResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int ta
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
static void unexpectedFree(struct bootstrapState* state) {
|
||||
static void unexpectedFree(struct bootstrapState* state)
|
||||
{
|
||||
struct unexConn* elem = state->unexpectedConnections;
|
||||
struct unexConn* prev = NULL;
|
||||
|
||||
@@ -456,7 +493,8 @@ static void unexpectedFree(struct bootstrapState* state) {
|
||||
}
|
||||
|
||||
// We can't know who we'll receive from, so we need to receive everything at once
|
||||
mscclppResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
|
||||
mscclppResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size)
|
||||
{
|
||||
mscclppResult_t ret = mscclppSuccess;
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
struct mscclppSocket sock;
|
||||
@@ -490,7 +528,8 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
mscclppResult_t bootstrapClose(void* commState) {
|
||||
mscclppResult_t bootstrapClose(void* commState)
|
||||
{
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
if (state->unexpectedConnections != NULL) {
|
||||
unexpectedFree(state);
|
||||
@@ -510,9 +549,11 @@ mscclppResult_t bootstrapClose(void* commState) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t bootstrapAbort(void* commState) {
|
||||
mscclppResult_t bootstrapAbort(void* commState)
|
||||
{
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
if (commState == NULL) return mscclppSuccess;
|
||||
if (commState == NULL)
|
||||
return mscclppSuccess;
|
||||
MSCCLPPCHECK(mscclppSocketClose(&state->listenSock));
|
||||
MSCCLPPCHECK(mscclppSocketClose(&state->ringSendSocket));
|
||||
MSCCLPPCHECK(mscclppSocketClose(&state->ringRecvSocket));
|
||||
|
||||
@@ -8,25 +8,30 @@
|
||||
#include "utils.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <unistd.h>
|
||||
#include <ifaddrs.h>
|
||||
#include <net/if.h>
|
||||
#include <unistd.h>
|
||||
|
||||
static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
|
||||
static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset,
|
||||
int block, int* closed)
|
||||
{
|
||||
int bytes = 0;
|
||||
*closed = 0;
|
||||
char* data = (char*)ptr;
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
char line[SOCKET_NAME_MAXLEN + 1];
|
||||
do {
|
||||
if (op == MSCCLPP_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
|
||||
if (op == MSCCLPP_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
|
||||
if (op == MSCCLPP_SOCKET_RECV)
|
||||
bytes = recv(sock->fd, data + (*offset), size - (*offset), block ? 0 : MSG_DONTWAIT);
|
||||
if (op == MSCCLPP_SOCKET_SEND)
|
||||
bytes = send(sock->fd, data + (*offset), size - (*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
|
||||
if (op == MSCCLPP_SOCKET_RECV && bytes == 0) {
|
||||
*closed = 1;
|
||||
return mscclppSuccess;
|
||||
}
|
||||
if (bytes == -1) {
|
||||
if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
|
||||
WARN("socketProgressOpt: Call to recv from %s failed : %s", mscclppSocketToString(&sock->addr, line), strerror(errno));
|
||||
WARN("socketProgressOpt: Call to recv from %s failed : %s", mscclppSocketToString(&sock->addr, line),
|
||||
strerror(errno));
|
||||
return mscclppRemoteError;
|
||||
} else {
|
||||
bytes = 0;
|
||||
@@ -41,18 +46,20 @@ static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, voi
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
static mscclppResult_t socketProgress(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset) {
|
||||
static mscclppResult_t socketProgress(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset)
|
||||
{
|
||||
int closed;
|
||||
MSCCLPPCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
|
||||
if (closed) {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
char line[SOCKET_NAME_MAXLEN + 1];
|
||||
WARN("socketProgress: Connection closed by remote peer %s", mscclppSocketToString(&sock->addr, line, 0));
|
||||
return mscclppRemoteError;
|
||||
}
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
static mscclppResult_t socketWait(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset) {
|
||||
static mscclppResult_t socketWait(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset)
|
||||
{
|
||||
while (*offset < size)
|
||||
MSCCLPPCHECK(socketProgress(op, sock, ptr, size, offset));
|
||||
return mscclppSuccess;
|
||||
@@ -62,27 +69,34 @@ static mscclppResult_t socketWait(int op, struct mscclppSocket* sock, void* ptr,
|
||||
*
|
||||
* Output: "IPv4/IPv6 address<port>"
|
||||
*/
|
||||
const char *mscclppSocketToString(union mscclppSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
|
||||
if (buf == NULL || addr == NULL) return NULL;
|
||||
struct sockaddr *saddr = &addr->sa;
|
||||
if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
|
||||
const char* mscclppSocketToString(union mscclppSocketAddress* addr, char* buf, const int numericHostForm /*= 1*/)
|
||||
{
|
||||
if (buf == NULL || addr == NULL)
|
||||
return NULL;
|
||||
struct sockaddr* saddr = &addr->sa;
|
||||
if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) {
|
||||
buf[0] = '\0';
|
||||
return buf;
|
||||
}
|
||||
char host[NI_MAXHOST], service[NI_MAXSERV];
|
||||
/* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned.
|
||||
* (When not set, this will still happen in case the node's name cannot be determined.)
|
||||
*/
|
||||
int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0);
|
||||
(void) getnameinfo(saddr, sizeof(union mscclppSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag);
|
||||
(void)getnameinfo(saddr, sizeof(union mscclppSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag);
|
||||
sprintf(buf, "%s<%s>", host, service);
|
||||
return buf;
|
||||
}
|
||||
|
||||
static uint16_t socketToPort(union mscclppSocketAddress *addr) {
|
||||
struct sockaddr *saddr = &addr->sa;
|
||||
static uint16_t socketToPort(union mscclppSocketAddress* addr)
|
||||
{
|
||||
struct sockaddr* saddr = &addr->sa;
|
||||
return ntohs(saddr->sa_family == AF_INET ? addr->sin.sin_port : addr->sin6.sin6_port);
|
||||
}
|
||||
|
||||
/* Allow the user to force the IPv4/IPv6 interface selection */
|
||||
static int envSocketFamily(void) {
|
||||
static int envSocketFamily(void)
|
||||
{
|
||||
int family = -1; // Family selection is not forced, will use first one found
|
||||
char* env = getenv("MSCCLPP_SOCKET_FAMILY");
|
||||
if (env == NULL)
|
||||
@@ -91,35 +105,41 @@ static int envSocketFamily(void) {
|
||||
INFO(MSCCLPP_ENV, "MSCCLPP_SOCKET_FAMILY set by environment to %s", env);
|
||||
|
||||
if (strcmp(env, "AF_INET") == 0)
|
||||
family = AF_INET; // IPv4
|
||||
family = AF_INET; // IPv4
|
||||
else if (strcmp(env, "AF_INET6") == 0)
|
||||
family = AF_INET6; // IPv6
|
||||
return family;
|
||||
}
|
||||
|
||||
static int findInterfaces(const char* prefixList, char* names, union mscclppSocketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
|
||||
static int findInterfaces(const char* prefixList, char* names, union mscclppSocketAddress* addrs, int sock_family,
|
||||
int maxIfNameSize, int maxIfs)
|
||||
{
|
||||
#ifdef ENABLE_TRACE
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
char line[SOCKET_NAME_MAXLEN + 1];
|
||||
#endif
|
||||
struct netIf userIfs[MAX_IFS];
|
||||
bool searchNot = prefixList && prefixList[0] == '^';
|
||||
if (searchNot) prefixList++;
|
||||
if (searchNot)
|
||||
prefixList++;
|
||||
bool searchExact = prefixList && prefixList[0] == '=';
|
||||
if (searchExact) prefixList++;
|
||||
if (searchExact)
|
||||
prefixList++;
|
||||
int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
|
||||
|
||||
int found = 0;
|
||||
struct ifaddrs *interfaces, *interface;
|
||||
getifaddrs(&interfaces);
|
||||
for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) {
|
||||
if (interface->ifa_addr == NULL) continue;
|
||||
if (interface->ifa_addr == NULL)
|
||||
continue;
|
||||
|
||||
/* We only support IPv4 & IPv6 */
|
||||
int family = interface->ifa_addr->sa_family;
|
||||
if (family != AF_INET && family != AF_INET6)
|
||||
continue;
|
||||
|
||||
TRACE(MSCCLPP_INIT|MSCCLPP_NET,"Found interface %s:%s", interface->ifa_name, mscclppSocketToString((union mscclppSocketAddress *) interface->ifa_addr, line));
|
||||
TRACE(MSCCLPP_INIT | MSCCLPP_NET, "Found interface %s:%s", interface->ifa_name,
|
||||
mscclppSocketToString((union mscclppSocketAddress*)interface->ifa_addr, line));
|
||||
|
||||
/* Allow the caller to force the socket family type */
|
||||
if (sock_family != -1 && family != sock_family)
|
||||
@@ -128,7 +148,8 @@ static int findInterfaces(const char* prefixList, char* names, union mscclppSock
|
||||
/* We also need to skip IPv6 loopback interfaces */
|
||||
if (family == AF_INET6) {
|
||||
struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
|
||||
if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
|
||||
if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr))
|
||||
continue;
|
||||
}
|
||||
|
||||
// check against user specified interfaces
|
||||
@@ -140,15 +161,18 @@ static int findInterfaces(const char* prefixList, char* names, union mscclppSock
|
||||
// getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link
|
||||
bool duplicate = false;
|
||||
for (int i = 0; i < found; i++) {
|
||||
if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
|
||||
if (strcmp(interface->ifa_name, names + i * maxIfNameSize) == 0) {
|
||||
duplicate = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!duplicate) {
|
||||
// Store the interface name
|
||||
strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
|
||||
strncpy(names + found * maxIfNameSize, interface->ifa_name, maxIfNameSize);
|
||||
// Store the IP address
|
||||
int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
|
||||
memcpy(addrs+found, interface->ifa_addr, salen);
|
||||
memcpy(addrs + found, interface->ifa_addr, salen);
|
||||
found++;
|
||||
}
|
||||
}
|
||||
@@ -157,7 +181,8 @@ static int findInterfaces(const char* prefixList, char* names, union mscclppSock
|
||||
return found;
|
||||
}
|
||||
|
||||
static bool matchSubnet(struct ifaddrs local_if, union mscclppSocketAddress* remote) {
|
||||
static bool matchSubnet(struct ifaddrs local_if, union mscclppSocketAddress* remote)
|
||||
{
|
||||
/* Check family first */
|
||||
int family = local_if.ifa_addr->sa_family;
|
||||
if (family != remote->sa.sa_family) {
|
||||
@@ -180,8 +205,8 @@ static bool matchSubnet(struct ifaddrs local_if, union mscclppSocketAddress* rem
|
||||
struct in6_addr& mask_in6 = mask->sin6_addr;
|
||||
struct in6_addr& remote_in6 = remote_addr.sin6_addr;
|
||||
bool same = true;
|
||||
int len = 16; //IPv6 address is 16 unsigned char
|
||||
for (int c = 0; c < len; c++) { //Network byte order is big-endian
|
||||
int len = 16; // IPv6 address is 16 unsigned char
|
||||
for (int c = 0; c < len; c++) { // Network byte order is big-endian
|
||||
char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
|
||||
char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
|
||||
if (c1 ^ c2) {
|
||||
@@ -200,16 +225,19 @@ static bool matchSubnet(struct ifaddrs local_if, union mscclppSocketAddress* rem
|
||||
}
|
||||
}
|
||||
|
||||
int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* localAddrs, union mscclppSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
|
||||
int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* localAddrs,
|
||||
union mscclppSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs)
|
||||
{
|
||||
#ifdef ENABLE_TRACE
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
char line[SOCKET_NAME_MAXLEN + 1];
|
||||
#endif
|
||||
char line_a[SOCKET_NAME_MAXLEN+1];
|
||||
char line_a[SOCKET_NAME_MAXLEN + 1];
|
||||
int found = 0;
|
||||
struct ifaddrs *interfaces, *interface;
|
||||
getifaddrs(&interfaces);
|
||||
for (interface = interfaces; interface && !found; interface = interface->ifa_next) {
|
||||
if (interface->ifa_addr == NULL) continue;
|
||||
if (interface->ifa_addr == NULL)
|
||||
continue;
|
||||
|
||||
/* We only support IPv4 & IPv6 */
|
||||
int family = interface->ifa_addr->sa_family;
|
||||
@@ -223,14 +251,17 @@ int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* l
|
||||
|
||||
// Store the local IP address
|
||||
int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
|
||||
memcpy(localAddrs+found, interface->ifa_addr, salen);
|
||||
memcpy(localAddrs + found, interface->ifa_addr, salen);
|
||||
|
||||
// Store the interface name
|
||||
strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
|
||||
strncpy(ifNames + found * ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
|
||||
|
||||
TRACE(MSCCLPP_INIT|MSCCLPP_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, mscclppSocketToString(localAddrs+found, line), mscclppSocketToString(remoteAddr, line_a));
|
||||
TRACE(MSCCLPP_INIT | MSCCLPP_NET, "NET : Found interface %s:%s in the same subnet as remote address %s",
|
||||
interface->ifa_name, mscclppSocketToString(localAddrs + found, line),
|
||||
mscclppSocketToString(remoteAddr, line_a));
|
||||
found++;
|
||||
if (found == maxIfs) break;
|
||||
if (found == maxIfs)
|
||||
break;
|
||||
}
|
||||
|
||||
if (found == 0) {
|
||||
@@ -240,7 +271,8 @@ int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* l
|
||||
return found;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, const char* ip_port_pair) {
|
||||
mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, const char* ip_port_pair)
|
||||
{
|
||||
if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
|
||||
WARN("Net : string is null");
|
||||
return mscclppInvalidArgument;
|
||||
@@ -262,7 +294,7 @@ mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, c
|
||||
hints.ai_family = AF_UNSPEC;
|
||||
hints.ai_socktype = SOCK_STREAM;
|
||||
|
||||
if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
|
||||
if ((rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
|
||||
WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
|
||||
return mscclppInvalidArgument;
|
||||
}
|
||||
@@ -271,16 +303,16 @@ mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, c
|
||||
if (p->ai_family == AF_INET) {
|
||||
struct sockaddr_in& sin = ua->sin;
|
||||
memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
|
||||
sin.sin_family = AF_INET; // IPv4
|
||||
//inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address
|
||||
sin.sin_port = htons(ni.port); // port
|
||||
sin.sin_family = AF_INET; // IPv4
|
||||
// inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address
|
||||
sin.sin_port = htons(ni.port); // port
|
||||
} else if (p->ai_family == AF_INET6) {
|
||||
struct sockaddr_in6& sin6 = ua->sin6;
|
||||
memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
|
||||
sin6.sin6_family = AF_INET6; // IPv6
|
||||
sin6.sin6_port = htons(ni.port); // port
|
||||
sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
|
||||
sin6.sin6_scope_id = 0; // should be global scope, set to 0
|
||||
sin6.sin6_family = AF_INET6; // IPv6
|
||||
sin6.sin6_port = htons(ni.port); // port
|
||||
sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
|
||||
sin6.sin6_scope_id = 0; // should be global scope, set to 0
|
||||
} else {
|
||||
WARN("Net : unsupported IP family");
|
||||
return mscclppInvalidArgument;
|
||||
@@ -291,35 +323,39 @@ mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, c
|
||||
} else {
|
||||
int i, j = -1, len = strlen(ip_port_pair);
|
||||
for (i = 1; i < len; i++) {
|
||||
if (ip_port_pair[i] == '%') j = i;
|
||||
if (ip_port_pair[i] == ']') break;
|
||||
if (ip_port_pair[i] == '%')
|
||||
j = i;
|
||||
if (ip_port_pair[i] == ']')
|
||||
break;
|
||||
}
|
||||
if (i == len) {
|
||||
WARN("Net : No valid [IPv6]:port pair found");
|
||||
return mscclppInvalidArgument;
|
||||
}
|
||||
bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope
|
||||
bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope
|
||||
|
||||
char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
|
||||
memset(ip_str, '\0', sizeof(ip_str));
|
||||
memset(port_str, '\0', sizeof(port_str));
|
||||
memset(if_name, '\0', sizeof(if_name));
|
||||
strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1);
|
||||
strncpy(port_str, ip_port_pair+i+2, len-i-1);
|
||||
strncpy(ip_str, ip_port_pair + 1, global_scope ? i - 1 : j - 1);
|
||||
strncpy(port_str, ip_port_pair + i + 2, len - i - 1);
|
||||
int port = atoi(port_str);
|
||||
if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name
|
||||
if (!global_scope)
|
||||
strncpy(if_name, ip_port_pair + j + 1, i - j - 1); // If not global scope, we need the intf name
|
||||
|
||||
struct sockaddr_in6& sin6 = ua->sin6;
|
||||
sin6.sin6_family = AF_INET6; // IPv6
|
||||
inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address
|
||||
sin6.sin6_port = htons(port); // port
|
||||
sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
|
||||
sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope
|
||||
sin6.sin6_family = AF_INET6; // IPv6
|
||||
inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address
|
||||
sin6.sin6_port = htons(port); // port
|
||||
sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
|
||||
sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope
|
||||
}
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
|
||||
int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress* ifAddrs, int ifNameMaxSize, int maxIfs)
|
||||
{
|
||||
static int shownIfName = 0;
|
||||
int nIfs = 0;
|
||||
// Allow user to force the INET socket family selection
|
||||
@@ -329,7 +365,8 @@ int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress *ifAddrs, in
|
||||
if (env && strlen(env) > 1) {
|
||||
INFO(MSCCLPP_ENV, "MSCCLPP_SOCKET_IFNAME set by environment to %s", env);
|
||||
// Specified by user : find or fail
|
||||
if (shownIfName++ == 0) INFO(MSCCLPP_NET, "MSCCLPP_SOCKET_IFNAME set to %s", env);
|
||||
if (shownIfName++ == 0)
|
||||
INFO(MSCCLPP_NET, "MSCCLPP_SOCKET_IFNAME set to %s", env);
|
||||
nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
} else {
|
||||
// Try to automatically pick the right one
|
||||
@@ -347,15 +384,19 @@ int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress *ifAddrs, in
|
||||
}
|
||||
}
|
||||
// Then look for anything else (but not docker or lo)
|
||||
if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
if (nIfs == 0)
|
||||
nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
// Finally look for docker, then lo.
|
||||
if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
if (nIfs == 0)
|
||||
nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
if (nIfs == 0)
|
||||
nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
}
|
||||
return nIfs;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppSocketListen(struct mscclppSocket* sock) {
|
||||
mscclppResult_t mscclppSocketListen(struct mscclppSocket* sock)
|
||||
{
|
||||
if (sock == NULL) {
|
||||
WARN("mscclppSocketListen: pass NULL socket");
|
||||
return mscclppInvalidArgument;
|
||||
@@ -383,8 +424,8 @@ mscclppResult_t mscclppSocketListen(struct mscclppSocket* sock) {
|
||||
SYSCHECK(getsockname(sock->fd, &sock->addr.sa, &size), "getsockname");
|
||||
|
||||
#ifdef ENABLE_TRACE
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
TRACE(MSCCLPP_INIT|MSCCLPP_NET,"Listening on socket %s", mscclppSocketToString(&sock->addr, line));
|
||||
char line[SOCKET_NAME_MAXLEN + 1];
|
||||
TRACE(MSCCLPP_INIT | MSCCLPP_NET, "Listening on socket %s", mscclppSocketToString(&sock->addr, line));
|
||||
#endif
|
||||
|
||||
/* Put the socket in listen mode
|
||||
@@ -395,17 +436,20 @@ mscclppResult_t mscclppSocketListen(struct mscclppSocket* sock) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppSocketGetAddr(struct mscclppSocket* sock, union mscclppSocketAddress* addr) {
|
||||
mscclppResult_t mscclppSocketGetAddr(struct mscclppSocket* sock, union mscclppSocketAddress* addr)
|
||||
{
|
||||
if (sock == NULL) {
|
||||
WARN("mscclppSocketGetAddr: pass NULL socket");
|
||||
return mscclppInvalidArgument;
|
||||
}
|
||||
if (sock->state != mscclppSocketStateReady) return mscclppInternalError;
|
||||
if (sock->state != mscclppSocketStateReady)
|
||||
return mscclppInternalError;
|
||||
memcpy(addr, &sock->addr, sizeof(union mscclppSocketAddress));
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
static mscclppResult_t socketTryAccept(struct mscclppSocket* sock) {
|
||||
static mscclppResult_t socketTryAccept(struct mscclppSocket* sock)
|
||||
{
|
||||
socklen_t socklen = sizeof(union mscclppSocketAddress);
|
||||
sock->fd = accept(sock->acceptFd, &sock->addr.sa, &socklen);
|
||||
if (sock->fd != -1) {
|
||||
@@ -416,19 +460,22 @@ static mscclppResult_t socketTryAccept(struct mscclppSocket* sock) {
|
||||
} else if (++sock->acceptRetries == RETRY_ACCEPT_TIMES) {
|
||||
WARN("socketTryAccept: exceeded retries (%d)", sock->acceptRetries);
|
||||
return mscclppRemoteError;
|
||||
} else {
|
||||
} else {
|
||||
usleep(SLEEP_INT);
|
||||
if (sock->acceptRetries % 1000 == 0) INFO(MSCCLPP_ALL, "socketTryAccept: Call to try accept returned %s, retrying", strerror(errno));
|
||||
if (sock->acceptRetries % 1000 == 0)
|
||||
INFO(MSCCLPP_ALL, "socketTryAccept: Call to try accept returned %s, retrying", strerror(errno));
|
||||
}
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
static mscclppResult_t socketFinalizeAccept(struct mscclppSocket* sock) {
|
||||
static mscclppResult_t socketFinalizeAccept(struct mscclppSocket* sock)
|
||||
{
|
||||
uint64_t magic;
|
||||
enum mscclppSocketType type;
|
||||
int received = 0;
|
||||
MSCCLPPCHECK(mscclppSocketProgress(MSCCLPP_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
|
||||
if (received == 0) return mscclppSuccess;
|
||||
if (received == 0)
|
||||
return mscclppSuccess;
|
||||
MSCCLPPCHECK(socketWait(MSCCLPP_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
|
||||
if (magic != sock->magic) {
|
||||
WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic);
|
||||
@@ -453,7 +500,8 @@ static mscclppResult_t socketFinalizeAccept(struct mscclppSocket* sock) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
static mscclppResult_t socketStartConnect(struct mscclppSocket* sock) {
|
||||
static mscclppResult_t socketStartConnect(struct mscclppSocket* sock)
|
||||
{
|
||||
/* blocking/non-blocking connect() is determined by asyncFlag. */
|
||||
int ret = connect(sock->fd, &sock->addr.sa, sock->salen);
|
||||
|
||||
@@ -470,7 +518,8 @@ static mscclppResult_t socketStartConnect(struct mscclppSocket* sock) {
|
||||
return mscclppRemoteError;
|
||||
}
|
||||
usleep(SLEEP_INT);
|
||||
if (sock->refusedRetries % 1000 == 0) INFO(MSCCLPP_ALL, "Call to connect returned %s, retrying", strerror(errno));
|
||||
if (sock->refusedRetries % 1000 == 0)
|
||||
INFO(MSCCLPP_ALL, "Call to connect returned %s, retrying", strerror(errno));
|
||||
return mscclppSuccess;
|
||||
} else if (errno == ETIMEDOUT) {
|
||||
if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) {
|
||||
@@ -481,14 +530,15 @@ static mscclppResult_t socketStartConnect(struct mscclppSocket* sock) {
|
||||
usleep(SLEEP_INT);
|
||||
return mscclppSuccess;
|
||||
} else {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
char line[SOCKET_NAME_MAXLEN + 1];
|
||||
sock->state = mscclppSocketStateError;
|
||||
WARN("socketStartConnect: Connect to %s failed : %s", mscclppSocketToString(&sock->addr, line), strerror(errno));
|
||||
return mscclppSystemError;
|
||||
}
|
||||
}
|
||||
|
||||
static mscclppResult_t socketPollConnect(struct mscclppSocket* sock) {
|
||||
static mscclppResult_t socketPollConnect(struct mscclppSocket* sock)
|
||||
{
|
||||
struct pollfd pfd;
|
||||
int timeout = 1, ret;
|
||||
socklen_t rlen = sizeof(int);
|
||||
@@ -497,7 +547,8 @@ static mscclppResult_t socketPollConnect(struct mscclppSocket* sock) {
|
||||
pfd.fd = sock->fd;
|
||||
pfd.events = POLLOUT;
|
||||
SYSCHECK(ret = poll(&pfd, 1, timeout), "poll");
|
||||
if (ret == 0) return mscclppSuccess;
|
||||
if (ret == 0)
|
||||
return mscclppSuccess;
|
||||
|
||||
/* check socket status */
|
||||
EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0);
|
||||
@@ -511,7 +562,8 @@ static mscclppResult_t socketPollConnect(struct mscclppSocket* sock) {
|
||||
WARN("socketPollConnect: exceeded retries (%d)", sock->refusedRetries);
|
||||
return mscclppRemoteError;
|
||||
}
|
||||
if (sock->refusedRetries % 1000 == 0) INFO(MSCCLPP_ALL, "Call to connect returned %s, retrying", strerror(errno));
|
||||
if (sock->refusedRetries % 1000 == 0)
|
||||
INFO(MSCCLPP_ALL, "Call to connect returned %s, retrying", strerror(errno));
|
||||
usleep(SLEEP_INT);
|
||||
|
||||
close(sock->fd);
|
||||
@@ -535,7 +587,8 @@ static mscclppResult_t socketPollConnect(struct mscclppSocket* sock) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppSocketPollConnect(struct mscclppSocket* sock) {
|
||||
mscclppResult_t mscclppSocketPollConnect(struct mscclppSocket* sock)
|
||||
{
|
||||
if (sock == NULL) {
|
||||
WARN("mscclppSocketPollConnect: pass NULL socket");
|
||||
return mscclppInvalidArgument;
|
||||
@@ -544,10 +597,12 @@ mscclppResult_t mscclppSocketPollConnect(struct mscclppSocket* sock) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
static mscclppResult_t socketFinalizeConnect(struct mscclppSocket* sock) {
|
||||
static mscclppResult_t socketFinalizeConnect(struct mscclppSocket* sock)
|
||||
{
|
||||
int sent = 0;
|
||||
MSCCLPPCHECK(socketProgress(MSCCLPP_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
|
||||
if (sent == 0) return mscclppSuccess;
|
||||
if (sent == 0)
|
||||
return mscclppSuccess;
|
||||
MSCCLPPCHECK(socketWait(MSCCLPP_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
|
||||
sent = 0;
|
||||
MSCCLPPCHECK(socketWait(MSCCLPP_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
|
||||
@@ -555,7 +610,8 @@ static mscclppResult_t socketFinalizeConnect(struct mscclppSocket* sock) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
static mscclppResult_t socketProgressState(struct mscclppSocket* sock) {
|
||||
static mscclppResult_t socketProgressState(struct mscclppSocket* sock)
|
||||
{
|
||||
if (sock->state == mscclppSocketStateAccepting) {
|
||||
MSCCLPPCHECK(socketTryAccept(sock));
|
||||
}
|
||||
@@ -591,9 +647,10 @@ static mscclppResult_t socketProgressState(struct mscclppSocket* sock) {
|
||||
// return mscclppSuccess;
|
||||
// }
|
||||
|
||||
mscclppResult_t mscclppSocketConnect(struct mscclppSocket* sock) {
|
||||
mscclppResult_t mscclppSocketConnect(struct mscclppSocket* sock)
|
||||
{
|
||||
#ifdef ENABLE_TRACE
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
char line[SOCKET_NAME_MAXLEN + 1];
|
||||
#endif
|
||||
const int one = 1;
|
||||
|
||||
@@ -608,39 +665,40 @@ mscclppResult_t mscclppSocketConnect(struct mscclppSocket* sock) {
|
||||
|
||||
if (sock->state != mscclppSocketStateInitialized) {
|
||||
WARN("mscclppSocketConnect: wrong socket state %d", sock->state);
|
||||
if (sock->state == mscclppSocketStateError) return mscclppRemoteError;
|
||||
if (sock->state == mscclppSocketStateError)
|
||||
return mscclppRemoteError;
|
||||
return mscclppInternalError;
|
||||
}
|
||||
TRACE(MSCCLPP_INIT|MSCCLPP_NET,"Connecting to socket %s", mscclppSocketToString(&sock->addr, line));
|
||||
TRACE(MSCCLPP_INIT | MSCCLPP_NET, "Connecting to socket %s", mscclppSocketToString(&sock->addr, line));
|
||||
|
||||
SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
|
||||
|
||||
sock->state = mscclppSocketStateConnecting;
|
||||
do {
|
||||
MSCCLPPCHECK(socketProgressState(sock));
|
||||
} while (sock->asyncFlag == 0 &&
|
||||
(sock->abortFlag == NULL || *sock->abortFlag == 0) &&
|
||||
(sock->state == mscclppSocketStateConnecting ||
|
||||
sock->state == mscclppSocketStateConnectPolling ||
|
||||
sock->state == mscclppSocketStateConnected));
|
||||
} while (sock->asyncFlag == 0 && (sock->abortFlag == NULL || *sock->abortFlag == 0) &&
|
||||
(sock->state == mscclppSocketStateConnecting || sock->state == mscclppSocketStateConnectPolling ||
|
||||
sock->state == mscclppSocketStateConnected));
|
||||
|
||||
if (sock->abortFlag && *sock->abortFlag != 0) return mscclppInternalError;
|
||||
if (sock->abortFlag && *sock->abortFlag != 0)
|
||||
return mscclppInternalError;
|
||||
|
||||
switch (sock->state) {
|
||||
case mscclppSocketStateConnecting:
|
||||
case mscclppSocketStateConnectPolling:
|
||||
case mscclppSocketStateConnected:
|
||||
case mscclppSocketStateReady:
|
||||
return mscclppSuccess;
|
||||
case mscclppSocketStateError:
|
||||
return mscclppSystemError;
|
||||
default:
|
||||
WARN("mscclppSocketConnect: wrong socket state %d", sock->state);
|
||||
return mscclppInternalError;
|
||||
case mscclppSocketStateConnecting:
|
||||
case mscclppSocketStateConnectPolling:
|
||||
case mscclppSocketStateConnected:
|
||||
case mscclppSocketStateReady:
|
||||
return mscclppSuccess;
|
||||
case mscclppSocketStateError:
|
||||
return mscclppSystemError;
|
||||
default:
|
||||
WARN("mscclppSocketConnect: wrong socket state %d", sock->state);
|
||||
return mscclppInternalError;
|
||||
}
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppSocketAccept(struct mscclppSocket* sock, struct mscclppSocket* listenSock) {
|
||||
mscclppResult_t mscclppSocketAccept(struct mscclppSocket* sock, struct mscclppSocket* listenSock)
|
||||
{
|
||||
mscclppResult_t ret = mscclppSuccess;
|
||||
|
||||
if (listenSock == NULL || sock == NULL) {
|
||||
@@ -665,36 +723,38 @@ mscclppResult_t mscclppSocketAccept(struct mscclppSocket* sock, struct mscclppSo
|
||||
|
||||
do {
|
||||
MSCCLPPCHECKGOTO(socketProgressState(sock), ret, exit);
|
||||
} while (sock->asyncFlag == 0 &&
|
||||
(sock->abortFlag == NULL || *sock->abortFlag == 0) &&
|
||||
(sock->state == mscclppSocketStateAccepting ||
|
||||
sock->state == mscclppSocketStateAccepted));
|
||||
} while (sock->asyncFlag == 0 && (sock->abortFlag == NULL || *sock->abortFlag == 0) &&
|
||||
(sock->state == mscclppSocketStateAccepting || sock->state == mscclppSocketStateAccepted));
|
||||
|
||||
if (sock->abortFlag && *sock->abortFlag != 0) return mscclppInternalError;
|
||||
if (sock->abortFlag && *sock->abortFlag != 0)
|
||||
return mscclppInternalError;
|
||||
|
||||
switch (sock->state) {
|
||||
case mscclppSocketStateAccepting:
|
||||
case mscclppSocketStateAccepted:
|
||||
case mscclppSocketStateReady:
|
||||
ret = mscclppSuccess;
|
||||
break;
|
||||
case mscclppSocketStateError:
|
||||
ret = mscclppSystemError;
|
||||
break;
|
||||
default:
|
||||
WARN("mscclppSocketAccept: wrong socket state %d", sock->state);
|
||||
ret = mscclppInternalError;
|
||||
break;
|
||||
case mscclppSocketStateAccepting:
|
||||
case mscclppSocketStateAccepted:
|
||||
case mscclppSocketStateReady:
|
||||
ret = mscclppSuccess;
|
||||
break;
|
||||
case mscclppSocketStateError:
|
||||
ret = mscclppSystemError;
|
||||
break;
|
||||
default:
|
||||
WARN("mscclppSocketAccept: wrong socket state %d", sock->state);
|
||||
ret = mscclppInternalError;
|
||||
break;
|
||||
}
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocketAddress* addr, uint64_t magic, enum mscclppSocketType type, volatile uint32_t* abortFlag, int asyncFlag) {
|
||||
mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocketAddress* addr, uint64_t magic,
|
||||
enum mscclppSocketType type, volatile uint32_t* abortFlag, int asyncFlag)
|
||||
{
|
||||
mscclppResult_t ret = mscclppSuccess;
|
||||
|
||||
if (sock == NULL) goto exit;
|
||||
if (sock == NULL)
|
||||
goto exit;
|
||||
sock->timedOutRetries = 0;
|
||||
sock->refusedRetries = 0;
|
||||
sock->acceptRetries = 0;
|
||||
@@ -712,9 +772,9 @@ mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocke
|
||||
memcpy(&sock->addr, addr, sizeof(union mscclppSocketAddress));
|
||||
family = sock->addr.sa.sa_family;
|
||||
if (family != AF_INET && family != AF_INET6) {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
char line[SOCKET_NAME_MAXLEN + 1];
|
||||
WARN("mscclppSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
|
||||
mscclppSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
|
||||
mscclppSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
|
||||
ret = mscclppInternalError;
|
||||
goto fail;
|
||||
}
|
||||
@@ -744,7 +804,8 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppSocketProgress(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset) {
|
||||
mscclppResult_t mscclppSocketProgress(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset)
|
||||
{
|
||||
if (sock == NULL) {
|
||||
WARN("mscclppSocketProgress: pass NULL socket");
|
||||
return mscclppInvalidArgument;
|
||||
@@ -762,7 +823,8 @@ mscclppResult_t mscclppSocketProgress(int op, struct mscclppSocket* sock, void*
|
||||
// return mscclppSuccess;
|
||||
// }
|
||||
|
||||
mscclppResult_t mscclppSocketSend(struct mscclppSocket* sock, void* ptr, int size) {
|
||||
mscclppResult_t mscclppSocketSend(struct mscclppSocket* sock, void* ptr, int size)
|
||||
{
|
||||
int offset = 0;
|
||||
if (sock == NULL) {
|
||||
WARN("mscclppSocketSend: pass NULL socket");
|
||||
@@ -776,7 +838,8 @@ mscclppResult_t mscclppSocketSend(struct mscclppSocket* sock, void* ptr, int siz
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppSocketRecv(struct mscclppSocket* sock, void* ptr, int size) {
|
||||
mscclppResult_t mscclppSocketRecv(struct mscclppSocket* sock, void* ptr, int size)
|
||||
{
|
||||
int offset = 0;
|
||||
if (sock == NULL) {
|
||||
WARN("mscclppSocketRecv: pass NULL socket");
|
||||
@@ -805,9 +868,11 @@ mscclppResult_t mscclppSocketRecv(struct mscclppSocket* sock, void* ptr, int siz
|
||||
// return mscclppSuccess;
|
||||
// }
|
||||
|
||||
mscclppResult_t mscclppSocketClose(struct mscclppSocket* sock) {
|
||||
mscclppResult_t mscclppSocketClose(struct mscclppSocket* sock)
|
||||
{
|
||||
if (sock != NULL) {
|
||||
if (sock->fd >= 0) close(sock->fd);
|
||||
if (sock->fd >= 0)
|
||||
close(sock->fd);
|
||||
sock->state = mscclppSocketStateClosed;
|
||||
sock->fd = -1;
|
||||
}
|
||||
|
||||
99
src/debug.cc
99
src/debug.cc
@@ -4,27 +4,31 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "debug.h"
|
||||
#include <stdlib.h>
|
||||
#include "core.h"
|
||||
#include <stdarg.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/syscall.h>
|
||||
|
||||
int mscclppDebugLevel = -1;
|
||||
static int pid = -1;
|
||||
static char hostname[1024];
|
||||
thread_local int mscclppDebugNoWarn = 0;
|
||||
char mscclppLastError[1024] = ""; // Global string for the last error in human readable form
|
||||
char mscclppLastError[1024] = ""; // Global string for the last error in human readable form
|
||||
uint64_t mscclppDebugMask = MSCCLPP_INIT; // Default debug sub-system mask is INIT
|
||||
FILE *mscclppDebugFile = stdout;
|
||||
FILE* mscclppDebugFile = stdout;
|
||||
pthread_mutex_t mscclppDebugLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
std::chrono::steady_clock::time_point mscclppEpoch;
|
||||
|
||||
static __thread int tid = -1;
|
||||
|
||||
void mscclppDebugInit() {
|
||||
void mscclppDebugInit()
|
||||
{
|
||||
pthread_mutex_lock(&mscclppDebugLock);
|
||||
if (mscclppDebugLevel != -1) { pthread_mutex_unlock(&mscclppDebugLock); return; }
|
||||
if (mscclppDebugLevel != -1) {
|
||||
pthread_mutex_unlock(&mscclppDebugLock);
|
||||
return;
|
||||
}
|
||||
const char* mscclpp_debug = getenv("MSCCLPP_DEBUG");
|
||||
int tempNcclDebugLevel = -1;
|
||||
if (mscclpp_debug == NULL) {
|
||||
@@ -48,10 +52,13 @@ void mscclppDebugInit() {
|
||||
char* mscclppDebugSubsysEnv = getenv("MSCCLPP_DEBUG_SUBSYS");
|
||||
if (mscclppDebugSubsysEnv != NULL) {
|
||||
int invert = 0;
|
||||
if (mscclppDebugSubsysEnv[0] == '^') { invert = 1; mscclppDebugSubsysEnv++; }
|
||||
if (mscclppDebugSubsysEnv[0] == '^') {
|
||||
invert = 1;
|
||||
mscclppDebugSubsysEnv++;
|
||||
}
|
||||
mscclppDebugMask = invert ? ~0ULL : 0ULL;
|
||||
char *mscclppDebugSubsys = strdup(mscclppDebugSubsysEnv);
|
||||
char *subsys = strtok(mscclppDebugSubsys, ",");
|
||||
char* mscclppDebugSubsys = strdup(mscclppDebugSubsysEnv);
|
||||
char* subsys = strtok(mscclppDebugSubsys, ",");
|
||||
while (subsys != NULL) {
|
||||
uint64_t mask = 0;
|
||||
if (strcasecmp(subsys, "INIT") == 0) {
|
||||
@@ -78,7 +85,10 @@ void mscclppDebugInit() {
|
||||
mask = MSCCLPP_ALL;
|
||||
}
|
||||
if (mask) {
|
||||
if (invert) mscclppDebugMask &= ~mask; else mscclppDebugMask |= mask;
|
||||
if (invert)
|
||||
mscclppDebugMask &= ~mask;
|
||||
else
|
||||
mscclppDebugMask |= mask;
|
||||
}
|
||||
subsys = strtok(NULL, ",");
|
||||
}
|
||||
@@ -96,32 +106,32 @@ void mscclppDebugInit() {
|
||||
const char* mscclppDebugFileEnv = getenv("MSCCLPP_DEBUG_FILE");
|
||||
if (tempNcclDebugLevel > MSCCLPP_LOG_VERSION && mscclppDebugFileEnv != NULL) {
|
||||
int c = 0;
|
||||
char debugFn[PATH_MAX+1] = "";
|
||||
char *dfn = debugFn;
|
||||
char debugFn[PATH_MAX + 1] = "";
|
||||
char* dfn = debugFn;
|
||||
while (mscclppDebugFileEnv[c] != '\0' && c < PATH_MAX) {
|
||||
if (mscclppDebugFileEnv[c++] != '%') {
|
||||
*dfn++ = mscclppDebugFileEnv[c-1];
|
||||
*dfn++ = mscclppDebugFileEnv[c - 1];
|
||||
continue;
|
||||
}
|
||||
switch (mscclppDebugFileEnv[c++]) {
|
||||
case '%': // Double %
|
||||
*dfn++ = '%';
|
||||
break;
|
||||
case 'h': // %h = hostname
|
||||
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
|
||||
break;
|
||||
case 'p': // %p = pid
|
||||
dfn += snprintf(dfn, PATH_MAX, "%d", pid);
|
||||
break;
|
||||
default: // Echo everything we don't understand
|
||||
*dfn++ = '%';
|
||||
*dfn++ = mscclppDebugFileEnv[c-1];
|
||||
break;
|
||||
case '%': // Double %
|
||||
*dfn++ = '%';
|
||||
break;
|
||||
case 'h': // %h = hostname
|
||||
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
|
||||
break;
|
||||
case 'p': // %p = pid
|
||||
dfn += snprintf(dfn, PATH_MAX, "%d", pid);
|
||||
break;
|
||||
default: // Echo everything we don't understand
|
||||
*dfn++ = '%';
|
||||
*dfn++ = mscclppDebugFileEnv[c - 1];
|
||||
break;
|
||||
}
|
||||
}
|
||||
*dfn = '\0';
|
||||
if (debugFn[0] != '\0') {
|
||||
FILE *file = fopen(debugFn, "w");
|
||||
FILE* file = fopen(debugFn, "w");
|
||||
if (file != nullptr) {
|
||||
setbuf(file, nullptr); // disable buffering
|
||||
mscclppDebugFile = file;
|
||||
@@ -138,20 +148,27 @@ void mscclppDebugInit() {
|
||||
* Also exported to the dynamically loadable Net transport modules so
|
||||
* they can share the debugging mechanisms and output files
|
||||
*/
|
||||
void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
|
||||
if (__atomic_load_n(&mscclppDebugLevel, __ATOMIC_ACQUIRE) == -1) mscclppDebugInit();
|
||||
if (mscclppDebugNoWarn != 0 && level == MSCCLPP_LOG_WARN) { level = MSCCLPP_LOG_INFO; flags = mscclppDebugNoWarn; }
|
||||
void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char* filefunc, int line, const char* fmt,
|
||||
...)
|
||||
{
|
||||
if (__atomic_load_n(&mscclppDebugLevel, __ATOMIC_ACQUIRE) == -1)
|
||||
mscclppDebugInit();
|
||||
if (mscclppDebugNoWarn != 0 && level == MSCCLPP_LOG_WARN) {
|
||||
level = MSCCLPP_LOG_INFO;
|
||||
flags = mscclppDebugNoWarn;
|
||||
}
|
||||
|
||||
// Save the last error (WARN) as a human readable string
|
||||
if (level == MSCCLPP_LOG_WARN) {
|
||||
pthread_mutex_lock(&mscclppDebugLock);
|
||||
va_list vargs;
|
||||
va_start(vargs, fmt);
|
||||
(void) vsnprintf(mscclppLastError, sizeof(mscclppLastError), fmt, vargs);
|
||||
(void)vsnprintf(mscclppLastError, sizeof(mscclppLastError), fmt, vargs);
|
||||
va_end(vargs);
|
||||
pthread_mutex_unlock(&mscclppDebugLock);
|
||||
}
|
||||
if (mscclppDebugLevel < level || ((flags & mscclppDebugMask) == 0)) return;
|
||||
if (mscclppDebugLevel < level || ((flags & mscclppDebugMask) == 0))
|
||||
return;
|
||||
|
||||
if (tid == -1) {
|
||||
tid = syscall(SYS_gettid);
|
||||
@@ -165,23 +182,23 @@ void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char
|
||||
char buffer[1024];
|
||||
size_t len = 0;
|
||||
if (level == MSCCLPP_LOG_WARN) {
|
||||
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d MSCCLPP WARN ",
|
||||
hostname, pid, tid, cudaDev, filefunc, line);
|
||||
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d MSCCLPP WARN ", hostname, pid, tid, cudaDev, filefunc,
|
||||
line);
|
||||
} else if (level == MSCCLPP_LOG_INFO) {
|
||||
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] MSCCLPP INFO ", hostname, pid, tid, cudaDev);
|
||||
} else if (level == MSCCLPP_LOG_TRACE && flags == MSCCLPP_CALL) {
|
||||
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d MSCCLPP CALL ", hostname, pid, tid);
|
||||
} else if (level == MSCCLPP_LOG_TRACE) {
|
||||
auto delta = std::chrono::steady_clock::now() - mscclppEpoch;
|
||||
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
|
||||
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d MSCCLPP TRACE ",
|
||||
hostname, pid, tid, cudaDev, timestamp, filefunc, line);
|
||||
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count() * 1000;
|
||||
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d MSCCLPP TRACE ", hostname, pid, tid, cudaDev,
|
||||
timestamp, filefunc, line);
|
||||
}
|
||||
|
||||
if (len) {
|
||||
va_list vargs;
|
||||
va_start(vargs, fmt);
|
||||
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
|
||||
len += vsnprintf(buffer + len, sizeof(buffer) - len, fmt, vargs);
|
||||
va_end(vargs);
|
||||
buffer[len++] = '\n';
|
||||
fwrite(buffer, 1, len, mscclppDebugFile);
|
||||
@@ -190,11 +207,13 @@ void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char
|
||||
|
||||
MSCCLPP_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
|
||||
|
||||
void mscclppSetThreadName(pthread_t thread, const char *fmt, ...) {
|
||||
void mscclppSetThreadName(pthread_t thread, const char* fmt, ...)
|
||||
{
|
||||
// pthread_setname_np is nonstandard GNU extension
|
||||
// needs the following feature test macro
|
||||
#ifdef _GNU_SOURCE
|
||||
if (mscclppParamSetThreadName() != 1) return;
|
||||
if (mscclppParamSetThreadName() != 1)
|
||||
return;
|
||||
char threadName[MSCCLPP_THREAD_NAMELEN];
|
||||
va_list vargs;
|
||||
va_start(vargs, fmt);
|
||||
|
||||
22
src/gdr.cc
22
src/gdr.cc
@@ -3,11 +3,13 @@
|
||||
// Used to make the GDR library calls thread safe
|
||||
pthread_mutex_t gdrLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
gdr_t wrap_gdr_open(void) {
|
||||
gdr_t wrap_gdr_open(void)
|
||||
{
|
||||
return gdr_open();
|
||||
}
|
||||
|
||||
mscclppResult_t wrap_gdr_close(gdr_t g) {
|
||||
mscclppResult_t wrap_gdr_close(gdr_t g)
|
||||
{
|
||||
int ret = gdr_close(g);
|
||||
if (ret != 0) {
|
||||
WARN("gdr_close() failed: %d", ret);
|
||||
@@ -16,7 +18,9 @@ mscclppResult_t wrap_gdr_close(gdr_t g) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) {
|
||||
mscclppResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space,
|
||||
gdr_mh_t* handle)
|
||||
{
|
||||
int ret;
|
||||
GDRLOCKCALL(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle), ret);
|
||||
if (ret != 0) {
|
||||
@@ -26,7 +30,8 @@ mscclppResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, ui
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) {
|
||||
mscclppResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle)
|
||||
{
|
||||
int ret;
|
||||
GDRLOCKCALL(gdr_unpin_buffer(g, handle), ret);
|
||||
if (ret != 0) {
|
||||
@@ -36,7 +41,8 @@ mscclppResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) {
|
||||
mscclppResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t* info)
|
||||
{
|
||||
int ret;
|
||||
GDRLOCKCALL(gdr_get_info(g, handle, info), ret);
|
||||
if (ret != 0) {
|
||||
@@ -46,7 +52,8 @@ mscclppResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
|
||||
mscclppResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void** va, size_t size)
|
||||
{
|
||||
int ret;
|
||||
GDRLOCKCALL(gdr_map(g, handle, va, size), ret);
|
||||
if (ret != 0) {
|
||||
@@ -56,7 +63,8 @@ mscclppResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
|
||||
mscclppResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void* va, size_t size)
|
||||
{
|
||||
int ret;
|
||||
GDRLOCKCALL(gdr_unmap(g, handle, va, size), ret);
|
||||
if (ret != 0) {
|
||||
|
||||
70
src/ib.cc
70
src/ib.cc
@@ -2,23 +2,23 @@
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <malloc.h>
|
||||
#include <vector>
|
||||
#include <unistd.h>
|
||||
#include <vector>
|
||||
|
||||
#include "debug.h"
|
||||
#include "alloc.h"
|
||||
#include "comm.h"
|
||||
#include "debug.h"
|
||||
#include "ib.h"
|
||||
|
||||
static int getIbDevNumaNode(const char *ibDevPath)
|
||||
static int getIbDevNumaNode(const char* ibDevPath)
|
||||
{
|
||||
if (ibDevPath == NULL) {
|
||||
WARN("ibDevPath is NULL");
|
||||
return -1;
|
||||
}
|
||||
const char *postfix = "/device/numa_node";
|
||||
FILE *fp = NULL;
|
||||
char *filePath = NULL;
|
||||
const char* postfix = "/device/numa_node";
|
||||
FILE* fp = NULL;
|
||||
char* filePath = NULL;
|
||||
int node = -1;
|
||||
int res;
|
||||
if (mscclppCalloc(&filePath, strlen(ibDevPath) + strlen(postfix) + 1) != mscclppSuccess) {
|
||||
@@ -52,16 +52,16 @@ exit:
|
||||
return node;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppIbContextCreate(struct mscclppIbContext **ctx, const char *ibDevName)
|
||||
mscclppResult_t mscclppIbContextCreate(struct mscclppIbContext** ctx, const char* ibDevName)
|
||||
{
|
||||
struct mscclppIbContext *_ctx;
|
||||
struct mscclppIbContext* _ctx;
|
||||
MSCCLPPCHECK(mscclppCalloc(&_ctx, 1));
|
||||
|
||||
std::vector<int> ports;
|
||||
|
||||
int num;
|
||||
const char *ibDevPath = NULL;
|
||||
struct ibv_device **devices = ibv_get_device_list(&num);
|
||||
const char* ibDevPath = NULL;
|
||||
struct ibv_device** devices = ibv_get_device_list(&num);
|
||||
for (int i = 0; i < num; ++i) {
|
||||
if (strncmp(devices[i]->name, ibDevName, IBV_SYSFS_NAME_MAX) == 0) {
|
||||
_ctx->ctx = ibv_open_device(devices[i]);
|
||||
@@ -96,8 +96,7 @@ mscclppResult_t mscclppIbContextCreate(struct mscclppIbContext **ctx, const char
|
||||
if (portAttr.state != IBV_PORT_ACTIVE) {
|
||||
continue;
|
||||
}
|
||||
if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND &&
|
||||
portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) {
|
||||
if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) {
|
||||
continue;
|
||||
}
|
||||
ports.push_back((int)i);
|
||||
@@ -129,7 +128,7 @@ fail:
|
||||
return mscclppInternalError;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppIbContextDestroy(struct mscclppIbContext *ctx)
|
||||
mscclppResult_t mscclppIbContextDestroy(struct mscclppIbContext* ctx)
|
||||
{
|
||||
for (int i = 0; i < ctx->nMrs; ++i) {
|
||||
if (ctx->mrs[i].mr) {
|
||||
@@ -158,7 +157,7 @@ mscclppResult_t mscclppIbContextDestroy(struct mscclppIbContext *ctx)
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext *ctx, struct mscclppIbQp **ibQp, int port/*=-1*/)
|
||||
mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext* ctx, struct mscclppIbQp** ibQp, int port /*=-1*/)
|
||||
{
|
||||
if (port < 0) {
|
||||
port = ctx->ports[0];
|
||||
@@ -176,7 +175,7 @@ mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext *ctx, struct ms
|
||||
}
|
||||
}
|
||||
|
||||
struct ibv_cq *cq = ibv_create_cq(ctx->ctx, MSCCLPP_IB_CQ_SIZE, NULL, NULL, 0);
|
||||
struct ibv_cq* cq = ibv_create_cq(ctx->ctx, MSCCLPP_IB_CQ_SIZE, NULL, NULL, 0);
|
||||
if (cq == NULL) {
|
||||
WARN("ibv_create_cq failed (errno %d)", errno);
|
||||
return mscclppInternalError;
|
||||
@@ -193,7 +192,7 @@ mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext *ctx, struct ms
|
||||
qp_init_attr.cap.max_send_sge = 1;
|
||||
qp_init_attr.cap.max_recv_sge = 1;
|
||||
qp_init_attr.cap.max_inline_data = 0;
|
||||
struct ibv_qp *qp = ibv_create_qp(ctx->pd, &qp_init_attr);
|
||||
struct ibv_qp* qp = ibv_create_qp(ctx->pd, &qp_init_attr);
|
||||
if (qp == nullptr) {
|
||||
WARN("ibv_create_qp failed (errno %d)", errno);
|
||||
return mscclppInternalError;
|
||||
@@ -219,7 +218,7 @@ mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext *ctx, struct ms
|
||||
WARN("too many QPs");
|
||||
return mscclppInternalError;
|
||||
}
|
||||
struct mscclppIbQp *_ibQp = &ctx->qps[ctx->nQps - 1];
|
||||
struct mscclppIbQp* _ibQp = &ctx->qps[ctx->nQps - 1];
|
||||
_ibQp->qp = qp;
|
||||
_ibQp->info.lid = port_attr.lid;
|
||||
_ibQp->info.port = port;
|
||||
@@ -229,8 +228,8 @@ mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext *ctx, struct ms
|
||||
if (port_attr.link_layer != IBV_LINK_LAYER_INFINIBAND) {
|
||||
union ibv_gid gid;
|
||||
if (ibv_query_gid(ctx->ctx, port, 0, &gid) != 0) {
|
||||
WARN("ibv_query_gid failed (errno %d)", errno);
|
||||
return mscclppInternalError;
|
||||
WARN("ibv_query_gid failed (errno %d)", errno);
|
||||
return mscclppInternalError;
|
||||
}
|
||||
_ibQp->info.spn = gid.global.subnet_prefix;
|
||||
}
|
||||
@@ -256,7 +255,8 @@ mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext *ctx, struct ms
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext *ctx, void *buff, size_t size, struct mscclppIbMr **ibMr)
|
||||
mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext* ctx, void* buff, size_t size,
|
||||
struct mscclppIbMr** ibMr)
|
||||
{
|
||||
if (size == 0) {
|
||||
WARN("invalid size: %zu", size);
|
||||
@@ -271,8 +271,8 @@ mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext *ctx, void *b
|
||||
}
|
||||
uintptr_t addr = reinterpret_cast<uintptr_t>(buff) & -pageSize;
|
||||
size_t pages = (size + (reinterpret_cast<uintptr_t>(buff) - addr) + pageSize - 1) / pageSize;
|
||||
struct ibv_mr *mr =
|
||||
ibv_reg_mr(ctx->pd, reinterpret_cast<void *>(addr), pages * pageSize,
|
||||
struct ibv_mr* mr =
|
||||
ibv_reg_mr(ctx->pd, reinterpret_cast<void*>(addr), pages * pageSize,
|
||||
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_RELAXED_ORDERING);
|
||||
if (mr == nullptr) {
|
||||
WARN("ibv_reg_mr failed (errno %d)", errno);
|
||||
@@ -287,7 +287,7 @@ mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext *ctx, void *b
|
||||
WARN("too many MRs");
|
||||
return mscclppInternalError;
|
||||
}
|
||||
struct mscclppIbMr *_ibMr = &ctx->mrs[ctx->nMrs - 1];
|
||||
struct mscclppIbMr* _ibMr = &ctx->mrs[ctx->nMrs - 1];
|
||||
_ibMr->mr = mr;
|
||||
_ibMr->buff = buff;
|
||||
_ibMr->info.addr = (uint64_t)buff;
|
||||
@@ -298,7 +298,7 @@ mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext *ctx, void *b
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int mscclppIbQp::rtr(const mscclppIbQpInfo *info)
|
||||
int mscclppIbQp::rtr(const mscclppIbQpInfo* info)
|
||||
{
|
||||
struct ibv_qp_attr qp_attr;
|
||||
std::memset(&qp_attr, 0, sizeof(struct ibv_qp_attr));
|
||||
@@ -324,8 +324,8 @@ int mscclppIbQp::rtr(const mscclppIbQpInfo *info)
|
||||
qp_attr.ah_attr.src_path_bits = 0;
|
||||
qp_attr.ah_attr.port_num = info->port;
|
||||
return ibv_modify_qp(this->qp, &qp_attr,
|
||||
IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
|
||||
IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER);
|
||||
IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
|
||||
IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER);
|
||||
}
|
||||
|
||||
int mscclppIbQp::rts()
|
||||
@@ -339,19 +339,19 @@ int mscclppIbQp::rts()
|
||||
qp_attr.sq_psn = 0;
|
||||
qp_attr.max_rd_atomic = 1;
|
||||
return ibv_modify_qp(this->qp, &qp_attr,
|
||||
IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN |
|
||||
IBV_QP_MAX_QP_RD_ATOMIC);
|
||||
IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN |
|
||||
IBV_QP_MAX_QP_RD_ATOMIC);
|
||||
}
|
||||
|
||||
int mscclppIbQp::stageSend(struct mscclppIbMr *ibMr, const mscclppIbMrInfo *info, uint32_t size,
|
||||
uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled)
|
||||
int mscclppIbQp::stageSend(struct mscclppIbMr* ibMr, const mscclppIbMrInfo* info, uint32_t size, uint64_t wrId,
|
||||
uint64_t srcOffset, uint64_t dstOffset, bool signaled)
|
||||
{
|
||||
if (this->wrn >= MSCCLPP_IB_MAX_SENDS) {
|
||||
return -1;
|
||||
}
|
||||
int wrn = this->wrn;
|
||||
struct ibv_send_wr *wr_ = &this->wrs[wrn];
|
||||
struct ibv_sge *sge_ = &this->sges[wrn];
|
||||
struct ibv_send_wr* wr_ = &this->wrs[wrn];
|
||||
struct ibv_sge* sge_ = &this->sges[wrn];
|
||||
// std::memset(wr_, 0, sizeof(struct ibv_send_wr));
|
||||
// std::memset(sge_, 0, sizeof(struct ibv_sge));
|
||||
wr_->wr_id = wrId;
|
||||
@@ -372,8 +372,8 @@ int mscclppIbQp::stageSend(struct mscclppIbMr *ibMr, const mscclppIbMrInfo *info
|
||||
return this->wrn;
|
||||
}
|
||||
|
||||
int mscclppIbQp::stageSendWithImm(struct mscclppIbMr *ibMr, const mscclppIbMrInfo *info, uint32_t size,
|
||||
uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData)
|
||||
int mscclppIbQp::stageSendWithImm(struct mscclppIbMr* ibMr, const mscclppIbMrInfo* info, uint32_t size, uint64_t wrId,
|
||||
uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData)
|
||||
{
|
||||
int wrn = this->stageSend(ibMr, info, size, wrId, srcOffset, dstOffset, signaled);
|
||||
this->wrs[wrn - 1].opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
|
||||
@@ -387,7 +387,7 @@ int mscclppIbQp::postSend()
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct ibv_send_wr *bad_wr;
|
||||
struct ibv_send_wr* bad_wr;
|
||||
int ret = ibv_post_send(this->qp, this->wrs, &bad_wr);
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
|
||||
@@ -7,38 +7,35 @@
|
||||
#ifndef NCCL_ALIGN_H_
|
||||
#define NCCL_ALIGN_H_
|
||||
|
||||
#define DIVUP(x, y) \
|
||||
(((x)+(y)-1)/(y))
|
||||
#define DIVUP(x, y) (((x) + (y)-1) / (y))
|
||||
|
||||
#define ROUNDUP(x, y) \
|
||||
(DIVUP((x), (y))*(y))
|
||||
#define ROUNDUP(x, y) (DIVUP((x), (y)) * (y))
|
||||
|
||||
#define ALIGN_SIZE(size, align) \
|
||||
size = ((size + (align) - 1) / (align)) * (align);
|
||||
#define ALIGN_SIZE(size, align) size = ((size + (align)-1) / (align)) * (align);
|
||||
|
||||
#if !__CUDA_ARCH__
|
||||
#ifndef __host__
|
||||
#define __host__
|
||||
#endif
|
||||
#ifndef __device__
|
||||
#define __device__
|
||||
#endif
|
||||
#ifndef __host__
|
||||
#define __host__
|
||||
#endif
|
||||
#ifndef __device__
|
||||
#define __device__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
||||
__host__ __device__ constexpr Z divUp(X x, Y y) {
|
||||
return (x+y-1)/y;
|
||||
template <typename X, typename Y, typename Z = decltype(X() + Y())> __host__ __device__ constexpr Z divUp(X x, Y y)
|
||||
{
|
||||
return (x + y - 1) / y;
|
||||
}
|
||||
|
||||
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
||||
__host__ __device__ constexpr Z roundUp(X x, Y y) {
|
||||
return (x+y-1) - (x+y-1)%y;
|
||||
template <typename X, typename Y, typename Z = decltype(X() + Y())> __host__ __device__ constexpr Z roundUp(X x, Y y)
|
||||
{
|
||||
return (x + y - 1) - (x + y - 1) % y;
|
||||
}
|
||||
|
||||
// assumes second argument is a power of 2
|
||||
template<typename X, typename Z = decltype(X()+int())>
|
||||
__host__ __device__ constexpr Z alignUp(X x, int a) {
|
||||
return (x+a-1) & Z(-a);
|
||||
template <typename X, typename Z = decltype(X() + int())> __host__ __device__ constexpr Z alignUp(X x, int a)
|
||||
{
|
||||
return (x + a - 1) & Z(-a);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -7,88 +7,94 @@
|
||||
#ifndef MSCCLPP_ALLOC_H_
|
||||
#define MSCCLPP_ALLOC_H_
|
||||
|
||||
#include "mscclpp.h"
|
||||
#include "checks.h"
|
||||
#include "align.h"
|
||||
#include "checks.h"
|
||||
#include "mscclpp.h"
|
||||
#include "utils.h"
|
||||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
|
||||
uint64_t clockNano(); // from utils.h with which we have a circular dependency
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
template <typename T> mscclppResult_t mscclppCudaHostCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line)
|
||||
{
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish);
|
||||
memset(*ptr, 0, nelem*sizeof(T));
|
||||
CUDACHECKGOTO(cudaHostAlloc(ptr, nelem * sizeof(T), cudaHostAllocMapped), result, finish);
|
||||
memset(*ptr, 0, nelem * sizeof(T));
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
if (*ptr == nullptr)
|
||||
WARN("Failed to CUDA host alloc %ld bytes", nelem * sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define mscclppCudaHostCalloc(...) mscclppCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
inline mscclppResult_t mscclppCudaHostFree(void* ptr) {
|
||||
inline mscclppResult_t mscclppCudaHostFree(void* ptr)
|
||||
{
|
||||
CUDACHECK(cudaFreeHost(ptr));
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
void* p = malloc(nelem*sizeof(T));
|
||||
template <typename T> mscclppResult_t mscclppCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line)
|
||||
{
|
||||
void* p = malloc(nelem * sizeof(T));
|
||||
if (p == NULL) {
|
||||
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
|
||||
WARN("Failed to malloc %ld bytes", nelem * sizeof(T));
|
||||
return mscclppSystemError;
|
||||
}
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p);
|
||||
memset(p, 0, nelem*sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), p);
|
||||
memset(p, 0, nelem * sizeof(T));
|
||||
*ptr = (T*)p;
|
||||
return mscclppSuccess;
|
||||
}
|
||||
#define mscclppCalloc(...) mscclppCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppRealloc(T** ptr, size_t oldNelem, size_t nelem) {
|
||||
if (nelem < oldNelem) return mscclppInternalError;
|
||||
if (nelem == oldNelem) return mscclppSuccess;
|
||||
template <typename T> mscclppResult_t mscclppRealloc(T** ptr, size_t oldNelem, size_t nelem)
|
||||
{
|
||||
if (nelem < oldNelem)
|
||||
return mscclppInternalError;
|
||||
if (nelem == oldNelem)
|
||||
return mscclppSuccess;
|
||||
|
||||
T* oldp = *ptr;
|
||||
T* p = (T*)malloc(nelem*sizeof(T));
|
||||
T* p = (T*)malloc(nelem * sizeof(T));
|
||||
if (p == NULL) {
|
||||
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
|
||||
WARN("Failed to malloc %ld bytes", nelem * sizeof(T));
|
||||
return mscclppSystemError;
|
||||
}
|
||||
memcpy(p, oldp, oldNelem*sizeof(T));
|
||||
memcpy(p, oldp, oldNelem * sizeof(T));
|
||||
free(oldp);
|
||||
memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T));
|
||||
memset(p + oldNelem, 0, (nelem - oldNelem) * sizeof(T));
|
||||
*ptr = (T*)p;
|
||||
INFO(MSCCLPP_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr);
|
||||
INFO(MSCCLPP_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem * sizeof(T), nelem * sizeof(T),
|
||||
*ptr);
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
template <typename T> mscclppResult_t mscclppCudaMallocDebug(T** ptr, size_t nelem, const char* filefunc, int line)
|
||||
{
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
if (*ptr == nullptr)
|
||||
WARN("Failed to CUDA malloc %ld bytes", nelem * sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define mscclppCudaMalloc(...) mscclppCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
template <typename T> mscclppResult_t mscclppCudaCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line)
|
||||
{
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
@@ -96,36 +102,39 @@ mscclppResult_t mscclppCudaCallocDebug(T** ptr, size_t nelem, const char *filefu
|
||||
// Need a side stream so as not to interfere with graph capture.
|
||||
cudaStream_t stream;
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
|
||||
CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
if (*ptr == nullptr)
|
||||
WARN("Failed to CUDA calloc %ld bytes", nelem * sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define mscclppCudaCalloc(...) mscclppCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) {
|
||||
mscclppResult_t mscclppCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char* filefunc, int line)
|
||||
{
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
|
||||
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
if (*ptr == nullptr)
|
||||
WARN("Failed to CUDA calloc async %ld bytes", nelem * sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define mscclppCudaCallocAsync(...) mscclppCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCudaMemcpy(T* dst, T* src, size_t nelem) {
|
||||
template <typename T> mscclppResult_t mscclppCudaMemcpy(T* dst, T* src, size_t nelem)
|
||||
{
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
@@ -140,19 +149,19 @@ finish:
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) {
|
||||
template <typename T> mscclppResult_t mscclppCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream)
|
||||
{
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish);
|
||||
CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem * sizeof(T), cudaMemcpyDefault, stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppCudaFree(T* ptr) {
|
||||
template <typename T> mscclppResult_t mscclppCudaFree(T* ptr)
|
||||
{
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
@@ -165,12 +174,14 @@ finish:
|
||||
// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
|
||||
// allocated on separate pages as those pages will be marked DONTFORK
|
||||
// and if they are shared, that could cause a crash in a child process
|
||||
inline mscclppResult_t mscclppIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
|
||||
inline mscclppResult_t mscclppIbMallocDebug(void** ptr, size_t size, const char* filefunc, int line)
|
||||
{
|
||||
size_t page_size = sysconf(_SC_PAGESIZE);
|
||||
void* p;
|
||||
int size_aligned = ROUNDUP(size, page_size);
|
||||
int ret = posix_memalign(&p, page_size, size_aligned);
|
||||
if (ret != 0) return mscclppSystemError;
|
||||
if (ret != 0)
|
||||
return mscclppSystemError;
|
||||
memset(p, 0, size);
|
||||
*ptr = p;
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr);
|
||||
|
||||
@@ -12,21 +12,24 @@
|
||||
|
||||
#include "comm.h"
|
||||
|
||||
struct mscclppBootstrapHandle {
|
||||
struct mscclppBootstrapHandle
|
||||
{
|
||||
uint64_t magic;
|
||||
union mscclppSocketAddress addr;
|
||||
};
|
||||
static_assert(sizeof(struct mscclppBootstrapHandle) <= sizeof(mscclppUniqueId), "Bootstrap handle is too large to fit inside MSCCLPP unique ID");
|
||||
static_assert(sizeof(struct mscclppBootstrapHandle) <= sizeof(mscclppUniqueId),
|
||||
"Bootstrap handle is too large to fit inside MSCCLPP unique ID");
|
||||
|
||||
// Bootstrap API. Only declarations live here; the implementations are outside this header.
// `commState` is an opaque pointer (presumably the value stored in mscclppComm::bootstrap -- confirm).
mscclppResult_t bootstrapNetInit(const char* ip_port_pair = nullptr);
mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle);
mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot = true,
                                     const char* ip_port_pair = nullptr);
mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscclppComm* comm);
mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size);
mscclppResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
mscclppResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
mscclppResult_t bootstrapBarrier(void* commState, int* ranks, int rank, int nranks, int tag);
mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int* ranks, int rank, int nranks, void* allData, int size);
mscclppResult_t bootstrapClose(void* commState);
mscclppResult_t bootstrapAbort(void* commState);
|
||||
#endif
|
||||
|
||||
@@ -11,151 +11,174 @@
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
// Check CUDA RT calls.
// Evaluates `cmd` once. If the result is not cudaSuccess, logs the CUDA error string with WARN and
// returns mscclppUnhandledCudaError from the enclosing function.
#define CUDACHECK(cmd) \
  do { \
    cudaError_t err = (cmd); \
    if (err != cudaSuccess) { \
      WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
      return mscclppUnhandledCudaError; \
    } \
  } while (false)
|
||||
|
||||
// Like CUDACHECK, but instead of returning it stores mscclppUnhandledCudaError in `res` and jumps to
// `label`, so the caller can run cleanup code. `res` is left untouched when `cmd` succeeds.
#define CUDACHECKGOTO(cmd, res, label) \
  do { \
    cudaError_t err = (cmd); \
    if (err != cudaSuccess) { \
      WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
      res = mscclppUnhandledCudaError; \
      goto label; \
    } \
  } while (false)
|
||||
|
||||
// Report failure but clear error and continue.
// Evaluates `cmd` once. On failure it logs the call site and the CUDA error string with INFO, then calls
// cudaGetLastError() and discards the result. It neither returns nor jumps.
#define CUDACHECKIGNORE(cmd) \
  do { \
    cudaError_t err = (cmd); \
    if (err != cudaSuccess) { \
      INFO(MSCCLPP_ALL, "%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \
      (void)cudaGetLastError(); \
    } \
  } while (false)
|
||||
|
||||
#include <errno.h>
|
||||
// Check system calls.
// Convenience form of SYSCHECKVAL for callers that do not need the call's return value: the value is
// kept in a scratch int that is local to the macro.
#define SYSCHECK(call, name) \
  do { \
    int retval; \
    SYSCHECKVAL(call, name, retval); \
  } while (false)
|
||||
|
||||
// Runs `call` through SYSCHECKSYNC and stores its result in `retval`. If the final result is -1, logs
// strerror(errno) with WARN and returns mscclppSystemError from the enclosing function.
// `name` must be a string literal: it is pasted into the format string.
#define SYSCHECKVAL(call, name, retval) \
  do { \
    SYSCHECKSYNC(call, name, retval); \
    if (retval == -1) { \
      WARN("Call to " name " failed : %s", strerror(errno)); \
      return mscclppSystemError; \
    } \
  } while (false)
|
||||
|
||||
// Evaluates `call` into `retval`, repeating for as long as it yields -1 with errno equal to EINTR,
// EWOULDBLOCK or EAGAIN (each retry is logged with INFO). Any other outcome ends the loop and leaves the
// last result in `retval`. The retry is immediate: there is no delay between attempts.
// `name` must be a string literal: it is pasted into the format string.
#define SYSCHECKSYNC(call, name, retval) \
  do { \
    retval = call; \
    if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
      INFO(MSCCLPP_ALL, "Call to " name " returned %s, retrying", strerror(errno)); \
    } else { \
      break; \
    } \
  } while (true)
|
||||
|
||||
// If `statement` evaluates to -1, stores mscclppSystemError in `res`, logs the call site with INFO and
// jumps to `label`. Otherwise `res` is left untouched.
// NOTE: the definition itself ends with ';', so "SYSCHECKGOTO(...);" expands to two statements. Do not
// use it as the unbraced body of an `if` that has an `else`.
#define SYSCHECKGOTO(statement, res, label) \
  do { \
    if ((statement) == -1) { \
      /* Print the back trace*/ \
      res = mscclppSystemError; \
      INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \
      goto label; \
    } \
  } while (0);
|
||||
|
||||
// Returns mscclppSystemError from the enclosing function (after logging the call site with INFO) when
// `statement` is NOT equal to `value`.
// `value` is parenthesized so that an expression argument such as `a | b` is compared as a whole; without
// the parentheses `(statement) != a | b` would parse as `((statement) != a) | b`.
// NOTE: the definition itself ends with ';', so "NEQCHECK(...);" expands to two statements. Do not use it
// as the unbraced body of an `if` that has an `else`.
#define NEQCHECK(statement, value) \
  do { \
    if ((statement) != (value)) { \
      /* Print the back trace*/ \
      INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, mscclppSystemError); \
      return mscclppSystemError; \
    } \
  } while (0);
|
||||
|
||||
// When `statement` is NOT equal to `value`: stores mscclppSystemError in `res`, logs the call site with
// INFO and jumps to `label`. Otherwise `res` is left untouched.
// `value` is parenthesized so that an expression argument such as `a | b` is compared as a whole; without
// the parentheses `(statement) != a | b` would parse as `((statement) != a) | b`.
// NOTE: the definition itself ends with ';', so "NEQCHECKGOTO(...);" expands to two statements. Do not
// use it as the unbraced body of an `if` that has an `else`.
#define NEQCHECKGOTO(statement, value, res, label) \
  do { \
    if ((statement) != (value)) { \
      /* Print the back trace*/ \
      res = mscclppSystemError; \
      INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \
      goto label; \
    } \
  } while (0);
|
||||
|
||||
// Returns mscclppSystemError from the enclosing function (after logging the call site with INFO) when
// `statement` IS equal to `value`.
// `value` is parenthesized so that an expression argument such as `a | b` is compared as a whole; without
// the parentheses `(statement) == a | b` would parse as `((statement) == a) | b`.
// NOTE: the definition itself ends with ';', so "EQCHECK(...);" expands to two statements. Do not use it
// as the unbraced body of an `if` that has an `else`.
#define EQCHECK(statement, value) \
  do { \
    if ((statement) == (value)) { \
      /* Print the back trace*/ \
      INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, mscclppSystemError); \
      return mscclppSystemError; \
    } \
  } while (0);
|
||||
|
||||
// When `statement` IS equal to `value`: stores mscclppSystemError in `res`, logs the call site with INFO
// and jumps to `label`. Otherwise `res` is left untouched.
// `value` is parenthesized so that an expression argument such as `a | b` is compared as a whole; without
// the parentheses `(statement) == a | b` would parse as `((statement) == a) | b`.
// NOTE: the definition itself ends with ';', so "EQCHECKGOTO(...);" expands to two statements. Do not use
// it as the unbraced body of an `if` that has an `else`.
#define EQCHECKGOTO(statement, value, res, label) \
  do { \
    if ((statement) == (value)) { \
      /* Print the back trace*/ \
      res = mscclppSystemError; \
      INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \
      goto label; \
    } \
  } while (0);
|
||||
|
||||
// Propagate errors up.
// Evaluates `call` once. Any result other than mscclppSuccess / mscclppInProgress is returned from the
// enclosing function; the call site is logged with INFO unless mscclppDebugNoWarn is non-zero.
// NOTE: the definition itself ends with ';', so "MSCCLPPCHECK(...);" expands to two statements. Do not
// use it as the unbraced body of an `if` that has an `else`.
#define MSCCLPPCHECK(call) \
  do { \
    mscclppResult_t res = call; \
    if (res != mscclppSuccess && res != mscclppInProgress) { \
      /* Print the back trace*/ \
      if (mscclppDebugNoWarn == 0) { \
        INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \
      } \
      return res; \
    } \
  } while (0);
|
||||
|
||||
// Stores the result of `call` in `res` (always, including on success). If it is neither mscclppSuccess
// nor mscclppInProgress, logs the call site with INFO (unless mscclppDebugNoWarn is non-zero) and jumps
// to `label`.
// NOTE: the definition itself ends with ';', so "MSCCLPPCHECKGOTO(...);" expands to two statements. Do
// not use it as the unbraced body of an `if` that has an `else`.
#define MSCCLPPCHECKGOTO(call, res, label) \
  do { \
    res = call; \
    if (res != mscclppSuccess && res != mscclppInProgress) { \
      /* Print the back trace*/ \
      if (mscclppDebugNoWarn == 0) { \
        INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \
      } \
      goto label; \
    } \
  } while (0);
|
||||
|
||||
// Re-evaluates `call` until `cond` becomes true.
//  - If `call` yields anything other than mscclppSuccess / mscclppInProgress, the call site is logged
//    (unless mscclppDebugNoWarn is non-zero) and the enclosing function returns mscclppInternalError
//    (not the original result).
//  - If `abortFlagPtr` is non-null and the flag it points to is non-zero, NEQCHECK makes the enclosing
//    function return mscclppSystemError.
// `abortFlagPtr` and `cond` are re-evaluated on every iteration.
// NOTE: the definition itself ends with ';'.
#define MSCCLPPWAIT(call, cond, abortFlagPtr) \
  do { \
    volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
    mscclppResult_t res = call; \
    if (res != mscclppSuccess && res != mscclppInProgress) { \
      if (mscclppDebugNoWarn == 0) { \
        INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \
      } \
      return mscclppInternalError; \
    } \
    if (tmpAbortFlag) { \
      NEQCHECK(*tmpAbortFlag, 0); \
    } \
  } while (!(cond));
|
||||
|
||||
// Goto-flavoured MSCCLPPWAIT: re-evaluates `call` into `res` until `cond` becomes true.
//  - If `call` yields anything other than mscclppSuccess / mscclppInProgress, the call site is logged
//    (unless mscclppDebugNoWarn is non-zero) and control jumps to `label` with that result in `res`.
//  - If `abortFlagPtr` is non-null and the flag it points to is non-zero, NEQCHECKGOTO stores
//    mscclppSystemError in `res` and jumps to `label`.
// `abortFlagPtr` and `cond` are re-evaluated on every iteration.
// NOTE: the definition itself ends with ';'.
#define MSCCLPPWAITGOTO(call, cond, abortFlagPtr, res, label) \
  do { \
    volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
    res = call; \
    if (res != mscclppSuccess && res != mscclppInProgress) { \
      if (mscclppDebugNoWarn == 0) { \
        INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \
      } \
      goto label; \
    } \
    if (tmpAbortFlag) { \
      NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \
    } \
  } while (!(cond));
|
||||
|
||||
// For thread entry points that report their status through `args->ret`.
// Stores the result of `a` in (args)->ret (always). If it is neither mscclppSuccess nor
// mscclppInProgress, logs the call site with INFO and returns `args` from the enclosing function.
#define MSCCLPPCHECKTHREAD(a, args) \
  do { \
    if (((args)->ret = (a)) != mscclppSuccess && (args)->ret != mscclppInProgress) { \
      INFO(MSCCLPP_INIT, "%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \
      return args; \
    } \
  } while (0)
|
||||
|
||||
// CUDA counterpart of MSCCLPPCHECKTHREAD. Unlike that macro it takes no `args` parameter: it expects a
// pointer named `args` (with a `ret` member) to be in scope at the point of use.
// Evaluates `a` once. On any result other than cudaSuccess it stores mscclppUnhandledCudaError in
// args->ret, logs the call site with INFO and returns `args` from the enclosing function.
// The status is stored BEFORE logging, so the logged value is the error being reported and not whatever
// args->ret held previously.
#define CUDACHECKTHREAD(a) \
  do { \
    if ((a) != cudaSuccess) { \
      args->ret = mscclppUnhandledCudaError; \
      INFO(MSCCLPP_INIT, "%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
      return args; \
    } \
  } while (0)
|
||||
|
||||
#endif
|
||||
|
||||
@@ -7,8 +7,8 @@
|
||||
#ifndef MSCCLPP_COMM_H_
|
||||
#define MSCCLPP_COMM_H_
|
||||
|
||||
#include "proxy.h"
|
||||
#include "ib.h"
|
||||
#include "proxy.h"
|
||||
|
||||
// #define CACHE_LINE_SIZE 128
|
||||
// #define MEM_ALIGN 4096
|
||||
@@ -21,43 +21,46 @@
|
||||
|
||||
#define MAXCONNECTIONS 1024
|
||||
|
||||
struct mscclppConn {
|
||||
struct mscclppConn
|
||||
{
|
||||
mscclppTransport_t transport;
|
||||
int remoteRank;
|
||||
uint64_t buffSize;
|
||||
uint64_t *remoteProxyFlag;
|
||||
uint64_t *cpuProxyFlag;
|
||||
void *cpuProxyFlagGdrDesc;
|
||||
struct mscclppDevConn *devConn;
|
||||
struct mscclppIbContext *ibCtx;
|
||||
struct mscclppIbQp *ibQp;
|
||||
struct mscclppIbMr *ibBuffMr;
|
||||
struct mscclppIbMr *ibLocalFlagMr;
|
||||
struct mscclppIbMr *ibProxyFlagMr;
|
||||
uint64_t* remoteProxyFlag;
|
||||
uint64_t* cpuProxyFlag;
|
||||
void* cpuProxyFlagGdrDesc;
|
||||
struct mscclppDevConn* devConn;
|
||||
struct mscclppIbContext* ibCtx;
|
||||
struct mscclppIbQp* ibQp;
|
||||
struct mscclppIbMr* ibBuffMr;
|
||||
struct mscclppIbMr* ibLocalFlagMr;
|
||||
struct mscclppIbMr* ibProxyFlagMr;
|
||||
struct mscclppIbMrInfo ibBuffMrInfo;
|
||||
struct mscclppIbMrInfo ibLocalFlagMrInfo;
|
||||
struct mscclppIbMrInfo ibProxyFlagMrInfo;
|
||||
};
|
||||
|
||||
struct mscclppComm {
|
||||
struct mscclppComm
|
||||
{
|
||||
struct mscclppConn conns[MAXCONNECTIONS];
|
||||
struct mscclppDevConn devConns[MAXCONNECTIONS];
|
||||
int nConns;
|
||||
|
||||
void* bootstrap;
|
||||
|
||||
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
|
||||
uint64_t
|
||||
magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
|
||||
|
||||
int rank; // my rank in the communicator
|
||||
int nRanks; // number of GPUs in communicator
|
||||
int cudaDev; // my cuda device index
|
||||
|
||||
// Flag to ask MSCCLPP kernels to abort
|
||||
volatile uint32_t *abortFlag;
|
||||
volatile uint32_t* abortFlag;
|
||||
|
||||
struct mscclppIbContext *ibContext[MSCCLPP_IB_MAX_DEVS];
|
||||
struct mscclppIbContext* ibContext[MSCCLPP_IB_MAX_DEVS];
|
||||
cudaStream_t stream; // DMA engine stream for P2P
|
||||
struct mscclppProxyState *proxyState[MSCCLPP_PROXY_MAX_NUM];
|
||||
struct mscclppProxyState* proxyState[MSCCLPP_PROXY_MAX_NUM];
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -7,32 +7,24 @@
|
||||
#ifndef MSCCLPP_CORE_H_
|
||||
#define MSCCLPP_CORE_H_
|
||||
|
||||
#include <pthread.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <algorithm> // For std::min/std::max
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "mscclpp.h"
|
||||
#include "debug.h"
|
||||
#include "alloc.h"
|
||||
#include "debug.h"
|
||||
#include "mscclpp.h"
|
||||
#include "param.h"
|
||||
#include <algorithm> // For std::min/std::max
|
||||
#include <pthread.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
// MSCCLPP_API(ret, func, args...) introduces the definition of an exported C-linkage function `func`
// with default symbol visibility.
// With PROFAPI defined it additionally declares p<func> as an alias of `func` and makes `func` itself
// a weak symbol.
// Uses the GNU named-variadic macro extension (`args...`).
#ifdef PROFAPI
#define MSCCLPP_API(ret, func, args...) \
  __attribute__((visibility("default"))) __attribute__((alias(#func))) ret p##func(args); \
  extern "C" __attribute__((visibility("default"))) __attribute__((weak)) ret func(args)
#else
#define MSCCLPP_API(ret, func, args...) extern "C" __attribute__((visibility("default"))) ret func(args)
#endif // end PROFAPI
|
||||
|
||||
#endif // end include guard
|
||||
|
||||
@@ -8,27 +8,49 @@
|
||||
#define MSCCLPP_DEBUG_H_
|
||||
|
||||
#include "mscclpp.h"
|
||||
#include <stdio.h>
|
||||
#include <chrono>
|
||||
#include <stdio.h>
|
||||
#include <type_traits>
|
||||
|
||||
#include <limits.h>
|
||||
#include <string.h>
|
||||
#include <pthread.h>
|
||||
#include <string.h>
|
||||
|
||||
// Conform to pthread and NVTX standard
|
||||
#define MSCCLPP_THREAD_NAMELEN 16
|
||||
|
||||
// Log levels, with increasing numeric value.
typedef enum
{
  MSCCLPP_LOG_NONE = 0,
  MSCCLPP_LOG_VERSION = 1,
  MSCCLPP_LOG_WARN = 2,
  MSCCLPP_LOG_INFO = 3,
  MSCCLPP_LOG_ABORT = 4,
  MSCCLPP_LOG_TRACE = 5
} mscclppDebugLogLevel;

// Logging subsystems. Each one is a distinct bit so that they can be OR-ed into a mask;
// MSCCLPP_ALL has every bit set.
typedef enum
{
  MSCCLPP_INIT = 1 << 0,
  MSCCLPP_COLL = 1 << 1,
  MSCCLPP_P2P = 1 << 2,
  MSCCLPP_SHM = 1 << 3,
  MSCCLPP_NET = 1 << 4,
  MSCCLPP_GRAPH = 1 << 5,
  MSCCLPP_TUNING = 1 << 6,
  MSCCLPP_ENV = 1 << 7,
  MSCCLPP_ALLOC = 1 << 8,
  MSCCLPP_CALL = 1 << 9,
  MSCCLPP_ALL = ~0
} mscclppDebugLogSubSys;
|
||||
|
||||
extern int mscclppDebugLevel;
|
||||
extern uint64_t mscclppDebugMask;
|
||||
extern pthread_mutex_t mscclppDebugLock;
|
||||
extern FILE *mscclppDebugFile;
|
||||
extern FILE* mscclppDebugFile;
|
||||
extern mscclppResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||
|
||||
void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
|
||||
void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char* filefunc, int line, const char* fmt,
|
||||
...) __attribute__((format(printf, 5, 6)));
|
||||
|
||||
// Let code temporarily downgrade WARN into INFO
|
||||
extern thread_local int mscclppDebugNoWarn;
|
||||
@@ -45,6 +67,6 @@ extern std::chrono::steady_clock::time_point mscclppEpoch;
|
||||
#define TRACE(...)
|
||||
#endif
|
||||
|
||||
// Names `thread` using a printf-style format. Presumably the result is limited to
// MSCCLPP_THREAD_NAMELEN characters -- confirm against the implementation.
void mscclppSetThreadName(pthread_t thread, const char* fmt, ...);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,53 +1,58 @@
|
||||
#ifndef MSCCLPP_GDR_H_
|
||||
#define MSCCLPP_GDR_H_
|
||||
|
||||
#include "gdrapi.h"
|
||||
#include "debug.h"
|
||||
#include "checks.h"
|
||||
#include "align.h"
|
||||
#include "alloc.h"
|
||||
#include "checks.h"
|
||||
#include "debug.h"
|
||||
#include "gdrapi.h"
|
||||
|
||||
// These can be used if the GDR library isn't thread safe
|
||||
#include <pthread.h>
|
||||
extern pthread_mutex_t gdrLock;
|
||||
#define GDRLOCK() pthread_mutex_lock(&gdrLock)
|
||||
#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock)
|
||||
// Evaluates `cmd` while holding gdrLock and stores its result in `ret`.
#define GDRLOCKCALL(cmd, ret) \
  do { \
    GDRLOCK(); \
    ret = cmd; \
    GDRUNLOCK(); \
  } while (false)
|
||||
|
||||
// Evaluates `cmd` once. If the result is non-zero, logs it with WARN and returns mscclppSystemError from
// the enclosing function. The call is made WITHOUT taking gdrLock: the locked variant is commented out.
#define GDRCHECK(cmd) \
  do { \
    int e; \
    /* GDRLOCKCALL(cmd, e); */ \
    e = cmd; \
    if (e != 0) { \
      WARN("GDRCOPY failure %d", e); \
      return mscclppSystemError; \
    } \
  } while (false)
|
||||
|
||||
gdr_t wrap_gdr_open(void);
|
||||
mscclppResult_t wrap_gdr_close(gdr_t g);
|
||||
mscclppResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
|
||||
mscclppResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space,
|
||||
gdr_mh_t* handle);
|
||||
mscclppResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
|
||||
mscclppResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
|
||||
mscclppResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
|
||||
mscclppResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
|
||||
mscclppResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t* info);
|
||||
mscclppResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void** va, size_t size);
|
||||
mscclppResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void* va, size_t size);
|
||||
|
||||
// Global GDR driver handle
|
||||
extern gdr_t mscclppGdrCopy;
|
||||
|
||||
typedef struct gdr_mem_desc {
|
||||
void *gdrDevMem;
|
||||
void *gdrMap;
|
||||
typedef struct gdr_mem_desc
|
||||
{
|
||||
void* gdrDevMem;
|
||||
void* gdrMap;
|
||||
size_t gdrOffset;
|
||||
size_t gdrMapSize;
|
||||
gdr_mh_t gdrMh;
|
||||
} gdr_mem_desc_t;
|
||||
|
||||
static gdr_t mscclppGdrInit() {
|
||||
static gdr_t mscclppGdrInit()
|
||||
{
|
||||
// int libMajor, libMinor, drvMajor, drvMinor;
|
||||
gdr_t handle = wrap_gdr_open();
|
||||
|
||||
@@ -68,13 +73,15 @@ static gdr_t mscclppGdrInit() {
|
||||
// INFO(MSCCLPP_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor);
|
||||
// }
|
||||
return handle;
|
||||
// error:
|
||||
// if (handle != NULL) (void) wrap_gdr_close(handle);
|
||||
// return NULL;
|
||||
// error:
|
||||
// if (handle != NULL) (void) wrap_gdr_close(handle);
|
||||
// return NULL;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
mscclppResult_t mscclppGdrCudaCallocDebug(T** ptr, T** devPtr, size_t nelem, void** gdrDesc, const char *filefunc, int line) {
|
||||
mscclppResult_t mscclppGdrCudaCallocDebug(T** ptr, T** devPtr, size_t nelem, void** gdrDesc, const char* filefunc,
|
||||
int line)
|
||||
{
|
||||
mscclppResult_t result = mscclppSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
@@ -85,20 +92,20 @@ mscclppResult_t mscclppGdrCudaCallocDebug(T** ptr, T** devPtr, size_t nelem, voi
|
||||
gdr_info_t info;
|
||||
size_t mapSize;
|
||||
gdr_mh_t mh;
|
||||
char *devMem;
|
||||
void *gdrMap;
|
||||
char* devMem;
|
||||
void* gdrMap;
|
||||
ssize_t off;
|
||||
gdr_mem_desc_t* md;
|
||||
uint64_t alignedAddr;
|
||||
size_t align;
|
||||
|
||||
mapSize = sizeof(T)*nelem;
|
||||
mapSize = sizeof(T) * nelem;
|
||||
|
||||
// GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
|
||||
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
|
||||
// GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
|
||||
MSCCLPPCHECKGOTO(mscclppCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1), result, finish);
|
||||
alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
|
||||
MSCCLPPCHECKGOTO(mscclppCudaCalloc(&devMem, mapSize + GPU_PAGE_SIZE - 1), result, finish);
|
||||
alignedAddr = (((uint64_t)devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
|
||||
align = alignedAddr - (uint64_t)devMem;
|
||||
MSCCLPPCHECKGOTO(wrap_gdr_pin_buffer(mscclppGdrCopy, alignedAddr, mapSize, 0, 0, &mh), result, finish);
|
||||
|
||||
@@ -113,29 +120,31 @@ mscclppResult_t mscclppGdrCudaCallocDebug(T** ptr, T** devPtr, size_t nelem, voi
|
||||
md->gdrDevMem = devMem;
|
||||
md->gdrMap = gdrMap;
|
||||
md->gdrMapSize = mapSize;
|
||||
md->gdrOffset = off+align;
|
||||
md->gdrOffset = off + align;
|
||||
md->gdrMh = mh;
|
||||
*gdrDesc = md;
|
||||
|
||||
*ptr = (T *)((char *)gdrMap+off);
|
||||
if (devPtr) *devPtr = (T *)(devMem+off+align);
|
||||
*ptr = (T*)((char*)gdrMap + off);
|
||||
if (devPtr)
|
||||
*devPtr = (T*)(devMem + off + align);
|
||||
|
||||
TRACE(mscclpp_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
|
||||
md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
|
||||
TRACE(mscclpp_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p", md->gdrDevMem,
|
||||
md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
|
||||
|
||||
return mscclppSuccess;
|
||||
|
||||
finish:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
if (*ptr == nullptr)
|
||||
WARN("Failed to CUDA calloc %ld bytes", nelem * sizeof(T));
|
||||
INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define mscclppGdrCudaCalloc(...) mscclppGdrCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
|
||||
static mscclppResult_t mscclppGdrCudaFree(void* gdrDesc) {
|
||||
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrDesc;
|
||||
static mscclppResult_t mscclppGdrCudaFree(void* gdrDesc)
|
||||
{
|
||||
gdr_mem_desc_t* md = (gdr_mem_desc_t*)gdrDesc;
|
||||
MSCCLPPCHECK(wrap_gdr_unmap(mscclppGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
|
||||
MSCCLPPCHECK(wrap_gdr_unpin_buffer(mscclppGdrCopy, md->gdrMh));
|
||||
CUDACHECK(cudaFree(md->gdrDevMem));
|
||||
@@ -144,5 +153,4 @@ static mscclppResult_t mscclppGdrCudaFree(void* gdrDesc) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -2,10 +2,10 @@
|
||||
#define MSCCLPP_IB_H_
|
||||
|
||||
#include "mscclpp.h"
|
||||
#include <infiniband/verbs.h>
|
||||
#include <list>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <infiniband/verbs.h>
|
||||
|
||||
#define MSCCLPP_IB_CQ_SIZE 1024
|
||||
#define MSCCLPP_IB_CQ_POLL_NUM 4
|
||||
@@ -13,20 +13,23 @@
|
||||
#define MSCCLPP_IB_MAX_DEVS 8
|
||||
|
||||
// MR info to be shared with the remote peer: the region's address and its rkey.
struct mscclppIbMrInfo
{
  uint64_t addr;
  uint32_t rkey;
};
|
||||
|
||||
// IB memory region
|
||||
struct mscclppIbMr {
|
||||
struct ibv_mr *mr;
|
||||
void *buff;
|
||||
struct mscclppIbMr
|
||||
{
|
||||
struct ibv_mr* mr;
|
||||
void* buff;
|
||||
struct mscclppIbMrInfo info;
|
||||
};
|
||||
|
||||
// QP info to be shared with the remote peer
|
||||
struct mscclppIbQpInfo {
|
||||
struct mscclppIbQpInfo
|
||||
{
|
||||
uint16_t lid;
|
||||
uint8_t port;
|
||||
uint8_t linkLayer;
|
||||
@@ -36,44 +39,47 @@ struct mscclppIbQpInfo {
|
||||
};
|
||||
|
||||
// IB queue pair
|
||||
struct mscclppIbQp {
|
||||
struct ibv_qp *qp;
|
||||
struct mscclppIbQp
|
||||
{
|
||||
struct ibv_qp* qp;
|
||||
struct mscclppIbQpInfo info;
|
||||
struct ibv_send_wr *wrs;
|
||||
struct ibv_sge *sges;
|
||||
struct ibv_cq *cq;
|
||||
struct ibv_wc *wcs;
|
||||
struct ibv_send_wr* wrs;
|
||||
struct ibv_sge* sges;
|
||||
struct ibv_cq* cq;
|
||||
struct ibv_wc* wcs;
|
||||
int wrn;
|
||||
|
||||
int rtr(const mscclppIbQpInfo *info);
|
||||
int rtr(const mscclppIbQpInfo* info);
|
||||
int rts();
|
||||
int stageSend(struct mscclppIbMr *ibMr, const mscclppIbMrInfo *info, uint32_t size,
|
||||
uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled);
|
||||
int stageSendWithImm(struct mscclppIbMr *ibMr, const mscclppIbMrInfo *info, uint32_t size,
|
||||
uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData);
|
||||
int stageSend(struct mscclppIbMr* ibMr, const mscclppIbMrInfo* info, uint32_t size, uint64_t wrId, uint64_t srcOffset,
|
||||
uint64_t dstOffset, bool signaled);
|
||||
int stageSendWithImm(struct mscclppIbMr* ibMr, const mscclppIbMrInfo* info, uint32_t size, uint64_t wrId,
|
||||
uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData);
|
||||
int postSend();
|
||||
int postRecv(uint64_t wrId);
|
||||
int pollCq();
|
||||
};
|
||||
|
||||
// Holds resources of a single IB device.
|
||||
struct mscclppIbContext {
|
||||
struct mscclppIbContext
|
||||
{
|
||||
int numaNode;
|
||||
struct ibv_context *ctx;
|
||||
struct ibv_pd *pd;
|
||||
int *ports;
|
||||
struct ibv_context* ctx;
|
||||
struct ibv_pd* pd;
|
||||
int* ports;
|
||||
int nPorts;
|
||||
struct mscclppIbQp *qps;
|
||||
struct mscclppIbQp* qps;
|
||||
int nQps;
|
||||
int maxQps;
|
||||
struct mscclppIbMr *mrs;
|
||||
struct mscclppIbMr* mrs;
|
||||
int nMrs;
|
||||
int maxMrs;
|
||||
};
|
||||
|
||||
mscclppResult_t mscclppIbContextCreate(struct mscclppIbContext **ctx, const char *ibDevName);
|
||||
mscclppResult_t mscclppIbContextDestroy(struct mscclppIbContext *ctx);
|
||||
mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext *ctx, struct mscclppIbQp **ibQp, int port = -1);
|
||||
mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext *ctx, void *buff, size_t size, struct mscclppIbMr **ibMr);
|
||||
mscclppResult_t mscclppIbContextCreate(struct mscclppIbContext** ctx, const char* ibDevName);
|
||||
mscclppResult_t mscclppIbContextDestroy(struct mscclppIbContext* ctx);
|
||||
mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext* ctx, struct mscclppIbQp** ibQp, int port = -1);
|
||||
mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext* ctx, void* buff, size_t size,
|
||||
struct mscclppIbMr** ibMr);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -17,47 +17,47 @@ extern "C" {
|
||||
/***************************************************************************************************************
|
||||
* A mscclppDevConn provides a zero-copy connection between two GPUs connected via P2P NVLink or InfiniBand.
|
||||
* The communication API is one-sided meaning that for every single data transfer, only one side
|
||||
* needs to execute unlike a two-sided communication stack such as NCCL where both sides
|
||||
* needs to execute unlike a two-sided communication stack such as NCCL where both sides
|
||||
* need to execute a send and a receive instruction, respectively, for every transfer.
|
||||
*
|
||||
* A connection is uniquely identified by the (remoteRank, tag) pair at an endpoint.
|
||||
* The two endpoints register buffers of the same size with the connection.
|
||||
*
|
||||
*
|
||||
* A connection is uniquely identified by the (remoteRank, tag) pair at an endpoint.
|
||||
* The two endpoints register buffers of the same size with the connection.
|
||||
*
|
||||
* The endpoints provide the remoteRank, tag, and the buffer when registering a connection with mscclppConnect().
|
||||
*
|
||||
* mscclppConnectionSetup() sets up all the registered connections.
|
||||
*
|
||||
*
|
||||
* mscclppConnectionSetup() sets up all the registered connections.
|
||||
*
|
||||
***************************************************************************************************************
|
||||
* A proxy thread running on the CPU is necessary to perform transfers using InfiniBand or the DMA engine.
|
||||
* A proxy thread running on the CPU is necessary to perform transfers using InfiniBand or the DMA engine.
|
||||
* The current implementation uses a single proxy thread per context - one IB connection or DMA engine per node.
|
||||
* Thus multiple threadblocks using different connections might use the same CPU proxy thread.
|
||||
*
|
||||
* Thus multiple threadblocks using different connections might use the same CPU proxy thread.
|
||||
*
|
||||
* Before using any functionality of the connections, mscclppProxyLaunch needs to be called to spawn the
|
||||
* proxy threads. There are currently two types of connections:
|
||||
*
|
||||
*
|
||||
* P2P via NVLink: the DMA engine can perform the copy between the buffers. DMA engine has higher latency
|
||||
* but has a higher bandwidth and costs no compute cycles on the GPU.
|
||||
*
|
||||
*
|
||||
* InfiniBand: the RDMA engine copies the data over MLX devices.
|
||||
*
|
||||
*
|
||||
***************************************************************************************************************
|
||||
* At the runtime, a GPU kernel has access to a mscclppDevConn object that provides the following functions:
|
||||
*
|
||||
*
|
||||
* put(): the sender initiates a data transfer to the receiver.
|
||||
*
|
||||
*
|
||||
* signal(): the sender signals the receiver that data is ready to be consumed.
|
||||
*
|
||||
*
|
||||
* wait(): the receiver waits on the signal() to start reading the data.
|
||||
*
|
||||
*
|
||||
* The sender should not reuse the buffer till the signal returns.
|
||||
* The receiver should only access the data after the wait returns.
|
||||
*
|
||||
*
|
||||
* putWithSignal(): the sender initiates a data transfer and signals the receiver that data is ready to be consumed.
|
||||
* This is an optimized version of a put followed by a signal.
|
||||
*
|
||||
* These functions hide the complexity of synchronization between the two GPUs and the CPU proxy thread.
|
||||
*
|
||||
* These functions hide the complexity of synchronization between the two GPUs and the CPU proxy thread.
|
||||
* Example:
|
||||
*
|
||||
*
|
||||
* // sender GPU
|
||||
* devConn.put(data1)
|
||||
* // not OK to write to data1
|
||||
@@ -67,43 +67,54 @@ extern "C" {
|
||||
* // not OK to write to data1, data2, data3 // not OK to read data1, data2, data3
|
||||
* devConn.signal() -------------------------------> devConn.wait()
|
||||
* // OK to write to data1, data2, data3 // OK to read data1, data2, data3
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
* The two endpoints can concurrently use the same connection provided they are writing (puts) on different
|
||||
* indices in the registered buffer.
|
||||
* indices in the registered buffer.
|
||||
**************************************************************************************************************/
|
||||
struct mscclppDevConn {
|
||||
struct mscclppDevConn
|
||||
{
|
||||
#ifdef __CUDACC__
|
||||
__forceinline__ __device__ void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize){
|
||||
__forceinline__ __device__ void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize)
|
||||
{
|
||||
fifo.push(mscclppData, dstDataOffset, srcDataOffset, dataSize);
|
||||
}
|
||||
|
||||
__forceinline__ __device__ void put(uint64_t dataOffset, uint64_t dataSize){
|
||||
__forceinline__ __device__ void put(uint64_t dataOffset, uint64_t dataSize)
|
||||
{
|
||||
put(dataOffset, dataOffset, dataSize);
|
||||
}
|
||||
|
||||
__forceinline__ __device__ void signal(){
|
||||
__forceinline__ __device__ void signal()
|
||||
{
|
||||
epochIncrement();
|
||||
uint64_t curFifoHead = fifo.push(mscclppFlag | mscclppSync, 0, 0, 1);
|
||||
while (*(volatile uint64_t *)fifo.triggerFifoTail <= curFifoHead);
|
||||
while (*(volatile uint64_t*)fifo.triggerFifoTail <= curFifoHead)
|
||||
;
|
||||
}
|
||||
|
||||
__forceinline__ __device__ void putWithSignal(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize){
|
||||
__forceinline__ __device__ void putWithSignal(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize)
|
||||
{
|
||||
epochIncrement();
|
||||
uint64_t curFifoHead = fifo.push(mscclppData | mscclppFlag | mscclppSync, dstDataOffset, srcDataOffset, dataSize);
|
||||
while (*(volatile uint64_t *)fifo.triggerFifoTail <= curFifoHead);
|
||||
while (*(volatile uint64_t*)fifo.triggerFifoTail <= curFifoHead)
|
||||
;
|
||||
}
|
||||
|
||||
__forceinline__ __device__ void putWithSignal(uint64_t dataOffset, uint64_t dataSize){
|
||||
__forceinline__ __device__ void putWithSignal(uint64_t dataOffset, uint64_t dataSize)
|
||||
{
|
||||
putWithSignal(dataOffset, dataOffset, dataSize);
|
||||
}
|
||||
|
||||
__forceinline__ __device__ void wait(){
|
||||
__forceinline__ __device__ void wait()
|
||||
{
|
||||
(*recvEpochId) += 1;
|
||||
while (*(volatile uint64_t*)proxyEpochId < (*recvEpochId));
|
||||
while (*(volatile uint64_t*)proxyEpochId < (*recvEpochId))
|
||||
;
|
||||
}
|
||||
|
||||
__forceinline__ __device__ void epochIncrement(){
|
||||
__forceinline__ __device__ void epochIncrement()
|
||||
{
|
||||
*(volatile uint64_t*)sendEpochId += 1;
|
||||
}
|
||||
|
||||
@@ -127,18 +138,24 @@ typedef struct mscclppComm* mscclppComm_t;
|
||||
typedef struct mscclppDevConn mscclppDevConn_t;
|
||||
|
||||
#define MSCCLPP_UNIQUE_ID_BYTES 128
|
||||
typedef struct { char internal[MSCCLPP_UNIQUE_ID_BYTES]; } mscclppUniqueId;
|
||||
typedef struct
|
||||
{
|
||||
char internal[MSCCLPP_UNIQUE_ID_BYTES];
|
||||
} mscclppUniqueId;
|
||||
|
||||
/* Error type */
|
||||
typedef enum { mscclppSuccess = 0,
|
||||
mscclppUnhandledCudaError = 1,
|
||||
mscclppSystemError = 2,
|
||||
mscclppInternalError = 3,
|
||||
mscclppInvalidArgument = 4,
|
||||
mscclppInvalidUsage = 5,
|
||||
mscclppRemoteError = 6,
|
||||
mscclppInProgress = 7,
|
||||
mscclppNumResults = 8 } mscclppResult_t;
|
||||
typedef enum
|
||||
{
|
||||
mscclppSuccess = 0,
|
||||
mscclppUnhandledCudaError = 1,
|
||||
mscclppSystemError = 2,
|
||||
mscclppInternalError = 3,
|
||||
mscclppInvalidArgument = 4,
|
||||
mscclppInvalidUsage = 5,
|
||||
mscclppRemoteError = 6,
|
||||
mscclppInProgress = 7,
|
||||
mscclppNumResults = 8
|
||||
} mscclppResult_t;
|
||||
|
||||
/* Create a unique ID for communication. Only needs to be called by one process.
|
||||
* Use with mscclppCommInitRankFromId().
|
||||
@@ -150,16 +167,18 @@ typedef enum { mscclppSuccess = 0,
|
||||
mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* uniqueId);
|
||||
|
||||
/* Transport Types */
|
||||
typedef enum { mscclppTransportP2P = 0,
|
||||
mscclppTransportSHM = 1, // TODO(chhwang): not implemented yet
|
||||
mscclppTransportIB = 2,
|
||||
typedef enum
|
||||
{
|
||||
mscclppTransportP2P = 0,
|
||||
mscclppTransportSHM = 1, // TODO(chhwang): not implemented yet
|
||||
mscclppTransportIB = 2,
|
||||
} mscclppTransport_t;
|
||||
|
||||
/* Initialize a communicator. nranks processes with rank 0 to nranks-1 need to call this function.
|
||||
*
|
||||
*
|
||||
* Outputs:
|
||||
* comm: the communicator to be initialized
|
||||
*
|
||||
*
|
||||
* Inputs:
|
||||
* nranks: number of ranks in the communicator
|
||||
* ipPortPair: a string of the form "ip:port" that represents the address of the root process
|
||||
@@ -169,10 +188,10 @@ mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char*
|
||||
|
||||
/* Initialize a communicator from a given mscclppUniqueId. Same as mscclppCommInitRank() except that
|
||||
* id is provided by the user by calling mscclppGetUniqueId()
|
||||
*
|
||||
*
|
||||
* Outputs:
|
||||
* comm: the communicator to be initialized
|
||||
*
|
||||
*
|
||||
* Inputs:
|
||||
* nranks: number of ranks in the communicator
|
||||
* id: the unique ID to be used for communication
|
||||
@@ -181,10 +200,10 @@ mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char*
|
||||
mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank);
|
||||
|
||||
/* Ring-based AllGather through the bootstrap socket.
|
||||
*
|
||||
*
|
||||
* Outputs:
|
||||
* comm: the communicator
|
||||
*
|
||||
*
|
||||
* Inputs:
|
||||
* data: data array to be gathered where `[r*size, (r+1)*size)` is the data for rank `r`
|
||||
* size: data size per rank
|
||||
@@ -192,26 +211,26 @@ mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, msccl
|
||||
mscclppResult_t mscclppBootstrapAllGather(mscclppComm_t comm, void* data, int size);
|
||||
|
||||
/* Destroy a communicator.
|
||||
*
|
||||
*
|
||||
* Inputs:
|
||||
* comm: the communicator to be destroyed
|
||||
*/
|
||||
mscclppResult_t mscclppCommDestroy(mscclppComm_t comm);
|
||||
|
||||
/* Return the string for the given error code.
|
||||
*
|
||||
*
|
||||
* Output:
|
||||
* returns the string
|
||||
*
|
||||
*
|
||||
* Inputs:
|
||||
* result: the error code that this function needs to translate
|
||||
*/
|
||||
const char* mscclppGetErrorString(mscclppResult_t result);
|
||||
const char* mscclppGetErrorString(mscclppResult_t result);
|
||||
|
||||
/* Connect to a remote rank. This function only prepares metadata for connection. The actual connection
|
||||
* is made by a following call of mscclppConnectionSetup(). Note that this function is two-way and a connection
|
||||
* from rank i to remote rank j needs to have a counterpart from rank j to rank i.
|
||||
*
|
||||
*
|
||||
* Inputs:
|
||||
* comm: the communicator
|
||||
* remoteRank: the rank of the remote process
|
||||
@@ -223,11 +242,11 @@ const char* mscclppGetErrorString(mscclppResult_t result);
|
||||
* ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P.
|
||||
*/
|
||||
mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void* localBuff, uint64_t buffSize,
|
||||
mscclppTransport_t transportType, const char *ibDev=0);
|
||||
mscclppTransport_t transportType, const char* ibDev = 0);
|
||||
|
||||
/* Establish all connections declared by mscclppConnect(). This function must be called after all mscclppConnect()
|
||||
* calls are made. This function ensures that all remote ranks are ready to communicate when it returns.
|
||||
*
|
||||
*
|
||||
* Inputs:
|
||||
* comm: the communicator
|
||||
*/
|
||||
@@ -235,22 +254,22 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm);
|
||||
|
||||
/* Return an array of mscclppDevConn_t and the number of connections created by mscclppConnectionSetup().
|
||||
* The order of connections matches the order of mscclppConnect() calls.
|
||||
*
|
||||
*
|
||||
* Outputs:
|
||||
* devConns: the array of mscclppDevConn_t. Each mscclppDevConn_t corresponds to a mscclppConnect() call in the
|
||||
* order of the calls.
|
||||
* nConns: the number of connections
|
||||
*
|
||||
*
|
||||
* Inputs:
|
||||
* comm: the communicator
|
||||
*/
|
||||
mscclppResult_t mscclppGetAllDeviceConnections(mscclppComm_t comm, mscclppDevConn_t** devConns, int* nConns);
|
||||
|
||||
/* Return the mscclppDevConn_t corresponding to a given tag and a remoteRank.
|
||||
*
|
||||
*
|
||||
* Outputs:
|
||||
* devConn: the mscclppDevConn_t corresponding to the given tag
|
||||
*
|
||||
*
|
||||
* Inputs:
|
||||
* comm: the communicator
|
||||
* tag: the tag of the connection
|
||||
@@ -261,34 +280,34 @@ mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, i
|
||||
/* Launch proxy threads for all connections created by mscclppConnectionSetup(). This function is supposed to be called
|
||||
* before starting a kernel that uses mscclppDevConn_t. Up to two proxy threads are launched for each (GPU + IB) pair
|
||||
* (one for P2P NVLink and one for InfiniBand).
|
||||
*
|
||||
*
|
||||
* Inputs:
|
||||
* comm: the communicator
|
||||
*/
|
||||
mscclppResult_t mscclppProxyLaunch(mscclppComm_t comm);
|
||||
|
||||
/* Stop all proxy threads.
|
||||
*
|
||||
*
|
||||
* Inputs:
|
||||
* comm: the communicator
|
||||
*/
|
||||
mscclppResult_t mscclppProxyStop(mscclppComm_t comm);
|
||||
|
||||
/* Return the rank of the calling process.
|
||||
*
|
||||
*
|
||||
* Outputs:
|
||||
* rank: the rank of the calling process
|
||||
*
|
||||
*
|
||||
* Inputs:
|
||||
* comm: the communicator
|
||||
*/
|
||||
mscclppResult_t mscclppCommRank(mscclppComm_t comm, int* rank);
|
||||
|
||||
/* Return the number of ranks of the communicator.
|
||||
*
|
||||
*
|
||||
* Outputs:
|
||||
* size: the number of ranks of the communicator
|
||||
*
|
||||
*
|
||||
* Inputs:
|
||||
* comm: the communicator
|
||||
*/
|
||||
|
||||
@@ -7,9 +7,12 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef enum : uint64_t { mscclppData = 0x1,
|
||||
mscclppFlag = 0x2,
|
||||
mscclppSync = 0x4} mscclppTriggerType_t;
|
||||
typedef enum : uint64_t
|
||||
{
|
||||
mscclppData = 0x1,
|
||||
mscclppFlag = 0x2,
|
||||
mscclppSync = 0x4
|
||||
} mscclppTriggerType_t;
|
||||
|
||||
#define MSCCLPP_BITS_SIZE 32
|
||||
#define MSCCLPP_BITS_OFFSET 32
|
||||
@@ -19,34 +22,38 @@ typedef enum : uint64_t { mscclppData = 0x1,
|
||||
// the summation of number of bits must be 128 or less
|
||||
union alignas(16) mscclppTrigger {
|
||||
uint64_t value[2];
|
||||
struct {
|
||||
struct
|
||||
{
|
||||
// first 64 bits: value[0]
|
||||
uint64_t dataSize : MSCCLPP_BITS_SIZE;
|
||||
uint64_t dataSize : MSCCLPP_BITS_SIZE;
|
||||
uint64_t srcDataOffset : MSCCLPP_BITS_OFFSET;
|
||||
uint64_t : (64-MSCCLPP_BITS_SIZE-MSCCLPP_BITS_OFFSET); // ensure 64-bit alignment
|
||||
uint64_t : (64 - MSCCLPP_BITS_SIZE - MSCCLPP_BITS_OFFSET); // ensure 64-bit alignment
|
||||
// second 64 bits: value[1]
|
||||
uint64_t dstDataOffset : MSCCLPP_BITS_OFFSET;
|
||||
uint64_t connId : MSCCLPP_BITS_CONNID;
|
||||
uint64_t type : MSCCLPP_BITS_TYPE;
|
||||
uint64_t : (64-MSCCLPP_BITS_OFFSET-MSCCLPP_BITS_CONNID-MSCCLPP_BITS_TYPE); // ensure 64-bit alignment
|
||||
uint64_t connId : MSCCLPP_BITS_CONNID;
|
||||
uint64_t type : MSCCLPP_BITS_TYPE;
|
||||
uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_CONNID - MSCCLPP_BITS_TYPE); // ensure 64-bit alignment
|
||||
} fields;
|
||||
};
|
||||
|
||||
typedef mscclppTrigger* mscclppTrigger_t;
|
||||
|
||||
struct mscclppConcurrentFifo {
|
||||
struct mscclppConcurrentFifo
|
||||
{
|
||||
#ifdef __CUDACC__
|
||||
|
||||
__forceinline__ __device__ uint64_t push(uint64_t type, uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize){
|
||||
uint64_t curFifoHead = atomicAdd((unsigned long long int*)this->triggerFifoHead,1);
|
||||
while (curFifoHead >= MSCCLPP_PROXY_FIFO_SIZE + *((volatile uint64_t*)this->triggerFifoTail));
|
||||
while (*(volatile uint64_t*)&this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0);
|
||||
__forceinline__ __device__ uint64_t push(uint64_t type, uint64_t dstDataOffset, uint64_t srcDataOffset,
|
||||
uint64_t dataSize)
|
||||
{
|
||||
uint64_t curFifoHead = atomicAdd((unsigned long long int*)this->triggerFifoHead, 1);
|
||||
while (curFifoHead >= MSCCLPP_PROXY_FIFO_SIZE + *((volatile uint64_t*)this->triggerFifoTail))
|
||||
;
|
||||
while (*(volatile uint64_t*)&this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0)
|
||||
;
|
||||
uint64_t* valptr = (uint64_t*)&(this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE].value);
|
||||
asm volatile(
|
||||
"st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(valptr),
|
||||
"l"((srcDataOffset << MSCCLPP_BITS_SIZE) + dataSize),
|
||||
"l"((((type << MSCCLPP_BITS_CONNID) + this->connId) << MSCCLPP_BITS_OFFSET) + dstDataOffset)
|
||||
);
|
||||
asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(valptr),
|
||||
"l"((srcDataOffset << MSCCLPP_BITS_SIZE) + dataSize),
|
||||
"l"((((type << MSCCLPP_BITS_CONNID) + this->connId) << MSCCLPP_BITS_OFFSET) + dstDataOffset));
|
||||
return curFifoHead;
|
||||
}
|
||||
|
||||
|
||||
@@ -9,8 +9,9 @@
|
||||
#include "npkit/npkit_event.h"
|
||||
#include "npkit/npkit_struct.h"
|
||||
|
||||
class NpKit {
|
||||
public:
|
||||
class NpKit
|
||||
{
|
||||
public:
|
||||
static const uint64_t kNumGpuEventBuffers = 512;
|
||||
|
||||
static const uint64_t kNumCpuEventBuffers = 32;
|
||||
@@ -24,7 +25,8 @@ class NpKit {
|
||||
static NpKitEventCollectContext* GetGpuEventCollectContexts();
|
||||
|
||||
static inline __device__ void CollectGpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp,
|
||||
NpKitEventCollectContext* ctx) {
|
||||
NpKitEventCollectContext* ctx)
|
||||
{
|
||||
uint64_t event_buffer_head = ctx->event_buffer_head;
|
||||
if (event_buffer_head < kMaxNumGpuEventsPerBuffer) {
|
||||
NpKitEvent& event = ctx->event_buffer[event_buffer_head];
|
||||
@@ -40,7 +42,7 @@ class NpKit {
|
||||
|
||||
static uint64_t* GetCpuTimestamp();
|
||||
|
||||
private:
|
||||
private:
|
||||
static void CpuTimestampUpdateThread();
|
||||
|
||||
// 64K * 512 * 16B = 512MB per GPU
|
||||
|
||||
@@ -1,22 +1,22 @@
|
||||
#ifndef NPKIT_EVENT_H_
|
||||
#define NPKIT_EVENT_H_
|
||||
|
||||
#define NPKIT_EVENT_INVALID 0x0
|
||||
#define NPKIT_EVENT_INVALID 0x0
|
||||
|
||||
#define NPKIT_EVENT_TIME_SYNC_GPU 0x1
|
||||
#define NPKIT_EVENT_TIME_SYNC_CPU 0x2
|
||||
#define NPKIT_EVENT_TIME_SYNC_GPU 0x1
|
||||
#define NPKIT_EVENT_TIME_SYNC_CPU 0x2
|
||||
|
||||
#define NPKIT_EVENT_SM_REDUCE_ENTRY 0x3
|
||||
#define NPKIT_EVENT_SM_REDUCE_EXIT 0x4
|
||||
#define NPKIT_EVENT_SM_REDUCE_EXIT 0x4
|
||||
|
||||
#define NPKIT_EVENT_IB_SEND_ENTRY 0x5
|
||||
#define NPKIT_EVENT_IB_SEND_EXIT 0x6
|
||||
#define NPKIT_EVENT_IB_RECV_ENTRY 0x7
|
||||
#define NPKIT_EVENT_IB_RECV_EXIT 0x8
|
||||
#define NPKIT_EVENT_IB_SEND_ENTRY 0x5
|
||||
#define NPKIT_EVENT_IB_SEND_EXIT 0x6
|
||||
#define NPKIT_EVENT_IB_RECV_ENTRY 0x7
|
||||
#define NPKIT_EVENT_IB_RECV_EXIT 0x8
|
||||
|
||||
#define NPKIT_EVENT_DMA_SEND_ENTRY 0x9
|
||||
#define NPKIT_EVENT_DMA_SEND_EXIT 0xA
|
||||
#define NPKIT_EVENT_DMA_RECV_ENTRY 0xB
|
||||
#define NPKIT_EVENT_DMA_RECV_EXIT 0xC
|
||||
#define NPKIT_EVENT_DMA_SEND_ENTRY 0x9
|
||||
#define NPKIT_EVENT_DMA_SEND_EXIT 0xA
|
||||
#define NPKIT_EVENT_DMA_RECV_ENTRY 0xB
|
||||
#define NPKIT_EVENT_DMA_RECV_EXIT 0xC
|
||||
|
||||
#endif
|
||||
@@ -7,7 +7,8 @@
|
||||
|
||||
union NpKitEvent {
|
||||
uint64_t bits[2];
|
||||
struct {
|
||||
struct
|
||||
{
|
||||
uint64_t type : 8;
|
||||
uint64_t size : 32;
|
||||
uint64_t rsvd : 24;
|
||||
@@ -15,7 +16,8 @@ union NpKitEvent {
|
||||
} fields;
|
||||
};
|
||||
|
||||
struct NpKitEventCollectContext {
|
||||
struct NpKitEventCollectContext
|
||||
{
|
||||
NpKitEvent* event_buffer;
|
||||
uint64_t event_buffer_head;
|
||||
};
|
||||
|
||||
@@ -15,15 +15,16 @@ void initEnv();
|
||||
|
||||
void mscclppLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);
|
||||
|
||||
#define MSCCLPP_PARAM(name, env, deftVal) \
|
||||
int64_t mscclppParam##name() { \
|
||||
constexpr int64_t uninitialized = INT64_MIN; \
|
||||
static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \
|
||||
static int64_t cache = uninitialized; \
|
||||
if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \
|
||||
mscclppLoadParam("MSCCLPP_" env, deftVal, uninitialized, &cache); \
|
||||
} \
|
||||
return cache; \
|
||||
#define MSCCLPP_PARAM(name, env, deftVal) \
|
||||
int64_t mscclppParam##name() \
|
||||
{ \
|
||||
constexpr int64_t uninitialized = INT64_MIN; \
|
||||
static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \
|
||||
static int64_t cache = uninitialized; \
|
||||
if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \
|
||||
mscclppLoadParam("MSCCLPP_" env, deftVal, uninitialized, &cache); \
|
||||
} \
|
||||
return cache; \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,27 +1,29 @@
|
||||
#ifndef MSCCLPP_PROXY_H_
|
||||
#define MSCCLPP_PROXY_H_
|
||||
|
||||
#include "mscclpp.h"
|
||||
#include "comm.h"
|
||||
#include <pthread.h>
|
||||
#include "mscclpp.h"
|
||||
#include <cuda_runtime.h>
|
||||
#include <pthread.h>
|
||||
|
||||
#define MSCCLPP_PROXY_MAX_NUM (MSCCLPP_IB_MAX_DEVS + 1) // One is for a P2P proxy.
|
||||
|
||||
typedef enum {
|
||||
typedef enum
|
||||
{
|
||||
MSCCLPP_PROXY_RUN_STATE_IDLE = 0,
|
||||
MSCCLPP_PROXY_RUN_STATE_RUNNING,
|
||||
MSCCLPP_PROXY_RUN_STATE_EXITING,
|
||||
} mscclppProxyRunState_t;
|
||||
|
||||
template <typename T>
|
||||
struct mscclppGDRState {
|
||||
template <typename T> struct mscclppGDRState
|
||||
{
|
||||
T* hostPtr;
|
||||
T* devPtr;
|
||||
void* desc;
|
||||
};
|
||||
|
||||
struct mscclppProxyState {
|
||||
struct mscclppProxyState
|
||||
{
|
||||
mscclppTransport_t transportType;
|
||||
pthread_t thread;
|
||||
mscclppProxyRunState_t run;
|
||||
@@ -31,8 +33,8 @@ struct mscclppProxyState {
|
||||
mscclppGDRState<uint64_t> fifoHead;
|
||||
mscclppGDRState<uint64_t> fifoTail;
|
||||
|
||||
struct mscclppIbContext *ibContext; // For IB connection only
|
||||
cudaStream_t stream; // for P2P DMA engine only
|
||||
struct mscclppIbContext* ibContext; // For IB connection only
|
||||
cudaStream_t stream; // for P2P DMA engine only
|
||||
};
|
||||
|
||||
mscclppResult_t mscclppProxyCreate(struct mscclppComm* comm);
|
||||
|
||||
@@ -8,21 +8,21 @@
|
||||
#define MSCCLPP_SOCKET_H_
|
||||
|
||||
#include "mscclpp.h"
|
||||
#include <sys/socket.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <netdb.h>
|
||||
#include <fcntl.h>
|
||||
#include <netdb.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <poll.h>
|
||||
#include <stddef.h>
|
||||
#include <sys/socket.h>
|
||||
|
||||
#define MAX_IFS 16
|
||||
#define MAX_IF_NAME_SIZE 16
|
||||
#define SLEEP_INT 1000 // connection retry sleep interval in usec
|
||||
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
|
||||
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
|
||||
#define RETRY_ACCEPT_TIMES 2e4 // connection accept retry times (each one can take 20s)
|
||||
#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
|
||||
#define SLEEP_INT 1000 // connection retry sleep interval in usec
|
||||
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
|
||||
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
|
||||
#define RETRY_ACCEPT_TIMES 2e4 // connection accept retry times (each one can take 20s)
|
||||
#define SOCKET_NAME_MAXLEN (NI_MAXHOST + NI_MAXSERV)
|
||||
#define MSCCLPP_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL
|
||||
|
||||
/* Common socket address storage structure for IPv4/IPv6 */
|
||||
@@ -32,7 +32,8 @@ union mscclppSocketAddress {
|
||||
struct sockaddr_in6 sin6;
|
||||
};
|
||||
|
||||
enum mscclppSocketState {
|
||||
enum mscclppSocketState
|
||||
{
|
||||
mscclppSocketStateNone = 0,
|
||||
mscclppSocketStateInitialized = 1,
|
||||
mscclppSocketStateAccepting = 2,
|
||||
@@ -46,7 +47,8 @@ enum mscclppSocketState {
|
||||
mscclppSocketStateNum = 10
|
||||
};
|
||||
|
||||
enum mscclppSocketType {
|
||||
enum mscclppSocketType
|
||||
{
|
||||
mscclppSocketTypeUnknown = 0,
|
||||
mscclppSocketTypeBootstrap = 1,
|
||||
mscclppSocketTypeProxy = 2,
|
||||
@@ -54,7 +56,8 @@ enum mscclppSocketType {
|
||||
mscclppSocketTypeNetIb = 4
|
||||
};
|
||||
|
||||
struct mscclppSocket {
|
||||
struct mscclppSocket
|
||||
{
|
||||
int fd;
|
||||
int acceptFd;
|
||||
int timedOutRetries;
|
||||
@@ -69,13 +72,17 @@ struct mscclppSocket {
|
||||
enum mscclppSocketType type;
|
||||
};
|
||||
|
||||
const char *mscclppSocketToString(union mscclppSocketAddress *addr, char *buf, const int numericHostForm = 1);
|
||||
const char* mscclppSocketToString(union mscclppSocketAddress* addr, char* buf, const int numericHostForm = 1);
|
||||
mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, const char* ip_port_pair);
|
||||
int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* localAddrs, union mscclppSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
|
||||
int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
|
||||
int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* localAddrs,
|
||||
union mscclppSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
|
||||
int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress* ifAddrs, int ifNameMaxSize, int maxIfs);
|
||||
|
||||
// Initialize a socket
|
||||
mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocketAddress* addr = NULL, uint64_t magic = MSCCLPP_SOCKET_MAGIC, enum mscclppSocketType type = mscclppSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
|
||||
mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocketAddress* addr = NULL,
|
||||
uint64_t magic = MSCCLPP_SOCKET_MAGIC,
|
||||
enum mscclppSocketType type = mscclppSocketTypeUnknown,
|
||||
volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
|
||||
// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
|
||||
mscclppResult_t mscclppSocketListen(struct mscclppSocket* sock);
|
||||
mscclppResult_t mscclppSocketGetAddr(struct mscclppSocket* sock, union mscclppSocketAddress* addr);
|
||||
@@ -83,7 +90,8 @@ mscclppResult_t mscclppSocketGetAddr(struct mscclppSocket* sock, union mscclppSo
|
||||
mscclppResult_t mscclppSocketConnect(struct mscclppSocket* sock);
|
||||
// Return socket connection state.
|
||||
// mscclppResult_t mscclppSocketReady(struct mscclppSocket* sock, int *running);
|
||||
// Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
|
||||
// Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side
|
||||
// IP/port in sock->addr.
|
||||
mscclppResult_t mscclppSocketAccept(struct mscclppSocket* sock, struct mscclppSocket* ulistenSock);
|
||||
// mscclppResult_t mscclppSocketGetFd(struct mscclppSocket* sock, int* fd);
|
||||
// mscclppResult_t mscclppSocketSetFd(int fd, struct mscclppSocket* sock);
|
||||
|
||||
@@ -7,13 +7,13 @@
|
||||
#ifndef MSCCLPP_UTILS_H_
|
||||
#define MSCCLPP_UTILS_H_
|
||||
|
||||
#include "mscclpp.h"
|
||||
#include "alloc.h"
|
||||
#include "checks.h"
|
||||
#include "mscclpp.h"
|
||||
#include <new>
|
||||
#include <sched.h>
|
||||
#include <stdint.h>
|
||||
#include <time.h>
|
||||
#include <sched.h>
|
||||
#include <new>
|
||||
|
||||
// int mscclppCudaCompCap();
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
mscclppResult_t int64ToBusId(int64_t id, char* busId);
|
||||
mscclppResult_t busIdToInt64(const char* busId, int64_t* id);
|
||||
|
||||
mscclppResult_t getBusId(int cudaDev, int64_t *busId);
|
||||
mscclppResult_t getBusId(int cudaDev, int64_t* busId);
|
||||
|
||||
mscclppResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||
uint64_t getHash(const char* string, int n);
|
||||
@@ -29,7 +29,8 @@ uint64_t getHostHash();
|
||||
uint64_t getPidHash();
|
||||
mscclppResult_t getRandomData(void* buffer, size_t bytes);
|
||||
|
||||
struct netIf {
|
||||
struct netIf
|
||||
{
|
||||
char prefix[64];
|
||||
int port;
|
||||
};
|
||||
@@ -37,27 +38,33 @@ struct netIf {
|
||||
int parseStringList(const char* string, struct netIf* ifList, int maxList);
|
||||
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
|
||||
|
||||
static long log2i(long n) {
|
||||
long l = 0;
|
||||
while (n>>=1) l++;
|
||||
return l;
|
||||
static long log2i(long n)
|
||||
{
|
||||
long l = 0;
|
||||
while (n >>= 1)
|
||||
l++;
|
||||
return l;
|
||||
}
|
||||
|
||||
inline uint64_t clockNano() {
|
||||
inline uint64_t clockNano()
|
||||
{
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec;
|
||||
return uint64_t(ts.tv_sec) * 1000 * 1000 * 1000 + ts.tv_nsec;
|
||||
}
|
||||
|
||||
/* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else
|
||||
* return -1 */
|
||||
inline mscclppResult_t getRandomData(void* buffer, size_t bytes) {
|
||||
inline mscclppResult_t getRandomData(void* buffer, size_t bytes)
|
||||
{
|
||||
mscclppResult_t ret = mscclppSuccess;
|
||||
if (bytes > 0) {
|
||||
const size_t one = 1UL;
|
||||
FILE* fp = fopen("/dev/urandom", "r");
|
||||
if (buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one) ret = mscclppSystemError;
|
||||
if (fp) fclose(fp);
|
||||
if (buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one)
|
||||
ret = mscclppSystemError;
|
||||
if (fp)
|
||||
fclose(fp);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
@@ -252,7 +259,6 @@ inline mscclppResult_t getRandomData(void* buffer, size_t bytes) {
|
||||
// me->topFrame = *me->topFrame.below; // C++ struct assignment
|
||||
// }
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// struct mscclppMemoryPool {
|
||||
@@ -441,7 +447,8 @@ inline mscclppResult_t getRandomData(void* buffer, size_t bytes) {
|
||||
// uintptr_t expected = sleeping ? 0x1 : 0x0;
|
||||
// uintptr_t desired = 0x1;
|
||||
// me->waiting = waitSignal; // release done by successful compare exchange
|
||||
// if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
|
||||
// if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE,
|
||||
// __ATOMIC_RELAXED)) {
|
||||
// sleeping = true;
|
||||
// pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex);
|
||||
// }
|
||||
@@ -471,7 +478,8 @@ inline mscclppResult_t getRandomData(void* buffer, size_t bytes) {
|
||||
// template<typename T, T *T::*next>
|
||||
// T* mscclppIntruQueueMpscAbandon(mscclppIntruQueueMpsc<T,next>* me) {
|
||||
// uintptr_t expected = 0x0;
|
||||
// if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
|
||||
// if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED,
|
||||
// __ATOMIC_RELAXED)) {
|
||||
// return nullptr;
|
||||
// } else {
|
||||
// int spins = 0;
|
||||
|
||||
196
src/init.cc
196
src/init.cc
@@ -1,17 +1,18 @@
|
||||
#include "mscclpp.h"
|
||||
#include "bootstrap.h"
|
||||
#include "core.h"
|
||||
#include "gdr.h"
|
||||
#include "mscclpp.h"
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit.h"
|
||||
#endif
|
||||
|
||||
static uint64_t hashUniqueId(mscclppUniqueId const &id) {
|
||||
char const *bytes = (char const*)&id;
|
||||
static uint64_t hashUniqueId(mscclppUniqueId const& id)
|
||||
{
|
||||
char const* bytes = (char const*)&id;
|
||||
uint64_t h = 0xdeadbeef;
|
||||
for(int i=0; i < (int)sizeof(mscclppUniqueId); i++) {
|
||||
for (int i = 0; i < (int)sizeof(mscclppUniqueId); i++) {
|
||||
h ^= h >> 32;
|
||||
h *= 0x8db3db47fa2994ad;
|
||||
h += bytes[i];
|
||||
@@ -25,7 +26,8 @@ static bool initialized = false;
|
||||
|
||||
gdr_t mscclppGdrCopy = NULL;
|
||||
|
||||
mscclppResult_t initGdrCopy() {
|
||||
mscclppResult_t initGdrCopy()
|
||||
{
|
||||
mscclppGdrCopy = mscclppGdrInit();
|
||||
if (mscclppGdrCopy == NULL) {
|
||||
WARN("GDR init failed");
|
||||
@@ -34,8 +36,10 @@ mscclppResult_t initGdrCopy() {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
static mscclppResult_t mscclppInit() {
|
||||
if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return mscclppSuccess;
|
||||
static mscclppResult_t mscclppInit()
|
||||
{
|
||||
if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE))
|
||||
return mscclppSuccess;
|
||||
pthread_mutex_lock(&initLock);
|
||||
if (!initialized) {
|
||||
// initEnv();
|
||||
@@ -62,22 +66,25 @@ static std::string mscclppShmFileName(mscclppComm_t comm, int rank)
|
||||
}
|
||||
|
||||
MSCCLPP_API(mscclppResult_t, mscclppGetUniqueId, mscclppUniqueId* out);
|
||||
mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* out) {
|
||||
mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* out)
|
||||
{
|
||||
MSCCLPPCHECK(mscclppInit());
|
||||
// mscclppCHECK(PtrCheck(out, "GetUniqueId", "out"));
|
||||
// mscclppCHECK(PtrCheck(out, "GetUniqueId", "out"));
|
||||
mscclppResult_t res = bootstrapGetUniqueId((struct mscclppBootstrapHandle*)out);
|
||||
TRACE_CALL("mscclppGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out));
|
||||
return res;
|
||||
}
|
||||
|
||||
MSCCLPP_API(mscclppResult_t, mscclppBootstrapAllGather, mscclppComm_t comm, void* data, int size);
|
||||
mscclppResult_t mscclppBootstrapAllGather(mscclppComm_t comm, void* data, int size){
|
||||
mscclppResult_t mscclppBootstrapAllGather(mscclppComm_t comm, void* data, int size)
|
||||
{
|
||||
MSCCLPPCHECK(bootstrapAllGather(comm->bootstrap, data, size));
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
MSCCLPP_API(mscclppResult_t, mscclppCommInitRank, mscclppComm_t* comm, int nranks, const char* ipPortPair, int rank);
|
||||
mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char* ipPortPair, int rank) {
|
||||
mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char* ipPortPair, int rank)
|
||||
{
|
||||
if (mscclppGdrCopy == NULL) {
|
||||
MSCCLPPCHECK(initGdrCopy());
|
||||
}
|
||||
@@ -99,7 +106,7 @@ mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char*
|
||||
MSCCLPPCHECK(bootstrapGetUniqueId(&handle, rank == 0, ipPortPair));
|
||||
_comm->magic = handle.magic;
|
||||
|
||||
MSCCLPPCHECKGOTO(mscclppCudaHostCalloc((uint32_t **)&_comm->abortFlag, 1), res, fail);
|
||||
MSCCLPPCHECKGOTO(mscclppCudaHostCalloc((uint32_t**)&_comm->abortFlag, 1), res, fail);
|
||||
MSCCLPPCHECK(bootstrapInit(&handle, _comm));
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
@@ -142,15 +149,18 @@ mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char*
|
||||
return res;
|
||||
fail:
|
||||
if (_comm) {
|
||||
if (_comm->abortFlag) mscclppCudaHostFree((void *)_comm->abortFlag);
|
||||
if (_comm->abortFlag)
|
||||
mscclppCudaHostFree((void*)_comm->abortFlag);
|
||||
free(_comm);
|
||||
}
|
||||
if (comm) *comm = NULL;
|
||||
if (comm)
|
||||
*comm = NULL;
|
||||
return res;
|
||||
}
|
||||
|
||||
MSCCLPP_API(mscclppResult_t, mscclppCommInitRankFromId, mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank);
|
||||
mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank) {
|
||||
mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank)
|
||||
{
|
||||
if (mscclppGdrCopy == NULL) {
|
||||
MSCCLPPCHECK(initGdrCopy());
|
||||
}
|
||||
@@ -168,7 +178,7 @@ mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, msccl
|
||||
MSCCLPPCHECK(bootstrapNetInit());
|
||||
_comm->magic = handle->magic;
|
||||
|
||||
MSCCLPPCHECKGOTO(mscclppCudaHostCalloc((uint32_t **)&_comm->abortFlag, 1), res, fail);
|
||||
MSCCLPPCHECKGOTO(mscclppCudaHostCalloc((uint32_t**)&_comm->abortFlag, 1), res, fail);
|
||||
MSCCLPPCHECK(bootstrapInit(handle, _comm));
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
@@ -180,15 +190,18 @@ mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, msccl
|
||||
return res;
|
||||
fail:
|
||||
if (_comm) {
|
||||
if (_comm->abortFlag) mscclppCudaHostFree((void *)_comm->abortFlag);
|
||||
if (_comm->abortFlag)
|
||||
mscclppCudaHostFree((void*)_comm->abortFlag);
|
||||
free(_comm);
|
||||
}
|
||||
if (comm) *comm = NULL;
|
||||
if (comm)
|
||||
*comm = NULL;
|
||||
return res;
|
||||
}
|
||||
|
||||
MSCCLPP_API(mscclppResult_t, mscclppCommDestroy, mscclppComm_t comm);
|
||||
mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){
|
||||
mscclppResult_t mscclppCommDestroy(mscclppComm_t comm)
|
||||
{
|
||||
#if defined(ENABLE_NPKIT)
|
||||
const char* npkitDumpDir = nullptr;
|
||||
#endif
|
||||
@@ -197,7 +210,7 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){
|
||||
return mscclppSuccess;
|
||||
|
||||
for (int i = 0; i < comm->nConns; ++i) {
|
||||
struct mscclppConn *conn = &comm->conns[i];
|
||||
struct mscclppConn* conn = &comm->conns[i];
|
||||
if (conn->cpuProxyFlagGdrDesc) {
|
||||
// IB
|
||||
MSCCLPPCHECK(mscclppGdrCudaFree(conn->cpuProxyFlagGdrDesc));
|
||||
@@ -208,7 +221,7 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){
|
||||
}
|
||||
|
||||
for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) {
|
||||
struct mscclppProxyState *proxyState = comm->proxyState[i];
|
||||
struct mscclppProxyState* proxyState = comm->proxyState[i];
|
||||
if (proxyState) {
|
||||
MSCCLPPCHECK(mscclppGdrCudaFree(proxyState->triggerFifo.desc));
|
||||
MSCCLPPCHECK(mscclppGdrCudaFree(proxyState->fifoHead.desc));
|
||||
@@ -227,9 +240,9 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < comm->nConns; i++){
|
||||
struct mscclppConn *conn = &comm->conns[i];
|
||||
if (conn){
|
||||
for (int i = 0; i < comm->nConns; i++) {
|
||||
struct mscclppConn* conn = &comm->conns[i];
|
||||
if (conn) {
|
||||
MSCCLPPCHECK(mscclppCudaFree(conn->devConn->sendEpochId));
|
||||
MSCCLPPCHECK(mscclppCudaFree(conn->devConn->recvEpochId));
|
||||
}
|
||||
@@ -238,7 +251,7 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){
|
||||
if (comm->bootstrap)
|
||||
MSCCLPPCHECK(bootstrapClose(comm->bootstrap));
|
||||
|
||||
mscclppCudaHostFree((void *)comm->abortFlag);
|
||||
mscclppCudaHostFree((void*)comm->abortFlag);
|
||||
free(comm);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
@@ -256,24 +269,36 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){
|
||||
}
|
||||
|
||||
MSCCLPP_API(const char*, mscclppGetErrorString, mscclppResult_t code);
|
||||
const char* mscclppGetErrorString(mscclppResult_t code) {
|
||||
const char* mscclppGetErrorString(mscclppResult_t code)
|
||||
{
|
||||
switch (code) {
|
||||
case mscclppSuccess : return "no error";
|
||||
case mscclppUnhandledCudaError : return "unhandled cuda error";
|
||||
case mscclppSystemError : return "unhandled system error";
|
||||
case mscclppInternalError : return "internal error";
|
||||
case mscclppInvalidArgument : return "invalid argument";
|
||||
case mscclppInvalidUsage : return "invalid usage";
|
||||
case mscclppRemoteError : return "remote process exited or there was a network error";
|
||||
case mscclppInProgress : return "MSCCL++ operation in progress";
|
||||
default : return "unknown result code";
|
||||
case mscclppSuccess:
|
||||
return "no error";
|
||||
case mscclppUnhandledCudaError:
|
||||
return "unhandled cuda error";
|
||||
case mscclppSystemError:
|
||||
return "unhandled system error";
|
||||
case mscclppInternalError:
|
||||
return "internal error";
|
||||
case mscclppInvalidArgument:
|
||||
return "invalid argument";
|
||||
case mscclppInvalidUsage:
|
||||
return "invalid usage";
|
||||
case mscclppRemoteError:
|
||||
return "remote process exited or there was a network error";
|
||||
case mscclppInProgress:
|
||||
return "MSCCL++ operation in progress";
|
||||
default:
|
||||
return "unknown result code";
|
||||
}
|
||||
}
|
||||
|
||||
MSCCLPP_API(mscclppResult_t, mscclppGetDeviceConnection, mscclppComm_t comm, int remoteRank, int tag, mscclppDevConn_t** devConn);
|
||||
mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, int tag, mscclppDevConn_t** devConn){
|
||||
for (int i = 0; i < comm->nConns; i++){
|
||||
if (comm->devConns[i].remoteRank == remoteRank && comm->devConns[i].tag == tag){
|
||||
MSCCLPP_API(mscclppResult_t, mscclppGetDeviceConnection, mscclppComm_t comm, int remoteRank, int tag,
|
||||
mscclppDevConn_t** devConn);
|
||||
mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, int tag, mscclppDevConn_t** devConn)
|
||||
{
|
||||
for (int i = 0; i < comm->nConns; i++) {
|
||||
if (comm->devConns[i].remoteRank == remoteRank && comm->devConns[i].tag == tag) {
|
||||
*devConn = &comm->devConns[i];
|
||||
return mscclppSuccess;
|
||||
}
|
||||
@@ -282,8 +307,8 @@ mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, i
|
||||
return mscclppInvalidArgument;
|
||||
}
|
||||
|
||||
|
||||
MSCCLPP_API(mscclppResult_t, mscclppGetAllDeviceConnections, mscclppComm_t comm, mscclppDevConn_t** devConns, int* nConns);
|
||||
MSCCLPP_API(mscclppResult_t, mscclppGetAllDeviceConnections, mscclppComm_t comm, mscclppDevConn_t** devConns,
|
||||
int* nConns);
|
||||
mscclppResult_t mscclppGetAllDeviceConnections(mscclppComm_t comm, mscclppDevConn_t** devConns, int* nConns)
|
||||
{
|
||||
*nConns = comm->nConns;
|
||||
@@ -291,17 +316,16 @@ mscclppResult_t mscclppGetAllDeviceConnections(mscclppComm_t comm, mscclppDevCon
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
|
||||
MSCCLPP_API(mscclppResult_t, mscclppConnect, mscclppComm_t comm, int remoteRank, int tag,
|
||||
void* localBuff, uint64_t buffSize, mscclppTransport_t transportType, const char *ibDev);
|
||||
MSCCLPP_API(mscclppResult_t, mscclppConnect, mscclppComm_t comm, int remoteRank, int tag, void* localBuff,
|
||||
uint64_t buffSize, mscclppTransport_t transportType, const char* ibDev);
|
||||
mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void* localBuff, uint64_t buffSize,
|
||||
mscclppTransport_t transportType, const char *ibDev)
|
||||
mscclppTransport_t transportType, const char* ibDev)
|
||||
{
|
||||
if (comm->nConns == MAXCONNECTIONS) {
|
||||
WARN("Too many connections made");
|
||||
return mscclppInternalError;
|
||||
}
|
||||
struct mscclppConn *conn = &comm->conns[comm->nConns];
|
||||
struct mscclppConn* conn = &comm->conns[comm->nConns];
|
||||
conn->transport = transportType;
|
||||
conn->buffSize = buffSize;
|
||||
|
||||
@@ -333,12 +357,12 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void
|
||||
}
|
||||
// Set the ib context for this conn
|
||||
conn->ibCtx = comm->ibContext[ibDevIdx];
|
||||
} else if (transportType == mscclppTransportP2P){
|
||||
} else if (transportType == mscclppTransportP2P) {
|
||||
// Check if a DMA context/stream exists
|
||||
if (comm->stream == NULL){
|
||||
if (comm->stream == NULL) {
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&comm->stream, cudaStreamNonBlocking));
|
||||
}
|
||||
} else if (transportType == mscclppTransportSHM){
|
||||
} else if (transportType == mscclppTransportSHM) {
|
||||
WARN("Shared memory interconnection is not implemented yet!");
|
||||
return mscclppInternalError;
|
||||
} else {
|
||||
@@ -346,44 +370,44 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void
|
||||
return mscclppInvalidUsage;
|
||||
}
|
||||
|
||||
|
||||
// Find/create a proxy state for the given connection
|
||||
struct mscclppProxyState *proxyState = NULL;
|
||||
struct mscclppProxyState* proxyState = NULL;
|
||||
// First see if there is a matching context
|
||||
// If not, find the first empty proxy
|
||||
int firstEmptyProxyIndex = -1;
|
||||
for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) {
|
||||
struct mscclppProxyState *curProxy = comm->proxyState[i];
|
||||
if (curProxy && (curProxy->transportType == transportType)){
|
||||
if ((transportType == mscclppTransportIB && curProxy->ibContext == conn->ibCtx) || (transportType == mscclppTransportP2P)){
|
||||
struct mscclppProxyState* curProxy = comm->proxyState[i];
|
||||
if (curProxy && (curProxy->transportType == transportType)) {
|
||||
if ((transportType == mscclppTransportIB && curProxy->ibContext == conn->ibCtx) ||
|
||||
(transportType == mscclppTransportP2P)) {
|
||||
proxyState = curProxy;
|
||||
break; // we found the matching context
|
||||
}
|
||||
}
|
||||
if (curProxy == NULL && firstEmptyProxyIndex == -1){
|
||||
if (curProxy == NULL && firstEmptyProxyIndex == -1) {
|
||||
firstEmptyProxyIndex = i;
|
||||
}
|
||||
}
|
||||
|
||||
if (proxyState == NULL && firstEmptyProxyIndex == -1){
|
||||
if (proxyState == NULL && firstEmptyProxyIndex == -1) {
|
||||
WARN("Too many proxies have been allocated!");
|
||||
return mscclppInvalidUsage;
|
||||
}
|
||||
|
||||
// If we couldn't find a matching context, create one
|
||||
if (proxyState == NULL){
|
||||
if (proxyState == NULL) {
|
||||
MSCCLPPCHECK(mscclppCalloc(&proxyState, 1));
|
||||
MSCCLPPCHECK(mscclppGdrCudaCalloc(&proxyState->triggerFifo.hostPtr, &proxyState->triggerFifo.devPtr,
|
||||
MSCCLPP_PROXY_FIFO_SIZE, &proxyState->triggerFifo.desc));
|
||||
MSCCLPPCHECK(mscclppGdrCudaCalloc(&proxyState->fifoHead.hostPtr, &proxyState->fifoHead.devPtr,
|
||||
1, &proxyState->fifoHead.desc));
|
||||
MSCCLPPCHECK(mscclppGdrCudaCalloc(&proxyState->fifoTail.hostPtr, &proxyState->fifoTail.devPtr,
|
||||
1, &proxyState->fifoTail.desc));
|
||||
MSCCLPPCHECK(
|
||||
mscclppGdrCudaCalloc(&proxyState->fifoHead.hostPtr, &proxyState->fifoHead.devPtr, 1, &proxyState->fifoHead.desc));
|
||||
MSCCLPPCHECK(
|
||||
mscclppGdrCudaCalloc(&proxyState->fifoTail.hostPtr, &proxyState->fifoTail.devPtr, 1, &proxyState->fifoTail.desc));
|
||||
|
||||
if (transportType == mscclppTransportIB){
|
||||
if (transportType == mscclppTransportIB) {
|
||||
proxyState->ibContext = conn->ibCtx;
|
||||
proxyState->stream = NULL;
|
||||
} else if (transportType == mscclppTransportP2P){
|
||||
} else if (transportType == mscclppTransportP2P) {
|
||||
proxyState->ibContext = NULL;
|
||||
proxyState->stream = comm->stream;
|
||||
}
|
||||
@@ -395,8 +419,8 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void
|
||||
WARN("Proxy allocation failed!");
|
||||
return mscclppInternalError;
|
||||
}
|
||||
|
||||
struct mscclppDevConn *devConn = &comm->devConns[comm->nConns];
|
||||
|
||||
struct mscclppDevConn* devConn = &comm->devConns[comm->nConns];
|
||||
|
||||
conn->devConn = devConn;
|
||||
conn->devConn->localBuff = localBuff;
|
||||
@@ -415,7 +439,8 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
struct connInfo {
|
||||
struct connInfo
|
||||
{
|
||||
cudaIpcMemHandle_t handleBuff;
|
||||
cudaIpcMemHandle_t handleFlag;
|
||||
cudaIpcMemHandle_t handleProxyFlag;
|
||||
@@ -425,12 +450,13 @@ struct connInfo {
|
||||
mscclppIbMrInfo infoProxyFlagMr;
|
||||
};
|
||||
|
||||
mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*output*/, struct mscclppConn* conn /*input*/){
|
||||
if (connInfo == NULL || conn == NULL){
|
||||
mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*output*/, struct mscclppConn* conn /*input*/)
|
||||
{
|
||||
if (connInfo == NULL || conn == NULL) {
|
||||
WARN("connInfo or connection cannot be null");
|
||||
return mscclppInternalError;
|
||||
}
|
||||
struct mscclppDevConn *devConn = conn->devConn;
|
||||
struct mscclppDevConn* devConn = conn->devConn;
|
||||
MSCCLPPCHECK(mscclppCudaCalloc(&devConn->proxyEpochId, 1));
|
||||
CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleProxyFlag, devConn->proxyEpochId));
|
||||
CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleBuff, devConn->localBuff));
|
||||
@@ -438,28 +464,33 @@ mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*outpu
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/, struct mscclppConn* conn /*output*/){
|
||||
if (connInfo == NULL || conn == NULL){
|
||||
mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/, struct mscclppConn* conn /*output*/)
|
||||
{
|
||||
if (connInfo == NULL || conn == NULL) {
|
||||
WARN("ipcHandles or connection cannot be null");
|
||||
return mscclppInternalError;
|
||||
}
|
||||
CUDACHECK(cudaIpcOpenMemHandle((void**)&conn->devConn->remoteBuff, connInfo->handleBuff, cudaIpcMemLazyEnablePeerAccess));
|
||||
CUDACHECK(cudaIpcOpenMemHandle((void**)&conn->devConn->remoteFlag, connInfo->handleFlag, cudaIpcMemLazyEnablePeerAccess));
|
||||
CUDACHECK(cudaIpcOpenMemHandle((void**)&conn->remoteProxyFlag, connInfo->handleProxyFlag, cudaIpcMemLazyEnablePeerAccess));
|
||||
CUDACHECK(
|
||||
cudaIpcOpenMemHandle((void**)&conn->devConn->remoteBuff, connInfo->handleBuff, cudaIpcMemLazyEnablePeerAccess));
|
||||
CUDACHECK(
|
||||
cudaIpcOpenMemHandle((void**)&conn->devConn->remoteFlag, connInfo->handleFlag, cudaIpcMemLazyEnablePeerAccess));
|
||||
CUDACHECK(
|
||||
cudaIpcOpenMemHandle((void**)&conn->remoteProxyFlag, connInfo->handleProxyFlag, cudaIpcMemLazyEnablePeerAccess));
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output*/, struct mscclppConn* conn /*input*/){
|
||||
if (connInfo == NULL || conn == NULL){
|
||||
mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output*/, struct mscclppConn* conn /*input*/)
|
||||
{
|
||||
if (connInfo == NULL || conn == NULL) {
|
||||
WARN("connInfo or connection cannot be null");
|
||||
return mscclppInternalError;
|
||||
}
|
||||
struct mscclppDevConn *devConn = conn->devConn;
|
||||
struct mscclppDevConn* devConn = conn->devConn;
|
||||
devConn->remoteBuff = NULL;
|
||||
devConn->remoteFlag = NULL;
|
||||
MSCCLPPCHECK(mscclppGdrCudaCalloc(&conn->cpuProxyFlag, &devConn->proxyEpochId, 1, &conn->cpuProxyFlagGdrDesc));
|
||||
|
||||
struct mscclppIbContext *ibCtx = conn->ibCtx;
|
||||
struct mscclppIbContext* ibCtx = conn->ibCtx;
|
||||
if (conn->ibQp == NULL) {
|
||||
MSCCLPPCHECK(mscclppIbContextCreateQp(ibCtx, &conn->ibQp));
|
||||
}
|
||||
@@ -474,8 +505,9 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppIbConnectionSetupEnd(struct connInfo* connInfo /*input*/, struct mscclppConn* conn /*output*/){
|
||||
if (connInfo == NULL || conn == NULL){
|
||||
mscclppResult_t mscclppIbConnectionSetupEnd(struct connInfo* connInfo /*input*/, struct mscclppConn* conn /*output*/)
|
||||
{
|
||||
if (connInfo == NULL || conn == NULL) {
|
||||
WARN("ipcHandles or connection cannot be null");
|
||||
return mscclppInternalError;
|
||||
}
|
||||
@@ -498,7 +530,7 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm)
|
||||
{
|
||||
// Send info to peers
|
||||
for (int i = 0; i < comm->nConns; ++i) {
|
||||
struct mscclppConn *conn = &comm->conns[i];
|
||||
struct mscclppConn* conn = &comm->conns[i];
|
||||
|
||||
struct connInfo cInfo;
|
||||
if (conn->transport == mscclppTransportP2P) {
|
||||
@@ -512,7 +544,7 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm)
|
||||
|
||||
// Recv info from peers
|
||||
for (int i = 0; i < comm->nConns; ++i) {
|
||||
struct mscclppConn *conn = &comm->conns[i];
|
||||
struct mscclppConn* conn = &comm->conns[i];
|
||||
struct connInfo cInfo;
|
||||
MSCCLPPCHECK(bootstrapRecv(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &cInfo, sizeof(cInfo)));
|
||||
if (conn->transport == mscclppTransportP2P) {
|
||||
|
||||
@@ -17,7 +17,8 @@ uint64_t* NpKit::cpu_timestamp_ = nullptr;
|
||||
std::thread* NpKit::cpu_timestamp_update_thread_ = nullptr;
|
||||
volatile bool NpKit::cpu_timestamp_update_thread_should_stop_ = false;
|
||||
|
||||
void NpKit::CpuTimestampUpdateThread() {
|
||||
void NpKit::CpuTimestampUpdateThread()
|
||||
{
|
||||
uint64_t init_system_clock = std::chrono::system_clock::now().time_since_epoch().count();
|
||||
uint64_t init_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
|
||||
uint64_t curr_steady_clock = 0;
|
||||
@@ -28,7 +29,8 @@ void NpKit::CpuTimestampUpdateThread() {
|
||||
}
|
||||
}
|
||||
|
||||
mscclppResult_t NpKit::Init(int rank) {
|
||||
mscclppResult_t NpKit::Init(int rank)
|
||||
{
|
||||
uint64_t i = 0;
|
||||
NpKitEventCollectContext ctx;
|
||||
ctx.event_buffer_head = 0;
|
||||
@@ -61,7 +63,8 @@ mscclppResult_t NpKit::Init(int rank) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
|
||||
mscclppResult_t NpKit::Dump(const std::string& dump_dir)
|
||||
{
|
||||
uint64_t i = 0;
|
||||
std::string dump_file_path;
|
||||
|
||||
@@ -74,7 +77,7 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
|
||||
dump_file_path += std::to_string(i);
|
||||
auto cpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
|
||||
cpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[i]),
|
||||
cpu_collect_contexts_[i].event_buffer_head * sizeof(NpKitEvent));
|
||||
cpu_collect_contexts_[i].event_buffer_head * sizeof(NpKitEvent));
|
||||
cpu_trace_file.close();
|
||||
}
|
||||
|
||||
@@ -106,7 +109,7 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
|
||||
MSCCLPPCHECK(mscclppCudaMemcpy(cpu_collect_contexts_, gpu_collect_contexts_ + i, 1));
|
||||
auto gpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
|
||||
gpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[0]),
|
||||
cpu_collect_contexts_[0].event_buffer_head * sizeof(NpKitEvent));
|
||||
cpu_collect_contexts_[0].event_buffer_head * sizeof(NpKitEvent));
|
||||
gpu_trace_file.close();
|
||||
}
|
||||
|
||||
@@ -126,7 +129,8 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t NpKit::Shutdown() {
|
||||
mscclppResult_t NpKit::Shutdown()
|
||||
{
|
||||
uint64_t i = 0;
|
||||
|
||||
// Stop CPU timestamp updating thread
|
||||
@@ -153,11 +157,13 @@ mscclppResult_t NpKit::Shutdown() {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() {
|
||||
NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts()
|
||||
{
|
||||
return gpu_collect_contexts_;
|
||||
}
|
||||
|
||||
void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id) {
|
||||
void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id)
|
||||
{
|
||||
uint64_t event_buffer_head = cpu_collect_contexts_[channel_id].event_buffer_head;
|
||||
if (event_buffer_head < kMaxNumCpuEventsPerBuffer) {
|
||||
NpKitEvent& event = cpu_collect_contexts_[channel_id].event_buffer[event_buffer_head];
|
||||
@@ -169,6 +175,7 @@ void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t* NpKit::GetCpuTimestamp() {
|
||||
uint64_t* NpKit::GetCpuTimestamp()
|
||||
{
|
||||
return cpu_timestamp_;
|
||||
}
|
||||
53
src/param.cc
53
src/param.cc
@@ -9,48 +9,56 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <errno.h>
|
||||
#include <pthread.h>
|
||||
#include <pwd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <pthread.h>
|
||||
#include <pwd.h>
|
||||
|
||||
const char* userHomeDir() {
|
||||
struct passwd *pwUser = getpwuid(getuid());
|
||||
const char* userHomeDir()
|
||||
{
|
||||
struct passwd* pwUser = getpwuid(getuid());
|
||||
return pwUser == NULL ? NULL : pwUser->pw_dir;
|
||||
}
|
||||
|
||||
void setEnvFile(const char* fileName) {
|
||||
FILE * file = fopen(fileName, "r");
|
||||
if (file == NULL) return;
|
||||
void setEnvFile(const char* fileName)
|
||||
{
|
||||
FILE* file = fopen(fileName, "r");
|
||||
if (file == NULL)
|
||||
return;
|
||||
|
||||
char *line = NULL;
|
||||
char* line = NULL;
|
||||
char envVar[1024];
|
||||
char envValue[1024];
|
||||
size_t n = 0;
|
||||
ssize_t read;
|
||||
while ((read = getline(&line, &n, file)) != -1) {
|
||||
if (line[read-1] == '\n') line[read-1] = '\0';
|
||||
int s=0; // Env Var Size
|
||||
while (line[s] != '\0' && line[s] != '=') s++;
|
||||
if (line[s] == '\0') continue;
|
||||
strncpy(envVar, line, std::min(1023,s));
|
||||
if (line[read - 1] == '\n')
|
||||
line[read - 1] = '\0';
|
||||
int s = 0; // Env Var Size
|
||||
while (line[s] != '\0' && line[s] != '=')
|
||||
s++;
|
||||
if (line[s] == '\0')
|
||||
continue;
|
||||
strncpy(envVar, line, std::min(1023, s));
|
||||
envVar[s] = '\0';
|
||||
s++;
|
||||
strncpy(envValue, line+s, 1023);
|
||||
envValue[1023]='\0';
|
||||
strncpy(envValue, line + s, 1023);
|
||||
envValue[1023] = '\0';
|
||||
setenv(envVar, envValue, 0);
|
||||
//printf("%s : %s->%s\n", fileName, envVar, envValue);
|
||||
// printf("%s : %s->%s\n", fileName, envVar, envValue);
|
||||
}
|
||||
if (line) free(line);
|
||||
if (line)
|
||||
free(line);
|
||||
fclose(file);
|
||||
}
|
||||
|
||||
void initEnv() {
|
||||
void initEnv()
|
||||
{
|
||||
char confFilePath[1024];
|
||||
const char * userDir = userHomeDir();
|
||||
const char* userDir = userHomeDir();
|
||||
if (userDir) {
|
||||
sprintf(confFilePath, "%s/.mscclpp.conf", userDir);
|
||||
setEnvFile(confFilePath);
|
||||
@@ -59,7 +67,8 @@ void initEnv() {
|
||||
setEnvFile(confFilePath);
|
||||
}
|
||||
|
||||
void mscclppLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) {
|
||||
void mscclppLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache)
|
||||
{
|
||||
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||
pthread_mutex_lock(&mutex);
|
||||
if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) {
|
||||
@@ -70,9 +79,9 @@ void mscclppLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, i
|
||||
value = strtoll(str, nullptr, 0);
|
||||
if (errno) {
|
||||
value = deftVal;
|
||||
INFO(MSCCLPP_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal);
|
||||
INFO(MSCCLPP_ALL, "Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal);
|
||||
} else {
|
||||
INFO(MSCCLPP_ALL,"%s set by environment to %lld.", env, (long long)value);
|
||||
INFO(MSCCLPP_ALL, "%s set by environment to %lld.", env, (long long)value);
|
||||
}
|
||||
}
|
||||
__atomic_store_n(cache, value, __ATOMIC_RELAXED);
|
||||
|
||||
182
src/proxy.cc
182
src/proxy.cc
@@ -1,16 +1,16 @@
|
||||
#include "comm.h"
|
||||
#include "socket.h"
|
||||
#include "debug.h"
|
||||
#include "alloc.h"
|
||||
#include "ib.h"
|
||||
#include "checks.h"
|
||||
#include "comm.h"
|
||||
#include "debug.h"
|
||||
#include "ib.h"
|
||||
#include "socket.h"
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <numa.h>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <numa.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit.h"
|
||||
@@ -20,13 +20,13 @@
|
||||
// TODO(chhwang): verify if MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 0 is useful, otherwise delete this option.
|
||||
#define MSCCLPP_PROXY_FLAG_SET_BY_RDMA 1
|
||||
|
||||
#define PROXYCUDACHECK(cmd) \
|
||||
do { \
|
||||
cudaError_t err = cmd; \
|
||||
if (err != cudaSuccess) { \
|
||||
WARN("CUDA error from proxy: %s", cudaGetErrorString(err)); \
|
||||
return NULL; \
|
||||
} \
|
||||
#define PROXYCUDACHECK(cmd) \
|
||||
do { \
|
||||
cudaError_t err = cmd; \
|
||||
if (err != cudaSuccess) { \
|
||||
WARN("CUDA error from proxy: %s", cudaGetErrorString(err)); \
|
||||
return NULL; \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
static void NumaBind(int node)
|
||||
@@ -37,24 +37,27 @@ static void NumaBind(int node)
|
||||
numa_bind_compat(&mask);
|
||||
}
|
||||
|
||||
struct proxyArgs {
|
||||
struct proxyArgs
|
||||
{
|
||||
struct mscclppComm* comm;
|
||||
struct mscclppProxyState *proxyState;
|
||||
struct mscclppProxyState* proxyState;
|
||||
cudaStream_t stream;
|
||||
};
|
||||
|
||||
static void readTrigger(mscclppTrigger *dst, mscclppTrigger *src) {
|
||||
__m128i xmm0 = _mm_load_si128((__m128i *)src);
|
||||
_mm_store_si128((__m128i *)dst, xmm0);
|
||||
static void readTrigger(mscclppTrigger* dst, mscclppTrigger* src)
|
||||
{
|
||||
__m128i xmm0 = _mm_load_si128((__m128i*)src);
|
||||
_mm_store_si128((__m128i*)dst, xmm0);
|
||||
}
|
||||
|
||||
void* mscclppProxyServiceP2P(void* _args) {
|
||||
struct proxyArgs *args = (struct proxyArgs *)_args;
|
||||
struct mscclppComm *comm = args->comm;
|
||||
volatile mscclppProxyRunState_t *run = &args->proxyState->run;
|
||||
mscclppTrigger *fifo = args->proxyState->triggerFifo.hostPtr;
|
||||
volatile uint64_t *fifoTail = args->proxyState->fifoTail.hostPtr;
|
||||
volatile uint64_t *fifoHead = args->proxyState->fifoHead.hostPtr;
|
||||
void* mscclppProxyServiceP2P(void* _args)
|
||||
{
|
||||
struct proxyArgs* args = (struct proxyArgs*)_args;
|
||||
struct mscclppComm* comm = args->comm;
|
||||
volatile mscclppProxyRunState_t* run = &args->proxyState->run;
|
||||
mscclppTrigger* fifo = args->proxyState->triggerFifo.hostPtr;
|
||||
volatile uint64_t* fifoTail = args->proxyState->fifoTail.hostPtr;
|
||||
volatile uint64_t* fifoHead = args->proxyState->fifoHead.hostPtr;
|
||||
|
||||
cudaStream_t stream = args->proxyState->stream;
|
||||
free(_args);
|
||||
@@ -73,32 +76,36 @@ void* mscclppProxyServiceP2P(void* _args) {
|
||||
if (runCheckCounter-- == 0) {
|
||||
runCheckCounter = MSCCLPP_PROXY_RUN_STATE_CHECK_PERIOD;
|
||||
// Check if we need to exit
|
||||
if (*run != MSCCLPP_PROXY_RUN_STATE_RUNNING) break;
|
||||
if (*run != MSCCLPP_PROXY_RUN_STATE_RUNNING)
|
||||
break;
|
||||
}
|
||||
// Poll to see if we are ready to send anything
|
||||
if (cachedFifoTail == *fifoHead) continue; // no need trigger
|
||||
if (cachedFifoTail == *fifoHead)
|
||||
continue; // no need trigger
|
||||
readTrigger(&trigger, &fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]);
|
||||
if (trigger.value[0] == 0) continue; // there is one in progress
|
||||
if (trigger.value[0] == 0)
|
||||
continue; // there is one in progress
|
||||
// there is a trigger value ready to be consumed
|
||||
|
||||
struct mscclppConn *conn = &comm->conns[trigger.fields.connId];
|
||||
|
||||
struct mscclppConn* conn = &comm->conns[trigger.fields.connId];
|
||||
|
||||
// Iterate over what send is needed
|
||||
if (trigger.fields.type & mscclppData){
|
||||
void *srcBuff = (void *)((char *)conn->devConn->localBuff + trigger.fields.srcDataOffset);
|
||||
void *dstBuff = (void *)((char *)conn->devConn->remoteBuff + trigger.fields.dstDataOffset);
|
||||
if (trigger.fields.type & mscclppData) {
|
||||
void* srcBuff = (void*)((char*)conn->devConn->localBuff + trigger.fields.srcDataOffset);
|
||||
void* dstBuff = (void*)((char*)conn->devConn->remoteBuff + trigger.fields.dstDataOffset);
|
||||
PROXYCUDACHECK(cudaMemcpyAsync(dstBuff, srcBuff, trigger.fields.dataSize, cudaMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
if (trigger.fields.type & mscclppFlag) {
|
||||
PROXYCUDACHECK(cudaMemcpyAsync(conn->remoteProxyFlag, conn->devConn->sendEpochId, sizeof(uint64_t), cudaMemcpyDeviceToDevice, stream));
|
||||
PROXYCUDACHECK(cudaMemcpyAsync(conn->remoteProxyFlag, conn->devConn->sendEpochId, sizeof(uint64_t),
|
||||
cudaMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
// Wait for completion
|
||||
if (trigger.fields.type & mscclppSync){
|
||||
if (trigger.fields.type & mscclppSync) {
|
||||
PROXYCUDACHECK(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
// Send completion: reset only the high 64 bits
|
||||
*(volatile uint64_t *)(&fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
|
||||
*(volatile uint64_t*)(&fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
|
||||
cachedFifoTail++;
|
||||
*fifoTail = cachedFifoTail;
|
||||
}
|
||||
@@ -112,28 +119,30 @@ void* mscclppProxyServiceP2P(void* _args) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void* mscclppProxyServiceIb(void* _args) {
|
||||
struct proxyArgs *args = (struct proxyArgs *)_args;
|
||||
struct mscclppComm *comm = args->comm;
|
||||
struct mscclppIbContext *ibCtx = args->proxyState->ibContext;
|
||||
volatile mscclppProxyRunState_t *run = &args->proxyState->run;
|
||||
mscclppTrigger *fifo = args->proxyState->triggerFifo.hostPtr;
|
||||
volatile uint64_t *fifoTail = args->proxyState->fifoTail.hostPtr;
|
||||
volatile uint64_t *fifoHead = args->proxyState->fifoHead.hostPtr;
|
||||
void* mscclppProxyServiceIb(void* _args)
|
||||
{
|
||||
struct proxyArgs* args = (struct proxyArgs*)_args;
|
||||
struct mscclppComm* comm = args->comm;
|
||||
struct mscclppIbContext* ibCtx = args->proxyState->ibContext;
|
||||
volatile mscclppProxyRunState_t* run = &args->proxyState->run;
|
||||
mscclppTrigger* fifo = args->proxyState->triggerFifo.hostPtr;
|
||||
volatile uint64_t* fifoTail = args->proxyState->fifoTail.hostPtr;
|
||||
volatile uint64_t* fifoHead = args->proxyState->fifoHead.hostPtr;
|
||||
free(_args);
|
||||
|
||||
#if (MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 0)
|
||||
enum {
|
||||
enum
|
||||
{
|
||||
SEND_STATE_INIT,
|
||||
SEND_STATE_INPROGRESS
|
||||
};
|
||||
int *sendState;
|
||||
uint64_t *currentProxyFlagValue;
|
||||
if (mscclppCalloc((void **)&sendState, comm->nConns) != mscclppSuccess) {
|
||||
int* sendState;
|
||||
uint64_t* currentProxyFlagValue;
|
||||
if (mscclppCalloc((void**)&sendState, comm->nConns) != mscclppSuccess) {
|
||||
WARN("mscclppCalloc failed: errno %d", errno);
|
||||
return NULL;
|
||||
}
|
||||
if (mscclppCalloc((void **)&currentProxyFlagValue, comm->nConns) != mscclppSuccess) {
|
||||
if (mscclppCalloc((void**)&currentProxyFlagValue, comm->nConns) != mscclppSuccess) {
|
||||
WARN("mscclppCalloc failed: errno %d", errno);
|
||||
return NULL;
|
||||
}
|
||||
@@ -148,7 +157,7 @@ void* mscclppProxyServiceIb(void* _args) {
|
||||
#if (MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 0)
|
||||
for (int i = 0; i < (int)comm->nConns; ++i) {
|
||||
sendState[i] = SEND_STATE_INIT;
|
||||
struct mscclppConn *conn = &comm->conns[i];
|
||||
struct mscclppConn* conn = &comm->conns[i];
|
||||
currentProxyFlagValue[i] = *conn->cpuProxyFlag;
|
||||
// Post recv
|
||||
if (conn->ibQp->postRecv(0) != 0) {
|
||||
@@ -163,17 +172,19 @@ void* mscclppProxyServiceIb(void* _args) {
|
||||
if (runCheckCounter-- == 0) {
|
||||
runCheckCounter = MSCCLPP_PROXY_RUN_STATE_CHECK_PERIOD;
|
||||
// Check if we need to exit
|
||||
if (*run != MSCCLPP_PROXY_RUN_STATE_RUNNING) break;
|
||||
if (*run != MSCCLPP_PROXY_RUN_STATE_RUNNING)
|
||||
break;
|
||||
}
|
||||
|
||||
#if (MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 0)
|
||||
struct mscclppConn *conn = &comm->conns[trigger.fields.connId];
|
||||
struct mscclppConn* conn = &comm->conns[trigger.fields.connId];
|
||||
// Try send
|
||||
if (sendState[trigger.fields.connId] == SEND_STATE_INIT) {
|
||||
if (trigger.value[0] != 0) {
|
||||
// Do send
|
||||
conn->ibQp->stageSendWithImm(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)trigger.fields.dataSize,
|
||||
/*wrId=*/0, /*offset=*/trigger.fields.dataOffset, /*signaled=*/true, /*immData=*/0);
|
||||
/*wrId=*/0, /*offset=*/trigger.fields.dataOffset, /*signaled=*/true,
|
||||
/*immData=*/0);
|
||||
int ret;
|
||||
if ((ret = conn->ibQp->postSend()) != 0) {
|
||||
// Return value is errno.
|
||||
@@ -189,7 +200,7 @@ void* mscclppProxyServiceIb(void* _args) {
|
||||
WARN("rank %d pollCq failed: errno %d", rank, errno);
|
||||
} else {
|
||||
for (int i = 0; i < wcNum; ++i) {
|
||||
struct ibv_wc *wc = &conn->ibQp->wcs[i];
|
||||
struct ibv_wc* wc = &conn->ibQp->wcs[i];
|
||||
if (wc->status != IBV_WC_SUCCESS) {
|
||||
WARN("rank %d wc status %d", rank, wc->status);
|
||||
continue;
|
||||
@@ -200,7 +211,7 @@ void* mscclppProxyServiceIb(void* _args) {
|
||||
}
|
||||
if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
|
||||
// TODO(chhwang): cpu flush
|
||||
*((volatile uint64_t *)conn->cpuProxyFlag) = ++currentProxyFlagValue[trigger.fields.connId];
|
||||
*((volatile uint64_t*)conn->cpuProxyFlag) = ++currentProxyFlagValue[trigger.fields.connId];
|
||||
// recv completion
|
||||
if (conn->ibQp->postRecv(wc->wr_id) != 0) {
|
||||
WARN("postRecv failed: errno %d", errno);
|
||||
@@ -208,7 +219,7 @@ void* mscclppProxyServiceIb(void* _args) {
|
||||
// WARN("rank %d recv completion", rank);
|
||||
} else if (wc->opcode == IBV_WC_RDMA_WRITE) {
|
||||
// send completion
|
||||
*(volatile uint64_t *)(&fifo[fifoTail]) = 0;
|
||||
*(volatile uint64_t*)(&fifo[fifoTail]) = 0;
|
||||
fifoTail = (fifoTail + 1) % MSCCLPP_PROXY_FIFO_SIZE;
|
||||
sendState[trigger.fields.connId] = SEND_STATE_INIT;
|
||||
// WARN("rank %d send completion", rank);
|
||||
@@ -217,21 +228,24 @@ void* mscclppProxyServiceIb(void* _args) {
|
||||
}
|
||||
#else // (MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 1)
|
||||
// Poll to see if we are ready to send anything
|
||||
if (cachedFifoTail == *fifoHead) continue; // no need trigger
|
||||
if (cachedFifoTail == *fifoHead)
|
||||
continue; // no need trigger
|
||||
readTrigger(&trigger, &fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]);
|
||||
if (trigger.value[0] == 0) continue; // there is one in progress
|
||||
if (trigger.value[0] == 0)
|
||||
continue; // there is one in progress
|
||||
// there is a trigger value ready to be consumed
|
||||
|
||||
struct mscclppConn *conn = &comm->conns[trigger.fields.connId];
|
||||
struct mscclppConn* conn = &comm->conns[trigger.fields.connId];
|
||||
|
||||
if (trigger.fields.type & mscclppData) {
|
||||
conn->ibQp->stageSend(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)trigger.fields.dataSize,
|
||||
/*wrId=*/0, /*srcOffset=*/trigger.fields.srcDataOffset, /*dstOffset=*/trigger.fields.dstDataOffset,
|
||||
/*wrId=*/0, /*srcOffset=*/trigger.fields.srcDataOffset,
|
||||
/*dstOffset=*/trigger.fields.dstDataOffset,
|
||||
/*signaled=*/false);
|
||||
#if defined(ENABLE_NPKIT)
|
||||
NpKit::CollectCpuEvent(
|
||||
NPKIT_EVENT_IB_SEND_ENTRY, (uint32_t)trigger.fields.dataSize, 0 /* inflight request differentiator */,
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), trigger.fields.connId /* event collection context index */);
|
||||
NpKit::CollectCpuEvent(NPKIT_EVENT_IB_SEND_ENTRY, (uint32_t)trigger.fields.dataSize,
|
||||
0 /* inflight request differentiator */, *(volatile uint64_t*)NpKit::GetCpuTimestamp(),
|
||||
trigger.fields.connId /* event collection context index */);
|
||||
#endif
|
||||
}
|
||||
if (trigger.fields.type & mscclppFlag) {
|
||||
@@ -255,7 +269,7 @@ void* mscclppProxyServiceIb(void* _args) {
|
||||
continue;
|
||||
}
|
||||
for (int i = 0; i < wcNum; ++i) {
|
||||
struct ibv_wc *wc = &conn->ibQp->wcs[i];
|
||||
struct ibv_wc* wc = &conn->ibQp->wcs[i];
|
||||
if (wc->status != IBV_WC_SUCCESS) {
|
||||
WARN("rank %d wc status %d", rank, wc->status);
|
||||
continue;
|
||||
@@ -268,9 +282,10 @@ void* mscclppProxyServiceIb(void* _args) {
|
||||
// send completion
|
||||
waiting = false;
|
||||
#if defined(ENABLE_NPKIT)
|
||||
NpKit::CollectCpuEvent(
|
||||
NPKIT_EVENT_IB_SEND_EXIT, (uint32_t)trigger.fields.dataSize, 0 /* inflight request differentiator */,
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), trigger.fields.connId /* event collection context index */);
|
||||
NpKit::CollectCpuEvent(NPKIT_EVENT_IB_SEND_EXIT, (uint32_t)trigger.fields.dataSize,
|
||||
0 /* inflight request differentiator */,
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(),
|
||||
trigger.fields.connId /* event collection context index */);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
@@ -279,22 +294,23 @@ void* mscclppProxyServiceIb(void* _args) {
|
||||
}
|
||||
|
||||
// Send completion: reset only the high 64 bits
|
||||
*(volatile uint64_t *)(&fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
|
||||
*(volatile uint64_t*)(&fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
|
||||
cachedFifoTail++;
|
||||
*fifoTail = cachedFifoTail;
|
||||
#endif
|
||||
}
|
||||
|
||||
//TODO(saemal): we need to wait for completion of wc here too
|
||||
// TODO(saemal): we need to wait for completion of wc here too
|
||||
|
||||
*run = MSCCLPP_PROXY_RUN_STATE_IDLE;
|
||||
// WARN("Proxy exits: rank %d", rank);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void* mscclppProxyService(void* _args) {
|
||||
struct proxyArgs *args = (struct proxyArgs *)_args;
|
||||
void *ret;
|
||||
void* mscclppProxyService(void* _args)
|
||||
{
|
||||
struct proxyArgs* args = (struct proxyArgs*)_args;
|
||||
void* ret;
|
||||
if (args->proxyState->ibContext == NULL) {
|
||||
ret = mscclppProxyServiceP2P(_args);
|
||||
} else {
|
||||
@@ -303,12 +319,14 @@ void* mscclppProxyService(void* _args) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppProxyCreate(struct mscclppComm* comm) {
|
||||
mscclppResult_t mscclppProxyCreate(struct mscclppComm* comm)
|
||||
{
|
||||
for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) {
|
||||
struct mscclppProxyState *proxyState = comm->proxyState[i];
|
||||
if (proxyState == NULL) break;
|
||||
struct mscclppProxyState* proxyState = comm->proxyState[i];
|
||||
if (proxyState == NULL)
|
||||
break;
|
||||
|
||||
struct proxyArgs *args;
|
||||
struct proxyArgs* args;
|
||||
MSCCLPPCHECK(mscclppCalloc(&args, 1));
|
||||
args->comm = comm;
|
||||
args->proxyState = proxyState;
|
||||
@@ -324,12 +342,14 @@ mscclppResult_t mscclppProxyCreate(struct mscclppComm* comm) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t mscclppProxyDestroy(struct mscclppComm* comm) {
|
||||
mscclppResult_t mscclppProxyDestroy(struct mscclppComm* comm)
|
||||
{
|
||||
for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) {
|
||||
struct mscclppProxyState *proxyState = comm->proxyState[i];
|
||||
if (proxyState == NULL) break;
|
||||
struct mscclppProxyState* proxyState = comm->proxyState[i];
|
||||
if (proxyState == NULL)
|
||||
break;
|
||||
|
||||
volatile int *run = (volatile int *)&proxyState->run;
|
||||
volatile int* run = (volatile int*)&proxyState->run;
|
||||
if (*run == MSCCLPP_PROXY_RUN_STATE_IDLE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
114
src/utils.cc
114
src/utils.cc
@@ -21,22 +21,24 @@
|
||||
// return ccMajor*10+ccMinor;
|
||||
// }
|
||||
|
||||
mscclppResult_t int64ToBusId(int64_t id, char* busId) {
|
||||
mscclppResult_t int64ToBusId(int64_t id, char* busId)
|
||||
{
|
||||
sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t busIdToInt64(const char* busId, int64_t* id) {
|
||||
char hexStr[17]; // Longest possible int64 hex string + null terminator.
|
||||
mscclppResult_t busIdToInt64(const char* busId, int64_t* id)
|
||||
{
|
||||
char hexStr[17]; // Longest possible int64 hex string + null terminator.
|
||||
int hexOffset = 0;
|
||||
for (int i = 0; hexOffset < sizeof(hexStr) - 1; i++) {
|
||||
char c = busId[i];
|
||||
if (c == '.' || c == ':') continue;
|
||||
if ((c >= '0' && c <= '9') ||
|
||||
(c >= 'A' && c <= 'F') ||
|
||||
(c >= 'a' && c <= 'f')) {
|
||||
if (c == '.' || c == ':')
|
||||
continue;
|
||||
if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) {
|
||||
hexStr[hexOffset++] = busId[i];
|
||||
} else break;
|
||||
} else
|
||||
break;
|
||||
}
|
||||
hexStr[hexOffset] = '\0';
|
||||
*id = strtol(hexStr, NULL, 16);
|
||||
@@ -44,7 +46,8 @@ mscclppResult_t busIdToInt64(const char* busId, int64_t* id) {
|
||||
}
|
||||
|
||||
// Convert a logical cudaDev index to the NVML device minor number
|
||||
mscclppResult_t getBusId(int cudaDev, int64_t *busId) {
|
||||
mscclppResult_t getBusId(int cudaDev, int64_t* busId)
|
||||
{
|
||||
// On most systems, the PCI bus ID comes back as in the 0000:00:00.0
|
||||
// format. Still need to allocate proper space in case PCI domain goes
|
||||
// higher.
|
||||
@@ -54,18 +57,21 @@ mscclppResult_t getBusId(int cudaDev, int64_t *busId) {
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
mscclppResult_t getHostName(char* hostname, int maxlen, const char delim) {
|
||||
mscclppResult_t getHostName(char* hostname, int maxlen, const char delim)
|
||||
{
|
||||
if (gethostname(hostname, maxlen) != 0) {
|
||||
strncpy(hostname, "unknown", maxlen);
|
||||
return mscclppSystemError;
|
||||
}
|
||||
int i = 0;
|
||||
while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++;
|
||||
while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen - 1))
|
||||
i++;
|
||||
hostname[i] = '\0';
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
uint64_t getHash(const char* string, int n) {
|
||||
uint64_t getHash(const char* string, int n)
|
||||
{
|
||||
// Based on DJB2a, result = result * 33 ^ char
|
||||
uint64_t result = 5381;
|
||||
for (int c = 0; c < n; c++) {
|
||||
@@ -83,23 +89,24 @@ uint64_t getHash(const char* string, int n) {
|
||||
* This string can be overridden by using the MSCCLPP_HOSTID env var.
|
||||
*/
|
||||
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
|
||||
uint64_t getHostHash(void) {
|
||||
uint64_t getHostHash(void)
|
||||
{
|
||||
char hostHash[1024];
|
||||
char *hostId;
|
||||
char* hostId;
|
||||
|
||||
// Fall back is the full hostname if something fails
|
||||
(void) getHostName(hostHash, sizeof(hostHash), '\0');
|
||||
(void)getHostName(hostHash, sizeof(hostHash), '\0');
|
||||
int offset = strlen(hostHash);
|
||||
|
||||
if ((hostId = getenv("MSCCLPP_HOSTID")) != NULL) {
|
||||
INFO(MSCCLPP_ENV, "MSCCLPP_HOSTID set by environment to %s", hostId);
|
||||
strncpy(hostHash, hostId, sizeof(hostHash));
|
||||
} else {
|
||||
FILE *file = fopen(HOSTID_FILE, "r");
|
||||
FILE* file = fopen(HOSTID_FILE, "r");
|
||||
if (file != NULL) {
|
||||
char *p;
|
||||
char* p;
|
||||
if (fscanf(file, "%ms", &p) == 1) {
|
||||
strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
|
||||
strncpy(hostHash + offset, p, sizeof(hostHash) - offset - 1);
|
||||
free(p);
|
||||
}
|
||||
}
|
||||
@@ -107,9 +114,9 @@ uint64_t getHostHash(void) {
|
||||
}
|
||||
|
||||
// Make sure the string is terminated
|
||||
hostHash[sizeof(hostHash)-1]='\0';
|
||||
hostHash[sizeof(hostHash) - 1] = '\0';
|
||||
|
||||
TRACE(MSCCLPP_INIT,"unique hostname '%s'", hostHash);
|
||||
TRACE(MSCCLPP_INIT, "unique hostname '%s'", hostHash);
|
||||
|
||||
return getHash(hostHash, strlen(hostHash));
|
||||
}
|
||||
@@ -120,22 +127,26 @@ uint64_t getHostHash(void) {
|
||||
*
|
||||
* $$ $(readlink /proc/self/ns/pid)
|
||||
*/
|
||||
uint64_t getPidHash(void) {
|
||||
uint64_t getPidHash(void)
|
||||
{
|
||||
char pname[1024];
|
||||
// Start off with our pid ($$)
|
||||
sprintf(pname, "%ld", (long) getpid());
|
||||
sprintf(pname, "%ld", (long)getpid());
|
||||
int plen = strlen(pname);
|
||||
int len = readlink("/proc/self/ns/pid", pname+plen, sizeof(pname)-1-plen);
|
||||
if (len < 0) len = 0;
|
||||
int len = readlink("/proc/self/ns/pid", pname + plen, sizeof(pname) - 1 - plen);
|
||||
if (len < 0)
|
||||
len = 0;
|
||||
|
||||
pname[plen+len]='\0';
|
||||
TRACE(MSCCLPP_INIT,"unique PID '%s'", pname);
|
||||
pname[plen + len] = '\0';
|
||||
TRACE(MSCCLPP_INIT, "unique PID '%s'", pname);
|
||||
|
||||
return getHash(pname, strlen(pname));
|
||||
}
|
||||
|
||||
int parseStringList(const char* string, struct netIf* ifList, int maxList) {
|
||||
if (!string) return 0;
|
||||
int parseStringList(const char* string, struct netIf* ifList, int maxList)
|
||||
{
|
||||
if (!string)
|
||||
return 0;
|
||||
|
||||
const char* ptr = string;
|
||||
|
||||
@@ -147,15 +158,18 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) {
|
||||
if (c == ':') {
|
||||
if (ifC > 0) {
|
||||
ifList[ifNum].prefix[ifC] = '\0';
|
||||
ifList[ifNum].port = atoi(ptr+1);
|
||||
ifNum++; ifC = 0;
|
||||
ifList[ifNum].port = atoi(ptr + 1);
|
||||
ifNum++;
|
||||
ifC = 0;
|
||||
}
|
||||
while (c != ',' && c != '\0') c = *(++ptr);
|
||||
while (c != ',' && c != '\0')
|
||||
c = *(++ptr);
|
||||
} else if (c == ',' || c == '\0') {
|
||||
if (ifC > 0) {
|
||||
ifList[ifNum].prefix[ifC] = '\0';
|
||||
ifList[ifNum].port = -1;
|
||||
ifNum++; ifC = 0;
|
||||
ifNum++;
|
||||
ifC = 0;
|
||||
}
|
||||
} else {
|
||||
ifList[ifNum].prefix[ifC] = c;
|
||||
@@ -166,27 +180,32 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) {
|
||||
return ifNum;
|
||||
}
|
||||
|
||||
static bool matchIf(const char* string, const char* ref, bool matchExact) {
|
||||
static bool matchIf(const char* string, const char* ref, bool matchExact)
|
||||
{
|
||||
// Make sure to include '\0' in the exact case
|
||||
int matchLen = matchExact ? strlen(string) + 1 : strlen(ref);
|
||||
return strncmp(string, ref, matchLen) == 0;
|
||||
}
|
||||
|
||||
static bool matchPort(const int port1, const int port2) {
|
||||
if (port1 == -1) return true;
|
||||
if (port2 == -1) return true;
|
||||
if (port1 == port2) return true;
|
||||
static bool matchPort(const int port1, const int port2)
|
||||
{
|
||||
if (port1 == -1)
|
||||
return true;
|
||||
if (port2 == -1)
|
||||
return true;
|
||||
if (port1 == port2)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) {
|
||||
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact)
|
||||
{
|
||||
// Make an exception for the case where no user list is defined
|
||||
if (listSize == 0) return true;
|
||||
if (listSize == 0)
|
||||
return true;
|
||||
|
||||
for (int i=0; i<listSize; i++) {
|
||||
if (matchIf(string, ifList[i].prefix, matchExact)
|
||||
&& matchPort(port, ifList[i].port)) {
|
||||
for (int i = 0; i < listSize; i++) {
|
||||
if (matchIf(string, ifList[i].prefix, matchExact) && matchPort(port, ifList[i].port)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -262,14 +281,13 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz
|
||||
// me->topFrame.unhunks = proxy;
|
||||
// mallocSize = size;
|
||||
// proxy->obj = malloc(mallocSize);
|
||||
// INFO(MSCCLPP_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
|
||||
// if (proxy->obj == nullptr) goto malloc_exhausted;
|
||||
// return proxy->obj;
|
||||
// INFO(MSCCLPP_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long
|
||||
// long)mallocSize); if (proxy->obj == nullptr) goto malloc_exhausted; return proxy->obj;
|
||||
// }
|
||||
|
||||
// malloc_exhausted:
|
||||
// WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize);
|
||||
// abort();
|
||||
// WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long
|
||||
// long)mallocSize); abort();
|
||||
// }
|
||||
|
||||
// void mscclppMemoryStackDestruct(struct mscclppMemoryStack* me) {
|
||||
|
||||
@@ -3,36 +3,36 @@
|
||||
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
|
||||
#include "mpi.h"
|
||||
#endif // MSCCLPP_USE_MPI_FOR_TESTS
|
||||
#include <iostream>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <unistd.h>
|
||||
#include <unordered_map>
|
||||
|
||||
|
||||
static int nranksPerNode = 8;
|
||||
|
||||
// Propagate errors up
|
||||
|
||||
#define MSCCLPPCHECK(call) do { \
|
||||
mscclppResult_t res = call; \
|
||||
if (res != mscclppSuccess && res != mscclppInProgress) { \
|
||||
/* Print the back trace*/ \
|
||||
printf("Failure at %s:%d -> %s\n", __FILE__, __LINE__, mscclppGetErrorString(res)); \
|
||||
return res; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define MSCCLPPCHECK(call) \
|
||||
do { \
|
||||
mscclppResult_t res = call; \
|
||||
if (res != mscclppSuccess && res != mscclppInProgress) { \
|
||||
/* Print the back trace*/ \
|
||||
printf("Failure at %s:%d -> %s\n", __FILE__, __LINE__, mscclppGetErrorString(res)); \
|
||||
return res; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Check CUDA RT calls
|
||||
#define CUDACHECK(cmd) do { \
|
||||
cudaError_t err = cmd; \
|
||||
if( err != cudaSuccess ) { \
|
||||
printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while(false)
|
||||
#define CUDACHECK(cmd) \
|
||||
do { \
|
||||
cudaError_t err = cmd; \
|
||||
if (err != cudaSuccess) { \
|
||||
printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
// Measure current time in second.
|
||||
static double getTime(void)
|
||||
@@ -47,33 +47,36 @@ static double getTime(void)
|
||||
|
||||
__constant__ mscclppDevConn_t constDevConns[16];
|
||||
|
||||
__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU){
|
||||
__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU)
|
||||
{
|
||||
// this allgather is really simple and implemented as an alltoall
|
||||
|
||||
// this thread's role is a sender role
|
||||
// put your data asynchronously
|
||||
devConn.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
|
||||
devConn.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int));
|
||||
// make sure everyone has put their data before some thread randomly blocks everyone else in signal
|
||||
__syncthreads();
|
||||
// push with flag and sync to make sure the data is received
|
||||
devConn.signal();
|
||||
|
||||
|
||||
// this thread's role is a receiver role. wait on the semaphore to make sure the data is ready
|
||||
devConn.wait();
|
||||
}
|
||||
|
||||
__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU){
|
||||
__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU)
|
||||
{
|
||||
// this allgather algorithm works as follows:
|
||||
// Step 1: GPU rank i sends data to GPU rank (i+1) % world_size
|
||||
// Step 2: GPU rank i waits for data from GPU rank (i+2) % world_size
|
||||
// ...
|
||||
// This order is much better for DMA engine for NVLinks
|
||||
|
||||
for (int i = 1; i < world_size; i++){
|
||||
for (int i = 1; i < world_size; i++) {
|
||||
__syncthreads();
|
||||
if (remoteRank != ((rank+i) % world_size)) continue;
|
||||
if (remoteRank != ((rank + i) % world_size))
|
||||
continue;
|
||||
// put your data to GPU (rank+i) % world_size and signal all in one call
|
||||
devConn.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
|
||||
devConn.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int));
|
||||
}
|
||||
// all connections wait for the signal from the sender
|
||||
devConn.wait();
|
||||
@@ -82,7 +85,8 @@ __device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, i
|
||||
__global__ void kernel(int rank, int world_size, int nelemsPerGPU, int kernel)
|
||||
{
|
||||
// only use a single thread from each warp
|
||||
if (threadIdx.x % 32 != 0) return;
|
||||
if (threadIdx.x % 32 != 0)
|
||||
return;
|
||||
|
||||
// find the mapping between remoteRank and devConns
|
||||
int warpId = threadIdx.x / 32;
|
||||
@@ -106,7 +110,7 @@ int rankToNode(int rank)
|
||||
return rank / nranksPerNode;
|
||||
}
|
||||
|
||||
void print_usage(const char *prog)
|
||||
void print_usage(const char* prog)
|
||||
{
|
||||
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
|
||||
printf("usage: %s IP:PORT [rank nranks]\n", prog);
|
||||
@@ -115,15 +119,16 @@ void print_usage(const char *prog)
|
||||
#endif
|
||||
}
|
||||
|
||||
void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, int nelemsPerGPU, int** data_h, int **data_d)
|
||||
void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, int nelemsPerGPU, int** data_h,
|
||||
int** data_d)
|
||||
{
|
||||
CUDACHECK(cudaMalloc(data_d, dataSize));
|
||||
CUDACHECK(cudaMemset(*data_d, 0, dataSize));
|
||||
|
||||
*data_h = new int[nelemsPerGPU*world_size];
|
||||
for (int i = 0; i < nelemsPerGPU*world_size; i++){
|
||||
*data_h = new int[nelemsPerGPU * world_size];
|
||||
for (int i = 0; i < nelemsPerGPU * world_size; i++) {
|
||||
int val = i + 1;
|
||||
if (i / nelemsPerGPU == rank){
|
||||
if (i / nelemsPerGPU == rank) {
|
||||
(*data_h)[i] = val;
|
||||
} else {
|
||||
(*data_h)[i] = 0;
|
||||
@@ -132,16 +137,18 @@ void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSiz
|
||||
CUDACHECK(cudaMemcpy(*data_d, *data_h, dataSize, cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
mscclppResult_t setupMscclppConnections(int rank, int world_size, mscclppComm_t comm, int* data_d, size_t dataSize){
|
||||
mscclppResult_t setupMscclppConnections(int rank, int world_size, mscclppComm_t comm, int* data_d, size_t dataSize)
|
||||
{
|
||||
int thisNode = rankToNode(rank);
|
||||
int cudaNum = rankToLocalRank(rank);
|
||||
std::string ibDevStr = "mlx5_ib" + std::to_string(cudaNum);
|
||||
|
||||
for (int r = 0; r < world_size; ++r) {
|
||||
if (r == rank) continue;
|
||||
if (r == rank)
|
||||
continue;
|
||||
mscclppTransport_t transportType;
|
||||
const char* ibDev = ibDevStr.c_str();
|
||||
if (rankToNode(r) == thisNode){
|
||||
if (rankToNode(r) == thisNode) {
|
||||
ibDev = NULL;
|
||||
transportType = mscclppTransportP2P;
|
||||
} else {
|
||||
@@ -153,7 +160,7 @@ mscclppResult_t setupMscclppConnections(int rank, int world_size, mscclppComm_t
|
||||
|
||||
MSCCLPPCHECK(mscclppConnectionSetup(comm));
|
||||
|
||||
mscclppDevConn_t *devConns;
|
||||
mscclppDevConn_t* devConns;
|
||||
int nCons;
|
||||
MSCCLPPCHECK(mscclppGetAllDeviceConnections(comm, &devConns, &nCons));
|
||||
|
||||
@@ -162,36 +169,39 @@ mscclppResult_t setupMscclppConnections(int rank, int world_size, mscclppComm_t
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
void printUsage(const char* prog, bool isMpi) {
|
||||
if (isMpi){
|
||||
void printUsage(const char* prog, bool isMpi)
|
||||
{
|
||||
if (isMpi) {
|
||||
std::string st = "you are using MPI for this test\n";
|
||||
st += "two possilbe usages are:\n";
|
||||
st += "> " + std::string(prog) + "\n";
|
||||
st += "or\n";
|
||||
st += "> " + std::string(prog) + " -ip_port [ip:port]\n";
|
||||
st += "> " + std::string(prog) + " -ip_port [ip:port]\n";
|
||||
printf("%s", st.c_str());
|
||||
} else {
|
||||
std::string st = "you are NOT using MPI for this test\n";
|
||||
st += "the only possible usage:\n";
|
||||
st += "> " + std::string(prog) + " -ip_port [ip:port] -rank [rank] -nranks [nranks]\n";
|
||||
printf("%s", st.c_str());
|
||||
printf("%s", st.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
std::unordered_map<std::string, std::string> parseArgs(int argc, const char* argv[], bool isMpi) {
|
||||
std::unordered_map<std::string, std::string> parseArgs(int argc, const char* argv[], bool isMpi)
|
||||
{
|
||||
std::unordered_map<std::string, std::string> options;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
if (arg == "-rankspernode") {
|
||||
if (isMpi){
|
||||
if (isMpi) {
|
||||
fprintf(stderr, "Error: -rankspernode should not be specified with MPI.\n");
|
||||
exit(-1);
|
||||
}
|
||||
if (i + 1 < argc) {
|
||||
options["rankspernode"] = argv[++i];
|
||||
} else {
|
||||
fprintf(stderr, "Error: -rankspernode option requires an argument.\n");;
|
||||
fprintf(stderr, "Error: -rankspernode option requires an argument.\n");
|
||||
;
|
||||
exit(-1);
|
||||
}
|
||||
} else if (arg == "-kernel") {
|
||||
@@ -209,7 +219,7 @@ std::unordered_map<std::string, std::string> parseArgs(int argc, const char* arg
|
||||
exit(-1);
|
||||
}
|
||||
} else if (arg == "-rank") {
|
||||
if (isMpi){
|
||||
if (isMpi) {
|
||||
fprintf(stderr, "Error: -rank should not be specified with MPI.\n");
|
||||
exit(-1);
|
||||
}
|
||||
@@ -220,7 +230,7 @@ std::unordered_map<std::string, std::string> parseArgs(int argc, const char* arg
|
||||
exit(-1);
|
||||
}
|
||||
} else if (arg == "-nranks") {
|
||||
if (isMpi){
|
||||
if (isMpi) {
|
||||
fprintf(stderr, "Error: -nranks should not be specified with MPI.\n");
|
||||
exit(-1);
|
||||
}
|
||||
@@ -248,8 +258,7 @@ std::unordered_map<std::string, std::string> parseArgs(int argc, const char* arg
|
||||
return options;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, const char *argv[])
|
||||
int main(int argc, const char* argv[])
|
||||
{
|
||||
bool isMpi = false;
|
||||
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
|
||||
@@ -266,8 +275,7 @@ int main(int argc, const char *argv[])
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
|
||||
// get the local number of nodes with MPI
|
||||
MPI_Comm shmcomm;
|
||||
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
|
||||
MPI_INFO_NULL, &shmcomm);
|
||||
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
|
||||
int shmrank;
|
||||
MPI_Comm_size(shmcomm, &shmrank);
|
||||
nranksPerNode = shmrank;
|
||||
@@ -300,29 +308,33 @@ int main(int argc, const char *argv[])
|
||||
int cudaNum = rankToLocalRank(rank);
|
||||
CUDACHECK(cudaSetDevice(cudaNum));
|
||||
|
||||
if (rank == 0) printf("Initializing MSCCL++\n");
|
||||
if (rank == 0)
|
||||
printf("Initializing MSCCL++\n");
|
||||
mscclppComm_t comm;
|
||||
MSCCLPPCHECK(mscclppCommInitRank(&comm, world_size, ip_port, rank));
|
||||
|
||||
int *data_d;
|
||||
int *data_h;
|
||||
size_t dataSize = 1024*1024*1024;
|
||||
int* data_d;
|
||||
int* data_h;
|
||||
size_t dataSize = 1024 * 1024 * 1024;
|
||||
if (parsedArgs.find("datasize") != parsedArgs.end()) {
|
||||
dataSize = std::stoi(parsedArgs["datasize"]);
|
||||
}
|
||||
int nelemsPerGPU = dataSize / sizeof(int) / world_size;
|
||||
|
||||
if (rank == 0) printf("Initializing data for allgather test\n");
|
||||
if (rank == 0)
|
||||
printf("Initializing data for allgather test\n");
|
||||
initializeAndAllocateAllGatherData(rank, world_size, dataSize, nelemsPerGPU, &data_h, &data_d);
|
||||
|
||||
if (rank == 0) printf("Setting up the connection in MSCCL++\n");
|
||||
if (rank == 0)
|
||||
printf("Setting up the connection in MSCCL++\n");
|
||||
MSCCLPPCHECK(setupMscclppConnections(rank, world_size, comm, data_d, dataSize));
|
||||
|
||||
if (rank == 0) printf("Launching MSCCL++ proxy threads\n");
|
||||
if (rank == 0)
|
||||
printf("Launching MSCCL++ proxy threads\n");
|
||||
MSCCLPPCHECK(mscclppProxyLaunch(comm));
|
||||
|
||||
|
||||
if (rank == 0) printf("Testing the correctness of AllGather implementation\n");
|
||||
if (rank == 0)
|
||||
printf("Testing the correctness of AllGather implementation\n");
|
||||
cudaStream_t stream;
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
@@ -331,9 +343,9 @@ int main(int argc, const char *argv[])
|
||||
CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost));
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
|
||||
for (int i = 0; i < nelemsPerGPU*world_size; i++){
|
||||
for (int i = 0; i < nelemsPerGPU * world_size; i++) {
|
||||
int val = i + 1;
|
||||
if (data_h[i] != val){
|
||||
if (data_h[i] != val) {
|
||||
printf("oh uh! data_h[%d] (%d) != val (%d)\n", i, data_h[i], val);
|
||||
break;
|
||||
}
|
||||
@@ -341,11 +353,13 @@ int main(int argc, const char *argv[])
|
||||
int tmp[16];
|
||||
// A simple barrier
|
||||
MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int)));
|
||||
if (rank == 0) printf("Successfully checked the correctness\n");
|
||||
if (rank == 0)
|
||||
printf("Successfully checked the correctness\n");
|
||||
|
||||
// Perf test
|
||||
int iterwithoutcudagraph = 10;
|
||||
if (rank == 0) printf("Running %d iterations of the kernel without CUDA graph\n", iterwithoutcudagraph);
|
||||
if (rank == 0)
|
||||
printf("Running %d iterations of the kernel without CUDA graph\n", iterwithoutcudagraph);
|
||||
for (int i = 0; i < iterwithoutcudagraph; ++i) {
|
||||
kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nelemsPerGPU, kernelNum);
|
||||
}
|
||||
@@ -354,43 +368,51 @@ int main(int argc, const char *argv[])
|
||||
|
||||
// cudaGraph Capture
|
||||
int cudagraphiter = 10;
|
||||
if (rank == 0) printf("Capturing %d iterations of the kernel in a CUDA graph\n", cudagraphiter);
|
||||
if (rank == 0)
|
||||
printf("Capturing %d iterations of the kernel in a CUDA graph\n", cudagraphiter);
|
||||
cudaGraph_t graph;
|
||||
cudaGraphExec_t instance;
|
||||
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
|
||||
for (int i = 0; i < cudagraphiter; ++i) {
|
||||
kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nelemsPerGPU, kernelNum);
|
||||
kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nelemsPerGPU, kernelNum);
|
||||
}
|
||||
cudaStreamEndCapture(stream, &graph);
|
||||
cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
|
||||
|
||||
int cudagraphwarmup = 10;
|
||||
if (rank == 0) printf("Warming up %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphwarmup, cudagraphiter);
|
||||
if (rank == 0)
|
||||
printf("Warming up %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphwarmup,
|
||||
cudagraphiter);
|
||||
for (int i = 0; i < cudagraphwarmup; ++i) {
|
||||
cudaGraphLaunch(instance, stream);
|
||||
cudaGraphLaunch(instance, stream);
|
||||
}
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
// measure runtime
|
||||
// measure runtime
|
||||
int cudagraphlaunch = 10;
|
||||
if (rank == 0) printf("Running %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphlaunch, cudagraphiter);
|
||||
if (rank == 0)
|
||||
printf("Running %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphlaunch,
|
||||
cudagraphiter);
|
||||
MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int)));
|
||||
double t0 = getTime();
|
||||
for (int i = 0; i < cudagraphlaunch; ++i) {
|
||||
cudaGraphLaunch(instance, stream);
|
||||
cudaGraphLaunch(instance, stream);
|
||||
}
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
double t1 = getTime();
|
||||
float ms = (t1-t0)*1000.0;
|
||||
double time_in_us = ms * 1000. / (float) cudagraphlaunch / (float) cudagraphiter;
|
||||
printf("Rank %d report: size %lu time: %f us/iter algBW %f GBps\n", rank, dataSize, time_in_us, (double) (dataSize) / 1e9 /(time_in_us/1e6));
|
||||
float ms = (t1 - t0) * 1000.0;
|
||||
double time_in_us = ms * 1000. / (float)cudagraphlaunch / (float)cudagraphiter;
|
||||
printf("Rank %d report: size %lu time: %f us/iter algBW %f GBps\n", rank, dataSize, time_in_us,
|
||||
(double)(dataSize) / 1e9 / (time_in_us / 1e6));
|
||||
MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int)));
|
||||
|
||||
if (rank == 0) printf("Stopping MSCCL++ proxy threads\n");
|
||||
if (rank == 0)
|
||||
printf("Stopping MSCCL++ proxy threads\n");
|
||||
MSCCLPPCHECK(mscclppProxyStop(comm));
|
||||
|
||||
if (rank == 0) printf("Destroying MSCCL++ communicator\n");
|
||||
if (rank == 0)
|
||||
printf("Destroying MSCCL++ communicator\n");
|
||||
MSCCLPPCHECK(mscclppCommDestroy(comm));
|
||||
printf("Rank %d succeeded!\n", rank);
|
||||
|
||||
|
||||
@@ -4,19 +4,20 @@
|
||||
#endif // MSCCLPP_USE_MPI_FOR_TESTS
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <string>
|
||||
#include <unistd.h>
|
||||
|
||||
#define RANKS_PER_NODE 8
|
||||
|
||||
// Check CUDA RT calls
|
||||
#define CUDACHECK(cmd) do { \
|
||||
cudaError_t err = cmd; \
|
||||
if( err != cudaSuccess ) { \
|
||||
printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while(false)
|
||||
#define CUDACHECK(cmd) \
|
||||
do { \
|
||||
cudaError_t err = cmd; \
|
||||
if (err != cudaSuccess) { \
|
||||
printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
// Measure current time in second.
|
||||
static double getTime(void)
|
||||
@@ -33,43 +34,52 @@ __constant__ mscclppDevConn_t constDevConns[16];
|
||||
|
||||
__global__ void kernel(int rank, int world_size, int nelemsPerGPU)
|
||||
{
|
||||
if (threadIdx.x % 32 != 0) return;
|
||||
if (threadIdx.x % 32 != 0)
|
||||
return;
|
||||
|
||||
int warpId = threadIdx.x / 32;
|
||||
bool isIB = false;
|
||||
if (warpId >= world_size-1) isIB = true;
|
||||
if (isIB) warpId = warpId - (world_size-1);
|
||||
if (warpId >= world_size - 1)
|
||||
isIB = true;
|
||||
if (isIB)
|
||||
warpId = warpId - (world_size - 1);
|
||||
int remoteRank = (warpId < rank) ? warpId : warpId + 1;
|
||||
mscclppDevConn_t devConn = constDevConns[remoteRank];
|
||||
if (isIB) devConn = constDevConns[remoteRank + world_size];
|
||||
if (isIB)
|
||||
devConn = constDevConns[remoteRank + world_size];
|
||||
|
||||
// Each warp receives data from different ranks
|
||||
// Each warp receives data from different ranks
|
||||
#if 1
|
||||
|
||||
// Trigger sending data, flag and synchronize after
|
||||
devConn.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
|
||||
devConn.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int));
|
||||
|
||||
devConn.wait();
|
||||
|
||||
#else
|
||||
for (int i = 1; i < world_size; i++){
|
||||
for (int i = 1; i < world_size; i++) {
|
||||
__syncthreads();
|
||||
if (remoteRank != ((rank+i) % world_size)) continue;
|
||||
if (remoteRank != ((rank + i) % world_size))
|
||||
continue;
|
||||
|
||||
// Trigger sending data, flag and synchronize after
|
||||
int ibPortion = nelemsPerGPU/12;//nelemsPerGPU/12;
|
||||
int ibPortion = nelemsPerGPU / 12; // nelemsPerGPU/12;
|
||||
if (isIB)
|
||||
devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync, rank * nelemsPerGPU * sizeof(int) + (nelemsPerGPU - ibPortion)*sizeof(int), rank * nelemsPerGPU * sizeof(int) + (nelemsPerGPU - ibPortion)*sizeof(int), ibPortion*sizeof(int));
|
||||
else
|
||||
devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync, rank * nelemsPerGPU * sizeof(int), rank * nelemsPerGPU * sizeof(int), (nelemsPerGPU-ibPortion)*sizeof(int));
|
||||
devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync,
|
||||
rank * nelemsPerGPU * sizeof(int) + (nelemsPerGPU - ibPortion) * sizeof(int),
|
||||
rank * nelemsPerGPU * sizeof(int) + (nelemsPerGPU - ibPortion) * sizeof(int),
|
||||
ibPortion * sizeof(int));
|
||||
else
|
||||
devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync, rank * nelemsPerGPU * sizeof(int),
|
||||
rank * nelemsPerGPU * sizeof(int), (nelemsPerGPU - ibPortion) * sizeof(int));
|
||||
// Wait on the request to make sure it is safe to reuse buffer and flag
|
||||
auto req = devConn.fifo.putWithSignal(dataOffset, dataSize);
|
||||
devConn.fifo.sync(req);
|
||||
auto req = devConn.fifo.putWithSignal(dataOffset, dataSize);
|
||||
devConn.fifo.sync(req);
|
||||
}
|
||||
// Wait for receiving data from remote rank
|
||||
while (*proxyFlag == baseFlag);
|
||||
while (*proxyFlag == baseFlag)
|
||||
;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
int rankToLocalRank(int rank)
|
||||
@@ -108,7 +118,7 @@ int cudaNumToIbNum(int cudaNum)
|
||||
return ibNum;
|
||||
}
|
||||
|
||||
void print_usage(const char *prog)
|
||||
void print_usage(const char* prog)
|
||||
{
|
||||
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
|
||||
printf("usage: %s IP:PORT [rank nranks]\n", prog);
|
||||
@@ -117,14 +127,14 @@ void print_usage(const char *prog)
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, const char *argv[])
|
||||
int main(int argc, const char* argv[])
|
||||
{
|
||||
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
|
||||
if (argc != 2 && argc != 4) {
|
||||
print_usage(argv[0]);
|
||||
return -1;
|
||||
}
|
||||
const char *ip_port = argv[1];
|
||||
const char* ip_port = argv[1];
|
||||
int rank;
|
||||
int world_size;
|
||||
if (argc == 4) {
|
||||
@@ -140,7 +150,7 @@ int main(int argc, const char *argv[])
|
||||
print_usage(argv[0]);
|
||||
return -1;
|
||||
}
|
||||
const char *ip_port = argv[1];
|
||||
const char* ip_port = argv[1];
|
||||
int rank = atoi(argv[2]);
|
||||
int world_size = atoi(argv[3]);
|
||||
#endif
|
||||
@@ -155,19 +165,19 @@ int main(int argc, const char *argv[])
|
||||
mscclppComm_t comm;
|
||||
MSCCLPPCHECK(mscclppCommInitRank(&comm, world_size, rank, ip_port));
|
||||
|
||||
int *data_d;
|
||||
uint64_t *flag_d;
|
||||
size_t data_size = 1536*1024*1024;
|
||||
int* data_d;
|
||||
uint64_t* flag_d;
|
||||
size_t data_size = 1536 * 1024 * 1024;
|
||||
int nelemsPerGPU = data_size / sizeof(int) / world_size;
|
||||
CUDACHECK(cudaMalloc(&data_d, data_size));
|
||||
CUDACHECK(cudaMalloc(&flag_d, sizeof(uint64_t)));
|
||||
CUDACHECK(cudaMemset(data_d, 0, data_size));
|
||||
CUDACHECK(cudaMemset(flag_d, 0, sizeof(uint64_t)));
|
||||
|
||||
int* data_h = new int[nelemsPerGPU*world_size];
|
||||
for (int i = 0; i < nelemsPerGPU*world_size; i++){
|
||||
int* data_h = new int[nelemsPerGPU * world_size];
|
||||
for (int i = 0; i < nelemsPerGPU * world_size; i++) {
|
||||
int val = i + 1;
|
||||
if (i / nelemsPerGPU == rank){
|
||||
if (i / nelemsPerGPU == rank) {
|
||||
data_h[i] = val;
|
||||
} else {
|
||||
data_h[i] = 0;
|
||||
@@ -177,7 +187,8 @@ int main(int argc, const char *argv[])
|
||||
|
||||
mscclppDevConn_t devConns[16];
|
||||
for (int r = 0; r < world_size; ++r) {
|
||||
if (r == rank) continue;
|
||||
if (r == rank)
|
||||
continue;
|
||||
mscclppTransport_t transportType;
|
||||
const char* ibDev = NULL;
|
||||
transportType = mscclppTransportP2P;
|
||||
@@ -185,12 +196,14 @@ int main(int argc, const char *argv[])
|
||||
MSCCLPPCHECK(mscclppConnect(comm, &devConns[r], r, 0, data_d, data_size, flag_d, transportType, ibDev));
|
||||
}
|
||||
for (int r = 0; r < world_size; ++r) {
|
||||
if (r == rank) continue;
|
||||
if (r == rank)
|
||||
continue;
|
||||
mscclppTransport_t transportType;
|
||||
const char* ibDev = ibDevStr.c_str();
|
||||
transportType = mscclppTransportIB;
|
||||
// Connect with all other ranks
|
||||
MSCCLPPCHECK(mscclppConnect(comm, &devConns[r+world_size], r, 0, data_d, data_size, flag_d, transportType, ibDev));
|
||||
MSCCLPPCHECK(
|
||||
mscclppConnect(comm, &devConns[r + world_size], r, 0, data_d, data_size, flag_d, transportType, ibDev));
|
||||
}
|
||||
|
||||
MSCCLPPCHECK(mscclppConnectionSetup(comm));
|
||||
@@ -202,16 +215,15 @@ int main(int argc, const char *argv[])
|
||||
cudaStream_t stream;
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
|
||||
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
kernel<<<1, 32 * 2*(world_size - 1), 0, stream>>>(rank, world_size, nelemsPerGPU);
|
||||
kernel<<<1, 32 * 2 * (world_size - 1), 0, stream>>>(rank, world_size, nelemsPerGPU);
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
CUDACHECK(cudaMemcpy(data_h, data_d, data_size, cudaMemcpyDeviceToHost));
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
|
||||
for (int i = 0; i < nelemsPerGPU*world_size; i++){
|
||||
for (int i = 0; i < nelemsPerGPU * world_size; i++) {
|
||||
int val = i + 1;
|
||||
if (data_h[i] != val){
|
||||
if (data_h[i] != val) {
|
||||
printf("oh uh things went wrong! data_h[%d] (%d) != val (%d)\n", i, data_h[i], val);
|
||||
break;
|
||||
}
|
||||
@@ -219,11 +231,11 @@ int main(int argc, const char *argv[])
|
||||
int tmp[16];
|
||||
MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int)));
|
||||
|
||||
// // Perf test
|
||||
// cudaEvent_t ev_start;
|
||||
// cudaEvent_t ev_end;
|
||||
// CUDACHECK(cudaEventCreate(&ev_start));
|
||||
// CUDACHECK(cudaEventCreate(&ev_end));
|
||||
// // Perf test
|
||||
// cudaEvent_t ev_start;
|
||||
// cudaEvent_t ev_end;
|
||||
// CUDACHECK(cudaEventCreate(&ev_start));
|
||||
// CUDACHECK(cudaEventCreate(&ev_end));
|
||||
|
||||
// warm up
|
||||
// int warmupiter = 1000;
|
||||
@@ -239,33 +251,34 @@ int main(int argc, const char *argv[])
|
||||
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
|
||||
int cudagraphiter = 10;
|
||||
for (int i = 0; i < cudagraphiter; ++i) {
|
||||
kernel<<<1, 32 * 2*(world_size - 1), 0, stream>>>(rank, world_size, nelemsPerGPU);
|
||||
kernel<<<1, 32 * 2 * (world_size - 1), 0, stream>>>(rank, world_size, nelemsPerGPU);
|
||||
}
|
||||
cudaStreamEndCapture(stream, &graph);
|
||||
cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
|
||||
|
||||
int cudagraphwarmup = 10;
|
||||
for (int i = 0; i < cudagraphwarmup; ++i) {
|
||||
cudaGraphLaunch(instance, stream);
|
||||
cudaGraphLaunch(instance, stream);
|
||||
}
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
// measure runtime
|
||||
// CUDACHECK(cudaEventRecord(ev_start, stream));
|
||||
// measure runtime
|
||||
// CUDACHECK(cudaEventRecord(ev_start, stream));
|
||||
double t0 = getTime();
|
||||
int cudagraphlaunch = 10;
|
||||
for (int i = 0; i < cudagraphlaunch; ++i) {
|
||||
// kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
|
||||
cudaGraphLaunch(instance, stream);
|
||||
// kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
|
||||
cudaGraphLaunch(instance, stream);
|
||||
}
|
||||
// CUDACHECK(cudaEventRecord(ev_end, stream));
|
||||
// CUDACHECK(cudaEventRecord(ev_end, stream));
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
double t1 = getTime();
|
||||
float ms = (t1-t0)*1000.0;
|
||||
// CUDACHECK(cudaEventElapsedTime(&ms, ev_start, ev_end));
|
||||
double time_in_us = ms * 1000. / (float) cudagraphlaunch / (float) cudagraphiter;
|
||||
printf("rank: %d, time: %f us/iter algBW %f\n", rank, time_in_us, (double) (data_size) / 1024./1024./1024./(time_in_us/1e6));
|
||||
float ms = (t1 - t0) * 1000.0;
|
||||
// CUDACHECK(cudaEventElapsedTime(&ms, ev_start, ev_end));
|
||||
double time_in_us = ms * 1000. / (float)cudagraphlaunch / (float)cudagraphiter;
|
||||
printf("rank: %d, time: %f us/iter algBW %f\n", rank, time_in_us,
|
||||
(double)(data_size) / 1024. / 1024. / 1024. / (time_in_us / 1e6));
|
||||
|
||||
MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int)));
|
||||
MSCCLPPCHECK(mscclppProxyStop(comm));
|
||||
|
||||
@@ -1,45 +1,49 @@
|
||||
#include "mscclpp.h"
|
||||
#include <cuda/barrier>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include <cuda/barrier>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define MSCCLPPCHECK(call) do { \
|
||||
mscclppResult_t res = call; \
|
||||
if (res != mscclppSuccess && res != mscclppInProgress) { \
|
||||
/* Print the back trace*/ \
|
||||
printf("Failure at %s:%d -> %d\n", __FILE__, __LINE__, res); \
|
||||
return res; \
|
||||
} \
|
||||
} while (0);
|
||||
#define MSCCLPPCHECK(call) \
|
||||
do { \
|
||||
mscclppResult_t res = call; \
|
||||
if (res != mscclppSuccess && res != mscclppInProgress) { \
|
||||
/* Print the back trace*/ \
|
||||
printf("Failure at %s:%d -> %d\n", __FILE__, __LINE__, res); \
|
||||
return res; \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
#define CUDACHECK(cmd) do { \
|
||||
cudaError_t err = cmd; \
|
||||
if( err != cudaSuccess ) { \
|
||||
printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while(false)
|
||||
#define CUDACHECK(cmd) \
|
||||
do { \
|
||||
cudaError_t err = cmd; \
|
||||
if (err != cudaSuccess) { \
|
||||
printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
struct Volume {
|
||||
struct Volume
|
||||
{
|
||||
size_t offset;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
__host__ __device__ Volume chunkVolume(size_t totalSize, size_t totalChunks, size_t chunkIdx, size_t chunkCount) {
|
||||
__host__ __device__ Volume chunkVolume(size_t totalSize, size_t totalChunks, size_t chunkIdx, size_t chunkCount)
|
||||
{
|
||||
size_t remainder = totalSize % totalChunks;
|
||||
size_t smallChunk = totalSize / totalChunks;
|
||||
size_t largeChunk = smallChunk + 1;
|
||||
size_t numLargeChunks = chunkIdx < remainder ? remainder - chunkIdx : 0;
|
||||
size_t numSmallChunks = chunkCount - numLargeChunks;
|
||||
size_t offset = (remainder - numLargeChunks) * largeChunk +
|
||||
(chunkIdx > remainder ? chunkIdx - remainder : 0) * smallChunk;
|
||||
size_t offset =
|
||||
(remainder - numLargeChunks) * largeChunk + (chunkIdx > remainder ? chunkIdx - remainder : 0) * smallChunk;
|
||||
return Volume{offset, numLargeChunks * largeChunk + numSmallChunks * smallChunk};
|
||||
}
|
||||
|
||||
template<class T, void (*reduce)(T*,T*,size_t)>
|
||||
struct AllreduceAllpairs {
|
||||
template <class T, void (*reduce)(T*, T*, size_t)> struct AllreduceAllpairs
|
||||
{
|
||||
int rank;
|
||||
int nRanks;
|
||||
T* userData;
|
||||
@@ -50,7 +54,8 @@ struct AllreduceAllpairs {
|
||||
uint64_t* connFlags;
|
||||
cuda::barrier<cuda::thread_scope_device>* barrier;
|
||||
|
||||
__device__ void run(int idx) {
|
||||
__device__ void run(int idx)
|
||||
{
|
||||
int myPeer = peerRank(idx, rank);
|
||||
mscclppDevConn_t phase1SendConn = conns[phase1SendConnIdx(myPeer)];
|
||||
mscclppDevConn_t phase1RecvConn = conns[phase1RecvConnIdx(myPeer)];
|
||||
@@ -92,59 +97,70 @@ struct AllreduceAllpairs {
|
||||
Volume srcVolume2 = chunkVolume(userSize, nRanks, rank, 1);
|
||||
send(phase2Conn, srcVolume2.offset, srcVolume2.offset, srcVolume2.size);
|
||||
recv(phase2Conn);
|
||||
|
||||
}
|
||||
|
||||
__device__ void send(mscclppDevConn_t& conn, size_t srcOffset, size_t dstOffset, size_t size) {
|
||||
__device__ void send(mscclppDevConn_t& conn, size_t srcOffset, size_t dstOffset, size_t size)
|
||||
{
|
||||
if (threadIdx.x == 0) {
|
||||
volatile uint64_t *localFlag = conn.localFlag;
|
||||
volatile uint64_t* localFlag = conn.localFlag;
|
||||
*localFlag = 1; // 1 is used to signal the send
|
||||
|
||||
mscclppTrigger_t trigger;
|
||||
auto request = conn.fifo.getTrigger(&trigger);
|
||||
conn.fifo.setTrigger(trigger, mscclppData | mscclppFlag, srcOffset * sizeof(T), dstOffset * sizeof(T), size * sizeof(T));
|
||||
conn.fifo.setTrigger(trigger, mscclppData | mscclppFlag, srcOffset * sizeof(T), dstOffset * sizeof(T),
|
||||
size * sizeof(T));
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__device__ void recv(mscclppDevConn_t& conn) {
|
||||
__device__ void recv(mscclppDevConn_t& conn)
|
||||
{
|
||||
if (threadIdx.x == 0) {
|
||||
volatile uint64_t *proxyFlag = conn.proxyFlag;
|
||||
while (*proxyFlag != 1) {}
|
||||
volatile uint64_t* proxyFlag = conn.proxyFlag;
|
||||
while (*proxyFlag != 1) {
|
||||
}
|
||||
*proxyFlag = 0;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__host__ __device__ int numPeers() {
|
||||
__host__ __device__ int numPeers()
|
||||
{
|
||||
return nRanks - 1;
|
||||
}
|
||||
|
||||
__host__ __device__ int numBlocks() {
|
||||
__host__ __device__ int numBlocks()
|
||||
{
|
||||
return numPeers();
|
||||
}
|
||||
|
||||
__host__ __device__ int peerIdx(int peerRank, int myRank) {
|
||||
__host__ __device__ int peerIdx(int peerRank, int myRank)
|
||||
{
|
||||
return peerRank < myRank ? peerRank : peerRank - 1;
|
||||
}
|
||||
|
||||
__host__ __device__ int peerRank(int peerIdx, int myRank) {
|
||||
__host__ __device__ int peerRank(int peerIdx, int myRank)
|
||||
{
|
||||
return peerIdx < myRank ? peerIdx : peerIdx + 1;
|
||||
}
|
||||
|
||||
__host__ __device__ int phase1SendConnIdx(int peerRank) {
|
||||
__host__ __device__ int phase1SendConnIdx(int peerRank)
|
||||
{
|
||||
return peerIdx(peerRank, rank) * 3;
|
||||
}
|
||||
|
||||
__host__ __device__ int phase1RecvConnIdx(int peerRank) {
|
||||
__host__ __device__ int phase1RecvConnIdx(int peerRank)
|
||||
{
|
||||
return peerIdx(peerRank, rank) * 3 + 1;
|
||||
}
|
||||
|
||||
__host__ __device__ int phase2ConnIdx(int peerRank) {
|
||||
__host__ __device__ int phase2ConnIdx(int peerRank)
|
||||
{
|
||||
return peerIdx(peerRank, rank) * 3 + 2;
|
||||
}
|
||||
|
||||
void freeGPUResources() {
|
||||
void freeGPUResources()
|
||||
{
|
||||
if (scratch)
|
||||
CUDACHECK(cudaFree(scratch));
|
||||
scratch = nullptr;
|
||||
@@ -160,16 +176,16 @@ struct AllreduceAllpairs {
|
||||
}
|
||||
};
|
||||
|
||||
// The builder class encapsulates the
|
||||
template<class T, void (*reduce)(T*,T*,size_t)>
|
||||
class AllreduceAllpairsBuilder {
|
||||
// The builder class encapsulates the
|
||||
template <class T, void (*reduce)(T*, T*, size_t)> class AllreduceAllpairsBuilder
|
||||
{
|
||||
AllreduceAllpairs<T, reduce> d;
|
||||
std::vector<mscclppDevConn_t> hostConns;
|
||||
|
||||
public:
|
||||
|
||||
// The constructor is called after the user has allocated the buffer to be allreduced
|
||||
AllreduceAllpairsBuilder(T* data, size_t size) {
|
||||
AllreduceAllpairsBuilder(T* data, size_t size)
|
||||
{
|
||||
d.userData = data;
|
||||
d.userSize = size;
|
||||
d.scratch = nullptr;
|
||||
@@ -179,7 +195,8 @@ public:
|
||||
}
|
||||
|
||||
// connect is called after rank initialization but before connection setup
|
||||
mscclppResult_t connect(mscclppComm_t comm) {
|
||||
mscclppResult_t connect(mscclppComm_t comm)
|
||||
{
|
||||
MSCCLPPCHECK(mscclppCommRank(comm, &d.rank));
|
||||
MSCCLPPCHECK(mscclppCommSize(comm, &d.nRanks));
|
||||
|
||||
@@ -195,47 +212,55 @@ public:
|
||||
if (peer != d.rank) {
|
||||
int sendTag = d.rank < peer ? 0 : 1;
|
||||
int recvTag = d.rank < peer ? 1 : 0;
|
||||
MSCCLPPCHECK(mscclppConnect(comm, hostConns.data() + d.phase1SendConnIdx(peer), peer, d.userData, d.userSize * sizeof(T), d.connFlags + 0, sendTag, mscclppTransportP2P, nullptr));
|
||||
MSCCLPPCHECK(mscclppConnect(comm, hostConns.data() + d.phase1RecvConnIdx(peer), peer, d.scratch, d.scratchSize * sizeof(T), d.connFlags + 1, recvTag, mscclppTransportP2P, nullptr));
|
||||
MSCCLPPCHECK(mscclppConnect(comm, hostConns.data() + d.phase2ConnIdx(peer), peer, d.userData, d.userSize * sizeof(T), d.connFlags + 2, 2, mscclppTransportP2P, nullptr));
|
||||
MSCCLPPCHECK(mscclppConnect(comm, hostConns.data() + d.phase1SendConnIdx(peer), peer, d.userData,
|
||||
d.userSize * sizeof(T), d.connFlags + 0, sendTag, mscclppTransportP2P, nullptr));
|
||||
MSCCLPPCHECK(mscclppConnect(comm, hostConns.data() + d.phase1RecvConnIdx(peer), peer, d.scratch,
|
||||
d.scratchSize * sizeof(T), d.connFlags + 1, recvTag, mscclppTransportP2P, nullptr));
|
||||
MSCCLPPCHECK(mscclppConnect(comm, hostConns.data() + d.phase2ConnIdx(peer), peer, d.userData,
|
||||
d.userSize * sizeof(T), d.connFlags + 2, 2, mscclppTransportP2P, nullptr));
|
||||
}
|
||||
}
|
||||
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
// finishSetup is called after connection setup and returns an algorithm object that is ready to be passed to a GPU kernel
|
||||
AllreduceAllpairs<T, reduce> finishSetup() {
|
||||
// finishSetup is called after connection setup and returns an algorithm object that is ready to be passed to a GPU
|
||||
// kernel
|
||||
AllreduceAllpairs<T, reduce> finishSetup()
|
||||
{
|
||||
CUDACHECK(cudaMalloc(&d.conns, hostConns.size() * sizeof(mscclppDevConn_t)));
|
||||
CUDACHECK(cudaMemcpy(d.conns, hostConns.data(), hostConns.size() * sizeof(mscclppDevConn_t), cudaMemcpyHostToDevice));
|
||||
CUDACHECK(
|
||||
cudaMemcpy(d.conns, hostConns.data(), hostConns.size() * sizeof(mscclppDevConn_t), cudaMemcpyHostToDevice));
|
||||
CUDACHECK(cudaMalloc(&d.barrier, sizeof(cuda::barrier<cuda::thread_scope_device>)));
|
||||
cuda::barrier<cuda::thread_scope_device> initBarrier(d.numBlocks());
|
||||
CUDACHECK(cudaMemcpy(d.barrier, &initBarrier, sizeof(cuda::barrier<cuda::thread_scope_device>), cudaMemcpyHostToDevice));
|
||||
CUDACHECK(
|
||||
cudaMemcpy(d.barrier, &initBarrier, sizeof(cuda::barrier<cuda::thread_scope_device>), cudaMemcpyHostToDevice));
|
||||
return d;
|
||||
}
|
||||
};
|
||||
|
||||
template<class T>
|
||||
__device__ void reduceSum(T* dst, T* src, size_t size) {
|
||||
template <class T> __device__ void reduceSum(T* dst, T* src, size_t size)
|
||||
{
|
||||
for (int i = threadIdx.x; i < size; i += blockDim.x) {
|
||||
dst[i] += src[i];
|
||||
}
|
||||
}
|
||||
|
||||
template<class T>
|
||||
__global__ void init(T* data, size_t size, int rank) {
|
||||
template <class T> __global__ void init(T* data, size_t size, int rank)
|
||||
{
|
||||
for (int i = threadIdx.x; i < size; i += blockDim.x) {
|
||||
data[i] = rank;
|
||||
}
|
||||
}
|
||||
|
||||
// The main test kernel
|
||||
template<class T>
|
||||
__global__ void testKernel(AllreduceAllpairs<T, reduceSum> d) {
|
||||
template <class T> __global__ void testKernel(AllreduceAllpairs<T, reduceSum> d)
|
||||
{
|
||||
d.run(blockIdx.x);
|
||||
}
|
||||
|
||||
int main(int argc, const char *argv[]) {
|
||||
int main(int argc, const char* argv[])
|
||||
{
|
||||
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
|
||||
MPI_Init(NULL, NULL);
|
||||
#endif
|
||||
@@ -246,14 +271,14 @@ int main(int argc, const char *argv[]) {
|
||||
CUDACHECK(cudaSetDevice(rank));
|
||||
|
||||
// Allocate and initialize 1 MB of data
|
||||
int *data;
|
||||
int* data;
|
||||
size_t dataSize = 1024 * 1024 / sizeof(int);
|
||||
CUDACHECK(cudaMalloc(&data, dataSize * sizeof(int)));
|
||||
init<<<1, 256>>>(data, dataSize, rank);
|
||||
|
||||
|
||||
// Create the collective
|
||||
AllreduceAllpairsBuilder<int, reduceSum> builder(data, dataSize);
|
||||
|
||||
|
||||
// Create the communicator
|
||||
mscclppComm_t comm;
|
||||
MSCCLPPCHECK(mscclppCommInitRank(&comm, world_size, rank, ip_port));
|
||||
@@ -268,7 +293,7 @@ int main(int argc, const char *argv[]) {
|
||||
|
||||
// Run the collective
|
||||
testKernel<<<allreduce.numBlocks(), 256>>>(allreduce);
|
||||
|
||||
|
||||
// Wait for kernel to finish
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
|
||||
|
||||
@@ -4,19 +4,20 @@
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <string>
|
||||
#include <unistd.h>
|
||||
|
||||
#define MSCCLPPCHECK(call) do { \
|
||||
mscclppResult_t res = call; \
|
||||
if (res != mscclppSuccess && res != mscclppInProgress) { \
|
||||
/* Print the back trace*/ \
|
||||
printf("Failure at %s:%d -> %d\n", __FILE__, __LINE__, res); \
|
||||
return res; \
|
||||
} \
|
||||
} while (0);
|
||||
#define MSCCLPPCHECK(call) \
|
||||
do { \
|
||||
mscclppResult_t res = call; \
|
||||
if (res != mscclppSuccess && res != mscclppInProgress) { \
|
||||
/* Print the back trace*/ \
|
||||
printf("Failure at %s:%d -> %d\n", __FILE__, __LINE__, res); \
|
||||
return res; \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
void print_usage(const char *prog)
|
||||
void print_usage(const char* prog)
|
||||
{
|
||||
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
|
||||
std::string st = "you are using MPI for this test\n";
|
||||
@@ -33,7 +34,7 @@ void print_usage(const char *prog)
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, const char *argv[])
|
||||
int main(int argc, const char* argv[])
|
||||
{
|
||||
if (argc >= 2 && (std::string(argv[1]) == "-h" || std::string(argv[1]) == "--help")) {
|
||||
print_usage(argv[0]);
|
||||
@@ -48,7 +49,7 @@ int main(int argc, const char *argv[])
|
||||
MPI_Init(NULL, NULL);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
|
||||
const char *ip_port;
|
||||
const char* ip_port;
|
||||
if (argc == 2)
|
||||
ip_port = argv[1];
|
||||
else
|
||||
@@ -58,7 +59,7 @@ int main(int argc, const char *argv[])
|
||||
print_usage(argv[0]);
|
||||
return -1;
|
||||
}
|
||||
const char *ip_port = argv[1];
|
||||
const char* ip_port = argv[1];
|
||||
rank = atoi(argv[2]);
|
||||
world_size = atoi(argv[3]);
|
||||
#endif
|
||||
@@ -70,7 +71,8 @@ int main(int argc, const char *argv[])
|
||||
} else {
|
||||
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
|
||||
mscclppUniqueId id;
|
||||
if (rank == 0) MSCCLPPCHECK(mscclppGetUniqueId(&id));
|
||||
if (rank == 0)
|
||||
MSCCLPPCHECK(mscclppGetUniqueId(&id));
|
||||
MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
|
||||
MSCCLPPCHECK(mscclppCommInitRankFromId(&comm, world_size, id, rank));
|
||||
#else
|
||||
@@ -80,7 +82,7 @@ int main(int argc, const char *argv[])
|
||||
}
|
||||
|
||||
// allocate some test buffer
|
||||
int *buf = (int *)calloc(world_size, sizeof(int));
|
||||
int* buf = (int*)calloc(world_size, sizeof(int));
|
||||
if (buf == nullptr) {
|
||||
printf("calloc failed\n");
|
||||
return -1;
|
||||
@@ -101,7 +103,7 @@ int main(int argc, const char *argv[])
|
||||
MSCCLPPCHECK(mscclppCommDestroy(comm));
|
||||
|
||||
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
|
||||
MPI_Finalize();
|
||||
MPI_Finalize();
|
||||
#endif
|
||||
|
||||
printf("Rank %d Succeeded\n", rank);
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#include "mpi.h"
|
||||
#endif // MSCCLPP_USE_MPI_FOR_TESTS
|
||||
|
||||
void print_usage(const char *prog)
|
||||
void print_usage(const char* prog)
|
||||
{
|
||||
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
|
||||
printf("usage: %s IP:PORT [rank nranks]\n", prog);
|
||||
@@ -17,7 +17,8 @@ void print_usage(const char *prog)
|
||||
#endif
|
||||
}
|
||||
|
||||
void parse_arguments(int argc, const char *argv[], const char** ip_port, int* rank, int* world_size) {
|
||||
void parse_arguments(int argc, const char* argv[], const char** ip_port, int* rank, int* world_size)
|
||||
{
|
||||
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
|
||||
if (argc != 2 && argc != 4) {
|
||||
print_usage(argv[0]);
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
#include "mscclpp.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <string>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
@@ -10,23 +10,25 @@
|
||||
#define USE_DMA_FOR_P2P 1
|
||||
#define TEST_CONN_TYPE 0 // 0: P2P(for local)+IB(for remote), 1: IB-Only
|
||||
|
||||
#define MSCCLPPCHECK(call) do { \
|
||||
mscclppResult_t res = call; \
|
||||
if (res != mscclppSuccess && res != mscclppInProgress) { \
|
||||
/* Print the back trace*/ \
|
||||
printf("Failure at %s:%d -> %d\n", __FILE__, __LINE__, res); \
|
||||
return res; \
|
||||
} \
|
||||
} while (0);
|
||||
#define MSCCLPPCHECK(call) \
|
||||
do { \
|
||||
mscclppResult_t res = call; \
|
||||
if (res != mscclppSuccess && res != mscclppInProgress) { \
|
||||
/* Print the back trace*/ \
|
||||
printf("Failure at %s:%d -> %d\n", __FILE__, __LINE__, res); \
|
||||
return res; \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
// Check CUDA RT calls
|
||||
#define CUDACHECK(cmd) do { \
|
||||
cudaError_t err = cmd; \
|
||||
if( err != cudaSuccess ) { \
|
||||
printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while(false)
|
||||
#define CUDACHECK(cmd) \
|
||||
do { \
|
||||
cudaError_t err = cmd; \
|
||||
if (err != cudaSuccess) { \
|
||||
printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
// Measure current time in second.
|
||||
static double getTime(void)
|
||||
@@ -43,17 +45,18 @@ __constant__ mscclppDevConn_t constDevConns[16];
|
||||
|
||||
__global__ void kernel(int rank, int world_size)
|
||||
{
|
||||
if (threadIdx.x % 32 != 0) return;
|
||||
if (threadIdx.x % 32 != 0)
|
||||
return;
|
||||
|
||||
int warpId = threadIdx.x / 32;
|
||||
int remoteRank = (warpId < rank) ? warpId : warpId + 1;
|
||||
mscclppDevConn_t devConn = constDevConns[remoteRank];
|
||||
volatile int *data = (volatile int *)devConn.localBuff;
|
||||
volatile uint64_t *localFlag = devConn.localFlag;
|
||||
volatile int* data = (volatile int*)devConn.localBuff;
|
||||
volatile uint64_t* localFlag = devConn.localFlag;
|
||||
#if (USE_DMA_FOR_P2P == 0)
|
||||
volatile uint64_t *remoteFlag = devConn.remoteFlag;
|
||||
volatile uint64_t* remoteFlag = devConn.remoteFlag;
|
||||
#endif
|
||||
volatile uint64_t *proxyFlag = devConn.proxyFlag;
|
||||
volatile uint64_t* proxyFlag = devConn.proxyFlag;
|
||||
|
||||
uint64_t baseFlag = *localFlag;
|
||||
|
||||
@@ -83,7 +86,8 @@ __global__ void kernel(int rank, int world_size)
|
||||
devConn.fifo.sync(req);
|
||||
|
||||
// Wait for receiving data from remote rank
|
||||
while (*proxyFlag == baseFlag) {}
|
||||
while (*proxyFlag == baseFlag) {
|
||||
}
|
||||
|
||||
#else // USE_DMA_FOR_P2P == 0
|
||||
|
||||
@@ -95,13 +99,15 @@ __global__ void kernel(int rank, int world_size)
|
||||
devConn.setTrigger(trig, mscclppFlag | mscclppData, rank * sizeof(int), sizeof(int));
|
||||
|
||||
// Wait for receiving data from remote rank
|
||||
while (*proxyFlag == baseFlag) {}
|
||||
while (*proxyFlag == baseFlag) {
|
||||
}
|
||||
} else { // P2P
|
||||
// Directly read data
|
||||
volatile int *remoteData = (volatile int *)devConn.remoteBuff;
|
||||
volatile int* remoteData = (volatile int*)devConn.remoteBuff;
|
||||
|
||||
// Wait until the remote data is set
|
||||
while (*remoteFlag == baseFlag) {}
|
||||
while (*remoteFlag == baseFlag) {
|
||||
}
|
||||
|
||||
// Read remote data
|
||||
data[remoteRank] = remoteData[remoteRank];
|
||||
@@ -146,7 +152,7 @@ int cudaNumToIbNum(int cudaNum)
|
||||
return ibNum;
|
||||
}
|
||||
|
||||
int main(int argc, const char *argv[])
|
||||
int main(int argc, const char* argv[])
|
||||
{
|
||||
#ifdef MSCCLPP_USE_MPI_FOR_TESTS
|
||||
MPI_Init(NULL, NULL);
|
||||
@@ -165,8 +171,8 @@ int main(int argc, const char *argv[])
|
||||
mscclppComm_t comm;
|
||||
MSCCLPPCHECK(mscclppCommInitRank(&comm, world_size, rank, ip_port));
|
||||
|
||||
int *data_d;
|
||||
uint64_t *flag_d;
|
||||
int* data_d;
|
||||
uint64_t* flag_d;
|
||||
size_t data_size = sizeof(int) * world_size;
|
||||
CUDACHECK(cudaMalloc(&data_d, data_size));
|
||||
CUDACHECK(cudaMalloc(&flag_d, sizeof(uint64_t)));
|
||||
@@ -174,9 +180,10 @@ int main(int argc, const char *argv[])
|
||||
CUDACHECK(cudaMemset(flag_d, 0, sizeof(uint64_t)));
|
||||
|
||||
for (int r = 0; r < world_size; ++r) {
|
||||
if (r == rank) continue;
|
||||
if (r == rank)
|
||||
continue;
|
||||
mscclppTransport_t transportType = mscclppTransportIB;
|
||||
const char *ibDev = ibDevStr.c_str();
|
||||
const char* ibDev = ibDevStr.c_str();
|
||||
#if (TEST_CONN_TYPE == 0) // P2P+IB
|
||||
if (rankToNode(r) == thisNode) {
|
||||
transportType = mscclppTransportP2P;
|
||||
@@ -191,7 +198,7 @@ int main(int argc, const char *argv[])
|
||||
|
||||
MSCCLPPCHECK(mscclppProxyLaunch(comm));
|
||||
|
||||
mscclppDevConn_t *devConns;
|
||||
mscclppDevConn_t* devConns;
|
||||
int nCons;
|
||||
MSCCLPPCHECK(mscclppGetAllDeviceConnections(comm, &devConns, &nCons));
|
||||
|
||||
@@ -204,7 +211,7 @@ int main(int argc, const char *argv[])
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
|
||||
// Read results from GPU
|
||||
int *buf = (int *)calloc(world_size, sizeof(int));
|
||||
int* buf = (int*)calloc(world_size, sizeof(int));
|
||||
if (buf == nullptr) {
|
||||
printf("calloc failed\n");
|
||||
return -1;
|
||||
@@ -230,9 +237,9 @@ int main(int argc, const char *argv[])
|
||||
|
||||
// warm up
|
||||
// int warmupiter = 10;
|
||||
// for (int i = 0; i < warmupiter; ++i) {
|
||||
// kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
|
||||
// }
|
||||
// for (int i = 0; i < warmupiter; ++i) {
|
||||
// kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
|
||||
// }
|
||||
|
||||
// cudaGraph Capture
|
||||
cudaGraph_t graph;
|
||||
@@ -240,32 +247,32 @@ int main(int argc, const char *argv[])
|
||||
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
|
||||
int cudagraphiter = 100;
|
||||
for (int i = 0; i < cudagraphiter; ++i) {
|
||||
kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
|
||||
kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
|
||||
}
|
||||
cudaStreamEndCapture(stream, &graph);
|
||||
cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
|
||||
|
||||
int cudagraphwarmup = 10;
|
||||
for (int i = 0; i < cudagraphwarmup; ++i) {
|
||||
cudaGraphLaunch(instance, stream);
|
||||
cudaGraphLaunch(instance, stream);
|
||||
}
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
// measure runtime
|
||||
// CUDACHECK(cudaEventRecord(ev_start, stream));
|
||||
// measure runtime
|
||||
// CUDACHECK(cudaEventRecord(ev_start, stream));
|
||||
double t0 = getTime();
|
||||
int cudagraphlaunch = 10;
|
||||
for (int i = 0; i < cudagraphlaunch; ++i) {
|
||||
// kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
|
||||
cudaGraphLaunch(instance, stream);
|
||||
// kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
|
||||
cudaGraphLaunch(instance, stream);
|
||||
}
|
||||
// CUDACHECK(cudaEventRecord(ev_end, stream));
|
||||
// CUDACHECK(cudaEventRecord(ev_end, stream));
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
double t1 = getTime();
|
||||
float ms = (t1-t0)*1000.0;
|
||||
// CUDACHECK(cudaEventElapsedTime(&ms, ev_start, ev_end));
|
||||
printf("rank: %d, time: %f us/iter\n", rank, ms * 1000. / (float) cudagraphlaunch / (float) cudagraphiter);
|
||||
float ms = (t1 - t0) * 1000.0;
|
||||
// CUDACHECK(cudaEventElapsedTime(&ms, ev_start, ev_end));
|
||||
printf("rank: %d, time: %f us/iter\n", rank, ms * 1000. / (float)cudagraphlaunch / (float)cudagraphiter);
|
||||
|
||||
MSCCLPPCHECK(mscclppProxyStop(comm));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user