From 38cd87cdcc64964c245bf8d518d77e6ff023ae20 Mon Sep 17 00:00:00 2001 From: Felipe Petroski Such Date: Fri, 7 Apr 2023 12:26:56 -0700 Subject: [PATCH 001/135] add memory region functions --- src/include/ib.h | 15 -------- src/include/mscclpp.h | 30 +++++++++++++++- src/init.cc | 83 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+), 16 deletions(-) diff --git a/src/include/ib.h b/src/include/ib.h index 70d219b6..7494ab11 100644 --- a/src/include/ib.h +++ b/src/include/ib.h @@ -12,21 +12,6 @@ #define MSCCLPP_IB_MAX_SENDS 64 #define MSCCLPP_IB_MAX_DEVS 8 -// MR info to be shared with the remote peer -struct mscclppIbMrInfo -{ - uint64_t addr; - uint32_t rkey; -}; - -// IB memory region -struct mscclppIbMr -{ - struct ibv_mr* mr; - void* buff; - struct mscclppIbMrInfo info; -}; - // QP info to be shared with the remote peer struct mscclppIbQpInfo { diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index de6edbc3..1477258d 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -13,9 +13,11 @@ #include #include +#include #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif /*************************************************************************************************************** @@ -174,6 +176,32 @@ typedef struct char internal[MSCCLPP_UNIQUE_ID_BYTES]; } mscclppUniqueId; +// MR info to be shared with the remote peer +struct mscclppIbMrInfo +{ + uint64_t addr; + uint32_t rkey; +}; + +// IB memory region +struct mscclppIbMr +{ + struct ibv_mr* mr; + void* buff; + struct mscclppIbMrInfo info; +}; + +struct mscclppRegisteredMemoryP2P +{ + void* remoteBuff; + mscclppIbMr* IbMr; +}; + +struct mscclppRegisteredMemory +{ + std::vector p2p; +}; + /* Error type */ typedef enum { diff --git a/src/init.cc b/src/init.cc index 2c6db009..b631e82f 100644 --- a/src/init.cc +++ b/src/init.cc @@ -560,6 +560,89 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) return mscclppSuccess; } +struct bufferInfo +{ + 
cudaIpcMemHandle_t handleBuff; + mscclppIbMrInfo infoBuffMr; +}; + +MSCCLPP_API(mscclppResult_t, mscclppRegisterBuffer, mscclppComm_t comm, void* local_memory, size_t size, + mscclppRegisteredMemory* regMem); +mscclppResult_t mscclppRegisterBuffer(mscclppComm_t comm, void* local_memory, size_t size, + mscclppRegisteredMemory* regMem) +{ + std::vector ibMrs; + for (int i = 0; i < comm->nConns; ++i) { + struct mscclppConn* conn = &comm->conns[i]; + struct bufferInfo bInfo; + struct mscclppIbMr* ibBuffMr; + + // TODO: (conn->transport & mscclppTransportP2P) to support both P2P and IB + if (conn->transport == mscclppTransportP2P) { + CUDACHECK(cudaIpcGetMemHandle(&bInfo.handleBuff, local_memory)); + } else if (conn->transport == mscclppTransportIB) { + MSCCLPPCHECK(mscclppIbContextRegisterMr(conn->ibCtx, local_memory, size, &ibBuffMr)); + bInfo.infoBuffMr = ibBuffMr->info; + ibMrs.push_back(ibBuffMr); + } + + MSCCLPPCHECK(bootstrapSend(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &bInfo, sizeof(bInfo))); + } + + // Recv info from peers + for (int i = 0; i < comm->nConns; ++i) { + struct mscclppConn* conn = &comm->conns[i]; + struct bufferInfo bInfo; + + mscclppRegisteredMemoryP2P p2p; + p2p.IbMr = NULL; + p2p.remoteBuff = NULL; + MSCCLPPCHECK(bootstrapRecv(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &bInfo, sizeof(bInfo))); + + // TODO: (conn->transport & mscclppTransportP2P) to support both P2P and IB + if (conn->transport == mscclppTransportP2P) { + CUDACHECK(cudaIpcOpenMemHandle((void**)&p2p.remoteBuff, bInfo.handleBuff, cudaIpcMemLazyEnablePeerAccess)); + } else if (conn->transport == mscclppTransportIB) { + p2p.IbMr = ibMrs[i]; + } + regMem->p2p.push_back(p2p); + } + return mscclppSuccess; +} + +MSCCLPP_API(mscclppResult_t, mscclppRegisteredBufferWrite, mscclppComm_t comm, mscclppRegisteredMemory* regMem, + void* srcBuff, size_t size, uint32_t srcOffset, uint32_t dstOffset, int64_t stream); +mscclppResult_t 
mscclppRegisteredBufferWrite(mscclppComm_t comm, mscclppRegisteredMemory* regMem, void* srcBuff, + size_t size, uint32_t srcOffset, uint32_t dstOffset, int64_t stream) +{ + int ret = 0; + // TODO: transport should be an argument too so user can decide which transport to use + for (int i = 0; i < comm->nConns; ++i) { + struct mscclppConn* conn = &comm->conns[i]; + // TODO: (conn->transport & mscclppTransportP2P) to support both P2P and IB + if (conn->transport == mscclppTransportP2P) { + // TODO: check errors + void* dstBuff = regMem->p2p[i].remoteBuff; + cudaMemcpyAsync(dstBuff, srcBuff, size, cudaMemcpyDeviceToDevice, (cudaStream_t)stream); + } else { + conn->ibQp->stageSend(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)size, + /*wrId=*/0, /*srcOffset=*/srcOffset, + /*dstOffset=*/dstOffset, + /*signaled=*/false); + if ((ret = conn->ibQp->postSend()) != 0) { + // Return value is errno. + WARN("data postSend failed: errno %d", ret); + } + // ?? + // npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_ENTRY, (uint32_t)trigger.fields.dataSize, + // trigger.fields.connId); + } + } + return mscclppSuccess; +} + +// TODO: destroy registered buffer + MSCCLPP_API(mscclppResult_t, mscclppProxyLaunch, mscclppComm_t comm); mscclppResult_t mscclppProxyLaunch(mscclppComm_t comm) { From cc8c30f95817181e3b9cd5f21d7e03d1cc89a7a1 Mon Sep 17 00:00:00 2001 From: Felipe Petroski Such Date: Fri, 7 Apr 2023 12:34:22 -0700 Subject: [PATCH 002/135] error checking --- src/init.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/init.cc b/src/init.cc index b631e82f..2eb4fd4a 100644 --- a/src/init.cc +++ b/src/init.cc @@ -621,9 +621,8 @@ mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, mscclppRegister struct mscclppConn* conn = &comm->conns[i]; // TODO: (conn->transport & mscclppTransportP2P) to support both P2P and IB if (conn->transport == mscclppTransportP2P) { - // TODO: check errors void* dstBuff = regMem->p2p[i].remoteBuff; - 
cudaMemcpyAsync(dstBuff, srcBuff, size, cudaMemcpyDeviceToDevice, (cudaStream_t)stream); + CUDACHECK(cudaMemcpyAsync(dstBuff, srcBuff, size, cudaMemcpyDeviceToDevice, (cudaStream_t)stream)); } else { conn->ibQp->stageSend(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, From 34464b40bb2eb9fff05763f705170145a5907281 Mon Sep 17 00:00:00 2001 From: Crutcher Dunnavant Date: Fri, 7 Apr 2023 19:11:50 -0700 Subject: [PATCH 003/135] register buffers --- python/src/_py_mscclpp.cpp | 90 +++++++++++++++------- python/src/mscclpp/__init__.py | 47 ++++++++--- python/src/mscclpp/test_mscclpp.py | 4 +- python/src/mscclpp/tests/bootstrap_test.py | 11 ++- src/include/mscclpp.h | 27 +++++++ 5 files changed, 137 insertions(+), 42 deletions(-) diff --git a/python/src/_py_mscclpp.cpp b/python/src/_py_mscclpp.cpp index 39bc52e3..b6fdd9ed 100644 --- a/python/src/_py_mscclpp.cpp +++ b/python/src/_py_mscclpp.cpp @@ -1,10 +1,9 @@ +#include #include #include #include #include -#include - #include #include #include @@ -71,14 +70,14 @@ void checkResult( } } -#define RETRY(C, ...) \ -{ \ - mscclppResult_t res; \ - do { \ - res = (C); \ - } while (res == mscclppInProgress); \ - checkResult(res, __VA_ARGS__); \ -} +#define RETRY(C, ...) \ + { \ + mscclppResult_t res; \ + do { \ + res = (C); \ + } while (res == mscclppInProgress); \ + checkResult(res, __VA_ARGS__); \ + } // Maybe return the value, maybe throw an exception. 
template @@ -98,7 +97,11 @@ struct _Comm { public: _Comm(int rank, int world_size, mscclppComm_t handle) - : _rank(rank), _world_size(world_size), _handle(handle), _is_open(true), _proxies_running(false) {} + : _rank(rank), + _world_size(world_size), + _handle(handle), + _is_open(true), + _proxies_running(false) {} ~_Comm() { close(); } @@ -106,8 +109,8 @@ struct _Comm { void close() { if (_is_open) { if (_proxies_running) { - mscclppProxyStop(_handle); - _proxies_running = false; + mscclppProxyStop(_handle); + _proxies_running = false; } checkResult(mscclppCommDestroy(_handle), "Failed to close comm channel"); _handle = NULL; @@ -124,6 +127,21 @@ struct _Comm { } }; +struct _P2PHandle { + struct mscclppRegisteredMemoryP2P _rmP2P; + struct mscclppIbMr _ibmr; + + _P2PHandle() : _rmP2P({0}), _ibmr({0}) {} + + _P2PHandle(const mscclppRegisteredMemoryP2P &p2p): _ibmr({0}) { + _rmP2P = p2p; + if (_rmP2P.IbMr != nullptr) { + _ibmr = *_rmP2P.IbMr; + _rmP2P.IbMr = &_ibmr; + } + } +}; + nb::callable _log_callback; void _LogHandler(const char* msg) { @@ -138,6 +156,8 @@ static const std::string DOC_MscclppUniqueId = static const std::string DOC__Comm = "MSCCLPP Communications Handle"; +static const std::string DOC__P2PHandle = "MSCCLPP P2P MR Handle"; + NB_MODULE(_py_mscclpp, m) { m.doc() = "Python bindings for MSCCLPP: which is not NCCL"; @@ -188,6 +208,9 @@ NB_MODULE(_py_mscclpp, m) { return nb::bytes(id.internal, sizeof(id.internal)); }); + nb::class_<_P2PHandle>(m, "_P2PHandle") + .def_ro_static("__doc__", &DOC__P2PHandle); + nb::class_<_Comm>(m, "_Comm") .def_ro_static("__doc__", &DOC__Comm) .def_static( @@ -236,6 +259,29 @@ NB_MODULE(_py_mscclpp, m) { "Is this comm object closed?") .def_ro("rank", &_Comm::_rank) .def_ro("world_size", &_Comm::_world_size) + .def( + "register_buffer", + [](_Comm& comm, uint64_t local_buff, uint64_t buff_size) -> std::vector<_P2PHandle> { + comm.check_open(); + mscclppRegisteredMemory regMem; + checkResult( + mscclppRegisterBuffer( + 
comm._handle, + reinterpret_cast(local_buff), + buff_size, + ®Mem), + "Registering buffer failed"); + + std::vector<_P2PHandle> handles; + for (const auto &p2p : regMem.p2p) { + handles.push_back(_P2PHandle(p2p)); + } + return handles; + }, + "local_buf"_a, + "buff_size"_a, + nb::call_guard(), + "Register a buffer for P2P transfers.") .def( "connect", [](_Comm& comm, @@ -244,9 +290,7 @@ NB_MODULE(_py_mscclpp, m) { uint64_t local_buff, uint64_t buff_size, mscclppTransport_t transport_type) -> void { - if (comm._proxies_running) { - throw std::invalid_argument("Proxy Threads Already Running"); - } + comm.check_open(); RETRY( mscclppConnect( comm._handle, @@ -270,8 +314,9 @@ NB_MODULE(_py_mscclpp, m) { "connection_setup", [](_Comm& comm) -> void { comm.check_open(); - RETRY(mscclppConnectionSetup(comm._handle), - "Failed to setup MSCCLPP connection"); + RETRY( + mscclppConnectionSetup(comm._handle), + "Failed to setup MSCCLPP connection"); }, nb::call_guard(), "Run connection setup for MSCCLPP.") @@ -304,15 +349,6 @@ NB_MODULE(_py_mscclpp, m) { "Start the MSCCLPP proxy.") .def("close", &_Comm::close, nb::call_guard()) .def("__del__", &_Comm::close, nb::call_guard()) - .def( - "connection_setup", - [](_Comm& comm) -> void { - comm.check_open(); - checkResult( - mscclppConnectionSetup(comm._handle), - "Connection Setup Failed"); - }, - nb::call_guard()) .def( "bootstrap_all_gather_int", [](_Comm& comm, int val) -> std::vector { diff --git a/python/src/mscclpp/__init__.py b/python/src/mscclpp/__init__.py index cbb84c2c..c6c2ff13 100644 --- a/python/src/mscclpp/__init__.py +++ b/python/src/mscclpp/__init__.py @@ -18,6 +18,7 @@ __all__ = ( ) _Comm = _py_mscclpp._Comm +_P2PHandle = _py_mscclpp._P2PHandle TransportType = _py_mscclpp.TransportType MscclppUniqueId = _py_mscclpp.MscclppUniqueId @@ -46,6 +47,7 @@ MSCCLPP_LOG_LEVELS: set[str] = { "TRACE", } + def _setup_logging(level: str = "INFO"): """Setup log hooks for the C library.""" level = level.upper() @@ -69,11 +71,11 
@@ class Comm: @staticmethod def init_rank_from_address( - address: str, - rank: int, - world_size: int, - *, - port: Optional[int] = None, + address: str, + rank: int, + world_size: int, + *, + port: Optional[int] = None, ): """Initialize a Comm from an address. @@ -154,12 +156,12 @@ class Comm: return [pickle.loads(b) for b in self.all_gather_bytes(pickle.dumps(item))] def connect( - self, - remote_rank: int, - tag: int, - data_ptr, - data_size: int, - transport: int, + self, + remote_rank: int, + tag: int, + data_ptr, + data_size: int, + transport: int, ) -> None: self._comm.connect( remote_rank, @@ -177,3 +179,26 @@ class Comm: def stop_proxies(self) -> None: self._comm.stop_proxies() + + def register_buffer( + self, + data_ptr, + data_size: int, + ) -> list[_P2PHandle]: + return [ + P2PHandle(self, h) for h in self._comm.register_buffer( + data_ptr, + data_size, + ) + ] + + +class P2PHandle: + _comm: Comm + _handle: _P2PHandle + + def __init__(self, + comm: Comm, + handle: _P2PHandle): + self._comm = comm + self._handle = handle diff --git a/python/src/mscclpp/test_mscclpp.py b/python/src/mscclpp/test_mscclpp.py index 6b162f7d..d4159ad5 100644 --- a/python/src/mscclpp/test_mscclpp.py +++ b/python/src/mscclpp/test_mscclpp.py @@ -79,6 +79,8 @@ class CommsTest(unittest.TestCase): if errors: parts = [] for rank, content in errors: - parts.append(f"[rank {rank}]: " + content.decode('utf-8', errors='ignore')) + parts.append( + f"[rank {rank}]: " + content.decode("utf-8", errors="ignore") + ) raise AssertionError("\n\n".join(parts)) diff --git a/python/src/mscclpp/tests/bootstrap_test.py b/python/src/mscclpp/tests/bootstrap_test.py index 6f5c5ec7..0d5d8aa1 100644 --- a/python/src/mscclpp/tests/bootstrap_test.py +++ b/python/src/mscclpp/tests/bootstrap_test.py @@ -73,7 +73,7 @@ def _test_bootstrap_allgather_pickle(options: argparse.Namespace, comm: mscclpp. 
comm.connection_setup() -def _test_p2p_connect(options: argparse.Namespace, comm: mscclpp.Comm): +def _test_rm(options: argparse.Namespace, comm: mscclpp.Comm): rank = options.rank buf = torch.zeros([options.world_size], dtype=torch.int64) @@ -95,6 +95,12 @@ def _test_p2p_connect(options: argparse.Namespace, comm: mscclpp.Comm): mscclpp.TransportType.P2P, ) + handles = comm.register_buffer(buf.data_ptr(), buf.element_size() * buf.numel()) + hamcrest.assert_that( + handles, + hamcrest.has_length(options.world_size - 1), + ) + torch.cuda.synchronize() comm.connection_setup() @@ -103,7 +109,6 @@ def _test_p2p_connect(options: argparse.Namespace, comm: mscclpp.Comm): comm.stop_proxies() - def main(): p = argparse.ArgumentParser() p.add_argument("--rank", type=int, required=True) @@ -131,7 +136,7 @@ def main(): _test_bootstrap_allgather_bytes(options, comm) _test_bootstrap_allgather_json(options, comm) _test_bootstrap_allgather_pickle(options, comm) - _test_p2p_connect(options, comm) + _test_rm(options, comm) finally: comm.close() diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 1477258d..eba5dccb 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -405,6 +405,33 @@ void mscclppDefaultLogHandler(const char* msg); */ mscclppResult_t mscclppSetLogHandler(mscclppLogHandler_t handler); +/* Register a buffer for RDMA. + * + * Outputs: + * regMem: the registered memory + * + * Inputs: + * comm: the communicator + * local_memory: the local buffer to be registered + * size: the size of the buffer + */ +mscclppResult_t mscclppRegisterBuffer(mscclppComm_t comm, void* local_memory, size_t size, + mscclppRegisteredMemory* regMem); + +/* Write to a registered buffer. 
+ * + * Inputs: + * comm: the communicator + * regMem: the registered memory + * srcBuff: the source buffer + * size: the size of the buffer + * srcOffset: the offset of the source buffer + * dstOffset: the offset of the destination buffer + * stream: the CUDA stream + */ +mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, mscclppRegisteredMemory* regMem, void* srcBuff, + size_t size, uint32_t srcOffset, uint32_t dstOffset, int64_t stream); + #ifdef __cplusplus } // end extern "C" #endif From 00d382dbf72d869c12b66b6b7b3f2ebb76b968b7 Mon Sep 17 00:00:00 2001 From: Crutcher Dunnavant Date: Fri, 7 Apr 2023 19:12:05 -0700 Subject: [PATCH 004/135] format --- python/src/_py_mscclpp.cpp | 26 +++++++++++++------------ python/src/mscclpp/__init__.py | 35 +++++++++++++++++----------------- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/python/src/_py_mscclpp.cpp b/python/src/_py_mscclpp.cpp index b6fdd9ed..3171122d 100644 --- a/python/src/_py_mscclpp.cpp +++ b/python/src/_py_mscclpp.cpp @@ -128,18 +128,18 @@ struct _Comm { }; struct _P2PHandle { - struct mscclppRegisteredMemoryP2P _rmP2P; - struct mscclppIbMr _ibmr; + struct mscclppRegisteredMemoryP2P _rmP2P; + struct mscclppIbMr _ibmr; - _P2PHandle() : _rmP2P({0}), _ibmr({0}) {} + _P2PHandle() : _rmP2P({0}), _ibmr({0}) {} - _P2PHandle(const mscclppRegisteredMemoryP2P &p2p): _ibmr({0}) { - _rmP2P = p2p; - if (_rmP2P.IbMr != nullptr) { - _ibmr = *_rmP2P.IbMr; - _rmP2P.IbMr = &_ibmr; - } + _P2PHandle(const mscclppRegisteredMemoryP2P& p2p) : _ibmr({0}) { + _rmP2P = p2p; + if (_rmP2P.IbMr != nullptr) { + _ibmr = *_rmP2P.IbMr; + _rmP2P.IbMr = &_ibmr; } + } }; nb::callable _log_callback; @@ -261,7 +261,9 @@ NB_MODULE(_py_mscclpp, m) { .def_ro("world_size", &_Comm::_world_size) .def( "register_buffer", - [](_Comm& comm, uint64_t local_buff, uint64_t buff_size) -> std::vector<_P2PHandle> { + [](_Comm& comm, + uint64_t local_buff, + uint64_t buff_size) -> std::vector<_P2PHandle> { comm.check_open(); 
mscclppRegisteredMemory regMem; checkResult( @@ -273,8 +275,8 @@ NB_MODULE(_py_mscclpp, m) { "Registering buffer failed"); std::vector<_P2PHandle> handles; - for (const auto &p2p : regMem.p2p) { - handles.push_back(_P2PHandle(p2p)); + for (const auto& p2p : regMem.p2p) { + handles.push_back(_P2PHandle(p2p)); } return handles; }, diff --git a/python/src/mscclpp/__init__.py b/python/src/mscclpp/__init__.py index c6c2ff13..51c564d8 100644 --- a/python/src/mscclpp/__init__.py +++ b/python/src/mscclpp/__init__.py @@ -71,11 +71,11 @@ class Comm: @staticmethod def init_rank_from_address( - address: str, - rank: int, - world_size: int, - *, - port: Optional[int] = None, + address: str, + rank: int, + world_size: int, + *, + port: Optional[int] = None, ): """Initialize a Comm from an address. @@ -156,12 +156,12 @@ class Comm: return [pickle.loads(b) for b in self.all_gather_bytes(pickle.dumps(item))] def connect( - self, - remote_rank: int, - tag: int, - data_ptr, - data_size: int, - transport: int, + self, + remote_rank: int, + tag: int, + data_ptr, + data_size: int, + transport: int, ) -> None: self._comm.connect( remote_rank, @@ -181,12 +181,13 @@ class Comm: self._comm.stop_proxies() def register_buffer( - self, - data_ptr, - data_size: int, + self, + data_ptr, + data_size: int, ) -> list[_P2PHandle]: return [ - P2PHandle(self, h) for h in self._comm.register_buffer( + P2PHandle(self, h) + for h in self._comm.register_buffer( data_ptr, data_size, ) @@ -197,8 +198,6 @@ class P2PHandle: _comm: Comm _handle: _P2PHandle - def __init__(self, - comm: Comm, - handle: _P2PHandle): + def __init__(self, comm: Comm, handle: _P2PHandle): self._comm = comm self._handle = handle From 426e78997c052057bd481a22f691219853e81286 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 9 Apr 2023 02:20:54 +0000 Subject: [PATCH 005/135] name changes + documentation for clarity --- src/include/comm.h | 8 ++++---- src/include/mscclpp.h | 27 ++++++++++++++++++-------- src/init.cc | 44 
+++++++++++++++++++++---------------------- src/proxy.cc | 4 ++-- tests/p2p_test.cu | 4 ++-- 5 files changed, 49 insertions(+), 38 deletions(-) diff --git a/src/include/comm.h b/src/include/comm.h index b45f4348..f1e3ed47 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -35,11 +35,11 @@ struct mscclppConn struct mscclppIbContext* ibCtx; struct mscclppIbQp* ibQp; struct mscclppIbMr* ibBuffMr; - struct mscclppIbMr* ibLocalFlagMr; - struct mscclppIbMr* ibProxyFlagMr; + struct mscclppIbMr* ibSignalEpochIdMr; + struct mscclppIbMr* ibProxySignalEpochIdMr; struct mscclppIbMrInfo ibBuffMrInfo; - struct mscclppIbMrInfo ibLocalFlagMrInfo; - struct mscclppIbMrInfo ibProxyFlagMrInfo; + struct mscclppIbMrInfo ibSignalEpochIdMrInfo; + struct mscclppIbMrInfo ibProxySignalEpochIdMrInfo; #if defined(ENABLE_NPKIT) std::vector npkitUsedReqIds; std::vector npkitFreeReqIds; diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index eba5dccb..f7261410 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -139,28 +139,39 @@ struct mscclppDevConn __forceinline__ __device__ void wait() { - (*recvEpochId) += 1; - // printf("%llu %llu %llu\n", *(volatile uint64_t*)proxyEpochId, (*recvEpochId), *(volatile uint64_t*)sendEpochId); - while (*(volatile uint64_t*)proxyEpochId < (*recvEpochId)) + (*waitEpochId) += 1; + // printf("%llu %llu %llu\n", *(volatile uint64_t*)proxySignalEpochId, (*waitEpochId), *(volatile uint64_t*)signalEpochId); + while (*(volatile uint64_t*)proxySignalEpochId < (*waitEpochId)) ; } __forceinline__ __device__ void epochIncrement() { - *(volatile uint64_t*)sendEpochId += 1; + *(volatile uint64_t*)signalEpochId += 1; } #endif int remoteRank; int tag; + // my local buffer void* localBuff; - uint64_t* sendEpochId; // this is read and written by the GPU - uint64_t* recvEpochId; // this is the copy of the remote epoch id. 
+ // every signal(), increaments this and either: + // 1) proxy thread pushes it to the remote peer's proxySignalEpochId + // 2) gpu thread directly writes it to remoteSignalEpochId + uint64_t* signalEpochId; + // every wait(), increaments this and then the gpu waits for either: + // 1) proxySignalEpochId to be >= this in case of a proxy thread + // 2) remoteSignalEpochId to be >= this in case of a gpu thread + uint64_t* waitEpochId; + // my remote peer's buffer. only non-NULL with gpu's direct access + // gpu can directly write into it void* remoteBuff; - uint64_t* remoteFlag; - uint64_t* proxyEpochId; // this is only written by the proxy thread + // used by the signal() function directly from gpu + uint64_t* remoteSignalEpochId; + // signal() function triggers the cpu proxy thread to write to it + uint64_t* proxySignalEpochId; // this is a concurrent fifo which is multiple threads from the device // can produce for and the sole proxy thread consumes it. diff --git a/src/init.cc b/src/init.cc index 2eb4fd4a..0cff496a 100644 --- a/src/init.cc +++ b/src/init.cc @@ -183,7 +183,7 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm) for (int i = 0; i < comm->nConns; ++i) { struct mscclppConn* conn = &comm->conns[i]; - MSCCLPPCHECK(mscclppCudaFree(conn->devConn->proxyEpochId)); + MSCCLPPCHECK(mscclppCudaFree(conn->devConn->proxySignalEpochId)); } for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) { @@ -216,8 +216,8 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm) for (int i = 0; i < comm->nConns; i++) { struct mscclppConn* conn = &comm->conns[i]; if (conn) { - MSCCLPPCHECK(mscclppCudaFree(conn->devConn->sendEpochId)); - MSCCLPPCHECK(mscclppCudaFree(conn->devConn->recvEpochId)); + MSCCLPPCHECK(mscclppCudaFree(conn->devConn->signalEpochId)); + MSCCLPPCHECK(mscclppCudaFree(conn->devConn->waitEpochId)); } } @@ -419,8 +419,8 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void conn->devConn = devConn; conn->devConn->localBuff = 
localBuff; - MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->sendEpochId, 1)); - MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->recvEpochId, 1)); + MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->signalEpochId, 1)); + MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->waitEpochId, 1)); conn->devConn->remoteRank = remoteRank; conn->devConn->tag = tag; conn->devConn->fifo.connId = comm->nConns; @@ -443,12 +443,12 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void struct connInfo { cudaIpcMemHandle_t handleBuff; - cudaIpcMemHandle_t handleFlag; - cudaIpcMemHandle_t handleProxyFlag; + cudaIpcMemHandle_t handleSignalEpochId; + cudaIpcMemHandle_t handleProxySignalEpochId; mscclppIbQpInfo infoQp; mscclppIbMrInfo infoBuffMr; - mscclppIbMrInfo infoLocalFlagMr; - mscclppIbMrInfo infoProxyFlagMr; + mscclppIbMrInfo infoSignalEpochIdMr; + mscclppIbMrInfo infoProxySignalEpochIdMr; }; mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*output*/, struct mscclppConn* conn /*input*/) @@ -458,10 +458,10 @@ mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*outpu return mscclppInternalError; } struct mscclppDevConn* devConn = conn->devConn; - MSCCLPPCHECK(mscclppCudaCalloc(&devConn->proxyEpochId, 1)); - CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleProxyFlag, devConn->proxyEpochId)); + MSCCLPPCHECK(mscclppCudaCalloc(&devConn->proxySignalEpochId, 1)); + CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleProxySignalEpochId, devConn->proxySignalEpochId)); CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleBuff, devConn->localBuff)); - CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleFlag, devConn->sendEpochId)); + CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleSignalEpochId, devConn->signalEpochId)); return mscclppSuccess; } @@ -474,9 +474,9 @@ mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/ CUDACHECK( cudaIpcOpenMemHandle((void**)&conn->devConn->remoteBuff, connInfo->handleBuff, 
cudaIpcMemLazyEnablePeerAccess)); CUDACHECK( - cudaIpcOpenMemHandle((void**)&conn->devConn->remoteFlag, connInfo->handleFlag, cudaIpcMemLazyEnablePeerAccess)); + cudaIpcOpenMemHandle((void**)&conn->devConn->remoteSignalEpochId, connInfo->handleSignalEpochId, cudaIpcMemLazyEnablePeerAccess)); CUDACHECK( - cudaIpcOpenMemHandle((void**)&conn->remoteProxyFlag, connInfo->handleProxyFlag, cudaIpcMemLazyEnablePeerAccess)); + cudaIpcOpenMemHandle((void**)&conn->remoteProxyFlag, connInfo->handleProxySignalEpochId, cudaIpcMemLazyEnablePeerAccess)); return mscclppSuccess; } @@ -488,8 +488,8 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output } struct mscclppDevConn* devConn = conn->devConn; devConn->remoteBuff = NULL; - devConn->remoteFlag = NULL; - MSCCLPPCHECK(mscclppCudaCalloc(&devConn->proxyEpochId, 1)); + devConn->remoteSignalEpochId = NULL; + MSCCLPPCHECK(mscclppCudaCalloc(&devConn->proxySignalEpochId, 1)); struct mscclppIbContext* ibCtx = conn->ibCtx; if (conn->ibQp == NULL) { @@ -497,12 +497,12 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output } // TODO(chhwang): can we register only one MR for the following three? 
MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->localBuff, conn->buffSize, &conn->ibBuffMr)); - MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->sendEpochId, sizeof(uint64_t), &conn->ibLocalFlagMr)); - MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->proxyEpochId, sizeof(uint64_t), &conn->ibProxyFlagMr)); + MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->signalEpochId, sizeof(uint64_t), &conn->ibSignalEpochIdMr)); + MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->proxySignalEpochId, sizeof(uint64_t), &conn->ibProxySignalEpochIdMr)); connInfo->infoQp = conn->ibQp->info; connInfo->infoBuffMr = conn->ibBuffMr->info; - connInfo->infoLocalFlagMr = conn->ibLocalFlagMr->info; - connInfo->infoProxyFlagMr = conn->ibProxyFlagMr->info; + connInfo->infoSignalEpochIdMr = conn->ibSignalEpochIdMr->info; + connInfo->infoProxySignalEpochIdMr = conn->ibProxySignalEpochIdMr->info; return mscclppSuccess; } @@ -521,8 +521,8 @@ mscclppResult_t mscclppIbConnectionSetupEnd(struct connInfo* connInfo /*input*/, return mscclppInvalidUsage; } conn->ibBuffMrInfo = connInfo->infoBuffMr; - conn->ibLocalFlagMrInfo = connInfo->infoLocalFlagMr; - conn->ibProxyFlagMrInfo = connInfo->infoProxyFlagMr; + conn->ibSignalEpochIdMrInfo = connInfo->infoSignalEpochIdMr; + conn->ibProxySignalEpochIdMrInfo = connInfo->infoProxySignalEpochIdMr; return mscclppSuccess; } diff --git a/src/proxy.cc b/src/proxy.cc index 8d2fe6be..cc0ae870 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -162,12 +162,12 @@ void* mscclppProxyService(void* _args) } if (trigger.fields.type & mscclppFlag) { if (isP2pProxy) { - PROXYCUDACHECK(cudaMemcpyAsync(conn->remoteProxyFlag, conn->devConn->sendEpochId, sizeof(uint64_t), + PROXYCUDACHECK(cudaMemcpyAsync(conn->remoteProxyFlag, conn->devConn->signalEpochId, sizeof(uint64_t), cudaMemcpyDeviceToDevice, p2pStream)); npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_ENTRY, (uint32_t)sizeof(uint64_t), trigger.fields.connId); } else { // My local flag is 
copied to the peer's proxy flag - conn->ibQp->stageSend(conn->ibLocalFlagMr, &conn->ibProxyFlagMrInfo, sizeof(uint64_t), + conn->ibQp->stageSend(conn->ibSignalEpochIdMr, &conn->ibProxySignalEpochIdMrInfo, sizeof(uint64_t), /*wrId=*/0, /*srcOffset=*/0, /*dstOffset=*/0, /*signaled=*/true); if ((ret = conn->ibQp->postSend()) != 0) { WARN("flag postSend failed: errno %d", ret); diff --git a/tests/p2p_test.cu b/tests/p2p_test.cu index 95f18e6c..e2218e83 100644 --- a/tests/p2p_test.cu +++ b/tests/p2p_test.cu @@ -54,7 +54,7 @@ __global__ void kernel(int rank, int world_size) volatile int* data = (volatile int*)devConn.localBuff; volatile uint64_t* localFlag = devConn.localFlag; #if (USE_DMA_FOR_P2P == 0) - volatile uint64_t* remoteFlag = devConn.remoteFlag; + volatile uint64_t* remoteSignalEpochId = devConn.remoteSignalEpochId; #endif volatile uint64_t* proxyFlag = devConn.proxyFlag; @@ -106,7 +106,7 @@ __global__ void kernel(int rank, int world_size) volatile int* remoteData = (volatile int*)devConn.remoteBuff; // Wait until the remote data is set - while (*remoteFlag == baseFlag) { + while (*remoteSignalEpochId == baseFlag) { } // Read remote data From a1ae982c616dac2fcf85752e19068e41c96f076d Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 10 Apr 2023 14:05:25 +0000 Subject: [PATCH 006/135] Merge signalEpochId with proxySignalEpochId --- src/include/comm.h | 2 -- src/include/mscclpp.h | 32 +++++++++++++++++++------------- src/init.cc | 25 +++++-------------------- src/proxy.cc | 8 ++++---- 4 files changed, 28 insertions(+), 39 deletions(-) diff --git a/src/include/comm.h b/src/include/comm.h index f1e3ed47..56e24e2d 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -36,10 +36,8 @@ struct mscclppConn struct mscclppIbQp* ibQp; struct mscclppIbMr* ibBuffMr; struct mscclppIbMr* ibSignalEpochIdMr; - struct mscclppIbMr* ibProxySignalEpochIdMr; struct mscclppIbMrInfo ibBuffMrInfo; struct mscclppIbMrInfo ibSignalEpochIdMrInfo; - struct mscclppIbMrInfo 
ibProxySignalEpochIdMrInfo; #if defined(ENABLE_NPKIT) std::vector npkitUsedReqIds; std::vector npkitFreeReqIds; diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index f7261410..b620153c 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -20,6 +20,16 @@ extern "C" { #endif +struct alignas(16) mscclppDevConnSignalEpochId +{ + // every signal(), increaments this and either: + // 1) proxy thread pushes it to the remote peer's localSignalEpochId->proxy + // 2) gpu thread directly writes it to remoteSignalEpochId->device + uint64_t device; + // signal() function triggers the cpu proxy thread to write to it + uint64_t proxy; +}; + /*************************************************************************************************************** * A mscclppDevConn provides a zero-copy connection between two GPUs connected via P2P NVLink or InfiniBand. * The communication API is one-sided meaning that for every single data transfer, only one side @@ -140,14 +150,13 @@ struct mscclppDevConn __forceinline__ __device__ void wait() { (*waitEpochId) += 1; - // printf("%llu %llu %llu\n", *(volatile uint64_t*)proxySignalEpochId, (*waitEpochId), *(volatile uint64_t*)signalEpochId); - while (*(volatile uint64_t*)proxySignalEpochId < (*waitEpochId)) + while (*(volatile uint64_t*)&(localSignalEpochId->proxy) < (*waitEpochId)) ; } __forceinline__ __device__ void epochIncrement() { - *(volatile uint64_t*)signalEpochId += 1; + *(volatile uint64_t*)&(localSignalEpochId->device) += 1; } #endif @@ -156,22 +165,19 @@ struct mscclppDevConn // my local buffer void* localBuff; - // every signal(), increaments this and either: - // 1) proxy thread pushes it to the remote peer's proxySignalEpochId - // 2) gpu thread directly writes it to remoteSignalEpochId - uint64_t* signalEpochId; + + struct mscclppDevConnSignalEpochId* localSignalEpochId; + // used by the signal() function directly from gpu + struct mscclppDevConnSignalEpochId* remoteSignalEpochId; + // every wait(), 
increaments this and then the gpu waits for either: - // 1) proxySignalEpochId to be >= this in case of a proxy thread - // 2) remoteSignalEpochId to be >= this in case of a gpu thread + // 1) localSignalEpochId->proxy to be >= this in case of a proxy thread + // 2) remoteSignalEpochId->device to be >= this in case of a gpu thread uint64_t* waitEpochId; // my remote peer's buffer. only non-NULL with gpu's direct access // gpu can directly write into it void* remoteBuff; - // used by the signal() function directly from gpu - uint64_t* remoteSignalEpochId; - // signal() function triggers the cpu proxy thread to write to it - uint64_t* proxySignalEpochId; // this is a concurrent fifo which is multiple threads from the device // can produce for and the sole proxy thread consumes it. diff --git a/src/init.cc b/src/init.cc index 0cff496a..08302f6e 100644 --- a/src/init.cc +++ b/src/init.cc @@ -181,11 +181,6 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm) if (comm == NULL) return mscclppSuccess; - for (int i = 0; i < comm->nConns; ++i) { - struct mscclppConn* conn = &comm->conns[i]; - MSCCLPPCHECK(mscclppCudaFree(conn->devConn->proxySignalEpochId)); - } - for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) { struct mscclppProxyState* proxyState = comm->proxyState[i]; if (proxyState) { @@ -216,7 +211,7 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm) for (int i = 0; i < comm->nConns; i++) { struct mscclppConn* conn = &comm->conns[i]; if (conn) { - MSCCLPPCHECK(mscclppCudaFree(conn->devConn->signalEpochId)); + MSCCLPPCHECK(mscclppCudaFree(conn->devConn->localSignalEpochId)); MSCCLPPCHECK(mscclppCudaFree(conn->devConn->waitEpochId)); } } @@ -419,7 +414,7 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void conn->devConn = devConn; conn->devConn->localBuff = localBuff; - MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->signalEpochId, 1)); + MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->localSignalEpochId, 1)); 
MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->waitEpochId, 1)); conn->devConn->remoteRank = remoteRank; conn->devConn->tag = tag; @@ -444,11 +439,9 @@ struct connInfo { cudaIpcMemHandle_t handleBuff; cudaIpcMemHandle_t handleSignalEpochId; - cudaIpcMemHandle_t handleProxySignalEpochId; mscclppIbQpInfo infoQp; mscclppIbMrInfo infoBuffMr; mscclppIbMrInfo infoSignalEpochIdMr; - mscclppIbMrInfo infoProxySignalEpochIdMr; }; mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*output*/, struct mscclppConn* conn /*input*/) @@ -458,10 +451,8 @@ mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*outpu return mscclppInternalError; } struct mscclppDevConn* devConn = conn->devConn; - MSCCLPPCHECK(mscclppCudaCalloc(&devConn->proxySignalEpochId, 1)); - CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleProxySignalEpochId, devConn->proxySignalEpochId)); CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleBuff, devConn->localBuff)); - CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleSignalEpochId, devConn->signalEpochId)); + CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleSignalEpochId, devConn->localSignalEpochId)); return mscclppSuccess; } @@ -475,8 +466,7 @@ mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/ cudaIpcOpenMemHandle((void**)&conn->devConn->remoteBuff, connInfo->handleBuff, cudaIpcMemLazyEnablePeerAccess)); CUDACHECK( cudaIpcOpenMemHandle((void**)&conn->devConn->remoteSignalEpochId, connInfo->handleSignalEpochId, cudaIpcMemLazyEnablePeerAccess)); - CUDACHECK( - cudaIpcOpenMemHandle((void**)&conn->remoteProxyFlag, connInfo->handleProxySignalEpochId, cudaIpcMemLazyEnablePeerAccess)); + conn->remoteProxyFlag = &(conn->devConn->remoteSignalEpochId->proxy); return mscclppSuccess; } @@ -489,20 +479,16 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output struct mscclppDevConn* devConn = conn->devConn; devConn->remoteBuff = NULL; devConn->remoteSignalEpochId = NULL; - 
MSCCLPPCHECK(mscclppCudaCalloc(&devConn->proxySignalEpochId, 1)); struct mscclppIbContext* ibCtx = conn->ibCtx; if (conn->ibQp == NULL) { MSCCLPPCHECK(mscclppIbContextCreateQp(ibCtx, &conn->ibQp)); } - // TODO(chhwang): can we register only one MR for the following three? MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->localBuff, conn->buffSize, &conn->ibBuffMr)); - MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->signalEpochId, sizeof(uint64_t), &conn->ibSignalEpochIdMr)); - MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->proxySignalEpochId, sizeof(uint64_t), &conn->ibProxySignalEpochIdMr)); + MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->localSignalEpochId, sizeof(struct mscclppDevConnSignalEpochId), &conn->ibSignalEpochIdMr)); connInfo->infoQp = conn->ibQp->info; connInfo->infoBuffMr = conn->ibBuffMr->info; connInfo->infoSignalEpochIdMr = conn->ibSignalEpochIdMr->info; - connInfo->infoProxySignalEpochIdMr = conn->ibProxySignalEpochIdMr->info; return mscclppSuccess; } @@ -522,7 +508,6 @@ mscclppResult_t mscclppIbConnectionSetupEnd(struct connInfo* connInfo /*input*/, } conn->ibBuffMrInfo = connInfo->infoBuffMr; conn->ibSignalEpochIdMrInfo = connInfo->infoSignalEpochIdMr; - conn->ibProxySignalEpochIdMrInfo = connInfo->infoProxySignalEpochIdMr; return mscclppSuccess; } diff --git a/src/proxy.cc b/src/proxy.cc index cc0ae870..6545c855 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -162,13 +162,13 @@ void* mscclppProxyService(void* _args) } if (trigger.fields.type & mscclppFlag) { if (isP2pProxy) { - PROXYCUDACHECK(cudaMemcpyAsync(conn->remoteProxyFlag, conn->devConn->signalEpochId, sizeof(uint64_t), + PROXYCUDACHECK(cudaMemcpyAsync(conn->remoteProxyFlag, &(conn->devConn->localSignalEpochId->device), sizeof(uint64_t), cudaMemcpyDeviceToDevice, p2pStream)); npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_ENTRY, (uint32_t)sizeof(uint64_t), trigger.fields.connId); } else { - // My local flag is copied to the peer's proxy flag - 
conn->ibQp->stageSend(conn->ibSignalEpochIdMr, &conn->ibProxySignalEpochIdMrInfo, sizeof(uint64_t), - /*wrId=*/0, /*srcOffset=*/0, /*dstOffset=*/0, /*signaled=*/true); + // My local device flag is copied to the remote's proxy flag + conn->ibQp->stageSend(conn->ibSignalEpochIdMr, &conn->ibSignalEpochIdMrInfo, sizeof(uint64_t), + /*wrId=*/0, /*srcOffset=*/0, /*dstOffset=*/sizeof(uint64_t), /*signaled=*/true); if ((ret = conn->ibQp->postSend()) != 0) { WARN("flag postSend failed: errno %d", ret); } From 48102a0858797c7177b1d85bdcac8e16fc4ee401 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 11 Apr 2023 01:22:40 +0000 Subject: [PATCH 007/135] removing unnecessary flags --- src/include/comm.h | 3 --- src/init.cc | 1 - src/proxy.cc | 2 +- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/include/comm.h b/src/include/comm.h index 56e24e2d..28f3cb0d 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -28,9 +28,6 @@ struct mscclppConn mscclppTransport_t transport; int remoteRank; uint64_t buffSize; - uint64_t* remoteProxyFlag; - uint64_t* cpuProxyFlag; - void* cpuProxyFlagGdrDesc; struct mscclppDevConn* devConn; struct mscclppIbContext* ibCtx; struct mscclppIbQp* ibQp; diff --git a/src/init.cc b/src/init.cc index 08302f6e..f799e2fb 100644 --- a/src/init.cc +++ b/src/init.cc @@ -466,7 +466,6 @@ mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/ cudaIpcOpenMemHandle((void**)&conn->devConn->remoteBuff, connInfo->handleBuff, cudaIpcMemLazyEnablePeerAccess)); CUDACHECK( cudaIpcOpenMemHandle((void**)&conn->devConn->remoteSignalEpochId, connInfo->handleSignalEpochId, cudaIpcMemLazyEnablePeerAccess)); - conn->remoteProxyFlag = &(conn->devConn->remoteSignalEpochId->proxy); return mscclppSuccess; } diff --git a/src/proxy.cc b/src/proxy.cc index 6545c855..8df6beb5 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -162,7 +162,7 @@ void* mscclppProxyService(void* _args) } if (trigger.fields.type & mscclppFlag) { if (isP2pProxy) { 
- PROXYCUDACHECK(cudaMemcpyAsync(conn->remoteProxyFlag, &(conn->devConn->localSignalEpochId->device), sizeof(uint64_t), + PROXYCUDACHECK(cudaMemcpyAsync(&conn->devConn->remoteSignalEpochId->proxy, &(conn->devConn->localSignalEpochId->device), sizeof(uint64_t), cudaMemcpyDeviceToDevice, p2pStream)); npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_ENTRY, (uint32_t)sizeof(uint64_t), trigger.fields.connId); } else { From b6179224aaa7b63f99734d13a9b724957fdc018b Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 11 Apr 2023 01:36:37 +0000 Subject: [PATCH 008/135] lint --- src/include/mscclpp.h | 3 +-- src/init.cc | 7 ++++--- src/proxy.cc | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index b620153c..4c3473e6 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -16,8 +16,7 @@ #include #ifdef __cplusplus -extern "C" -{ +extern "C" { #endif struct alignas(16) mscclppDevConnSignalEpochId diff --git a/src/init.cc b/src/init.cc index f799e2fb..803f1916 100644 --- a/src/init.cc +++ b/src/init.cc @@ -464,8 +464,8 @@ mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/ } CUDACHECK( cudaIpcOpenMemHandle((void**)&conn->devConn->remoteBuff, connInfo->handleBuff, cudaIpcMemLazyEnablePeerAccess)); - CUDACHECK( - cudaIpcOpenMemHandle((void**)&conn->devConn->remoteSignalEpochId, connInfo->handleSignalEpochId, cudaIpcMemLazyEnablePeerAccess)); + CUDACHECK(cudaIpcOpenMemHandle((void**)&conn->devConn->remoteSignalEpochId, connInfo->handleSignalEpochId, + cudaIpcMemLazyEnablePeerAccess)); return mscclppSuccess; } @@ -484,7 +484,8 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output MSCCLPPCHECK(mscclppIbContextCreateQp(ibCtx, &conn->ibQp)); } MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->localBuff, conn->buffSize, &conn->ibBuffMr)); - MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->localSignalEpochId, sizeof(struct 
mscclppDevConnSignalEpochId), &conn->ibSignalEpochIdMr)); + MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->localSignalEpochId, + sizeof(struct mscclppDevConnSignalEpochId), &conn->ibSignalEpochIdMr)); connInfo->infoQp = conn->ibQp->info; connInfo->infoBuffMr = conn->ibBuffMr->info; connInfo->infoSignalEpochIdMr = conn->ibSignalEpochIdMr->info; diff --git a/src/proxy.cc b/src/proxy.cc index 8df6beb5..a8680398 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -162,7 +162,8 @@ void* mscclppProxyService(void* _args) } if (trigger.fields.type & mscclppFlag) { if (isP2pProxy) { - PROXYCUDACHECK(cudaMemcpyAsync(&conn->devConn->remoteSignalEpochId->proxy, &(conn->devConn->localSignalEpochId->device), sizeof(uint64_t), + PROXYCUDACHECK(cudaMemcpyAsync(&conn->devConn->remoteSignalEpochId->proxy, + &(conn->devConn->localSignalEpochId->device), sizeof(uint64_t), cudaMemcpyDeviceToDevice, p2pStream)); npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_ENTRY, (uint32_t)sizeof(uint64_t), trigger.fields.connId); } else { From 1bfa8d4034b9ac0143d7156262bc99f5803f5a7b Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 11 Apr 2023 04:05:44 +0000 Subject: [PATCH 009/135] creating a base conn class --- src/include/mscclpp.h | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 4c3473e6..6b60a8f2 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -29,6 +29,28 @@ struct alignas(16) mscclppDevConnSignalEpochId uint64_t proxy; }; + +struct mscclppBaseConn { + int remoteRank; + int tag; + + // my local buffer + void* localBuff; + + struct mscclppDevConnSignalEpochId* localSignalEpochId; + // used by the signal() function directly from gpu + struct mscclppDevConnSignalEpochId* remoteSignalEpochId; + + // every wait(), increaments this and then the gpu waits for either: + // 1) localSignalEpochId->proxy to be >= this in case of a proxy thread + // 2) 
remoteSignalEpochId->device to be >= this in case of a gpu thread + uint64_t* waitEpochId; + + // my remote peer's buffer. only non-NULL with gpu's direct access + // gpu can directly write into it + void* remoteBuff; +}; + /*************************************************************************************************************** * A mscclppDevConn provides a zero-copy connection between two GPUs connected via P2P NVLink or InfiniBand. * The communication API is one-sided meaning that for every single data transfer, only one side @@ -91,7 +113,7 @@ struct alignas(16) mscclppDevConnSignalEpochId * The two endpoint can concurrently use the same connection provided they are writing (puts) on different * indices in the registered buffer. **************************************************************************************************************/ -struct mscclppDevConn +struct mscclppDevConn : mscclppBaseConn { #ifdef __CUDACC__ __forceinline__ __device__ void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) @@ -159,25 +181,6 @@ struct mscclppDevConn } #endif - int remoteRank; - int tag; - - // my local buffer - void* localBuff; - - struct mscclppDevConnSignalEpochId* localSignalEpochId; - // used by the signal() function directly from gpu - struct mscclppDevConnSignalEpochId* remoteSignalEpochId; - - // every wait(), increaments this and then the gpu waits for either: - // 1) localSignalEpochId->proxy to be >= this in case of a proxy thread - // 2) remoteSignalEpochId->device to be >= this in case of a gpu thread - uint64_t* waitEpochId; - - // my remote peer's buffer. only non-NULL with gpu's direct access - // gpu can directly write into it - void* remoteBuff; - // this is a concurrent fifo which is multiple threads from the device // can produce for and the sole proxy thread consumes it. 
struct mscclppConcurrentFifo fifo; From 69b5bdfd133d6dacf7c90c26a7a44852239e0739 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 11 Apr 2023 05:01:39 +0000 Subject: [PATCH 010/135] minor fix --- Makefile | 2 +- src/include/mscclpp.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 972582eb..14f7bc81 100644 --- a/Makefile +++ b/Makefile @@ -158,7 +158,7 @@ MSCLLPPTESTBINS := $(MSCLLPPTESTBINFILESLIST:%=$(BUILDDIR)/$(BINDIR)/$(T INCLUDE := -Isrc -Isrc/include -.PHONY: all build lib tests mscclpp_test clean +.PHONY: all build lib unittests tests mscclpp-test cpplint cpplint-autofix cpplint-file-autofix clean all: build diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 6b60a8f2..33ddc47b 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -29,8 +29,8 @@ struct alignas(16) mscclppDevConnSignalEpochId uint64_t proxy; }; - -struct mscclppBaseConn { +struct mscclppBaseConn +{ int remoteRank; int tag; From d2c2ae72a7ef2b65e68738c38d8a3de424aa3f65 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 11 Apr 2023 08:45:22 +0000 Subject: [PATCH 011/135] Some cleanup --- Makefile | 2 +- src/bootstrap/bootstrap.cc | 1 - src/debug.cc | 7 +-- src/include/alloc.h | 1 - src/include/comm.h | 11 +---- src/include/core.h | 30 ------------- src/include/mscclpp.h | 4 +- src/include/mscclppfifo.h | 2 +- src/include/npkit/npkit.h | 3 -- src/include/param.h | 30 ------------- src/include/utils.h | 6 +-- src/init.cc | 7 ++- src/misc/npkit.cc | 1 + src/param.cc | 90 -------------------------------------- src/utils.cc | 2 +- 15 files changed, 16 insertions(+), 181 deletions(-) delete mode 100644 src/include/core.h delete mode 100644 src/include/param.h delete mode 100644 src/param.cc diff --git a/Makefile b/Makefile index 14f7bc81..99f2fcd6 100644 --- a/Makefile +++ b/Makefile @@ -118,7 +118,7 @@ endif LDFLAGS := $(NVLDFLAGS) $(GDRCOPY_LDFLAGS) -libverbs -lnuma -LIBSRCS := $(addprefix src/,debug.cc utils.cc 
param.cc init.cc proxy.cc ib.cc config.cc) +LIBSRCS := $(addprefix src/,debug.cc utils.cc init.cc proxy.cc ib.cc config.cc) LIBSRCS += $(addprefix src/bootstrap/,bootstrap.cc socket.cc) ifneq ($(NPKIT), 0) LIBSRCS += $(addprefix src/misc/,npkit.cc) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 064af4a8..11389222 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -6,7 +6,6 @@ #include "bootstrap.h" #include "config.h" -#include "core.h" #include "mscclpp.h" #include "utils.h" #include diff --git a/src/debug.cc b/src/debug.cc index a3807d3e..d6d29262 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -5,10 +5,11 @@ ************************************************************************/ #include "debug.h" -#include "core.h" +#include #include #include #include +#include int mscclppDebugLevel = -1; static int pid = -1; @@ -229,15 +230,11 @@ mscclppResult_t mscclppDebugSetLogHandler(mscclppLogHandler_t handler) return mscclppSuccess; } -MSCCLPP_PARAM(SetThreadName, "SET_THREAD_NAME", 0); - void mscclppSetThreadName(pthread_t thread, const char* fmt, ...) 
{ // pthread_setname_np is nonstandard GNU extension // needs the following feature test macro #ifdef _GNU_SOURCE - if (mscclppParamSetThreadName() != 1) - return; char threadName[MSCCLPP_THREAD_NAMELEN]; va_list vargs; va_start(vargs, fmt); diff --git a/src/include/alloc.h b/src/include/alloc.h index 496af197..5c696e6e 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -12,7 +12,6 @@ #include "mscclpp.h" #include "utils.h" #include -#include #include #include diff --git a/src/include/comm.h b/src/include/comm.h index 28f3cb0d..38abd438 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -10,16 +10,9 @@ #include "ib.h" #include "proxy.h" +#if defined(ENABLE_NPKIT) #include - -// #define CACHE_LINE_SIZE 128 -// #define MEM_ALIGN 4096 -// #define CUDA_IPC_MIN 2097152UL - -// // Channels / LL tuning -// #define MSCCLPP_LL_THREAD_THRESHOLD 8 -// #define MSCCLPP_LL128_THREAD_THRESHOLD 8 -// #define MSCCLPP_SIMPLE_THREAD_THRESHOLD 64 +#endif #define MAXCONNECTIONS 64 diff --git a/src/include/core.h b/src/include/core.h deleted file mode 100644 index e3213bd6..00000000 --- a/src/include/core.h +++ /dev/null @@ -1,30 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef MSCCLPP_CORE_H_ -#define MSCCLPP_CORE_H_ - -#include "alloc.h" -#include "debug.h" -#include "mscclpp.h" -#include "param.h" -#include // For std::min/std::max -#include -#include -#include -#include -#include -#include - -#ifdef PROFAPI -#define MSCCLPP_API(ret, func, args...) \ - __attribute__((visibility("default"))) __attribute__((alias(#func))) ret p##func(args); \ - extern "C" __attribute__((visibility("default"))) __attribute__((weak)) ret func(args) -#else -#define MSCCLPP_API(ret, func, args...) 
extern "C" __attribute__((visibility("default"))) ret func(args) -#endif // end PROFAPI - -#endif // end include guard diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 33ddc47b..c67add94 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -12,7 +12,6 @@ #define MSCCLPP_PROXY_FIFO_FLUSH_COUNTER 4 #include -#include #include #ifdef __cplusplus @@ -180,7 +179,8 @@ struct mscclppDevConn : mscclppBaseConn *(volatile uint64_t*)&(localSignalEpochId->device) += 1; } -#endif +#endif // __CUDACC__ + // this is a concurrent fifo which is multiple threads from the device // can produce for and the sole proxy thread consumes it. struct mscclppConcurrentFifo fifo; diff --git a/src/include/mscclppfifo.h b/src/include/mscclppfifo.h index 78918fff..341025b5 100644 --- a/src/include/mscclppfifo.h +++ b/src/include/mscclppfifo.h @@ -49,7 +49,7 @@ typedef mscclppTrigger* mscclppTrigger_t; * push() function increments triggerFifoHead, proxyState->fifoTailHost is updated in proxy.cc:mscclppProxyService * and it occasionally flushes it to triggerFifoTail via a cudaMemcpyAsync. * - * Why douplicating the tail is a good idea? The fifo is large engouh and we do not need frequent updates + * Why duplicating the tail is a good idea? The fifo is large engouh and we do not need frequent updates * for the tail as there is usually enough space for device threads to push their work into. 
*/ struct mscclppConcurrentFifo diff --git a/src/include/npkit/npkit.h b/src/include/npkit/npkit.h index a0691afd..f0a72dfc 100644 --- a/src/include/npkit/npkit.h +++ b/src/include/npkit/npkit.h @@ -2,9 +2,6 @@ #define NPKIT_H_ #include -#include - -#include #include "npkit/npkit_event.h" #include "npkit/npkit_struct.h" diff --git a/src/include/param.h b/src/include/param.h deleted file mode 100644 index e7478807..00000000 --- a/src/include/param.h +++ /dev/null @@ -1,30 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef MSCCLPP_PARAM_H_ -#define MSCCLPP_PARAM_H_ - -#include - -const char* userHomeDir(); -void setEnvFile(const char* fileName); -void initEnv(); - -void mscclppLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); - -#define MSCCLPP_PARAM(name, env, deftVal) \ - int64_t mscclppParam##name() \ - { \ - constexpr int64_t uninitialized = INT64_MIN; \ - static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \ - static int64_t cache = uninitialized; \ - if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \ - mscclppLoadParam("MSCCLPP_" env, deftVal, uninitialized, &cache); \ - } \ - return cache; \ - } - -#endif diff --git a/src/include/utils.h b/src/include/utils.h index 3eff9842..59b35407 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -8,13 +8,9 @@ #define MSCCLPP_UTILS_H_ #include "alloc.h" -#include "checks.h" #include "mscclpp.h" -#include -#include -#include +#include #include -#include // int mscclppCudaCompCap(); diff --git a/src/init.cc b/src/init.cc index 803f1916..db6ee645 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1,6 +1,7 @@ +#include "alloc.h" #include "bootstrap.h" +#include "checks.h" 
#include "config.h" -#include "core.h" #if defined(MSCCLPP_USE_GDRCOPY) #include "gdr.h" #endif @@ -11,6 +12,8 @@ #include "npkit/npkit.h" #endif +#define MSCCLPP_API(ret, func, args...) extern "C" __attribute__((visibility("default"))) ret func(args) + static uint64_t hashUniqueId(mscclppUniqueId const& id) { char const* bytes = (char const*)&id; @@ -693,4 +696,4 @@ mscclppResult_t mscclppSetBootstrapConnTimeout(int timeout) mscclppConfig* config = mscclppConfig::getInstance(); config->setBootstrapConnectionTimeoutConfig(timeout); return mscclppSuccess; -} \ No newline at end of file +} diff --git a/src/misc/npkit.cc b/src/misc/npkit.cc index 4a7eb849..30914810 100644 --- a/src/misc/npkit.cc +++ b/src/misc/npkit.cc @@ -4,6 +4,7 @@ #include "alloc.h" #include "npkit/npkit.h" +#include uint64_t NpKit::rank_ = 0; diff --git a/src/param.cc b/src/param.cc deleted file mode 100644 index 2af48084..00000000 --- a/src/param.cc +++ /dev/null @@ -1,90 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "param.h" -#include "debug.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -const char* userHomeDir() -{ - struct passwd* pwUser = getpwuid(getuid()); - return pwUser == NULL ? 
NULL : pwUser->pw_dir; -} - -void setEnvFile(const char* fileName) -{ - FILE* file = fopen(fileName, "r"); - if (file == NULL) - return; - - char* line = NULL; - char envVar[1024]; - char envValue[1024]; - size_t n = 0; - ssize_t read; - while ((read = getline(&line, &n, file)) != -1) { - if (line[read - 1] == '\n') - line[read - 1] = '\0'; - int s = 0; // Env Var Size - while (line[s] != '\0' && line[s] != '=') - s++; - if (line[s] == '\0') - continue; - strncpy(envVar, line, std::min(1023, s)); - envVar[s] = '\0'; - s++; - strncpy(envValue, line + s, 1023); - envValue[1023] = '\0'; - setenv(envVar, envValue, 0); - // printf("%s : %s->%s\n", fileName, envVar, envValue); - } - if (line) - free(line); - fclose(file); -} - -void initEnv() -{ - char confFilePath[1024]; - const char* userDir = userHomeDir(); - if (userDir) { - sprintf(confFilePath, "%s/.mscclpp.conf", userDir); - setEnvFile(confFilePath); - } - sprintf(confFilePath, "/etc/mscclpp.conf"); - setEnvFile(confFilePath); -} - -void mscclppLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) -{ - static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; - pthread_mutex_lock(&mutex); - if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { - char* str = getenv(env); - int64_t value = deftVal; - if (str && strlen(str) > 0) { - errno = 0; - value = strtoll(str, nullptr, 0); - if (errno) { - value = deftVal; - INFO(MSCCLPP_ALL, "Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal); - } else { - INFO(MSCCLPP_ALL, "%s set by environment to %lld.", env, (long long)value); - } - } - __atomic_store_n(cache, value, __ATOMIC_RELAXED); - } - pthread_mutex_unlock(&mutex); -} diff --git a/src/utils.cc b/src/utils.cc index c0766765..ebd31bfe 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -5,8 +5,8 @@ ************************************************************************/ #include "utils.h" -#include "core.h" +#include #include #include From 
35acdf796c36823b64671dd10454d868a203d581 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 11 Apr 2023 11:28:40 +0000 Subject: [PATCH 012/135] Add mscclppProxyFifo --- src/include/proxy.h | 24 +++++++--- src/init.cc | 37 +++------------- src/proxy.cc | 106 +++++++++++++++++++++++++++++++------------- 3 files changed, 99 insertions(+), 68 deletions(-) diff --git a/src/include/proxy.h b/src/include/proxy.h index cf496f0f..8b300919 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -15,11 +15,14 @@ typedef enum MSCCLPP_PROXY_RUN_STATE_EXITING, } mscclppProxyRunState_t; -struct mscclppProxyState +struct mscclppProxyFifo { - mscclppTransport_t transportType; - pthread_t thread; - mscclppProxyRunState_t run; + mscclppResult_t create(); + mscclppResult_t destroy(); + + mscclppResult_t poll(mscclppTrigger* trigger); + mscclppResult_t pop(); + mscclppResult_t flushTail(bool sync = false); // fifo cudaHostCalloc'ed that is produced by device and consumed by host mscclppTrigger* triggerFifo; @@ -45,10 +48,21 @@ struct mscclppProxyState // these updates are pushed to the device. 
uint64_t fifoTailHost; + // for transferring fifo tail + cudaStream_t stream; +}; + +struct mscclppProxyState +{ + mscclppTransport_t transportType; + pthread_t thread; + mscclppProxyRunState_t run; + int numaNodeToBind; struct mscclppIbContext* ibContext; // For IB connection only cudaStream_t p2pStream; // for P2P DMA engine only - cudaStream_t fifoStream; // for transferring fifo tail + + struct mscclppProxyFifo fifo; }; mscclppResult_t mscclppProxyCreate(struct mscclppComm* comm); diff --git a/src/init.cc b/src/init.cc index db6ee645..daaac49d 100644 --- a/src/init.cc +++ b/src/init.cc @@ -187,20 +187,9 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm) for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) { struct mscclppProxyState* proxyState = comm->proxyState[i]; if (proxyState) { -#if defined(MSCCLPP_USE_GDRCOPY) - MSCCLPPCHECK(mscclppGdrCudaFree(proxyState->triggerFifoDesc)); -#else - MSCCLPPCHECK(mscclppCudaHostFree(proxyState->triggerFifo)); -#endif - MSCCLPPCHECK(mscclppCudaFree(proxyState->fifoHead)); -#if defined(MSCCLPP_USE_GDRCOPY) - MSCCLPPCHECK(mscclppGdrCudaFree(proxyState->fifoTailDesc)); -#else - MSCCLPPCHECK(mscclppCudaFree(proxyState->fifoTailDev)); -#endif + MSCCLPPCHECK(proxyState->fifo.destroy()); if (proxyState->p2pStream) CUDACHECK(cudaStreamDestroy(proxyState->p2pStream)); - CUDACHECK(cudaStreamDestroy(proxyState->fifoStream)); free(proxyState); } } @@ -378,20 +367,7 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void // If we couldn't find a matching context, create one if (proxyState == NULL) { MSCCLPPCHECK(mscclppCalloc(&proxyState, 1)); -#if defined(MSCCLPP_USE_GDRCOPY) - MSCCLPPCHECK(mscclppGdrCudaCalloc(&proxyState->triggerFifo, &proxyState->triggerFifoDev, MSCCLPP_PROXY_FIFO_SIZE, - &proxyState->triggerFifoDesc)); -#else - MSCCLPPCHECK(mscclppCudaHostCalloc(&proxyState->triggerFifo, MSCCLPP_PROXY_FIFO_SIZE)); -#endif - MSCCLPPCHECK(mscclppCudaCalloc(&proxyState->fifoHead, 1)); -#if 
defined(MSCCLPP_USE_GDRCOPY) - MSCCLPPCHECK( - mscclppGdrCudaCalloc(&proxyState->fifoTailDevHostPtr, &proxyState->fifoTailDev, 1, &proxyState->fifoTailDesc)); -#else - MSCCLPPCHECK(mscclppCudaCalloc(&proxyState->fifoTailDev, 1)); -#endif - proxyState->fifoTailHost = 0; + MSCCLPPCHECK(proxyState->fifo.create()); if (transportType == mscclppTransportIB) { proxyState->ibContext = conn->ibCtx; @@ -400,7 +376,6 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void proxyState->ibContext = NULL; CUDACHECK(cudaStreamCreateWithFlags(&proxyState->p2pStream, cudaStreamNonBlocking)); } - CUDACHECK(cudaStreamCreateWithFlags(&proxyState->fifoStream, cudaStreamNonBlocking)); proxyState->numaNodeToBind = comm->devNumaNode; // INFO(MSCCLPP_INIT, "NUMA node for device %d is %d", cudaDev, *numaNode); @@ -423,12 +398,12 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void conn->devConn->tag = tag; conn->devConn->fifo.connId = comm->nConns; #if defined(MSCCLPP_USE_GDRCOPY) - conn->devConn->fifo.triggerFifo = proxyState->triggerFifoDev; + conn->devConn->fifo.triggerFifo = proxyState->fifo.triggerFifoDev; #else - conn->devConn->fifo.triggerFifo = proxyState->triggerFifo; + conn->devConn->fifo.triggerFifo = proxyState->fifo.triggerFifo; #endif - conn->devConn->fifo.triggerFifoHead = proxyState->fifoHead; - conn->devConn->fifo.triggerFifoTail = proxyState->fifoTailDev; + conn->devConn->fifo.triggerFifoHead = proxyState->fifo.fifoHead; + conn->devConn->fifo.triggerFifoTail = proxyState->fifo.fifoTailDev; comm->nConns++; diff --git a/src/proxy.cc b/src/proxy.cc index dcd31aa5..cda01466 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -93,6 +93,69 @@ static void npkitCollectExitEvents(struct mscclppConn* conn, uint8_t type, int c #endif +mscclppResult_t mscclppProxyFifo::create() +{ + MSCCLPPCHECK(mscclppCudaCalloc(&this->fifoHead, 1)); +#if defined(MSCCLPP_USE_GDRCOPY) + MSCCLPPCHECK(mscclppGdrCudaCalloc(&this->triggerFifo, 
&this->triggerFifoDev, MSCCLPP_PROXY_FIFO_SIZE, + &this->triggerFifoDesc)); + MSCCLPPCHECK( + mscclppGdrCudaCalloc(&this->fifoTailDevHostPtr, &this->fifoTailDev, 1, &this->fifoTailDesc)); +#else + MSCCLPPCHECK(mscclppCudaHostCalloc(&this->triggerFifo, MSCCLPP_PROXY_FIFO_SIZE)); + MSCCLPPCHECK(mscclppCudaCalloc(&this->fifoTailDev, 1)); +#endif + CUDACHECK(cudaStreamCreateWithFlags(&this->stream, cudaStreamNonBlocking)); + this->fifoTailHost = 0; + return mscclppSuccess; +} + +mscclppResult_t mscclppProxyFifo::destroy() +{ + MSCCLPPCHECK(mscclppCudaFree(this->fifoHead)); +#if defined(MSCCLPP_USE_GDRCOPY) + MSCCLPPCHECK(mscclppGdrCudaFree(this->triggerFifoDesc)); + MSCCLPPCHECK(mscclppGdrCudaFree(this->fifoTailDesc)); +#else + MSCCLPPCHECK(mscclppCudaHostFree(this->triggerFifo)); + MSCCLPPCHECK(mscclppCudaFree(this->fifoTailDev)); +#endif + CUDACHECK(cudaStreamDestroy(this->stream)); + return mscclppSuccess; +} + +// return true if the trigger is valid +mscclppResult_t mscclppProxyFifo::poll(mscclppTrigger* trigger) +{ + __m128i xmm0 = _mm_load_si128((__m128i*)&this->triggerFifo[this->fifoTailHost % MSCCLPP_PROXY_FIFO_SIZE]); + _mm_store_si128((__m128i*)trigger, xmm0); + return mscclppSuccess; +} + +mscclppResult_t mscclppProxyFifo::pop() +{ + *(volatile uint64_t*)(&this->triggerFifo[this->fifoTailHost % MSCCLPP_PROXY_FIFO_SIZE]) = 0; + (this->fifoTailHost)++; + return mscclppSuccess; +} + +mscclppResult_t mscclppProxyFifo::flushTail(bool sync) +{ + // Flush the tail to device memory. This is either triggered every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER to make sure + // that the fifo can make progress even if there is no request mscclppSync. However, mscclppSync type is for flush + // request. 
+#if defined(MSCCLPP_USE_GDRCOPY) + *(volatile uint64_t*)(this->fifoTailDevHostPtr) = this->fifoTailHost; +#else + CUDACHECK( + cudaMemcpyAsync(this->fifoTailDev, &(this->fifoTailHost), sizeof(uint64_t), cudaMemcpyHostToDevice, this->stream)); + if (sync) { + CUDACHECK(cudaStreamSynchronize(this->stream)); + } +#endif + return mscclppSuccess; +} + void* mscclppProxyService(void* _args) { struct proxyArgs* args = (struct proxyArgs*)_args; @@ -101,36 +164,28 @@ void* mscclppProxyService(void* _args) // from this point on, proxy thread will stay close to the device PROXYMSCCLPPCHECK(numaBind(comm->devNumaNode)); + struct mscclppProxyFifo* fifo = &args->proxyState->fifo; volatile mscclppProxyRunState_t* run = &args->proxyState->run; - mscclppTrigger* fifo = args->proxyState->triggerFifo; - uint64_t* fifoTail = &args->proxyState->fifoTailHost; -#if defined(MSCCLPP_USE_GDRCOPY) - volatile uint64_t* fifoTailDevPtr = args->proxyState->fifoTailDevHostPtr; -#else - uint64_t* fifoTailDevPtr = args->proxyState->fifoTailDev; -#endif - uint64_t fifoTailCached = *fifoTail; + mscclppTrigger trigger; mscclppIbContext* ibCtx = args->proxyState->ibContext; cudaStream_t p2pStream = args->proxyState->p2pStream; -#if !defined(MSCCLPP_USE_GDRCOPY) - cudaStream_t fifoStream = args->proxyState->fifoStream; -#endif bool isP2pProxy = (ibCtx == nullptr); free(_args); // allocated in mscclppProxyCreate npkitInitReqIds(comm); - int counter = MSCCLPP_PROXY_RUN_STATE_CHECK_PERIOD; + int runCnt = MSCCLPP_PROXY_RUN_STATE_CHECK_PERIOD; + uint64_t flushCnt = 0; for (;;) { - if (counter-- == 0) { - counter = MSCCLPP_PROXY_RUN_STATE_CHECK_PERIOD; + if (runCnt-- == 0) { + runCnt = MSCCLPP_PROXY_RUN_STATE_CHECK_PERIOD; if (*run != MSCCLPP_PROXY_RUN_STATE_RUNNING) { break; } } // Poll to see if we are ready to send anything - readTrigger(&trigger, &fifo[fifoTailCached % MSCCLPP_PROXY_FIFO_SIZE]); + PROXYMSCCLPPCHECK(fifo->poll(&trigger)); if (trigger.value[0] == 0) { continue; // there is one in progreess 
} @@ -210,30 +265,17 @@ void* mscclppProxyService(void* _args) } // Send completion: reset only the high 64 bits - *(volatile uint64_t*)(&fifo[fifoTailCached % MSCCLPP_PROXY_FIFO_SIZE]) = 0; - fifoTailCached++; + PROXYMSCCLPPCHECK(fifo->pop()); // Flush the tail to device memory. This is either triggered every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER to make sure // that the fifo can make progress even if there is no request mscclppSync. However, mscclppSync type is for flush // request. - if (((fifoTailCached % MSCCLPP_PROXY_FIFO_FLUSH_COUNTER) == 0) || (trigger.fields.type & mscclppSync)) { -#if defined(MSCCLPP_USE_GDRCOPY) - *fifoTailDevPtr = fifoTailCached; -#else - PROXYCUDACHECK( - cudaMemcpyAsync(fifoTailDevPtr, &fifoTailCached, sizeof(uint64_t), cudaMemcpyHostToDevice, fifoStream)); -#endif + if (((++flushCnt % MSCCLPP_PROXY_FIFO_FLUSH_COUNTER) == 0) || (trigger.fields.type & mscclppSync)) { + PROXYMSCCLPPCHECK(fifo->flushTail()); } } - *fifoTail = fifoTailCached; // make sure the tail is flushed before we shut the proxy -#if defined(MSCCLPP_USE_GDRCOPY) - *fifoTailDevPtr = fifoTailCached; -#else - PROXYCUDACHECK( - cudaMemcpyAsync(fifoTailDevPtr, &fifoTailCached, sizeof(uint64_t), cudaMemcpyHostToDevice, fifoStream)); - PROXYCUDACHECK(cudaStreamSynchronize(fifoStream)); -#endif + PROXYMSCCLPPCHECK(fifo->flushTail(/*sync=*/true)); if (isP2pProxy) { PROXYCUDACHECK(cudaStreamSynchronize(p2pStream)); } From 7a0e64813a6fef3454e9750719b87db923baa947 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 11 Apr 2023 12:28:45 +0000 Subject: [PATCH 013/135] Add fifo for host connections --- src/include/api.h | 6 ++ src/include/mscclpp.h | 15 +++++ src/include/proxy.h | 33 +++++++++- src/init.cc | 139 ++++++++++++++++++++++++++---------------- src/proxy.cc | 53 ++++++++++++---- 5 files changed, 178 insertions(+), 68 deletions(-) create mode 100644 src/include/api.h diff --git a/src/include/api.h b/src/include/api.h new file mode 100644 index 00000000..bc5bd1a6 --- 
/dev/null +++ b/src/include/api.h @@ -0,0 +1,6 @@ +#ifndef MSCCLPP_API_H_ +#define MSCCLPP_API_H_ + +#define MSCCLPP_API extern "C" __attribute__((visibility("default"))) + +#endif // MSCCLPP_API_H_ diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index c67add94..94cabd58 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -186,8 +186,23 @@ struct mscclppDevConn : mscclppBaseConn struct mscclppConcurrentFifo fifo; }; +struct mscclppHostConn : mscclppBaseConn +{ + void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize); + void put(uint64_t dataOffset, uint64_t dataSize); + void signal(); + void putWithSignal(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize); + void putWithSignal(uint64_t dataOffset, uint64_t dataSize); + void putWithSignalAndFlush(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize); + void putWithSignalAndFlush(uint64_t dataOffset, uint64_t dataSize); + void flush(); + void wait(); + void epochIncrement(); +}; + typedef struct mscclppComm* mscclppComm_t; typedef struct mscclppDevConn mscclppDevConn_t; +typedef struct mscclppHostConn mscclppHostConn_t; #define MSCCLPP_UNIQUE_ID_BYTES 128 typedef struct diff --git a/src/include/proxy.h b/src/include/proxy.h index 8b300919..682164a0 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -3,6 +3,7 @@ #include "comm.h" #include "mscclpp.h" +#include #include #include @@ -15,11 +16,20 @@ typedef enum MSCCLPP_PROXY_RUN_STATE_EXITING, } mscclppProxyRunState_t; +// TODO: virtual functions struct mscclppProxyFifo +{ + // virtual mscclppResult_t create() = 0; + // virtual mscclppResult_t destroy() = 0; + // virtual mscclppResult_t poll(mscclppTrigger*) = 0; + // virtual mscclppResult_t pop() = 0; + // virtual mscclppResult_t flushTail(bool) = 0; +}; + +struct mscclppProxyDevFifo : mscclppProxyFifo { mscclppResult_t create(); mscclppResult_t destroy(); - mscclppResult_t poll(mscclppTrigger* trigger); mscclppResult_t pop(); 
mscclppResult_t flushTail(bool sync = false); @@ -52,6 +62,24 @@ struct mscclppProxyFifo cudaStream_t stream; }; +struct mscclppProxyHostFifo : mscclppProxyFifo +{ + mscclppResult_t create(); + mscclppResult_t destroy(); + mscclppResult_t poll(mscclppTrigger* trigger); + mscclppResult_t pop(); + mscclppResult_t flushTail(bool sync = false); + + // fifo cudaHostCalloc'ed that is produced by device and consumed by host + mscclppTrigger* triggerFifo; + + // allocated on the device and only accessed by the device + std::atomic* fifoHead; + + // + uint64_t fifoTailHost; +}; + struct mscclppProxyState { mscclppTransport_t transportType; @@ -62,7 +90,8 @@ struct mscclppProxyState struct mscclppIbContext* ibContext; // For IB connection only cudaStream_t p2pStream; // for P2P DMA engine only - struct mscclppProxyFifo fifo; + struct mscclppProxyDevFifo devFifo; + struct mscclppProxyHostFifo hostFifo; }; mscclppResult_t mscclppProxyCreate(struct mscclppComm* comm); diff --git a/src/init.cc b/src/init.cc index daaac49d..10215984 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1,4 +1,5 @@ #include "alloc.h" +#include "api.h" #include "bootstrap.h" #include "checks.h" #include "config.h" @@ -12,8 +13,6 @@ #include "npkit/npkit.h" #endif -#define MSCCLPP_API(ret, func, args...) 
extern "C" __attribute__((visibility("default"))) ret func(args) - static uint64_t hashUniqueId(mscclppUniqueId const& id) { char const* bytes = (char const*)&id; @@ -70,8 +69,7 @@ static std::string mscclppShmFileName(mscclppComm_t comm, int rank) return ss.str(); } -MSCCLPP_API(mscclppResult_t, mscclppGetUniqueId, mscclppUniqueId* out); -mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* out) +MSCCLPP_API mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* out) { MSCCLPPCHECK(mscclppInit()); // mscclppCHECK(PtrCheck(out, "GetUniqueId", "out")); @@ -80,15 +78,13 @@ mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* out) return res; } -MSCCLPP_API(mscclppResult_t, mscclppBootstrapAllGather, mscclppComm_t comm, void* data, int size); -mscclppResult_t mscclppBootstrapAllGather(mscclppComm_t comm, void* data, int size) +MSCCLPP_API mscclppResult_t mscclppBootstrapAllGather(mscclppComm_t comm, void* data, int size) { MSCCLPPCHECK(bootstrapAllGather(comm->bootstrap, data, size)); return mscclppSuccess; } -MSCCLPP_API(mscclppResult_t, mscclppCommInitRank, mscclppComm_t* comm, int nranks, const char* ipPortPair, int rank); -mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char* ipPortPair, int rank) +MSCCLPP_API mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char* ipPortPair, int rank) { #if defined(MSCCLPP_USE_GDRCOPY) MSCCLPPCHECK(initGdrCopy()); @@ -133,8 +129,7 @@ fail: return res; } -MSCCLPP_API(mscclppResult_t, mscclppCommInitRankFromId, mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank); -mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank) +MSCCLPP_API mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank) { #if defined(MSCCLPP_USE_GDRCOPY) MSCCLPPCHECK(initGdrCopy()); @@ -174,8 +169,7 @@ fail: return res; } -MSCCLPP_API(mscclppResult_t, mscclppCommDestroy, mscclppComm_t comm); -mscclppResult_t 
mscclppCommDestroy(mscclppComm_t comm) +MSCCLPP_API mscclppResult_t mscclppCommDestroy(mscclppComm_t comm) { #if defined(ENABLE_NPKIT) const char* npkitDumpDir = nullptr; @@ -187,7 +181,8 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm) for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) { struct mscclppProxyState* proxyState = comm->proxyState[i]; if (proxyState) { - MSCCLPPCHECK(proxyState->fifo.destroy()); + MSCCLPPCHECK(proxyState->devFifo.destroy()); + MSCCLPPCHECK(proxyState->hostFifo.destroy()); if (proxyState->p2pStream) CUDACHECK(cudaStreamDestroy(proxyState->p2pStream)); free(proxyState); @@ -228,8 +223,7 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm) return mscclppSuccess; } -MSCCLPP_API(const char*, mscclppGetErrorString, mscclppResult_t code); -const char* mscclppGetErrorString(mscclppResult_t code) +MSCCLPP_API const char* mscclppGetErrorString(mscclppResult_t code) { switch (code) { case mscclppSuccess: @@ -253,9 +247,7 @@ const char* mscclppGetErrorString(mscclppResult_t code) } } -MSCCLPP_API(mscclppResult_t, mscclppGetDeviceConnection, mscclppComm_t comm, int remoteRank, int tag, - mscclppDevConn_t** devConn); -mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, int tag, mscclppDevConn_t** devConn) +MSCCLPP_API mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, int tag, mscclppDevConn_t** devConn) { for (int i = 0; i < comm->nConns; i++) { if (comm->devConns[i].remoteRank == remoteRank && comm->devConns[i].tag == tag) { @@ -267,18 +259,14 @@ mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, i return mscclppInvalidArgument; } -MSCCLPP_API(mscclppResult_t, mscclppGetAllDeviceConnections, mscclppComm_t comm, mscclppDevConn_t** devConns, - int* nConns); -mscclppResult_t mscclppGetAllDeviceConnections(mscclppComm_t comm, mscclppDevConn_t** devConns, int* nConns) +MSCCLPP_API mscclppResult_t mscclppGetAllDeviceConnections(mscclppComm_t comm, 
mscclppDevConn_t** devConns, int* nConns) { *nConns = comm->nConns; *devConns = comm->devConns; return mscclppSuccess; } -MSCCLPP_API(mscclppResult_t, mscclppConnect, mscclppComm_t comm, int remoteRank, int tag, void* localBuff, - uint64_t buffSize, mscclppTransport_t transportType, const char* ibDev); -mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void* localBuff, uint64_t buffSize, +MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void* localBuff, uint64_t buffSize, mscclppTransport_t transportType, const char* ibDev) { // save this processes numa binding and set it to the one closest to the device @@ -367,7 +355,8 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void // If we couldn't find a matching context, create one if (proxyState == NULL) { MSCCLPPCHECK(mscclppCalloc(&proxyState, 1)); - MSCCLPPCHECK(proxyState->fifo.create()); + MSCCLPPCHECK(proxyState->devFifo.create()); + MSCCLPPCHECK(proxyState->hostFifo.create()); if (transportType == mscclppTransportIB) { proxyState->ibContext = conn->ibCtx; @@ -398,12 +387,12 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void conn->devConn->tag = tag; conn->devConn->fifo.connId = comm->nConns; #if defined(MSCCLPP_USE_GDRCOPY) - conn->devConn->fifo.triggerFifo = proxyState->fifo.triggerFifoDev; + conn->devConn->fifo.triggerFifo = proxyState->devFifo.triggerFifoDev; #else - conn->devConn->fifo.triggerFifo = proxyState->fifo.triggerFifo; + conn->devConn->fifo.triggerFifo = proxyState->devFifo.triggerFifo; #endif - conn->devConn->fifo.triggerFifoHead = proxyState->fifo.fifoHead; - conn->devConn->fifo.triggerFifoTail = proxyState->fifo.fifoTailDev; + conn->devConn->fifo.triggerFifoHead = proxyState->devFifo.fifoHead; + conn->devConn->fifo.triggerFifoTail = proxyState->devFifo.fifoTailDev; comm->nConns++; @@ -489,8 +478,7 @@ mscclppResult_t mscclppIbConnectionSetupEnd(struct connInfo* connInfo 
/*input*/, return mscclppSuccess; } -MSCCLPP_API(mscclppResult_t, mscclppConnectionSetup, mscclppComm_t comm); -mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) +MSCCLPP_API mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) { // Send info to peers for (int i = 0; i < comm->nConns; ++i) { @@ -529,9 +517,7 @@ struct bufferInfo mscclppIbMrInfo infoBuffMr; }; -MSCCLPP_API(mscclppResult_t, mscclppRegisterBuffer, mscclppComm_t comm, void* local_memory, size_t size, - mscclppRegisteredMemory* regMem); -mscclppResult_t mscclppRegisterBuffer(mscclppComm_t comm, void* local_memory, size_t size, +MSCCLPP_API mscclppResult_t mscclppRegisterBuffer(mscclppComm_t comm, void* local_memory, size_t size, mscclppRegisteredMemory* regMem) { std::vector ibMrs; @@ -573,9 +559,7 @@ mscclppResult_t mscclppRegisterBuffer(mscclppComm_t comm, void* local_memory, si return mscclppSuccess; } -MSCCLPP_API(mscclppResult_t, mscclppRegisteredBufferWrite, mscclppComm_t comm, mscclppRegisteredMemory* regMem, - void* srcBuff, size_t size, uint32_t srcOffset, uint32_t dstOffset, int64_t stream); -mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, mscclppRegisteredMemory* regMem, void* srcBuff, +MSCCLPP_API mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, mscclppRegisteredMemory* regMem, void* srcBuff, size_t size, uint32_t srcOffset, uint32_t dstOffset, int64_t stream) { int ret = 0; @@ -605,15 +589,13 @@ mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, mscclppRegister // TODO: destroy registered buffer -MSCCLPP_API(mscclppResult_t, mscclppProxyLaunch, mscclppComm_t comm); -mscclppResult_t mscclppProxyLaunch(mscclppComm_t comm) +MSCCLPP_API mscclppResult_t mscclppProxyLaunch(mscclppComm_t comm) { MSCCLPPCHECK(mscclppProxyCreate(comm)); return mscclppSuccess; } -MSCCLPP_API(mscclppResult_t, mscclppBootstrapBarrier, mscclppComm_t comm); -mscclppResult_t mscclppBootstrapBarrier(mscclppComm_t comm) +MSCCLPP_API mscclppResult_t 
mscclppBootstrapBarrier(mscclppComm_t comm) { int* tmp = new int[comm->nRanks]; MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int))); @@ -621,8 +603,7 @@ mscclppResult_t mscclppBootstrapBarrier(mscclppComm_t comm) return mscclppSuccess; } -MSCCLPP_API(mscclppResult_t, mscclppProxyStop, mscclppComm_t comm); -mscclppResult_t mscclppProxyStop(mscclppComm_t comm) +MSCCLPP_API mscclppResult_t mscclppProxyStop(mscclppComm_t comm) { // a barrier to make sure all ranks are done with their work before stopping the proxy MSCCLPPCHECK(mscclppBootstrapBarrier(comm)); @@ -631,8 +612,7 @@ mscclppResult_t mscclppProxyStop(mscclppComm_t comm) return mscclppSuccess; } -MSCCLPP_API(mscclppResult_t, mscclppCommRank, mscclppComm_t comm, int* rank); -mscclppResult_t mscclppCommRank(mscclppComm_t comm, int* rank) +MSCCLPP_API mscclppResult_t mscclppCommRank(mscclppComm_t comm, int* rank) { if (comm == NULL || rank == NULL) { WARN("comm or rank cannot be null"); @@ -642,8 +622,7 @@ mscclppResult_t mscclppCommRank(mscclppComm_t comm, int* rank) return mscclppSuccess; } -MSCCLPP_API(mscclppResult_t, mscclppCommSize, mscclppComm_t comm, int* size); -mscclppResult_t mscclppCommSize(mscclppComm_t comm, int* size) +MSCCLPP_API mscclppResult_t mscclppCommSize(mscclppComm_t comm, int* size) { if (comm == NULL || size == NULL) { WARN("comm or size cannot be null"); @@ -653,22 +632,76 @@ mscclppResult_t mscclppCommSize(mscclppComm_t comm, int* size) return mscclppSuccess; } -MSCCLPP_API(void, mscclppDefaultLogHandler, const char* msg); -void mscclppDefaultLogHandler(const char* msg) +MSCCLPP_API void mscclppDefaultLogHandler(const char* msg) { mscclppDebugDefaultLogHandler(msg); } -MSCCLPP_API(mscclppResult_t, mscclppSetLogHandler, mscclppLogHandler_t handler); -mscclppResult_t mscclppSetLogHandler(mscclppLogHandler_t handler) +MSCCLPP_API mscclppResult_t mscclppSetLogHandler(mscclppLogHandler_t handler) { return mscclppDebugSetLogHandler(handler); } -MSCCLPP_API(mscclppResult_t, 
mscclppSetBootstrapConnTimeout, int timeout); -mscclppResult_t mscclppSetBootstrapConnTimeout(int timeout) +MSCCLPP_API mscclppResult_t mscclppSetBootstrapConnTimeout(int timeout) { mscclppConfig* config = mscclppConfig::getInstance(); config->setBootstrapConnectionTimeoutConfig(timeout); return mscclppSuccess; } + +static inline uint64_t hostFifoPush(uint64_t type, uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) +{ + +} + +MSCCLPP_API void mscclppHostConn::put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) +{ + +} + + +MSCCLPP_API void mscclppHostConn::put(uint64_t dataOffset, uint64_t dataSize) +{ + +} + +MSCCLPP_API void mscclppHostConn::signal() +{ + +} + +MSCCLPP_API void mscclppHostConn::putWithSignal(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) +{ + +} + +MSCCLPP_API void mscclppHostConn::putWithSignal(uint64_t dataOffset, uint64_t dataSize) +{ + +} + +MSCCLPP_API void mscclppHostConn::putWithSignalAndFlush(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) +{ + +} + +MSCCLPP_API void mscclppHostConn::putWithSignalAndFlush(uint64_t dataOffset, uint64_t dataSize) +{ + +} + +MSCCLPP_API void mscclppHostConn::flush() +{ + +} + +MSCCLPP_API void mscclppHostConn::wait() +{ + +} + +MSCCLPP_API void mscclppHostConn::epochIncrement() +{ + +} + diff --git a/src/proxy.cc b/src/proxy.cc index cda01466..97df77b4 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -40,12 +40,6 @@ struct proxyArgs struct mscclppProxyState* proxyState; }; -static void readTrigger(mscclppTrigger* dst, mscclppTrigger* src) -{ - __m128i xmm0 = _mm_load_si128((__m128i*)src); - _mm_store_si128((__m128i*)dst, xmm0); -} - #if defined(ENABLE_NPKIT) static void npkitInitReqIds(struct mscclppComm* comm) @@ -93,7 +87,7 @@ static void npkitCollectExitEvents(struct mscclppConn* conn, uint8_t type, int c #endif -mscclppResult_t mscclppProxyFifo::create() +mscclppResult_t mscclppProxyDevFifo::create() { 
MSCCLPPCHECK(mscclppCudaCalloc(&this->fifoHead, 1)); #if defined(MSCCLPP_USE_GDRCOPY) @@ -110,7 +104,7 @@ mscclppResult_t mscclppProxyFifo::create() return mscclppSuccess; } -mscclppResult_t mscclppProxyFifo::destroy() +mscclppResult_t mscclppProxyDevFifo::destroy() { MSCCLPPCHECK(mscclppCudaFree(this->fifoHead)); #if defined(MSCCLPP_USE_GDRCOPY) @@ -125,21 +119,21 @@ mscclppResult_t mscclppProxyFifo::destroy() } // return true if the trigger is valid -mscclppResult_t mscclppProxyFifo::poll(mscclppTrigger* trigger) +mscclppResult_t mscclppProxyDevFifo::poll(mscclppTrigger* trigger) { __m128i xmm0 = _mm_load_si128((__m128i*)&this->triggerFifo[this->fifoTailHost % MSCCLPP_PROXY_FIFO_SIZE]); _mm_store_si128((__m128i*)trigger, xmm0); return mscclppSuccess; } -mscclppResult_t mscclppProxyFifo::pop() +mscclppResult_t mscclppProxyDevFifo::pop() { *(volatile uint64_t*)(&this->triggerFifo[this->fifoTailHost % MSCCLPP_PROXY_FIFO_SIZE]) = 0; (this->fifoTailHost)++; return mscclppSuccess; } -mscclppResult_t mscclppProxyFifo::flushTail(bool sync) +mscclppResult_t mscclppProxyDevFifo::flushTail(bool sync) { // Flush the tail to device memory. This is either triggered every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER to make sure // that the fifo can make progress even if there is no request mscclppSync. 
However, mscclppSync type is for flush @@ -156,6 +150,40 @@ mscclppResult_t mscclppProxyFifo::flushTail(bool sync) return mscclppSuccess; } +mscclppResult_t mscclppProxyHostFifo::create() +{ + MSCCLPPCHECK(mscclppCalloc(&this->fifoHead, 1)); + MSCCLPPCHECK(mscclppCalloc(&this->triggerFifo, MSCCLPP_PROXY_FIFO_SIZE)); + this->fifoTailHost = 0; + return mscclppSuccess; +} + +mscclppResult_t mscclppProxyHostFifo::destroy() +{ + free(this->fifoHead); + free(this->triggerFifo); + return mscclppSuccess; +} + +mscclppResult_t mscclppProxyHostFifo::poll(mscclppTrigger* trigger) +{ + __m128i xmm0 = _mm_load_si128((__m128i*)&this->triggerFifo[this->fifoTailHost % MSCCLPP_PROXY_FIFO_SIZE]); + _mm_store_si128((__m128i*)trigger, xmm0); + return mscclppSuccess; +} + +mscclppResult_t mscclppProxyHostFifo::pop() +{ + *(volatile uint64_t*)(&this->triggerFifo[this->fifoTailHost % MSCCLPP_PROXY_FIFO_SIZE]) = 0; + (this->fifoTailHost)++; + return mscclppSuccess; +} + +mscclppResult_t mscclppProxyHostFifo::flushTail(bool) +{ + return mscclppSuccess; +} + void* mscclppProxyService(void* _args) { struct proxyArgs* args = (struct proxyArgs*)_args; @@ -164,9 +192,8 @@ void* mscclppProxyService(void* _args) // from this point on, proxy thread will stay close to the device PROXYMSCCLPPCHECK(numaBind(comm->devNumaNode)); - struct mscclppProxyFifo* fifo = &args->proxyState->fifo; + struct mscclppProxyDevFifo* fifo = &args->proxyState->devFifo; volatile mscclppProxyRunState_t* run = &args->proxyState->run; - mscclppTrigger trigger; mscclppIbContext* ibCtx = args->proxyState->ibContext; cudaStream_t p2pStream = args->proxyState->p2pStream; From 459c56d3cb6732c66e574653cbbc430445c2cdc6 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 11 Apr 2023 22:52:47 +0000 Subject: [PATCH 014/135] not compiling -- wip for hostfunctions --- src/include/mscclpp.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 
94cabd58..b5390147 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -13,6 +13,7 @@ #include #include +#includa #ifdef __cplusplus extern "C" { @@ -188,9 +189,23 @@ struct mscclppDevConn : mscclppBaseConn struct mscclppHostConn : mscclppBaseConn { - void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize); - void put(uint64_t dataOffset, uint64_t dataSize); - void signal(); + void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize){ + conn->ibQp->stageSend(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)dataSize, + /*wrId=*/0, /*srcOffset=*/srcDataOffset, + /*dstOffset=*/dstDataOffset, + /*signaled=*/false); + int ret = conn->ibQp->postSend(); + if (ret != 0) { + // Return value is errno. + WARN("data postSend failed: errno %d", ret); + } + } + void put(uint64_t dataOffset, uint64_t dataSize){ + put(dataOffset, dataOffset, dataSize); + } + void signal(){ + + } void putWithSignal(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize); void putWithSignal(uint64_t dataOffset, uint64_t dataSize); void putWithSignalAndFlush(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize); @@ -198,6 +213,8 @@ struct mscclppHostConn : mscclppBaseConn void flush(); void wait(); void epochIncrement(); + struct mscclppConn* conn; + cudaStream_t p2pStream; }; typedef struct mscclppComm* mscclppComm_t; From 9124856ea4dca4f635c93039bdae823419c573b0 Mon Sep 17 00:00:00 2001 From: Madan Musuvathi Date: Wed, 12 Apr 2023 01:36:06 +0000 Subject: [PATCH 015/135] first version hostConn --- src/include/checks.h | 9 +++ src/include/comm.h | 4 ++ src/include/mscclpp.h | 79 +++++++++--------------- src/init.cc | 139 ++++++++++++++++++++++++++---------------- src/proxy.cc | 117 +++++++++++------------------------ 5 files changed, 166 insertions(+), 182 deletions(-) diff --git a/src/include/checks.h b/src/include/checks.h index f93945c7..fb86fd66 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -20,6 +20,15 @@ 
} \ } while (false) +#define CUDACHECKNORET(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ + return; \ + } \ + } while (false) + #define CUDACHECKGOTO(cmd, res, label) \ do { \ cudaError_t err = cmd; \ diff --git a/src/include/comm.h b/src/include/comm.h index 38abd438..c4927143 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -16,12 +16,16 @@ #define MAXCONNECTIONS 64 + + struct mscclppConn { mscclppTransport_t transport; int remoteRank; uint64_t buffSize; struct mscclppDevConn* devConn; + struct mscclppHostConn* hostConn; + struct mscclppIbContext* ibCtx; struct mscclppIbQp* ibQp; struct mscclppIbMr* ibBuffMr; diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index b5390147..11e0cbf6 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -13,7 +13,7 @@ #include #include -#includa +// #includa #ifdef __cplusplus extern "C" { @@ -29,27 +29,6 @@ struct alignas(16) mscclppDevConnSignalEpochId uint64_t proxy; }; -struct mscclppBaseConn -{ - int remoteRank; - int tag; - - // my local buffer - void* localBuff; - - struct mscclppDevConnSignalEpochId* localSignalEpochId; - // used by the signal() function directly from gpu - struct mscclppDevConnSignalEpochId* remoteSignalEpochId; - - // every wait(), increaments this and then the gpu waits for either: - // 1) localSignalEpochId->proxy to be >= this in case of a proxy thread - // 2) remoteSignalEpochId->device to be >= this in case of a gpu thread - uint64_t* waitEpochId; - - // my remote peer's buffer. only non-NULL with gpu's direct access - // gpu can directly write into it - void* remoteBuff; -}; /*************************************************************************************************************** * A mscclppDevConn provides a zero-copy connection between two GPUs connected via P2P NVLink or InfiniBand. 
@@ -113,7 +92,7 @@ struct mscclppBaseConn * The two endpoint can concurrently use the same connection provided they are writing (puts) on different * indices in the registered buffer. **************************************************************************************************************/ -struct mscclppDevConn : mscclppBaseConn +struct mscclppDevConn { #ifdef __CUDACC__ __forceinline__ __device__ void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) @@ -185,36 +164,34 @@ struct mscclppDevConn : mscclppBaseConn // this is a concurrent fifo which is multiple threads from the device // can produce for and the sole proxy thread consumes it. struct mscclppConcurrentFifo fifo; + + int remoteRank; + int tag; + + // my local buffer + void* localBuff; + + struct mscclppDevConnSignalEpochId* localSignalEpochId; + // used by the signal() function directly from gpu + struct mscclppDevConnSignalEpochId* remoteSignalEpochId; + + // every wait(), increaments this and then the gpu waits for either: + // 1) localSignalEpochId->proxy to be >= this in case of a proxy thread + // 2) remoteSignalEpochId->device to be >= this in case of a gpu thread + uint64_t* waitEpochId; + + // my remote peer's buffer. only non-NULL with gpu's direct access + // gpu can directly write into it + void* remoteBuff; + }; -struct mscclppHostConn : mscclppBaseConn -{ - void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize){ - conn->ibQp->stageSend(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)dataSize, - /*wrId=*/0, /*srcOffset=*/srcDataOffset, - /*dstOffset=*/dstDataOffset, - /*signaled=*/false); - int ret = conn->ibQp->postSend(); - if (ret != 0) { - // Return value is errno. 
- WARN("data postSend failed: errno %d", ret); - } - } - void put(uint64_t dataOffset, uint64_t dataSize){ - put(dataOffset, dataOffset, dataSize); - } - void signal(){ - - } - void putWithSignal(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize); - void putWithSignal(uint64_t dataOffset, uint64_t dataSize); - void putWithSignalAndFlush(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize); - void putWithSignalAndFlush(uint64_t dataOffset, uint64_t dataSize); - void flush(); - void wait(); - void epochIncrement(); - struct mscclppConn* conn; - cudaStream_t p2pStream; +// Host interface for mscclppDevCon functionality +struct mscclppHostConn{ + virtual void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) = 0; + virtual void signal() = 0; + virtual void wait() = 0; + virtual void flush() = 0; }; typedef struct mscclppComm* mscclppComm_t; diff --git a/src/init.cc b/src/init.cc index 10215984..16a57333 100644 --- a/src/init.cc +++ b/src/init.cc @@ -266,6 +266,84 @@ MSCCLPP_API mscclppResult_t mscclppGetAllDeviceConnections(mscclppComm_t comm, m return mscclppSuccess; } + +struct mscclppHostP2PConn : mscclppHostConn{ + mscclppHostP2PConn(mscclppConn* _conn, cudaStream_t _stream) : conn(_conn), p2pStream(_stream){} + + void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize){ + void* srcBuff = (void*)((char*)conn->devConn->localBuff + srcDataOffset); + void* dstBuff = (void*)((char*)conn->devConn->remoteBuff + dstDataOffset); + CUDACHECKNORET(cudaMemcpyAsync(dstBuff, srcBuff, dataSize, cudaMemcpyDeviceToDevice, p2pStream)); + } + void signal(){ + CUDACHECKNORET(cudaMemcpyAsync(&conn->devConn->remoteSignalEpochId->proxy, + &(conn->devConn->localSignalEpochId->device), sizeof(uint64_t), + cudaMemcpyDeviceToDevice, p2pStream)); + } + void wait(){} + void flush(){ + CUDACHECKNORET(cudaStreamSynchronize(p2pStream)); + } + + mscclppConn* conn; + cudaStream_t p2pStream; +}; + +struct mscclppHostIBConn : 
mscclppHostConn{ + mscclppHostIBConn(mscclppConn* conn) : conn(conn) {} + + void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize){ + conn->ibQp->stageSend(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)dataSize, + /*wrId=*/0, /*srcOffset=*/srcDataOffset, + /*dstOffset=*/dstDataOffset, + /*signaled=*/false); + int ret = conn->ibQp->postSend(); + if (ret != 0) { + // Return value is errno. + WARN("data postSend failed: errno %d", ret); + } + } + void signal(){ + // My local device flag is copied to the remote's proxy flag + conn->ibQp->stageSend(conn->ibSignalEpochIdMr, &conn->ibSignalEpochIdMrInfo, sizeof(uint64_t), + /*wrId=*/0, /*srcOffset=*/0, /*dstOffset=*/sizeof(uint64_t), /*signaled=*/true); + int ret = conn->ibQp->postSend(); + if (ret != 0) { + WARN("flag postSend failed: errno %d", ret); + } + } + void wait(){} + void flush(){ + bool isWaiting = true; + while (isWaiting) { + int wcNum = conn->ibQp->pollCq(); + if (wcNum < 0) { + WARN("pollCq failed: errno %d", errno); + continue; + } + for (int i = 0; i < wcNum; ++i) { + struct ibv_wc* wc = &conn->ibQp->wcs[i]; + if (wc->status != IBV_WC_SUCCESS) { + WARN("wc status %d", wc->status); + continue; + } + if (wc->qp_num != conn->ibQp->qp->qp_num) { + WARN("got wc of unknown qp_num %d", wc->qp_num); + continue; + } + if (wc->opcode == IBV_WC_RDMA_WRITE) { + isWaiting = false; + break; + } + } + } + } + + mscclppConn* conn; +}; + + + MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void* localBuff, uint64_t buffSize, mscclppTransport_t transportType, const char* ibDev) { @@ -318,8 +396,9 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i } // Set the ib context for this conn conn->ibCtx = comm->ibContext[ibDevIdx]; + } else if (transportType == mscclppTransportP2P) { - // No allocation needed for P2P proxy + // do the rest of the initialization later } else if (transportType == mscclppTransportSHM) { WARN("Shared memory 
interconnection is not implemented yet!"); return mscclppInternalError; @@ -377,6 +456,14 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i return mscclppInternalError; } + if (transportType == mscclppTransportIB) { + conn->hostConn = new mscclppHostIBConn(conn); + } + else if (transportType == mscclppTransportP2P) { + conn->hostConn = new mscclppHostP2PConn(conn, proxyState->p2pStream); + } + + struct mscclppDevConn* devConn = &comm->devConns[comm->nConns]; conn->devConn = devConn; @@ -654,54 +741,4 @@ static inline uint64_t hostFifoPush(uint64_t type, uint64_t dstDataOffset, uint6 } -MSCCLPP_API void mscclppHostConn::put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) -{ - -} - - -MSCCLPP_API void mscclppHostConn::put(uint64_t dataOffset, uint64_t dataSize) -{ - -} - -MSCCLPP_API void mscclppHostConn::signal() -{ - -} - -MSCCLPP_API void mscclppHostConn::putWithSignal(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) -{ - -} - -MSCCLPP_API void mscclppHostConn::putWithSignal(uint64_t dataOffset, uint64_t dataSize) -{ - -} - -MSCCLPP_API void mscclppHostConn::putWithSignalAndFlush(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) -{ - -} - -MSCCLPP_API void mscclppHostConn::putWithSignalAndFlush(uint64_t dataOffset, uint64_t dataSize) -{ - -} - -MSCCLPP_API void mscclppHostConn::flush() -{ - -} - -MSCCLPP_API void mscclppHostConn::wait() -{ - -} - -MSCCLPP_API void mscclppHostConn::epochIncrement() -{ - -} diff --git a/src/proxy.cc b/src/proxy.cc index 97df77b4..d5b99e9d 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -184,21 +184,47 @@ mscclppResult_t mscclppProxyHostFifo::flushTail(bool) return mscclppSuccess; } + +void processTrigger(const mscclppTrigger trigger, mscclppConn* conn, mscclppProxyState* proxyState){ + mscclppIbContext* ibCtx = proxyState->ibContext; + bool isP2pProxy = (ibCtx == nullptr); + + // Iterate over what send is needed + if (trigger.fields.type & 
mscclppData) { + conn->hostConn->put(trigger.fields.dstDataOffset, trigger.fields.srcDataOffset, trigger.fields.dataSize); + + npkitCollectEntryEvent(conn, isP2pProxy ? NPKIT_EVENT_DMA_SEND_DATA_ENTRY : NPKIT_EVENT_IB_SEND_DATA_ENTRY, + (uint32_t)trigger.fields.dataSize, trigger.fields.connId); + } + + if (trigger.fields.type & mscclppFlag) { + conn->hostConn->signal(); + + npkitCollectEntryEvent(conn, isP2pProxy ? NPKIT_EVENT_P2P_SEND_FLAG_ENTRY : NPKIT_EVENT_IB_SEND_FLAG_ENTRY, + (uint32_t)sizeof(uint64_t), trigger.fields.connId); + } + + // Wait for completion + if (trigger.fields.type & mscclppSync) { + conn->hostConn->flush(); + npkitCollectExitEvents(conn, isP2pProxy? NPKIT_EVENT_DMA_SEND_EXIT : NPKIT_EVENT_IB_SEND_EXIT, trigger.fields.connId); + } +} + + void* mscclppProxyService(void* _args) { struct proxyArgs* args = (struct proxyArgs*)_args; struct mscclppComm* comm = args->comm; + struct mscclppProxyState* proxyState = args->proxyState; + free(_args); // allocated in mscclppProxyCreate // from this point on, proxy thread will stay close to the device PROXYMSCCLPPCHECK(numaBind(comm->devNumaNode)); - struct mscclppProxyDevFifo* fifo = &args->proxyState->devFifo; - volatile mscclppProxyRunState_t* run = &args->proxyState->run; + struct mscclppProxyDevFifo* fifo = &proxyState->devFifo; + volatile mscclppProxyRunState_t* run = &proxyState->run; mscclppTrigger trigger; - mscclppIbContext* ibCtx = args->proxyState->ibContext; - cudaStream_t p2pStream = args->proxyState->p2pStream; - bool isP2pProxy = (ibCtx == nullptr); - free(_args); // allocated in mscclppProxyCreate npkitInitReqIds(comm); @@ -216,80 +242,9 @@ void* mscclppProxyService(void* _args) if (trigger.value[0] == 0) { continue; // there is one in progreess } - - struct mscclppConn* conn = &comm->conns[trigger.fields.connId]; - int ret = 0; - // Iterate over what send is needed - if (trigger.fields.type & mscclppData) { - if (isP2pProxy) { - void* srcBuff = (void*)((char*)conn->devConn->localBuff + 
trigger.fields.srcDataOffset); - void* dstBuff = (void*)((char*)conn->devConn->remoteBuff + trigger.fields.dstDataOffset); - PROXYCUDACHECK(cudaMemcpyAsync(dstBuff, srcBuff, trigger.fields.dataSize, cudaMemcpyDeviceToDevice, p2pStream)); - npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)trigger.fields.dataSize, - trigger.fields.connId); - } else { - conn->ibQp->stageSend(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)trigger.fields.dataSize, - /*wrId=*/0, /*srcOffset=*/trigger.fields.srcDataOffset, - /*dstOffset=*/trigger.fields.dstDataOffset, - /*signaled=*/false); - if ((ret = conn->ibQp->postSend()) != 0) { - // Return value is errno. - WARN("data postSend failed: errno %d", ret); - } - npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_DATA_ENTRY, (uint32_t)trigger.fields.dataSize, - trigger.fields.connId); - } - } - if (trigger.fields.type & mscclppFlag) { - if (isP2pProxy) { - PROXYCUDACHECK(cudaMemcpyAsync(&conn->devConn->remoteSignalEpochId->proxy, - &(conn->devConn->localSignalEpochId->device), sizeof(uint64_t), - cudaMemcpyDeviceToDevice, p2pStream)); - npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_FLAG_ENTRY, (uint32_t)sizeof(uint64_t), - trigger.fields.connId); - } else { - // My local device flag is copied to the remote's proxy flag - conn->ibQp->stageSend(conn->ibSignalEpochIdMr, &conn->ibSignalEpochIdMrInfo, sizeof(uint64_t), - /*wrId=*/0, /*srcOffset=*/0, /*dstOffset=*/sizeof(uint64_t), /*signaled=*/true); - if ((ret = conn->ibQp->postSend()) != 0) { - WARN("flag postSend failed: errno %d", ret); - } - npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_FLAG_ENTRY, (uint32_t)sizeof(uint64_t), trigger.fields.connId); - } - } - // Wait for completion - if (trigger.fields.type & mscclppSync) { - if (isP2pProxy) { - PROXYCUDACHECK(cudaStreamSynchronize(p2pStream)); - npkitCollectExitEvents(conn, NPKIT_EVENT_DMA_SEND_EXIT, trigger.fields.connId); - } else { - int rank = comm->rank; - bool isWaiting = true; - while (isWaiting) { - 
int wcNum = conn->ibQp->pollCq(); - if (wcNum < 0) { - WARN("rank %d pollCq failed: errno %d", rank, errno); - continue; - } - for (int i = 0; i < wcNum; ++i) { - struct ibv_wc* wc = &conn->ibQp->wcs[i]; - if (wc->status != IBV_WC_SUCCESS) { - WARN("rank %d wc status %d", rank, wc->status); - continue; - } - if (wc->qp_num != conn->ibQp->qp->qp_num) { - WARN("rank %d got wc of unknown qp_num %d", rank, wc->qp_num); - continue; - } - if (wc->opcode == IBV_WC_RDMA_WRITE) { - isWaiting = false; - break; - } - } - } - npkitCollectExitEvents(conn, NPKIT_EVENT_IB_SEND_EXIT, trigger.fields.connId); - } - } + + mscclppConn* conn = &comm->conns[trigger.fields.connId]; + processTrigger(trigger, conn, proxyState); // Send completion: reset only the high 64 bits PROXYMSCCLPPCHECK(fifo->pop()); @@ -303,7 +258,9 @@ void* mscclppProxyService(void* _args) // make sure the tail is flushed before we shut the proxy PROXYMSCCLPPCHECK(fifo->flushTail(/*sync=*/true)); + bool isP2pProxy = (proxyState->ibContext == nullptr); if (isP2pProxy) { + cudaStream_t p2pStream = proxyState->p2pStream; PROXYCUDACHECK(cudaStreamSynchronize(p2pStream)); } *run = MSCCLPP_PROXY_RUN_STATE_IDLE; From edc3c237ed246f1ce684bc47de1f5424a41b5225 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 12 Apr 2023 04:09:12 +0000 Subject: [PATCH 016/135] deleteing hostconn --- src/init.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/init.cc b/src/init.cc index 16a57333..67f4c466 100644 --- a/src/init.cc +++ b/src/init.cc @@ -200,6 +200,8 @@ MSCCLPP_API mscclppResult_t mscclppCommDestroy(mscclppComm_t comm) if (conn) { MSCCLPPCHECK(mscclppCudaFree(conn->devConn->localSignalEpochId)); MSCCLPPCHECK(mscclppCudaFree(conn->devConn->waitEpochId)); + if (conn->hostConn) + delete conn->hostConn; } } From fd3f9281086680320a1f0df9e5a5b1975c9749f0 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 12 Apr 2023 08:08:19 +0000 Subject: [PATCH 017/135] remove hostFifo & rename devFifo to just fifo --- 
src/include/proxy.h | 31 +----------------------------- src/init.cc | 21 ++++++-------------- src/proxy.cc | 47 ++++++--------------------------------------- 3 files changed, 13 insertions(+), 86 deletions(-) diff --git a/src/include/proxy.h b/src/include/proxy.h index 682164a0..3da0196c 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -16,17 +16,7 @@ typedef enum MSCCLPP_PROXY_RUN_STATE_EXITING, } mscclppProxyRunState_t; -// TODO: virtual functions struct mscclppProxyFifo -{ - // virtual mscclppResult_t create() = 0; - // virtual mscclppResult_t destroy() = 0; - // virtual mscclppResult_t poll(mscclppTrigger*) = 0; - // virtual mscclppResult_t pop() = 0; - // virtual mscclppResult_t flushTail(bool) = 0; -}; - -struct mscclppProxyDevFifo : mscclppProxyFifo { mscclppResult_t create(); mscclppResult_t destroy(); @@ -62,24 +52,6 @@ struct mscclppProxyDevFifo : mscclppProxyFifo cudaStream_t stream; }; -struct mscclppProxyHostFifo : mscclppProxyFifo -{ - mscclppResult_t create(); - mscclppResult_t destroy(); - mscclppResult_t poll(mscclppTrigger* trigger); - mscclppResult_t pop(); - mscclppResult_t flushTail(bool sync = false); - - // fifo cudaHostCalloc'ed that is produced by device and consumed by host - mscclppTrigger* triggerFifo; - - // allocated on the device and only accessed by the device - std::atomic* fifoHead; - - // - uint64_t fifoTailHost; -}; - struct mscclppProxyState { mscclppTransport_t transportType; @@ -90,8 +62,7 @@ struct mscclppProxyState struct mscclppIbContext* ibContext; // For IB connection only cudaStream_t p2pStream; // for P2P DMA engine only - struct mscclppProxyDevFifo devFifo; - struct mscclppProxyHostFifo hostFifo; + struct mscclppProxyFifo fifo; }; mscclppResult_t mscclppProxyCreate(struct mscclppComm* comm); diff --git a/src/init.cc b/src/init.cc index 67f4c466..aaf655ea 100644 --- a/src/init.cc +++ b/src/init.cc @@ -181,8 +181,7 @@ MSCCLPP_API mscclppResult_t mscclppCommDestroy(mscclppComm_t comm) for (int i = 0; i < 
MSCCLPP_PROXY_MAX_NUM; ++i) { struct mscclppProxyState* proxyState = comm->proxyState[i]; if (proxyState) { - MSCCLPPCHECK(proxyState->devFifo.destroy()); - MSCCLPPCHECK(proxyState->hostFifo.destroy()); + MSCCLPPCHECK(proxyState->fifo.destroy()); if (proxyState->p2pStream) CUDACHECK(cudaStreamDestroy(proxyState->p2pStream)); free(proxyState); @@ -436,8 +435,7 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i // If we couldn't find a matching context, create one if (proxyState == NULL) { MSCCLPPCHECK(mscclppCalloc(&proxyState, 1)); - MSCCLPPCHECK(proxyState->devFifo.create()); - MSCCLPPCHECK(proxyState->hostFifo.create()); + MSCCLPPCHECK(proxyState->fifo.create()); if (transportType == mscclppTransportIB) { proxyState->ibContext = conn->ibCtx; @@ -476,12 +474,12 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i conn->devConn->tag = tag; conn->devConn->fifo.connId = comm->nConns; #if defined(MSCCLPP_USE_GDRCOPY) - conn->devConn->fifo.triggerFifo = proxyState->devFifo.triggerFifoDev; + conn->devConn->fifo.triggerFifo = proxyState->fifo.triggerFifoDev; #else - conn->devConn->fifo.triggerFifo = proxyState->devFifo.triggerFifo; + conn->devConn->fifo.triggerFifo = proxyState->fifo.triggerFifo; #endif - conn->devConn->fifo.triggerFifoHead = proxyState->devFifo.fifoHead; - conn->devConn->fifo.triggerFifoTail = proxyState->devFifo.fifoTailDev; + conn->devConn->fifo.triggerFifoHead = proxyState->fifo.fifoHead; + conn->devConn->fifo.triggerFifoTail = proxyState->fifo.fifoTailDev; comm->nConns++; @@ -737,10 +735,3 @@ MSCCLPP_API mscclppResult_t mscclppSetBootstrapConnTimeout(int timeout) config->setBootstrapConnectionTimeoutConfig(timeout); return mscclppSuccess; } - -static inline uint64_t hostFifoPush(uint64_t type, uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) -{ - -} - - diff --git a/src/proxy.cc b/src/proxy.cc index d5b99e9d..d7e291e2 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ 
-87,7 +87,7 @@ static void npkitCollectExitEvents(struct mscclppConn* conn, uint8_t type, int c #endif -mscclppResult_t mscclppProxyDevFifo::create() +mscclppResult_t mscclppProxyFifo::create() { MSCCLPPCHECK(mscclppCudaCalloc(&this->fifoHead, 1)); #if defined(MSCCLPP_USE_GDRCOPY) @@ -104,7 +104,7 @@ mscclppResult_t mscclppProxyDevFifo::create() return mscclppSuccess; } -mscclppResult_t mscclppProxyDevFifo::destroy() +mscclppResult_t mscclppProxyFifo::destroy() { MSCCLPPCHECK(mscclppCudaFree(this->fifoHead)); #if defined(MSCCLPP_USE_GDRCOPY) @@ -119,21 +119,21 @@ mscclppResult_t mscclppProxyDevFifo::destroy() } // return true if the trigger is valid -mscclppResult_t mscclppProxyDevFifo::poll(mscclppTrigger* trigger) +mscclppResult_t mscclppProxyFifo::poll(mscclppTrigger* trigger) { __m128i xmm0 = _mm_load_si128((__m128i*)&this->triggerFifo[this->fifoTailHost % MSCCLPP_PROXY_FIFO_SIZE]); _mm_store_si128((__m128i*)trigger, xmm0); return mscclppSuccess; } -mscclppResult_t mscclppProxyDevFifo::pop() +mscclppResult_t mscclppProxyFifo::pop() { *(volatile uint64_t*)(&this->triggerFifo[this->fifoTailHost % MSCCLPP_PROXY_FIFO_SIZE]) = 0; (this->fifoTailHost)++; return mscclppSuccess; } -mscclppResult_t mscclppProxyDevFifo::flushTail(bool sync) +mscclppResult_t mscclppProxyFifo::flushTail(bool sync) { // Flush the tail to device memory. This is either triggered every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER to make sure // that the fifo can make progress even if there is no request mscclppSync. 
However, mscclppSync type is for flush @@ -150,41 +150,6 @@ mscclppResult_t mscclppProxyDevFifo::flushTail(bool sync) return mscclppSuccess; } -mscclppResult_t mscclppProxyHostFifo::create() -{ - MSCCLPPCHECK(mscclppCalloc(&this->fifoHead, 1)); - MSCCLPPCHECK(mscclppCalloc(&this->triggerFifo, MSCCLPP_PROXY_FIFO_SIZE)); - this->fifoTailHost = 0; - return mscclppSuccess; -} - -mscclppResult_t mscclppProxyHostFifo::destroy() -{ - free(this->fifoHead); - free(this->triggerFifo); - return mscclppSuccess; -} - -mscclppResult_t mscclppProxyHostFifo::poll(mscclppTrigger* trigger) -{ - __m128i xmm0 = _mm_load_si128((__m128i*)&this->triggerFifo[this->fifoTailHost % MSCCLPP_PROXY_FIFO_SIZE]); - _mm_store_si128((__m128i*)trigger, xmm0); - return mscclppSuccess; -} - -mscclppResult_t mscclppProxyHostFifo::pop() -{ - *(volatile uint64_t*)(&this->triggerFifo[this->fifoTailHost % MSCCLPP_PROXY_FIFO_SIZE]) = 0; - (this->fifoTailHost)++; - return mscclppSuccess; -} - -mscclppResult_t mscclppProxyHostFifo::flushTail(bool) -{ - return mscclppSuccess; -} - - void processTrigger(const mscclppTrigger trigger, mscclppConn* conn, mscclppProxyState* proxyState){ mscclppIbContext* ibCtx = proxyState->ibContext; bool isP2pProxy = (ibCtx == nullptr); @@ -222,7 +187,7 @@ void* mscclppProxyService(void* _args) // from this point on, proxy thread will stay close to the device PROXYMSCCLPPCHECK(numaBind(comm->devNumaNode)); - struct mscclppProxyDevFifo* fifo = &proxyState->devFifo; + struct mscclppProxyFifo* fifo = &proxyState->fifo; volatile mscclppProxyRunState_t* run = &proxyState->run; mscclppTrigger trigger; From bc729cd48161d93763e0a89f086e8e27abff46e6 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 12 Apr 2023 09:05:42 +0000 Subject: [PATCH 018/135] Move MRs / MR infos to mscclppHostIBConn & cleanup --- src/include/comm.h | 5 +-- src/include/mscclpp.h | 1 + src/init.cc | 88 ++++++++++++++++++++++++++++++++++++------- src/proxy.cc | 66 ++------------------------------ 4 files 
changed, 80 insertions(+), 80 deletions(-) diff --git a/src/include/comm.h b/src/include/comm.h index c4927143..366659d5 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -20,6 +20,7 @@ struct mscclppConn { + int connId; mscclppTransport_t transport; int remoteRank; uint64_t buffSize; @@ -28,10 +29,6 @@ struct mscclppConn struct mscclppIbContext* ibCtx; struct mscclppIbQp* ibQp; - struct mscclppIbMr* ibBuffMr; - struct mscclppIbMr* ibSignalEpochIdMr; - struct mscclppIbMrInfo ibBuffMrInfo; - struct mscclppIbMrInfo ibSignalEpochIdMrInfo; #if defined(ENABLE_NPKIT) std::vector npkitUsedReqIds; std::vector npkitFreeReqIds; diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 11e0cbf6..b7db058b 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -188,6 +188,7 @@ struct mscclppDevConn // Host interface for mscclppDevCon functionality struct mscclppHostConn{ + virtual ~mscclppHostConn() = default; virtual void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) = 0; virtual void signal() = 0; virtual void wait() = 0; diff --git a/src/init.cc b/src/init.cc index aaf655ea..f4f47487 100644 --- a/src/init.cc +++ b/src/init.cc @@ -267,6 +267,53 @@ MSCCLPP_API mscclppResult_t mscclppGetAllDeviceConnections(mscclppComm_t comm, m return mscclppSuccess; } +#if defined(ENABLE_NPKIT) + +static void npkitInitReqIds(struct mscclppComm* comm) +{ + for (int i = 0; i < comm->nConns; i++) { + struct mscclppConn* conn = &comm->conns[i]; + conn->npkitUsedReqIds.resize(0); + conn->npkitFreeReqIds.resize(MSCCLPP_IB_MAX_SENDS); + for (uint64_t j = 0; j < MSCCLPP_IB_MAX_SENDS; j++) { + conn->npkitFreeReqIds[j] = MSCCLPP_IB_MAX_SENDS - j - 1; + } + } +} + +static void npkitCollectEntryEvent(struct mscclppConn* conn, uint8_t type, uint32_t size) +{ + uint64_t reqId = 0; + if (conn->npkitFreeReqIds.size() == 0) { + reqId = conn->npkitUsedReqIds.size(); + } else { + reqId = conn->npkitFreeReqIds.back(); + conn->npkitFreeReqIds.pop_back(); + } + 
conn->npkitUsedReqIds.push_back(reqId); + NpKit::CollectCpuEvent(type, size, (uint32_t)reqId, NpKit::GetCpuTimestamp(), conn->connId); +} + +static void npkitCollectExitEvents(struct mscclppConn* conn, uint8_t type) +{ + while (conn->npkitUsedReqIds.size()) { + uint64_t reqId = conn->npkitUsedReqIds.back(); + NpKit::CollectCpuEvent(type, 0, (uint32_t)reqId, NpKit::GetCpuTimestamp(), conn->connId); + conn->npkitFreeReqIds.push_back(reqId); + conn->npkitUsedReqIds.pop_back(); + } +} + +#else + +#define npkitInitReqIds(comm) + +#define npkitCollectEntryEvent(conn, type, size) + +#define npkitCollectExitEvents(conn, type) + +#endif + struct mscclppHostP2PConn : mscclppHostConn{ mscclppHostP2PConn(mscclppConn* _conn, cudaStream_t _stream) : conn(_conn), p2pStream(_stream){} @@ -275,15 +322,18 @@ struct mscclppHostP2PConn : mscclppHostConn{ void* srcBuff = (void*)((char*)conn->devConn->localBuff + srcDataOffset); void* dstBuff = (void*)((char*)conn->devConn->remoteBuff + dstDataOffset); CUDACHECKNORET(cudaMemcpyAsync(dstBuff, srcBuff, dataSize, cudaMemcpyDeviceToDevice, p2pStream)); + npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)dataSize); } void signal(){ CUDACHECKNORET(cudaMemcpyAsync(&conn->devConn->remoteSignalEpochId->proxy, &(conn->devConn->localSignalEpochId->device), sizeof(uint64_t), cudaMemcpyDeviceToDevice, p2pStream)); + npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_FLAG_ENTRY, (uint32_t)sizeof(uint64_t)); } void wait(){} void flush(){ CUDACHECKNORET(cudaStreamSynchronize(p2pStream)); + npkitCollectExitEvents(conn, NPKIT_EVENT_DMA_SEND_EXIT); } mscclppConn* conn; @@ -294,7 +344,7 @@ struct mscclppHostIBConn : mscclppHostConn{ mscclppHostIBConn(mscclppConn* conn) : conn(conn) {} void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize){ - conn->ibQp->stageSend(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)dataSize, + conn->ibQp->stageSend(this->ibBuffMr, &this->ibBuffMrInfo, (uint32_t)dataSize, /*wrId=*/0, 
/*srcOffset=*/srcDataOffset, /*dstOffset=*/dstDataOffset, /*signaled=*/false); @@ -303,15 +353,17 @@ struct mscclppHostIBConn : mscclppHostConn{ // Return value is errno. WARN("data postSend failed: errno %d", ret); } + npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_DATA_ENTRY, (uint32_t)dataSize); } void signal(){ // My local device flag is copied to the remote's proxy flag - conn->ibQp->stageSend(conn->ibSignalEpochIdMr, &conn->ibSignalEpochIdMrInfo, sizeof(uint64_t), + conn->ibQp->stageSend(this->ibSignalEpochIdMr, &this->ibSignalEpochIdMrInfo, sizeof(uint64_t), /*wrId=*/0, /*srcOffset=*/0, /*dstOffset=*/sizeof(uint64_t), /*signaled=*/true); int ret = conn->ibQp->postSend(); if (ret != 0) { WARN("flag postSend failed: errno %d", ret); } + npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_FLAG_ENTRY, (uint32_t)sizeof(uint64_t)); } void wait(){} void flush(){ @@ -338,9 +390,14 @@ struct mscclppHostIBConn : mscclppHostConn{ } } } + npkitCollectExitEvents(conn, NPKIT_EVENT_IB_SEND_EXIT); } mscclppConn* conn; + struct mscclppIbMr* ibBuffMr; + struct mscclppIbMr* ibSignalEpochIdMr; + struct mscclppIbMrInfo ibBuffMrInfo; + struct mscclppIbMrInfo ibSignalEpochIdMrInfo; }; @@ -365,7 +422,9 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i WARN("Too many connections made"); return mscclppInternalError; } - struct mscclppConn* conn = &comm->conns[comm->nConns]; + int connId = comm->nConns; + struct mscclppConn* conn = &comm->conns[connId]; + conn->connId = connId; conn->transport = transportType; conn->buffSize = buffSize; @@ -463,8 +522,7 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i conn->hostConn = new mscclppHostP2PConn(conn, proxyState->p2pStream); } - - struct mscclppDevConn* devConn = &comm->devConns[comm->nConns]; + struct mscclppDevConn* devConn = &comm->devConns[connId]; conn->devConn = devConn; conn->devConn->localBuff = localBuff; @@ -472,7 +530,7 @@ MSCCLPP_API mscclppResult_t 
mscclppConnect(mscclppComm_t comm, int remoteRank, i MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->waitEpochId, 1)); conn->devConn->remoteRank = remoteRank; conn->devConn->tag = tag; - conn->devConn->fifo.connId = comm->nConns; + conn->devConn->fifo.connId = connId; #if defined(MSCCLPP_USE_GDRCOPY) conn->devConn->fifo.triggerFifo = proxyState->fifo.triggerFifoDev; #else @@ -530,6 +588,7 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output return mscclppInternalError; } struct mscclppDevConn* devConn = conn->devConn; + struct mscclppHostIBConn* hostConn = (struct mscclppHostIBConn*)conn->hostConn; devConn->remoteBuff = NULL; devConn->remoteSignalEpochId = NULL; @@ -537,12 +596,12 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output if (conn->ibQp == NULL) { MSCCLPPCHECK(mscclppIbContextCreateQp(ibCtx, &conn->ibQp)); } - MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->localBuff, conn->buffSize, &conn->ibBuffMr)); + MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->localBuff, conn->buffSize, &hostConn->ibBuffMr)); MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->localSignalEpochId, - sizeof(struct mscclppDevConnSignalEpochId), &conn->ibSignalEpochIdMr)); + sizeof(struct mscclppDevConnSignalEpochId), &hostConn->ibSignalEpochIdMr)); connInfo->infoQp = conn->ibQp->info; - connInfo->infoBuffMr = conn->ibBuffMr->info; - connInfo->infoSignalEpochIdMr = conn->ibSignalEpochIdMr->info; + connInfo->infoBuffMr = hostConn->ibBuffMr->info; + connInfo->infoSignalEpochIdMr = hostConn->ibSignalEpochIdMr->info; return mscclppSuccess; } @@ -560,8 +619,9 @@ mscclppResult_t mscclppIbConnectionSetupEnd(struct connInfo* connInfo /*input*/, WARN("Failed to transition QP to RTS"); return mscclppInvalidUsage; } - conn->ibBuffMrInfo = connInfo->infoBuffMr; - conn->ibSignalEpochIdMrInfo = connInfo->infoSignalEpochIdMr; + struct mscclppHostIBConn* hostConn = (struct mscclppHostIBConn*)conn->hostConn; + 
hostConn->ibBuffMrInfo = connInfo->infoBuffMr; + hostConn->ibSignalEpochIdMrInfo = connInfo->infoSignalEpochIdMr; return mscclppSuccess; } @@ -658,7 +718,8 @@ MSCCLPP_API mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, msc void* dstBuff = regMem->p2p[i].remoteBuff; CUDACHECK(cudaMemcpyAsync(dstBuff, srcBuff, size, cudaMemcpyDeviceToDevice, (cudaStream_t)stream)); } else { - conn->ibQp->stageSend(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)size, + struct mscclppHostIBConn* hostConn = (struct mscclppHostIBConn*)conn->hostConn; + conn->ibQp->stageSend(hostConn->ibBuffMr, &hostConn->ibBuffMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, /*signaled=*/false); @@ -678,6 +739,7 @@ MSCCLPP_API mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, msc MSCCLPP_API mscclppResult_t mscclppProxyLaunch(mscclppComm_t comm) { + npkitInitReqIds(comm); MSCCLPPCHECK(mscclppProxyCreate(comm)); return mscclppSuccess; } diff --git a/src/proxy.cc b/src/proxy.cc index d7e291e2..044316d7 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -10,8 +10,6 @@ #include #include -#include "npkit/npkit.h" - #define MSCCLPP_PROXY_RUN_STATE_CHECK_PERIOD 100 #define PROXYCUDACHECK(cmd) \ @@ -40,53 +38,6 @@ struct proxyArgs struct mscclppProxyState* proxyState; }; -#if defined(ENABLE_NPKIT) - -static void npkitInitReqIds(struct mscclppComm* comm) -{ - for (int i = 0; i < comm->nConns; i++) { - struct mscclppConn* conn = &comm->conns[i]; - conn->npkitUsedReqIds.resize(0); - conn->npkitFreeReqIds.resize(MSCCLPP_IB_MAX_SENDS); - for (uint64_t j = 0; j < MSCCLPP_IB_MAX_SENDS; j++) { - conn->npkitFreeReqIds[j] = MSCCLPP_IB_MAX_SENDS - j - 1; - } - } -} - -static void npkitCollectEntryEvent(struct mscclppConn* conn, uint8_t type, uint32_t size, int channelId) -{ - uint64_t reqId = 0; - if (conn->npkitFreeReqIds.size() == 0) { - reqId = conn->npkitUsedReqIds.size(); - } else { - reqId = conn->npkitFreeReqIds.back(); - 
conn->npkitFreeReqIds.pop_back(); - } - conn->npkitUsedReqIds.push_back(reqId); - NpKit::CollectCpuEvent(type, size, (uint32_t)reqId, NpKit::GetCpuTimestamp(), channelId); -} - -static void npkitCollectExitEvents(struct mscclppConn* conn, uint8_t type, int channelId) -{ - while (conn->npkitUsedReqIds.size()) { - uint64_t reqId = conn->npkitUsedReqIds.back(); - NpKit::CollectCpuEvent(type, 0, (uint32_t)reqId, NpKit::GetCpuTimestamp(), channelId); - conn->npkitFreeReqIds.push_back(reqId); - conn->npkitUsedReqIds.pop_back(); - } -} - -#else - -#define npkitInitReqIds(comm) - -#define npkitCollectEntryEvent(conn, type, size, channelId) - -#define npkitCollectExitEvents(conn, type, channelId) - -#endif - mscclppResult_t mscclppProxyFifo::create() { MSCCLPPCHECK(mscclppCudaCalloc(&this->fifoHead, 1)); @@ -150,29 +101,20 @@ mscclppResult_t mscclppProxyFifo::flushTail(bool sync) return mscclppSuccess; } -void processTrigger(const mscclppTrigger trigger, mscclppConn* conn, mscclppProxyState* proxyState){ - mscclppIbContext* ibCtx = proxyState->ibContext; - bool isP2pProxy = (ibCtx == nullptr); - +static void processTrigger(const mscclppTrigger trigger, mscclppConn* conn) +{ // Iterate over what send is needed if (trigger.fields.type & mscclppData) { conn->hostConn->put(trigger.fields.dstDataOffset, trigger.fields.srcDataOffset, trigger.fields.dataSize); - - npkitCollectEntryEvent(conn, isP2pProxy ? NPKIT_EVENT_DMA_SEND_DATA_ENTRY : NPKIT_EVENT_IB_SEND_DATA_ENTRY, - (uint32_t)trigger.fields.dataSize, trigger.fields.connId); } if (trigger.fields.type & mscclppFlag) { conn->hostConn->signal(); - - npkitCollectEntryEvent(conn, isP2pProxy ? NPKIT_EVENT_P2P_SEND_FLAG_ENTRY : NPKIT_EVENT_IB_SEND_FLAG_ENTRY, - (uint32_t)sizeof(uint64_t), trigger.fields.connId); } // Wait for completion if (trigger.fields.type & mscclppSync) { conn->hostConn->flush(); - npkitCollectExitEvents(conn, isP2pProxy? 
NPKIT_EVENT_DMA_SEND_EXIT : NPKIT_EVENT_IB_SEND_EXIT, trigger.fields.connId); } } @@ -191,8 +133,6 @@ void* mscclppProxyService(void* _args) volatile mscclppProxyRunState_t* run = &proxyState->run; mscclppTrigger trigger; - npkitInitReqIds(comm); - int runCnt = MSCCLPP_PROXY_RUN_STATE_CHECK_PERIOD; uint64_t flushCnt = 0; for (;;) { @@ -209,7 +149,7 @@ void* mscclppProxyService(void* _args) } mscclppConn* conn = &comm->conns[trigger.fields.connId]; - processTrigger(trigger, conn, proxyState); + processTrigger(trigger, conn); // Send completion: reset only the high 64 bits PROXYMSCCLPPCHECK(fifo->pop()); From 63a5be695355bc816bc618d343fc7b711ff628a6 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 12 Apr 2023 09:20:05 +0000 Subject: [PATCH 019/135] Move ibQp to mscclppHostIBConn --- src/include/comm.h | 1 - src/init.cc | 32 ++++++++++++++++---------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/include/comm.h b/src/include/comm.h index 366659d5..04e21b56 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -28,7 +28,6 @@ struct mscclppConn struct mscclppHostConn* hostConn; struct mscclppIbContext* ibCtx; - struct mscclppIbQp* ibQp; #if defined(ENABLE_NPKIT) std::vector npkitUsedReqIds; std::vector npkitFreeReqIds; diff --git a/src/init.cc b/src/init.cc index f4f47487..f04f14fa 100644 --- a/src/init.cc +++ b/src/init.cc @@ -344,11 +344,11 @@ struct mscclppHostIBConn : mscclppHostConn{ mscclppHostIBConn(mscclppConn* conn) : conn(conn) {} void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize){ - conn->ibQp->stageSend(this->ibBuffMr, &this->ibBuffMrInfo, (uint32_t)dataSize, + this->ibQp->stageSend(this->ibBuffMr, &this->ibBuffMrInfo, (uint32_t)dataSize, /*wrId=*/0, /*srcOffset=*/srcDataOffset, /*dstOffset=*/dstDataOffset, /*signaled=*/false); - int ret = conn->ibQp->postSend(); + int ret = this->ibQp->postSend(); if (ret != 0) { // Return value is errno. 
WARN("data postSend failed: errno %d", ret); @@ -357,9 +357,9 @@ struct mscclppHostIBConn : mscclppHostConn{ } void signal(){ // My local device flag is copied to the remote's proxy flag - conn->ibQp->stageSend(this->ibSignalEpochIdMr, &this->ibSignalEpochIdMrInfo, sizeof(uint64_t), + this->ibQp->stageSend(this->ibSignalEpochIdMr, &this->ibSignalEpochIdMrInfo, sizeof(uint64_t), /*wrId=*/0, /*srcOffset=*/0, /*dstOffset=*/sizeof(uint64_t), /*signaled=*/true); - int ret = conn->ibQp->postSend(); + int ret = this->ibQp->postSend(); if (ret != 0) { WARN("flag postSend failed: errno %d", ret); } @@ -369,18 +369,18 @@ struct mscclppHostIBConn : mscclppHostConn{ void flush(){ bool isWaiting = true; while (isWaiting) { - int wcNum = conn->ibQp->pollCq(); + int wcNum = this->ibQp->pollCq(); if (wcNum < 0) { WARN("pollCq failed: errno %d", errno); continue; } for (int i = 0; i < wcNum; ++i) { - struct ibv_wc* wc = &conn->ibQp->wcs[i]; + struct ibv_wc* wc = &this->ibQp->wcs[i]; if (wc->status != IBV_WC_SUCCESS) { WARN("wc status %d", wc->status); continue; } - if (wc->qp_num != conn->ibQp->qp->qp_num) { + if (wc->qp_num != this->ibQp->qp->qp_num) { WARN("got wc of unknown qp_num %d", wc->qp_num); continue; } @@ -394,6 +394,7 @@ struct mscclppHostIBConn : mscclppHostConn{ } mscclppConn* conn; + struct mscclppIbQp* ibQp; struct mscclppIbMr* ibBuffMr; struct mscclppIbMr* ibSignalEpochIdMr; struct mscclppIbMrInfo ibBuffMrInfo; @@ -429,7 +430,6 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i conn->buffSize = buffSize; conn->ibCtx = NULL; - conn->ibQp = NULL; int ibDevIdx = -1; if (transportType == mscclppTransportIB) { // Check if an IB context exists @@ -593,13 +593,13 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output devConn->remoteSignalEpochId = NULL; struct mscclppIbContext* ibCtx = conn->ibCtx; - if (conn->ibQp == NULL) { - MSCCLPPCHECK(mscclppIbContextCreateQp(ibCtx, &conn->ibQp)); + if (hostConn->ibQp == 
NULL) { + MSCCLPPCHECK(mscclppIbContextCreateQp(ibCtx, &hostConn->ibQp)); } MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->localBuff, conn->buffSize, &hostConn->ibBuffMr)); MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->localSignalEpochId, sizeof(struct mscclppDevConnSignalEpochId), &hostConn->ibSignalEpochIdMr)); - connInfo->infoQp = conn->ibQp->info; + connInfo->infoQp = hostConn->ibQp->info; connInfo->infoBuffMr = hostConn->ibBuffMr->info; connInfo->infoSignalEpochIdMr = hostConn->ibSignalEpochIdMr->info; return mscclppSuccess; @@ -611,15 +611,15 @@ mscclppResult_t mscclppIbConnectionSetupEnd(struct connInfo* connInfo /*input*/, WARN("ipcHandles or connection cannot be null"); return mscclppInternalError; } - if (conn->ibQp->rtr(&connInfo->infoQp) != 0) { + struct mscclppHostIBConn* hostConn = (struct mscclppHostIBConn*)conn->hostConn; + if (hostConn->ibQp->rtr(&connInfo->infoQp) != 0) { WARN("Failed to transition QP to RTR"); return mscclppInvalidUsage; } - if (conn->ibQp->rts() != 0) { + if (hostConn->ibQp->rts() != 0) { WARN("Failed to transition QP to RTS"); return mscclppInvalidUsage; } - struct mscclppHostIBConn* hostConn = (struct mscclppHostIBConn*)conn->hostConn; hostConn->ibBuffMrInfo = connInfo->infoBuffMr; hostConn->ibSignalEpochIdMrInfo = connInfo->infoSignalEpochIdMr; return mscclppSuccess; @@ -719,11 +719,11 @@ MSCCLPP_API mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, msc CUDACHECK(cudaMemcpyAsync(dstBuff, srcBuff, size, cudaMemcpyDeviceToDevice, (cudaStream_t)stream)); } else { struct mscclppHostIBConn* hostConn = (struct mscclppHostIBConn*)conn->hostConn; - conn->ibQp->stageSend(hostConn->ibBuffMr, &hostConn->ibBuffMrInfo, (uint32_t)size, + hostConn->ibQp->stageSend(hostConn->ibBuffMr, &hostConn->ibBuffMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, /*signaled=*/false); - if ((ret = conn->ibQp->postSend()) != 0) { + if ((ret = hostConn->ibQp->postSend()) != 0) { // Return 
value is errno. WARN("data postSend failed: errno %d", ret); } From dd0883b84fdcfa4bc4db43590a87e4e9d770ba58 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 12 Apr 2023 09:25:35 +0000 Subject: [PATCH 020/135] Lint --- src/include/checks.h | 4 +- src/include/comm.h | 6 +-- src/include/mscclpp.h | 5 +- src/init.cc | 120 +++++++++++++++++++++++------------------- src/proxy.cc | 8 ++- 5 files changed, 74 insertions(+), 69 deletions(-) diff --git a/src/include/checks.h b/src/include/checks.h index fb86fd66..7422e384 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -20,12 +20,12 @@ } \ } while (false) -#define CUDACHECKNORET(cmd) \ +#define CUDACHECKNORET(cmd) \ do { \ cudaError_t err = cmd; \ if (err != cudaSuccess) { \ WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ - return; \ + return; \ } \ } while (false) diff --git a/src/include/comm.h b/src/include/comm.h index 04e21b56..62a6ba01 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -16,8 +16,6 @@ #define MAXCONNECTIONS 64 - - struct mscclppConn { int connId; @@ -42,8 +40,8 @@ struct mscclppComm void* bootstrap; - uint64_t - magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. + // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. + uint64_t magic; int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index b7db058b..88404a65 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -29,7 +29,6 @@ struct alignas(16) mscclppDevConnSignalEpochId uint64_t proxy; }; - /*************************************************************************************************************** * A mscclppDevConn provides a zero-copy connection between two GPUs connected via P2P NVLink or InfiniBand. 
* The communication API is one-sided meaning that for every single data transfer, only one side @@ -183,11 +182,11 @@ struct mscclppDevConn // my remote peer's buffer. only non-NULL with gpu's direct access // gpu can directly write into it void* remoteBuff; - }; // Host interface for mscclppDevCon functionality -struct mscclppHostConn{ +struct mscclppHostConn +{ virtual ~mscclppHostConn() = default; virtual void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) = 0; virtual void signal() = 0; diff --git a/src/init.cc b/src/init.cc index f04f14fa..38cd63e1 100644 --- a/src/init.cc +++ b/src/init.cc @@ -248,7 +248,8 @@ MSCCLPP_API const char* mscclppGetErrorString(mscclppResult_t code) } } -MSCCLPP_API mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, int tag, mscclppDevConn_t** devConn) +MSCCLPP_API mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, int tag, + mscclppDevConn_t** devConn) { for (int i = 0; i < comm->nConns; i++) { if (comm->devConns[i].remoteRank == remoteRank && comm->devConns[i].tag == tag) { @@ -314,59 +315,72 @@ static void npkitCollectExitEvents(struct mscclppConn* conn, uint8_t type) #endif - -struct mscclppHostP2PConn : mscclppHostConn{ - mscclppHostP2PConn(mscclppConn* _conn, cudaStream_t _stream) : conn(_conn), p2pStream(_stream){} - - void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize){ - void* srcBuff = (void*)((char*)conn->devConn->localBuff + srcDataOffset); - void* dstBuff = (void*)((char*)conn->devConn->remoteBuff + dstDataOffset); - CUDACHECKNORET(cudaMemcpyAsync(dstBuff, srcBuff, dataSize, cudaMemcpyDeviceToDevice, p2pStream)); - npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)dataSize); +struct mscclppHostP2PConn : mscclppHostConn +{ + mscclppHostP2PConn(mscclppConn* _conn, cudaStream_t _stream) : conn(_conn), p2pStream(_stream) + { } - void signal(){ - 
CUDACHECKNORET(cudaMemcpyAsync(&conn->devConn->remoteSignalEpochId->proxy, - &(conn->devConn->localSignalEpochId->device), sizeof(uint64_t), - cudaMemcpyDeviceToDevice, p2pStream)); - npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_FLAG_ENTRY, (uint32_t)sizeof(uint64_t)); + + void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) + { + void* srcBuff = (void*)((char*)conn->devConn->localBuff + srcDataOffset); + void* dstBuff = (void*)((char*)conn->devConn->remoteBuff + dstDataOffset); + CUDACHECKNORET(cudaMemcpyAsync(dstBuff, srcBuff, dataSize, cudaMemcpyDeviceToDevice, p2pStream)); + npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)dataSize); } - void wait(){} - void flush(){ - CUDACHECKNORET(cudaStreamSynchronize(p2pStream)); - npkitCollectExitEvents(conn, NPKIT_EVENT_DMA_SEND_EXIT); + void signal() + { + CUDACHECKNORET(cudaMemcpyAsync(&conn->devConn->remoteSignalEpochId->proxy, + &(conn->devConn->localSignalEpochId->device), sizeof(uint64_t), + cudaMemcpyDeviceToDevice, p2pStream)); + npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_FLAG_ENTRY, (uint32_t)sizeof(uint64_t)); + } + void wait() + { + } + void flush() + { + CUDACHECKNORET(cudaStreamSynchronize(p2pStream)); + npkitCollectExitEvents(conn, NPKIT_EVENT_DMA_SEND_EXIT); } mscclppConn* conn; cudaStream_t p2pStream; }; -struct mscclppHostIBConn : mscclppHostConn{ - mscclppHostIBConn(mscclppConn* conn) : conn(conn) {} +struct mscclppHostIBConn : mscclppHostConn +{ + mscclppHostIBConn(mscclppConn* conn) : conn(conn) + { + } - void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize){ - this->ibQp->stageSend(this->ibBuffMr, &this->ibBuffMrInfo, (uint32_t)dataSize, - /*wrId=*/0, /*srcOffset=*/srcDataOffset, - /*dstOffset=*/dstDataOffset, - /*signaled=*/false); - int ret = this->ibQp->postSend(); - if (ret != 0) { - // Return value is errno. 
- WARN("data postSend failed: errno %d", ret); - } - npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_DATA_ENTRY, (uint32_t)dataSize); + void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) + { + this->ibQp->stageSend(this->ibBuffMr, &this->ibBuffMrInfo, (uint32_t)dataSize, + /*wrId=*/0, /*srcOffset=*/srcDataOffset, /*dstOffset=*/dstDataOffset, /*signaled=*/false); + int ret = this->ibQp->postSend(); + if (ret != 0) { + // Return value is errno. + WARN("data postSend failed: errno %d", ret); + } + npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_DATA_ENTRY, (uint32_t)dataSize); } - void signal(){ - // My local device flag is copied to the remote's proxy flag - this->ibQp->stageSend(this->ibSignalEpochIdMr, &this->ibSignalEpochIdMrInfo, sizeof(uint64_t), - /*wrId=*/0, /*srcOffset=*/0, /*dstOffset=*/sizeof(uint64_t), /*signaled=*/true); - int ret = this->ibQp->postSend(); - if (ret != 0) { - WARN("flag postSend failed: errno %d", ret); - } - npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_FLAG_ENTRY, (uint32_t)sizeof(uint64_t)); + void signal() + { + // My local device flag is copied to the remote's proxy flag + this->ibQp->stageSend(this->ibSignalEpochIdMr, &this->ibSignalEpochIdMrInfo, sizeof(uint64_t), + /*wrId=*/0, /*srcOffset=*/0, /*dstOffset=*/sizeof(uint64_t), /*signaled=*/true); + int ret = this->ibQp->postSend(); + if (ret != 0) { + WARN("flag postSend failed: errno %d", ret); + } + npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_FLAG_ENTRY, (uint32_t)sizeof(uint64_t)); } - void wait(){} - void flush(){ + void wait() + { + } + void flush() + { bool isWaiting = true; while (isWaiting) { int wcNum = this->ibQp->pollCq(); @@ -401,10 +415,8 @@ struct mscclppHostIBConn : mscclppHostConn{ struct mscclppIbMrInfo ibSignalEpochIdMrInfo; }; - - -MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void* localBuff, uint64_t buffSize, - mscclppTransport_t transportType, const char* ibDev) +MSCCLPP_API 
mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void* localBuff, + uint64_t buffSize, mscclppTransport_t transportType, const char* ibDev) { // save this processes numa binding and set it to the one closest to the device // so that all the allocation are close to the device @@ -517,8 +529,7 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i if (transportType == mscclppTransportIB) { conn->hostConn = new mscclppHostIBConn(conn); - } - else if (transportType == mscclppTransportP2P) { + } else if (transportType == mscclppTransportP2P) { conn->hostConn = new mscclppHostP2PConn(conn, proxyState->p2pStream); } @@ -665,7 +676,7 @@ struct bufferInfo }; MSCCLPP_API mscclppResult_t mscclppRegisterBuffer(mscclppComm_t comm, void* local_memory, size_t size, - mscclppRegisteredMemory* regMem) + mscclppRegisteredMemory* regMem) { std::vector ibMrs; for (int i = 0; i < comm->nConns; ++i) { @@ -706,8 +717,9 @@ MSCCLPP_API mscclppResult_t mscclppRegisterBuffer(mscclppComm_t comm, void* loca return mscclppSuccess; } -MSCCLPP_API mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, mscclppRegisteredMemory* regMem, void* srcBuff, - size_t size, uint32_t srcOffset, uint32_t dstOffset, int64_t stream) +MSCCLPP_API mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, mscclppRegisteredMemory* regMem, + void* srcBuff, size_t size, uint32_t srcOffset, + uint32_t dstOffset, int64_t stream) { int ret = 0; // TODO: transport should be an argument too so user can decide which transport to use @@ -720,9 +732,7 @@ MSCCLPP_API mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, msc } else { struct mscclppHostIBConn* hostConn = (struct mscclppHostIBConn*)conn->hostConn; hostConn->ibQp->stageSend(hostConn->ibBuffMr, &hostConn->ibBuffMrInfo, (uint32_t)size, - /*wrId=*/0, /*srcOffset=*/srcOffset, - /*dstOffset=*/dstOffset, - /*signaled=*/false); + /*wrId=*/0, /*srcOffset=*/srcOffset, 
/*dstOffset=*/dstOffset, /*signaled=*/false); if ((ret = hostConn->ibQp->postSend()) != 0) { // Return value is errno. WARN("data postSend failed: errno %d", ret); diff --git a/src/proxy.cc b/src/proxy.cc index 044316d7..6cfd799b 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -42,10 +42,9 @@ mscclppResult_t mscclppProxyFifo::create() { MSCCLPPCHECK(mscclppCudaCalloc(&this->fifoHead, 1)); #if defined(MSCCLPP_USE_GDRCOPY) - MSCCLPPCHECK(mscclppGdrCudaCalloc(&this->triggerFifo, &this->triggerFifoDev, MSCCLPP_PROXY_FIFO_SIZE, - &this->triggerFifoDesc)); MSCCLPPCHECK( - mscclppGdrCudaCalloc(&this->fifoTailDevHostPtr, &this->fifoTailDev, 1, &this->fifoTailDesc)); + mscclppGdrCudaCalloc(&this->triggerFifo, &this->triggerFifoDev, MSCCLPP_PROXY_FIFO_SIZE, &this->triggerFifoDesc)); + MSCCLPPCHECK(mscclppGdrCudaCalloc(&this->fifoTailDevHostPtr, &this->fifoTailDev, 1, &this->fifoTailDesc)); #else MSCCLPPCHECK(mscclppCudaHostCalloc(&this->triggerFifo, MSCCLPP_PROXY_FIFO_SIZE)); MSCCLPPCHECK(mscclppCudaCalloc(&this->fifoTailDev, 1)); @@ -118,7 +117,6 @@ static void processTrigger(const mscclppTrigger trigger, mscclppConn* conn) } } - void* mscclppProxyService(void* _args) { struct proxyArgs* args = (struct proxyArgs*)_args; @@ -147,7 +145,7 @@ void* mscclppProxyService(void* _args) if (trigger.value[0] == 0) { continue; // there is one in progreess } - + mscclppConn* conn = &comm->conns[trigger.fields.connId]; processTrigger(trigger, conn); From ca1f803692216dcab42872a4395047e479c359c0 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 12 Apr 2023 09:33:14 +0000 Subject: [PATCH 021/135] Rename remote MR infos --- src/init.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/init.cc b/src/init.cc index 38cd63e1..4a664f4b 100644 --- a/src/init.cc +++ b/src/init.cc @@ -356,7 +356,7 @@ struct mscclppHostIBConn : mscclppHostConn void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) { - this->ibQp->stageSend(this->ibBuffMr, 
&this->ibBuffMrInfo, (uint32_t)dataSize, + this->ibQp->stageSend(this->ibBuffMr, &this->ibBuffMrRemoteInfo, (uint32_t)dataSize, /*wrId=*/0, /*srcOffset=*/srcDataOffset, /*dstOffset=*/dstDataOffset, /*signaled=*/false); int ret = this->ibQp->postSend(); if (ret != 0) { @@ -368,7 +368,7 @@ struct mscclppHostIBConn : mscclppHostConn void signal() { // My local device flag is copied to the remote's proxy flag - this->ibQp->stageSend(this->ibSignalEpochIdMr, &this->ibSignalEpochIdMrInfo, sizeof(uint64_t), + this->ibQp->stageSend(this->ibSignalEpochIdMr, &this->ibSignalEpochIdMrRemoteInfo, sizeof(uint64_t), /*wrId=*/0, /*srcOffset=*/0, /*dstOffset=*/sizeof(uint64_t), /*signaled=*/true); int ret = this->ibQp->postSend(); if (ret != 0) { @@ -411,8 +411,8 @@ struct mscclppHostIBConn : mscclppHostConn struct mscclppIbQp* ibQp; struct mscclppIbMr* ibBuffMr; struct mscclppIbMr* ibSignalEpochIdMr; - struct mscclppIbMrInfo ibBuffMrInfo; - struct mscclppIbMrInfo ibSignalEpochIdMrInfo; + struct mscclppIbMrInfo ibBuffMrRemoteInfo; + struct mscclppIbMrInfo ibSignalEpochIdMrRemoteInfo; }; MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void* localBuff, @@ -631,8 +631,8 @@ mscclppResult_t mscclppIbConnectionSetupEnd(struct connInfo* connInfo /*input*/, WARN("Failed to transition QP to RTS"); return mscclppInvalidUsage; } - hostConn->ibBuffMrInfo = connInfo->infoBuffMr; - hostConn->ibSignalEpochIdMrInfo = connInfo->infoSignalEpochIdMr; + hostConn->ibBuffMrRemoteInfo = connInfo->infoBuffMr; + hostConn->ibSignalEpochIdMrRemoteInfo = connInfo->infoSignalEpochIdMr; return mscclppSuccess; } From 1d3ea7bb834b59b1ce4f6b9e7f48dea419883a8e Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 12 Apr 2023 17:25:54 +0000 Subject: [PATCH 022/135] fix --- src/init.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/init.cc b/src/init.cc index 4a664f4b..7f3cfdff 100644 --- a/src/init.cc +++ b/src/init.cc @@ -731,7 +731,7 @@ MSCCLPP_API 
mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, msc CUDACHECK(cudaMemcpyAsync(dstBuff, srcBuff, size, cudaMemcpyDeviceToDevice, (cudaStream_t)stream)); } else { struct mscclppHostIBConn* hostConn = (struct mscclppHostIBConn*)conn->hostConn; - hostConn->ibQp->stageSend(hostConn->ibBuffMr, &hostConn->ibBuffMrInfo, (uint32_t)size, + hostConn->ibQp->stageSend(hostConn->ibBuffMr, &hostConn->ibBuffMrRemoteInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, /*signaled=*/false); if ((ret = hostConn->ibQp->postSend()) != 0) { // Return value is errno. From 7c2108d135409e7f0f84c7b1f2735520af521f81 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 12 Apr 2023 18:05:27 +0000 Subject: [PATCH 023/135] fix --- src/init.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/init.cc b/src/init.cc index 7f3cfdff..a5793364 100644 --- a/src/init.cc +++ b/src/init.cc @@ -352,6 +352,7 @@ struct mscclppHostIBConn : mscclppHostConn { mscclppHostIBConn(mscclppConn* conn) : conn(conn) { + this->ibQp = NULL; } void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) From 0eec1d438bc5d34ffc34ba6970245d4faa309221 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 13 Apr 2023 18:38:38 +0000 Subject: [PATCH 024/135] Move over C++ API work to new branch --- src/communicator.cc | 51 +++++ src/include/mscclpp.hpp | 398 ++++++++++++++++++++++++++++++++++++ src/include/mscclppfifo.hpp | 53 +++++ 3 files changed, 502 insertions(+) create mode 100644 src/communicator.cc create mode 100644 src/include/mscclpp.hpp create mode 100644 src/include/mscclppfifo.hpp diff --git a/src/communicator.cc b/src/communicator.cc new file mode 100644 index 00000000..5519b9c5 --- /dev/null +++ b/src/communicator.cc @@ -0,0 +1,51 @@ +#include "mscclpp.hpp" +#include "mscclpp.h" + +namespace mscclpp { + +struct Communicator::impl { + mscclppComm_t comm; +}; + +void Communicator::initRank(int nranks, const char* ipPortPair, int rank) { + +} + 
+void Communicator::initRankFromId(int nranks, UniqueId id, int rank) { + +} + +void Communicator::bootstrapAllGather(void* data, int size) { + +} + +void Communicator::bootstrapBarrier() { + +} + +std::shared_ptr Communicator::connect(int remoteRank, int tag, void* localBuff, uint64_t buffSize, + TransportType transportType, const char* ibDev = 0) { + +} + +void Communicator::connectionSetup() { + +} + +void Communicator::destroy() { + +} + +int Communicator::rank() { + +} + +int Communicator::size() { + +} + +void Communicator::setBootstrapConnTimeout(unsigned timeout) { + +} + +} // namespace mscclpp \ No newline at end of file diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp new file mode 100644 index 00000000..d44b04b9 --- /dev/null +++ b/src/include/mscclpp.hpp @@ -0,0 +1,398 @@ +#ifndef MSCCLPP_HPP_ +#define MSCCLPP_HPP_ + +#define MSCCLPP_MAJOR 0 +#define MSCCLPP_MINOR 1 +#define MSCCLPP_PATCH 0 +#define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH) + +// For every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER, a flush of the tail to device memory is triggered. +// As long as MSCCLPP_PROXY_FIFO_SIZE is large enough, having a stale tail is not a problem. +#define MSCCLPP_PROXY_FIFO_SIZE 128 +#define MSCCLPP_PROXY_FIFO_FLUSH_COUNTER 4 + +#include +#include + +#include + +namespace mscclpp { + +struct alignas(16) SignalEpochId { + // every signal(), increments this and either: + // 1) proxy thread pushes it to the remote peer's localSignalEpochId->proxy + // 2) gpu thread directly writes it to remoteSignalEpochId->device + uint64_t device; + // signal() function triggers the cpu proxy thread to write to it + uint64_t proxy; +}; + +enum ChannelTriggerType : uint64_t { + channelTriggerData = 0x1, + channelTriggerFlag = 0x2, + channelTriggerSync = 0x4 +}; + +// This is just a numeric ID.
Each HostConnection will have an internal array indexed by these handles +// mapping to the actual +using BufferHandle = uint8_t; + +#define MSCCLPP_BITS_SIZE 32 +#define MSCCLPP_BITS_OFFSET 32 +#define MSCCLPP_BITS_BUFFER_HANDLE 8 +#define MSCCLPP_BITS_TYPE 3 +#define MSCCLPP_BITS_CONNID 10 + +// this is the basic structure of each work element in the fifo +// the summation of number of bits must be 128 or less +union ChannelTrigger { + ProxyTrigger value; + struct + { + // first 64 bits: value[0] + uint64_t size : MSCCLPP_BITS_SIZE; + uint64_t srcOffset : MSCCLPP_BITS_OFFSET; + uint64_t : (64 - MSCCLPP_BITS_SIZE - MSCCLPP_BITS_OFFSET); // ensure 64-bit alignment + // second 64 bits: value[1] + uint64_t dstOffset : MSCCLPP_BITS_OFFSET; + uint64_t srcBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; + uint64_t dstBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; + uint64_t type : MSCCLPP_BITS_TYPE; + uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_TYPE); // ensure 64-bit alignment + } fields; + + ChannelTrigger() {} + ChannelTrigger(ProxyTrigger value) : value(value) {} + ChannelTrigger(ChannelTriggerType type, BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { + value.fst = ((srcOffset << MSCCLPP_BITS_SIZE) + size); + value.snd = (((((((uint64_t)type << MSCCLPP_BITS_BUFFER_HANDLE) + dst) << MSCCLPP_BITS_BUFFER_HANDLE) + src) << MSCCLPP_BITS_OFFSET) + dstOffset); + } +}; + +/*************************************************************************************************************** + * A mscclppDevConn provides a zero-copy connection between two GPUs connected via P2P NVLink or InfiniBand. + * The communication API is one-sided meaning that for every single data transfer, only one side + * needs to execute unlike a two-sided communication stack such as NCCL where both sides + * need to execute a send and a receive instruction, respectively, for every transfer. 
+ * + * A connection is uniquely identified by the (remoteRank, tag) pair at an endpoint. + * The two endpoints register buffers of the same size with the connection. + * + * The endpoints provide the remoteRank, tag, and the buffer when registering a connection with mscclppConnect(). + * + * mscclppConnectionSetup() sets up all the registered connections. + * + *************************************************************************************************************** + * A proxy thread running on the CPU is necessary to perform transfers using InfiniBand or the DMA engine. + * The current implementation uses a single proxy thread per context - one IB connection or DMA engine per node. + * Thus multiple threadblocks using different connections might use the same CPU proxy thread. + * + * Before using any functionality of connections, mscclppProxyLaunch needs to be called to spawn the + * proxy threads. There are currently two types of connections: + * + * P2P via NVLink: the DMA engine can perform the copy between the buffers. DMA engine has higher latency + * but has a higher bandwidth and costs no compute cycles on the GPU. + * + * InfiniBand: the RDMA engine copies the data over MLX devices. + * + *************************************************************************************************************** + * At the runtime, a GPU kernel has access to a mscclppDevConn object that provides the following functions: + * + * put(): [non-blocking] the sender initiates a data transfer to the receiver. + * + * signal(): [non-blocking] the sender signals the receiver that data is ready to be consumed. + * + * flush(): [blocking] the sender waits for all the data transfers to complete + * + * wait(): [blocking] the receiver waits on the signal() to start reading the data. + * + * The sender should not reuse the buffer till the flush() returns. + * The receiver should only access the data after the wait() returns.
+ * + * putWithSignal(): the sender initiates a data transfer and signals the receiver that data is ready to be consumed. + * This is an optimized version of a put() followed by a signal(). + * + * These functions hide the complexity of synchronization between the two GPUs and the CPU proxy thread. + * Example: + * + * // sender GPU + * devConn.put(data1) + * // not OK to write to data1 + * devConn.put(data2) + * // not OK to write to data1, data2 + * devConn.put(data3) // receiver GPU + * // not OK to write to data1, data2, data3 // not OK to read data1, data2, data3 + * devConn.signal() -------------------------------> devConn.wait() + * // not OK to write to data1, data2, data3 // OK to read data1, data2, data3 + * devConn.flush() + * // OK to write to data1, data2, data3 + * + * + * The two endpoints can concurrently use the same connection provided they are writing (puts) on different + * indices in the registered buffer. + **************************************************************************************************************/ +struct DeviceConnection { +#ifdef __CUDACC__ + // TODO: add buffer handles + + __forceinline__ __device__ void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) + { + fifo.push(ChannelTrigger(channelTriggerData, dst, dstOffset, src, srcOffset, size).value); + } + + __forceinline__ __device__ void put(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) + { + put(dst, offset, src, offset, size); + } + + __forceinline__ __device__ void signal() + { + epochIncrement(); + fifo.push(ChannelTrigger(channelTriggerFlag, 0, 0, 0, 0, 1).value); + } + + __forceinline__ __device__ void putWithSignal(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) + { + epochIncrement(); + fifo.push(ChannelTrigger(channelTriggerData | channelTriggerFlag, dst, dstOffset, src, srcOffset, size).value); + } + + __forceinline__ __device__ void
putWithSignal(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) + { + putWithSignal(dst, offset, src, offset, size); + } + + __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) + { + epochIncrement(); + uint64_t curFifoHead = fifo.push(channelTriggerData | channelTriggerFlag | channelTriggerSync, dstOffset, srcOffset, size); + while (*(volatile uint64_t*)&fifo.triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && + *(volatile uint64_t*)fifo.triggerFifoTail <= curFifoHead) + ; + } + + __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) + { + putWithSignalAndFlush(offset, offset, size); + } + + __forceinline__ __device__ void flush() + { + uint64_t curFifoHead = fifo.push(mscclppSync, 0, 0, 1); + // we need to wait for two conditions to be met to ensure the CPU is done flushing. (1) wait for the tail + // to go past curFifoHead (this is a safety net) and (2) wait for the work element value to change to 0.
+ while (*(volatile uint64_t*)&fifo.triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && + *(volatile uint64_t*)fifo.triggerFifoTail <= curFifoHead) + ; + } + + __forceinline__ __device__ void wait() + { + (*waitEpochId) += 1; + while (*(volatile uint64_t*)&(localSignalEpochId->proxy) < (*waitEpochId)) + ; + } + + __forceinline__ __device__ void epochIncrement() + { + *(volatile uint64_t*)&(localSignalEpochId->device) += 1; + } + +#endif // __CUDACC__ + + int remoteRank; + int tag; + + SignalEpochId* localSignalEpochId; + // used by the signal() function directly from gpu + SignalEpochId* remoteSignalEpochId; + + // every wait(), increments this and then the gpu waits for either: + // 1) localSignalEpochId->proxy to be >= this in case of a proxy thread + // 2) remoteSignalEpochId->device to be >= this in case of a gpu thread + uint64_t* waitEpochId; + + // this is a concurrent fifo that multiple threads from the device + // can produce into; the sole proxy thread consumes it. + ProxyFifo fifo; +}; + +class HostConnection { +public: + /* Register a region of GPU memory for use with this connection. Must be called before connectionSetup() + * in the communicator. + * + * Inputs: + * data: base pointer to the memory + * size: size of the memory region in bytes + * + * Returns: a handle to the buffer + */ + BufferHandle registerBuffer(void* data, uint64_t size); + + /* Get the number of times registerBuffer(...) was called on the remote peer. + * + * Returns: the number of buffers registered on the remote peer + */ + int numRemoteBuffers(); + + /* Get the BufferHandle returned by a call to registerBuffer(...) on the remote peer as identified by the index + * + * Inputs: + * index: the index of the handle to get + * + * Returns: a handle to the buffer on the remote peer + */ + BufferHandle getRemoteBuffer(int index); + + /* Create a DeviceConnection paired with this HostConnection.
A background proxy thread will + * trigger operations on this HostConnection corresponding to put/signal/etc. calls made to the + * DeviceConnection. + * + * Inputs: + * startProxyThread: whether to start the proxy thread (default is true) + * + * Returns: the newly created DeviceConnection + */ + DeviceConnection toDevice(bool startProxyThread = true); + + void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size); + void put(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size); + void signal(); + void flush(); + void wait(); + void epochIncrement(); + +private: + struct impl; + std::unique_ptr pimpl; +}; + +#define MSCCLPP_UNIQUE_ID_BYTES 128 +struct UniqueId { + char internal[MSCCLPP_UNIQUE_ID_BYTES]; +}; + +/* Create a unique ID for communication. Only needs to be called by one process. + * Use with mscclppCommInitRankFromId(). + * All processes need to provide the same ID to mscclppCommInitRankFromId(). + * + * Outputs: + * uniqueId: the unique ID to be created + */ +std::unique_ptr getUniqueId(); + +/* Transport Types */ +enum class TransportType : uint8_t { + P2P = 0, + IB = 1, +}; + +class Communicator { +public: + /* Initialize the communicator. nranks processes with rank 0 to nranks-1 need to call this function. + * + * Inputs: + * nranks: number of ranks in the communicator + * ipPortPair: a string of the form "ip:port" that represents the address of the root process + * rank: rank of the calling process + */ + void initRank(int nranks, const char* ipPortPair, int rank); + + /* Initialize the communicator from a given UniqueId. Same as mscclppCommInitRank() except that + * id is provided by the user by calling getUniqueId() + * + * Inputs: + * nranks: number of ranks in the communicator + * id: the unique ID to be used for communication + * rank: rank of the calling process + */ + void initRankFromId(int nranks, UniqueId id, int rank); + + /* Ring-based AllGather through the bootstrap socket. 
+ * + * Inputs: + * data: data array to be gathered where `[r*size, (r+1)*size)` is the data for rank `r` + * size: data size per rank + */ + void bootstrapAllGather(void* data, int size); + + /* A no-op function that is used to synchronize all processes via a bootstrap allgather*/ + void bootstrapBarrier(); + + /* Connect to a remote rank. This function only prepares metadata for connection. The actual connection + * is made by a following call of mscclppConnectionSetup(). Note that this function is two-way and a connection + * from rank i to remote rank j needs to have a counterpart from rank j to rank i. + * Note that with IB, buffers are registered at a page level and if a buffer is spread through multiple pages + * and do not fully utilize all of them, IB's QP has to register for all involved pages. This potentially has + * security risks if the devConn's accesses are given to a malicious process. + * + * Inputs: + * remoteRank: the rank of the remote process + * tag: the tag of the connection. tag is copied into the corresponding mscclppDevConn_t, which can be + * used to identify the connection inside a GPU kernel. + * localBuff: the local send/receive buffer + * buffSize: the size of the local buffer + * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) + * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. + */ + std::shared_ptr connect(int remoteRank, int tag, void* localBuff, uint64_t buffSize, + TransportType transportType, const char* ibDev = 0); + + /* Establish all connections created by mscclppConnect(). This function must be called after all mscclppConnect() + * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. + */ + void connectionSetup(); + + /* Destroy the communicator. */ + void destroy(); + + /* Return the rank of the calling process. 
+ * + * Outputs: + * rank: the rank of the calling process + */ + int rank(); + + /* Return the number of ranks of the communicator. + * + * Outputs: + * size: the number of ranks of the communicator + */ + int size(); + + /* Set the timeout for the bootstrap connection. + * + * Inputs: + * timeout: the timeout in seconds + */ + void setBootstrapConnTimeout(unsigned timeout); + +private: + struct impl; + std::unique_ptr pimpl; +}; + +/* Log handler type which is a callback function for + * however user likes to handle the log messages. Once set, + * the logger will just call this function with msg. + */ +typedef void (*LogHandler)(const char* msg); + +/* The default log handler. + * + * Inputs: + * msg: the log message + */ +void defaultLogHandler(const char* msg); + +/* Set a custom log handler. + * + * Inputs: + * handler: the log handler function + */ +void setLogHandler(LogHandler handler); + +} // namespace mscclpp + +#endif // MSCCLPP_H_ diff --git a/src/include/mscclppfifo.hpp b/src/include/mscclppfifo.hpp new file mode 100644 index 00000000..7ab03081 --- /dev/null +++ b/src/include/mscclppfifo.hpp @@ -0,0 +1,53 @@ +#ifndef MSCCLPPFIFO_H_ +#define MSCCLPPFIFO_H_ + +#include +#include + +namespace mscclpp { + +struct alignas(16) ProxyTrigger { + uint64_t fst, snd; +}; + +/* This is a concurrent fifo where multiple device threads can push mscclppTrigger work elements to + * and a single host proxy thread consumes these work elements. There is a head pointer allocated on device + * which starts with 0 and goes to 2^64-1 which is almost infinity. There are two copies of tail, one + * that is on the device (triggerFifoTail) and another that is on host (proxyState->fifoTailHost). + * The host always has the "true" tail and occasionally pushes it to the copy on the device. + * Therefore, most of the time, the device has a stale version. The invariants are: + * triggerFifoTail <= proxyState->fifoTailHost <= triggerFifoHead. 
+ * push() function increments triggerFifoHead, proxyState->fifoTailHost is updated in proxy.cc:mscclppProxyService + * and it occasionally flushes it to triggerFifoTail via a cudaMemcpyAsync. + * + * Why is duplicating the tail a good idea? The fifo is large enough and we do not need frequent updates + * for the tail as there is usually enough space for device threads to push their work into. + */ +struct ProxyFifo { +#ifdef __CUDACC__ + __forceinline__ __device__ uint64_t push(ProxyTrigger element) + { + uint64_t curFifoHead = atomicAdd((unsigned long long int*)this->triggerFifoHead, 1); + while (curFifoHead >= MSCCLPP_PROXY_FIFO_SIZE + *((volatile uint64_t*)this->triggerFifoTail)) + ; + while (*(volatile uint64_t*)&this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0) + ; + uint64_t* valptr = (uint64_t*)&(this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE].value); + asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(valptr), + "l"(element.value[0]), "l"(element.value[1])); + return curFifoHead; + } +#endif // __CUDACC__ + + void startProxyThread(std::function handler); + void stopProxyThread(); + + ProxyTrigger* triggerFifo; // Allocated on host via cudaHostAlloc. This space is used for pushing the work elements + uint64_t* triggerFifoTail; // Allocated on device. proxyState->fifoTailHost is the true tail on host and pushed + // occasionally to device + uint64_t* triggerFifoHead; // Allocated on device. 
Only accessed by device +}; + +} // namespace mscclpp + +#endif // MSCCLPPFIFO_H_ From 45172bec886279ff5bc4bdd908381ead3e09eb90 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 14 Apr 2023 14:21:53 +0000 Subject: [PATCH 025/135] Implement mscclpp::Communicator using C-style API --- src/communicator.cc | 70 ++++++++++++++++++++++++++++++----------- src/include/mscclpp.hpp | 35 +-------------------- 2 files changed, 53 insertions(+), 52 deletions(-) diff --git a/src/communicator.cc b/src/communicator.cc index 5519b9c5..3272987d 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -3,49 +3,83 @@ namespace mscclpp { +mscclppTransport_t transportTypeToCStyle(TransportType type) { + switch (type) { + case TransportType::IB: + return mscclppTransportIB; + case TransportType::P2P: + return mscclppTransportP2P; + default: + throw std::runtime_error("Unknown transport type"); + } +} + struct Communicator::impl { mscclppComm_t comm; + std::vector> connections; + + impl() : comm(nullptr) {} + + ~impl() { + if (comm) { + mscclppCommDestroy(comm); + } + } }; void Communicator::initRank(int nranks, const char* ipPortPair, int rank) { - + if (pimpl) { + throw std::runtime_error("Communicator already initialized"); + } + pimpl = std::make_unique(); + mscclppCommInitRank(&pimpl->comm, nranks, ipPortPair, rank); } void Communicator::initRankFromId(int nranks, UniqueId id, int rank) { - + if (pimpl) { + throw std::runtime_error("Communicator already initialized"); + } + pimpl = std::make_unique(); + static_assert(sizeof(mscclppUniqueId) == sizeof(UniqueId), "UniqueId size mismatch"); + mscclppUniqueId *cstyle_id = reinterpret_cast(&id); + mscclppCommInitRankFromId(&pimpl->comm, nranks, *cstyle_id, rank); } void Communicator::bootstrapAllGather(void* data, int size) { - + mscclppBootstrapAllGather(pimpl->comm, data, size); } void Communicator::bootstrapBarrier() { - + mscclppBootstrapBarrier(pimpl->comm); } -std::shared_ptr Communicator::connect(int remoteRank, int 
tag, void* localBuff, uint64_t buffSize, - TransportType transportType, const char* ibDev = 0) { - +std::shared_ptr Communicator::connect(int remoteRank, int tag, + TransportType transportType, const char* ibDev = 0) { + mscclppConnect(pimpl->comm, remoteRank, tag, transportTypeToCStyle(transportType), ibDev); + auto conn = std::make_shared(); + auto connId = pimpl->connections.size(); + conn->pimpl->init(connId); + pimpl->connections.push_back(conn); + return conn; } void Communicator::connectionSetup() { - -} - -void Communicator::destroy() { - + mscclppConnectionSetup(pimpl->comm); + for (int connIdx = 0; connIdx < pimpl->connections.size(); ++connIdx) { + pimpl->connections[connIdx]->pimpl->setup(); + } } int Communicator::rank() { - + int result; + mscclppCommRank(pimpl->comm, &result); + return result; } int Communicator::size() { - -} - -void Communicator::setBootstrapConnTimeout(unsigned timeout) { - + int result; + mscclppCommSize(pimpl->comm, &result); + return result; } } // namespace mscclpp \ No newline at end of file diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index d44b04b9..85aa22f8 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -331,21 +331,15 @@ public: * remoteRank: the rank of the remote process * tag: the tag of the connection. tag is copied into the corresponding mscclppDevConn_t, which can be * used to identify the connection inside a GPU kernel. - * localBuff: the local send/receive buffer - * buffSize: the size of the local buffer * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. 
*/ - std::shared_ptr connect(int remoteRank, int tag, void* localBuff, uint64_t buffSize, - TransportType transportType, const char* ibDev = 0); + std::shared_ptr connect(int remoteRank, int tag, TransportType transportType, const char* ibDev = 0); /* Establish all connections created by mscclppConnect(). This function must be called after all mscclppConnect() * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. */ void connectionSetup(); - - /* Destroy the communicator. */ - void destroy(); /* Return the rank of the calling process. * @@ -361,38 +355,11 @@ public: */ int size(); - /* Set the timeout for the bootstrap connection. - * - * Inputs: - * timeout: the timeout in seconds - */ - void setBootstrapConnTimeout(unsigned timeout); - private: struct impl; std::unique_ptr pimpl; }; -/* Log handler type which is a callback function for - * however user likes to handle the log messages. Once set, - * the logger will just call this function with msg. - */ -typedef void (*LogHandler)(const char* msg); - -/* The default log handler. - * - * Inputs: - * msg: the log message - */ -void defaultLogHandler(const char* msg); - -/* Set a custom log handler. - * - * Inputs: - * handler: the log handler function - */ -void setLogHandler(LogHandler handler); - } // namespace mscclpp #endif // MSCCLPP_H_ From a0f1d3602697bc980789935d29d65435bd902bef Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 14 Apr 2023 15:57:47 +0000 Subject: [PATCH 026/135] Start HostConnection implementation Add declarations on the C-side for functions to enable multiple buffer registrations per connection. 
--- src/{communicator.cc => communicator.cpp} | 23 ++++++---- src/host_connection.cpp | 55 +++++++++++++++++++++++ src/include/communicator.hpp | 0 src/include/host_connection.hpp | 21 +++++++++ src/include/mscclpp.h | 35 +++++++++++++++ src/include/mscclpp.hpp | 8 ++-- src/include/mscclppfifo.hpp | 4 +- 7 files changed, 131 insertions(+), 15 deletions(-) rename src/{communicator.cc => communicator.cpp} (75%) create mode 100644 src/host_connection.cpp create mode 100644 src/include/communicator.hpp create mode 100644 src/include/host_connection.hpp diff --git a/src/communicator.cc b/src/communicator.cpp similarity index 75% rename from src/communicator.cc rename to src/communicator.cpp index 3272987d..73d82997 100644 --- a/src/communicator.cc +++ b/src/communicator.cpp @@ -14,13 +14,13 @@ mscclppTransport_t transportTypeToCStyle(TransportType type) { } } -struct Communicator::impl { +struct Communicator::Impl { mscclppComm_t comm; std::vector> connections; - impl() : comm(nullptr) {} + Impl() : comm(nullptr) {} - ~impl() { + ~Impl() { if (comm) { mscclppCommDestroy(comm); } @@ -31,7 +31,7 @@ void Communicator::initRank(int nranks, const char* ipPortPair, int rank) { if (pimpl) { throw std::runtime_error("Communicator already initialized"); } - pimpl = std::make_unique(); + pimpl = std::make_unique(); mscclppCommInitRank(&pimpl->comm, nranks, ipPortPair, rank); } @@ -39,7 +39,7 @@ void Communicator::initRankFromId(int nranks, UniqueId id, int rank) { if (pimpl) { throw std::runtime_error("Communicator already initialized"); } - pimpl = std::make_unique(); + pimpl = std::make_unique(); static_assert(sizeof(mscclppUniqueId) == sizeof(UniqueId), "UniqueId size mismatch"); mscclppUniqueId *cstyle_id = reinterpret_cast(&id); mscclppCommInitRankFromId(&pimpl->comm, nranks, *cstyle_id, rank); @@ -55,18 +55,23 @@ void Communicator::bootstrapBarrier() { std::shared_ptr Communicator::connect(int remoteRank, int tag, TransportType transportType, const char* ibDev = 0) { - 
mscclppConnect(pimpl->comm, remoteRank, tag, transportTypeToCStyle(transportType), ibDev); + mscclppConnectWithoutBuffer(pimpl->comm, remoteRank, tag, transportTypeToCStyle(transportType), ibDev); auto conn = std::make_shared(); - auto connId = pimpl->connections.size(); - conn->pimpl->init(connId); + auto connIdx = pimpl->connections.size(); pimpl->connections.push_back(conn); return conn; } void Communicator::connectionSetup() { mscclppConnectionSetup(pimpl->comm); + mscclppHostConn_t *hostConns; + int numHostConns; + mscclppGetAllHostConnections(pimpl->comm, &hostConns, &numHostConns); + if (numHostConns != pimpl->connections.size()) { + throw std::logic_error("Number of HostConnections didn't match number of mscclppHostConns"); + } for (int connIdx = 0; connIdx < pimpl->connections.size(); ++connIdx) { - pimpl->connections[connIdx]->pimpl->setup(); + pimpl->connections[connIdx]->pimpl->setup(hostConns[connIdx]); } } diff --git a/src/host_connection.cpp b/src/host_connection.cpp new file mode 100644 index 00000000..6a06de63 --- /dev/null +++ b/src/host_connection.cpp @@ -0,0 +1,55 @@ +#include "host_connection.hpp" + +namespace mscclpp { + +HostConnection::Impl::Impl() : hostConn(nullptr) {} + +HostConnection::Impl::~Impl() { + // TODO: figure out memory ownership. Does this deallocate the mscclppHostConn? Likely not. 
+} + +void HostConnection::Impl::setup(mscclppHostConn_t *hostConn) { + this->hostConn = hostConn; +} + +BufferHandle HostConnection::registerBuffer(void* data, uint64_t size) { + +} + +int HostConnection::numRemoteBuffers() { + +} + +BufferHandle HostConnection::getRemoteBuffer(int index) { + +} + +DeviceConnection HostConnection::toDevice(bool startProxyThread = true) { + +} + +void HostConnection::put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { + +} + +void HostConnection::put(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) { + +} + +void HostConnection::signal() { + +} + +void HostConnection::flush() { + +} + +void HostConnection::wait() { + +} + +void HostConnection::epochIncrement() { + +} + +} // namespace mscclpp \ No newline at end of file diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp new file mode 100644 index 00000000..e69de29b diff --git a/src/include/host_connection.hpp b/src/include/host_connection.hpp new file mode 100644 index 00000000..4a66c846 --- /dev/null +++ b/src/include/host_connection.hpp @@ -0,0 +1,21 @@ +#ifndef MSCCLPP_HOST_CONNECTION_HPP_ +#define MSCCLPP_HOST_CONNECTION_HPP_ + +#include "mscclpp.hpp" +#include "mscclpp.h" + +namespace mscclpp { + +struct HostConnection::Impl { + mscclppHostConn_t* hostConn; + + Impl(); + + ~Impl(); + + void setup(mscclppHostConn_t *hostConn); +}; + +} // namespace mscclpp + +#endif // MSCCLPP_HOST_CONNECTION_HPP_ \ No newline at end of file diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 88404a65..09e71a90 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -29,6 +29,8 @@ struct alignas(16) mscclppDevConnSignalEpochId uint64_t proxy; }; +using mscclppBufferHandle_t = uint8_t; + /*************************************************************************************************************** * A mscclppDevConn provides a zero-copy connection between two GPUs connected via P2P 
NVLink or InfiniBand. * The communication API is one-sided meaning that for every single data transfer, only one side @@ -189,6 +191,7 @@ struct mscclppHostConn { virtual ~mscclppHostConn() = default; virtual void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) = 0; + virtual void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, uint64_t dataSize) = 0; virtual void signal() = 0; virtual void wait() = 0; virtual void flush() = 0; @@ -337,6 +340,38 @@ const char* mscclppGetErrorString(mscclppResult_t result); mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void* localBuff, uint64_t buffSize, mscclppTransport_t transportType, const char* ibDev = 0); +/* Connect to a remote rank. This function only prepares metadata for connection. The actual connection + * is made by a following call of mscclppConnectionSetup(). Note that this function is two-way and a connection + * from rank i to remote rank j needs to have a counterpart from rank j to rank i. + * Note that with IB, buffers are registered at a page level and if a buffer is spread through multiple pages + * and do not fully utilize all of them, IB's QP has to register for all involved pages. This potentially has + * security risks if the devConn's accesses are given to a malicious process. + * + * This version does not register a buffer. Buffers should instead be registered with mscclppRegisterBuffer(). + * + * Inputs: + * comm: the communicator + * remoteRank: the rank of the remote process + * tag: the tag of the connection. tag is copied into the corresponding mscclppDevConn_t, which can be + * used to identify the connection inside a GPU kernel. + * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) + * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. 
+ */ +mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, mscclppTransport_t transportType, const char* ibDev = 0); + +/* Register a buffer for use with a connection. + * + * Inputs: + * comm: the communicator + * connIdx: the index of the connection by order of calls to mscclppConnect/mscclppConnectWithoutBuffer + * localBuff: the local send/receive buffer + * buffSize: the size of the local buffer + * + * Outputs: + * handle: a handle to the buffer registration + */ +mscclppResult_t mscclppRegisterBuffer(mscclppComm_t comm, int connIdx, void* localBuff, uint64_t buffSize, mscclppBufferHandle_t *handle); + /* Establish all connections declared by mscclppConnect(). This function must be called after all mscclppConnect() * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. * diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index 85aa22f8..fbc96f43 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -264,8 +264,8 @@ public: void epochIncrement(); private: - struct impl; - std::unique_ptr pimpl; + struct Impl; + std::unique_ptr pimpl; }; #define MSCCLPP_UNIQUE_ID_BYTES 128 @@ -356,8 +356,8 @@ public: int size(); private: - struct impl; - std::unique_ptr pimpl; + struct Impl; + std::unique_ptr pimpl; }; } // namespace mscclpp diff --git a/src/include/mscclppfifo.hpp b/src/include/mscclppfifo.hpp index 7ab03081..27abd4c5 100644 --- a/src/include/mscclppfifo.hpp +++ b/src/include/mscclppfifo.hpp @@ -1,5 +1,5 @@ -#ifndef MSCCLPPFIFO_H_ -#define MSCCLPPFIFO_H_ +#ifndef MSCCLPPFIFO_HPP_ +#define MSCCLPPFIFO_HPP_ #include #include From 5ad92598c9db944de84d47e75e7cf743929fb402 Mon Sep 17 00:00:00 2001 From: Madan Musuvathi Date: Fri, 14 Apr 2023 20:43:37 +0000 Subject: [PATCH 027/135] added a test for the new C++ api --- tests/allgather_test_cpp.cu | 506 ++++++++++++++++++++++++++++++++++++ 1 file changed, 506 insertions(+) create mode 100644 
tests/allgather_test_cpp.cu diff --git a/tests/allgather_test_cpp.cu b/tests/allgather_test_cpp.cu new file mode 100644 index 00000000..ca30945f --- /dev/null +++ b/tests/allgather_test_cpp.cu @@ -0,0 +1,506 @@ +#include "mscclpp.h" +#include "mscclpp.hpp" + +#ifdef MSCCLPP_USE_MPI_FOR_TESTS +#include "mpi.h" +#endif // MSCCLPP_USE_MPI_FOR_TESTS +#include +#include +#include +#include +#include +#include + +static int nranksPerNode = 8; + +// Propagate errors up + +#define MSCCLPPCHECK(call) \ + do { \ + mscclppResult_t res = call; \ + if (res != mscclppSuccess && res != mscclppInProgress) { \ + /* Print the back trace*/ \ + printf("Failure at %s:%d -> %s\n", __FILE__, __LINE__, mscclppGetErrorString(res)); \ + return res; \ + } \ + } while (0) + +// Check CUDA RT calls +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (false) + +// Measure current time in second. +static double getTime(void) +{ + struct timespec tspec; + if (clock_gettime(CLOCK_MONOTONIC, &tspec) == -1) { + printf("clock_gettime failed\n"); + exit(EXIT_FAILURE); + } + return (tspec.tv_nsec / 1.0e9) + tspec.tv_sec; +} + +__constant__ mscclpp::DeviceConnection constDevConns[16]; + +__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU) +{ + // this allgather is really simple and implemented as an alltoall + + // this thread's role is a sender role + // put your data asynchronously + if ((threadIdx.x % 32) == 0) + devConn.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); + // make sure everyone is put their data before some thread randomly blocks everyone else in signal + __syncthreads(); + // push with flag and sync to make sure the data is received + if ((threadIdx.x % 32) == 0) + devConn.flush(); + + // this thread's role is a receiver role. 
wait on the semaphore to make sure the data is ready + if ((threadIdx.x % 32) == 0) + devConn.wait(); +} + +__device__ void localAllGather(mscclppDevConn_t devConn, int rank, int world_size, int nranksPerNode, int remoteRank, + uint64_t offset, uint64_t size) +{ + // this allgather algorithm works as follows: + // Step 1: GPU rank i sends data to GPU rank (i+1) % nranksPerNode + // and waits for data from GPU rank (i-1) % nranksPerNode + // Step 2: GPU rank i sends data to GPU rank (i+2) % nranksPerNode + // ... + // This order is much better for DMA engine for NVLinks + for (int i = 1; i < nranksPerNode; i++) { + if ((remoteRank % nranksPerNode) == ((rank + i) % nranksPerNode)) { + // put your data to GPU (rank+i) % nranksPerNode and signal in one call + if ((threadIdx.x % 32) == 0) + devConn.putWithSignalAndFlush(offset, size); + } + // wait for the data from GPU (rank-i) % nranksPerNode to arrive + if ((remoteRank % nranksPerNode) == ((rank - i + nranksPerNode) % nranksPerNode)) { + if ((threadIdx.x % 32) == 0) + devConn.wait(); + } + asm volatile("bar.sync %0, %1;" ::"r"(11), "r"((nranksPerNode - 1) * 32) : "memory"); + } +} + +__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int nranksPerNode, int remoteRank, + size_t nelemsPerGPU) +{ + localAllGather(devConn, rank, world_size, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + nelemsPerGPU * sizeof(int)); +} + +__device__ void allgather2(mscclppDevConn_t devConn, int rank, int world_size, int nranksPerNode, int remoteRank, + size_t nelemsPerGPU) +{ + // this allgather is a pipelined and hierarchical one and only works for two nodes + // it is implemented as follows: + // Step 1: each node does a local allgather and concurrently, + // local GPU i exchange (piplineSize-1)/pipelineSize portion of their data with + // its cross-node neighbor (local GPU i on the other node) via IB + // Step 2: each node does a local allgather again with the data just received from its + 
// cross-node neighbor in step 1, and concurrently, exchange the rest of the data with + // its cross-node neighbor + // Step 3: each node does a local allgather for the last time with the rest of the data + + int pipelineSize = 3; + + // Step 1 + // local allgather + if (remoteRank / nranksPerNode == rank / nranksPerNode) { + localAllGather(devConn, rank, world_size, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + nelemsPerGPU * sizeof(int)); + } + // cross-node exchange + if (remoteRank % nranksPerNode == rank % nranksPerNode) { + // opposite side + if ((threadIdx.x % 32) == 0) + devConn.putWithSignalAndFlush(rank * nelemsPerGPU * sizeof(int), + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); + if ((threadIdx.x % 32) == 0) + devConn.wait(); + } + + __syncthreads(); + + // Step 2 + // local allgather + int otherNghr = (rank + nranksPerNode) % world_size; + if (remoteRank / nranksPerNode == rank / nranksPerNode) { + localAllGather(devConn, rank, world_size, nranksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); + } + + // cross-node exchange + if (remoteRank % nranksPerNode == rank % nranksPerNode) { + // opposite side + if ((threadIdx.x % 32) == 0) + devConn.putWithSignalAndFlush((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * + sizeof(int), + nelemsPerGPU / pipelineSize * sizeof(int)); + if ((threadIdx.x % 32) == 0) + devConn.wait(); + } + + __syncthreads(); + + // Step 3 + // local allgather + if (remoteRank / nranksPerNode == rank / nranksPerNode) { + localAllGather(devConn, rank, world_size, nranksPerNode, remoteRank, + (otherNghr * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), + nelemsPerGPU / pipelineSize * sizeof(int)); + } +} + +__global__ void kernel(int rank, int world_size, int nranksPerNode, size_t nelemsPerGPU, int kernel) +{ + // find the mapping between remoteRank and 
devConns + int warpId = threadIdx.x / 32; + int remoteRank = (warpId < rank) ? warpId : warpId + 1; + // Each warp is responsible for one of the remote ranks + mscclppDevConn_t devConn = constDevConns[warpId]; + + if (kernel == 0) + allgather0(devConn, rank, world_size, remoteRank, nelemsPerGPU); + else if (kernel == 1) + allgather1(devConn, rank, world_size, nranksPerNode, remoteRank, nelemsPerGPU); + else if (kernel == 2) + allgather2(devConn, rank, world_size, nranksPerNode, remoteRank, nelemsPerGPU); +} + +int rankToLocalRank(int rank) +{ + return rank % nranksPerNode; +} + +int rankToNode(int rank) +{ + return rank / nranksPerNode; +} + +void print_usage(const char* prog) +{ +#ifdef MSCCLPP_USE_MPI_FOR_TESTS + printf("usage: %s IP:PORT [rank nranks]\n", prog); +#else + printf("usage: %s IP:PORT rank nranks\n", prog); +#endif +} + +void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, size_t nelemsPerGPU, int** data_h, + int** data_d) +{ + CUDACHECK(cudaMalloc(data_d, dataSize)); + CUDACHECK(cudaMemset(*data_d, 0, dataSize)); + + *data_h = new int[nelemsPerGPU * world_size]; + for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { + int val = i + 1; + if (i / nelemsPerGPU == (size_t)rank) { + (*data_h)[i] = val; + } else { + (*data_h)[i] = 0; + } + } + CUDACHECK(cudaMemcpy(*data_d, *data_h, dataSize, cudaMemcpyHostToDevice)); +} + +void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& comm, int* data_d, size_t dataSize) +{ + int thisNode = rankToNode(rank); + int cudaNum = rankToLocalRank(rank); + std::string ibDevStr = "mlx5_ib" + std::to_string(cudaNum); + std::vector devConns(world_size); + + for (int r = 0; r < world_size; ++r) { + if (r == rank) + continue; + mscclpp::TransportType transportType; + const char* ibDev = ibDevStr.c_str(); + if (rankToNode(r) == thisNode) { + ibDev = NULL; + transportType = mscclpp::TransportType::P2P; + } else { + transportType = mscclpp::TransportType::IB; + } + // 
Connect with all other ranks + auto hostConn = comm.connect(r, 0, transportType, ibDev); + hostConn->registerBuffer(data_d, dataSize); + devConns.push_back(hostConn->toDevice(false)); + } + + comm.connectionSetup(); + assert(devConns.size() < sizeof(constDevConns) / sizeof(mscclpp::DeviceConnection)); + CUDACHECK(cudaMemcpyToSymbol(constDevConns, devConns.data(), sizeof(mscclpp::DeviceConnection) * devConns.size() )); +} + +void printUsage(const char* prog, bool isMpi) +{ + if (isMpi) { + std::string st = "you are using MPI for this test\n"; + st += "two possilbe usages are:\n"; + st += "> " + std::string(prog) + "\n"; + st += "or\n"; + st += "> " + std::string(prog) + " -ip_port [ip:port]\n"; + printf("%s", st.c_str()); + } else { + std::string st = "you are NOT using MPI for this test\n"; + st += "the only possible usage:\n"; + st += "> " + std::string(prog) + " -ip_port [ip:port] -rank [rank] -nranks [nranks]\n"; + printf("%s", st.c_str()); + } +} + +std::unordered_map parseArgs(int argc, const char* argv[], bool isMpi) +{ + std::unordered_map options; + + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg == "-rankspernode") { + if (isMpi) { + fprintf(stderr, "Error: -rankspernode should not be specified with MPI.\n"); + exit(-1); + } + if (i + 1 < argc) { + options["rankspernode"] = argv[++i]; + } else { + fprintf(stderr, "Error: -rankspernode option requires an argument.\n"); + ; + exit(-1); + } + } else if (arg == "-kernel") { + if (i + 1 < argc) { + options["kernel"] = argv[++i]; + } else { + fprintf(stderr, "Error: -kernel option requires an argument.\n"); + exit(-1); + } + } else if (arg == "-ip_port") { + if (i + 1 < argc) { + options["ip_port"] = argv[++i]; + } else { + fprintf(stderr, "Error: -ip_port option requires an argument.\n"); + exit(-1); + } + } else if (arg == "-rank") { + if (isMpi) { + fprintf(stderr, "Error: -rank should not be specified with MPI.\n"); + exit(-1); + } + if (i + 1 < argc) { + options["rank"] = 
argv[++i]; + } else { + fprintf(stderr, "Error: -ip_port option requires an argument.\n"); + exit(-1); + } + } else if (arg == "-nranks") { + if (isMpi) { + fprintf(stderr, "Error: -nranks should not be specified with MPI.\n"); + exit(-1); + } + if (i + 1 < argc) { + options["nranks"] = argv[++i]; + } else { + fprintf(stderr, "Error: -ip_port option requires an argument.\n"); + exit(-1); + } + } else if (arg == "-datasize") { + if (i + 1 < argc) { + options["datasize"] = argv[++i]; + } else { + fprintf(stderr, "Error: -datasize option requires an argument.\n"); + exit(-1); + } + } else if (arg == "-help" || arg == "-h") { + printUsage(argv[0], isMpi); + exit(0); + } else { + fprintf(stderr, "Error: Unknown option %s\n", argv[i]); + exit(-1); + } + } + return options; +} + +int main(int argc, const char* argv[]) +{ + bool isMpi = false; +#ifdef MSCCLPP_USE_MPI_FOR_TESTS + isMpi = true; +#endif + + auto parsedArgs = parseArgs(argc, argv, isMpi); + + int rank; + int world_size; +#ifdef MSCCLPP_USE_MPI_FOR_TESTS + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + // get the local number of nodes with MPI + MPI_Comm shmcomm; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm); + int shmrank; + MPI_Comm_size(shmcomm, &shmrank); + nranksPerNode = shmrank; + MPI_Comm_free(&shmcomm); +#else + if (parsedArgs.find("rank") == parsedArgs.end() || parsedArgs.find("nranks") == parsedArgs.end()) { + printUsage(argv[0], isMpi); + exit(-1); + } + rank = std::stoi(parsedArgs["rank"]); + world_size = std::stoi(parsedArgs["nranks"]); + if (parsedArgs.find("rankspernode") == parsedArgs.end()) { + printUsage(argv[0], isMpi); + exit(-1); + } + nranksPerNode = std::stoi(parsedArgs["rankspernode"]); +#endif + int kernelNum = 0; + if (parsedArgs.find("kernel") != parsedArgs.end()) { + kernelNum = std::stoi(parsedArgs["kernel"]); + } + char* ip_port = NULL; + if (parsedArgs.find("ip_port") == 
parsedArgs.end()) { + printUsage(argv[0], isMpi); + exit(-1); + } + ip_port = (char*)parsedArgs["ip_port"].c_str(); + + int thisNode = rankToNode(rank); + int cudaNum = rankToLocalRank(rank); + CUDACHECK(cudaSetDevice(cudaNum)); + + int* data_d; + int* data_h; + size_t dataSize = 1024 * 1024 * 1024; + if (parsedArgs.find("datasize") != parsedArgs.end()) { + dataSize = std::stoul(parsedArgs["datasize"]); + } + size_t nelemsPerGPU = dataSize / sizeof(int) / world_size; + + try{ + mscclpp::Communicator comm; + + if (rank == 0) + printf("Initializing MSCCL++\n"); + + comm.initRank(world_size, ip_port, rank); + + if (rank == 0) + printf("Initializing data for allgather test\n"); + initializeAndAllocateAllGatherData(rank, world_size, dataSize, nelemsPerGPU, &data_h, &data_d); + + if (rank == 0) + printf("Setting up the connection in MSCCL++\n"); + setupMscclppConnections(rank, world_size, comm, data_d, dataSize); + + } catch (std::exception& e) { + // todo: throw exceptions in the implementation and process them here + } + + if (rank == 0) + printf("Launching MSCCL++ proxy threads\n"); + MSCCLPPCHECK(mscclppProxyLaunch(comm)); + + if (rank == 0) + printf("Testing the correctness of AllGather implementation\n"); + cudaStream_t stream; + CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CUDACHECK(cudaDeviceSynchronize()); + kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum); + CUDACHECK(cudaDeviceSynchronize()); + CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost)); + + for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { + int val = i + 1; + if (data_h[i] != val) { + printf("oh uh! 
data_h[%ld] (%d) != val (%d)\n", i, data_h[i], val); + break; + } + } + int tmp[16]; + // A simple barrier + MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int))); + if (rank == 0) + printf("Successfully checked the correctness\n"); + + // Perf test + int iterwithoutcudagraph = 10; + if (rank == 0) + printf("Running %d iterations of the kernel without CUDA graph\n", iterwithoutcudagraph); + CUDACHECK(cudaStreamSynchronize(stream)); + MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int))); + for (int i = 0; i < iterwithoutcudagraph; ++i) { + kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum); + } + CUDACHECK(cudaStreamSynchronize(stream)); + MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int))); + + // cudaGraph Capture + int cudagraphiter = 10; + if (rank == 0) + printf("Capturing %d iterations of the kernel in a CUDA graph\n", cudagraphiter); + cudaGraph_t graph; + cudaGraphExec_t instance; + cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); + for (int i = 0; i < cudagraphiter; ++i) { + kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum); + } + cudaStreamEndCapture(stream, &graph); + cudaGraphInstantiate(&instance, graph, NULL, NULL, 0); + + int cudagraphwarmup = 10; + if (rank == 0) + printf("Warming up %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphwarmup, + cudagraphiter); + for (int i = 0; i < cudagraphwarmup; ++i) { + cudaGraphLaunch(instance, stream); + } + CUDACHECK(cudaStreamSynchronize(stream)); + + // measure runtime + int cudagraphlaunch = 10; + if (rank == 0) + printf("Running %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphlaunch, + cudagraphiter); + MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int))); + double t0, t1, ms, time_in_us; + t0 = getTime(); + for (int i = 0; i < cudagraphlaunch; ++i) { + cudaGraphLaunch(instance, stream); 
+ } + CUDACHECK(cudaStreamSynchronize(stream)); + + t1 = getTime(); + ms = (t1 - t0) * 1000.0; + time_in_us = ms * 1000. / (float)cudagraphlaunch / (float)cudagraphiter; + printf("Rank %d report: size %lu time: %f us/iter algBW %f GBps\n", rank, dataSize, time_in_us, + (double)(dataSize) / 1e9 / (time_in_us / 1e6)); + MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int))); + + if (rank == 0) + printf("Stopping MSCCL++ proxy threads\n"); + MSCCLPPCHECK(mscclppProxyStop(comm)); + + if (rank == 0) + printf("Destroying MSCCL++ communicator\n"); + MSCCLPPCHECK(mscclppCommDestroy(comm)); + printf("Rank %d succeeded!\n", rank); + +#ifdef MSCCLPP_USE_MPI_FOR_TESTS + MPI_Finalize(); +#endif + return 0; +} From 46790d79e8be753ce1cdd941f72e45b0d3a74eb5 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 14 Apr 2023 23:04:48 +0000 Subject: [PATCH 028/135] Implement C API buffer registration support --- src/include/comm.h | 12 ++- src/include/mscclpp.h | 2 +- src/init.cc | 216 ++++++++++++++++++++++++++++++++---------- 3 files changed, 177 insertions(+), 53 deletions(-) diff --git a/src/include/comm.h b/src/include/comm.h index 62a6ba01..8275e0cb 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -9,13 +9,16 @@ #include "ib.h" #include "proxy.h" - -#if defined(ENABLE_NPKIT) #include -#endif #define MAXCONNECTIONS 64 +struct mscclppBufferRegistration +{ + void *data; + uint64_t size; +}; + struct mscclppConn { int connId; @@ -25,6 +28,9 @@ struct mscclppConn struct mscclppDevConn* devConn; struct mscclppHostConn* hostConn; + std::vector bufferRegistrations; + std::vector remoteBufferRegistrations; + struct mscclppIbContext* ibCtx; #if defined(ENABLE_NPKIT) std::vector npkitUsedReqIds; diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 09e71a90..e48eaaf8 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -370,7 +370,7 @@ mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, * Outputs: * handle: a handle 
to the buffer registration */ -mscclppResult_t mscclppRegisterBuffer(mscclppComm_t comm, int connIdx, void* localBuff, uint64_t buffSize, mscclppBufferHandle_t *handle); +mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, uint64_t buffSize, mscclppBufferHandle_t *handle); /* Establish all connections declared by mscclppConnect(). This function must be called after all mscclppConnect() * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. diff --git a/src/init.cc b/src/init.cc index a5793364..c867fd2a 100644 --- a/src/init.cc +++ b/src/init.cc @@ -9,6 +9,7 @@ #include "mscclpp.h" #include #include +#include #if defined(ENABLE_NPKIT) #include "npkit/npkit.h" #endif @@ -323,8 +324,12 @@ struct mscclppHostP2PConn : mscclppHostConn void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) { - void* srcBuff = (void*)((char*)conn->devConn->localBuff + srcDataOffset); - void* dstBuff = (void*)((char*)conn->devConn->remoteBuff + dstDataOffset); + put(1, dstDataOffset, 1, srcDataOffset, dataSize); + } + void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, uint64_t dataSize) + { + void* srcBuff = (void*)((char*)conn->bufferRegistrations[src].data + srcDataOffset); + void* dstBuff = (void*)((char*)conn->remoteBufferRegistrations[dst].data + dstDataOffset); CUDACHECKNORET(cudaMemcpyAsync(dstBuff, srcBuff, dataSize, cudaMemcpyDeviceToDevice, p2pStream)); npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)dataSize); } @@ -357,7 +362,11 @@ struct mscclppHostIBConn : mscclppHostConn void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) { - this->ibQp->stageSend(this->ibBuffMr, &this->ibBuffMrRemoteInfo, (uint32_t)dataSize, + put(1, dstDataOffset, 1, srcDataOffset, dataSize); + } + void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, 
uint64_t srcDataOffset, uint64_t dataSize) + { + this->ibQp->stageSend(this->ibMrs[src], &this->remoteIbMrInfos[dst], (uint32_t)dataSize, /*wrId=*/0, /*srcOffset=*/srcDataOffset, /*dstOffset=*/dstDataOffset, /*signaled=*/false); int ret = this->ibQp->postSend(); if (ret != 0) { @@ -369,7 +378,7 @@ struct mscclppHostIBConn : mscclppHostConn void signal() { // My local device flag is copied to the remote's proxy flag - this->ibQp->stageSend(this->ibSignalEpochIdMr, &this->ibSignalEpochIdMrRemoteInfo, sizeof(uint64_t), + this->ibQp->stageSend(this->ibMrs[0], &this->remoteIbMrInfos[0], sizeof(uint64_t), /*wrId=*/0, /*srcOffset=*/0, /*dstOffset=*/sizeof(uint64_t), /*signaled=*/true); int ret = this->ibQp->postSend(); if (ret != 0) { @@ -410,14 +419,11 @@ struct mscclppHostIBConn : mscclppHostConn mscclppConn* conn; struct mscclppIbQp* ibQp; - struct mscclppIbMr* ibBuffMr; - struct mscclppIbMr* ibSignalEpochIdMr; - struct mscclppIbMrInfo ibBuffMrRemoteInfo; - struct mscclppIbMrInfo ibSignalEpochIdMrRemoteInfo; + std::vector ibMrs; + std::vector remoteIbMrInfos; }; -MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void* localBuff, - uint64_t buffSize, mscclppTransport_t transportType, const char* ibDev) +MSCCLPP_API mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, mscclppTransport_t transportType, const char* ibDev) { // save this processes numa binding and set it to the one closest to the device // so that all the allocation are close to the device @@ -440,7 +446,7 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i struct mscclppConn* conn = &comm->conns[connId]; conn->connId = connId; conn->transport = transportType; - conn->buffSize = buffSize; + conn->buffSize = 0; conn->ibCtx = NULL; int ibDevIdx = -1; @@ -537,7 +543,7 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i struct mscclppDevConn* devConn = 
&comm->devConns[connId]; conn->devConn = devConn; - conn->devConn->localBuff = localBuff; + conn->devConn->localBuff = nullptr; MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->localSignalEpochId, 1)); MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->waitEpochId, 1)); conn->devConn->remoteRank = remoteRank; @@ -556,27 +562,99 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i // change the numa binding back to user's MSCCLPPCHECK(setNumaState(curProcessState)); + mscclppBufferHandle_t signalHandle = -1; + MSCCLPPCHECK(mscclppRegisterBufferForConnection(comm, connId, conn->devConn->localSignalEpochId, sizeof(mscclppDevConnSignalEpochId), &signalHandle)); + if (signalHandle != 0) { + WARN("signal handle should be 0"); + return mscclppInternalError; + } + return mscclppSuccess; } -struct connInfo +MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void* localBuff, + uint64_t buffSize, mscclppTransport_t transportType, const char* ibDev) { - cudaIpcMemHandle_t handleBuff; - cudaIpcMemHandle_t handleSignalEpochId; - mscclppIbQpInfo infoQp; - mscclppIbMrInfo infoBuffMr; - mscclppIbMrInfo infoSignalEpochIdMr; -}; + int connId = comm->nConns; + MSCCLPPCHECK(mscclppConnectWithoutBuffer(comm, remoteRank, tag, transportType, ibDev)); + struct mscclppConn* conn = &comm->conns[connId]; -mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*output*/, struct mscclppConn* conn /*input*/) -{ - if (connInfo == NULL || conn == NULL) { - WARN("connInfo or connection cannot be null"); + conn->buffSize = buffSize; + conn->devConn->localBuff = localBuff; + + mscclppBufferHandle_t localBuffHandle = -1; + MSCCLPPCHECK(mscclppRegisterBufferForConnection(comm, connId, conn->devConn->localSignalEpochId, buffSize, &localBuffHandle)); + if (localBuffHandle != 1) { + WARN("data buffer handle should be 1"); return mscclppInternalError; } - struct mscclppDevConn* devConn = conn->devConn; - 
CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleBuff, devConn->localBuff)); - CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleSignalEpochId, devConn->localSignalEpochId)); + + return mscclppSuccess; +} + +MSCCLPP_API mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, uint64_t buffSize, mscclppBufferHandle_t *handle) { + if (connIdx >= comm->nConns) { + WARN("connIdx out of range"); + return mscclppInvalidArgument; + } + mscclppConn& conn = comm->conns[connIdx]; + *handle = conn.bufferRegistrations.size(); + conn.bufferRegistrations.emplace_back(); + conn.bufferRegistrations.back().data = localBuff; + conn.bufferRegistrations.back().size = buffSize; + + return mscclppSuccess; +} + +struct mscclppBufferRegistrationInfo +{ + cudaIpcMemHandle_t cudaHandle; + mscclppIbMrInfo ibMrInfo; + uint64_t size; +}; + +struct connInfo +{ + mscclppIbQpInfo infoQp; + std::vector bufferInfos; + + struct header { + mscclppIbQpInfo infoQp; + int numBufferInfos; + }; + + mscclppResult_t sendOverBootstrap(void* bootstrap, int remoteRank, int tag) { + header h; + h.infoQp = infoQp; + h.numBufferInfos = bufferInfos.size(); + MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, &h, sizeof(header))); + MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, bufferInfos.data(), bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); + return mscclppSuccess; + } + + mscclppResult_t recvOverBootstrap(void* bootstrap, int remoteRank, int tag) { + header h; + MSCCLPPCHECK(bootstrapRecv(bootstrap, remoteRank, tag, &h, sizeof(header))); + infoQp = h.infoQp; + bufferInfos.resize(h.numBufferInfos); + MSCCLPPCHECK(bootstrapRecv(bootstrap, remoteRank, tag, bufferInfos.data(), bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); + return mscclppSuccess; + } +}; + +mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*input*/, struct mscclppConn* conn /*input*/) +{ + if (conn == NULL) { + WARN("connection cannot be 
null"); + return mscclppInternalError; + } + + // Add all registered buffers + for (const auto &bufReg : conn->bufferRegistrations) { + connInfo->bufferInfos.emplace_back(); + CUDACHECK(cudaIpcGetMemHandle(&connInfo->bufferInfos.back().cudaHandle, bufReg.data)); + connInfo->bufferInfos.back().size = bufReg.size; + } return mscclppSuccess; } @@ -586,10 +664,30 @@ mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/ WARN("ipcHandles or connection cannot be null"); return mscclppInternalError; } - CUDACHECK( - cudaIpcOpenMemHandle((void**)&conn->devConn->remoteBuff, connInfo->handleBuff, cudaIpcMemLazyEnablePeerAccess)); - CUDACHECK(cudaIpcOpenMemHandle((void**)&conn->devConn->remoteSignalEpochId, connInfo->handleSignalEpochId, - cudaIpcMemLazyEnablePeerAccess)); + if (connInfo->bufferInfos.size() < 1) { + WARN("at least 1 buffer info expected"); + return mscclppInternalError; + } + + // Open all remote registered buffers + for (size_t i = 0; i < connInfo->bufferInfos.size(); i++) { + mscclppBufferRegistration newBufReg; + CUDACHECK(cudaIpcOpenMemHandle(&newBufReg.data, connInfo->bufferInfos[i].cudaHandle, cudaIpcMemLazyEnablePeerAccess)); + newBufReg.size = connInfo->bufferInfos[i].size; + conn->remoteBufferRegistrations.push_back(newBufReg); + } + + if (conn->remoteBufferRegistrations[0].size != sizeof(mscclppDevConnSignalEpochId)) { + WARN("buffer registration zero size doesn't match sizeof(mscclppDevConnSignalEpochId)"); + return mscclppInternalError; + } + conn->devConn->remoteSignalEpochId = (mscclppDevConnSignalEpochId*)conn->remoteBufferRegistrations[0].data; + + // For backwards compatibility with the previous API that assumed one data buffer per connection, set the remote buffer + // to the first remote data buffer + if (conn->remoteBufferRegistrations.size() > 1) { + conn->devConn->remoteBuff = conn->remoteBufferRegistrations[1].data; + } return mscclppSuccess; } @@ -608,12 +706,18 @@ mscclppResult_t 
mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output if (hostConn->ibQp == NULL) { MSCCLPPCHECK(mscclppIbContextCreateQp(ibCtx, &hostConn->ibQp)); } - MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->localBuff, conn->buffSize, &hostConn->ibBuffMr)); - MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, devConn->localSignalEpochId, - sizeof(struct mscclppDevConnSignalEpochId), &hostConn->ibSignalEpochIdMr)); + + // Add all registered buffers + for (const auto &bufReg : conn->bufferRegistrations) { + hostConn->ibMrs.emplace_back(); + MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, bufReg.data, + sizeof(struct mscclppDevConnSignalEpochId), &hostConn->ibMrs.back())); + connInfo->bufferInfos.emplace_back(); + connInfo->bufferInfos.back().ibMrInfo = hostConn->ibMrs.back()->info; + connInfo->bufferInfos.back().size = bufReg.size; + } + connInfo->infoQp = hostConn->ibQp->info; - connInfo->infoBuffMr = hostConn->ibBuffMr->info; - connInfo->infoSignalEpochIdMr = hostConn->ibSignalEpochIdMr->info; return mscclppSuccess; } @@ -632,8 +736,18 @@ mscclppResult_t mscclppIbConnectionSetupEnd(struct connInfo* connInfo /*input*/, WARN("Failed to transition QP to RTS"); return mscclppInvalidUsage; } - hostConn->ibBuffMrRemoteInfo = connInfo->infoBuffMr; - hostConn->ibSignalEpochIdMrRemoteInfo = connInfo->infoSignalEpochIdMr; + + // No remote pointers to set with IB, so we just set the Mrs + + // Push the Mrs for all the remote registered buffers + for (size_t i = 1; i < connInfo->bufferInfos.size(); i++) { + hostConn->remoteIbMrInfos.push_back(connInfo->bufferInfos[i].ibMrInfo); + + mscclppBufferRegistration newBufReg; + newBufReg.data = nullptr; + newBufReg.size = connInfo->bufferInfos[i].size; + conn->remoteBufferRegistrations.push_back(newBufReg); + } return mscclppSuccess; } @@ -650,14 +764,15 @@ MSCCLPP_API mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) MSCCLPPCHECK(mscclppIbConnectionSetupStart(&cInfo, conn)); } // TODO: from saemal: do we possibly 
deadlock if there are too many outstanding sends? - MSCCLPPCHECK(bootstrapSend(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &cInfo, sizeof(cInfo))); + // MSCCLPPCHECK(bootstrapSend(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &cInfo, sizeof(cInfo))); + MSCCLPPCHECK(cInfo.sendOverBootstrap(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag)); } // Recv info from peers for (int i = 0; i < comm->nConns; ++i) { struct mscclppConn* conn = &comm->conns[i]; struct connInfo cInfo; - MSCCLPPCHECK(bootstrapRecv(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &cInfo, sizeof(cInfo))); + MSCCLPPCHECK(cInfo.recvOverBootstrap(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag)); if (conn->transport == mscclppTransportP2P) { MSCCLPPCHECK(mscclppP2pConnectionSetupEnd(&cInfo, conn)); } else if (conn->transport == mscclppTransportIB) { @@ -731,16 +846,19 @@ MSCCLPP_API mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, msc void* dstBuff = regMem->p2p[i].remoteBuff; CUDACHECK(cudaMemcpyAsync(dstBuff, srcBuff, size, cudaMemcpyDeviceToDevice, (cudaStream_t)stream)); } else { - struct mscclppHostIBConn* hostConn = (struct mscclppHostIBConn*)conn->hostConn; - hostConn->ibQp->stageSend(hostConn->ibBuffMr, &hostConn->ibBuffMrRemoteInfo, (uint32_t)size, - /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, /*signaled=*/false); - if ((ret = hostConn->ibQp->postSend()) != 0) { - // Return value is errno. - WARN("data postSend failed: errno %d", ret); - } - // ?? 
- // npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_ENTRY, (uint32_t)trigger.fields.dataSize, - // trigger.fields.connId); + WARN("mscclppRegisteredBufferWrite not implemented for IB"); + return mscclppInternalError; + // TODO: fix the following (Olli: probably by including the relevant ibBuffMr in the mscclppRegisteredMemory) + // struct mscclppHostIBConn* hostConn = (struct mscclppHostIBConn*)conn->hostConn; + // hostConn->ibQp->stageSend(hostConn->ibBuffMr, &hostConn->ibBuffMrRemoteInfo, (uint32_t)size, + // /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, /*signaled=*/false); + // if ((ret = hostConn->ibQp->postSend()) != 0) { + // // Return value is errno. + // WARN("data postSend failed: errno %d", ret); + // } + // // ?? + // // npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_ENTRY, (uint32_t)trigger.fields.dataSize, + // // trigger.fields.connId); } } return mscclppSuccess; From 65597e1f63a38e18f1542977fd6a8c8f52cff967 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 14 Apr 2023 23:35:10 +0000 Subject: [PATCH 029/135] Fix a copy-paste mistake --- src/init.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/init.cc b/src/init.cc index c867fd2a..7c3b76b9 100644 --- a/src/init.cc +++ b/src/init.cc @@ -583,7 +583,7 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i conn->devConn->localBuff = localBuff; mscclppBufferHandle_t localBuffHandle = -1; - MSCCLPPCHECK(mscclppRegisterBufferForConnection(comm, connId, conn->devConn->localSignalEpochId, buffSize, &localBuffHandle)); + MSCCLPPCHECK(mscclppRegisterBufferForConnection(comm, connId, localBuff, buffSize, &localBuffHandle)); if (localBuffHandle != 1) { WARN("data buffer handle should be 1"); return mscclppInternalError; From 0315c29aba63f3dd14fe40ef13e71e7db67b05a5 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 18 Apr 2023 05:30:07 +0000 Subject: [PATCH 030/135] first class --- src/include/bootstrap.h | 14 ++++++++++++++ 
src/include/mscclpp.h | 11 +++++++++++ 2 files changed, 25 insertions(+) diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 95320b07..25dbe51c 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -20,6 +20,20 @@ struct mscclppBootstrapHandle static_assert(sizeof(struct mscclppBootstrapHandle) <= sizeof(mscclppUniqueId), "Bootstrap handle is too large to fit inside MSCCLPP unique ID"); +class mscclppBootstrap : Bootstrap { +public: + mscclppBootstrap(std::string ip_port_pair, int rank, int nranks); + mscclppBootstrap(mscclppBootstrapHandle handle, int rank, int nranks); + mscclppBootstrapHandle mscclppGetUniqueId(); + void Send(void* data, int size, int peer, int tag); + void Recv(void* data, int size, int peer, int tag); + void AllGather(void* allData, int size); + void Barrier(); +private: + struct impl; + std::unique_ptr pimpl; +}; + mscclppResult_t bootstrapNetInit(const char* ip_port_pair = NULL); mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle); mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot = true, diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index e48eaaf8..e10b8e4f 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -247,6 +247,17 @@ typedef enum mscclppNumResults = 8 } mscclppResult_t; + +class Bootstrap { +public: + Bootstrap(){}; + virtual ~Bootstrap() = 0; + virtual void Send(void* data, int size, int peer, int tag) = 0; + virtual void Recv(void* data, int size, int peer, int tag) = 0; + virtual void AllGather(void* allData, int size) = 0; + virtual void Barrier() = 0; +}; + /* Create a unique ID for communication. Only needs to be called by one process. * Use with mscclppCommInitRankFromId(). * All processes need to provide the same ID to mscclppCommInitRankFromId(). 
From ec9737db8267a2ad854fa815f265dd82d5e1371c Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 19 Apr 2023 00:34:47 +0000 Subject: [PATCH 031/135] progress --- src/bootstrap/bootstrap.cc | 74 ++++++++++++++++++++++++++++++++++++++ src/include/bootstrap.h | 4 ++- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 11389222..32d245d2 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -11,6 +11,80 @@ #include #include +struct mscclppBootstrap::impl{ + void NetInit(std::string ipPortPair = ""){ + static bool initialized = false; + static pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; + if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) + return; + pthread_mutex_lock(&initLock); + if (!initialized) { + + if (ipPortPair != "") { + union mscclppSocketAddress remoteAddr; + if (mscclppSocketGetAddrFromString(&remoteAddr, ipPortPair.c_str()) != mscclppSuccess) { + throw std::runtime_error("Invalid ip:port, please use format: : or []: or :"); + } + if (mscclppFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, + 1) <= 0) { + throw std::runtime_error("NET/Socket : No usable listening interface found"); + } + } else { + int nIfs = mscclppFindInterfaces(this->bootstrapNetIfName, &this->bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); + if (nIfs <= 0) { + throw std::runtime_error("Bootstrap : no socket interface found"); + } + } + char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2]; + sprintf(line, " %s:", bootstrapNetIfName); + mscclppSocketToString(&bootstrapNetIfAddr, line + strlen(line)); + INFO(MSCCLPP_INIT, "Bootstrap : Using%s", line); + __atomic_store_n(&initialized, true, __ATOMIC_RELEASE); + } + pthread_mutex_unlock(&initLock); + } + + static char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1]; + static union mscclppSocketAddress bootstrapNetIfAddr; +}; + +struct mscclppBootstrap::UniqueId{ + uint64_t magic; + union 
mscclppSocketAddress addr; +}; + +static uint64_t hashUniqueId(mscclppBootstrapHandle const& id) +{ + char const* bytes = (char const*)&id; + uint64_t h = 0xdeadbeef; + for (int i = 0; i < (int)sizeof(mscclppBootstrapHandle); i++) { + h ^= h >> 32; + h *= 0x8db3db47fa2994ad; + h += bytes[i]; + } + return h; +} + +std::unique_ptr mscclppBootstrap::GetUniqueId(){ + pimpl->NetInit(); + + mscclppBootstrap::UniqueId handle; + auto ret = getRandomData(&handle.magic, sizeof(handle.magic)); + if (ret != mscclppSuccess) { + throw std::runtime_error("getting random data failed"); + } + memcpy(&handle.addr, &pimpl->bootstrapNetIfAddr, sizeof(union mscclppSocketAddress)); + // ret = bootstrapCreateRoot(handle); + + // mscclppResult_t res = bootstrapGetUniqueId(&handle); + // if (res != mscclppSuccess) { + // throw std::runtime_error("Bootstrap : failed to get unique ID"); + // } + // TRACE_CALL("mscclppGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(handle)); + // return *(mscclppUniqueId*)&handle; +} + + struct bootstrapRootArgs { struct mscclppSocket* listenSock; diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 25dbe51c..dbf72e6c 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -24,11 +24,13 @@ class mscclppBootstrap : Bootstrap { public: mscclppBootstrap(std::string ip_port_pair, int rank, int nranks); mscclppBootstrap(mscclppBootstrapHandle handle, int rank, int nranks); - mscclppBootstrapHandle mscclppGetUniqueId(); void Send(void* data, int size, int peer, int tag); void Recv(void* data, int size, int peer, int tag); void AllGather(void* allData, int size); void Barrier(); + struct UniqueId; + std::unique_ptr GetUniqueId(); + private: struct impl; std::unique_ptr pimpl; From 83c7ba1afb4067e8f14859364758394b02965556 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 19 Apr 2023 17:11:21 +0000 Subject: [PATCH 032/135] C++ API working, allgather_test_cpp passing --- Makefile | 3 +- src/basic_proxy_handler.cc | 29 ++++ 
src/communicator.cc | 114 ++++++++++++++++ src/communicator.cpp | 90 ------------ src/fifo.cc | 67 +++++++++ src/host_connection.cc | 66 +++++++++ src/host_connection.cpp | 55 -------- src/include/api.h | 1 + src/include/basic_proxy_handler.hpp | 13 ++ src/include/checks.hpp | 29 ++++ src/include/communicator.hpp | 23 ++++ src/include/host_connection.hpp | 7 +- src/include/mscclpp.h | 2 +- src/include/mscclpp.hpp | 165 ++++++++++++++++++---- src/include/mscclppfifo.hpp | 54 +++++--- src/init.cc | 4 +- src/proxy_cpp.cc | 93 +++++++++++++ tests/allgather_test_cpp.cu | 204 ++++++++++++++-------------- 18 files changed, 720 insertions(+), 299 deletions(-) create mode 100644 src/basic_proxy_handler.cc create mode 100644 src/communicator.cc delete mode 100644 src/communicator.cpp create mode 100644 src/fifo.cc create mode 100644 src/host_connection.cc delete mode 100644 src/host_connection.cpp create mode 100644 src/include/basic_proxy_handler.hpp create mode 100644 src/include/checks.hpp create mode 100644 src/proxy_cpp.cc diff --git a/Makefile b/Makefile index 92a68248..e544aeee 100644 --- a/Makefile +++ b/Makefile @@ -120,6 +120,7 @@ LDFLAGS := $(NVLDFLAGS) $(GDRCOPY_LDFLAGS) -libverbs -lnuma LIBSRCS := $(addprefix src/,debug.cc utils.cc init.cc proxy.cc ib.cc config.cc) LIBSRCS += $(addprefix src/bootstrap/,bootstrap.cc socket.cc) +LIBSRCS += $(addprefix src/,communicator.cc fifo.cc host_connection.cc proxy_cpp.cc basic_proxy_handler.cc) ifneq ($(NPKIT), 0) LIBSRCS += $(addprefix src/misc/,npkit.cc) endif @@ -147,7 +148,7 @@ UTOBJTARGETS := $(UTOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) UTBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(UTOBJS)) TESTSDIR := tests -TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test_standalone.cu) +TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test_standalone.cu allgather_test_cpp.cu) TESTSOBJS := $(patsubst %.cc,%.o,$(TESTSSRCS)) $(patsubst %.cu,%.o,$(TESTSSRCS)) TESTSOBJTARGETS := 
$(TESTSOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) TESTSBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(TESTSOBJS)) diff --git a/src/basic_proxy_handler.cc b/src/basic_proxy_handler.cc new file mode 100644 index 00000000..736c44bd --- /dev/null +++ b/src/basic_proxy_handler.cc @@ -0,0 +1,29 @@ +#include "basic_proxy_handler.hpp" + +namespace mscclpp { + +ProxyHandler makeBasicProxyHandler(Communicator::Impl &comm) { + return [&comm](ProxyTrigger triggerRaw) { + ChannelTrigger *trigger = reinterpret_cast(&triggerRaw); + HostConnection& conn = *comm.connections.at(trigger->fields.connId); + + auto result = ProxyHandlerResult::Continue; + + if (trigger->fields.type & mscclppData) { + conn.put(trigger->fields.dstBufferHandle, trigger->fields.dstOffset, trigger->fields.srcBufferHandle, trigger->fields.srcOffset, trigger->fields.size); + } + + if (trigger->fields.type & mscclppFlag) { + conn.signal(); + } + + if (trigger->fields.type & mscclppSync) { + conn.flush(); + result = ProxyHandlerResult::FlushAndContinue; + } + + return result; + }; +} + +} // namespace mscclpp diff --git a/src/communicator.cc b/src/communicator.cc new file mode 100644 index 00000000..cade59a3 --- /dev/null +++ b/src/communicator.cc @@ -0,0 +1,114 @@ +#include "communicator.hpp" +#include "host_connection.hpp" +#include "comm.h" +#include "basic_proxy_handler.hpp" +#include "api.h" + +namespace mscclpp { + +Communicator::Impl::Impl() : comm(nullptr), proxy(makeBasicProxyHandler(*this)) {} + +Communicator::Impl::~Impl() { + if (comm) { + mscclppCommDestroy(comm); + } +} + +MSCCLPP_API_CPP Communicator::Communicator() = default; +MSCCLPP_API_CPP Communicator::~Communicator() = default; + +mscclppTransport_t transportTypeToCStyle(TransportType type) { + switch (type) { + case TransportType::IB: + return mscclppTransportIB; + case TransportType::P2P: + return mscclppTransportP2P; + default: + throw std::runtime_error("Unknown transport type"); + } +} + +MSCCLPP_API_CPP void Communicator::initRank(int nranks, 
const char* ipPortPair, int rank) { + if (pimpl) { + throw std::runtime_error("Communicator already initialized"); + } + pimpl = std::make_unique(); + mscclppCommInitRank(&pimpl->comm, nranks, ipPortPair, rank); +} + +MSCCLPP_API_CPP void Communicator::initRankFromId(int nranks, UniqueId id, int rank) { + if (pimpl) { + throw std::runtime_error("Communicator already initialized"); + } + pimpl = std::make_unique(); + static_assert(sizeof(mscclppUniqueId) == sizeof(UniqueId), "UniqueId size mismatch"); + mscclppUniqueId *cstyle_id = reinterpret_cast(&id); + mscclppCommInitRankFromId(&pimpl->comm, nranks, *cstyle_id, rank); +} + +MSCCLPP_API_CPP void Communicator::bootstrapAllGather(void* data, int size) { + if (!pimpl) { + throw std::runtime_error("Communicator not initialized"); + } + mscclppBootstrapAllGather(pimpl->comm, data, size); +} + +MSCCLPP_API_CPP void Communicator::bootstrapBarrier() { + if (!pimpl) { + throw std::runtime_error("Communicator not initialized"); + } + mscclppBootstrapBarrier(pimpl->comm); +} + +MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, + TransportType transportType, const char* ibDev) { + if (!pimpl) { + throw std::runtime_error("Communicator not initialized"); + } + mscclppConnectWithoutBuffer(pimpl->comm, remoteRank, tag, transportTypeToCStyle(transportType), ibDev); + auto connIdx = pimpl->connections.size(); + auto conn = std::make_shared(std::make_unique(this, &pimpl->comm->conns[connIdx])); + pimpl->connections.push_back(conn); + return conn; +} + +MSCCLPP_API_CPP void Communicator::connectionSetup() { + if (!pimpl) { + throw std::runtime_error("Communicator not initialized"); + } + mscclppConnectionSetup(pimpl->comm); +} + +MSCCLPP_API_CPP void Communicator::startProxying() { + if (!pimpl) { + throw std::runtime_error("Communicator not initialized"); + } + pimpl->proxy.start(); +} + +MSCCLPP_API_CPP void Communicator::stopProxying() { + if (!pimpl) { + throw std::runtime_error("Communicator not 
initialized"); + } + pimpl->proxy.stop(); +} + +MSCCLPP_API_CPP int Communicator::rank() { + if (!pimpl) { + throw std::runtime_error("Communicator not initialized"); + } + int result; + mscclppCommRank(pimpl->comm, &result); + return result; +} + +MSCCLPP_API_CPP int Communicator::size() { + if (!pimpl) { + throw std::runtime_error("Communicator not initialized"); + } + int result; + mscclppCommSize(pimpl->comm, &result); + return result; +} + +} // namespace mscclpp \ No newline at end of file diff --git a/src/communicator.cpp b/src/communicator.cpp deleted file mode 100644 index 73d82997..00000000 --- a/src/communicator.cpp +++ /dev/null @@ -1,90 +0,0 @@ -#include "mscclpp.hpp" -#include "mscclpp.h" - -namespace mscclpp { - -mscclppTransport_t transportTypeToCStyle(TransportType type) { - switch (type) { - case TransportType::IB: - return mscclppTransportIB; - case TransportType::P2P: - return mscclppTransportP2P; - default: - throw std::runtime_error("Unknown transport type"); - } -} - -struct Communicator::Impl { - mscclppComm_t comm; - std::vector> connections; - - Impl() : comm(nullptr) {} - - ~Impl() { - if (comm) { - mscclppCommDestroy(comm); - } - } -}; - -void Communicator::initRank(int nranks, const char* ipPortPair, int rank) { - if (pimpl) { - throw std::runtime_error("Communicator already initialized"); - } - pimpl = std::make_unique(); - mscclppCommInitRank(&pimpl->comm, nranks, ipPortPair, rank); -} - -void Communicator::initRankFromId(int nranks, UniqueId id, int rank) { - if (pimpl) { - throw std::runtime_error("Communicator already initialized"); - } - pimpl = std::make_unique(); - static_assert(sizeof(mscclppUniqueId) == sizeof(UniqueId), "UniqueId size mismatch"); - mscclppUniqueId *cstyle_id = reinterpret_cast(&id); - mscclppCommInitRankFromId(&pimpl->comm, nranks, *cstyle_id, rank); -} - -void Communicator::bootstrapAllGather(void* data, int size) { - mscclppBootstrapAllGather(pimpl->comm, data, size); -} - -void 
Communicator::bootstrapBarrier() { - mscclppBootstrapBarrier(pimpl->comm); -} - -std::shared_ptr Communicator::connect(int remoteRank, int tag, - TransportType transportType, const char* ibDev = 0) { - mscclppConnectWithoutBuffer(pimpl->comm, remoteRank, tag, transportTypeToCStyle(transportType), ibDev); - auto conn = std::make_shared(); - auto connIdx = pimpl->connections.size(); - pimpl->connections.push_back(conn); - return conn; -} - -void Communicator::connectionSetup() { - mscclppConnectionSetup(pimpl->comm); - mscclppHostConn_t *hostConns; - int numHostConns; - mscclppGetAllHostConnections(pimpl->comm, &hostConns, &numHostConns); - if (numHostConns != pimpl->connections.size()) { - throw std::logic_error("Number of HostConnections didn't match number of mscclppHostConns"); - } - for (int connIdx = 0; connIdx < pimpl->connections.size(); ++connIdx) { - pimpl->connections[connIdx]->pimpl->setup(hostConns[connIdx]); - } -} - -int Communicator::rank() { - int result; - mscclppCommRank(pimpl->comm, &result); - return result; -} - -int Communicator::size() { - int result; - mscclppCommSize(pimpl->comm, &result); - return result; -} - -} // namespace mscclpp \ No newline at end of file diff --git a/src/fifo.cc b/src/fifo.cc new file mode 100644 index 00000000..fe7f12d3 --- /dev/null +++ b/src/fifo.cc @@ -0,0 +1,67 @@ +#include "mscclppfifo.hpp" +#include "alloc.h" +#include "checks.hpp" +#include +#include +#include + +namespace mscclpp { + +struct HostProxyFifo::Impl { + DeviceProxyFifo deviceFifo; + + // allocated on the host. Only accessed by the host. This is a copy of the + // value pointed to by fifoTailDev and the invariant is that + // *fifoTailDev <= hostTail. Meaning that host's copy of tail is + // always ahead of the device's copy and host updates the device's copy + // only when it is needed. Therefore, hostTail is the "true" tail + // and fifoTailDev is a "stale" tail. See proxy.cc to understand how + // these updates are pushed to the device.
+ uint64_t hostTail; + + // for transferring fifo tail + cudaStream_t stream; +}; + +HostProxyFifo::HostProxyFifo() { + pimpl = std::make_unique(); + MSCCLPPTHROW(mscclppCudaCalloc(&pimpl->deviceFifo.head, 1)); + MSCCLPPTHROW(mscclppCudaHostCalloc(&pimpl->deviceFifo.triggers, MSCCLPP_PROXY_FIFO_SIZE)); + MSCCLPPTHROW(mscclppCudaCalloc(&pimpl->deviceFifo.tailReplica, 1)); + CUDATHROW(cudaStreamCreateWithFlags(&pimpl->stream, cudaStreamNonBlocking)); + pimpl->hostTail = 0; +} + +HostProxyFifo::~HostProxyFifo() { + MSCCLPPTHROW(mscclppCudaFree(pimpl->deviceFifo.head)); + MSCCLPPTHROW(mscclppCudaHostFree(pimpl->deviceFifo.triggers)); + MSCCLPPTHROW(mscclppCudaFree(pimpl->deviceFifo.tailReplica)); + CUDATHROW(cudaStreamDestroy(pimpl->stream)); +} + +void HostProxyFifo::poll(ProxyTrigger *trigger) { + __m128i xmm0 = _mm_load_si128((__m128i*)&pimpl->deviceFifo.triggers[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]); + _mm_store_si128((__m128i*)trigger, xmm0); +} + +void HostProxyFifo::pop() { + *(volatile uint64_t*)(&pimpl->deviceFifo.triggers[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0; + (pimpl->hostTail)++; +} + +void HostProxyFifo::flushTail(bool sync) { + // Flush the tail to device memory. This is either triggered every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER to make sure + // that the fifo can make progress even if there is no request mscclppSync. However, mscclppSync type is for flush + // request. 
+ CUDATHROW( + cudaMemcpyAsync(pimpl->deviceFifo.tailReplica, &pimpl->hostTail, sizeof(uint64_t), cudaMemcpyHostToDevice, pimpl->stream)); + if (sync) { + CUDATHROW(cudaStreamSynchronize(pimpl->stream)); + } +} + +DeviceProxyFifo HostProxyFifo::toDevice() { + return pimpl->deviceFifo; +} + +} // namespace mscclpp diff --git a/src/host_connection.cc b/src/host_connection.cc new file mode 100644 index 00000000..cba9f81d --- /dev/null +++ b/src/host_connection.cc @@ -0,0 +1,66 @@ +#include "host_connection.hpp" +#include "communicator.hpp" +#include "comm.h" +#include "mscclpp.h" +#include "mscclppfifo.h" +#include "api.h" + +namespace mscclpp { + +HostConnection::Impl::Impl(Communicator* comm, mscclppConn* conn) : comm(comm), conn(conn) { + this->hostConn = conn->hostConn; +} + +HostConnection::Impl::~Impl() { + // TODO: figure out memory ownership. Does this deallocate the mscclppHostConn? Likely not. +} + +MSCCLPP_API_CPP HostConnection::HostConnection(std::unique_ptr p) : pimpl(std::move(p)) {} + +MSCCLPP_API_CPP BufferHandle HostConnection::registerBuffer(void* data, uint64_t size) { + BufferHandle result; + static_assert(sizeof(BufferHandle) == sizeof(mscclppBufferHandle_t)); + mscclppRegisterBufferForConnection(pimpl->comm->pimpl->comm, pimpl->conn->connId, data, size, reinterpret_cast(&result)); + return result; +} + +MSCCLPP_API_CPP int HostConnection::numRemoteBuffers() { + if (!pimpl->conn) { + throw std::runtime_error("HostConnection not initialized"); + } + return pimpl->conn->remoteBufferRegistrations.size() - 1; +} + +MSCCLPP_API_CPP BufferHandle HostConnection::getRemoteBuffer(int index) { + return index + 1; +} + +MSCCLPP_API_CPP DeviceConnection HostConnection::toDevice() { + DeviceConnection devConn; + static_assert(sizeof(SignalEpochId) == sizeof(mscclppDevConnSignalEpochId)); + devConn.connectionId = pimpl->conn->connId; + devConn.localSignalEpochId = reinterpret_cast(pimpl->conn->devConn->localSignalEpochId); + devConn.remoteSignalEpochId = 
reinterpret_cast(pimpl->conn->devConn->remoteSignalEpochId); + devConn.waitEpochId = pimpl->conn->devConn->waitEpochId; + devConn.fifo = pimpl->comm->pimpl->proxy.fifo().toDevice(); + + return devConn; +} + +MSCCLPP_API_CPP void HostConnection::put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { + pimpl->hostConn->put(dst, dstOffset, src, srcOffset, size); +} + +MSCCLPP_API_CPP void HostConnection::signal() { + pimpl->hostConn->signal(); +} + +MSCCLPP_API_CPP void HostConnection::flush() { + pimpl->hostConn->flush(); +} + +MSCCLPP_API_CPP void HostConnection::wait() { + pimpl->hostConn->wait(); +} + +} // namespace mscclpp \ No newline at end of file diff --git a/src/host_connection.cpp b/src/host_connection.cpp deleted file mode 100644 index 6a06de63..00000000 --- a/src/host_connection.cpp +++ /dev/null @@ -1,55 +0,0 @@ -#include "host_connection.hpp" - -namespace mscclpp { - -HostConnection::Impl::Impl() : hostConn(nullptr) {} - -HostConnection::Impl::~Impl() { - // TODO: figure out memory ownership. Does this deallocate the mscclppHostConn? Likely not. 
-} - -void HostConnection::Impl::setup(mscclppHostConn_t *hostConn) { - this->hostConn = hostConn; -} - -BufferHandle HostConnection::registerBuffer(void* data, uint64_t size) { - -} - -int HostConnection::numRemoteBuffers() { - -} - -BufferHandle HostConnection::getRemoteBuffer(int index) { - -} - -DeviceConnection HostConnection::toDevice(bool startProxyThread = true) { - -} - -void HostConnection::put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { - -} - -void HostConnection::put(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) { - -} - -void HostConnection::signal() { - -} - -void HostConnection::flush() { - -} - -void HostConnection::wait() { - -} - -void HostConnection::epochIncrement() { - -} - -} // namespace mscclpp \ No newline at end of file diff --git a/src/include/api.h b/src/include/api.h index bc5bd1a6..cf546e39 100644 --- a/src/include/api.h +++ b/src/include/api.h @@ -2,5 +2,6 @@ #define MSCCLPP_API_H_ #define MSCCLPP_API extern "C" __attribute__((visibility("default"))) +#define MSCCLPP_API_CPP __attribute__((visibility("default"))) #endif // MSCCLPP_API_H_ diff --git a/src/include/basic_proxy_handler.hpp b/src/include/basic_proxy_handler.hpp new file mode 100644 index 00000000..1c4b3f86 --- /dev/null +++ b/src/include/basic_proxy_handler.hpp @@ -0,0 +1,13 @@ +#ifndef MSCCLPP_BASIC_PROXY_SERVICE_HPP_ +#define MSCCLPP_BASIC_PROXY_SERVICE_HPP_ + +#include "mscclpp.hpp" +#include "communicator.hpp" + +namespace mscclpp { + +ProxyHandler makeBasicProxyHandler(Communicator::Impl &comm); + +} + +#endif \ No newline at end of file diff --git a/src/include/checks.hpp b/src/include/checks.hpp new file mode 100644 index 00000000..ad985e76 --- /dev/null +++ b/src/include/checks.hpp @@ -0,0 +1,29 @@ +/************************************************************************* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef MSCCLPP_CHECKS_HPP_ +#define MSCCLPP_CHECKS_HPP_ + +#include "debug.h" +#include + +#define MSCCLPPTHROW(call) \ + do { \ + mscclppResult_t res = call; \ + if (res != mscclppSuccess && res != mscclppInProgress) { \ + throw std::runtime_error(std::string("Call to " #call " failed with error code ") + mscclppGetErrorString(res)); \ + } \ + } while (0); + +#define CUDATHROW(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + throw std::runtime_error(std::string("Cuda failure '") + cudaGetErrorString(err) + "'"); \ + } \ + } while (false) + +#endif diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index e69de29b..8294eeb6 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -0,0 +1,23 @@ +#ifndef MSCCL_COMMUNICATOR_HPP_ +#define MSCCL_COMMUNICATOR_HPP_ + +#include "mscclpp.hpp" +#include "mscclpp.h" + +namespace mscclpp { + +struct Communicator::Impl { + mscclppComm_t comm; + std::vector> connections; + Proxy proxy; + + Impl(); + + ~Impl(); + + friend class HostConnection; +}; + +} // namespace mscclpp + +#endif \ No newline at end of file diff --git a/src/include/host_connection.hpp b/src/include/host_connection.hpp index 4a66c846..495130d9 100644 --- a/src/include/host_connection.hpp +++ b/src/include/host_connection.hpp @@ -3,17 +3,18 @@ #include "mscclpp.hpp" #include "mscclpp.h" +#include "comm.h" namespace mscclpp { struct HostConnection::Impl { + Communicator* comm; + mscclppConn* conn; mscclppHostConn_t* hostConn; - Impl(); + Impl(Communicator* comm, mscclppConn* conn); ~Impl(); - - void setup(mscclppHostConn_t *hostConn); }; } // namespace mscclpp diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index e48eaaf8..4465cfed 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -29,7 +29,7 @@ struct alignas(16) mscclppDevConnSignalEpochId 
uint64_t proxy; }; -using mscclppBufferHandle_t = uint8_t; +using mscclppBufferHandle_t = uint32_t; /*************************************************************************************************************** * A mscclppDevConn provides a zero-copy connection between two GPUs connected via P2P NVLink or InfiniBand. diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index fbc96f43..4a21bae7 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -13,6 +13,7 @@ #include #include +#include #include @@ -27,15 +28,14 @@ struct alignas(16) SignalEpochId { uint64_t proxy; }; -enum ChannelTriggerType : uint64_t { - channelTriggerData = 0x1, - channelTriggerFlag = 0x2, - channelTriggerSync = 0x4 -}; +using ChannelTriggerType = uint64_t; +const ChannelTriggerType channelTriggerData = 0x1; +const ChannelTriggerType channelTriggerFlag = 0x2; +const ChannelTriggerType channelTriggerSync = 0x4; // This is just a numeric ID. Each HostConnection will have an internal array indexed by these handles // mapping to the actual -using BufferHandle = uint8_t; +using BufferHandle = uint32_t; #define MSCCLPP_BITS_SIZE 32 #define MSCCLPP_BITS_OFFSET 32 @@ -58,15 +58,18 @@ union ChannelTrigger { uint64_t srcBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; uint64_t dstBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; uint64_t type : MSCCLPP_BITS_TYPE; + uint64_t connId : MSCCLPP_BITS_CONNID; uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_TYPE); // ensure 64-bit alignment } fields; - ChannelTrigger() {} - ChannelTrigger(ProxyTrigger value) : value(value) {} - ChannelTrigger(ChannelTriggerType type, BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { +#ifdef __CUDACC__ + __device__ ChannelTrigger() {} + __device__ ChannelTrigger(ProxyTrigger value) : value(value) {} + __device__ ChannelTrigger(ChannelTriggerType type, BufferHandle dst, uint64_t dstOffset, BufferHandle src, 
uint64_t srcOffset, uint64_t size, int connectionId) { value.fst = ((srcOffset << MSCCLPP_BITS_SIZE) + size); - value.snd = (((((((uint64_t)type << MSCCLPP_BITS_BUFFER_HANDLE) + dst) << MSCCLPP_BITS_BUFFER_HANDLE) + src) << MSCCLPP_BITS_OFFSET) + dstOffset); + value.snd = ((((((((connectionId << MSCCLPP_BITS_TYPE) + (uint64_t)type) << MSCCLPP_BITS_BUFFER_HANDLE) + dst) << MSCCLPP_BITS_BUFFER_HANDLE) + src) << MSCCLPP_BITS_OFFSET) + dstOffset); } +#endif // __CUDACC__ }; /*************************************************************************************************************** @@ -137,7 +140,7 @@ struct DeviceConnection { __forceinline__ __device__ void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { - fifo.push(ChannelTrigger(channelTriggerData, dst, dstOffset, src, srcOffset, size).value); + fifo.push(ChannelTrigger(channelTriggerData, dst, dstOffset, src, srcOffset, size, connectionId).value); } __forceinline__ __device__ void put(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) @@ -148,13 +151,13 @@ struct DeviceConnection { __forceinline__ __device__ void signal() { epochIncrement(); - fifo.push(ChannelTrigger(channelTriggerFlag, 0, 0, 0, 0, 1).value); + fifo.push(ChannelTrigger(channelTriggerFlag, 0, 0, 0, 0, 1, connectionId).value); } __forceinline__ __device__ void putWithSignal(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { epochIncrement(); - fifo.push(ChannelTrigger(channelTriggerData | channelTriggerFlag, dst, dstOffset, src, srcOffset, size).value); + fifo.push(ChannelTrigger(channelTriggerData | channelTriggerFlag, dst, dstOffset, src, srcOffset, size, connectionId).value); } __forceinline__ __device__ void putWithSignal(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) @@ -165,24 +168,24 @@ struct DeviceConnection { __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, uint64_t dstOffset, 
BufferHandle src, uint64_t srcOffset, uint64_t size) { epochIncrement(); - uint64_t curFifoHead = fifo.push(channelTriggerData | channelTriggerFlag | channelTriggerSync, dstOffset, srcOffset, size); - while (*(volatile uint64_t*)&fifo.triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && - *(volatile uint64_t*)fifo.triggerFifoTail <= curFifoHead) + uint64_t curFifoHead = fifo.push(ChannelTrigger(channelTriggerData | channelTriggerFlag | channelTriggerSync, dst, dstOffset, src, srcOffset, size, connectionId).value); + while (*(volatile uint64_t*)&fifo.triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && + *(volatile uint64_t*)fifo.tailReplica <= curFifoHead) ; } __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) { - putWithSignalAndFlush(offset, offset, size); + putWithSignalAndFlush(dst, offset, src, offset, size); } __forceinline__ __device__ void flush() { - uint64_t curFifoHead = fifo.push(mscclppSync, 0, 0, 1); + uint64_t curFifoHead = fifo.push(ChannelTrigger(mscclppSync, 0, 0, 0, 0, 1, connectionId).value); // we need to wait for two conditions to be met to ensure the CPU is done flushing. (1) wait for the tail // to go pass by curFifoHead (this is safety net) and (2) wait for the work element value to change to 0. 
- while (*(volatile uint64_t*)&fifo.triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && - *(volatile uint64_t*)fifo.triggerFifoTail <= curFifoHead) + while (*(volatile uint64_t*)&fifo.triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && + *(volatile uint64_t*)fifo.tailReplica <= curFifoHead) ; } @@ -200,8 +203,7 @@ struct DeviceConnection { #endif // __CUDACC__ - int remoteRank; - int tag; + int connectionId; SignalEpochId* localSignalEpochId; // used by the signal() function directly from gpu @@ -214,11 +216,80 @@ struct DeviceConnection { // this is a concurrent fifo which is multiple threads from the device // can produce for and the sole proxy thread consumes it. - ProxyFifo fifo; + DeviceProxyFifo fifo; +}; + +struct SimpleDeviceConnection { + SimpleDeviceConnection() {} + SimpleDeviceConnection(DeviceConnection devConn, BufferHandle dst, BufferHandle src) : devConn(devConn), dst(dst), src(src) {} + SimpleDeviceConnection(const SimpleDeviceConnection& other) = default; + SimpleDeviceConnection& operator=(SimpleDeviceConnection& other) = default; + +#ifdef __CUDACC__ + + __forceinline__ __device__ void put(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) + { + devConn.put(dst, dstOffset, src, srcOffset, size); + } + + __forceinline__ __device__ void put(uint64_t offset, uint64_t size) + { + put(offset, offset, size); + } + + __forceinline__ __device__ void signal() + { + devConn.signal(); + } + + __forceinline__ __device__ void putWithSignal(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) + { + devConn.putWithSignal(dst, dstOffset, src, srcOffset, size); + } + + __forceinline__ __device__ void putWithSignal(uint64_t offset, uint64_t size) + { + putWithSignal(offset, offset, size); + } + + __forceinline__ __device__ void putWithSignalAndFlush(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) + { + devConn.putWithSignalAndFlush(dst, dstOffset, src, srcOffset, size); + } + + __forceinline__ __device__ void putWithSignalAndFlush(uint64_t 
offset, uint64_t size) + { + putWithSignalAndFlush(offset, offset, size); + } + + __forceinline__ __device__ void flush() + { + devConn.flush(); + } + + __forceinline__ __device__ void wait() + { + devConn.wait(); + } + + __forceinline__ __device__ void epochIncrement() + { + devConn.epochIncrement(); + } + +#endif // __CUDACC__ + + DeviceConnection devConn; + BufferHandle dst; + BufferHandle src; }; class HostConnection { + struct Impl; public: + /* HostConnection can not be constructed from user code and must instead be created through Communicator::connect */ + HostConnection(std::unique_ptr); + /* Register a region of GPU memory for use with this connection. Must be called before connectionSetup() * in the communicator. * @@ -249,23 +320,21 @@ public: * trigger operations on this HostConnection corresponding to put/signal/etc. calls made to the * DeviceConnection. * - * Inputs: - * startProxyThread: whether to start the proxy thread (default is true) - * * Returns: the newly created DeviceConnection */ - DeviceConnection toDevice(bool startProxyThread = true); + DeviceConnection toDevice(); void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size); - void put(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size); + void signal(); + void flush(); + void wait(); - void epochIncrement(); private: - struct Impl; std::unique_ptr pimpl; + friend class Communicator; }; #define MSCCLPP_UNIQUE_ID_BYTES 128 @@ -290,6 +359,9 @@ enum class TransportType : uint8_t { class Communicator { public: + Communicator(); + ~Communicator(); + /* Initialize the communicator. nranks processes with rank 0 to nranks-1 need to call this function. * * Inputs: @@ -341,6 +413,12 @@ public: */ void connectionSetup(); + /* Launch proxy thread(s). This function is supposed to be called before starting a kernel that uses DeviceConnection. */ + void startProxying(); + + /* Stop proxy thread(s). 
*/ + void stopProxying(); + /* Return the rank of the calling process. * * Outputs: @@ -355,6 +433,33 @@ public: */ int size(); + struct Impl; +private: + std::unique_ptr pimpl; + friend class HostConnection; +}; + +enum class ProxyHandlerResult { + Continue, + FlushAndContinue, + Stop, +}; + +class Proxy; +using ProxyHandler = std::function; + +class Proxy { +public: + Proxy(ProxyHandler handler); + + ~Proxy(); + + void start(); + + void stop(); + + HostProxyFifo& fifo(); + private: struct Impl; std::unique_ptr pimpl; diff --git a/src/include/mscclppfifo.hpp b/src/include/mscclppfifo.hpp index 27abd4c5..b5f8ba4c 100644 --- a/src/include/mscclppfifo.hpp +++ b/src/include/mscclppfifo.hpp @@ -3,6 +3,7 @@ #include #include +#include namespace mscclpp { @@ -13,39 +14,56 @@ struct alignas(16) ProxyTrigger { /* This is a concurrent fifo where multiple device threads can push mscclppTrigger work elements to * and a single host proxy thread consumes these work elements. There is a head pointer allocated on device * which starts with 0 and goes to 2^64-1 which is almost infinity. There are two copies of tail, one - * that is on the deivce (triggerFifoTail) and another that is on host (proxyState->fifoTailHost). + * that is on the device (tailReplica) and another that is on host (proxyState->fifoTailHost). * The host always has the "true" tail and occasionally, pushes it to the copy on the device. * Therefore, most of the time, the device has a stale version. The invariants are: - * triggerFifoTail <= proxyState->fifoTailHost <= triggerFifoHead. - * push() function increments triggerFifoHead, proxyState->fifoTailHost is updated in proxy.cc:mscclppProxyService - * and it occasionally flushes it to triggerFifoTail via a cudaMemcpyAsync. + * tailReplica <= proxyState->fifoTailHost <= head. + * push() function increments head, proxyState->fifoTailHost is updated in proxy.cc:mscclppProxyService + * and it occasionally flushes it to tailReplica via a cudaMemcpyAsync.
* * Why duplicating the tail is a good idea? The fifo is large engouh and we do not need frequent updates * for the tail as there is usually enough space for device threads to push their work into. */ -struct ProxyFifo { +struct DeviceProxyFifo { #ifdef __CUDACC__ - __forceinline__ __device__ uint64_t push(ProxyTrigger element) + __forceinline__ __device__ uint64_t push(ProxyTrigger trigger) { - uint64_t curFifoHead = atomicAdd((unsigned long long int*)this->triggerFifoHead, 1); - while (curFifoHead >= MSCCLPP_PROXY_FIFO_SIZE + *((volatile uint64_t*)this->triggerFifoTail)) + uint64_t curFifoHead = atomicAdd((unsigned long long int*)this->head, 1); + while (curFifoHead >= MSCCLPP_PROXY_FIFO_SIZE + *((volatile uint64_t*)this->tailReplica)) ; - while (*(volatile uint64_t*)&this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0) + while (*(volatile uint64_t*)&this->triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0) ; - uint64_t* valptr = (uint64_t*)&(this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE].value); - asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(valptr), - "l"(element.value[0]), "l"(element.value[1])); + ProxyTrigger* triggerPtr = (ProxyTrigger*)&(this->triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE]); + asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), + "l"(trigger.fst), "l"(trigger.snd)); return curFifoHead; } #endif // __CUDACC__ - void startProxyThread(std::function handler); - void stopProxyThread(); - - ProxyTrigger* triggerFifo; // Allocate on host via cudaHostAlloc. This space is used for pushing the workelements - uint64_t* triggerFifoTail; // Allocated on device. proxyState->fifoTailHost is the true tail on host and pused + ProxyTrigger* triggers; // Allocate on host via cudaHostAlloc. This space is used for pushing the workelements + uint64_t* tailReplica; // Allocated on device. 
proxyState->fifoTailHost is the true tail on host and pushed // occasionally to device - uint64_t* triggerFifoHead; // Allocated on device. Only accessed by device + uint64_t* head; // Allocated on device. Only accessed by device +}; + +class HostProxyFifo +{ +public: + HostProxyFifo(); + + ~HostProxyFifo(); + + void poll(ProxyTrigger *trigger); + + void pop(); + + void flushTail(bool sync = false); + + DeviceProxyFifo toDevice(); + +private: + struct Impl; + std::unique_ptr pimpl; }; } // namespace mscclpp diff --git a/src/init.cc b/src/init.cc index 7c3b76b9..7cf159c8 100644 --- a/src/init.cc +++ b/src/init.cc @@ -629,7 +629,7 @@ struct connInfo h.numBufferInfos = bufferInfos.size(); MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, &h, sizeof(header))); MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, bufferInfos.data(), bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); - return mscclppSuccess; + return mscclppSuccess; } mscclppResult_t recvOverBootstrap(void* bootstrap, int remoteRank, int tag) { @@ -638,7 +638,7 @@ struct connInfo infoQp = h.infoQp; bufferInfos.resize(h.numBufferInfos); MSCCLPPCHECK(bootstrapRecv(bootstrap, remoteRank, tag, bufferInfos.data(), bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); - return mscclppSuccess; + return mscclppSuccess; } }; diff --git a/src/proxy_cpp.cc b/src/proxy_cpp.cc new file mode 100644 index 00000000..9360d560 --- /dev/null +++ b/src/proxy_cpp.cc @@ -0,0 +1,93 @@ +#include "mscclpp.hpp" +#include "utils.h" +#include "api.h" +#include +#include + +namespace mscclpp { + +const int ProxyStopCheckPeriod = 1000; + +const int ProxyFlushPeriod = 4; + +struct Proxy::Impl { + ProxyHandler handler; + HostProxyFifo fifo; + std::thread service; + std::atomic_bool running; + + Impl(ProxyHandler handler) : handler(handler), running(false) {} +}; + +MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler) { + pimpl = std::make_unique(handler); +} + +MSCCLPP_API_CPP Proxy::~Proxy() { + if (pimpl) { +
stop(); + } +} + +MSCCLPP_API_CPP void Proxy::start() { + pimpl->running = true; + pimpl->service = std::thread([this] { + // from this point on, proxy thread will stay close to the device + // PROXYMSCCLPPCHECK(numaBind(pimpl->comm->devNumaNode)); // TODO: reenable this + + ProxyHandler handler = this->pimpl->handler; + HostProxyFifo& fifo = this->pimpl->fifo; + std::atomic_bool& running = this->pimpl->running; + ProxyTrigger trigger; + + int runCnt = ProxyStopCheckPeriod; + uint64_t flushCnt = 0; + for (;;) { + if (runCnt-- == 0) { + runCnt = ProxyStopCheckPeriod; + if (!running) { + break; + } + } + // Poll to see if we are ready to send anything + fifo.poll(&trigger); + if (trigger.fst == 0) { // TODO: this check is a potential pitfall for custom triggers + continue; // there is one in progress + } + + ProxyHandlerResult result = handler(trigger); + + // Send completion: reset only the high 64 bits + fifo.pop(); + // Flush the tail to device memory. This is either triggered every ProxyFlushPeriod to make sure + // that the fifo can make progress even if there is no request mscclppSync. However, mscclppSync type is for flush + // request. + if ((++flushCnt % ProxyFlushPeriod) == 0 || result == ProxyHandlerResult::FlushAndContinue) { + // TODO: relocate this check: || (trigger.fields.type & mscclppSync) + fifo.flushTail(); + } + } + + // make sure the tail is flushed before we shut the proxy + fifo.flushTail(/*sync=*/true); + // TODO: do these need to run? 
+ // bool isP2pProxy = (proxyState->ibContext == nullptr); + // if (isP2pProxy) { + // cudaStream_t p2pStream = proxyState->p2pStream; + // PROXYCUDACHECK(cudaStreamSynchronize(p2pStream)); + // } + }); +} + +MSCCLPP_API_CPP void Proxy::stop() { + pimpl->running = false; + if (pimpl->service.joinable()) { + pimpl->service.join(); + } +} + +MSCCLPP_API_CPP HostProxyFifo& Proxy::fifo() { + return pimpl->fifo; +} + +} // namespace mscclpp \ No newline at end of file diff --git a/tests/allgather_test_cpp.cu b/tests/allgather_test_cpp.cu index ca30945f..ddecadbf 100644 --- a/tests/allgather_test_cpp.cu +++ b/tests/allgather_test_cpp.cu @@ -10,6 +10,7 @@ #include #include #include +#include static int nranksPerNode = 8; @@ -46,9 +47,9 @@ static double getTime(void) return (tspec.tv_nsec / 1.0e9) + tspec.tv_sec; } -__constant__ mscclpp::DeviceConnection constDevConns[16]; +__constant__ mscclpp::SimpleDeviceConnection constDevConns[16]; -__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU) +__device__ void allgather0(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU) { // this allgather is really simple and implemented as an alltoall @@ -67,7 +68,7 @@ __device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, i devConn.wait(); } -__device__ void localAllGather(mscclppDevConn_t devConn, int rank, int world_size, int nranksPerNode, int remoteRank, +__device__ void localAllGather(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, int remoteRank, uint64_t offset, uint64_t size) { // this allgather algorithm works as follows: @@ -91,14 +92,14 @@ __device__ void localAllGather(mscclppDevConn_t devConn, int rank, int world_siz } } -__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int nranksPerNode, int remoteRank, +__device__ void allgather1(mscclpp::SimpleDeviceConnection devConn, 
int rank, int world_size, int nranksPerNode, int remoteRank, size_t nelemsPerGPU) { localAllGather(devConn, rank, world_size, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); } -__device__ void allgather2(mscclppDevConn_t devConn, int rank, int world_size, int nranksPerNode, int remoteRank, +__device__ void allgather2(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, int remoteRank, size_t nelemsPerGPU) { // this allgather is a pipelined and hierarchical one and only works for two nodes @@ -167,7 +168,7 @@ __global__ void kernel(int rank, int world_size, int nranksPerNode, size_t nelem int warpId = threadIdx.x / 32; int remoteRank = (warpId < rank) ? warpId : warpId + 1; // Each warp is responsible for one of the remote ranks - mscclppDevConn_t devConn = constDevConns[warpId]; + mscclpp::SimpleDeviceConnection devConn = constDevConns[warpId]; if (kernel == 0) allgather0(devConn, rank, world_size, remoteRank, nelemsPerGPU); @@ -219,7 +220,7 @@ void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& co int thisNode = rankToNode(rank); int cudaNum = rankToLocalRank(rank); std::string ibDevStr = "mlx5_ib" + std::to_string(cudaNum); - std::vector devConns(world_size); + std::vector, mscclpp::BufferHandle>> hostConns; for (int r = 0; r < world_size; ++r) { if (r == rank) @@ -234,13 +235,22 @@ void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& co } // Connect with all other ranks auto hostConn = comm.connect(r, 0, transportType, ibDev); - hostConn->registerBuffer(data_d, dataSize); - devConns.push_back(hostConn->toDevice(false)); + auto localBuffer = hostConn->registerBuffer(data_d, dataSize); + hostConns.emplace_back(hostConn, localBuffer); } comm.connectionSetup(); - assert(devConns.size() < sizeof(constDevConns) / sizeof(mscclpp::DeviceConnection)); - CUDACHECK(cudaMemcpyToSymbol(constDevConns, devConns.data(), sizeof(mscclpp::DeviceConnection) 
* devConns.size() )); + + std::vector devConns; + for (auto& entry : hostConns) { + assert(entry.first); + assert(entry.first->numRemoteBuffers() == 1); + auto remoteBuffer = entry.first->getRemoteBuffer(0); + devConns.emplace_back(entry.first->toDevice(), entry.second, remoteBuffer); + } + + assert(devConns.size() < sizeof(constDevConns) / sizeof(mscclpp::SimpleDeviceConnection)); + CUDACHECK(cudaMemcpyToSymbol(constDevConns, devConns.data(), sizeof(mscclpp::SimpleDeviceConnection) * devConns.size() )); } void printUsage(const char* prog, bool isMpi) @@ -406,97 +416,93 @@ int main(int argc, const char* argv[]) printf("Setting up the connection in MSCCL++\n"); setupMscclppConnections(rank, world_size, comm, data_d, dataSize); + if (rank == 0) + printf("Launching MSCCL++ proxy threads\n"); + comm.startProxying(); + + if (rank == 0) + printf("Testing the correctness of AllGather implementation\n"); + cudaStream_t stream; + CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CUDACHECK(cudaDeviceSynchronize()); + kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum); + CUDACHECK(cudaDeviceSynchronize()); + CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost)); + + for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { + int val = i + 1; + if (data_h[i] != val) { + printf("oh uh! 
data_h[%ld] (%d) != val (%d)\n", i, data_h[i], val); + break; + } + } + int tmp[16]; + // A simple barrier + comm.bootstrapAllGather(tmp, sizeof(int)); + if (rank == 0) + printf("Successfully checked the correctness\n"); + + // Perf test + int iterwithoutcudagraph = 10; + if (rank == 0) + printf("Running %d iterations of the kernel without CUDA graph\n", iterwithoutcudagraph); + CUDACHECK(cudaStreamSynchronize(stream)); + comm.bootstrapAllGather(tmp, sizeof(int)); + for (int i = 0; i < iterwithoutcudagraph; ++i) { + kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum); + } + CUDACHECK(cudaStreamSynchronize(stream)); + comm.bootstrapAllGather(tmp, sizeof(int)); + + // cudaGraph Capture + int cudagraphiter = 10; + if (rank == 0) + printf("Capturing %d iterations of the kernel in a CUDA graph\n", cudagraphiter); + cudaGraph_t graph; + cudaGraphExec_t instance; + cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); + for (int i = 0; i < cudagraphiter; ++i) { + kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum); + } + cudaStreamEndCapture(stream, &graph); + cudaGraphInstantiate(&instance, graph, NULL, NULL, 0); + + int cudagraphwarmup = 10; + if (rank == 0) + printf("Warming up %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphwarmup, + cudagraphiter); + for (int i = 0; i < cudagraphwarmup; ++i) { + cudaGraphLaunch(instance, stream); + } + CUDACHECK(cudaStreamSynchronize(stream)); + + // measure runtime + int cudagraphlaunch = 10; + if (rank == 0) + printf("Running %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphlaunch, + cudagraphiter); + comm.bootstrapAllGather(tmp, sizeof(int)); + double t0, t1, ms, time_in_us; + t0 = getTime(); + for (int i = 0; i < cudagraphlaunch; ++i) { + cudaGraphLaunch(instance, stream); + } + CUDACHECK(cudaStreamSynchronize(stream)); + + t1 = getTime(); + ms = (t1 - t0) * 
1000.0; + time_in_us = ms * 1000. / (float)cudagraphlaunch / (float)cudagraphiter; + printf("Rank %d report: size %lu time: %f us/iter algBW %f GBps\n", rank, dataSize, time_in_us, + (double)(dataSize) / 1e9 / (time_in_us / 1e6)); + comm.bootstrapAllGather(tmp, sizeof(int)); + + if (rank == 0) + printf("Stopping MSCCL++ proxy threads\n"); + comm.stopProxying(); + } catch (std::exception& e) { // todo: throw exceptions in the implementation and process them here } - - if (rank == 0) - printf("Launching MSCCL++ proxy threads\n"); - MSCCLPPCHECK(mscclppProxyLaunch(comm)); - - if (rank == 0) - printf("Testing the correctness of AllGather implementation\n"); - cudaStream_t stream; - CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - CUDACHECK(cudaDeviceSynchronize()); - kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum); - CUDACHECK(cudaDeviceSynchronize()); - CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost)); - - for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { - int val = i + 1; - if (data_h[i] != val) { - printf("oh uh! 
data_h[%ld] (%d) != val (%d)\n", i, data_h[i], val); - break; - } - } - int tmp[16]; - // A simple barrier - MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int))); - if (rank == 0) - printf("Successfully checked the correctness\n"); - - // Perf test - int iterwithoutcudagraph = 10; - if (rank == 0) - printf("Running %d iterations of the kernel without CUDA graph\n", iterwithoutcudagraph); - CUDACHECK(cudaStreamSynchronize(stream)); - MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int))); - for (int i = 0; i < iterwithoutcudagraph; ++i) { - kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum); - } - CUDACHECK(cudaStreamSynchronize(stream)); - MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int))); - - // cudaGraph Capture - int cudagraphiter = 10; - if (rank == 0) - printf("Capturing %d iterations of the kernel in a CUDA graph\n", cudagraphiter); - cudaGraph_t graph; - cudaGraphExec_t instance; - cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); - for (int i = 0; i < cudagraphiter; ++i) { - kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum); - } - cudaStreamEndCapture(stream, &graph); - cudaGraphInstantiate(&instance, graph, NULL, NULL, 0); - - int cudagraphwarmup = 10; - if (rank == 0) - printf("Warming up %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphwarmup, - cudagraphiter); - for (int i = 0; i < cudagraphwarmup; ++i) { - cudaGraphLaunch(instance, stream); - } - CUDACHECK(cudaStreamSynchronize(stream)); - - // measure runtime - int cudagraphlaunch = 10; - if (rank == 0) - printf("Running %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphlaunch, - cudagraphiter); - MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int))); - double t0, t1, ms, time_in_us; - t0 = getTime(); - for (int i = 0; i < cudagraphlaunch; ++i) { - cudaGraphLaunch(instance, stream); 
- } - CUDACHECK(cudaStreamSynchronize(stream)); - - t1 = getTime(); - ms = (t1 - t0) * 1000.0; - time_in_us = ms * 1000. / (float)cudagraphlaunch / (float)cudagraphiter; - printf("Rank %d report: size %lu time: %f us/iter algBW %f GBps\n", rank, dataSize, time_in_us, - (double)(dataSize) / 1e9 / (time_in_us / 1e6)); - MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int))); - - if (rank == 0) - printf("Stopping MSCCL++ proxy threads\n"); - MSCCLPPCHECK(mscclppProxyStop(comm)); - - if (rank == 0) - printf("Destroying MSCCL++ communicator\n"); - MSCCLPPCHECK(mscclppCommDestroy(comm)); printf("Rank %d succeeded!\n", rank); #ifdef MSCCLPP_USE_MPI_FOR_TESTS From 9fbb0debdd0951524775c48f2c05e2707e54c341 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 19 Apr 2023 22:02:23 +0000 Subject: [PATCH 033/135] C++ API changes --- src/basic_proxy_handler.cc | 2 +- src/communicator.cc | 37 +------ src/host_connection.cc | 35 +++++-- src/include/mscclpp.hpp | 194 ++++++++++++++++++++++-------------- src/proxy_cpp.cc | 6 +- tests/allgather_test_cpp.cu | 22 ++-- 6 files changed, 160 insertions(+), 136 deletions(-) diff --git a/src/basic_proxy_handler.cc b/src/basic_proxy_handler.cc index 736c44bd..482aa842 100644 --- a/src/basic_proxy_handler.cc +++ b/src/basic_proxy_handler.cc @@ -19,7 +19,7 @@ ProxyHandler makeBasicProxyHandler(Communicator::Impl &comm) { if (trigger->fields.type & mscclppSync) { conn.flush(); - result = ProxyHandlerResult::FlushAndContinue; + result = ProxyHandlerResult::FlushFifoTailAndContinue; } return result; diff --git a/src/communicator.cc b/src/communicator.cc index cade59a3..5a843c78 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -14,7 +14,6 @@ Communicator::Impl::~Impl() { } } -MSCCLPP_API_CPP Communicator::Communicator() = default; MSCCLPP_API_CPP Communicator::~Communicator() = default; mscclppTransport_t transportTypeToCStyle(TransportType type) { @@ -28,43 +27,26 @@ mscclppTransport_t transportTypeToCStyle(TransportType 
type) { } } -MSCCLPP_API_CPP void Communicator::initRank(int nranks, const char* ipPortPair, int rank) { - if (pimpl) { - throw std::runtime_error("Communicator already initialized"); - } - pimpl = std::make_unique(); +MSCCLPP_API_CPP Communicator::Communicator(int nranks, const char* ipPortPair, int rank) : pimpl(std::make_unique()) { mscclppCommInitRank(&pimpl->comm, nranks, ipPortPair, rank); } -MSCCLPP_API_CPP void Communicator::initRankFromId(int nranks, UniqueId id, int rank) { - if (pimpl) { - throw std::runtime_error("Communicator already initialized"); - } - pimpl = std::make_unique(); +MSCCLPP_API_CPP Communicator::Communicator(int nranks, UniqueId id, int rank) : pimpl(std::make_unique()) { static_assert(sizeof(mscclppUniqueId) == sizeof(UniqueId), "UniqueId size mismatch"); mscclppUniqueId *cstyle_id = reinterpret_cast(&id); mscclppCommInitRankFromId(&pimpl->comm, nranks, *cstyle_id, rank); } MSCCLPP_API_CPP void Communicator::bootstrapAllGather(void* data, int size) { - if (!pimpl) { - throw std::runtime_error("Communicator not initialized"); - } mscclppBootstrapAllGather(pimpl->comm, data, size); } MSCCLPP_API_CPP void Communicator::bootstrapBarrier() { - if (!pimpl) { - throw std::runtime_error("Communicator not initialized"); - } mscclppBootstrapBarrier(pimpl->comm); } MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, TransportType transportType, const char* ibDev) { - if (!pimpl) { - throw std::runtime_error("Communicator not initialized"); - } mscclppConnectWithoutBuffer(pimpl->comm, remoteRank, tag, transportTypeToCStyle(transportType), ibDev); auto connIdx = pimpl->connections.size(); auto conn = std::make_shared(std::make_unique(this, &pimpl->comm->conns[connIdx])); @@ -73,39 +55,24 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remote } MSCCLPP_API_CPP void Communicator::connectionSetup() { - if (!pimpl) { - throw std::runtime_error("Communicator not initialized"); - } 
mscclppConnectionSetup(pimpl->comm); } MSCCLPP_API_CPP void Communicator::startProxying() { - if (!pimpl) { - throw std::runtime_error("Communicator not initialized"); - } pimpl->proxy.start(); } MSCCLPP_API_CPP void Communicator::stopProxying() { - if (!pimpl) { - throw std::runtime_error("Communicator not initialized"); - } pimpl->proxy.stop(); } MSCCLPP_API_CPP int Communicator::rank() { - if (!pimpl) { - throw std::runtime_error("Communicator not initialized"); - } int result; mscclppCommRank(pimpl->comm, &result); return result; } MSCCLPP_API_CPP int Communicator::size() { - if (!pimpl) { - throw std::runtime_error("Communicator not initialized"); - } int result; mscclppCommSize(pimpl->comm, &result); return result; diff --git a/src/host_connection.cc b/src/host_connection.cc index cba9f81d..72e11ffc 100644 --- a/src/host_connection.cc +++ b/src/host_connection.cc @@ -15,8 +15,14 @@ HostConnection::Impl::~Impl() { // TODO: figure out memory ownership. Does this deallocate the mscclppHostConn? Likely not. 
} +MSCCLPP_API_CPP HostConnection::~HostConnection() = default; + MSCCLPP_API_CPP HostConnection::HostConnection(std::unique_ptr p) : pimpl(std::move(p)) {} +MSCCLPP_API_CPP int HostConnection::getId() { + return pimpl->conn->connId; +} + MSCCLPP_API_CPP BufferHandle HostConnection::registerBuffer(void* data, uint64_t size) { BufferHandle result; static_assert(sizeof(BufferHandle) == sizeof(mscclppBufferHandle_t)); @@ -24,10 +30,15 @@ MSCCLPP_API_CPP BufferHandle HostConnection::registerBuffer(void* data, uint64_t return result; } +MSCCLPP_API_CPP int HostConnection::numLocalBuffers() { + return pimpl->conn->bufferRegistrations.size() - 1; +} + +MSCCLPP_API_CPP BufferHandle HostConnection::getLocalBuffer(int index) { + return index + 1; +} + MSCCLPP_API_CPP int HostConnection::numRemoteBuffers() { - if (!pimpl->conn) { - throw std::runtime_error("HostConnection not initialized"); - } return pimpl->conn->remoteBufferRegistrations.size() - 1; } @@ -35,16 +46,18 @@ MSCCLPP_API_CPP BufferHandle HostConnection::getRemoteBuffer(int index) { return index + 1; } -MSCCLPP_API_CPP DeviceConnection HostConnection::toDevice() { - DeviceConnection devConn; +MSCCLPP_API_CPP ConnectionEpoch HostConnection::getEpoch() { + ConnectionEpoch epoch; static_assert(sizeof(SignalEpochId) == sizeof(mscclppDevConnSignalEpochId)); - devConn.connectionId = pimpl->conn->connId; - devConn.localSignalEpochId = reinterpret_cast(pimpl->conn->devConn->localSignalEpochId); - devConn.remoteSignalEpochId = reinterpret_cast(pimpl->conn->devConn->remoteSignalEpochId); - devConn.waitEpochId = pimpl->conn->devConn->waitEpochId; - devConn.fifo = pimpl->comm->pimpl->proxy.fifo().toDevice(); + epoch.localSignalEpochId = reinterpret_cast(pimpl->conn->devConn->localSignalEpochId); + epoch.remoteSignalEpochId = reinterpret_cast(pimpl->conn->devConn->remoteSignalEpochId); + epoch.waitEpochId = pimpl->conn->devConn->waitEpochId; + return epoch; +} - return devConn; + +MSCCLPP_API_CPP DeviceProxyFifo 
HostConnection::getDeviceFifo() { + return pimpl->comm->pimpl->proxy.fifo().toDevice(); } MSCCLPP_API_CPP void HostConnection::put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index 4a21bae7..e41e94b8 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -72,6 +72,99 @@ union ChannelTrigger { #endif // __CUDACC__ }; +struct ConnectionEpoch { +#ifdef __CUDACC__ + __forceinline__ __device__ void wait() + { + (*waitEpochId) += 1; + while (*(volatile uint64_t*)&(localSignalEpochId->proxy) < (*waitEpochId)) + ; + } + + __forceinline__ __device__ void epochIncrement() + { + *(volatile uint64_t*)&(localSignalEpochId->device) += 1; + } +#endif // __CUDACC__ + + SignalEpochId* localSignalEpochId; + // used by the signal() function directly from gpu + SignalEpochId* remoteSignalEpochId; + + // every wait(), increments this and then the gpu waits for either: + // 1) localSignalEpochId->proxy to be >= this in case of a proxy thread + // 2) remoteSignalEpochId->device to be >= this in case of a gpu thread + uint64_t* waitEpochId; +}; + +class HostConnection { + struct Impl; +public: + /* HostConnection can not be constructed from user code and must instead be created through Communicator::connect */ + HostConnection(std::unique_ptr); + + ~HostConnection(); + + int getId(); + + /* Register a region of GPU memory for use with this connection. Must be called before connectionSetup() + * in the communicator. + * + * Inputs: + * data: base pointer to the memory + * size: size of the memory region in bytes + * + * Returns: a handle to the buffer + */ + BufferHandle registerBuffer(void* data, uint64_t size); + + /* Get the number of times registerBuffer(...) was called. + * + * Returns: the number of buffers registered + */ + int numLocalBuffers(); + + /* Get the BufferHandle returned by a call to registerBuffer(...) 
as identified by the index + * + * Inputs: + * index: the index of the handle to get + * + * Returns: a handle to the buffer + */ + BufferHandle getLocalBuffer(int index); + + /* Get the number of times registerBuffer(...) was called on the remote peer. + * + * Returns: the number of buffers registered on the remote peer + */ + int numRemoteBuffers(); + + /* Get the BufferHandle returned by a call to registerBuffer(...) on the remote peer as identified by the index + * + * Inputs: + * index: the index of the handle to get + * + * Returns: a handle to the buffer on the remote peer + */ + BufferHandle getRemoteBuffer(int index); + + ConnectionEpoch getEpoch(); + + DeviceProxyFifo getDeviceFifo(); + + void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size); + + void signal(); + + void flush(); + + void wait(); + +private: + std::unique_ptr pimpl; + friend class Communicator; +}; + /*************************************************************************************************************** * A mscclppDevConn provides a zero-copy connection between two GPUs connected via P2P NVLink or InfiniBand. * The communication API is one-sided meaning that for every single data transfer, only one side @@ -135,9 +228,17 @@ union ChannelTrigger { * indices in the registered buffer. 
**************************************************************************************************************/ struct DeviceConnection { -#ifdef __CUDACC__ - // TODO: add buffer handles + DeviceConnection() = default; + DeviceConnection(HostConnection& hostConn) + : connectionId(hostConn.getId()), epoch(hostConn.getEpoch()), + fifo(hostConn.getDeviceFifo()) {} + + DeviceConnection(const DeviceConnection& other) = default; + + DeviceConnection& operator=(DeviceConnection& other) = default; + +#ifdef __CUDACC__ __forceinline__ __device__ void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { fifo.push(ChannelTrigger(channelTriggerData, dst, dstOffset, src, srcOffset, size, connectionId).value); @@ -191,28 +292,18 @@ struct DeviceConnection { __forceinline__ __device__ void wait() { - (*waitEpochId) += 1; - while (*(volatile uint64_t*)&(localSignalEpochId->proxy) < (*waitEpochId)) - ; + epoch.wait(); } __forceinline__ __device__ void epochIncrement() { - *(volatile uint64_t*)&(localSignalEpochId->device) += 1; + epoch.epochIncrement(); } - #endif // __CUDACC__ int connectionId; - SignalEpochId* localSignalEpochId; - // used by the signal() function directly from gpu - SignalEpochId* remoteSignalEpochId; - - // every wait(), increments this and then the gpu waits for either: - // 1) localSignalEpochId->proxy to be >= this in case of a proxy thread - // 2) remoteSignalEpochId->device to be >= this in case of a gpu thread - uint64_t* waitEpochId; + ConnectionEpoch epoch; // this is a concurrent fifo which is multiple threads from the device // can produce for and the sole proxy thread consumes it. 
@@ -220,9 +311,15 @@ struct DeviceConnection { }; struct SimpleDeviceConnection { - SimpleDeviceConnection() {} - SimpleDeviceConnection(DeviceConnection devConn, BufferHandle dst, BufferHandle src) : devConn(devConn), dst(dst), src(src) {} + SimpleDeviceConnection() = default; + + SimpleDeviceConnection(HostConnection& hostConn) : devConn(hostConn) { + dst = hostConn.getRemoteBuffer(0); + src = hostConn.getLocalBuffer(0); + } + SimpleDeviceConnection(const SimpleDeviceConnection& other) = default; + SimpleDeviceConnection& operator=(SimpleDeviceConnection& other) = default; #ifdef __CUDACC__ @@ -284,59 +381,6 @@ struct SimpleDeviceConnection { BufferHandle src; }; -class HostConnection { - struct Impl; -public: - /* HostConnection can not be constructed from user code and must instead be created through Communicator::connect */ - HostConnection(std::unique_ptr); - - /* Register a region of GPU memory for use with this connection. Must be called before connectionSetup() - * in the communicator. - * - * Inputs: - * data: base pointer to the memory - * size: size of the memory region in bytes - * - * Returns: a handle to the buffer - */ - BufferHandle registerBuffer(void* data, uint64_t size); - - /* Get the number of times registerBuffer(...) was called on the remote peer. - * - * Returns: the number of buffers registered on the remote peer - */ - int numRemoteBuffers(); - - /* Get the BufferHandle returned by a call to registerBuffer(...) on the remote peer as identified by the index - * - * Inputs: - * index: the index of the handle to get - * - * Returns: a handle to the buffer on the remote peer - */ - BufferHandle getRemoteBuffer(int index); - - /* Create a DeviceConnection paired with this HostConnection. A background proxy thread will - * trigger operations on this HostConnection corresponding to put/signal/etc. calls made to the - * DeviceConnection. 
- * - * Returns: the newly created DeviceConnection - */ - DeviceConnection toDevice(); - - void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size); - - void signal(); - - void flush(); - - void wait(); - -private: - std::unique_ptr pimpl; - friend class Communicator; -}; - #define MSCCLPP_UNIQUE_ID_BYTES 128 struct UniqueId { char internal[MSCCLPP_UNIQUE_ID_BYTES]; @@ -359,8 +403,6 @@ enum class TransportType : uint8_t { class Communicator { public: - Communicator(); - ~Communicator(); /* Initialize the communicator. nranks processes with rank 0 to nranks-1 need to call this function. * @@ -369,7 +411,7 @@ public: * ipPortPair: a string of the form "ip:port" that represents the address of the root process * rank: rank of the calling process */ - void initRank(int nranks, const char* ipPortPair, int rank); + Communicator(int nranks, const char* ipPortPair, int rank); /* Initialize the communicator from a given UniqueId. Same as mscclppCommInitRank() except that * id is provided by the user by calling getUniqueId() @@ -379,7 +421,9 @@ public: * id: the unique ID to be used for communication * rank: rank of the calling process */ - void initRankFromId(int nranks, UniqueId id, int rank); + Communicator(int nranks, UniqueId id, int rank); + + ~Communicator(); /* Ring-based AllGather through the bootstrap socket. * @@ -441,7 +485,7 @@ private: enum class ProxyHandlerResult { Continue, - FlushAndContinue, + FlushFifoTailAndContinue, Stop, }; diff --git a/src/proxy_cpp.cc b/src/proxy_cpp.cc index 9360d560..2d1cf098 100644 --- a/src/proxy_cpp.cc +++ b/src/proxy_cpp.cc @@ -62,10 +62,14 @@ MSCCLPP_API_CPP void Proxy::start() { // Flush the tail to device memory. This is either triggered every ProxyFlushPeriod to make sure // that the fifo can make progress even if there is no request mscclppSync. However, mscclppSync type is for flush // request. 
- if ((++flushCnt % ProxyFlushPeriod) == 0 || result == ProxyHandlerResult::FlushAndContinue) { + if ((++flushCnt % ProxyFlushPeriod) == 0 || result == ProxyHandlerResult::FlushFifoTailAndContinue) { // TODO: relocate this check: || (trigger.fields.type & mscclppSync) fifo.flushTail(); } + + if (result == ProxyHandlerResult::Stop) { + break; + } } // make sure the tail is flushed before we shut the proxy diff --git a/tests/allgather_test_cpp.cu b/tests/allgather_test_cpp.cu index ddecadbf..9b056e84 100644 --- a/tests/allgather_test_cpp.cu +++ b/tests/allgather_test_cpp.cu @@ -11,6 +11,7 @@ #include #include #include +#include static int nranksPerNode = 8; @@ -220,7 +221,7 @@ void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& co int thisNode = rankToNode(rank); int cudaNum = rankToLocalRank(rank); std::string ibDevStr = "mlx5_ib" + std::to_string(cudaNum); - std::vector, mscclpp::BufferHandle>> hostConns; + std::vector> hostConns; for (int r = 0; r < world_size; ++r) { if (r == rank) @@ -235,19 +236,17 @@ void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& co } // Connect with all other ranks auto hostConn = comm.connect(r, 0, transportType, ibDev); - auto localBuffer = hostConn->registerBuffer(data_d, dataSize); - hostConns.emplace_back(hostConn, localBuffer); + hostConn->registerBuffer(data_d, dataSize); + hostConns.push_back(hostConn); } comm.connectionSetup(); std::vector devConns; - for (auto& entry : hostConns) { - assert(entry.first); - assert(entry.first->numRemoteBuffers() == 1); - auto remoteBuffer = entry.first->getRemoteBuffer(0); - devConns.emplace_back(entry.first->toDevice(), entry.second, remoteBuffer); - } + std::transform(hostConns.begin(), hostConns.end(), std::back_inserter(devConns), + [](std::shared_ptr& hostConn) { + return mscclpp::SimpleDeviceConnection(*hostConn); + }); assert(devConns.size() < sizeof(constDevConns) / sizeof(mscclpp::SimpleDeviceConnection)); 
CUDACHECK(cudaMemcpyToSymbol(constDevConns, devConns.data(), sizeof(mscclpp::SimpleDeviceConnection) * devConns.size() )); @@ -401,12 +400,9 @@ int main(int argc, const char* argv[]) size_t nelemsPerGPU = dataSize / sizeof(int) / world_size; try{ - mscclpp::Communicator comm; - if (rank == 0) printf("Initializing MSCCL++\n"); - - comm.initRank(world_size, ip_port, rank); + mscclpp::Communicator comm(world_size, ip_port, rank); if (rank == 0) printf("Initializing data for allgather test\n"); From 9c8942f7ac543f2730f2cc2fdbde6562aca03259 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 19 Apr 2023 22:09:53 +0000 Subject: [PATCH 034/135] wip --- src/bootstrap/bootstrap.cc | 53 ++++++++++++++++++++++++++++++-------- src/include/bootstrap.h | 5 ++-- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 32d245d2..235628f7 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -44,8 +44,27 @@ struct mscclppBootstrap::impl{ pthread_mutex_unlock(&initLock); } - static char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1]; - static union mscclppSocketAddress bootstrapNetIfAddr; + void CreateRoot(){ + auto listenSock = std::make_shared(); + auto args = std::make_shared(); + pthread_t thread; + + MSCCLPPCHECK(mscclppSocketInit(listenSock, &handle->addr, handle->magic, mscclppSocketTypeBootstrap, NULL, 0)); + MSCCLPPCHECK(mscclppSocketListen(listenSock)); + MSCCLPPCHECK(mscclppSocketGetAddr(listenSock, &handle->addr)); + + MSCCLPPCHECK(mscclppCalloc(&args, 1)); + args->listenSock = listenSock; + args->magic = handle->magic; + NEQCHECK(pthread_create(&thread, NULL, bootstrapRoot, (void*)args), 0); + mscclppSetThreadName(thread, "MSCCLPP BootstrapR"); + NEQCHECK(pthread_detach(thread), 0); // will not be pthread_join()'d + return mscclppSuccess; + } + + char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1]; + union mscclppSocketAddress bootstrapNetIfAddr; + int rank, nranks; }; struct 
mscclppBootstrap::UniqueId{ @@ -53,8 +72,8 @@ struct mscclppBootstrap::UniqueId{ union mscclppSocketAddress addr; }; -static uint64_t hashUniqueId(mscclppBootstrapHandle const& id) { +static uint64_t hashUniqueId(mscclppBootstrapHandle const& id) char const* bytes = (char const*)&id; uint64_t h = 0xdeadbeef; for (int i = 0; i < (int)sizeof(mscclppBootstrapHandle); i++) { @@ -74,16 +93,28 @@ std::unique_ptr mscclppBootstrap::GetUniqueId(){ throw std::runtime_error("getting random data failed"); } memcpy(&handle.addr, &pimpl->bootstrapNetIfAddr, sizeof(union mscclppSocketAddress)); - // ret = bootstrapCreateRoot(handle); - - // mscclppResult_t res = bootstrapGetUniqueId(&handle); - // if (res != mscclppSuccess) { - // throw std::runtime_error("Bootstrap : failed to get unique ID"); - // } - // TRACE_CALL("mscclppGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(handle)); - // return *(mscclppUniqueId*)&handle; + return std::make_unique(handle); } +mscclppBootstrap::mscclppBootstrap(){ + pimpl = std::make_unique(); +} + +std::unique_ptr mscclppBootstrap::GetUniqueId(){ + pimpl->NetInit(); + + mscclppBootstrap::UniqueId handle; + auto ret = getRandomData(&handle.magic, sizeof(handle.magic)); + if (ret != mscclppSuccess) { + throw std::runtime_error("getting random data failed"); + } + memcpy(&handle.addr, &pimpl->bootstrapNetIfAddr, sizeof(union mscclppSocketAddress)); + return std::make_unique(handle); +} + +void mscclppBootstrap::Initliaze(std::string ipPortPair, int _rank, int _nranks) : rank(_rank), nranks(_nranks) { + pimpl->NetInit(ipPortPair); +} struct bootstrapRootArgs { diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index dbf72e6c..69d916aa 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -22,8 +22,9 @@ static_assert(sizeof(struct mscclppBootstrapHandle) <= sizeof(mscclppUniqueId), class mscclppBootstrap : Bootstrap { public: - mscclppBootstrap(std::string ip_port_pair, int rank, int nranks); - 
mscclppBootstrap(mscclppBootstrapHandle handle, int rank, int nranks); + mscclppBootstrap(); + void Initliaze(std::string ipPortPair, int rank, int nranks); + void Initliaze(mscclppBootstrapHandle handle, int rank, int nranks); void Send(void* data, int size, int peer, int tag); void Recv(void* data, int size, int peer, int tag); void AllGather(void* allData, int size); From 804692f28264ecede517a08cf33ba4946e8c22e6 Mon Sep 17 00:00:00 2001 From: Binyang2014 Date: Fri, 21 Apr 2023 13:59:42 +0800 Subject: [PATCH 035/135] Binyli/bootstrap (#60) Bootstrap refactor. --- src/bootstrap/bootstrap.cc | 1104 ++++++++++++++++-------------------- src/include/bootstrap.h | 27 +- src/include/comm.h | 2 +- src/include/mscclpp.h | 4 +- 4 files changed, 512 insertions(+), 625 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 235628f7..c2e503d8 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -1,80 +1,22 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - #include "bootstrap.h" #include "config.h" #include "mscclpp.h" #include "utils.h" + +#include +#include +#include +#include +#include + +#include #include #include -struct mscclppBootstrap::impl{ - void NetInit(std::string ipPortPair = ""){ - static bool initialized = false; - static pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; - if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) - return; - pthread_mutex_lock(&initLock); - if (!initialized) { - - if (ipPortPair != "") { - union mscclppSocketAddress remoteAddr; - if (mscclppSocketGetAddrFromString(&remoteAddr, ipPortPair.c_str()) != mscclppSuccess) { - throw std::runtime_error("Invalid ip:port, please use format: : or []: or :"); - } - if (mscclppFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, - 1) <= 0) { - throw std::runtime_error("NET/Socket : No usable listening interface found"); - } - } else { - int nIfs = mscclppFindInterfaces(this->bootstrapNetIfName, &this->bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); - if (nIfs <= 0) { - throw std::runtime_error("Bootstrap : no socket interface found"); - } - } - char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2]; - sprintf(line, " %s:", bootstrapNetIfName); - mscclppSocketToString(&bootstrapNetIfAddr, line + strlen(line)); - INFO(MSCCLPP_INIT, "Bootstrap : Using%s", line); - __atomic_store_n(&initialized, true, __ATOMIC_RELEASE); - } - pthread_mutex_unlock(&initLock); - } - - void CreateRoot(){ - auto listenSock = std::make_shared(); - auto args = std::make_shared(); - pthread_t thread; - - MSCCLPPCHECK(mscclppSocketInit(listenSock, &handle->addr, handle->magic, mscclppSocketTypeBootstrap, NULL, 0)); - MSCCLPPCHECK(mscclppSocketListen(listenSock)); - MSCCLPPCHECK(mscclppSocketGetAddr(listenSock, &handle->addr)); - - MSCCLPPCHECK(mscclppCalloc(&args, 1)); - args->listenSock = listenSock; - 
args->magic = handle->magic; - NEQCHECK(pthread_create(&thread, NULL, bootstrapRoot, (void*)args), 0); - mscclppSetThreadName(thread, "MSCCLPP BootstrapR"); - NEQCHECK(pthread_detach(thread), 0); // will not be pthread_join()'d - return mscclppSuccess; - } - - char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1]; - union mscclppSocketAddress bootstrapNetIfAddr; - int rank, nranks; -}; - -struct mscclppBootstrap::UniqueId{ - uint64_t magic; - union mscclppSocketAddress addr; -}; - +namespace { +uint64_t hashUniqueId(const mscclppBootstrapHandle& id) { -static uint64_t hashUniqueId(mscclppBootstrapHandle const& id) - char const* bytes = (char const*)&id; + const char* bytes = (const char*)&id; uint64_t h = 0xdeadbeef; for (int i = 0; i < (int)sizeof(mscclppBootstrapHandle); i++) { h ^= h >> 32; @@ -84,127 +26,7 @@ static uint64_t hashUniqueId(mscclppBootstrapHandle const& id) return h; } -std::unique_ptr mscclppBootstrap::GetUniqueId(){ - pimpl->NetInit(); - - mscclppBootstrap::UniqueId handle; - auto ret = getRandomData(&handle.magic, sizeof(handle.magic)); - if (ret != mscclppSuccess) { - throw std::runtime_error("getting random data failed"); - } - memcpy(&handle.addr, &pimpl->bootstrapNetIfAddr, sizeof(union mscclppSocketAddress)); - return std::make_unique(handle); -} - -mscclppBootstrap::mscclppBootstrap(){ - pimpl = std::make_unique(); -} - -std::unique_ptr mscclppBootstrap::GetUniqueId(){ - pimpl->NetInit(); - - mscclppBootstrap::UniqueId handle; - auto ret = getRandomData(&handle.magic, sizeof(handle.magic)); - if (ret != mscclppSuccess) { - throw std::runtime_error("getting random data failed"); - } - memcpy(&handle.addr, &pimpl->bootstrapNetIfAddr, sizeof(union mscclppSocketAddress)); - return std::make_unique(handle); -} - -void mscclppBootstrap::Initliaze(std::string ipPortPair, int _rank, int _nranks) : rank(_rank), nranks(_nranks) { - pimpl->NetInit(ipPortPair); -} - -struct bootstrapRootArgs -{ - struct mscclppSocket* listenSock; - uint64_t magic; -}; - -/* 
Init functions */ -static char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1]; -static union mscclppSocketAddress bootstrapNetIfAddr; -static int bootstrapNetInitDone = 0; -pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; - -mscclppResult_t bootstrapNetInit(const char* ip_port_pair) -{ - if (bootstrapNetInitDone == 0) { - pthread_mutex_lock(&bootstrapNetLock); - if (bootstrapNetInitDone == 0) { - const char* env; - if (ip_port_pair) { - env = ip_port_pair; - } else { - env = getenv("MSCCLPP_COMM_ID"); - } - if (env) { - union mscclppSocketAddress remoteAddr; - if (mscclppSocketGetAddrFromString(&remoteAddr, env) != mscclppSuccess) { - WARN("Invalid MSCCLPP_COMM_ID, please use format: : or []: or :"); - return mscclppInvalidArgument; - } - if (mscclppFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, - 1) <= 0) { - WARN("NET/Socket : No usable listening interface found"); - return mscclppSystemError; - } - } else { - int nIfs = mscclppFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); - if (nIfs <= 0) { - WARN("Bootstrap : no socket interface found"); - return mscclppInternalError; - } - } - char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2]; - sprintf(line, " %s:", bootstrapNetIfName); - mscclppSocketToString(&bootstrapNetIfAddr, line + strlen(line)); - INFO(MSCCLPP_INIT, "Bootstrap : Using%s", line); - bootstrapNetInitDone = 1; - } - pthread_mutex_unlock(&bootstrapNetLock); - } - return mscclppSuccess; -} - -/* Socket Interface Selection type */ -enum bootstrapInterface_t -{ - findSubnetIf = -1, - dontCareIf = -2 -}; - -// Additional sync functions -static mscclppResult_t bootstrapNetSend(struct mscclppSocket* sock, void* data, int size) -{ - MSCCLPPCHECK(mscclppSocketSend(sock, &size, sizeof(int))); - MSCCLPPCHECK(mscclppSocketSend(sock, data, size)); - return mscclppSuccess; -} -static mscclppResult_t bootstrapNetRecv(struct mscclppSocket* sock, void* data, int size) -{ - int 
recvSize; - MSCCLPPCHECK(mscclppSocketRecv(sock, &recvSize, sizeof(int))); - if (recvSize > size) { - WARN("Message truncated : received %d bytes instead of %d", recvSize, size); - return mscclppInternalError; - } - MSCCLPPCHECK(mscclppSocketRecv(sock, data, std::min(recvSize, size))); - return mscclppSuccess; -} - -struct extInfo -{ - int rank; - int nranks; - union mscclppSocketAddress extAddressListenRoot; - union mscclppSocketAddress extAddressListen; -}; - -#include - -static mscclppResult_t setFilesLimit() +mscclppResult_t setFilesLimit() { struct rlimit filesLimit; SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit"); @@ -213,459 +35,531 @@ static mscclppResult_t setFilesLimit() return mscclppSuccess; } -static void* bootstrapRoot(void* rargs) +} // namespace + +/* Socket Interface Selection type */ +enum bootstrapInterface_t { - struct bootstrapRootArgs* args = (struct bootstrapRootArgs*)rargs; - struct mscclppSocket* listenSock = args->listenSock; - uint64_t magic = args->magic; - mscclppResult_t res = mscclppSuccess; - int nranks = 0, c = 0; - struct extInfo info; - union mscclppSocketAddress* rankAddresses = NULL; - union mscclppSocketAddress* rankAddressesRoot = NULL; // for initial rank <-> root information exchange - union mscclppSocketAddress* zero = NULL; - MSCCLPPCHECKGOTO(mscclppCalloc(&zero, 1), res, out); - setFilesLimit(); + findSubnetIf = -1, + dontCareIf = -2 +}; - TRACE(MSCCLPP_INIT, "BEGIN"); - /* Receive addresses from all ranks */ - do { - struct mscclppSocket sock; - MSCCLPPCHECKGOTO(mscclppSocketInit(&sock), res, out); - MSCCLPPCHECKGOTO(mscclppSocketAccept(&sock, listenSock), res, out); - MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out); - MSCCLPPCHECKGOTO(mscclppSocketClose(&sock), res, out); - - if (c == 0) { - nranks = info.nranks; - MSCCLPPCHECKGOTO(mscclppCalloc(&rankAddresses, nranks), res, out); - MSCCLPPCHECKGOTO(mscclppCalloc(&rankAddressesRoot, nranks), res, out); - } - - if (nranks != 
info.nranks) { - WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks); - goto out; - } - - if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union mscclppSocketAddress)) != 0) { - WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks); - goto out; - } - - // Save the connection handle for that rank - memcpy(rankAddressesRoot + info.rank, &info.extAddressListenRoot, sizeof(union mscclppSocketAddress)); - memcpy(rankAddresses + info.rank, &info.extAddressListen, sizeof(union mscclppSocketAddress)); - - ++c; - TRACE(MSCCLPP_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks); - } while (c < nranks); - TRACE(MSCCLPP_INIT, "COLLECTED ALL %d HANDLES", nranks); - - // Send the connect handle for the next rank in the AllGather ring - for (int r = 0; r < nranks; ++r) { - int next = (r + 1) % nranks; - struct mscclppSocket sock; - MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, rankAddressesRoot + r, magic, mscclppSocketTypeBootstrap), res, out); - MSCCLPPCHECKGOTO(mscclppSocketConnect(&sock), res, out); - MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, rankAddresses + next, sizeof(union mscclppSocketAddress)), res, out); - MSCCLPPCHECKGOTO(mscclppSocketClose(&sock), res, out); - } - TRACE(MSCCLPP_INIT, "SENT OUT ALL %d HANDLES", nranks); - -out: - if (listenSock != NULL) { - mscclppSocketClose(listenSock); - free(listenSock); - } - if (rankAddresses) - free(rankAddresses); - if (rankAddressesRoot) - free(rankAddressesRoot); - if (zero) - free(zero); - free(rargs); - - TRACE(MSCCLPP_INIT, "DONE"); - return NULL; -} - -mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle) +struct MscclppBootstrap::UniqueId { - struct mscclppSocket* listenSock; - struct bootstrapRootArgs* args; - pthread_t thread; + uint64_t magic; + union mscclppSocketAddress addr; +}; - MSCCLPPCHECK(mscclppCalloc(&listenSock, 1)); - MSCCLPPCHECK(mscclppSocketInit(listenSock, &handle->addr, handle->magic, 
mscclppSocketTypeBootstrap, NULL, 0)); - MSCCLPPCHECK(mscclppSocketListen(listenSock)); - MSCCLPPCHECK(mscclppSocketGetAddr(listenSock, &handle->addr)); - - MSCCLPPCHECK(mscclppCalloc(&args, 1)); - args->listenSock = listenSock; - args->magic = handle->magic; - NEQCHECK(pthread_create(&thread, NULL, bootstrapRoot, (void*)args), 0); - mscclppSetThreadName(thread, "MSCCLPP BootstrapR"); - NEQCHECK(pthread_detach(thread), 0); // will not be pthread_join()'d - return mscclppSuccess; -} - -// #include -// #include - -mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot, const char* ip_port_pair) -{ - memset(handle, 0, sizeof(mscclppBootstrapHandle)); - const char* env = NULL; - - if (ip_port_pair) { - env = ip_port_pair; - } else { - env = getenv("MSCCLPP_COMM_ID"); - } - if (env) { - handle->magic = 0xdeadbeef; - - INFO(MSCCLPP_ENV, "MSCCLPP_COMM_ID set by environment to %s", env); - if (mscclppSocketGetAddrFromString(&handle->addr, env) != mscclppSuccess) { - WARN("Invalid MSCCLPP_COMM_ID, please use format: : or []: or :"); - return mscclppInvalidArgument; - } - if (isRoot) - MSCCLPPCHECK(bootstrapCreateRoot(handle)); - } else { - MSCCLPPCHECK(getRandomData(&handle->magic, sizeof(handle->magic))); - memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union mscclppSocketAddress)); - MSCCLPPCHECK(bootstrapCreateRoot(handle)); - } - // printf("addr = %s port = %d\n", inet_ntoa(handle->addr.sin.sin_addr), (int)ntohs(handle->addr.sin.sin_port)); - // printf("addr = %s\n", inet_ntoa((*(struct sockaddr_in*)&handle->addr.sa).sin_addr)); - - return mscclppSuccess; -} - -struct unexConn +struct unexpectedConn { int peer; int tag; struct mscclppSocket sock; - struct unexConn* next; }; -struct bootstrapState +struct extInfo { - struct mscclppSocket listenSock; - struct mscclppSocket ringRecvSocket; - struct mscclppSocket ringSendSocket; - union mscclppSocketAddress* peerCommAddresses; - union mscclppSocketAddress* peerProxyAddresses; - struct 
unexConn* unexpectedConnections; - int cudaDev; int rank; - int nranks; - uint64_t magic; - volatile uint32_t* abortFlag; + int nRanks; + union mscclppSocketAddress extAddressListenRoot; + union mscclppSocketAddress extAddressListen; }; -mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscclppComm* comm) +class MscclppBootstrap::Impl { - int rank = comm->rank; - int nranks = comm->nRanks; - struct bootstrapState* state; - struct mscclppSocket* proxySocket; - mscclppSocketAddress nextAddr; - struct mscclppSocket sock, listenSockRoot; - struct extInfo info; +public: + static char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1]; + static union mscclppSocketAddress bootstrapNetIfAddr; - MSCCLPPCHECK(mscclppCalloc(&state, 1)); - state->rank = rank; - state->nranks = nranks; - state->abortFlag = comm->abortFlag; - comm->bootstrap = state; - comm->magic = state->magic = handle->magic; + static void bootstrapRoot(mscclppSocket* listenSock, uint64_t magic, int nRanks); - TRACE(MSCCLPP_INIT, "rank %d nranks %d", rank, nranks); + Impl(std::string ipPortPair, int rank, int nRanks, const mscclppBootstrapHandle handle); + mscclppResult_t init(const mscclppComm& comm); + mscclppResult_t createRoot(MscclppBootstrap::UniqueId& handle); + mscclppResult_t allGather(void* allData, int size); - info.rank = rank; - info.nranks = nranks; + void startBootstrapThread(); - // Create socket for other ranks to contact me - MSCCLPPCHECK(mscclppSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeBootstrap, - comm->abortFlag)); - MSCCLPPCHECK(mscclppSocketListen(&state->listenSock)); - MSCCLPPCHECK(mscclppSocketGetAddr(&state->listenSock, &info.extAddressListen)); + MscclppBootstrap::UniqueId uniqueId_; +private: + int rank_; + int nRanks_; + mscclppSocket listenSock_; + mscclppSocket ringRecvSocket_; + mscclppSocket ringSendSocket_; + std::vector peerCommAddresses_; + std::vector peerProxyAddresses_; + std::queue unexpectedConnections_; + 
volatile uint32_t* abortFlag_; - // Create socket for root to contact me - MSCCLPPCHECK( - mscclppSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag)); - MSCCLPPCHECK(mscclppSocketListen(&listenSockRoot)); - MSCCLPPCHECK(mscclppSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); + static mscclppResult_t netSend(mscclppSocket* sock, void* data, int size); + static mscclppResult_t netRecv(mscclppSocket* sock, void* data, int size); - // stagger connection times to avoid an overload of the root - if (nranks > 128) { - long msec = rank; - struct timespec tv; - tv.tv_sec = msec / 1000; - tv.tv_nsec = 1000000 * (msec % 1000); - TRACE(MSCCLPP_INIT, "rank %d delaying connection to root by %ld msec", rank, msec); - (void)nanosleep(&tv, NULL); + mscclppResult_t netInit(std::string ipPortPair); +}; + +MscclppBootstrap::Impl::Impl(std::string ipPortPair, int rank, int nRanks, const mscclppBootstrapHandle handle) + : rank_(rank), nRanks_(nRanks), peerCommAddresses_(nRanks, mscclppSocketAddress()), + peerProxyAddresses_(nRanks, mscclppSocketAddress()), abortFlag_(nullptr) +{ + int ret = netInit(ipPortPair); + if (ret != mscclppSuccess) { + throw std::runtime_error("Failed to initialize network"); } - // send info on my listening socket to root - MSCCLPPCHECK(mscclppSocketInit(&sock, &handle->addr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag)); - MSCCLPPCHECK(mscclppSocketConnect(&sock)); - MSCCLPPCHECK(bootstrapNetSend(&sock, &info, sizeof(info))); - MSCCLPPCHECK(mscclppSocketClose(&sock)); - - // get info on my "next" rank in the bootstrap ring from root - MSCCLPPCHECK(mscclppSocketInit(&sock)); - MSCCLPPCHECK(mscclppSocketAccept(&sock, &listenSockRoot)); - MSCCLPPCHECK(bootstrapNetRecv(&sock, &nextAddr, sizeof(union mscclppSocketAddress))); - MSCCLPPCHECK(mscclppSocketClose(&sock)); - MSCCLPPCHECK(mscclppSocketClose(&listenSockRoot)); - - MSCCLPPCHECK( - mscclppSocketInit(&state->ringSendSocket, 
&nextAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag)); - MSCCLPPCHECK(mscclppSocketConnect(&state->ringSendSocket)); - // Accept the connect request from the previous rank in the AllGather ring - MSCCLPPCHECK(mscclppSocketInit(&state->ringRecvSocket)); - MSCCLPPCHECK(mscclppSocketAccept(&state->ringRecvSocket, &state->listenSock)); - - // AllGather all listen handlers - MSCCLPPCHECK(mscclppCalloc(&state->peerCommAddresses, nranks)); - MSCCLPPCHECK(mscclppSocketGetAddr(&state->listenSock, state->peerCommAddresses + rank)); - MSCCLPPCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union mscclppSocketAddress))); - - // Create the service proxy - MSCCLPPCHECK(mscclppCalloc(&state->peerProxyAddresses, nranks)); - - // proxy is aborted through a message; don't set abortFlag - MSCCLPPCHECK(mscclppCalloc(&proxySocket, 1)); - MSCCLPPCHECK( - mscclppSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeProxy, comm->abortFlag)); - MSCCLPPCHECK(mscclppSocketListen(proxySocket)); - MSCCLPPCHECK(mscclppSocketGetAddr(proxySocket, state->peerProxyAddresses + rank)); - MSCCLPPCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union mscclppSocketAddress))); - // MSCCLPPCHECK(mscclppProxyInit(comm, proxySocket, state->peerProxyAddresses)); - - TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank, nranks); - - return mscclppSuccess; -} - -mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size) -{ - struct bootstrapState* state = (struct bootstrapState*)commState; - char* data = (char*)allData; - int rank = state->rank; - int nranks = state->nranks; - - TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d", rank, nranks, size); - - /* Simple ring based AllGather - * At each step i receive data from (rank-i-1) from left - * and send previous step's data from (rank-i) to right - */ - for (int i = 0; i < nranks - 1; i++) { - size_t rslice = (rank - i - 1 + nranks) % nranks; - size_t sslice = (rank - i + 
nranks) % nranks; - - // Send slice to the right - MSCCLPPCHECK(bootstrapNetSend(&state->ringSendSocket, data + sslice * size, size)); - // Recv slice from the left - MSCCLPPCHECK(bootstrapNetRecv(&state->ringRecvSocket, data + rslice * size, size)); + mscclppBootstrapHandle zeroHandle = {0}; + if (memcmp(&handle, &zeroHandle, sizeof(mscclppBootstrapHandle)) != 0) { + uniqueId_.magic = handle.magic; + uniqueId_.addr = handle.addr; + return; } - TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); - return mscclppSuccess; -} - -mscclppResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) -{ - mscclppResult_t ret = mscclppSuccess; - struct bootstrapState* state = (struct bootstrapState*)commState; - struct mscclppSocket sock; - - MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, state->peerCommAddresses + peer, state->magic, mscclppSocketTypeBootstrap, - state->abortFlag), - ret, fail); - MSCCLPPCHECKGOTO(mscclppSocketConnect(&sock), ret, fail); - MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail); - MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail); - MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, fail); - -exit: - MSCCLPPCHECK(mscclppSocketClose(&sock)); - return ret; -fail: - goto exit; -} - -mscclppResult_t bootstrapBarrier(void* commState, int* ranks, int rank, int nranks, int tag) -{ - if (nranks == 1) - return mscclppSuccess; - TRACE(MSCCLPP_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag); - - /* Simple intra process barrier - * - * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet, - * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988" - */ - int data[1]; - for (int mask = 1; mask < nranks; mask <<= 1) { - int src = (rank - mask + nranks) % nranks; - int dst = (rank + mask) % nranks; - MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], tag, data, 
sizeof(data))); - MSCCLPPCHECK(bootstrapRecv(commState, ranks[src], tag, data, sizeof(data))); - } - - TRACE(MSCCLPP_INIT, "rank %d nranks %d tag %x - DONE", rank, nranks, tag); - return mscclppSuccess; -} - -mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int* ranks, int rank, int nranks, void* allData, int size) -{ - if (nranks == 1) - return mscclppSuccess; - char* data = (char*)allData; - TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - ENTER", rank, nranks, size); - - for (int i = 1; i < nranks; i++) { - int src = (rank - i + nranks) % nranks; - int dst = (rank + i) % nranks; - MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], /*tag=*/i, data + rank * size, size)); - MSCCLPPCHECK(bootstrapRecv(commState, ranks[src], /*tag=*/i, data + src * size, size)); - } - - TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); - return mscclppSuccess; -} - -mscclppResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock) -{ - // New unex - struct unexConn* unex; - MSCCLPPCHECK(mscclppCalloc(&unex, 1)); - unex->peer = peer; - unex->tag = tag; - memcpy(&unex->sock, sock, sizeof(struct mscclppSocket)); - - // Enqueue - struct unexConn* list = state->unexpectedConnections; - if (list == NULL) { - state->unexpectedConnections = unex; - return mscclppSuccess; - } - while (list->next) - list = list->next; - list->next = unex; - return mscclppSuccess; -} - -mscclppResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock, - int* found) -{ - struct unexConn* elem = state->unexpectedConnections; - struct unexConn* prev = NULL; - *found = 0; - while (elem) { - if (elem->peer == peer && elem->tag == tag) { - if (prev == NULL) { - state->unexpectedConnections = elem->next; - } else { - prev->next = elem->next; - } - memcpy(sock, &elem->sock, sizeof(struct mscclppSocket)); - free(elem); - *found = 1; - return mscclppSuccess; + if (!ipPortPair.empty()) { + 
uniqueId_.magic = 0xdeadbeef; + } else { + mscclppResult_t ret = getRandomData(&uniqueId_.magic, sizeof(uniqueId_.magic)); + if (ret != mscclppSuccess) { + throw std::runtime_error("getting random data failed"); } - prev = elem; - elem = elem->next; } - return mscclppSuccess; + std::memcpy(&uniqueId_.addr, &bootstrapNetIfAddr, sizeof(union mscclppSocketAddress)); } -static void unexpectedFree(struct bootstrapState* state) +mscclppResult_t MscclppBootstrap::Impl::netInit(std::string ipPortPair) { - struct unexConn* elem = state->unexpectedConnections; - struct unexConn* prev = NULL; - - while (elem) { - prev = elem; - elem = elem->next; - free(prev); - } - return; -} - -// We can't know who we'll receive from, so we need to receive everything at once -mscclppResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) -{ - mscclppResult_t ret = mscclppSuccess; - struct bootstrapState* state = (struct bootstrapState*)commState; - struct mscclppSocket sock; - int newPeer, newTag; - - // Search unexpected connections first - int found; - MSCCLPPCHECK(unexpectedDequeue(state, peer, tag, &sock, &found)); - if (found) { - MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail); - goto exit; - } - - // Then look for new connections - while (1) { - MSCCLPPCHECKGOTO(mscclppSocketInit(&sock), ret, fail); - MSCCLPPCHECKGOTO(mscclppSocketAccept(&sock, &state->listenSock), ret, fail); - MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, &newPeer, sizeof(int)), ret, fail); - MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, &newTag, sizeof(int)), ret, fail); - if (newPeer == peer && newTag == tag) { - MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail); - goto exit; + if (!ipPortPair.empty()) { + union mscclppSocketAddress remoteAddr; + if (mscclppSocketGetAddrFromString(&remoteAddr, ipPortPair.c_str()) != mscclppSuccess) { + WARN("Invalid MSCCLPP_COMM_ID, please use format: : or []: or :"); + return mscclppInvalidArgument; } - // 
Unexpected connection. Save for later. - MSCCLPPCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, &sock), ret, fail); - } -exit: - MSCCLPPCHECK(mscclppSocketClose(&sock)); - return ret; -fail: - goto exit; -} - -mscclppResult_t bootstrapClose(void* commState) -{ - struct bootstrapState* state = (struct bootstrapState*)commState; - if (state->unexpectedConnections != NULL) { - unexpectedFree(state); - if (*state->abortFlag == 0) { - WARN("Unexpected connections are not empty"); + if (mscclppFindInterfaceMatchSubnet(this->bootstrapNetIfName, &this->bootstrapNetIfAddr, &remoteAddr, + MAX_IF_NAME_SIZE, 1) <= 0) { + WARN("NET/Socket : No usable listening interface found"); + return mscclppSystemError; + } + } else { + int ret = mscclppFindInterfaces(this->bootstrapNetIfName, &this->bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); + if (ret <= 0) { + WARN("Bootstrap : no socket interface found"); return mscclppInternalError; } } - MSCCLPPCHECK(mscclppSocketClose(&state->listenSock)); - MSCCLPPCHECK(mscclppSocketClose(&state->ringSendSocket)); - MSCCLPPCHECK(mscclppSocketClose(&state->ringRecvSocket)); - - free(state->peerCommAddresses); - free(state); - + char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2]; + std::sprintf(line, " %s:", bootstrapNetIfName); + mscclppSocketToString(&bootstrapNetIfAddr, line + strlen(line)); + INFO(MSCCLPP_INIT, "Bootstrap : Using%s", line); return mscclppSuccess; } -mscclppResult_t bootstrapAbort(void* commState) +mscclppResult_t MscclppBootstrap::Impl::init(const mscclppComm& comm) { - struct bootstrapState* state = (struct bootstrapState*)commState; - if (commState == NULL) - return mscclppSuccess; - MSCCLPPCHECK(mscclppSocketClose(&state->listenSock)); - MSCCLPPCHECK(mscclppSocketClose(&state->ringSendSocket)); - MSCCLPPCHECK(mscclppSocketClose(&state->ringRecvSocket)); - free(state->peerCommAddresses); - free(state->peerProxyAddresses); - free(state); return mscclppSuccess; } + +MscclppBootstrap::MscclppBootstrap(std::string ipPortPair, 
int rank, int nRanks) +{ + pimpl = std::make_unique(ipPortPair, rank, nRanks, mscclppBootstrapHandle{0}); +} + +MscclppBootstrap::MscclppBootstrap(mscclppBootstrapHandle handle, int rank, int nRanks) +{ + pimpl = std::make_unique("", rank, nRanks, handle); +} + +MscclppBootstrap::UniqueId MscclppBootstrap::getUniqueId() +{ + return pimpl->uniqueId_; +} + +// void MscclppBootstrap::Impl::bootstrapRoot(mscclppSocket* listenSock, uint64_t magic, int nRanks) +// { +// extInfo info; +// mscclppResult_t res = mscclppSuccess; +// int numCollected = 0; +// std::vector rankAddresses(nRanks, mscclppSocketAddress()); +// // for initial rank <-> root information exchange +// std::vector rankAddressesRoot(nRanks, mscclppSocketAddress()); + +// mscclppSocketAddress zero; +// std::memset(rankAddresses.data(), 0, sizeof(mscclppSocketAddress) * nRanks); +// std::memset(rankAddressesRoot.data(), 0, sizeof(mscclppSocketAddress) * nRanks); +// std::memset(&zero, 0, sizeof(mscclppSocketAddress)); +// setFilesLimit(); + +// TRACE(MSCCLPP_INIT, "BEGIN"); +// /* Receive addresses from all ranks */ +// do { +// mscclppSocket sock; +// MSCCLPPCHECKGOTO(mscclppSocketInit(&sock), res, out); +// MSCCLPPCHECKGOTO(mscclppSocketAccept(&sock, listenSock), res, out); +// MSCCLPPCHECKGOTO(NetRecv(&sock, &info, sizeof(info)), res, out); +// MSCCLPPCHECKGOTO(mscclppSocketClose(&sock), res, out); + +// if (nRanks != info.nRanks) { +// WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nRanks, info.nRanks); +// return; +// } + +// if (std::memcmp(&zero, &rankAddressesRoot[info.rank], sizeof(mscclppSocketAddress)) != 0) { +// WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nRanks); +// return; +// } + +// // Save the connection handle for that rank +// rankAddressesRoot[info.rank] = info.extAddressListenRoot; +// rankAddresses[info.rank] = info.extAddressListen; + +// ++numCollected; +// TRACE(MSCCLPP_INIT, "Received connect from rank %d total %d/%d", 
info.rank, c, nranks); +// } while (numCollected < nRanks); +// TRACE(MSCCLPP_INIT, "COLLECTED ALL %d HANDLES", nranks); + +// // Send the connect handle for the next rank in the AllGather ring +// for (int r = 0; r < nRanks; ++r) { +// int next = (r + 1) % nRanks; +// mscclppSocket sock; +// MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, &rankAddressesRoot[r], magic, mscclppSocketTypeBootstrap), res, out); +// MSCCLPPCHECKGOTO(mscclppSocketConnect(&sock), res, out); +// MSCCLPPCHECKGOTO(NetSend(&sock, &rankAddresses[next], sizeof(mscclppSocketAddress)), res, out); +// MSCCLPPCHECKGOTO(mscclppSocketClose(&sock), res, out); +// } +// TRACE(MSCCLPP_INIT, "SENT OUT ALL %d HANDLES", nRanks); + +// out: +// if (listenSock != nullptr) { +// mscclppSocketClose(listenSock); +// free(listenSock); +// } +// TRACE(MSCCLPP_INIT, "DONE"); +// } + +// mscclppResult_t MscclppBootstrap::Impl::createRoot(mscclppBootstrap::UniqueId& handle) +// { +// MSCCLPPCHECK(mscclppSocketInit(&this->listenSock, &handle.addr, handle.magic, mscclppSocketTypeBootstrap, NULL, 0)); +// MSCCLPPCHECK(mscclppSocketListen(&this->listenSock)); +// MSCCLPPCHECK(mscclppSocketGetAddr(&this->listenSock, &handle.addr)); + +// std::thread thread(BootstrapRoot, listenSock, handle.magic, nRanks); +// mscclppSetThreadName(thread.native_handle(), "MSCCLPP BootstrapR"); +// thread.detach(); +// return mscclppSuccess; +// } + +// // Additional sync functions +// mscclppResult_t MscclppBootstrap::Impl::netSend(mscclppSocket* sock, void* data, int size) +// { +// MSCCLPPCHECK(mscclppSocketSend(sock, &size, sizeof(int))); +// MSCCLPPCHECK(mscclppSocketSend(sock, data, size)); +// return mscclppSuccess; +// } + +// mscclppResult_t MscclppBootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) +// { +// int recvSize; +// MSCCLPPCHECK(mscclppSocketRecv(sock, &recvSize, sizeof(int))); +// if (recvSize > size) { +// WARN("Message truncated : received %d bytes instead of %d", recvSize, size); +// return 
mscclppInternalError; +// } +// MSCCLPPCHECK(mscclppSocketRecv(sock, data, std::min(recvSize, size))); +// return mscclppSuccess; +// } + +// mscclppResult_t MscclppBootstrap::Impl::init(const mscclppComm& comm) +// { +// this->rank = comm.rank; +// this->nRanks = comm.nRanks; + +// mscclppSocket* proxySocket; +// mscclppSocketAddress nextAddr; +// mscclppSocket sock, listenSockRoot; +// extInfo info; + +// TRACE(MSCCLPP_INIT, "rank %d nranks %d", rank, nranks); + +// info.rank = rank; +// info.nRanks = this->nRanks; + +// uint64_t magic = this->handle.magic; +// // Create socket for other ranks to contact me +// MSCCLPPCHECK( +// mscclppSocketInit(&this->listenSock, &bootstrapNetIfAddr, magic, mscclppSocketTypeBootstrap, this->abortFlag)); +// MSCCLPPCHECK(mscclppSocketListen(&this->listenSock)); +// MSCCLPPCHECK(mscclppSocketGetAddr(&this->listenSock, &info.extAddressListen)); + +// // Create socket for root to contact me +// MSCCLPPCHECK( +// mscclppSocketInit(&listenSockRoot, &bootstrapNetIfAddr, magic, mscclppSocketTypeBootstrap, this->abortFlag)); +// MSCCLPPCHECK(mscclppSocketListen(&listenSockRoot)); +// MSCCLPPCHECK(mscclppSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); + +// // stagger connection times to avoid an overload of the root +// if (this->nRanks > 128) { +// long msec = rank; +// struct timespec tv; +// tv.tv_sec = msec / 1000; +// tv.tv_nsec = 1000000 * (msec % 1000); +// TRACE(MSCCLPP_INIT, "rank %d delaying connection to root by %ld msec", rank, msec); +// (void)nanosleep(&tv, NULL); +// } + +// // send info on my listening socket to root +// MSCCLPPCHECK(mscclppSocketInit(&sock, &this->handle.addr, magic, mscclppSocketTypeBootstrap, this->abortFlag)); +// MSCCLPPCHECK(mscclppSocketConnect(&sock)); +// MSCCLPPCHECK(NetSend(&sock, &info, sizeof(info))); +// MSCCLPPCHECK(mscclppSocketClose(&sock)); + +// // get info on my "next" rank in the bootstrap ring from root +// MSCCLPPCHECK(mscclppSocketInit(&sock)); +// 
MSCCLPPCHECK(mscclppSocketAccept(&sock, &listenSockRoot)); +// MSCCLPPCHECK(NetRecv(&sock, &nextAddr, sizeof(union mscclppSocketAddress))); +// MSCCLPPCHECK(mscclppSocketClose(&sock)); +// MSCCLPPCHECK(mscclppSocketClose(&listenSockRoot)); + +// MSCCLPPCHECK( +// mscclppSocketInit(&this->ringSendSocket, &nextAddr, magic, mscclppSocketTypeBootstrap, this->abortFlag)); +// MSCCLPPCHECK(mscclppSocketConnect(&this->ringSendSocket)); +// // Accept the connect request from the previous rank in the AllGather ring +// MSCCLPPCHECK(mscclppSocketInit(&this->ringRecvSocket)); +// MSCCLPPCHECK(mscclppSocketAccept(&this->ringRecvSocket, &this->listenSock)); + +// // AllGather all listen handlers +// MSCCLPPCHECK(mscclppCalloc(&this->peerCommAddresses, this->nRanks)); +// MSCCLPPCHECK(mscclppSocketGetAddr(&this->listenSock, this->peerCommAddresses + rank)); +// MSCCLPPCHECK(bootstrapAllGather(state, this->peerCommAddresses, sizeof(union mscclppSocketAddress))); + +// // Create the service proxy +// MSCCLPPCHECK(mscclppCalloc(&this->peerProxyAddresses, this->nRanks)); + +// // proxy is aborted through a message; don't set abortFlag +// MSCCLPPCHECK(mscclppCalloc(&proxySocket, 1)); +// MSCCLPPCHECK( +// mscclppSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeProxy, comm->abortFlag)); +// MSCCLPPCHECK(mscclppSocketListen(proxySocket)); +// MSCCLPPCHECK(mscclppSocketGetAddr(proxySocket, &this->peerProxyAddresses[rank])); +// MSCCLPPCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union mscclppSocketAddress))); + +// TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank, nranks); + +// return mscclppSuccess; +// } + +// mscclppResult_t MscclppBootstrap::Impl::allGather(void* allData, int size) +// { +// char* data = static_cast(allData); +// int rank = this->rank; +// int nRanks = this->nRanks; + +// TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d", rank, nRanks, size); + +// /* Simple ring based AllGather +// * At each step i receive data 
from (rank-i-1) from left +// * and send previous step's data from (rank-i) to right +// */ +// for (int i = 0; i < nRanks - 1; i++) { +// size_t rSlice = (rank - i - 1 + nRanks) % nRanks; +// size_t sSlice = (rank - i + nRanks) % nRanks; + +// // Send slice to the right +// MSCCLPPCHECK(NetSend(&this->ringSendSocket, data + sSlice * size, size)); +// // Recv slice from the left +// MSCCLPPCHECK(bootstrapNetRecv(&this->ringRecvSocket, data + rSlice * size, size)); +// } + +// TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); +// return mscclppSuccess; +// } + +// mscclppResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) +// { +// mscclppResult_t ret = mscclppSuccess; +// struct bootstrapState* state = (struct bootstrapState*)commState; +// struct mscclppSocket sock; + +// MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, state->peerCommAddresses + peer, state->magic, mscclppSocketTypeBootstrap, +// state->abortFlag), +// ret, fail); +// MSCCLPPCHECKGOTO(mscclppSocketConnect(&sock), ret, fail); +// MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail); +// MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail); +// MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, fail); + +// exit: +// MSCCLPPCHECK(mscclppSocketClose(&sock)); +// return ret; +// fail: +// goto exit; +// } + +// mscclppResult_t bootstrapBarrier(void* commState, int* ranks, int rank, int nranks, int tag) +// { +// if (nranks == 1) +// return mscclppSuccess; +// TRACE(MSCCLPP_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag); + +// /* Simple intra process barrier +// * +// * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet, +// * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988" +// */ +// int data[1]; +// for (int mask = 1; mask < nranks; mask <<= 1) { +// int src = (rank - mask + nranks) % 
nranks; +// int dst = (rank + mask) % nranks; +// MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], tag, data, sizeof(data))); +// MSCCLPPCHECK(bootstrapRecv(commState, ranks[src], tag, data, sizeof(data))); +// } + +// TRACE(MSCCLPP_INIT, "rank %d nranks %d tag %x - DONE", rank, nranks, tag); +// return mscclppSuccess; +// } + +// mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int* ranks, int rank, int nranks, void* allData, int size) +// { +// if (nranks == 1) +// return mscclppSuccess; +// char* data = (char*)allData; +// TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - ENTER", rank, nranks, size); + +// for (int i = 1; i < nranks; i++) { +// int src = (rank - i + nranks) % nranks; +// int dst = (rank + i) % nranks; +// MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], /*tag=*/i, data + rank * size, size)); +// MSCCLPPCHECK(bootstrapRecv(commState, ranks[src], /*tag=*/i, data + src * size, size)); +// } + +// TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); +// return mscclppSuccess; +// } + +// mscclppResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock) +// { +// // New unex +// struct unexConn* unex; +// MSCCLPPCHECK(mscclppCalloc(&unex, 1)); +// unex->peer = peer; +// unex->tag = tag; +// memcpy(&unex->sock, sock, sizeof(struct mscclppSocket)); + +// // Enqueue +// struct unexConn* list = state->unexpectedConnections; +// if (list == NULL) { +// state->unexpectedConnections = unex; +// return mscclppSuccess; +// } +// while (list->next) +// list = list->next; +// list->next = unex; +// return mscclppSuccess; +// } + +// mscclppResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock, +// int* found) +// { +// struct unexConn* elem = state->unexpectedConnections; +// struct unexConn* prev = NULL; +// *found = 0; +// while (elem) { +// if (elem->peer == peer && elem->tag == tag) { +// if (prev == NULL) { +// 
state->unexpectedConnections = elem->next; +// } else { +// prev->next = elem->next; +// } +// memcpy(sock, &elem->sock, sizeof(struct mscclppSocket)); +// free(elem); +// *found = 1; +// return mscclppSuccess; +// } +// prev = elem; +// elem = elem->next; +// } +// return mscclppSuccess; +// } + +// static void unexpectedFree(struct bootstrapState* state) +// { +// struct unexConn* elem = state->unexpectedConnections; +// struct unexConn* prev = NULL; + +// while (elem) { +// prev = elem; +// elem = elem->next; +// free(prev); +// } +// return; +// } + +// // We can't know who we'll receive from, so we need to receive everything at once +// mscclppResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) +// { +// mscclppResult_t ret = mscclppSuccess; +// struct bootstrapState* state = (struct bootstrapState*)commState; +// struct mscclppSocket sock; +// int newPeer, newTag; + +// // Search unexpected connections first +// int found; +// MSCCLPPCHECK(unexpectedDequeue(state, peer, tag, &sock, &found)); +// if (found) { +// MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail); +// goto exit; +// } + +// // Then look for new connections +// while (1) { +// MSCCLPPCHECKGOTO(mscclppSocketInit(&sock), ret, fail); +// MSCCLPPCHECKGOTO(mscclppSocketAccept(&sock, &state->listenSock), ret, fail); +// MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, &newPeer, sizeof(int)), ret, fail); +// MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, &newTag, sizeof(int)), ret, fail); +// if (newPeer == peer && newTag == tag) { +// MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail); +// goto exit; +// } +// // Unexpected connection. Save for later. 
+// MSCCLPPCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, &sock), ret, fail); +// } +// exit: +// MSCCLPPCHECK(mscclppSocketClose(&sock)); +// return ret; +// fail: +// goto exit; +// } + +// mscclppResult_t bootstrapClose(void* commState) +// { +// struct bootstrapState* state = (struct bootstrapState*)commState; +// if (state->unexpectedConnections != nullptr) { +// unexpectedFree(state); +// if (*state->abortFlag == 0) { +// WARN("Unexpected connections are not empty"); +// return mscclppInternalError; +// } +// } + +// MSCCLPPCHECK(mscclppSocketClose(&state->listenSock)); +// MSCCLPPCHECK(mscclppSocketClose(&state->ringSendSocket)); +// MSCCLPPCHECK(mscclppSocketClose(&state->ringRecvSocket)); + +// free(state->peerCommAddresses); +// free(state); + +// return mscclppSuccess; +// } + +// mscclppResult_t bootstrapAbort(void* commState) +// { +// struct bootstrapState* state = (struct bootstrapState*)commState; +// if (commState == nullptr) +// return mscclppSuccess; +// MSCCLPPCHECK(mscclppSocketClose(&state->listenSock)); +// MSCCLPPCHECK(mscclppSocketClose(&state->ringSendSocket)); +// MSCCLPPCHECK(mscclppSocketClose(&state->ringRecvSocket)); +// free(state->peerCommAddresses); +// free(state->peerProxyAddresses); +// free(state); +// return mscclppSuccess; +// } + diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 69d916aa..246a380e 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -1,11 +1,4 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef MSCCLPP_BOOTSTRAP_H_ -#define MSCCLPP_BOOTSTRAP_H_ +#pragma once #include "mscclpp.h" #include "socket.h" @@ -17,27 +10,28 @@ struct mscclppBootstrapHandle uint64_t magic; union mscclppSocketAddress addr; }; + static_assert(sizeof(struct mscclppBootstrapHandle) <= sizeof(mscclppUniqueId), "Bootstrap handle is too large to fit inside MSCCLPP unique ID"); -class mscclppBootstrap : Bootstrap { +class MscclppBootstrap : Bootstrap { public: - mscclppBootstrap(); - void Initliaze(std::string ipPortPair, int rank, int nranks); - void Initliaze(mscclppBootstrapHandle handle, int rank, int nranks); + MscclppBootstrap(std::string ipPortPair, int rank, int nRanks); + MscclppBootstrap(mscclppBootstrapHandle handle, int rank, int nRanks); + void Initialize(const mscclppComm& comm); void Send(void* data, int size, int peer, int tag); void Recv(void* data, int size, int peer, int tag); void AllGather(void* allData, int size); void Barrier(); + void Close(); struct UniqueId; - std::unique_ptr GetUniqueId(); + UniqueId getUniqueId(); private: - struct impl; - std::unique_ptr pimpl; + class Impl; + std::unique_ptr pimpl; }; -mscclppResult_t bootstrapNetInit(const char* ip_port_pair = NULL); mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle); mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot = true, const char* ip_port_pair = NULL); @@ -49,4 +43,3 @@ mscclppResult_t bootstrapBarrier(void* commState, int* ranks, int rank, int nran mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int* ranks, int rank, int nranks, void* allData, int size); mscclppResult_t bootstrapClose(void* commState); mscclppResult_t bootstrapAbort(void* commState); -#endif diff --git a/src/include/comm.h b/src/include/comm.h index 8275e0cb..b76c8c4f 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ 
-44,7 +44,7 @@ struct mscclppComm struct mscclppDevConn devConns[MAXCONNECTIONS]; int nConns; - void* bootstrap; + MscclppBootstrap bootstrap; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. uint64_t magic; diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index e10b8e4f..6ad587fb 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -357,7 +357,7 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void * Note that with IB, buffers are registered at a page level and if a buffer is spread through multiple pages * and do not fully utilize all of them, IB's QP has to register for all involved pages. This potentially has * security risks if the devConn's accesses are given to a malicious process. - * + * * This version does not register a buffer. Buffers should instead be registered with mscclppRegisterBuffer(). * * Inputs: @@ -377,7 +377,7 @@ mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, * connIdx: the index of the connection by order of calls to mscclppConnect/mscclppConnectWithoutBuffer * localBuff: the local send/receive buffer * buffSize: the size of the local buffer - * + * * Outputs: * handle: a handle to the buffer registration */ From 7ac861b1e9ad2bf04990d921046391141754d55b Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 21 Apr 2023 08:41:33 +0000 Subject: [PATCH 036/135] Refactor bootstrap --- src/bootstrap/bootstrap.cc | 716 ++++++++++++++++--------------------- src/bootstrap/socket.cc | 2 +- src/include/bootstrap.h | 6 +- src/include/socket.h | 2 +- 4 files changed, 319 insertions(+), 407 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index c2e503d8..84d6d25b 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -1,9 +1,6 @@ #include "bootstrap.h" -#include "config.h" -#include "mscclpp.h" #include "utils.h" -#include #include #include #include @@ -11,7 +8,6 
@@ #include #include -#include namespace { uint64_t hashUniqueId(const mscclppBootstrapHandle& id) @@ -71,16 +67,16 @@ public: static char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1]; static union mscclppSocketAddress bootstrapNetIfAddr; - static void bootstrapRoot(mscclppSocket* listenSock, uint64_t magic, int nRanks); - Impl(std::string ipPortPair, int rank, int nRanks, const mscclppBootstrapHandle handle); - mscclppResult_t init(const mscclppComm& comm); - mscclppResult_t createRoot(MscclppBootstrap::UniqueId& handle); + ~Impl(); + mscclppResult_t initialize(); mscclppResult_t allGather(void* allData, int size); - - void startBootstrapThread(); + mscclppResult_t send(void* data, int size, int peer, int tag); + mscclppResult_t recv(void* data, int size, int peer, int tag); + mscclppResult_t barrier(); MscclppBootstrap::UniqueId uniqueId_; + private: int rank_; int nRanks_; @@ -91,10 +87,16 @@ private: std::vector peerProxyAddresses_; std::queue unexpectedConnections_; volatile uint32_t* abortFlag_; + std::thread rootThread_; - static mscclppResult_t netSend(mscclppSocket* sock, void* data, int size); + static mscclppResult_t netSend(mscclppSocket* sock, const void* data, int size); static mscclppResult_t netRecv(mscclppSocket* sock, void* data, int size); + mscclppResult_t bootstrapRoot(); + mscclppResult_t getRemoteAddresses(mscclppSocket* listenSock, std::vector& rankAddresses, + std::vector& rankAddressesRoot, int& rank); + mscclppResult_t sendHandleToPeer(int peer, const std::vector& rankAddresses, + const std::vector& rankAddressesRoot); mscclppResult_t netInit(std::string ipPortPair); }; @@ -123,6 +125,141 @@ MscclppBootstrap::Impl::Impl(std::string ipPortPair, int rank, int nRanks, const } } std::memcpy(&uniqueId_.addr, &bootstrapNetIfAddr, sizeof(union mscclppSocketAddress)); + if (rank_ == 0) { + rootThread_ = std::thread(&MscclppBootstrap::Impl::bootstrapRoot, this, &listenSock_, uniqueId_.magic, nRanks_); + } +} + +MscclppBootstrap::Impl::~Impl() +{ + if 
(rootThread_.joinable()) { + rootThread_.join(); + } +} + +mscclppResult_t MscclppBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, + std::vector& rankAddresses, + std::vector& rankAddressesRoot, + int& rank) +{ + mscclppSocket sock; + extInfo info; + mscclppResult_t res = mscclppSuccess; + + mscclppSocketAddress zero; + std::memset(&zero, 0, sizeof(mscclppSocketAddress)); + res = mscclppSocketInit(&sock); + if (res != mscclppSuccess) { + WARN("Bootstrap Root : mscclppSocketInit failed"); + return res; + } + res = mscclppSocketAccept(&sock, listenSock); + if (res != mscclppSuccess) { + WARN("Bootstrap Root : mscclppSocketAccept failed"); + return res; + } + res = netRecv(&sock, &info, sizeof(info)); + if (res != mscclppSuccess) { + WARN("Bootstrap Root : netRecv failed"); + return res; + } + res = mscclppSocketClose(&sock); + if (res != mscclppSuccess) { + WARN("Bootstrap Root : mscclppSocketClose failed"); + return res; + } + + if (this->nRanks_ != info.nRanks) { + WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", this->nRanks_, info.nRanks); + return res; + } + + if (std::memcmp(&zero, &rankAddressesRoot[info.rank], sizeof(mscclppSocketAddress)) != 0) { + WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, this->nRanks_); + return res; + } + + // Save the connection handle for that rank + rankAddressesRoot[info.rank] = info.extAddressListenRoot; + rankAddresses[info.rank] = info.extAddressListen; + rank = info.rank; + return res; +} + +mscclppResult_t MscclppBootstrap::Impl::sendHandleToPeer(int peer, + const std::vector& rankAddresses, + const std::vector& rankAddressesRoot) +{ + mscclppSocket sock; + mscclppResult_t res; + int next = (peer + 1) % this->nRanks_; + res = mscclppSocketInit(&sock, &rankAddressesRoot[peer], this->uniqueId_.magic, mscclppSocketTypeBootstrap); + if (res != mscclppSuccess) { + WARN("Bootstrap Root : mscclppSocketInit failed"); + return res; + } + res = 
mscclppSocketConnect(&sock); + if (res != mscclppSuccess) { + WARN("Bootstrap Root : mscclppSocketConnect failed"); + return res; + } + res = netSend(&sock, &rankAddresses[next], sizeof(mscclppSocketAddress)); + if (res != mscclppSuccess) { + WARN("Bootstrap Root : netSend failed"); + return res; + } + res = mscclppSocketClose(&sock); + if (res != mscclppSuccess) { + WARN("Bootstrap Root : mscclppSocketClose failed"); + return res; + } + return mscclppSuccess; +} + +mscclppResult_t MscclppBootstrap::Impl::bootstrapRoot() +{ + mscclppResult_t res = mscclppSuccess; + int numCollected = 0; + std::vector rankAddresses(this->nRanks_, mscclppSocketAddress()); + // for initial rank <-> root information exchange + std::vector rankAddressesRoot(this->nRanks_, mscclppSocketAddress()); + + std::memset(rankAddresses.data(), 0, sizeof(mscclppSocketAddress) * this->nRanks_); + std::memset(rankAddressesRoot.data(), 0, sizeof(mscclppSocketAddress) * this->nRanks_); + setFilesLimit(); + + mscclppSocket listenSock; + MSCCLPPCHECK( + mscclppSocketInit(&listenSock, &uniqueId_.addr, uniqueId_.magic, mscclppSocketTypeBootstrap, nullptr, 0)); + MSCCLPPCHECK(mscclppSocketListen(&listenSock)); + + TRACE(MSCCLPP_INIT, "BEGIN"); + /* Receive addresses from all ranks */ + do { + int rank; + res = getRemoteAddresses(&listenSock, rankAddresses, rankAddressesRoot, rank); + if (res != mscclppSuccess) { + WARN("Bootstrap Root : getRemoteAddresses failed"); + break; + } + ++numCollected; + TRACE(MSCCLPP_INIT, "Received connect from rank %d total %d/%d", rank, numCollected, this->nRanks_); + } while (numCollected < this->nRanks_); + TRACE(MSCCLPP_INIT, "COLLECTED ALL %d HANDLES", this->nRanks_); + + // Send the connect handle for the next rank in the AllGather ring + for (int peer = 0; peer < this->nRanks_; ++peer) { + res = sendHandleToPeer(peer, rankAddresses, rankAddressesRoot); + if (res != mscclppSuccess) { + WARN("Bootstrap Root : sendHandleToPeer failed"); + break; + } + } + if (res == 
mscclppSuccess) { + TRACE(MSCCLPP_INIT, "SENT OUT ALL %d HANDLES", this->nRanks_); + } + TRACE(MSCCLPP_INIT, "DONE"); + return res; } mscclppResult_t MscclppBootstrap::Impl::netInit(std::string ipPortPair) @@ -153,413 +290,188 @@ mscclppResult_t MscclppBootstrap::Impl::netInit(std::string ipPortPair) return mscclppSuccess; } -mscclppResult_t MscclppBootstrap::Impl::init(const mscclppComm& comm) +mscclppResult_t MscclppBootstrap::Impl::initialize() { + mscclppSocket* proxySocket; + mscclppSocketAddress nextAddr; + mscclppSocket sock, listenSockRoot; + extInfo info; + + TRACE(MSCCLPP_INIT, "rank %d nranks %d", rank, nranks); + + info.rank = this->rank_; + info.nRanks = this->nRanks_; + + uint64_t magic = this->uniqueId_.magic; + // Create socket for other ranks to contact me + MSCCLPPCHECK( + mscclppSocketInit(&this->listenSock_, &bootstrapNetIfAddr, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); + MSCCLPPCHECK(mscclppSocketListen(&this->listenSock_)); + MSCCLPPCHECK(mscclppSocketGetAddr(&this->listenSock_, &info.extAddressListen)); + + // Create socket for root to contact me + MSCCLPPCHECK( + mscclppSocketInit(&listenSockRoot, &bootstrapNetIfAddr, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); + MSCCLPPCHECK(mscclppSocketListen(&listenSockRoot)); + MSCCLPPCHECK(mscclppSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); + + // stagger connection times to avoid an overload of the root + auto randomSleep = [](int rank) { + struct timespec tv; + tv.tv_sec = rank / 1000; + tv.tv_nsec = 1000000 * (rank % 1000); + TRACE(MSCCLPP_INIT, "rank %d delaying connection to root by %ld msec", rank, rank); + (void)nanosleep(&tv, NULL); + }; + if (this->nRanks_ > 128) { + randomSleep(this->rank_); + } + + // send info on my listening socket to root + MSCCLPPCHECK(mscclppSocketInit(&sock, &this->uniqueId_.addr, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); + MSCCLPPCHECK(mscclppSocketConnect(&sock)); + MSCCLPPCHECK(netSend(&sock, &info, 
sizeof(info))); + MSCCLPPCHECK(mscclppSocketClose(&sock)); + + // get info on my "next" rank in the bootstrap ring from root + MSCCLPPCHECK(mscclppSocketInit(&sock)); + MSCCLPPCHECK(mscclppSocketAccept(&sock, &listenSockRoot)); + MSCCLPPCHECK(netRecv(&sock, &nextAddr, sizeof(union mscclppSocketAddress))); + MSCCLPPCHECK(mscclppSocketClose(&sock)); + MSCCLPPCHECK(mscclppSocketClose(&listenSockRoot)); + + MSCCLPPCHECK( + mscclppSocketInit(&this->ringSendSocket_, &nextAddr, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); + MSCCLPPCHECK(mscclppSocketConnect(&this->ringSendSocket_)); + // Accept the connect request from the previous rank in the AllGather ring + MSCCLPPCHECK(mscclppSocketInit(&this->ringRecvSocket_)); + MSCCLPPCHECK(mscclppSocketAccept(&this->ringRecvSocket_, &this->listenSock_)); + + // AllGather all listen handlers + MSCCLPPCHECK(mscclppSocketGetAddr(&this->listenSock_, &this->peerCommAddresses_[rank_])); + MSCCLPPCHECK(allGather(this->peerCommAddresses_.data(), sizeof(union mscclppSocketAddress))); + + // proxy is aborted through a message; don't set abortFlag + MSCCLPPCHECK(mscclppCalloc(&proxySocket, 1)); + MSCCLPPCHECK(mscclppSocketInit(proxySocket, &bootstrapNetIfAddr, magic, mscclppSocketTypeProxy, this->abortFlag_)); + MSCCLPPCHECK(mscclppSocketListen(proxySocket)); + MSCCLPPCHECK(mscclppSocketGetAddr(proxySocket, &this->peerProxyAddresses_[rank_])); + MSCCLPPCHECK(allGather(this->peerProxyAddresses_.data(), sizeof(union mscclppSocketAddress))); + + TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank, nranks); + return mscclppSuccess; } +mscclppResult_t MscclppBootstrap::Impl::allGather(void* allData, int size) +{ + char* data = static_cast(allData); + int rank = this->rank_; + int nRanks = this->nRanks_; + + TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d", rank, nRanks, size); + + /* Simple ring based AllGather + * At each step i receive data from (rank-i-1) from left + * and send previous step's data from (rank-i) to right + */ + for 
(int i = 0; i < nRanks - 1; i++) { + size_t rSlice = (rank - i - 1 + nRanks) % nRanks; + size_t sSlice = (rank - i + nRanks) % nRanks; + + // Send slice to the right + MSCCLPPCHECK(netSend(&this->ringSendSocket_, data + sSlice * size, size)); + // Recv slice from the left + MSCCLPPCHECK(netRecv(&this->ringRecvSocket_, data + rSlice * size, size)); + } + + TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); + return mscclppSuccess; +} + +mscclppResult_t MscclppBootstrap::Impl::netSend(mscclppSocket* sock, const void* data, int size) +{ + MSCCLPPCHECK(mscclppSocketSend(sock, &size, sizeof(int))); + MSCCLPPCHECK(mscclppSocketSend(sock, const_cast(data), size)); + return mscclppSuccess; +} + +mscclppResult_t MscclppBootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) +{ + int recvSize; + MSCCLPPCHECK(mscclppSocketRecv(sock, &recvSize, sizeof(int))); + if (recvSize > size) { + WARN("Message truncated : received %d bytes instead of %d", recvSize, size); + return mscclppInternalError; + } + MSCCLPPCHECK(mscclppSocketRecv(sock, data, std::min(recvSize, size))); + return mscclppSuccess; +} + +mscclppResult_t MscclppBootstrap::Impl::send(void* data, int size, int peer, int tag) +{ + mscclppSocket sock; + MSCCLPPCHECK(mscclppSocketInit(&sock, &this->peerCommAddresses_[peer], this->uniqueId_.magic, + mscclppSocketTypeBootstrap, this->abortFlag_)); + MSCCLPPCHECK(mscclppSocketConnect(&sock)); + MSCCLPPCHECK(netSend(&sock, &this->rank_, sizeof(int))); + MSCCLPPCHECK(netSend(&sock, &tag, sizeof(int))); + MSCCLPPCHECK(netSend(&sock, data, size)); + + MSCCLPPCHECK(mscclppSocketClose(&sock)); +} + MscclppBootstrap::MscclppBootstrap(std::string ipPortPair, int rank, int nRanks) { - pimpl = std::make_unique(ipPortPair, rank, nRanks, mscclppBootstrapHandle{0}); + pimpl_ = std::make_unique(ipPortPair, rank, nRanks, mscclppBootstrapHandle{0}); } MscclppBootstrap::MscclppBootstrap(mscclppBootstrapHandle handle, int rank, int nRanks) { - pimpl = 
std::make_unique("", rank, nRanks, handle); + pimpl_ = std::make_unique("", rank, nRanks, handle); } -MscclppBootstrap::UniqueId MscclppBootstrap::getUniqueId() +MscclppBootstrap::UniqueId MscclppBootstrap::GetUniqueId() { - return pimpl->uniqueId_; + return pimpl_->uniqueId_; } -// void MscclppBootstrap::Impl::bootstrapRoot(mscclppSocket* listenSock, uint64_t magic, int nRanks) -// { -// extInfo info; -// mscclppResult_t res = mscclppSuccess; -// int numCollected = 0; -// std::vector rankAddresses(nRanks, mscclppSocketAddress()); -// // for initial rank <-> root information exchange -// std::vector rankAddressesRoot(nRanks, mscclppSocketAddress()); +void MscclppBootstrap::Send(void* data, int size, int peer, int tag) +{ + mscclppResult_t res = pimpl_->send(data, size, peer, tag); + if (res != mscclppSuccess) { + throw std::runtime_error("MscclppBootstrap::Send failed"); + } +} -// mscclppSocketAddress zero; -// std::memset(rankAddresses.data(), 0, sizeof(mscclppSocketAddress) * nRanks); -// std::memset(rankAddressesRoot.data(), 0, sizeof(mscclppSocketAddress) * nRanks); -// std::memset(&zero, 0, sizeof(mscclppSocketAddress)); -// setFilesLimit(); +void MscclppBootstrap::Recv(void* data, int size, int peer, int tag) +{ + mscclppResult_t res = pimpl_->recv(data, size, peer, tag); + if (res != mscclppSuccess) { + throw std::runtime_error("MscclppBootstrap::Recv failed"); + } +} -// TRACE(MSCCLPP_INIT, "BEGIN"); -// /* Receive addresses from all ranks */ -// do { -// mscclppSocket sock; -// MSCCLPPCHECKGOTO(mscclppSocketInit(&sock), res, out); -// MSCCLPPCHECKGOTO(mscclppSocketAccept(&sock, listenSock), res, out); -// MSCCLPPCHECKGOTO(NetRecv(&sock, &info, sizeof(info)), res, out); -// MSCCLPPCHECKGOTO(mscclppSocketClose(&sock), res, out); +void MscclppBootstrap::AllGather(void* allData, int size) +{ + mscclppResult_t res = pimpl_->allGather(allData, size); + if (res != mscclppSuccess) { + throw std::runtime_error("MscclppBootstrap::AllGather failed"); + } +} -// if 
(nRanks != info.nRanks) { -// WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nRanks, info.nRanks); -// return; -// } - -// if (std::memcmp(&zero, &rankAddressesRoot[info.rank], sizeof(mscclppSocketAddress)) != 0) { -// WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nRanks); -// return; -// } - -// // Save the connection handle for that rank -// rankAddressesRoot[info.rank] = info.extAddressListenRoot; -// rankAddresses[info.rank] = info.extAddressListen; - -// ++numCollected; -// TRACE(MSCCLPP_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks); -// } while (numCollected < nRanks); -// TRACE(MSCCLPP_INIT, "COLLECTED ALL %d HANDLES", nranks); - -// // Send the connect handle for the next rank in the AllGather ring -// for (int r = 0; r < nRanks; ++r) { -// int next = (r + 1) % nRanks; -// mscclppSocket sock; -// MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, &rankAddressesRoot[r], magic, mscclppSocketTypeBootstrap), res, out); -// MSCCLPPCHECKGOTO(mscclppSocketConnect(&sock), res, out); -// MSCCLPPCHECKGOTO(NetSend(&sock, &rankAddresses[next], sizeof(mscclppSocketAddress)), res, out); -// MSCCLPPCHECKGOTO(mscclppSocketClose(&sock), res, out); -// } -// TRACE(MSCCLPP_INIT, "SENT OUT ALL %d HANDLES", nRanks); - -// out: -// if (listenSock != nullptr) { -// mscclppSocketClose(listenSock); -// free(listenSock); -// } -// TRACE(MSCCLPP_INIT, "DONE"); -// } - -// mscclppResult_t MscclppBootstrap::Impl::createRoot(mscclppBootstrap::UniqueId& handle) -// { -// MSCCLPPCHECK(mscclppSocketInit(&this->listenSock, &handle.addr, handle.magic, mscclppSocketTypeBootstrap, NULL, 0)); -// MSCCLPPCHECK(mscclppSocketListen(&this->listenSock)); -// MSCCLPPCHECK(mscclppSocketGetAddr(&this->listenSock, &handle.addr)); - -// std::thread thread(BootstrapRoot, listenSock, handle.magic, nRanks); -// mscclppSetThreadName(thread.native_handle(), "MSCCLPP BootstrapR"); -// thread.detach(); -// return mscclppSuccess; -// } - -// 
// Additional sync functions -// mscclppResult_t MscclppBootstrap::Impl::netSend(mscclppSocket* sock, void* data, int size) -// { -// MSCCLPPCHECK(mscclppSocketSend(sock, &size, sizeof(int))); -// MSCCLPPCHECK(mscclppSocketSend(sock, data, size)); -// return mscclppSuccess; -// } - -// mscclppResult_t MscclppBootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) -// { -// int recvSize; -// MSCCLPPCHECK(mscclppSocketRecv(sock, &recvSize, sizeof(int))); -// if (recvSize > size) { -// WARN("Message truncated : received %d bytes instead of %d", recvSize, size); -// return mscclppInternalError; -// } -// MSCCLPPCHECK(mscclppSocketRecv(sock, data, std::min(recvSize, size))); -// return mscclppSuccess; -// } - -// mscclppResult_t MscclppBootstrap::Impl::init(const mscclppComm& comm) -// { -// this->rank = comm.rank; -// this->nRanks = comm.nRanks; - -// mscclppSocket* proxySocket; -// mscclppSocketAddress nextAddr; -// mscclppSocket sock, listenSockRoot; -// extInfo info; - -// TRACE(MSCCLPP_INIT, "rank %d nranks %d", rank, nranks); - -// info.rank = rank; -// info.nRanks = this->nRanks; - -// uint64_t magic = this->handle.magic; -// // Create socket for other ranks to contact me -// MSCCLPPCHECK( -// mscclppSocketInit(&this->listenSock, &bootstrapNetIfAddr, magic, mscclppSocketTypeBootstrap, this->abortFlag)); -// MSCCLPPCHECK(mscclppSocketListen(&this->listenSock)); -// MSCCLPPCHECK(mscclppSocketGetAddr(&this->listenSock, &info.extAddressListen)); - -// // Create socket for root to contact me -// MSCCLPPCHECK( -// mscclppSocketInit(&listenSockRoot, &bootstrapNetIfAddr, magic, mscclppSocketTypeBootstrap, this->abortFlag)); -// MSCCLPPCHECK(mscclppSocketListen(&listenSockRoot)); -// MSCCLPPCHECK(mscclppSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); - -// // stagger connection times to avoid an overload of the root -// if (this->nRanks > 128) { -// long msec = rank; -// struct timespec tv; -// tv.tv_sec = msec / 1000; -// tv.tv_nsec = 1000000 * 
(msec % 1000); -// TRACE(MSCCLPP_INIT, "rank %d delaying connection to root by %ld msec", rank, msec); -// (void)nanosleep(&tv, NULL); -// } - -// // send info on my listening socket to root -// MSCCLPPCHECK(mscclppSocketInit(&sock, &this->handle.addr, magic, mscclppSocketTypeBootstrap, this->abortFlag)); -// MSCCLPPCHECK(mscclppSocketConnect(&sock)); -// MSCCLPPCHECK(NetSend(&sock, &info, sizeof(info))); -// MSCCLPPCHECK(mscclppSocketClose(&sock)); - -// // get info on my "next" rank in the bootstrap ring from root -// MSCCLPPCHECK(mscclppSocketInit(&sock)); -// MSCCLPPCHECK(mscclppSocketAccept(&sock, &listenSockRoot)); -// MSCCLPPCHECK(NetRecv(&sock, &nextAddr, sizeof(union mscclppSocketAddress))); -// MSCCLPPCHECK(mscclppSocketClose(&sock)); -// MSCCLPPCHECK(mscclppSocketClose(&listenSockRoot)); - -// MSCCLPPCHECK( -// mscclppSocketInit(&this->ringSendSocket, &nextAddr, magic, mscclppSocketTypeBootstrap, this->abortFlag)); -// MSCCLPPCHECK(mscclppSocketConnect(&this->ringSendSocket)); -// // Accept the connect request from the previous rank in the AllGather ring -// MSCCLPPCHECK(mscclppSocketInit(&this->ringRecvSocket)); -// MSCCLPPCHECK(mscclppSocketAccept(&this->ringRecvSocket, &this->listenSock)); - -// // AllGather all listen handlers -// MSCCLPPCHECK(mscclppCalloc(&this->peerCommAddresses, this->nRanks)); -// MSCCLPPCHECK(mscclppSocketGetAddr(&this->listenSock, this->peerCommAddresses + rank)); -// MSCCLPPCHECK(bootstrapAllGather(state, this->peerCommAddresses, sizeof(union mscclppSocketAddress))); - -// // Create the service proxy -// MSCCLPPCHECK(mscclppCalloc(&this->peerProxyAddresses, this->nRanks)); - -// // proxy is aborted through a message; don't set abortFlag -// MSCCLPPCHECK(mscclppCalloc(&proxySocket, 1)); -// MSCCLPPCHECK( -// mscclppSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeProxy, comm->abortFlag)); -// MSCCLPPCHECK(mscclppSocketListen(proxySocket)); -// MSCCLPPCHECK(mscclppSocketGetAddr(proxySocket, 
&this->peerProxyAddresses[rank])); -// MSCCLPPCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union mscclppSocketAddress))); - -// TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank, nranks); - -// return mscclppSuccess; -// } - -// mscclppResult_t MscclppBootstrap::Impl::allGather(void* allData, int size) -// { -// char* data = static_cast(allData); -// int rank = this->rank; -// int nRanks = this->nRanks; - -// TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d", rank, nRanks, size); - -// /* Simple ring based AllGather -// * At each step i receive data from (rank-i-1) from left -// * and send previous step's data from (rank-i) to right -// */ -// for (int i = 0; i < nRanks - 1; i++) { -// size_t rSlice = (rank - i - 1 + nRanks) % nRanks; -// size_t sSlice = (rank - i + nRanks) % nRanks; - -// // Send slice to the right -// MSCCLPPCHECK(NetSend(&this->ringSendSocket, data + sSlice * size, size)); -// // Recv slice from the left -// MSCCLPPCHECK(bootstrapNetRecv(&this->ringRecvSocket, data + rSlice * size, size)); -// } - -// TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); -// return mscclppSuccess; -// } - -// mscclppResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) -// { -// mscclppResult_t ret = mscclppSuccess; -// struct bootstrapState* state = (struct bootstrapState*)commState; -// struct mscclppSocket sock; - -// MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, state->peerCommAddresses + peer, state->magic, mscclppSocketTypeBootstrap, -// state->abortFlag), -// ret, fail); -// MSCCLPPCHECKGOTO(mscclppSocketConnect(&sock), ret, fail); -// MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail); -// MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail); -// MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, fail); - -// exit: -// MSCCLPPCHECK(mscclppSocketClose(&sock)); -// return ret; -// fail: -// goto exit; -// } - -// mscclppResult_t 
bootstrapBarrier(void* commState, int* ranks, int rank, int nranks, int tag) -// { -// if (nranks == 1) -// return mscclppSuccess; -// TRACE(MSCCLPP_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag); - -// /* Simple intra process barrier -// * -// * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet, -// * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988" -// */ -// int data[1]; -// for (int mask = 1; mask < nranks; mask <<= 1) { -// int src = (rank - mask + nranks) % nranks; -// int dst = (rank + mask) % nranks; -// MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], tag, data, sizeof(data))); -// MSCCLPPCHECK(bootstrapRecv(commState, ranks[src], tag, data, sizeof(data))); -// } - -// TRACE(MSCCLPP_INIT, "rank %d nranks %d tag %x - DONE", rank, nranks, tag); -// return mscclppSuccess; -// } - -// mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int* ranks, int rank, int nranks, void* allData, int size) -// { -// if (nranks == 1) -// return mscclppSuccess; -// char* data = (char*)allData; -// TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - ENTER", rank, nranks, size); - -// for (int i = 1; i < nranks; i++) { -// int src = (rank - i + nranks) % nranks; -// int dst = (rank + i) % nranks; -// MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], /*tag=*/i, data + rank * size, size)); -// MSCCLPPCHECK(bootstrapRecv(commState, ranks[src], /*tag=*/i, data + src * size, size)); -// } - -// TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); -// return mscclppSuccess; -// } - -// mscclppResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock) -// { -// // New unex -// struct unexConn* unex; -// MSCCLPPCHECK(mscclppCalloc(&unex, 1)); -// unex->peer = peer; -// unex->tag = tag; -// memcpy(&unex->sock, sock, sizeof(struct mscclppSocket)); - -// // Enqueue -// struct unexConn* list = 
state->unexpectedConnections; -// if (list == NULL) { -// state->unexpectedConnections = unex; -// return mscclppSuccess; -// } -// while (list->next) -// list = list->next; -// list->next = unex; -// return mscclppSuccess; -// } - -// mscclppResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock, -// int* found) -// { -// struct unexConn* elem = state->unexpectedConnections; -// struct unexConn* prev = NULL; -// *found = 0; -// while (elem) { -// if (elem->peer == peer && elem->tag == tag) { -// if (prev == NULL) { -// state->unexpectedConnections = elem->next; -// } else { -// prev->next = elem->next; -// } -// memcpy(sock, &elem->sock, sizeof(struct mscclppSocket)); -// free(elem); -// *found = 1; -// return mscclppSuccess; -// } -// prev = elem; -// elem = elem->next; -// } -// return mscclppSuccess; -// } - -// static void unexpectedFree(struct bootstrapState* state) -// { -// struct unexConn* elem = state->unexpectedConnections; -// struct unexConn* prev = NULL; - -// while (elem) { -// prev = elem; -// elem = elem->next; -// free(prev); -// } -// return; -// } - -// // We can't know who we'll receive from, so we need to receive everything at once -// mscclppResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) -// { -// mscclppResult_t ret = mscclppSuccess; -// struct bootstrapState* state = (struct bootstrapState*)commState; -// struct mscclppSocket sock; -// int newPeer, newTag; - -// // Search unexpected connections first -// int found; -// MSCCLPPCHECK(unexpectedDequeue(state, peer, tag, &sock, &found)); -// if (found) { -// MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail); -// goto exit; -// } - -// // Then look for new connections -// while (1) { -// MSCCLPPCHECKGOTO(mscclppSocketInit(&sock), ret, fail); -// MSCCLPPCHECKGOTO(mscclppSocketAccept(&sock, &state->listenSock), ret, fail); -// MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, &newPeer, sizeof(int)), 
ret, fail); -// MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, &newTag, sizeof(int)), ret, fail); -// if (newPeer == peer && newTag == tag) { -// MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail); -// goto exit; -// } -// // Unexpected connection. Save for later. -// MSCCLPPCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, &sock), ret, fail); -// } -// exit: -// MSCCLPPCHECK(mscclppSocketClose(&sock)); -// return ret; -// fail: -// goto exit; -// } - -// mscclppResult_t bootstrapClose(void* commState) -// { -// struct bootstrapState* state = (struct bootstrapState*)commState; -// if (state->unexpectedConnections != nullptr) { -// unexpectedFree(state); -// if (*state->abortFlag == 0) { -// WARN("Unexpected connections are not empty"); -// return mscclppInternalError; -// } -// } - -// MSCCLPPCHECK(mscclppSocketClose(&state->listenSock)); -// MSCCLPPCHECK(mscclppSocketClose(&state->ringSendSocket)); -// MSCCLPPCHECK(mscclppSocketClose(&state->ringRecvSocket)); - -// free(state->peerCommAddresses); -// free(state); - -// return mscclppSuccess; -// } - -// mscclppResult_t bootstrapAbort(void* commState) -// { -// struct bootstrapState* state = (struct bootstrapState*)commState; -// if (commState == nullptr) -// return mscclppSuccess; -// MSCCLPPCHECK(mscclppSocketClose(&state->listenSock)); -// MSCCLPPCHECK(mscclppSocketClose(&state->ringSendSocket)); -// MSCCLPPCHECK(mscclppSocketClose(&state->ringRecvSocket)); -// free(state->peerCommAddresses); -// free(state->peerProxyAddresses); -// free(state); -// return mscclppSuccess; -// } +void MscclppBootstrap::Initialize() +{ + mscclppResult_t res = pimpl_->initialize(); + if (res != mscclppSuccess) { + throw std::runtime_error("MscclppBootstrap::Initialize failed"); + } +} +void MscclppBootstrap::Barrier() +{ + mscclppResult_t res = pimpl_->barrier(); + if (res != mscclppSuccess) { + throw std::runtime_error("MscclppBootstrap::Barrier failed"); + } +} diff --git a/src/bootstrap/socket.cc 
b/src/bootstrap/socket.cc index b3998d91..4241390d 100644 --- a/src/bootstrap/socket.cc +++ b/src/bootstrap/socket.cc @@ -769,7 +769,7 @@ exit: return ret; } -mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocketAddress* addr, uint64_t magic, +mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, const mscclppSocketAddress* addr, uint64_t magic, enum mscclppSocketType type, volatile uint32_t* abortFlag, int asyncFlag) { mscclppResult_t ret = mscclppSuccess; diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 246a380e..afe6eca3 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -18,18 +18,18 @@ class MscclppBootstrap : Bootstrap { public: MscclppBootstrap(std::string ipPortPair, int rank, int nRanks); MscclppBootstrap(mscclppBootstrapHandle handle, int rank, int nRanks); - void Initialize(const mscclppComm& comm); + void Initialize(); void Send(void* data, int size, int peer, int tag); void Recv(void* data, int size, int peer, int tag); void AllGather(void* allData, int size); void Barrier(); void Close(); struct UniqueId; - UniqueId getUniqueId(); + UniqueId GetUniqueId(); private: class Impl; - std::unique_ptr pimpl; + std::unique_ptr pimpl_; }; mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle); diff --git a/src/include/socket.h b/src/include/socket.h index 556c6bb8..53bdd98d 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -75,7 +75,7 @@ int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* l int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress* ifAddrs, int ifNameMaxSize, int maxIfs); // Initialize a socket -mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocketAddress* addr = NULL, +mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, const mscclppSocketAddress* addr = NULL, uint64_t magic = MSCCLPP_SOCKET_MAGIC, enum mscclppSocketType type = mscclppSocketTypeUnknown, volatile 
uint32_t* abortFlag = NULL, int asyncFlag = 0); From 7e1a77a132bb749af4b408bf218e00a614efb8c6 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 21 Apr 2023 09:41:52 +0000 Subject: [PATCH 037/135] make build pass --- src/bootstrap/bootstrap.cc | 600 ++++++++++++++++++++++++++++++++++++- src/include/bootstrap.h | 11 +- src/include/comm.h | 2 +- src/init.cc | 3 + 4 files changed, 598 insertions(+), 18 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 84d6d25b..0dac4e2f 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -64,9 +64,6 @@ struct extInfo class MscclppBootstrap::Impl { public: - static char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1]; - static union mscclppSocketAddress bootstrapNetIfAddr; - Impl(std::string ipPortPair, int rank, int nRanks, const mscclppBootstrapHandle handle); ~Impl(); mscclppResult_t initialize(); @@ -74,6 +71,7 @@ public: mscclppResult_t send(void* data, int size, int peer, int tag); mscclppResult_t recv(void* data, int size, int peer, int tag); mscclppResult_t barrier(); + mscclppResult_t close(); MscclppBootstrap::UniqueId uniqueId_; @@ -88,6 +86,8 @@ private: std::queue unexpectedConnections_; volatile uint32_t* abortFlag_; std::thread rootThread_; + char netIfName_[MAX_IF_NAME_SIZE + 1]; + union mscclppSocketAddress netIfAddr_; static mscclppResult_t netSend(mscclppSocket* sock, const void* data, int size); static mscclppResult_t netRecv(mscclppSocket* sock, void* data, int size); @@ -124,9 +124,9 @@ MscclppBootstrap::Impl::Impl(std::string ipPortPair, int rank, int nRanks, const throw std::runtime_error("getting random data failed"); } } - std::memcpy(&uniqueId_.addr, &bootstrapNetIfAddr, sizeof(union mscclppSocketAddress)); + std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(union mscclppSocketAddress)); if (rank_ == 0) { - rootThread_ = std::thread(&MscclppBootstrap::Impl::bootstrapRoot, this, &listenSock_, uniqueId_.magic, nRanks_); + rootThread_ = 
std::thread(&MscclppBootstrap::Impl::bootstrapRoot, this); } } @@ -270,13 +270,13 @@ mscclppResult_t MscclppBootstrap::Impl::netInit(std::string ipPortPair) WARN("Invalid MSCCLPP_COMM_ID, please use format: : or []: or :"); return mscclppInvalidArgument; } - if (mscclppFindInterfaceMatchSubnet(this->bootstrapNetIfName, &this->bootstrapNetIfAddr, &remoteAddr, - MAX_IF_NAME_SIZE, 1) <= 0) { + if (mscclppFindInterfaceMatchSubnet(netIfName_, &netIfAddr_, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { WARN("NET/Socket : No usable listening interface found"); return mscclppSystemError; } } else { - int ret = mscclppFindInterfaces(this->bootstrapNetIfName, &this->bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); + int ret = + mscclppFindInterfaces(netIfName_, &netIfAddr_, MAX_IF_NAME_SIZE, 1); if (ret <= 0) { WARN("Bootstrap : no socket interface found"); return mscclppInternalError; @@ -284,8 +284,8 @@ mscclppResult_t MscclppBootstrap::Impl::netInit(std::string ipPortPair) } char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2]; - std::sprintf(line, " %s:", bootstrapNetIfName); - mscclppSocketToString(&bootstrapNetIfAddr, line + strlen(line)); + std::sprintf(line, " %s:", netIfName_); + mscclppSocketToString(&netIfAddr_, line + strlen(line)); INFO(MSCCLPP_INIT, "Bootstrap : Using%s", line); return mscclppSuccess; } @@ -305,13 +305,13 @@ mscclppResult_t MscclppBootstrap::Impl::initialize() uint64_t magic = this->uniqueId_.magic; // Create socket for other ranks to contact me MSCCLPPCHECK( - mscclppSocketInit(&this->listenSock_, &bootstrapNetIfAddr, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); + mscclppSocketInit(&this->listenSock_, &netIfAddr_, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); MSCCLPPCHECK(mscclppSocketListen(&this->listenSock_)); MSCCLPPCHECK(mscclppSocketGetAddr(&this->listenSock_, &info.extAddressListen)); // Create socket for root to contact me MSCCLPPCHECK( - mscclppSocketInit(&listenSockRoot, &bootstrapNetIfAddr, magic, mscclppSocketTypeBootstrap, 
this->abortFlag_)); + mscclppSocketInit(&listenSockRoot, &netIfAddr_, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); MSCCLPPCHECK(mscclppSocketListen(&listenSockRoot)); MSCCLPPCHECK(mscclppSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); @@ -353,7 +353,7 @@ mscclppResult_t MscclppBootstrap::Impl::initialize() // proxy is aborted through a message; don't set abortFlag MSCCLPPCHECK(mscclppCalloc(&proxySocket, 1)); - MSCCLPPCHECK(mscclppSocketInit(proxySocket, &bootstrapNetIfAddr, magic, mscclppSocketTypeProxy, this->abortFlag_)); + MSCCLPPCHECK(mscclppSocketInit(proxySocket, &netIfAddr_, magic, mscclppSocketTypeProxy, this->abortFlag_)); MSCCLPPCHECK(mscclppSocketListen(proxySocket)); MSCCLPPCHECK(mscclppSocketGetAddr(proxySocket, &this->peerProxyAddresses_[rank_])); MSCCLPPCHECK(allGather(this->peerProxyAddresses_.data(), sizeof(union mscclppSocketAddress))); @@ -419,6 +419,22 @@ mscclppResult_t MscclppBootstrap::Impl::send(void* data, int size, int peer, int MSCCLPPCHECK(netSend(&sock, data, size)); MSCCLPPCHECK(mscclppSocketClose(&sock)); + return mscclppSuccess; +} + +mscclppResult_t MscclppBootstrap::Impl::recv(void* data, int size, int peer, int tag) +{ + return mscclppSuccess; +} + +mscclppResult_t MscclppBootstrap::Impl::barrier() +{ + return mscclppSuccess; +} + +mscclppResult_t MscclppBootstrap::Impl::close() +{ + return mscclppSuccess; } MscclppBootstrap::MscclppBootstrap(std::string ipPortPair, int rank, int nRanks) @@ -475,3 +491,561 @@ void MscclppBootstrap::Barrier() throw std::runtime_error("MscclppBootstrap::Barrier failed"); } } + +void MscclppBootstrap::Close() +{ + mscclppResult_t res = pimpl_->close(); + if (res != mscclppSuccess) { + throw std::runtime_error("MscclppBootstrap::Close failed"); + } +} + + + +// ------------------- Old bootstrap functions ------------------- +struct bootstrapRootArgs +{ + struct mscclppSocket* listenSock; + uint64_t magic; +}; + +/* Init functions */ +static char 
bootstrapNetIfName[MAX_IF_NAME_SIZE + 1]; +static union mscclppSocketAddress bootstrapNetIfAddr; +static int bootstrapNetInitDone = 0; +pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; + +mscclppResult_t bootstrapNetInit(const char* ip_port_pair) +{ + if (bootstrapNetInitDone == 0) { + pthread_mutex_lock(&bootstrapNetLock); + if (bootstrapNetInitDone == 0) { + const char* env; + if (ip_port_pair) { + env = ip_port_pair; + } else { + env = getenv("MSCCLPP_COMM_ID"); + } + if (env) { + union mscclppSocketAddress remoteAddr; + if (mscclppSocketGetAddrFromString(&remoteAddr, env) != mscclppSuccess) { + WARN("Invalid MSCCLPP_COMM_ID, please use format: : or []: or :"); + return mscclppInvalidArgument; + } + if (mscclppFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, + 1) <= 0) { + WARN("NET/Socket : No usable listening interface found"); + return mscclppSystemError; + } + } else { + int nIfs = mscclppFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); + if (nIfs <= 0) { + WARN("Bootstrap : no socket interface found"); + return mscclppInternalError; + } + } + char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2]; + sprintf(line, " %s:", bootstrapNetIfName); + mscclppSocketToString(&bootstrapNetIfAddr, line + strlen(line)); + INFO(MSCCLPP_INIT, "Bootstrap : Using%s", line); + bootstrapNetInitDone = 1; + } + pthread_mutex_unlock(&bootstrapNetLock); + } + return mscclppSuccess; +} + +// Additional sync functions +static mscclppResult_t bootstrapNetSend(struct mscclppSocket* sock, void* data, int size) +{ + MSCCLPPCHECK(mscclppSocketSend(sock, &size, sizeof(int))); + MSCCLPPCHECK(mscclppSocketSend(sock, data, size)); + return mscclppSuccess; +} +static mscclppResult_t bootstrapNetRecv(struct mscclppSocket* sock, void* data, int size) +{ + int recvSize; + MSCCLPPCHECK(mscclppSocketRecv(sock, &recvSize, sizeof(int))); + if (recvSize > size) { + WARN("Message truncated : received %d bytes 
instead of %d", recvSize, size); + return mscclppInternalError; + } + MSCCLPPCHECK(mscclppSocketRecv(sock, data, std::min(recvSize, size))); + return mscclppSuccess; +} + +// struct extInfo +// { +// int rank; +// int nranks; +// union mscclppSocketAddress extAddressListenRoot; +// union mscclppSocketAddress extAddressListen; +// }; + +#include + +// static mscclppResult_t setFilesLimit() +// { +// struct rlimit filesLimit; +// SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit"); +// filesLimit.rlim_cur = filesLimit.rlim_max; +// SYSCHECK(setrlimit(RLIMIT_NOFILE, &filesLimit), "setrlimit"); +// return mscclppSuccess; +// } + +static void* bootstrapRoot(void* rargs) +{ + struct bootstrapRootArgs* args = (struct bootstrapRootArgs*)rargs; + struct mscclppSocket* listenSock = args->listenSock; + uint64_t magic = args->magic; + mscclppResult_t res = mscclppSuccess; + int nranks = 0, c = 0; + struct extInfo info; + union mscclppSocketAddress* rankAddresses = NULL; + union mscclppSocketAddress* rankAddressesRoot = NULL; // for initial rank <-> root information exchange + union mscclppSocketAddress* zero = NULL; + MSCCLPPCHECKGOTO(mscclppCalloc(&zero, 1), res, out); + setFilesLimit(); + + TRACE(MSCCLPP_INIT, "BEGIN"); + /* Receive addresses from all ranks */ + do { + struct mscclppSocket sock; + MSCCLPPCHECKGOTO(mscclppSocketInit(&sock), res, out); + MSCCLPPCHECKGOTO(mscclppSocketAccept(&sock, listenSock), res, out); + MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out); + MSCCLPPCHECKGOTO(mscclppSocketClose(&sock), res, out); + + if (c == 0) { + nranks = info.nRanks; + MSCCLPPCHECKGOTO(mscclppCalloc(&rankAddresses, nranks), res, out); + MSCCLPPCHECKGOTO(mscclppCalloc(&rankAddressesRoot, nranks), res, out); + } + + if (nranks != info.nRanks) { + WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nRanks); + goto out; + } + + if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union mscclppSocketAddress)) != 0) { 
+ WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks); + goto out; + } + + // Save the connection handle for that rank + memcpy(rankAddressesRoot + info.rank, &info.extAddressListenRoot, sizeof(union mscclppSocketAddress)); + memcpy(rankAddresses + info.rank, &info.extAddressListen, sizeof(union mscclppSocketAddress)); + + ++c; + TRACE(MSCCLPP_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks); + } while (c < nranks); + TRACE(MSCCLPP_INIT, "COLLECTED ALL %d HANDLES", nranks); + + // Send the connect handle for the next rank in the AllGather ring + for (int r = 0; r < nranks; ++r) { + int next = (r + 1) % nranks; + struct mscclppSocket sock; + MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, rankAddressesRoot + r, magic, mscclppSocketTypeBootstrap), res, out); + MSCCLPPCHECKGOTO(mscclppSocketConnect(&sock), res, out); + MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, rankAddresses + next, sizeof(union mscclppSocketAddress)), res, out); + MSCCLPPCHECKGOTO(mscclppSocketClose(&sock), res, out); + } + TRACE(MSCCLPP_INIT, "SENT OUT ALL %d HANDLES", nranks); + +out: + if (listenSock != NULL) { + mscclppSocketClose(listenSock); + free(listenSock); + } + if (rankAddresses) + free(rankAddresses); + if (rankAddressesRoot) + free(rankAddressesRoot); + if (zero) + free(zero); + free(rargs); + + TRACE(MSCCLPP_INIT, "DONE"); + return NULL; +} + +mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle) +{ + struct mscclppSocket* listenSock; + struct bootstrapRootArgs* args; + pthread_t thread; + + MSCCLPPCHECK(mscclppCalloc(&listenSock, 1)); + MSCCLPPCHECK(mscclppSocketInit(listenSock, &handle->addr, handle->magic, mscclppSocketTypeBootstrap, NULL, 0)); + MSCCLPPCHECK(mscclppSocketListen(listenSock)); + MSCCLPPCHECK(mscclppSocketGetAddr(listenSock, &handle->addr)); + + MSCCLPPCHECK(mscclppCalloc(&args, 1)); + args->listenSock = listenSock; + args->magic = handle->magic; + NEQCHECK(pthread_create(&thread, NULL, 
bootstrapRoot, (void*)args), 0); + mscclppSetThreadName(thread, "MSCCLPP BootstrapR"); + NEQCHECK(pthread_detach(thread), 0); // will not be pthread_join()'d + return mscclppSuccess; +} + +// #include +// #include + +mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot, const char* ip_port_pair) +{ + memset(handle, 0, sizeof(mscclppBootstrapHandle)); + const char* env = NULL; + + if (ip_port_pair) { + env = ip_port_pair; + } else { + env = getenv("MSCCLPP_COMM_ID"); + } + if (env) { + handle->magic = 0xdeadbeef; + + INFO(MSCCLPP_ENV, "MSCCLPP_COMM_ID set by environment to %s", env); + if (mscclppSocketGetAddrFromString(&handle->addr, env) != mscclppSuccess) { + WARN("Invalid MSCCLPP_COMM_ID, please use format: : or []: or :"); + return mscclppInvalidArgument; + } + if (isRoot) + MSCCLPPCHECK(bootstrapCreateRoot(handle)); + } else { + MSCCLPPCHECK(getRandomData(&handle->magic, sizeof(handle->magic))); + memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union mscclppSocketAddress)); + MSCCLPPCHECK(bootstrapCreateRoot(handle)); + } + // printf("addr = %s port = %d\n", inet_ntoa(handle->addr.sin.sin_addr), (int)ntohs(handle->addr.sin.sin_port)); + // printf("addr = %s\n", inet_ntoa((*(struct sockaddr_in*)&handle->addr.sa).sin_addr)); + + return mscclppSuccess; +} + +struct unexConn +{ + int peer; + int tag; + struct mscclppSocket sock; + struct unexConn* next; +}; + +struct bootstrapState +{ + struct mscclppSocket listenSock; + struct mscclppSocket ringRecvSocket; + struct mscclppSocket ringSendSocket; + union mscclppSocketAddress* peerCommAddresses; + union mscclppSocketAddress* peerProxyAddresses; + struct unexConn* unexpectedConnections; + int cudaDev; + int rank; + int nranks; + uint64_t magic; + volatile uint32_t* abortFlag; +}; + +mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscclppComm* comm) +{ + int rank = comm->rank; + int nranks = comm->nRanks; + struct bootstrapState* state; + struct 
mscclppSocket* proxySocket; + mscclppSocketAddress nextAddr; + struct mscclppSocket sock, listenSockRoot; + struct extInfo info; + + MSCCLPPCHECK(mscclppCalloc(&state, 1)); + state->rank = rank; + state->nranks = nranks; + state->abortFlag = comm->abortFlag; + comm->bootstrap = state; + comm->magic = state->magic = handle->magic; + + TRACE(MSCCLPP_INIT, "rank %d nranks %d", rank, nranks); + + info.rank = rank; + info.nRanks = nranks; + + // Create socket for other ranks to contact me + MSCCLPPCHECK(mscclppSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeBootstrap, + comm->abortFlag)); + MSCCLPPCHECK(mscclppSocketListen(&state->listenSock)); + MSCCLPPCHECK(mscclppSocketGetAddr(&state->listenSock, &info.extAddressListen)); + + // Create socket for root to contact me + MSCCLPPCHECK( + mscclppSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag)); + MSCCLPPCHECK(mscclppSocketListen(&listenSockRoot)); + MSCCLPPCHECK(mscclppSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); + + // stagger connection times to avoid an overload of the root + if (nranks > 128) { + long msec = rank; + struct timespec tv; + tv.tv_sec = msec / 1000; + tv.tv_nsec = 1000000 * (msec % 1000); + TRACE(MSCCLPP_INIT, "rank %d delaying connection to root by %ld msec", rank, msec); + (void)nanosleep(&tv, NULL); + } + + // send info on my listening socket to root + MSCCLPPCHECK(mscclppSocketInit(&sock, &handle->addr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag)); + MSCCLPPCHECK(mscclppSocketConnect(&sock)); + MSCCLPPCHECK(bootstrapNetSend(&sock, &info, sizeof(info))); + MSCCLPPCHECK(mscclppSocketClose(&sock)); + + // get info on my "next" rank in the bootstrap ring from root + MSCCLPPCHECK(mscclppSocketInit(&sock)); + MSCCLPPCHECK(mscclppSocketAccept(&sock, &listenSockRoot)); + MSCCLPPCHECK(bootstrapNetRecv(&sock, &nextAddr, sizeof(union mscclppSocketAddress))); + 
MSCCLPPCHECK(mscclppSocketClose(&sock)); + MSCCLPPCHECK(mscclppSocketClose(&listenSockRoot)); + + MSCCLPPCHECK( + mscclppSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag)); + MSCCLPPCHECK(mscclppSocketConnect(&state->ringSendSocket)); + // Accept the connect request from the previous rank in the AllGather ring + MSCCLPPCHECK(mscclppSocketInit(&state->ringRecvSocket)); + MSCCLPPCHECK(mscclppSocketAccept(&state->ringRecvSocket, &state->listenSock)); + + // AllGather all listen handlers + MSCCLPPCHECK(mscclppCalloc(&state->peerCommAddresses, nranks)); + MSCCLPPCHECK(mscclppSocketGetAddr(&state->listenSock, state->peerCommAddresses + rank)); + MSCCLPPCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union mscclppSocketAddress))); + + // Create the service proxy + MSCCLPPCHECK(mscclppCalloc(&state->peerProxyAddresses, nranks)); + + // proxy is aborted through a message; don't set abortFlag + MSCCLPPCHECK(mscclppCalloc(&proxySocket, 1)); + MSCCLPPCHECK( + mscclppSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeProxy, comm->abortFlag)); + MSCCLPPCHECK(mscclppSocketListen(proxySocket)); + MSCCLPPCHECK(mscclppSocketGetAddr(proxySocket, state->peerProxyAddresses + rank)); + MSCCLPPCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union mscclppSocketAddress))); + // MSCCLPPCHECK(mscclppProxyInit(comm, proxySocket, state->peerProxyAddresses)); + + TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank, nranks); + + return mscclppSuccess; +} + +mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size) +{ + struct bootstrapState* state = (struct bootstrapState*)commState; + char* data = (char*)allData; + int rank = state->rank; + int nranks = state->nranks; + + TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d", rank, nranks, size); + + /* Simple ring based AllGather + * At each step i receive data from (rank-i-1) from left + * and send previous step's 
data from (rank-i) to right + */ + for (int i = 0; i < nranks - 1; i++) { + size_t rslice = (rank - i - 1 + nranks) % nranks; + size_t sslice = (rank - i + nranks) % nranks; + + // Send slice to the right + MSCCLPPCHECK(bootstrapNetSend(&state->ringSendSocket, data + sslice * size, size)); + // Recv slice from the left + MSCCLPPCHECK(bootstrapNetRecv(&state->ringRecvSocket, data + rslice * size, size)); + } + + TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); + return mscclppSuccess; +} + +mscclppResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) +{ + mscclppResult_t ret = mscclppSuccess; + struct bootstrapState* state = (struct bootstrapState*)commState; + struct mscclppSocket sock; + + MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, state->peerCommAddresses + peer, state->magic, mscclppSocketTypeBootstrap, + state->abortFlag), + ret, fail); + MSCCLPPCHECKGOTO(mscclppSocketConnect(&sock), ret, fail); + MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail); + MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail); + MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, fail); + +exit: + MSCCLPPCHECK(mscclppSocketClose(&sock)); + return ret; +fail: + goto exit; +} + +mscclppResult_t bootstrapBarrier(void* commState, int* ranks, int rank, int nranks, int tag) +{ + if (nranks == 1) + return mscclppSuccess; + TRACE(MSCCLPP_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag); + + /* Simple intra process barrier + * + * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet, + * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988" + */ + int data[1]; + for (int mask = 1; mask < nranks; mask <<= 1) { + int src = (rank - mask + nranks) % nranks; + int dst = (rank + mask) % nranks; + MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], tag, data, sizeof(data))); + 
MSCCLPPCHECK(bootstrapRecv(commState, ranks[src], tag, data, sizeof(data))); + } + + TRACE(MSCCLPP_INIT, "rank %d nranks %d tag %x - DONE", rank, nranks, tag); + return mscclppSuccess; +} + +mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int* ranks, int rank, int nranks, void* allData, int size) +{ + if (nranks == 1) + return mscclppSuccess; + char* data = (char*)allData; + TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - ENTER", rank, nranks, size); + + for (int i = 1; i < nranks; i++) { + int src = (rank - i + nranks) % nranks; + int dst = (rank + i) % nranks; + MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], /*tag=*/i, data + rank * size, size)); + MSCCLPPCHECK(bootstrapRecv(commState, ranks[src], /*tag=*/i, data + src * size, size)); + } + + TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); + return mscclppSuccess; +} + +mscclppResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock) +{ + // New unex + struct unexConn* unex; + MSCCLPPCHECK(mscclppCalloc(&unex, 1)); + unex->peer = peer; + unex->tag = tag; + memcpy(&unex->sock, sock, sizeof(struct mscclppSocket)); + + // Enqueue + struct unexConn* list = state->unexpectedConnections; + if (list == NULL) { + state->unexpectedConnections = unex; + return mscclppSuccess; + } + while (list->next) + list = list->next; + list->next = unex; + return mscclppSuccess; +} + +mscclppResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock, + int* found) +{ + struct unexConn* elem = state->unexpectedConnections; + struct unexConn* prev = NULL; + *found = 0; + while (elem) { + if (elem->peer == peer && elem->tag == tag) { + if (prev == NULL) { + state->unexpectedConnections = elem->next; + } else { + prev->next = elem->next; + } + memcpy(sock, &elem->sock, sizeof(struct mscclppSocket)); + free(elem); + *found = 1; + return mscclppSuccess; + } + prev = elem; + elem = elem->next; + } + return 
mscclppSuccess; +} + +static void unexpectedFree(struct bootstrapState* state) +{ + struct unexConn* elem = state->unexpectedConnections; + struct unexConn* prev = NULL; + + while (elem) { + prev = elem; + elem = elem->next; + free(prev); + } + return; +} + +// We can't know who we'll receive from, so we need to receive everything at once +mscclppResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) +{ + mscclppResult_t ret = mscclppSuccess; + struct bootstrapState* state = (struct bootstrapState*)commState; + struct mscclppSocket sock; + int newPeer, newTag; + + // Search unexpected connections first + int found; + MSCCLPPCHECK(unexpectedDequeue(state, peer, tag, &sock, &found)); + if (found) { + MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail); + goto exit; + } + + // Then look for new connections + while (1) { + MSCCLPPCHECKGOTO(mscclppSocketInit(&sock), ret, fail); + MSCCLPPCHECKGOTO(mscclppSocketAccept(&sock, &state->listenSock), ret, fail); + MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, &newPeer, sizeof(int)), ret, fail); + MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, &newTag, sizeof(int)), ret, fail); + if (newPeer == peer && newTag == tag) { + MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail); + goto exit; + } + // Unexpected connection. Save for later. 
+ MSCCLPPCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, &sock), ret, fail); + } +exit: + MSCCLPPCHECK(mscclppSocketClose(&sock)); + return ret; +fail: + goto exit; +} + +mscclppResult_t bootstrapClose(void* commState) +{ + struct bootstrapState* state = (struct bootstrapState*)commState; + if (state->unexpectedConnections != NULL) { + unexpectedFree(state); + if (*state->abortFlag == 0) { + WARN("Unexpected connections are not empty"); + return mscclppInternalError; + } + } + + MSCCLPPCHECK(mscclppSocketClose(&state->listenSock)); + MSCCLPPCHECK(mscclppSocketClose(&state->ringSendSocket)); + MSCCLPPCHECK(mscclppSocketClose(&state->ringRecvSocket)); + + free(state->peerCommAddresses); + free(state); + + return mscclppSuccess; +} + +mscclppResult_t bootstrapAbort(void* commState) +{ + struct bootstrapState* state = (struct bootstrapState*)commState; + if (commState == NULL) + return mscclppSuccess; + MSCCLPPCHECK(mscclppSocketClose(&state->listenSock)); + MSCCLPPCHECK(mscclppSocketClose(&state->ringSendSocket)); + MSCCLPPCHECK(mscclppSocketClose(&state->ringRecvSocket)); + free(state->peerCommAddresses); + free(state->peerProxyAddresses); + free(state); + return mscclppSuccess; +} \ No newline at end of file diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index afe6eca3..8eb7b939 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -18,11 +18,13 @@ class MscclppBootstrap : Bootstrap { public: MscclppBootstrap(std::string ipPortPair, int rank, int nRanks); MscclppBootstrap(mscclppBootstrapHandle handle, int rank, int nRanks); + ~MscclppBootstrap() = default; + void Initialize(); - void Send(void* data, int size, int peer, int tag); - void Recv(void* data, int size, int peer, int tag); - void AllGather(void* allData, int size); - void Barrier(); + void Send(void* data, int size, int peer, int tag) override; + void Recv(void* data, int size, int peer, int tag) override; + void AllGather(void* allData, int size) override; + void 
Barrier() override; void Close(); struct UniqueId; UniqueId GetUniqueId(); @@ -32,6 +34,7 @@ private: std::unique_ptr pimpl_; }; +mscclppResult_t bootstrapNetInit(const char* ip_port_pair = NULL); mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle); mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot = true, const char* ip_port_pair = NULL); diff --git a/src/include/comm.h b/src/include/comm.h index b76c8c4f..8275e0cb 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -44,7 +44,7 @@ struct mscclppComm struct mscclppDevConn devConns[MAXCONNECTIONS]; int nConns; - MscclppBootstrap bootstrap; + void* bootstrap; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. uint64_t magic; diff --git a/src/init.cc b/src/init.cc index 7c3b76b9..1850f68c 100644 --- a/src/init.cc +++ b/src/init.cc @@ -926,3 +926,6 @@ MSCCLPP_API mscclppResult_t mscclppSetBootstrapConnTimeout(int timeout) config->setBootstrapConnectionTimeoutConfig(timeout); return mscclppSuccess; } + +Bootstrap::~Bootstrap() { +} From 0bc3c3e574b896f591aba2f049d9db02f138753f Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Sat, 22 Apr 2023 00:35:25 +0000 Subject: [PATCH 038/135] Core API teasing out WIP --- src/epoch.cc | 22 ++ src/include/channel.hpp | 295 ++++++++++++++++++++ src/include/epoch.hpp | 52 ++++ src/include/mscclpp.hpp | 485 +++++---------------------------- src/include/proxy.hpp | 39 +++ src/include/registered_ptr.hpp | 40 +++ 6 files changed, 516 insertions(+), 417 deletions(-) create mode 100644 src/epoch.cc create mode 100644 src/include/channel.hpp create mode 100644 src/include/epoch.hpp create mode 100644 src/include/proxy.hpp create mode 100644 src/include/registered_ptr.hpp diff --git a/src/epoch.cc b/src/epoch.cc new file mode 100644 index 00000000..1fee307e --- /dev/null +++ b/src/epoch.cc @@ -0,0 +1,22 @@ +#include "epoch.hpp" +#include "checks.hpp" + +namespace mscclpp 
{ + +struct Epoch::Impl { + DeviceEpoch deviceEpoch; + + Impl() { + MSCCLPPTHROW(mscclppCudaCalloc(&deviceEpoch.localSignalEpochId, 1)); + MSCCLPPTHROW(mscclppCudaCalloc(&deviceEpoch.waitEpochId, 1)); + } + + ~Impl() { + MSCCLPPTHROW(mscclppCudaFree(deviceEpoch.localSignalEpochId)); + MSCCLPPTHROW(mscclppCudaFree(deviceEpoch.waitEpochId)); + } +}; + +Epoch::Epoch() : pimpl(std::make_unique()) {} + +} // namespace mscclpp \ No newline at end of file diff --git a/src/include/channel.hpp b/src/include/channel.hpp new file mode 100644 index 00000000..cb1931b0 --- /dev/null +++ b/src/include/channel.hpp @@ -0,0 +1,295 @@ +#ifndef MSCCLPP_CHANNEL_HPP_ +#define MSCCLPP_CHANNEL_HPP_ + +#include "mscclpp.hpp" +#include "proxy.hpp" + +namespace mscclpp { + +// For every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER, a flush of the tail to device memory is triggered. +// As long as MSCCLPP_PROXY_FIFO_SIZE is large enough, having a stale tail is not a problem. +#define MSCCLPP_PROXY_FIFO_SIZE 128 +#define MSCCLPP_PROXY_FIFO_FLUSH_COUNTER 4 + +using ChannelTriggerType = uint64_t; +const ChannelTriggerType channelTriggerData = 0x1; +const ChannelTriggerType channelTriggerFlag = 0x2; +const ChannelTriggerType channelTriggerSync = 0x4; + +// This is just a numeric ID. 
Each HostConnection will have an internal array indexed by these handles +// mapping to the actual +using BufferHandle = uint32_t; + +#define MSCCLPP_BITS_SIZE 32 +#define MSCCLPP_BITS_OFFSET 32 +#define MSCCLPP_BITS_BUFFER_HANDLE 8 +#define MSCCLPP_BITS_TYPE 3 +#define MSCCLPP_BITS_CONNID 10 + +// this is the basic structure of each work element in the fifo +// the summation of number of bits must be 128 or less +union ChannelTrigger { + ProxyTrigger value; + struct + { + // first 64 bits: value[0] + uint64_t size : MSCCLPP_BITS_SIZE; + uint64_t srcOffset : MSCCLPP_BITS_OFFSET; + uint64_t : (64 - MSCCLPP_BITS_SIZE - MSCCLPP_BITS_OFFSET); // ensure 64-bit alignment + // second 64 bits: value[1] + uint64_t dstOffset : MSCCLPP_BITS_OFFSET; + uint64_t srcBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; + uint64_t dstBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; + uint64_t type : MSCCLPP_BITS_TYPE; + uint64_t connId : MSCCLPP_BITS_CONNID; + uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_TYPE); // ensure 64-bit alignment + } fields; + +#ifdef __CUDACC__ + __device__ ChannelTrigger() {} + __device__ ChannelTrigger(ProxyTrigger value) : value(value) {} + __device__ ChannelTrigger(ChannelTriggerType type, BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size, int connectionId) { + value.fst = ((srcOffset << MSCCLPP_BITS_SIZE) + size); + value.snd = ((((((((connectionId << MSCCLPP_BITS_TYPE) + (uint64_t)type) << MSCCLPP_BITS_BUFFER_HANDLE) + dst) << MSCCLPP_BITS_BUFFER_HANDLE) + src) << MSCCLPP_BITS_OFFSET) + dstOffset); + } +#endif // __CUDACC__ +}; + +struct ConnectionEpoch { +#ifdef __CUDACC__ + __forceinline__ __device__ void wait() + { + (*waitEpochId) += 1; + while (*(volatile uint64_t*)&(localSignalEpochId->proxy) < (*waitEpochId)) + ; + } + + __forceinline__ __device__ void epochIncrement() + { + *(volatile uint64_t*)&(localSignalEpochId->device) += 1; + } +#endif // __CUDACC__ 
+ + SignalEpochId* localSignalEpochId; + // used by the signal() function directly from gpu + SignalEpochId* remoteSignalEpochId; + + // every wait(), increments this and then the gpu waits for either: + // 1) localSignalEpochId->proxy to be >= this in case of a proxy thread + // 2) remoteSignalEpochId->device to be >= this in case of a gpu thread + uint64_t* waitEpochId; +}; + +class HostConnection { + struct Impl; +public: + /* HostConnection can not be constructed from user code and must instead be created through Communicator::connect */ + HostConnection(std::unique_ptr); + + ~HostConnection(); + + void write() + + int getId(); + + /* Get the number of times registerBuffer(...) was called. + * + * Returns: the number of buffers registered + */ + int numLocalBuffers(); + + /* Get the BufferHandle returned by a call to registerBuffer(...) as identified by the index + * + * Inputs: + * index: the index of the handle to get + * + * Returns: a handle to the buffer + */ + BufferHandle getLocalBuffer(int index); + + /* Get the number of times registerBuffer(...) was called on the remote peer. + * + * Returns: the number of buffers registered on the remote peer + */ + int numRemoteBuffers(); + + /* Get the BufferHandle returned by a call to registerBuffer(...) 
on the remote peer as identified by the index + * + * Inputs: + * index: the index of the handle to get + * + * Returns: a handle to the buffer on the remote peer + */ + BufferHandle getRemoteBuffer(int index); + + ConnectionEpoch getEpoch(); + + DeviceProxyFifo getDeviceFifo(); + + void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size); + + void signal(); + + void flush(); + + void wait(); + +private: + std::unique_ptr pimpl; + friend class Communicator; +}; + +struct DeviceConnection { + DeviceConnection() = default; + + DeviceConnection(HostConnection& hostConn) + : connectionId(hostConn.getId()), epoch(hostConn.getEpoch()), + fifo(hostConn.getDeviceFifo()) {} + + DeviceConnection(const DeviceConnection& other) = default; + + DeviceConnection& operator=(DeviceConnection& other) = default; + +#ifdef __CUDACC__ + __forceinline__ __device__ void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) + { + fifo.push(ChannelTrigger(channelTriggerData, dst, dstOffset, src, srcOffset, size, connectionId).value); + } + + __forceinline__ __device__ void put(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) + { + put(dst, offset, src, offset, size); + } + + __forceinline__ __device__ void signal() + { + epochIncrement(); + fifo.push(ChannelTrigger(channelTriggerFlag, 0, 0, 0, 0, 1, connectionId).value); + } + + __forceinline__ __device__ void putWithSignal(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) + { + epochIncrement(); + fifo.push(ChannelTrigger(channelTriggerData | channelTriggerFlag, dst, dstOffset, src, srcOffset, size, connectionId).value); + } + + __forceinline__ __device__ void putWithSignal(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) + { + putWithSignal(dst, offset, src, offset, size); + } + + __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, uint64_t dstOffset, 
BufferHandle src, uint64_t srcOffset, uint64_t size) + { + epochIncrement(); + uint64_t curFifoHead = fifo.push(ChannelTrigger(channelTriggerData | channelTriggerFlag | channelTriggerSync, dst, dstOffset, src, srcOffset, size, connectionId).value); + while (*(volatile uint64_t*)&fifo.triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && + *(volatile uint64_t*)fifo.tailReplica <= curFifoHead) + ; + } + + __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) + { + putWithSignalAndFlush(dst, offset, src, offset, size); + } + + __forceinline__ __device__ void flush() + { + uint64_t curFifoHead = fifo.push(ChannelTrigger(mscclppSync, 0, 0, 0, 0, 1, connectionId).value); + // we need to wait for two conditions to be met to ensure the CPU is done flushing. (1) wait for the tail + // to go pass by curFifoHead (this is safety net) and (2) wait for the work element value to change to 0. + while (*(volatile uint64_t*)&fifo.triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && + *(volatile uint64_t*)fifo.tailReplica <= curFifoHead) + ; + } + + __forceinline__ __device__ void wait() + { + epoch.wait(); + } + + __forceinline__ __device__ void epochIncrement() + { + epoch.epochIncrement(); + } +#endif // __CUDACC__ + + int connectionId; + + ConnectionEpoch epoch; + + // this is a concurrent fifo which is multiple threads from the device + // can produce for and the sole proxy thread consumes it. 
+ DeviceProxyFifo fifo; +}; + +struct SimpleDeviceConnection { + SimpleDeviceConnection() = default; + + SimpleDeviceConnection(HostConnection& hostConn) : devConn(hostConn) { + dst = hostConn.getRemoteBuffer(0); + src = hostConn.getLocalBuffer(0); + } + + SimpleDeviceConnection(const SimpleDeviceConnection& other) = default; + + SimpleDeviceConnection& operator=(SimpleDeviceConnection& other) = default; + +#ifdef __CUDACC__ + + __forceinline__ __device__ void put(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) + { + devConn.put(dst, dstOffset, src, srcOffset, size); + } + + __forceinline__ __device__ void put(uint64_t offset, uint64_t size) + { + put(offset, offset, size); + } + + __forceinline__ __device__ void signal() + { + devConn.signal(); + } + + __forceinline__ __device__ void putWithSignal(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) + { + devConn.putWithSignal(dst, dstOffset, src, srcOffset, size); + } + + __forceinline__ __device__ void putWithSignal(uint64_t offset, uint64_t size) + { + putWithSignal(offset, offset, size); + } + + __forceinline__ __device__ void putWithSignalAndFlush(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) + { + devConn.putWithSignalAndFlush(dst, dstOffset, src, srcOffset, size); + } + + __forceinline__ __device__ void putWithSignalAndFlush(uint64_t offset, uint64_t size) + { + putWithSignalAndFlush(offset, offset, size); + } + + __forceinline__ __device__ void flush() + { + devConn.flush(); + } + + __forceinline__ __device__ void wait() + { + devConn.wait(); + } + + __forceinline__ __device__ void epochIncrement() + { + devConn.epochIncrement(); + } + +#endif // __CUDACC__ + + DeviceConnection devConn; + BufferHandle dst; + BufferHandle src; +}; + diff --git a/src/include/epoch.hpp b/src/include/epoch.hpp new file mode 100644 index 00000000..942edd8b --- /dev/null +++ b/src/include/epoch.hpp @@ -0,0 +1,52 @@ +#ifndef MSCCLPP_EPOCH_HPP_ +#define MSCCLPP_EPOCH_HPP_ + +#include "mscclpp.hpp" + +namespace 
mscclpp { + +struct alignas(16) SignalEpochId { + // every signal(), increaments this and either: + // 1) proxy thread pushes it to the remote peer's localSignalEpochId->proxy + // 2) gpu thread directly writes it to remoteSignalEpochId->device + uint64_t device; + // signal() function triggers the cpu proxy thread to write to it + uint64_t proxy; +}; + +struct DeviceEpoch { +#ifdef __CUDACC__ + __forceinline__ __device__ void wait() + { + (*waitEpochId) += 1; + while (*(volatile uint64_t*)&(localSignalEpochId->proxy) < (*waitEpochId)) + ; + } + + __forceinline__ __device__ void epochIncrement() + { + *(volatile uint64_t*)&(localSignalEpochId->device) += 1; + } +#endif // __CUDACC__ + + SignalEpochId* localSignalEpochId; + SignalEpochId* remoteSignalEpochId; + uint64_t* waitEpochId; +}; + + +class Epoch { + struct Impl; + std::unique_ptr pimpl; +public: + Epoch(); + ~Epoch(); + + void signal(); + + DeviceEpoch& getDeviceEpoch(); +}; + +} // namespace mscclpp + +#endif // MSCCLPP_EPOCH_HPP_ \ No newline at end of file diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index e41e94b8..67d40050 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -6,381 +6,11 @@ #define MSCCLPP_PATCH 0 #define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH) -// For every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER, a flush of the tail to device memory is triggered. -// As long as MSCCLPP_PROXY_FIFO_SIZE is large enough, having a stale tail is not a problem. 
-#define MSCCLPP_PROXY_FIFO_SIZE 128 -#define MSCCLPP_PROXY_FIFO_FLUSH_COUNTER 4 - #include #include -#include - -#include namespace mscclpp { -struct alignas(16) SignalEpochId { - // every signal(), increaments this and either: - // 1) proxy thread pushes it to the remote peer's localSignalEpochId->proxy - // 2) gpu thread directly writes it to remoteSignalEpochId->device - uint64_t device; - // signal() function triggers the cpu proxy thread to write to it - uint64_t proxy; -}; - -using ChannelTriggerType = uint64_t; -const ChannelTriggerType channelTriggerData = 0x1; -const ChannelTriggerType channelTriggerFlag = 0x2; -const ChannelTriggerType channelTriggerSync = 0x4; - -// This is just a numeric ID. Each HostConnection will have an internal array indexed by these handles -// mapping to the actual -using BufferHandle = uint32_t; - -#define MSCCLPP_BITS_SIZE 32 -#define MSCCLPP_BITS_OFFSET 32 -#define MSCCLPP_BITS_BUFFER_HANDLE 8 -#define MSCCLPP_BITS_TYPE 3 -#define MSCCLPP_BITS_CONNID 10 - -// this is the basic structure of each work element in the fifo -// the summation of number of bits must be 128 or less -union ChannelTrigger { - ProxyTrigger value; - struct - { - // first 64 bits: value[0] - uint64_t size : MSCCLPP_BITS_SIZE; - uint64_t srcOffset : MSCCLPP_BITS_OFFSET; - uint64_t : (64 - MSCCLPP_BITS_SIZE - MSCCLPP_BITS_OFFSET); // ensure 64-bit alignment - // second 64 bits: value[1] - uint64_t dstOffset : MSCCLPP_BITS_OFFSET; - uint64_t srcBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; - uint64_t dstBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; - uint64_t type : MSCCLPP_BITS_TYPE; - uint64_t connId : MSCCLPP_BITS_CONNID; - uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_TYPE); // ensure 64-bit alignment - } fields; - -#ifdef __CUDACC__ - __device__ ChannelTrigger() {} - __device__ ChannelTrigger(ProxyTrigger value) : value(value) {} - __device__ ChannelTrigger(ChannelTriggerType type, BufferHandle 
dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size, int connectionId) { - value.fst = ((srcOffset << MSCCLPP_BITS_SIZE) + size); - value.snd = ((((((((connectionId << MSCCLPP_BITS_TYPE) + (uint64_t)type) << MSCCLPP_BITS_BUFFER_HANDLE) + dst) << MSCCLPP_BITS_BUFFER_HANDLE) + src) << MSCCLPP_BITS_OFFSET) + dstOffset); - } -#endif // __CUDACC__ -}; - -struct ConnectionEpoch { -#ifdef __CUDACC__ - __forceinline__ __device__ void wait() - { - (*waitEpochId) += 1; - while (*(volatile uint64_t*)&(localSignalEpochId->proxy) < (*waitEpochId)) - ; - } - - __forceinline__ __device__ void epochIncrement() - { - *(volatile uint64_t*)&(localSignalEpochId->device) += 1; - } -#endif // __CUDACC__ - - SignalEpochId* localSignalEpochId; - // used by the signal() function directly from gpu - SignalEpochId* remoteSignalEpochId; - - // every wait(), increments this and then the gpu waits for either: - // 1) localSignalEpochId->proxy to be >= this in case of a proxy thread - // 2) remoteSignalEpochId->device to be >= this in case of a gpu thread - uint64_t* waitEpochId; -}; - -class HostConnection { - struct Impl; -public: - /* HostConnection can not be constructed from user code and must instead be created through Communicator::connect */ - HostConnection(std::unique_ptr); - - ~HostConnection(); - - int getId(); - - /* Register a region of GPU memory for use with this connection. Must be called before connectionSetup() - * in the communicator. - * - * Inputs: - * data: base pointer to the memory - * size: size of the memory region in bytes - * - * Returns: a handle to the buffer - */ - BufferHandle registerBuffer(void* data, uint64_t size); - - /* Get the number of times registerBuffer(...) was called. - * - * Returns: the number of buffers registered - */ - int numLocalBuffers(); - - /* Get the BufferHandle returned by a call to registerBuffer(...) 
as identified by the index - * - * Inputs: - * index: the index of the handle to get - * - * Returns: a handle to the buffer - */ - BufferHandle getLocalBuffer(int index); - - /* Get the number of times registerBuffer(...) was called on the remote peer. - * - * Returns: the number of buffers registered on the remote peer - */ - int numRemoteBuffers(); - - /* Get the BufferHandle returned by a call to registerBuffer(...) on the remote peer as identified by the index - * - * Inputs: - * index: the index of the handle to get - * - * Returns: a handle to the buffer on the remote peer - */ - BufferHandle getRemoteBuffer(int index); - - ConnectionEpoch getEpoch(); - - DeviceProxyFifo getDeviceFifo(); - - void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size); - - void signal(); - - void flush(); - - void wait(); - -private: - std::unique_ptr pimpl; - friend class Communicator; -}; - -/*************************************************************************************************************** - * A mscclppDevConn provides a zero-copy connection between two GPUs connected via P2P NVLink or InfiniBand. - * The communication API is one-sided meaning that for every single data transfer, only one side - * needs to execute unlike a two-sided communication stack such as NCCL where both sides - * need to execute a send and a receive instruction, respectively, for every transfer. - * - * A connection is uniquely identified by the (remoteRank, tag) pair at an endpoint. - * The two endpoints register buffers of the same size with the connection. - * - * The endpoints provide the remoteRank, tag, and the buffer when registering a connection with msccppConnect(). - * - * mscllppConnectionSetup() sets up all the registered connections. 
- * - *************************************************************************************************************** - * A proxy thread running on the CPU is necessary to perform transfers using InfiniBand or the DMA engine. - * The current implementation uses a single proxy thread per context - one IB connection or DMA engine per node. - * Thus multiple threadblocks using different connections might use the same CPU proxy thread. - * - * Before using any of functionality of connections, mscclppProxyLaunch needs to be called to spawn the - * proxy threads. There are currently two types of connections: - * - * P2P via NVLink: the DMA engine can perform the copy between the buffers. DMA engine has higher latency - * but has a higher bandwidth and costs no compute cycles on the GPU. - * - * InfiniBand: the RDMA engine copies the data over MLX devices. - * - *************************************************************************************************************** - * At the runtime, a GPU kernel has access to a mscclppDevConn object that provides the following functions: - * - * put(): [non-blocking] the sender initiates a data transfer to the receiver. - * - * signal(): [non-blocking] the sender signals the receiver that data is ready to be consumed. - * - * flush(): [blocking] the sender waits for all the data transfers to complete - * - * wait(): [blocking] the reciever waits on the signal() to start reading the data. - * - * The sender should not reuse the buffer till the flush() returns. - * The receiver should only access the data after the wait() returns. - * - * putWithSignal(): the sender initiates a data transfer and signals the receiver that data is ready to be consumed. - * This is an optimized version of a put() followed by a signal(). - * - * These functions hide the complexity of syncrhonization between the two GPUs and the CPU proxy thread. 
- * Example: - * - * // sender GPU - * devConn.put(data1) - * // not OK to write to data1 - * devConn.put(data2) - * // not OK to write to data1, data2 - * devConn.put(data3) // receiver GPU - * // not OK to write to data1, data2, data3 // not OK to read data1, data2, data3 - * devConn.signal() -------------------------------> devConn.wait() - * // not OK to write to data1, data2, data3 // OK to read data1, data2, data3 - * devConn.flush() - * // OK to write to data1, data2, data3 - * - * - * The two endpoint can concurrently use the same connection provided they are writing (puts) on different - * indices in the registered buffer. - **************************************************************************************************************/ -struct DeviceConnection { - DeviceConnection() = default; - - DeviceConnection(HostConnection& hostConn) - : connectionId(hostConn.getId()), epoch(hostConn.getEpoch()), - fifo(hostConn.getDeviceFifo()) {} - - DeviceConnection(const DeviceConnection& other) = default; - - DeviceConnection& operator=(DeviceConnection& other) = default; - -#ifdef __CUDACC__ - __forceinline__ __device__ void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) - { - fifo.push(ChannelTrigger(channelTriggerData, dst, dstOffset, src, srcOffset, size, connectionId).value); - } - - __forceinline__ __device__ void put(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) - { - put(dst, offset, src, offset, size); - } - - __forceinline__ __device__ void signal() - { - epochIncrement(); - fifo.push(ChannelTrigger(channelTriggerFlag, 0, 0, 0, 0, 1, connectionId).value); - } - - __forceinline__ __device__ void putWithSignal(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) - { - epochIncrement(); - fifo.push(ChannelTrigger(channelTriggerData | channelTriggerFlag, dst, dstOffset, src, srcOffset, size, connectionId).value); - } - - __forceinline__ __device__ 
void putWithSignal(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) - { - putWithSignal(dst, offset, src, offset, size); - } - - __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) - { - epochIncrement(); - uint64_t curFifoHead = fifo.push(ChannelTrigger(channelTriggerData | channelTriggerFlag | channelTriggerSync, dst, dstOffset, src, srcOffset, size, connectionId).value); - while (*(volatile uint64_t*)&fifo.triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && - *(volatile uint64_t*)fifo.tailReplica <= curFifoHead) - ; - } - - __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) - { - putWithSignalAndFlush(dst, offset, src, offset, size); - } - - __forceinline__ __device__ void flush() - { - uint64_t curFifoHead = fifo.push(ChannelTrigger(mscclppSync, 0, 0, 0, 0, 1, connectionId).value); - // we need to wait for two conditions to be met to ensure the CPU is done flushing. (1) wait for the tail - // to go pass by curFifoHead (this is safety net) and (2) wait for the work element value to change to 0. - while (*(volatile uint64_t*)&fifo.triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && - *(volatile uint64_t*)fifo.tailReplica <= curFifoHead) - ; - } - - __forceinline__ __device__ void wait() - { - epoch.wait(); - } - - __forceinline__ __device__ void epochIncrement() - { - epoch.epochIncrement(); - } -#endif // __CUDACC__ - - int connectionId; - - ConnectionEpoch epoch; - - // this is a concurrent fifo which is multiple threads from the device - // can produce for and the sole proxy thread consumes it. 
- DeviceProxyFifo fifo; -}; - -struct SimpleDeviceConnection { - SimpleDeviceConnection() = default; - - SimpleDeviceConnection(HostConnection& hostConn) : devConn(hostConn) { - dst = hostConn.getRemoteBuffer(0); - src = hostConn.getLocalBuffer(0); - } - - SimpleDeviceConnection(const SimpleDeviceConnection& other) = default; - - SimpleDeviceConnection& operator=(SimpleDeviceConnection& other) = default; - -#ifdef __CUDACC__ - - __forceinline__ __device__ void put(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) - { - devConn.put(dst, dstOffset, src, srcOffset, size); - } - - __forceinline__ __device__ void put(uint64_t offset, uint64_t size) - { - put(offset, offset, size); - } - - __forceinline__ __device__ void signal() - { - devConn.signal(); - } - - __forceinline__ __device__ void putWithSignal(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) - { - devConn.putWithSignal(dst, dstOffset, src, srcOffset, size); - } - - __forceinline__ __device__ void putWithSignal(uint64_t offset, uint64_t size) - { - putWithSignal(offset, offset, size); - } - - __forceinline__ __device__ void putWithSignalAndFlush(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) - { - devConn.putWithSignalAndFlush(dst, dstOffset, src, srcOffset, size); - } - - __forceinline__ __device__ void putWithSignalAndFlush(uint64_t offset, uint64_t size) - { - putWithSignalAndFlush(offset, offset, size); - } - - __forceinline__ __device__ void flush() - { - devConn.flush(); - } - - __forceinline__ __device__ void wait() - { - devConn.wait(); - } - - __forceinline__ __device__ void epochIncrement() - { - devConn.epochIncrement(); - } - -#endif // __CUDACC__ - - DeviceConnection devConn; - BufferHandle dst; - BufferHandle src; -}; - #define MSCCLPP_UNIQUE_ID_BYTES 128 struct UniqueId { char internal[MSCCLPP_UNIQUE_ID_BYTES]; @@ -395,13 +25,66 @@ struct UniqueId { */ std::unique_ptr getUniqueId(); -/* Transport Types */ -enum class TransportType : uint8_t { - P2P = 0, - IB = 1, +using 
TransportFlags = uint32_t; +const TransportFlags TransportCudaIpc = 0b1; +const TransportFlags TransportIB = 0b10; +const TransportFlags TransportIB1 = 0b100; +const TransportFlags TransportIB2 = 0b1000; +const TransportFlags TransportIB3 = 0b10000; +const TransportFlags TransportIB4 = 0b100000; +const TransportFlags TransportIB5 = 0b1000000; +const TransportFlags TransportIB6 = 0b10000000; +const TransportFlags TransportIB7 = 0b100000000; +const TransportFlags TransportAll = 0b111111111; + +class Communicator; + +class RegisteredMemory { + struct Impl; + std::shared_ptr pimpl; +public: + + RegisteredMemory(std::shared_ptr pimpl); + ~RegisteredMemory(); + + void* data(); + size_t size(); + TransportFlags transports(); + + std::vector serialize(); + static RegisteredMemory deserialize(const std::vector& data); + + int rank(); + bool isLocal(); + bool isRemote(); +}; + +class Connection { + struct Impl; + std::unique_ptr pimpl; +public: + + /* Connection can not be constructed from user code and must instead be created through Communicator::connect */ + Connection(std::unique_ptr); + ~Connection(); + + void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size); + + void flush(); + + TransportFlags transport(); + TransportFlags remoteTransport(); // Good to have because different IB transports can still connect to each other + + // template void write(RegisteredPtr dst, RegisteredPtr src, uint64_t size) { + // write(dst.memory(), dst.offset() * sizeof(T), src.memory(), src.offset() * sizeof(T), size); + // } + + friend class Communicator; }; class Communicator { + struct Impl; + std::unique_ptr pimpl; public: /* Initialize the communicator. nranks processes with rank 0 to nranks-1 need to call this function. @@ -436,6 +119,16 @@ public: /* A no-op function that is used to synchronize all processes via a bootstrap allgather*/ void bootstrapBarrier(); + /* Register a region of GPU memory for use in this communicator. 
+ * + * Inputs: + * data: base pointer to the memory + * size: size of the memory region in bytes + * + * Returns: a handle to the buffer + */ + RegisteredMemory registerMemory(void* ptr, size_t size, TransportFlags transports); + /* Connect to a remote rank. This function only prepares metadata for connection. The actual connection * is made by a following call of mscclppConnectionSetup(). Note that this function is two-way and a connection * from rank i to remote rank j needs to have a counterpart from rank j to rank i. @@ -450,19 +143,8 @@ public: * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. */ - std::shared_ptr connect(int remoteRank, int tag, TransportType transportType, const char* ibDev = 0); + std::shared_ptr connect(int remoteRank, int tag, TransportFlags transport); - /* Establish all connections created by mscclppConnect(). This function must be called after all mscclppConnect() - * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. - */ - void connectionSetup(); - - /* Launch proxy thread(s). This function is supposed to be called before starting a kernel that uses DeviceConnection. */ - void startProxying(); - - /* Stop proxy thread(s). */ - void stopProxying(); - /* Return the rank of the calling process. 
* * Outputs: @@ -476,37 +158,6 @@ public: * size: the number of ranks of the communicator */ int size(); - - struct Impl; -private: - std::unique_ptr pimpl; - friend class HostConnection; -}; - -enum class ProxyHandlerResult { - Continue, - FlushFifoTailAndContinue, - Stop, -}; - -class Proxy; -using ProxyHandler = std::function; - -class Proxy { -public: - Proxy(ProxyHandler handler); - - ~Proxy(); - - void start(); - - void stop(); - - HostProxyFifo& fifo(); - -private: - struct Impl; - std::unique_ptr pimpl; }; } // namespace mscclpp diff --git a/src/include/proxy.hpp b/src/include/proxy.hpp new file mode 100644 index 00000000..70b6ba49 --- /dev/null +++ b/src/include/proxy.hpp @@ -0,0 +1,39 @@ +#ifndef MSCCLPP_PROXY_HPP_ +#define MSCCLPP_PROXY_HPP_ + +#include + +#include +#include + +namespace mscclpp { + +enum class ProxyHandlerResult { + Continue, + FlushFifoTailAndContinue, + Stop, +}; + +class Proxy; +using ProxyHandler = std::function; + +class Proxy { +public: + Proxy(ProxyHandler handler); + + ~Proxy(); + + void start(); + + void stop(); + + HostProxyFifo& fifo(); + +private: + struct Impl; + std::unique_ptr pimpl; +}; + +} // namespace mscclpp + +#endif // MSCCLPP_PROXY_HPP_ \ No newline at end of file diff --git a/src/include/registered_ptr.hpp b/src/include/registered_ptr.hpp new file mode 100644 index 00000000..7eadb6b0 --- /dev/null +++ b/src/include/registered_ptr.hpp @@ -0,0 +1,40 @@ +#ifndef MSCCLPP_REGISTERED_PTR_HPP_ +#define MSCCLPP_REGISTERED_PTR_HPP_ + +namespace mscclpp { + +template +class RegisteredPtr { + RegisteredMemory memory; + size_t offset; +public: + RegisteredPtr(RegisteredMemory memory, size_t offset) : memory(memory), offset(offset) {} + RegisteredPtr(RegisteredMemory memory) : RegisteredPtr(memory, 0) {} + ~RegisteredPtr() {} + + RegisteredMemory memory() { + return memory; + } + + T* data() { + return reinterpret_cast(memory.data()); + } + + size_t size() { + return memory.size() / sizeof(T); + } + + size_t offset() { + 
return offset; + } + + RegisteredPtr operator+(size_t offset) { + return RegisteredPtr(memory, this->offset + offset); + } + + // TODO: all other relevant overloads +}; + +} // namespace mscclpp + +#endif // MSCCLPP_REGISTERED_PTR_HPP_ \ No newline at end of file From 073460c341a81290d0856e746541aa50c58d8c49 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 23 Apr 2023 14:25:56 +0000 Subject: [PATCH 039/135] fx compile issue --- Makefile | 4 +-- src/bootstrap/bootstrap.cc | 49 +++++++++++++++++-------------------- src/include/bootstrap.h | 22 +++++++++++------ src/include/mscclpp.h | 2 +- src/init.cc | 3 --- tests/bootstrap_test_cpp.cc | 19 ++++++++++++++ 6 files changed, 59 insertions(+), 40 deletions(-) create mode 100644 tests/bootstrap_test_cpp.cc diff --git a/Makefile b/Makefile index 92a68248..801a1ffd 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ UTOBJTARGETS := $(UTOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) UTBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(UTOBJS)) TESTSDIR := tests -TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test_standalone.cu) +TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test_standalone.cu bootstrap_test_cpp.cc) TESTSOBJS := $(patsubst %.cc,%.o,$(TESTSSRCS)) $(patsubst %.cu,%.o,$(TESTSSRCS)) TESTSOBJTARGETS := $(TESTSOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) TESTSBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(TESTSOBJS)) @@ -215,7 +215,7 @@ $(BUILDDIR)/$(BINDIR)/$(UTDIR)/%: $(BUILDDIR)/$(OBJDIR)/$(UTDIR)/%.o $(LIBOBJTAR # Compile .cc tests $(BUILDDIR)/$(OBJDIR)/$(TESTSDIR)/%.o: $(TESTSDIR)/%.cc $(INCTARGETS) @mkdir -p $(@D) - $(CXX) -o $@ -I$(BUILDDIR)/$(INCDIR) $(MPI_INC) $(CXXFLAGS) -c $< $(MPI_MACRO) + $(CXX) -o $@ -I$(BUILDDIR)/$(INCDIR) $(MPI_INC) $(CXXFLAGS) -Isrc/include -c $< $(MPI_MACRO) # Compile .cu tests $(BUILDDIR)/$(OBJDIR)/$(TESTSDIR)/%.o: $(TESTSDIR)/%.cu $(INCTARGETS) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 0dac4e2f..e54b38e8 100644 --- 
a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -40,12 +41,6 @@ enum bootstrapInterface_t dontCareIf = -2 }; -struct MscclppBootstrap::UniqueId -{ - uint64_t magic; - union mscclppSocketAddress addr; -}; - struct unexpectedConn { int peer; @@ -64,7 +59,7 @@ struct extInfo class MscclppBootstrap::Impl { public: - Impl(std::string ipPortPair, int rank, int nRanks, const mscclppBootstrapHandle handle); + Impl(std::string ipPortPair, int rank, int nRanks, const UniqueId uniqueId); ~Impl(); mscclppResult_t initialize(); mscclppResult_t allGather(void* allData, int size); @@ -73,7 +68,7 @@ public: mscclppResult_t barrier(); mscclppResult_t close(); - MscclppBootstrap::UniqueId uniqueId_; + static UniqueId uniqueId_; private: int rank_; @@ -100,7 +95,9 @@ private: mscclppResult_t netInit(std::string ipPortPair); }; -MscclppBootstrap::Impl::Impl(std::string ipPortPair, int rank, int nRanks, const mscclppBootstrapHandle handle) +UniqueId MscclppBootstrap::Impl::uniqueId_; + +MscclppBootstrap::Impl::Impl(std::string ipPortPair, int rank, int nRanks, const UniqueId uniqueId) : rank_(rank), nRanks_(nRanks), peerCommAddresses_(nRanks, mscclppSocketAddress()), peerProxyAddresses_(nRanks, mscclppSocketAddress()), abortFlag_(nullptr) { @@ -109,10 +106,11 @@ MscclppBootstrap::Impl::Impl(std::string ipPortPair, int rank, int nRanks, const throw std::runtime_error("Failed to initialize network"); } - mscclppBootstrapHandle zeroHandle = {0}; - if (memcmp(&handle, &zeroHandle, sizeof(mscclppBootstrapHandle)) != 0) { - uniqueId_.magic = handle.magic; - uniqueId_.addr = handle.addr; + UniqueId zeroId; + std::memset(&zeroId, 0, sizeof(UniqueId)); + if (std::memcmp(&uniqueId, &zeroId, sizeof(UniqueId)) != 0) { + uniqueId_.magic = uniqueId.magic; + uniqueId_.addr = uniqueId.addr; return; } @@ -275,8 +273,7 @@ mscclppResult_t MscclppBootstrap::Impl::netInit(std::string ipPortPair) return 
mscclppSystemError; } } else { - int ret = - mscclppFindInterfaces(netIfName_, &netIfAddr_, MAX_IF_NAME_SIZE, 1); + int ret = mscclppFindInterfaces(netIfName_, &netIfAddr_, MAX_IF_NAME_SIZE, 1); if (ret <= 0) { WARN("Bootstrap : no socket interface found"); return mscclppInternalError; @@ -304,14 +301,12 @@ mscclppResult_t MscclppBootstrap::Impl::initialize() uint64_t magic = this->uniqueId_.magic; // Create socket for other ranks to contact me - MSCCLPPCHECK( - mscclppSocketInit(&this->listenSock_, &netIfAddr_, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); + MSCCLPPCHECK(mscclppSocketInit(&this->listenSock_, &netIfAddr_, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); MSCCLPPCHECK(mscclppSocketListen(&this->listenSock_)); MSCCLPPCHECK(mscclppSocketGetAddr(&this->listenSock_, &info.extAddressListen)); // Create socket for root to contact me - MSCCLPPCHECK( - mscclppSocketInit(&listenSockRoot, &netIfAddr_, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); + MSCCLPPCHECK(mscclppSocketInit(&listenSockRoot, &netIfAddr_, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); MSCCLPPCHECK(mscclppSocketListen(&listenSockRoot)); MSCCLPPCHECK(mscclppSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); @@ -439,17 +434,21 @@ mscclppResult_t MscclppBootstrap::Impl::close() MscclppBootstrap::MscclppBootstrap(std::string ipPortPair, int rank, int nRanks) { - pimpl_ = std::make_unique(ipPortPair, rank, nRanks, mscclppBootstrapHandle{0}); + UniqueId uniqueId; + std::memset(&uniqueId, 0, sizeof(uniqueId)); + // pimpl_ = std::make_unique(ipPortPair, rank, nRanks, uniqueId); + pimpl_ = new Impl(ipPortPair, rank, nRanks, uniqueId); } -MscclppBootstrap::MscclppBootstrap(mscclppBootstrapHandle handle, int rank, int nRanks) +MscclppBootstrap::MscclppBootstrap(UniqueId uniqueId, int rank, int nRanks) { - pimpl_ = std::make_unique("", rank, nRanks, handle); + pimpl_ = new Impl("", rank, nRanks, uniqueId); + // pimpl_ = std::make_unique("", rank, nRanks, 
uniqueId); } -MscclppBootstrap::UniqueId MscclppBootstrap::GetUniqueId() +UniqueId MscclppBootstrap::GetUniqueId() { - return pimpl_->uniqueId_; + return Impl::uniqueId_; } void MscclppBootstrap::Send(void* data, int size, int peer, int tag) @@ -500,8 +499,6 @@ void MscclppBootstrap::Close() } } - - // ------------------- Old bootstrap functions ------------------- struct bootstrapRootArgs { diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 8eb7b939..175981e4 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -5,20 +5,23 @@ #include "comm.h" -struct mscclppBootstrapHandle +struct UniqueId { uint64_t magic; union mscclppSocketAddress addr; }; -static_assert(sizeof(struct mscclppBootstrapHandle) <= sizeof(mscclppUniqueId), +static_assert(sizeof(UniqueId) <= sizeof(mscclppUniqueId), "Bootstrap handle is too large to fit inside MSCCLPP unique ID"); -class MscclppBootstrap : Bootstrap { +class __attribute__((visibility("default"))) MscclppBootstrap : public Bootstrap +{ public: MscclppBootstrap(std::string ipPortPair, int rank, int nRanks); - MscclppBootstrap(mscclppBootstrapHandle handle, int rank, int nRanks); - ~MscclppBootstrap() = default; + MscclppBootstrap(UniqueId uniqueId, int rank, int nRanks); + ~MscclppBootstrap() override = default; + + static UniqueId GetUniqueId(); void Initialize(); void Send(void* data, int size, int peer, int tag) override; @@ -26,14 +29,17 @@ public: void AllGather(void* allData, int size) override; void Barrier() override; void Close(); - struct UniqueId; - UniqueId GetUniqueId(); private: class Impl; - std::unique_ptr pimpl_; + Impl* pimpl_; }; +struct mscclppBootstrapHandle +{ + uint64_t magic; + union mscclppSocketAddress addr; +}; mscclppResult_t bootstrapNetInit(const char* ip_port_pair = NULL); mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle); mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot = true, diff --git 
a/src/include/mscclpp.h b/src/include/mscclpp.h index 6ad587fb..0e7f76e5 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -251,7 +251,7 @@ typedef enum class Bootstrap { public: Bootstrap(){}; - virtual ~Bootstrap() = 0; + virtual ~Bootstrap() = default; virtual void Send(void* data, int size, int peer, int tag) = 0; virtual void Recv(void* data, int size, int peer, int tag) = 0; virtual void AllGather(void* allData, int size) = 0; diff --git a/src/init.cc b/src/init.cc index 1850f68c..7c3b76b9 100644 --- a/src/init.cc +++ b/src/init.cc @@ -926,6 +926,3 @@ MSCCLPP_API mscclppResult_t mscclppSetBootstrapConnTimeout(int timeout) config->setBootstrapConnectionTimeoutConfig(timeout); return mscclppSuccess; } - -Bootstrap::~Bootstrap() { -} diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc new file mode 100644 index 00000000..34137577 --- /dev/null +++ b/tests/bootstrap_test_cpp.cc @@ -0,0 +1,19 @@ +#include "bootstrap.h" + +#include + +#include + +int main() +{ + int rank, worldSize; + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &worldSize); + + std::shared_ptr bootstrap(new MscclppBootstrap("", rank, worldSize)); + // need to call initialization first + + MPI_Finalize(); + return 0; +} \ No newline at end of file From 35ade686ff502386f1bf09640ac12c31a26a8e8d Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 23 Apr 2023 14:47:07 +0000 Subject: [PATCH 040/135] IB in cpp style WIP --- src/communicator.cc | 16 +-- src/ib.cc | 191 +++++++++++++++++++++++++++-------- src/include/channel.hpp | 6 +- src/include/communicator.hpp | 4 +- src/include/ib.hpp | 61 +++++++++++ 5 files changed, 226 insertions(+), 52 deletions(-) create mode 100644 src/include/ib.hpp diff --git a/src/communicator.cc b/src/communicator.cc index 5a843c78..d12b20e4 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -1,3 +1,4 @@ +#include "mscclpp.hpp" #include "communicator.hpp" #include 
"host_connection.hpp" #include "comm.h" @@ -16,14 +17,14 @@ Communicator::Impl::~Impl() { MSCCLPP_API_CPP Communicator::~Communicator() = default; -mscclppTransport_t transportTypeToCStyle(TransportType type) { - switch (type) { - case TransportType::IB: +static mscclppTransport_t transportFlagsToCStyle(TransportFlags flags) { + switch (flags) { + case TransportIB: return mscclppTransportIB; - case TransportType::P2P: + case TransportCudaIpc: return mscclppTransportP2P; default: - throw std::runtime_error("Unknown transport type"); + throw std::runtime_error("Unsupported conversion"); } } @@ -45,9 +46,8 @@ MSCCLPP_API_CPP void Communicator::bootstrapBarrier() { mscclppBootstrapBarrier(pimpl->comm); } -MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, - TransportType transportType, const char* ibDev) { - mscclppConnectWithoutBuffer(pimpl->comm, remoteRank, tag, transportTypeToCStyle(transportType), ibDev); +MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, TransportFlags transportFlags, const char* ibDev) { + mscclppConnectWithoutBuffer(pimpl->comm, remoteRank, tag, transportFlagsToCStyle(transportFlags), ibDev); auto connIdx = pimpl->connections.size(); auto conn = std::make_shared(std::make_unique(this, &pimpl->comm->conns[connIdx])); pimpl->connections.push_back(conn); diff --git a/src/ib.cc b/src/ib.cc index bb574e21..4a094761 100644 --- a/src/ib.cc +++ b/src/ib.cc @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -9,48 +10,8 @@ #include "comm.h" #include "debug.h" #include "ib.h" - -static int getIbDevNumaNode(const char* ibDevPath) -{ - if (ibDevPath == NULL) { - WARN("ibDevPath is NULL"); - return -1; - } - const char* postfix = "/device/numa_node"; - FILE* fp = NULL; - char* filePath = NULL; - int node = -1; - int res; - if (mscclppCalloc(&filePath, strlen(ibDevPath) + strlen(postfix) + 1) != mscclppSuccess) { - WARN("mscclppCalloc failed"); - goto exit; - } - 
memcpy(filePath, ibDevPath, strlen(ibDevPath) * sizeof(char)); - filePath[strlen(ibDevPath)] = '\0'; - if (strncat(filePath, postfix, strlen(postfix)) == NULL) { - WARN("strncat failed"); - goto exit; - } - fp = fopen(filePath, "r"); - if (fp == NULL) { - WARN("fopen failed (errno %d, path %s)", errno, filePath); - goto exit; - } - res = fscanf(fp, "%d", &node); - if (res != 1) { - WARN("fscanf failed (errno %d, path %s)", errno, filePath); - node = -1; - goto exit; - } -exit: - if (filePath != NULL) { - free(filePath); - } - if (fp != NULL) { - fclose(fp); - } - return node; -} +#include "ib.hpp" +#include "checks.hpp" mscclppResult_t mscclppIbContextCreate(struct mscclppIbContext** ctx, const char* ibDevName) { @@ -400,3 +361,149 @@ int mscclppIbQp::pollCq() { return ibv_poll_cq(this->cq, MSCCLPP_IB_CQ_POLL_NUM, this->wcs); } + +namespace mscclpp { + +IbQp::IbQp(void* ctx, void* pd, int port) +{ + struct ibv_context* _ctx = static_cast(ctx); + struct ibv_pd* _pd = static_cast(pd); + + this->cq = ibv_create_cq(_ctx, MSCCLPP_IB_CQ_SIZE, nullptr, nullptr, 0); + if (this->cq == nullptr) { + std::stringstream err; + err << "ibv_create_cq failed (errno " << errno << ")"; + throw std::runtime_error(err.str()); + } + + struct ibv_qp_init_attr qpInitAttr; + std::memset(&qpInitAttr, 0, sizeof(qpInitAttr)); + qpInitAttr.sq_sig_all = 0; + qpInitAttr.send_cq = static_cast(this->cq); + qpInitAttr.recv_cq = static_cast(this->cq); + qpInitAttr.qp_type = IBV_QPT_RC; + qpInitAttr.cap.max_send_wr = MAXCONNECTIONS * MSCCLPP_PROXY_FIFO_SIZE; + qpInitAttr.cap.max_recv_wr = MAXCONNECTIONS * MSCCLPP_PROXY_FIFO_SIZE; + qpInitAttr.cap.max_send_sge = 1; + qpInitAttr.cap.max_recv_sge = 1; + qpInitAttr.cap.max_inline_data = 0; + + struct ibv_qp* _qp = ibv_create_qp(_pd, &qpInitAttr); + if (_qp == nullptr) { + std::stringstream err; + err << "ibv_create_qp failed (errno " << errno << ")"; + throw std::runtime_error(err.str()); + } + + struct ibv_port_attr portAttr; + if (ibv_query_port(_ctx, 
port, &portAttr) != 0) { + std::stringstream err; + err << "ibv_query_port failed (errno " << errno << ")"; + throw std::runtime_error(err.str()); + } + this->info.lid = portAttr.lid; + this->info.port = port; + this->info.linkLayer = portAttr.link_layer; + this->info.qpn = _qp->qp_num; + this->info.mtu = portAttr.active_mtu; + if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND) { + union ibv_gid gid; + if (ibv_query_gid(_ctx, port, 0, &gid) != 0) { + std::stringstream err; + err << "ibv_query_gid failed (errno " << errno << ")"; + throw std::runtime_error(err.str()); + } + this->info.spn = gid.global.subnet_prefix; + } + + struct ibv_qp_attr qpAttr; + memset(&qpAttr, 0, sizeof(qpAttr)); + qpAttr.qp_state = IBV_QPS_INIT; + qpAttr.pkey_index = 0; + qpAttr.port_num = port; + qpAttr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; + if (ibv_modify_qp(_qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS) != 0) { + std::stringstream err; + err << "ibv_modify_qp failed (errno " << errno << ")"; + throw std::runtime_error(err.str()); + } + this->qp = _qp; +} + +IbCtx::IbCtx(const std::string& ibDevName) +{ + int num; + struct ibv_device** devices = ibv_get_device_list(&num); + for (int i = 0; i < num; ++i) { + if (std::string(devices[i]->name) == ibDevName) { + this->ctx = ibv_open_device(devices[i]); + break; + } + } + ibv_free_device_list(devices); + if (this->ctx == nullptr) { + std::stringstream err; + err << "ibv_open_device failed (errno " << errno << ", device name << " << ibDevName << ")"; + throw std::runtime_error(err.str()); + } + this->pd = ibv_alloc_pd(static_cast(this->ctx)); + if (this->pd == nullptr) { + std::stringstream err; + err << "ibv_alloc_pd failed (errno " << errno << ")"; + throw std::runtime_error(err.str()); + } +} + +IbCtx::~IbCtx() +{ + if (this->pd != nullptr) { + ibv_dealloc_pd(static_cast(this->pd)); + } + if (this->ctx != nullptr) { + ibv_close_device(static_cast(this->ctx)); + } +} + 
+bool IbCtx::isPortUsable(int port) const +{ + struct ibv_port_attr portAttr; + if (ibv_query_port(static_cast(this->ctx), port, &portAttr) != 0) { + std::stringstream err; + err << "ibv_query_port failed (errno " << errno << ", port << " << port << ")"; + throw std::runtime_error(err.str()); + } + return portAttr.state == IBV_PORT_ACTIVE && (portAttr.link_layer == IBV_LINK_LAYER_ETHERNET || + portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND); +} + +int IbCtx::getAnyActivePort() const +{ + struct ibv_device_attr devAttr; + if (ibv_query_device(static_cast(this->ctx), &devAttr) != 0) { + std::stringstream err; + err << "ibv_query_device failed (errno " << errno << ")"; + throw std::runtime_error(err.str()); + } + for (uint8_t port = 1; port <= devAttr.phys_port_cnt; ++port) { + if (this->isPortUsable(port)) { + return port; + } + } + return -1; +} + +IbQp* IbCtx::createQp(int port /*=-1*/) +{ + if (port == -1) { + port = this->getAnyActivePort(); + if (port == -1) { + throw std::runtime_error("No active port found"); + } + } else if (!this->isPortUsable(port)) { + throw std::runtime_error("invalid IB port: " + std::to_string(port)); + } + qps.emplace_back(new IbQp(this->ctx, this->pd, port)); + return qps.back().get(); +} + +} // namespace mscclpp diff --git a/src/include/channel.hpp b/src/include/channel.hpp index cb1931b0..10a5f601 100644 --- a/src/include/channel.hpp +++ b/src/include/channel.hpp @@ -2,6 +2,7 @@ #define MSCCLPP_CHANNEL_HPP_ #include "mscclpp.hpp" +#include "epoch.hpp" #include "proxy.hpp" namespace mscclpp { @@ -88,7 +89,7 @@ public: ~HostConnection(); - void write() + void write(); int getId(); @@ -293,3 +294,6 @@ struct SimpleDeviceConnection { BufferHandle src; }; +} // namespace mscclpp + +#endif // MSCCLPP_CHANNEL_HPP_ diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index 8294eeb6..f2816c1a 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -3,6 +3,8 @@ #include "mscclpp.hpp" #include 
"mscclpp.h" +#include "channel.hpp" +#include "proxy.hpp" namespace mscclpp { @@ -20,4 +22,4 @@ struct Communicator::Impl { } // namespace mscclpp -#endif \ No newline at end of file +#endif // MSCCL_COMMUNICATOR_HPP_ diff --git a/src/include/ib.hpp b/src/include/ib.hpp new file mode 100644 index 00000000..4c58cfdc --- /dev/null +++ b/src/include/ib.hpp @@ -0,0 +1,61 @@ +#ifndef MSCCLPP_IB_HPP_ +#define MSCCLPP_IB_HPP_ + +#include +#include +#include + +namespace mscclpp { + +// QP info to be shared with the remote peer +struct IbQpInfo +{ + uint16_t lid; + uint8_t port; + uint8_t linkLayer; + uint32_t qpn; + uint64_t spn; + uint32_t mtu; +}; + +class IbQp +{ +public: + ~IbQp(); + + IbQpInfo info; + +private: + IbQp(void* ctx, void* pd, int port); + + void* qp; + void* cq; + void* wcs; + void* wrs; + void* sges; + int wrn; + + friend class IbCtx; +}; + + +class IbCtx +{ +public: + IbCtx(const std::string& ibDevName); + ~IbCtx(); + + IbQp* createQp(int port = -1); + +private: + bool IbCtx::isPortUsable(int port) const; + int IbCtx::getAnyActivePort() const; + + void* ctx; + void* pd; + std::list> qps; +}; + +} // namespace mscclpp + +#endif // MSCCLPP_IB_HPP_ From a9cfb82fcb20fb79774273f1d5a4f92504ca94d4 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 24 Apr 2023 05:58:11 +0000 Subject: [PATCH 041/135] wip --- src/bootstrap/bootstrap.cc | 136 +++++++++++++++++++++++++----------- src/include/bootstrap.h | 8 +-- tests/bootstrap_test_cpp.cc | 8 ++- 3 files changed, 105 insertions(+), 47 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index e54b38e8..b12afb61 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -59,20 +59,24 @@ struct extInfo class MscclppBootstrap::Impl { public: - Impl(std::string ipPortPair, int rank, int nRanks, const UniqueId uniqueId); + Impl(int rank, int nRanks); ~Impl(); - mscclppResult_t initialize(); + void initialize(const UniqueId uniqueId); + void initialize(std::string 
ipPortPair); + mscclppResult_t establishConnections(); + UniqueId getUniqueId(); mscclppResult_t allGather(void* allData, int size); mscclppResult_t send(void* data, int size, int peer, int tag); mscclppResult_t recv(void* data, int size, int peer, int tag); mscclppResult_t barrier(); mscclppResult_t close(); - static UniqueId uniqueId_; + UniqueId uniqueId_; private: int rank_; int nRanks_; + bool netInitialized; mscclppSocket listenSock_; mscclppSocket ringRecvSocket_; mscclppSocket ringSendSocket_; @@ -95,37 +99,67 @@ private: mscclppResult_t netInit(std::string ipPortPair); }; -UniqueId MscclppBootstrap::Impl::uniqueId_; +// UniqueId MscclppBootstrap::Impl::uniqueId_; -MscclppBootstrap::Impl::Impl(std::string ipPortPair, int rank, int nRanks, const UniqueId uniqueId) - : rank_(rank), nRanks_(nRanks), peerCommAddresses_(nRanks, mscclppSocketAddress()), +MscclppBootstrap::Impl::Impl(int rank, int nRanks) + : rank_(rank), nRanks_(nRanks), netInitialized(false), peerCommAddresses_(nRanks, mscclppSocketAddress()), peerProxyAddresses_(nRanks, mscclppSocketAddress()), abortFlag_(nullptr) +{ +} + +UniqueId MscclppBootstrap::Impl::getUniqueId() +{ + UniqueId uniqueId; + auto ret = netInit(""); + if (ret != mscclppSuccess) { + throw std::runtime_error("Failed to initialize network"); + } + ret = getRandomData(&uniqueId.magic, sizeof(uniqueId.magic)); + if (ret != mscclppSuccess) { + throw std::runtime_error("getting random data failed"); + } + std::memcpy(&uniqueId.addr, &netIfAddr_, sizeof(union mscclppSocketAddress)); + + return uniqueId; +} + +void MscclppBootstrap::Impl::initialize(const UniqueId uniqueId) +{ + int ret = netInit(""); + if (ret != mscclppSuccess) { + throw std::runtime_error("Failed to initialize network"); + } + + uniqueId_.magic = uniqueId.magic; + uniqueId_.addr = uniqueId.addr; + + if (rank_ == 0) { + rootThread_ = std::thread(&MscclppBootstrap::Impl::bootstrapRoot, this); + } + + ret = establishConnections(); + if (ret != mscclppSuccess) { + 
throw std::runtime_error("Failed to establish connections"); + } +} + +void MscclppBootstrap::Impl::initialize(std::string ipPortPair) { int ret = netInit(ipPortPair); if (ret != mscclppSuccess) { throw std::runtime_error("Failed to initialize network"); } - UniqueId zeroId; - std::memset(&zeroId, 0, sizeof(UniqueId)); - if (std::memcmp(&uniqueId, &zeroId, sizeof(UniqueId)) != 0) { - uniqueId_.magic = uniqueId.magic; - uniqueId_.addr = uniqueId.addr; - return; - } - - if (!ipPortPair.empty()) { - uniqueId_.magic = 0xdeadbeef; - } else { - mscclppResult_t ret = getRandomData(&uniqueId_.magic, sizeof(uniqueId_.magic)); - if (ret != mscclppSuccess) { - throw std::runtime_error("getting random data failed"); - } - } + uniqueId_.magic = 0xdeadbeef; std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(union mscclppSocketAddress)); if (rank_ == 0) { rootThread_ = std::thread(&MscclppBootstrap::Impl::bootstrapRoot, this); } + + ret = establishConnections(); + if (ret != mscclppSuccess) { + throw std::runtime_error("Failed to establish connections"); + } } MscclppBootstrap::Impl::~Impl() @@ -145,33 +179,39 @@ mscclppResult_t MscclppBootstrap::Impl::getRemoteAddresses(mscclppSocket* listen mscclppResult_t res = mscclppSuccess; mscclppSocketAddress zero; + printf("hh 0\n"); std::memset(&zero, 0, sizeof(mscclppSocketAddress)); res = mscclppSocketInit(&sock); if (res != mscclppSuccess) { WARN("Bootstrap Root : mscclppSocketInit failed"); return res; } + printf("hh 1\n"); res = mscclppSocketAccept(&sock, listenSock); if (res != mscclppSuccess) { WARN("Bootstrap Root : mscclppSocketAccept failed"); return res; } + printf("hh 2\n"); res = netRecv(&sock, &info, sizeof(info)); if (res != mscclppSuccess) { WARN("Bootstrap Root : netRecv failed"); return res; } + printf("hh 3\n"); res = mscclppSocketClose(&sock); if (res != mscclppSuccess) { WARN("Bootstrap Root : mscclppSocketClose failed"); return res; } + printf("hh 4\n"); if (this->nRanks_ != info.nRanks) { WARN("Bootstrap Root : 
mismatch in rank count from procs %d : %d", this->nRanks_, info.nRanks); return res; } + printf("hh 5\n"); if (std::memcmp(&zero, &rankAddressesRoot[info.rank], sizeof(mscclppSocketAddress)) != 0) { WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, this->nRanks_); return res; @@ -216,6 +256,7 @@ mscclppResult_t MscclppBootstrap::Impl::sendHandleToPeer(int peer, mscclppResult_t MscclppBootstrap::Impl::bootstrapRoot() { + printf("I am here0 magic %x\n", uniqueId_.magic); mscclppResult_t res = mscclppSuccess; int numCollected = 0; std::vector rankAddresses(this->nRanks_, mscclppSocketAddress()); @@ -226,16 +267,20 @@ mscclppResult_t MscclppBootstrap::Impl::bootstrapRoot() std::memset(rankAddressesRoot.data(), 0, sizeof(mscclppSocketAddress) * this->nRanks_); setFilesLimit(); + printf("I am here1 %x\n", uniqueId_.magic); mscclppSocket listenSock; MSCCLPPCHECK( mscclppSocketInit(&listenSock, &uniqueId_.addr, uniqueId_.magic, mscclppSocketTypeBootstrap, nullptr, 0)); MSCCLPPCHECK(mscclppSocketListen(&listenSock)); + printf("I am here2\n"); TRACE(MSCCLPP_INIT, "BEGIN"); + printf("I am here3\n"); /* Receive addresses from all ranks */ do { int rank; res = getRemoteAddresses(&listenSock, rankAddresses, rankAddressesRoot, rank); + printf("I am here4\n"); if (res != mscclppSuccess) { WARN("Bootstrap Root : getRemoteAddresses failed"); break; @@ -262,6 +307,8 @@ mscclppResult_t MscclppBootstrap::Impl::bootstrapRoot() mscclppResult_t MscclppBootstrap::Impl::netInit(std::string ipPortPair) { + if (netInitialized) + return mscclppSuccess; if (!ipPortPair.empty()) { union mscclppSocketAddress remoteAddr; if (mscclppSocketGetAddrFromString(&remoteAddr, ipPortPair.c_str()) != mscclppSuccess) { @@ -284,17 +331,18 @@ mscclppResult_t MscclppBootstrap::Impl::netInit(std::string ipPortPair) std::sprintf(line, " %s:", netIfName_); mscclppSocketToString(&netIfAddr_, line + strlen(line)); INFO(MSCCLPP_INIT, "Bootstrap : Using%s", line); + netInitialized = true; 
return mscclppSuccess; } -mscclppResult_t MscclppBootstrap::Impl::initialize() +mscclppResult_t MscclppBootstrap::Impl::establishConnections() { mscclppSocket* proxySocket; mscclppSocketAddress nextAddr; mscclppSocket sock, listenSockRoot; extInfo info; - TRACE(MSCCLPP_INIT, "rank %d nranks %d", rank, nranks); + TRACE(MSCCLPP_INIT, "rank %d nranks %d", rank_, nRanks_); info.rank = this->rank_; info.nRanks = this->nRanks_; @@ -322,11 +370,21 @@ mscclppResult_t MscclppBootstrap::Impl::initialize() randomSleep(this->rank_); } + + char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2]; + std::sprintf(line, " %s:", netIfName_); + mscclppSocketToString(&this->uniqueId_.addr, line + strlen(line)); + + printf("tt 1 %s\n", line); // send info on my listening socket to root MSCCLPPCHECK(mscclppSocketInit(&sock, &this->uniqueId_.addr, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); + printf("tt 2\n"); MSCCLPPCHECK(mscclppSocketConnect(&sock)); + printf("tt 3\n"); MSCCLPPCHECK(netSend(&sock, &info, sizeof(info))); + printf("tt 4\n"); MSCCLPPCHECK(mscclppSocketClose(&sock)); + printf("tt 5\n"); // get info on my "next" rank in the bootstrap ring from root MSCCLPPCHECK(mscclppSocketInit(&sock)); @@ -353,7 +411,7 @@ mscclppResult_t MscclppBootstrap::Impl::initialize() MSCCLPPCHECK(mscclppSocketGetAddr(proxySocket, &this->peerProxyAddresses_[rank_])); MSCCLPPCHECK(allGather(this->peerProxyAddresses_.data(), sizeof(union mscclppSocketAddress))); - TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank, nranks); + TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank_, nRanks_); return mscclppSuccess; } @@ -380,7 +438,7 @@ mscclppResult_t MscclppBootstrap::Impl::allGather(void* allData, int size) MSCCLPPCHECK(netRecv(&this->ringRecvSocket_, data + rSlice * size, size)); } - TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); + TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nRanks, size); return mscclppSuccess; } @@ -432,23 +490,15 @@ 
mscclppResult_t MscclppBootstrap::Impl::close() return mscclppSuccess; } -MscclppBootstrap::MscclppBootstrap(std::string ipPortPair, int rank, int nRanks) +MscclppBootstrap::MscclppBootstrap(int rank, int nRanks) { - UniqueId uniqueId; - std::memset(&uniqueId, 0, sizeof(uniqueId)); // pimpl_ = std::make_unique(ipPortPair, rank, nRanks, uniqueId); - pimpl_ = new Impl(ipPortPair, rank, nRanks, uniqueId); -} - -MscclppBootstrap::MscclppBootstrap(UniqueId uniqueId, int rank, int nRanks) -{ - pimpl_ = new Impl("", rank, nRanks, uniqueId); - // pimpl_ = std::make_unique("", rank, nRanks, uniqueId); + pimpl_ = new Impl(rank, nRanks); } UniqueId MscclppBootstrap::GetUniqueId() { - return Impl::uniqueId_; + return pimpl_->getUniqueId(); } void MscclppBootstrap::Send(void* data, int size, int peer, int tag) @@ -475,12 +525,14 @@ void MscclppBootstrap::AllGather(void* allData, int size) } } -void MscclppBootstrap::Initialize() +void MscclppBootstrap::Initialize(const UniqueId uniqueId) { - mscclppResult_t res = pimpl_->initialize(); - if (res != mscclppSuccess) { - throw std::runtime_error("MscclppBootstrap::Initialize failed"); - } + pimpl_->initialize(uniqueId); +} + +void MscclppBootstrap::Initialize(std::string ipPortPair) +{ + pimpl_->initialize(ipPortPair); } void MscclppBootstrap::Barrier() diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 175981e4..a76c99b7 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -17,13 +17,13 @@ static_assert(sizeof(UniqueId) <= sizeof(mscclppUniqueId), class __attribute__((visibility("default"))) MscclppBootstrap : public Bootstrap { public: - MscclppBootstrap(std::string ipPortPair, int rank, int nRanks); - MscclppBootstrap(UniqueId uniqueId, int rank, int nRanks); + MscclppBootstrap(int rank, int nRanks); ~MscclppBootstrap() override = default; - static UniqueId GetUniqueId(); + UniqueId GetUniqueId(); - void Initialize(); + void Initialize(const UniqueId uniqueId); + void Initialize(std::string 
ipPortPair); void Send(void* data, int size, int peer, int tag) override; void Recv(void* data, int size, int peer, int tag) override; void AllGather(void* allData, int size) override; diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index 34137577..8e3b1e87 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -11,7 +11,13 @@ int main() MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &worldSize); - std::shared_ptr bootstrap(new MscclppBootstrap("", rank, worldSize)); + std::shared_ptr bootstrap(new MscclppBootstrap(rank, worldSize)); + // bootstrap->Initialize("costsim-dev-00000A:50000"); + UniqueId id; + if (rank == 0) + id = bootstrap->GetUniqueId(); + MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + bootstrap->Initialize(id); // need to call initialization first MPI_Finalize(); From 6f4dc57331a9ea6f33fe387972a55024baf3ed50 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 24 Apr 2023 07:45:01 +0000 Subject: [PATCH 042/135] fixed --- src/bootstrap/bootstrap.cc | 5 +++++ tests/bootstrap_test_cpp.cc | 12 ++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index b12afb61..97a68634 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -152,6 +152,11 @@ void MscclppBootstrap::Impl::initialize(std::string ipPortPair) uniqueId_.magic = 0xdeadbeef; std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(union mscclppSocketAddress)); + ret = mscclppSocketGetAddrFromString(&uniqueId_.addr, ipPortPair.c_str()); + if (ret != mscclppSuccess) { + throw std::runtime_error("Failed to get address from string"); + } + if (rank_ == 0) { rootThread_ = std::thread(&MscclppBootstrap::Impl::bootstrapRoot, this); } diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index 8e3b1e87..8a170ccd 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -12,12 +12,12 @@ int main() 
MPI_Comm_size(MPI_COMM_WORLD, &worldSize); std::shared_ptr bootstrap(new MscclppBootstrap(rank, worldSize)); - // bootstrap->Initialize("costsim-dev-00000A:50000"); - UniqueId id; - if (rank == 0) - id = bootstrap->GetUniqueId(); - MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->Initialize(id); + bootstrap->Initialize("costsim-dev-00000A:50000"); + // UniqueId id; + // if (rank == 0) + // id = bootstrap->GetUniqueId(); + // MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + // bootstrap->Initialize(id); // need to call initialization first MPI_Finalize(); From f0f058410a97cf0a2d22f9b1acd615a352573570 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 24 Apr 2023 19:25:06 +0000 Subject: [PATCH 043/135] working bootstrap initialization --- src/bootstrap/bootstrap.cc | 66 ++++++++++++++++++------------------- tests/bootstrap_test_cpp.cc | 12 +++---- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 97a68634..8b20a472 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -91,7 +91,8 @@ private: static mscclppResult_t netSend(mscclppSocket* sock, const void* data, int size); static mscclppResult_t netRecv(mscclppSocket* sock, void* data, int size); - mscclppResult_t bootstrapRoot(); + void bootstrapCreateRoot(); + mscclppResult_t bootstrapRoot(mscclppSocket listenSock); mscclppResult_t getRemoteAddresses(mscclppSocket* listenSock, std::vector& rankAddresses, std::vector& rankAddressesRoot, int& rank); mscclppResult_t sendHandleToPeer(int peer, const std::vector& rankAddresses, @@ -109,18 +110,18 @@ MscclppBootstrap::Impl::Impl(int rank, int nRanks) UniqueId MscclppBootstrap::Impl::getUniqueId() { - UniqueId uniqueId; auto ret = netInit(""); if (ret != mscclppSuccess) { throw std::runtime_error("Failed to initialize network"); } - ret = getRandomData(&uniqueId.magic, sizeof(uniqueId.magic)); + ret = getRandomData(&uniqueId_.magic, 
sizeof(uniqueId_.magic)); if (ret != mscclppSuccess) { throw std::runtime_error("getting random data failed"); } - std::memcpy(&uniqueId.addr, &netIfAddr_, sizeof(union mscclppSocketAddress)); + std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(union mscclppSocketAddress)); + bootstrapCreateRoot(); - return uniqueId; + return uniqueId_; } void MscclppBootstrap::Impl::initialize(const UniqueId uniqueId) @@ -132,10 +133,7 @@ void MscclppBootstrap::Impl::initialize(const UniqueId uniqueId) uniqueId_.magic = uniqueId.magic; uniqueId_.addr = uniqueId.addr; - - if (rank_ == 0) { - rootThread_ = std::thread(&MscclppBootstrap::Impl::bootstrapRoot, this); - } + // printf("addr = %s port = %d\n", inet_ntoa(uniqueId_.addr.sin.sin_addr), (int)ntohs(uniqueId_.addr.sin.sin_port)); ret = establishConnections(); if (ret != mscclppSuccess) { @@ -158,7 +156,7 @@ void MscclppBootstrap::Impl::initialize(std::string ipPortPair) } if (rank_ == 0) { - rootThread_ = std::thread(&MscclppBootstrap::Impl::bootstrapRoot, this); + bootstrapCreateRoot(); } ret = establishConnections(); @@ -184,39 +182,33 @@ mscclppResult_t MscclppBootstrap::Impl::getRemoteAddresses(mscclppSocket* listen mscclppResult_t res = mscclppSuccess; mscclppSocketAddress zero; - printf("hh 0\n"); std::memset(&zero, 0, sizeof(mscclppSocketAddress)); res = mscclppSocketInit(&sock); if (res != mscclppSuccess) { WARN("Bootstrap Root : mscclppSocketInit failed"); return res; } - printf("hh 1\n"); res = mscclppSocketAccept(&sock, listenSock); if (res != mscclppSuccess) { WARN("Bootstrap Root : mscclppSocketAccept failed"); return res; } - printf("hh 2\n"); res = netRecv(&sock, &info, sizeof(info)); if (res != mscclppSuccess) { WARN("Bootstrap Root : netRecv failed"); return res; } - printf("hh 3\n"); res = mscclppSocketClose(&sock); if (res != mscclppSuccess) { WARN("Bootstrap Root : mscclppSocketClose failed"); return res; } - printf("hh 4\n"); if (this->nRanks_ != info.nRanks) { WARN("Bootstrap Root : mismatch in rank count 
from procs %d : %d", this->nRanks_, info.nRanks); return res; } - printf("hh 5\n"); if (std::memcmp(&zero, &rankAddressesRoot[info.rank], sizeof(mscclppSocketAddress)) != 0) { WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, this->nRanks_); return res; @@ -259,9 +251,31 @@ mscclppResult_t MscclppBootstrap::Impl::sendHandleToPeer(int peer, return mscclppSuccess; } -mscclppResult_t MscclppBootstrap::Impl::bootstrapRoot() +void MscclppBootstrap::Impl::bootstrapCreateRoot() +{ + mscclppSocket listenSock; + + // mscclppSocket* listenSock = new mscclppSocket(); // TODO(saemal) make this a shared ptr + auto ret = mscclppSocketInit(&listenSock, &uniqueId_.addr, uniqueId_.magic, mscclppSocketTypeBootstrap, nullptr, 0); + if (ret != mscclppSuccess) { + throw std::runtime_error("Failed to initialize socket"); + } + ret = mscclppSocketListen(&listenSock); + if (ret != mscclppSuccess) { + throw std::runtime_error("Failed to listen on socket"); + } + ret = mscclppSocketGetAddr(&listenSock, &uniqueId_.addr); + if (ret != mscclppSuccess) { + throw std::runtime_error("Failed to get socket address"); + } + auto lambda = [this, listenSock]() { + this->bootstrapRoot(listenSock); + }; + rootThread_ = std::thread(lambda); +} + +mscclppResult_t MscclppBootstrap::Impl::bootstrapRoot(mscclppSocket listenSock) { - printf("I am here0 magic %x\n", uniqueId_.magic); mscclppResult_t res = mscclppSuccess; int numCollected = 0; std::vector rankAddresses(this->nRanks_, mscclppSocketAddress()); @@ -272,20 +286,11 @@ mscclppResult_t MscclppBootstrap::Impl::bootstrapRoot() std::memset(rankAddressesRoot.data(), 0, sizeof(mscclppSocketAddress) * this->nRanks_); setFilesLimit(); - printf("I am here1 %x\n", uniqueId_.magic); - mscclppSocket listenSock; - MSCCLPPCHECK( - mscclppSocketInit(&listenSock, &uniqueId_.addr, uniqueId_.magic, mscclppSocketTypeBootstrap, nullptr, 0)); - MSCCLPPCHECK(mscclppSocketListen(&listenSock)); - printf("I am here2\n"); - TRACE(MSCCLPP_INIT, 
"BEGIN"); - printf("I am here3\n"); /* Receive addresses from all ranks */ do { int rank; res = getRemoteAddresses(&listenSock, rankAddresses, rankAddressesRoot, rank); - printf("I am here4\n"); if (res != mscclppSuccess) { WARN("Bootstrap Root : getRemoteAddresses failed"); break; @@ -380,16 +385,11 @@ mscclppResult_t MscclppBootstrap::Impl::establishConnections() std::sprintf(line, " %s:", netIfName_); mscclppSocketToString(&this->uniqueId_.addr, line + strlen(line)); - printf("tt 1 %s\n", line); // send info on my listening socket to root MSCCLPPCHECK(mscclppSocketInit(&sock, &this->uniqueId_.addr, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); - printf("tt 2\n"); MSCCLPPCHECK(mscclppSocketConnect(&sock)); - printf("tt 3\n"); MSCCLPPCHECK(netSend(&sock, &info, sizeof(info))); - printf("tt 4\n"); MSCCLPPCHECK(mscclppSocketClose(&sock)); - printf("tt 5\n"); // get info on my "next" rank in the bootstrap ring from root MSCCLPPCHECK(mscclppSocketInit(&sock)); @@ -771,7 +771,7 @@ mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union mscclppSocketAddress)); MSCCLPPCHECK(bootstrapCreateRoot(handle)); } - // printf("addr = %s port = %d\n", inet_ntoa(handle->addr.sin.sin_addr), (int)ntohs(handle->addr.sin.sin_port)); + printf("addr = %s port = %d\n", inet_ntoa(handle->addr.sin.sin_addr), (int)ntohs(handle->addr.sin.sin_port)); // printf("addr = %s\n", inet_ntoa((*(struct sockaddr_in*)&handle->addr.sa).sin_addr)); return mscclppSuccess; diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index 8a170ccd..8e3b1e87 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -12,12 +12,12 @@ int main() MPI_Comm_size(MPI_COMM_WORLD, &worldSize); std::shared_ptr bootstrap(new MscclppBootstrap(rank, worldSize)); - bootstrap->Initialize("costsim-dev-00000A:50000"); - // UniqueId id; - // if (rank == 0) - // id = bootstrap->GetUniqueId(); - // 
MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - // bootstrap->Initialize(id); + // bootstrap->Initialize("costsim-dev-00000A:50000"); + UniqueId id; + if (rank == 0) + id = bootstrap->GetUniqueId(); + MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + bootstrap->Initialize(id); // need to call initialization first MPI_Finalize(); From 27114d91fb92d7a814e8645ad5f5db14f6fc58db Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 24 Apr 2023 21:50:03 +0000 Subject: [PATCH 044/135] bootstrap tests pass --- src/bootstrap/bootstrap.cc | 53 ++++++++++++++++++++++++++----------- src/include/bootstrap.h | 3 +-- tests/bootstrap_test_cpp.cc | 35 +++++++++++++++++++++++- 3 files changed, 73 insertions(+), 18 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 8b20a472..b700654d 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -41,11 +41,11 @@ enum bootstrapInterface_t dontCareIf = -2 }; -struct unexpectedConn +struct unexpectedMsg { int peer; int tag; - struct mscclppSocket sock; + std::shared_ptr sock; }; struct extInfo @@ -81,8 +81,8 @@ private: mscclppSocket ringRecvSocket_; mscclppSocket ringSendSocket_; std::vector peerCommAddresses_; - std::vector peerProxyAddresses_; - std::queue unexpectedConnections_; + std::list unexpectedMessages_; + std::vector barrierArr_; volatile uint32_t* abortFlag_; std::thread rootThread_; char netIfName_[MAX_IF_NAME_SIZE + 1]; @@ -104,7 +104,7 @@ private: MscclppBootstrap::Impl::Impl(int rank, int nRanks) : rank_(rank), nRanks_(nRanks), netInitialized(false), peerCommAddresses_(nRanks, mscclppSocketAddress()), - peerProxyAddresses_(nRanks, mscclppSocketAddress()), abortFlag_(nullptr) + barrierArr_(nRanks, 0), abortFlag_(nullptr) { } @@ -347,7 +347,6 @@ mscclppResult_t MscclppBootstrap::Impl::netInit(std::string ipPortPair) mscclppResult_t MscclppBootstrap::Impl::establishConnections() { - mscclppSocket* proxySocket; mscclppSocketAddress nextAddr; 
mscclppSocket sock, listenSockRoot; extInfo info; @@ -409,15 +408,7 @@ mscclppResult_t MscclppBootstrap::Impl::establishConnections() MSCCLPPCHECK(mscclppSocketGetAddr(&this->listenSock_, &this->peerCommAddresses_[rank_])); MSCCLPPCHECK(allGather(this->peerCommAddresses_.data(), sizeof(union mscclppSocketAddress))); - // proxy is aborted through a message; don't set abortFlag - MSCCLPPCHECK(mscclppCalloc(&proxySocket, 1)); - MSCCLPPCHECK(mscclppSocketInit(proxySocket, &netIfAddr_, magic, mscclppSocketTypeProxy, this->abortFlag_)); - MSCCLPPCHECK(mscclppSocketListen(proxySocket)); - MSCCLPPCHECK(mscclppSocketGetAddr(proxySocket, &this->peerProxyAddresses_[rank_])); - MSCCLPPCHECK(allGather(this->peerProxyAddresses_.data(), sizeof(union mscclppSocketAddress))); - TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank_, nRanks_); - return mscclppSuccess; } @@ -482,16 +473,48 @@ mscclppResult_t MscclppBootstrap::Impl::send(void* data, int size, int peer, int mscclppResult_t MscclppBootstrap::Impl::recv(void* data, int size, int peer, int tag) { + // search over all unexpected messages + for (auto it = unexpectedMessages_.begin(); it != unexpectedMessages_.end(); ++it){ + if (it->peer == peer && it->tag == tag){ + // found a match + MSCCLPPCHECK(netRecv(it->sock.get(), data, size)); + MSCCLPPCHECK(mscclppSocketClose(it->sock.get())); + unexpectedMessages_.erase(it); + return mscclppSuccess; + } + } + // didn't find one + while (true) { + auto sock = std::make_shared(); + int newPeer, newTag; + MSCCLPPCHECK(mscclppSocketInit(sock.get())); + MSCCLPPCHECK(mscclppSocketAccept(sock.get(), &this->listenSock_)); + MSCCLPPCHECK(netRecv(sock.get(), &newPeer, sizeof(int))); + MSCCLPPCHECK(netRecv(sock.get(), &newTag, sizeof(int))); + if (newPeer == peer && newTag == tag) { + MSCCLPPCHECK(netRecv(sock.get(), ((char*)data), size)); + MSCCLPPCHECK(mscclppSocketClose(sock.get())); + return mscclppSuccess; + } + // Unexpected message. Save for later. 
+ unexpectedMessages_.push_back({newPeer, newTag, sock}); + } + return mscclppSuccess; } mscclppResult_t MscclppBootstrap::Impl::barrier() { + MSCCLPPCHECK(allGather(barrierArr_.data(), sizeof(int))); return mscclppSuccess; } mscclppResult_t MscclppBootstrap::Impl::close() { + MSCCLPPCHECK(mscclppSocketClose(&this->listenSock_)); + MSCCLPPCHECK(mscclppSocketClose(&this->ringSendSocket_)); + MSCCLPPCHECK(mscclppSocketClose(&this->ringRecvSocket_)); + return mscclppSuccess; } @@ -548,7 +571,7 @@ void MscclppBootstrap::Barrier() } } -void MscclppBootstrap::Close() +MscclppBootstrap::~MscclppBootstrap() { mscclppResult_t res = pimpl_->close(); if (res != mscclppSuccess) { diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index a76c99b7..3b683707 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -18,7 +18,7 @@ class __attribute__((visibility("default"))) MscclppBootstrap : public Bootstrap { public: MscclppBootstrap(int rank, int nRanks); - ~MscclppBootstrap() override = default; + ~MscclppBootstrap(); UniqueId GetUniqueId(); @@ -28,7 +28,6 @@ public: void Recv(void* data, int size, int peer, int tag) override; void AllGather(void* allData, int size) override; void Barrier() override; - void Close(); private: class Impl; diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index 8e3b1e87..a810cb2f 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -18,7 +18,40 @@ int main() id = bootstrap->GetUniqueId(); MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); bootstrap->Initialize(id); - // need to call initialization first + + std::vector tmp(worldSize, 0); + tmp[rank] = rank+1; + bootstrap->AllGather(tmp.data(), sizeof(int)); + for (int i = 0; i < worldSize; i++){ + if (tmp[i] != i+1) + printf("error AllGather: rank %d: tmp[%d] = %d\n", rank, i, tmp[i]); + } + printf("rank %d: AllGather test passed!\n", rank); + + bootstrap->Barrier(); + printf("rank %d: Barrier test passed!\n", rank); + + 
for (int i = 0; i < worldSize; i++){ + if (i == rank) + continue; + int msg1 = (rank + 1)*2; + int msg2 = (rank + 1)*2+1; + bootstrap->Send(&msg1, sizeof(int), i, 0); + bootstrap->Send(&msg2, sizeof(int), i, 1); + } + + for (int i = 0; i < worldSize; i++){ + if (i == rank) + continue; + int msg1 = 0; + int msg2 = 0; + // recv them in the opposite order to check correctness + bootstrap->Recv(&msg2, sizeof(int), i, 1); + bootstrap->Recv(&msg1, sizeof(int), i, 0); + if (msg1 != (i+1)*2 || msg2 != (i+1)*2+1) + printf("error Send/Recv: rank %d: msg1 = %d, msg2 = %d\n", rank, msg1, msg2); + } + printf("rank %d: Send/Recv test passed!\n", rank); MPI_Finalize(); return 0; From d6e91338d4c9e6863a0c35a54986140fc367682a Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 24 Apr 2023 23:07:38 +0000 Subject: [PATCH 045/135] bootstrap tests pass --- src/bootstrap/bootstrap.cc | 336 +++++++++++++----------------------- src/include/bootstrap.h | 8 +- tests/bootstrap_test_cpp.cc | 2 +- 3 files changed, 124 insertions(+), 222 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index b700654d..b38d4b84 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -1,5 +1,6 @@ #include "bootstrap.h" #include "utils.h" +#include "checks.hpp" #include #include @@ -56,20 +57,20 @@ struct extInfo union mscclppSocketAddress extAddressListen; }; -class MscclppBootstrap::Impl +class mscclppBootstrap::Impl { public: Impl(int rank, int nRanks); ~Impl(); void initialize(const UniqueId uniqueId); void initialize(std::string ipPortPair); - mscclppResult_t establishConnections(); + void establishConnections(); UniqueId getUniqueId(); - mscclppResult_t allGather(void* allData, int size); - mscclppResult_t send(void* data, int size, int peer, int tag); - mscclppResult_t recv(void* data, int size, int peer, int tag); - mscclppResult_t barrier(); - mscclppResult_t close(); + void allGather(void* allData, int size); + void send(void* data, int size, int 
peer, int tag); + void recv(void* data, int size, int peer, int tag); + void barrier(); + void close(); UniqueId uniqueId_; @@ -88,170 +89,111 @@ private: char netIfName_[MAX_IF_NAME_SIZE + 1]; union mscclppSocketAddress netIfAddr_; - static mscclppResult_t netSend(mscclppSocket* sock, const void* data, int size); - static mscclppResult_t netRecv(mscclppSocket* sock, void* data, int size); + void netSend(mscclppSocket* sock, const void* data, int size); + void netRecv(mscclppSocket* sock, void* data, int size); void bootstrapCreateRoot(); - mscclppResult_t bootstrapRoot(mscclppSocket listenSock); - mscclppResult_t getRemoteAddresses(mscclppSocket* listenSock, std::vector& rankAddresses, + void bootstrapRoot(mscclppSocket listenSock); + void getRemoteAddresses(mscclppSocket* listenSock, std::vector& rankAddresses, std::vector& rankAddressesRoot, int& rank); - mscclppResult_t sendHandleToPeer(int peer, const std::vector& rankAddresses, + void sendHandleToPeer(int peer, const std::vector& rankAddresses, const std::vector& rankAddressesRoot); - mscclppResult_t netInit(std::string ipPortPair); + void netInit(std::string ipPortPair); }; // UniqueId MscclppBootstrap::Impl::uniqueId_; -MscclppBootstrap::Impl::Impl(int rank, int nRanks) +mscclppBootstrap::Impl::Impl(int rank, int nRanks) : rank_(rank), nRanks_(nRanks), netInitialized(false), peerCommAddresses_(nRanks, mscclppSocketAddress()), barrierArr_(nRanks, 0), abortFlag_(nullptr) { } -UniqueId MscclppBootstrap::Impl::getUniqueId() +UniqueId mscclppBootstrap::Impl::getUniqueId() { - auto ret = netInit(""); - if (ret != mscclppSuccess) { - throw std::runtime_error("Failed to initialize network"); - } - ret = getRandomData(&uniqueId_.magic, sizeof(uniqueId_.magic)); - if (ret != mscclppSuccess) { - throw std::runtime_error("getting random data failed"); - } + netInit(""); + MSCCLPPTHROW(getRandomData(&uniqueId_.magic, sizeof(uniqueId_.magic))); std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(union 
mscclppSocketAddress)); bootstrapCreateRoot(); return uniqueId_; } -void MscclppBootstrap::Impl::initialize(const UniqueId uniqueId) +void mscclppBootstrap::Impl::initialize(const UniqueId uniqueId) { - int ret = netInit(""); - if (ret != mscclppSuccess) { - throw std::runtime_error("Failed to initialize network"); - } + netInit(""); uniqueId_.magic = uniqueId.magic; uniqueId_.addr = uniqueId.addr; // printf("addr = %s port = %d\n", inet_ntoa(uniqueId_.addr.sin.sin_addr), (int)ntohs(uniqueId_.addr.sin.sin_port)); - ret = establishConnections(); - if (ret != mscclppSuccess) { - throw std::runtime_error("Failed to establish connections"); - } + establishConnections(); } -void MscclppBootstrap::Impl::initialize(std::string ipPortPair) +void mscclppBootstrap::Impl::initialize(std::string ipPortPair) { - int ret = netInit(ipPortPair); - if (ret != mscclppSuccess) { - throw std::runtime_error("Failed to initialize network"); - } + netInit(ipPortPair); uniqueId_.magic = 0xdeadbeef; std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(union mscclppSocketAddress)); - ret = mscclppSocketGetAddrFromString(&uniqueId_.addr, ipPortPair.c_str()); - if (ret != mscclppSuccess) { - throw std::runtime_error("Failed to get address from string"); - } + MSCCLPPTHROW(mscclppSocketGetAddrFromString(&uniqueId_.addr, ipPortPair.c_str())); if (rank_ == 0) { bootstrapCreateRoot(); } - ret = establishConnections(); - if (ret != mscclppSuccess) { - throw std::runtime_error("Failed to establish connections"); - } + establishConnections(); } -MscclppBootstrap::Impl::~Impl() +mscclppBootstrap::Impl::~Impl() { if (rootThread_.joinable()) { rootThread_.join(); } } -mscclppResult_t MscclppBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, +void mscclppBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, std::vector& rankAddresses, std::vector& rankAddressesRoot, int& rank) { mscclppSocket sock; extInfo info; - mscclppResult_t res = mscclppSuccess; mscclppSocketAddress zero; 
std::memset(&zero, 0, sizeof(mscclppSocketAddress)); - res = mscclppSocketInit(&sock); - if (res != mscclppSuccess) { - WARN("Bootstrap Root : mscclppSocketInit failed"); - return res; - } - res = mscclppSocketAccept(&sock, listenSock); - if (res != mscclppSuccess) { - WARN("Bootstrap Root : mscclppSocketAccept failed"); - return res; - } - res = netRecv(&sock, &info, sizeof(info)); - if (res != mscclppSuccess) { - WARN("Bootstrap Root : netRecv failed"); - return res; - } - res = mscclppSocketClose(&sock); - if (res != mscclppSuccess) { - WARN("Bootstrap Root : mscclppSocketClose failed"); - return res; - } + MSCCLPPTHROW(mscclppSocketInit(&sock)); + MSCCLPPTHROW(mscclppSocketAccept(&sock, listenSock)); + netRecv(&sock, &info, sizeof(info)); + MSCCLPPTHROW(mscclppSocketClose(&sock)); if (this->nRanks_ != info.nRanks) { - WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", this->nRanks_, info.nRanks); - return res; + throw std::runtime_error("Bootstrap Root : mismatch in rank count from procs " + std::to_string(this->nRanks_) + " : " + std::to_string(info.nRanks)); } if (std::memcmp(&zero, &rankAddressesRoot[info.rank], sizeof(mscclppSocketAddress)) != 0) { - WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, this->nRanks_); - return res; + throw std::runtime_error("Bootstrap Root : rank " + std::to_string(info.rank) + " of " + std::to_string(this->nRanks_) + " has already checked in"); } // Save the connection handle for that rank rankAddressesRoot[info.rank] = info.extAddressListenRoot; rankAddresses[info.rank] = info.extAddressListen; rank = info.rank; - return res; } -mscclppResult_t MscclppBootstrap::Impl::sendHandleToPeer(int peer, +void mscclppBootstrap::Impl::sendHandleToPeer(int peer, const std::vector& rankAddresses, const std::vector& rankAddressesRoot) { mscclppSocket sock; - mscclppResult_t res; int next = (peer + 1) % this->nRanks_; - res = mscclppSocketInit(&sock, &rankAddressesRoot[peer], 
this->uniqueId_.magic, mscclppSocketTypeBootstrap); - if (res != mscclppSuccess) { - WARN("Bootstrap Root : mscclppSocketInit failed"); - return res; - } - res = mscclppSocketConnect(&sock); - if (res != mscclppSuccess) { - WARN("Bootstrap Root : mscclppSocketConnect failed"); - return res; - } - res = netSend(&sock, &rankAddresses[next], sizeof(mscclppSocketAddress)); - if (res != mscclppSuccess) { - WARN("Bootstrap Root : netSend failed"); - return res; - } - res = mscclppSocketClose(&sock); - if (res != mscclppSuccess) { - WARN("Bootstrap Root : mscclppSocketClose failed"); - return res; - } - return mscclppSuccess; + MSCCLPPTHROW(mscclppSocketInit(&sock, &rankAddressesRoot[peer], this->uniqueId_.magic, mscclppSocketTypeBootstrap)); + MSCCLPPTHROW(mscclppSocketConnect(&sock)); + netSend(&sock, &rankAddresses[next], sizeof(mscclppSocketAddress)); + MSCCLPPTHROW(mscclppSocketClose(&sock)); } -void MscclppBootstrap::Impl::bootstrapCreateRoot() +void mscclppBootstrap::Impl::bootstrapCreateRoot() { mscclppSocket listenSock; @@ -274,9 +216,8 @@ void MscclppBootstrap::Impl::bootstrapCreateRoot() rootThread_ = std::thread(lambda); } -mscclppResult_t MscclppBootstrap::Impl::bootstrapRoot(mscclppSocket listenSock) +void mscclppBootstrap::Impl::bootstrapRoot(mscclppSocket listenSock) { - mscclppResult_t res = mscclppSuccess; int numCollected = 0; std::vector rankAddresses(this->nRanks_, mscclppSocketAddress()); // for initial rank <-> root information exchange @@ -290,11 +231,7 @@ mscclppResult_t MscclppBootstrap::Impl::bootstrapRoot(mscclppSocket listenSock) /* Receive addresses from all ranks */ do { int rank; - res = getRemoteAddresses(&listenSock, rankAddresses, rankAddressesRoot, rank); - if (res != mscclppSuccess) { - WARN("Bootstrap Root : getRemoteAddresses failed"); - break; - } + getRemoteAddresses(&listenSock, rankAddresses, rankAddressesRoot, rank); ++numCollected; TRACE(MSCCLPP_INIT, "Received connect from rank %d total %d/%d", rank, numCollected, 
this->nRanks_); } while (numCollected < this->nRanks_); @@ -302,38 +239,28 @@ mscclppResult_t MscclppBootstrap::Impl::bootstrapRoot(mscclppSocket listenSock) // Send the connect handle for the next rank in the AllGather ring for (int peer = 0; peer < this->nRanks_; ++peer) { - res = sendHandleToPeer(peer, rankAddresses, rankAddressesRoot); - if (res != mscclppSuccess) { - WARN("Bootstrap Root : sendHandleToPeer failed"); - break; - } - } - if (res == mscclppSuccess) { - TRACE(MSCCLPP_INIT, "SENT OUT ALL %d HANDLES", this->nRanks_); + sendHandleToPeer(peer, rankAddresses, rankAddressesRoot); } + TRACE(MSCCLPP_INIT, "DONE"); - return res; } -mscclppResult_t MscclppBootstrap::Impl::netInit(std::string ipPortPair) +void mscclppBootstrap::Impl::netInit(std::string ipPortPair) { if (netInitialized) - return mscclppSuccess; + return; if (!ipPortPair.empty()) { union mscclppSocketAddress remoteAddr; if (mscclppSocketGetAddrFromString(&remoteAddr, ipPortPair.c_str()) != mscclppSuccess) { - WARN("Invalid MSCCLPP_COMM_ID, please use format: : or []: or :"); - return mscclppInvalidArgument; + throw std::runtime_error("Invalid ipPortPair, please use format: : or []: or :"); } if (mscclppFindInterfaceMatchSubnet(netIfName_, &netIfAddr_, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { - WARN("NET/Socket : No usable listening interface found"); - return mscclppSystemError; + throw std::runtime_error("NET/Socket : No usable listening interface found"); } } else { int ret = mscclppFindInterfaces(netIfName_, &netIfAddr_, MAX_IF_NAME_SIZE, 1); if (ret <= 0) { - WARN("Bootstrap : no socket interface found"); - return mscclppInternalError; + throw std::runtime_error("Bootstrap : no socket interface found"); } } @@ -342,10 +269,9 @@ mscclppResult_t MscclppBootstrap::Impl::netInit(std::string ipPortPair) mscclppSocketToString(&netIfAddr_, line + strlen(line)); INFO(MSCCLPP_INIT, "Bootstrap : Using%s", line); netInitialized = true; - return mscclppSuccess; } -mscclppResult_t 
MscclppBootstrap::Impl::establishConnections() +void mscclppBootstrap::Impl::establishConnections() { mscclppSocketAddress nextAddr; mscclppSocket sock, listenSockRoot; @@ -358,14 +284,14 @@ mscclppResult_t MscclppBootstrap::Impl::establishConnections() uint64_t magic = this->uniqueId_.magic; // Create socket for other ranks to contact me - MSCCLPPCHECK(mscclppSocketInit(&this->listenSock_, &netIfAddr_, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); - MSCCLPPCHECK(mscclppSocketListen(&this->listenSock_)); - MSCCLPPCHECK(mscclppSocketGetAddr(&this->listenSock_, &info.extAddressListen)); + MSCCLPPTHROW(mscclppSocketInit(&this->listenSock_, &netIfAddr_, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); + MSCCLPPTHROW(mscclppSocketListen(&this->listenSock_)); + MSCCLPPTHROW(mscclppSocketGetAddr(&this->listenSock_, &info.extAddressListen)); // Create socket for root to contact me - MSCCLPPCHECK(mscclppSocketInit(&listenSockRoot, &netIfAddr_, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); - MSCCLPPCHECK(mscclppSocketListen(&listenSockRoot)); - MSCCLPPCHECK(mscclppSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); + MSCCLPPTHROW(mscclppSocketInit(&listenSockRoot, &netIfAddr_, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); + MSCCLPPTHROW(mscclppSocketListen(&listenSockRoot)); + MSCCLPPTHROW(mscclppSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); // stagger connection times to avoid an overload of the root auto randomSleep = [](int rank) { @@ -385,34 +311,33 @@ mscclppResult_t MscclppBootstrap::Impl::establishConnections() mscclppSocketToString(&this->uniqueId_.addr, line + strlen(line)); // send info on my listening socket to root - MSCCLPPCHECK(mscclppSocketInit(&sock, &this->uniqueId_.addr, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); - MSCCLPPCHECK(mscclppSocketConnect(&sock)); - MSCCLPPCHECK(netSend(&sock, &info, sizeof(info))); - MSCCLPPCHECK(mscclppSocketClose(&sock)); + 
MSCCLPPTHROW(mscclppSocketInit(&sock, &this->uniqueId_.addr, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); + MSCCLPPTHROW(mscclppSocketConnect(&sock)); + netSend(&sock, &info, sizeof(info)); + MSCCLPPTHROW(mscclppSocketClose(&sock)); // get info on my "next" rank in the bootstrap ring from root - MSCCLPPCHECK(mscclppSocketInit(&sock)); - MSCCLPPCHECK(mscclppSocketAccept(&sock, &listenSockRoot)); - MSCCLPPCHECK(netRecv(&sock, &nextAddr, sizeof(union mscclppSocketAddress))); - MSCCLPPCHECK(mscclppSocketClose(&sock)); - MSCCLPPCHECK(mscclppSocketClose(&listenSockRoot)); + MSCCLPPTHROW(mscclppSocketInit(&sock)); + MSCCLPPTHROW(mscclppSocketAccept(&sock, &listenSockRoot)); + netRecv(&sock, &nextAddr, sizeof(union mscclppSocketAddress)); + MSCCLPPTHROW(mscclppSocketClose(&sock)); + MSCCLPPTHROW(mscclppSocketClose(&listenSockRoot)); - MSCCLPPCHECK( + MSCCLPPTHROW( mscclppSocketInit(&this->ringSendSocket_, &nextAddr, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); - MSCCLPPCHECK(mscclppSocketConnect(&this->ringSendSocket_)); + MSCCLPPTHROW(mscclppSocketConnect(&this->ringSendSocket_)); // Accept the connect request from the previous rank in the AllGather ring - MSCCLPPCHECK(mscclppSocketInit(&this->ringRecvSocket_)); - MSCCLPPCHECK(mscclppSocketAccept(&this->ringRecvSocket_, &this->listenSock_)); + MSCCLPPTHROW(mscclppSocketInit(&this->ringRecvSocket_)); + MSCCLPPTHROW(mscclppSocketAccept(&this->ringRecvSocket_, &this->listenSock_)); // AllGather all listen handlers - MSCCLPPCHECK(mscclppSocketGetAddr(&this->listenSock_, &this->peerCommAddresses_[rank_])); - MSCCLPPCHECK(allGather(this->peerCommAddresses_.data(), sizeof(union mscclppSocketAddress))); + MSCCLPPTHROW(mscclppSocketGetAddr(&this->listenSock_, &this->peerCommAddresses_[rank_])); + allGather(this->peerCommAddresses_.data(), sizeof(union mscclppSocketAddress)); TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank_, nRanks_); - return mscclppSuccess; } -mscclppResult_t 
MscclppBootstrap::Impl::allGather(void* allData, int size) +void mscclppBootstrap::Impl::allGather(void* allData, int size) { char* data = static_cast(allData); int rank = this->rank_; @@ -429,154 +354,129 @@ mscclppResult_t MscclppBootstrap::Impl::allGather(void* allData, int size) size_t sSlice = (rank - i + nRanks) % nRanks; // Send slice to the right - MSCCLPPCHECK(netSend(&this->ringSendSocket_, data + sSlice * size, size)); + netSend(&this->ringSendSocket_, data + sSlice * size, size); // Recv slice from the left - MSCCLPPCHECK(netRecv(&this->ringRecvSocket_, data + rSlice * size, size)); + netRecv(&this->ringRecvSocket_, data + rSlice * size, size); } TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nRanks, size); - return mscclppSuccess; } -mscclppResult_t MscclppBootstrap::Impl::netSend(mscclppSocket* sock, const void* data, int size) +void mscclppBootstrap::Impl::netSend(mscclppSocket* sock, const void* data, int size) { - MSCCLPPCHECK(mscclppSocketSend(sock, &size, sizeof(int))); - MSCCLPPCHECK(mscclppSocketSend(sock, const_cast(data), size)); - return mscclppSuccess; + MSCCLPPTHROW(mscclppSocketSend(sock, &size, sizeof(int))); + MSCCLPPTHROW(mscclppSocketSend(sock, const_cast(data), size)); } -mscclppResult_t MscclppBootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) +void mscclppBootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) { int recvSize; - MSCCLPPCHECK(mscclppSocketRecv(sock, &recvSize, sizeof(int))); + MSCCLPPTHROW(mscclppSocketRecv(sock, &recvSize, sizeof(int))); if (recvSize > size) { - WARN("Message truncated : received %d bytes instead of %d", recvSize, size); - return mscclppInternalError; + throw std::runtime_error("Message truncated : received " + std::to_string(recvSize) + " bytes instead of " + std::to_string(size)); } - MSCCLPPCHECK(mscclppSocketRecv(sock, data, std::min(recvSize, size))); - return mscclppSuccess; + MSCCLPPTHROW(mscclppSocketRecv(sock, data, std::min(recvSize, size))); } 
-mscclppResult_t MscclppBootstrap::Impl::send(void* data, int size, int peer, int tag) +void mscclppBootstrap::Impl::send(void* data, int size, int peer, int tag) { mscclppSocket sock; - MSCCLPPCHECK(mscclppSocketInit(&sock, &this->peerCommAddresses_[peer], this->uniqueId_.magic, + MSCCLPPTHROW(mscclppSocketInit(&sock, &this->peerCommAddresses_[peer], this->uniqueId_.magic, mscclppSocketTypeBootstrap, this->abortFlag_)); - MSCCLPPCHECK(mscclppSocketConnect(&sock)); - MSCCLPPCHECK(netSend(&sock, &this->rank_, sizeof(int))); - MSCCLPPCHECK(netSend(&sock, &tag, sizeof(int))); - MSCCLPPCHECK(netSend(&sock, data, size)); + MSCCLPPTHROW(mscclppSocketConnect(&sock)); + netSend(&sock, &this->rank_, sizeof(int)); + netSend(&sock, &tag, sizeof(int)); + netSend(&sock, data, size); - MSCCLPPCHECK(mscclppSocketClose(&sock)); - return mscclppSuccess; + MSCCLPPTHROW(mscclppSocketClose(&sock)); } -mscclppResult_t MscclppBootstrap::Impl::recv(void* data, int size, int peer, int tag) +void mscclppBootstrap::Impl::recv(void* data, int size, int peer, int tag) { // search over all unexpected messages for (auto it = unexpectedMessages_.begin(); it != unexpectedMessages_.end(); ++it){ if (it->peer == peer && it->tag == tag){ // found a match - MSCCLPPCHECK(netRecv(it->sock.get(), data, size)); - MSCCLPPCHECK(mscclppSocketClose(it->sock.get())); + netRecv(it->sock.get(), data, size); + MSCCLPPTHROW(mscclppSocketClose(it->sock.get())); unexpectedMessages_.erase(it); - return mscclppSuccess; + return; } } // didn't find one while (true) { auto sock = std::make_shared(); int newPeer, newTag; - MSCCLPPCHECK(mscclppSocketInit(sock.get())); - MSCCLPPCHECK(mscclppSocketAccept(sock.get(), &this->listenSock_)); - MSCCLPPCHECK(netRecv(sock.get(), &newPeer, sizeof(int))); - MSCCLPPCHECK(netRecv(sock.get(), &newTag, sizeof(int))); + MSCCLPPTHROW(mscclppSocketInit(sock.get())); + MSCCLPPTHROW(mscclppSocketAccept(sock.get(), &this->listenSock_)); + netRecv(sock.get(), &newPeer, sizeof(int)); + 
netRecv(sock.get(), &newTag, sizeof(int)); if (newPeer == peer && newTag == tag) { - MSCCLPPCHECK(netRecv(sock.get(), ((char*)data), size)); - MSCCLPPCHECK(mscclppSocketClose(sock.get())); - return mscclppSuccess; + netRecv(sock.get(), ((char*)data), size); + MSCCLPPTHROW(mscclppSocketClose(sock.get())); + return; } // Unexpected message. Save for later. unexpectedMessages_.push_back({newPeer, newTag, sock}); } - - return mscclppSuccess; } -mscclppResult_t MscclppBootstrap::Impl::barrier() +void mscclppBootstrap::Impl::barrier() { - MSCCLPPCHECK(allGather(barrierArr_.data(), sizeof(int))); - return mscclppSuccess; + allGather(barrierArr_.data(), sizeof(int)); } -mscclppResult_t MscclppBootstrap::Impl::close() +void mscclppBootstrap::Impl::close() { - MSCCLPPCHECK(mscclppSocketClose(&this->listenSock_)); - MSCCLPPCHECK(mscclppSocketClose(&this->ringSendSocket_)); - MSCCLPPCHECK(mscclppSocketClose(&this->ringRecvSocket_)); - - return mscclppSuccess; + MSCCLPPTHROW(mscclppSocketClose(&this->listenSock_)); + MSCCLPPTHROW(mscclppSocketClose(&this->ringSendSocket_)); + MSCCLPPTHROW(mscclppSocketClose(&this->ringRecvSocket_)); } -MscclppBootstrap::MscclppBootstrap(int rank, int nRanks) +mscclppBootstrap::mscclppBootstrap(int rank, int nRanks) { // pimpl_ = std::make_unique(ipPortPair, rank, nRanks, uniqueId); pimpl_ = new Impl(rank, nRanks); } -UniqueId MscclppBootstrap::GetUniqueId() +UniqueId mscclppBootstrap::GetUniqueId() { return pimpl_->getUniqueId(); } -void MscclppBootstrap::Send(void* data, int size, int peer, int tag) +void mscclppBootstrap::Send(void* data, int size, int peer, int tag) { - mscclppResult_t res = pimpl_->send(data, size, peer, tag); - if (res != mscclppSuccess) { - throw std::runtime_error("MscclppBootstrap::Send failed"); - } + pimpl_->send(data, size, peer, tag); } -void MscclppBootstrap::Recv(void* data, int size, int peer, int tag) +void mscclppBootstrap::Recv(void* data, int size, int peer, int tag) { - mscclppResult_t res = 
pimpl_->recv(data, size, peer, tag); - if (res != mscclppSuccess) { - throw std::runtime_error("MscclppBootstrap::Recv failed"); - } + pimpl_->recv(data, size, peer, tag); } -void MscclppBootstrap::AllGather(void* allData, int size) +void mscclppBootstrap::AllGather(void* allData, int size) { - mscclppResult_t res = pimpl_->allGather(allData, size); - if (res != mscclppSuccess) { - throw std::runtime_error("MscclppBootstrap::AllGather failed"); - } + pimpl_->allGather(allData, size); } -void MscclppBootstrap::Initialize(const UniqueId uniqueId) +void mscclppBootstrap::Initialize(const UniqueId uniqueId) { pimpl_->initialize(uniqueId); } -void MscclppBootstrap::Initialize(std::string ipPortPair) +void mscclppBootstrap::Initialize(std::string ipPortPair) { pimpl_->initialize(ipPortPair); } -void MscclppBootstrap::Barrier() +void mscclppBootstrap::Barrier() { - mscclppResult_t res = pimpl_->barrier(); - if (res != mscclppSuccess) { - throw std::runtime_error("MscclppBootstrap::Barrier failed"); - } + pimpl_->barrier(); } -MscclppBootstrap::~MscclppBootstrap() +mscclppBootstrap::~mscclppBootstrap() { - mscclppResult_t res = pimpl_->close(); - if (res != mscclppSuccess) { - throw std::runtime_error("MscclppBootstrap::Close failed"); - } + pimpl_->close(); } // ------------------- Old bootstrap functions ------------------- diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 3b683707..ba69d170 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -14,11 +14,11 @@ struct UniqueId static_assert(sizeof(UniqueId) <= sizeof(mscclppUniqueId), "Bootstrap handle is too large to fit inside MSCCLPP unique ID"); -class __attribute__((visibility("default"))) MscclppBootstrap : public Bootstrap +class __attribute__((visibility("default"))) mscclppBootstrap : public Bootstrap { public: - MscclppBootstrap(int rank, int nRanks); - ~MscclppBootstrap(); + mscclppBootstrap(int rank, int nRanks); + ~mscclppBootstrap(); UniqueId GetUniqueId(); @@ -34,6 +34,8 
@@ private: Impl* pimpl_; }; +// ------------------- Old bootstrap headers: to be removed ------------------- + struct mscclppBootstrapHandle { uint64_t magic; diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index a810cb2f..c2ef61f0 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -11,7 +11,7 @@ int main() MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &worldSize); - std::shared_ptr bootstrap(new MscclppBootstrap(rank, worldSize)); + std::shared_ptr bootstrap(new mscclppBootstrap(rank, worldSize)); // bootstrap->Initialize("costsim-dev-00000A:50000"); UniqueId id; if (rank == 0) From 2a46644692cd92e479f16b02e682a1317742afff Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 24 Apr 2023 23:08:30 +0000 Subject: [PATCH 046/135] adding checks.hpp --- src/include/checks.hpp | 55 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 src/include/checks.hpp diff --git a/src/include/checks.hpp b/src/include/checks.hpp new file mode 100644 index 00000000..ee5f7058 --- /dev/null +++ b/src/include/checks.hpp @@ -0,0 +1,55 @@ +/************************************************************************* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef MSCCLPP_CHECKS_HPP_ +#define MSCCLPP_CHECKS_HPP_ + +#include "debug.h" +#include + +#define MSCCLPPTHROW(call) \ + do { \ + mscclppResult_t res = call; \ + if (res != mscclppSuccess && res != mscclppInProgress) { \ + throw std::runtime_error(std::string("Call to " #call " failed with error code ") + mscclppGetErrorString(res)); \ + } \ + } while (0); + +#define CUDATHROW(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + throw std::runtime_error(std::string("Cuda failure '") + cudaGetErrorString(err) + "'"); \ + } \ + } while (false) + +#endif + +#include +// Check system calls +#define SYSCHECKTHROW(call, name) \ + do { \ + int retval; \ + SYSCHECKVAL(call, name, retval); \ + } while (false) + +#define SYSCHECKVALTHROW(call, name, retval) \ + do { \ + SYSCHECKSYNC(call, name, retval); \ + if (retval == -1) { \ + std::runtime_error(std::string("Call to " name " failed : ") + strerror(errno)); \ + } \ + } while (false) + +#define SYSCHECKSYNCTHROW(call, name, retval) \ + do { \ + retval = call; \ + if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + INFO(MSCCLPP_ALL, "Call to " name " returned %s, retrying", strerror(errno)); \ + } else { \ + break; \ + } \ + } while (true) From 2c52ab37cebbf13af9cfc2506fbaaaf0993542d5 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 24 Apr 2023 23:09:12 +0000 Subject: [PATCH 047/135] lint --- src/bootstrap/bootstrap.cc | 37 ++++---- src/communicator.cpp | 67 ++++++++------ src/host_connection.cpp | 50 ++++++----- src/include/checks.hpp | 8 +- src/include/comm.h | 2 +- src/include/host_connection.hpp | 7 +- src/include/mscclpp.h | 13 +-- src/include/mscclpp.hpp | 155 ++++++++++++++++++-------------- src/include/mscclppfifo.hpp | 18 ++-- src/init.cc | 53 ++++++----- tests/allgather_test_cpp.cu | 10 +-- tests/bootstrap_test_cpp.cc | 86 
+++++++++--------- 12 files changed, 281 insertions(+), 225 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index b38d4b84..a7f43267 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -1,6 +1,6 @@ #include "bootstrap.h" -#include "utils.h" #include "checks.hpp" +#include "utils.h" #include #include @@ -95,9 +95,9 @@ private: void bootstrapCreateRoot(); void bootstrapRoot(mscclppSocket listenSock); void getRemoteAddresses(mscclppSocket* listenSock, std::vector& rankAddresses, - std::vector& rankAddressesRoot, int& rank); + std::vector& rankAddressesRoot, int& rank); void sendHandleToPeer(int peer, const std::vector& rankAddresses, - const std::vector& rankAddressesRoot); + const std::vector& rankAddressesRoot); void netInit(std::string ipPortPair); }; @@ -153,9 +153,8 @@ mscclppBootstrap::Impl::~Impl() } void mscclppBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, - std::vector& rankAddresses, - std::vector& rankAddressesRoot, - int& rank) + std::vector& rankAddresses, + std::vector& rankAddressesRoot, int& rank) { mscclppSocket sock; extInfo info; @@ -168,11 +167,13 @@ void mscclppBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, MSCCLPPTHROW(mscclppSocketClose(&sock)); if (this->nRanks_ != info.nRanks) { - throw std::runtime_error("Bootstrap Root : mismatch in rank count from procs " + std::to_string(this->nRanks_) + " : " + std::to_string(info.nRanks)); + throw std::runtime_error("Bootstrap Root : mismatch in rank count from procs " + std::to_string(this->nRanks_) + + " : " + std::to_string(info.nRanks)); } if (std::memcmp(&zero, &rankAddressesRoot[info.rank], sizeof(mscclppSocketAddress)) != 0) { - throw std::runtime_error("Bootstrap Root : rank " + std::to_string(info.rank) + " of " + std::to_string(this->nRanks_) + " has already checked in"); + throw std::runtime_error("Bootstrap Root : rank " + std::to_string(info.rank) + " of " + + std::to_string(this->nRanks_) + " has 
already checked in"); } // Save the connection handle for that rank @@ -181,9 +182,8 @@ void mscclppBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, rank = info.rank; } -void mscclppBootstrap::Impl::sendHandleToPeer(int peer, - const std::vector& rankAddresses, - const std::vector& rankAddressesRoot) +void mscclppBootstrap::Impl::sendHandleToPeer(int peer, const std::vector& rankAddresses, + const std::vector& rankAddressesRoot) { mscclppSocket sock; int next = (peer + 1) % this->nRanks_; @@ -210,9 +210,7 @@ void mscclppBootstrap::Impl::bootstrapCreateRoot() if (ret != mscclppSuccess) { throw std::runtime_error("Failed to get socket address"); } - auto lambda = [this, listenSock]() { - this->bootstrapRoot(listenSock); - }; + auto lambda = [this, listenSock]() { this->bootstrapRoot(listenSock); }; rootThread_ = std::thread(lambda); } @@ -252,7 +250,8 @@ void mscclppBootstrap::Impl::netInit(std::string ipPortPair) if (!ipPortPair.empty()) { union mscclppSocketAddress remoteAddr; if (mscclppSocketGetAddrFromString(&remoteAddr, ipPortPair.c_str()) != mscclppSuccess) { - throw std::runtime_error("Invalid ipPortPair, please use format: : or []: or :"); + throw std::runtime_error( + "Invalid ipPortPair, please use format: : or []: or :"); } if (mscclppFindInterfaceMatchSubnet(netIfName_, &netIfAddr_, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { throw std::runtime_error("NET/Socket : No usable listening interface found"); @@ -305,7 +304,6 @@ void mscclppBootstrap::Impl::establishConnections() randomSleep(this->rank_); } - char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2]; std::sprintf(line, " %s:", netIfName_); mscclppSocketToString(&this->uniqueId_.addr, line + strlen(line)); @@ -373,7 +371,8 @@ void mscclppBootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) int recvSize; MSCCLPPTHROW(mscclppSocketRecv(sock, &recvSize, sizeof(int))); if (recvSize > size) { - throw std::runtime_error("Message truncated : received " + std::to_string(recvSize) + 
" bytes instead of " + std::to_string(size)); + throw std::runtime_error("Message truncated : received " + std::to_string(recvSize) + " bytes instead of " + + std::to_string(size)); } MSCCLPPTHROW(mscclppSocketRecv(sock, data, std::min(recvSize, size))); } @@ -394,8 +393,8 @@ void mscclppBootstrap::Impl::send(void* data, int size, int peer, int tag) void mscclppBootstrap::Impl::recv(void* data, int size, int peer, int tag) { // search over all unexpected messages - for (auto it = unexpectedMessages_.begin(); it != unexpectedMessages_.end(); ++it){ - if (it->peer == peer && it->tag == tag){ + for (auto it = unexpectedMessages_.begin(); it != unexpectedMessages_.end(); ++it) { + if (it->peer == peer && it->tag == tag) { // found a match netRecv(it->sock.get(), data, size); MSCCLPPTHROW(mscclppSocketClose(it->sock.get())); diff --git a/src/communicator.cpp b/src/communicator.cpp index 73d82997..ee3b9cd1 100644 --- a/src/communicator.cpp +++ b/src/communicator.cpp @@ -1,33 +1,39 @@ -#include "mscclpp.hpp" #include "mscclpp.h" +#include "mscclpp.hpp" namespace mscclpp { -mscclppTransport_t transportTypeToCStyle(TransportType type) { +mscclppTransport_t transportTypeToCStyle(TransportType type) +{ switch (type) { - case TransportType::IB: - return mscclppTransportIB; - case TransportType::P2P: - return mscclppTransportP2P; - default: - throw std::runtime_error("Unknown transport type"); + case TransportType::IB: + return mscclppTransportIB; + case TransportType::P2P: + return mscclppTransportP2P; + default: + throw std::runtime_error("Unknown transport type"); } } -struct Communicator::Impl { - mscclppComm_t comm; - std::vector> connections; +struct Communicator::Impl +{ + mscclppComm_t comm; + std::vector> connections; - Impl() : comm(nullptr) {} + Impl() : comm(nullptr) + { + } - ~Impl() { - if (comm) { - mscclppCommDestroy(comm); - } + ~Impl() + { + if (comm) { + mscclppCommDestroy(comm); } + } }; -void Communicator::initRank(int nranks, const char* ipPortPair, int 
rank) { +void Communicator::initRank(int nranks, const char* ipPortPair, int rank) +{ if (pimpl) { throw std::runtime_error("Communicator already initialized"); } @@ -35,26 +41,30 @@ void Communicator::initRank(int nranks, const char* ipPortPair, int rank) { mscclppCommInitRank(&pimpl->comm, nranks, ipPortPair, rank); } -void Communicator::initRankFromId(int nranks, UniqueId id, int rank) { +void Communicator::initRankFromId(int nranks, UniqueId id, int rank) +{ if (pimpl) { throw std::runtime_error("Communicator already initialized"); } pimpl = std::make_unique(); static_assert(sizeof(mscclppUniqueId) == sizeof(UniqueId), "UniqueId size mismatch"); - mscclppUniqueId *cstyle_id = reinterpret_cast(&id); + mscclppUniqueId* cstyle_id = reinterpret_cast(&id); mscclppCommInitRankFromId(&pimpl->comm, nranks, *cstyle_id, rank); } -void Communicator::bootstrapAllGather(void* data, int size) { +void Communicator::bootstrapAllGather(void* data, int size) +{ mscclppBootstrapAllGather(pimpl->comm, data, size); } -void Communicator::bootstrapBarrier() { +void Communicator::bootstrapBarrier() +{ mscclppBootstrapBarrier(pimpl->comm); } -std::shared_ptr Communicator::connect(int remoteRank, int tag, - TransportType transportType, const char* ibDev = 0) { +std::shared_ptr Communicator::connect(int remoteRank, int tag, TransportType transportType, + const char* ibDev = 0) +{ mscclppConnectWithoutBuffer(pimpl->comm, remoteRank, tag, transportTypeToCStyle(transportType), ibDev); auto conn = std::make_shared(); auto connIdx = pimpl->connections.size(); @@ -62,9 +72,10 @@ std::shared_ptr Communicator::connect(int remoteRank, int tag, return conn; } -void Communicator::connectionSetup() { +void Communicator::connectionSetup() +{ mscclppConnectionSetup(pimpl->comm); - mscclppHostConn_t *hostConns; + mscclppHostConn_t* hostConns; int numHostConns; mscclppGetAllHostConnections(pimpl->comm, &hostConns, &numHostConns); if (numHostConns != pimpl->connections.size()) { @@ -75,13 +86,15 @@ void 
Communicator::connectionSetup() { } } -int Communicator::rank() { +int Communicator::rank() +{ int result; mscclppCommRank(pimpl->comm, &result); return result; } -int Communicator::size() { +int Communicator::size() +{ int result; mscclppCommSize(pimpl->comm, &result); return result; diff --git a/src/host_connection.cpp b/src/host_connection.cpp index 6a06de63..d41c60d4 100644 --- a/src/host_connection.cpp +++ b/src/host_connection.cpp @@ -2,54 +2,58 @@ namespace mscclpp { -HostConnection::Impl::Impl() : hostConn(nullptr) {} +HostConnection::Impl::Impl() : hostConn(nullptr) +{ +} -HostConnection::Impl::~Impl() { +HostConnection::Impl::~Impl() +{ // TODO: figure out memory ownership. Does this deallocate the mscclppHostConn? Likely not. } -void HostConnection::Impl::setup(mscclppHostConn_t *hostConn) { +void HostConnection::Impl::setup(mscclppHostConn_t* hostConn) +{ this->hostConn = hostConn; } -BufferHandle HostConnection::registerBuffer(void* data, uint64_t size) { - +BufferHandle HostConnection::registerBuffer(void* data, uint64_t size) +{ } -int HostConnection::numRemoteBuffers() { - +int HostConnection::numRemoteBuffers() +{ } -BufferHandle HostConnection::getRemoteBuffer(int index) { - +BufferHandle HostConnection::getRemoteBuffer(int index) +{ } -DeviceConnection HostConnection::toDevice(bool startProxyThread = true) { - +DeviceConnection HostConnection::toDevice(bool startProxyThread = true) +{ } -void HostConnection::put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { - +void HostConnection::put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) +{ } -void HostConnection::put(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) { - +void HostConnection::put(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) +{ } -void HostConnection::signal() { - +void HostConnection::signal() +{ } -void HostConnection::flush() { - +void 
HostConnection::flush() +{ } -void HostConnection::wait() { - +void HostConnection::wait() +{ } -void HostConnection::epochIncrement() { - +void HostConnection::epochIncrement() +{ } } // namespace mscclpp \ No newline at end of file diff --git a/src/include/checks.hpp b/src/include/checks.hpp index ee5f7058..bb88ebf6 100644 --- a/src/include/checks.hpp +++ b/src/include/checks.hpp @@ -30,21 +30,21 @@ #include // Check system calls -#define SYSCHECKTHROW(call, name) \ +#define SYSCHECKTHROW(call, name) \ do { \ int retval; \ SYSCHECKVAL(call, name, retval); \ } while (false) -#define SYSCHECKVALTHROW(call, name, retval) \ +#define SYSCHECKVALTHROW(call, name, retval) \ do { \ SYSCHECKSYNC(call, name, retval); \ if (retval == -1) { \ - std::runtime_error(std::string("Call to " name " failed : ") + strerror(errno)); \ + std::runtime_error(std::string("Call to " name " failed : ") + strerror(errno)); \ } \ } while (false) -#define SYSCHECKSYNCTHROW(call, name, retval) \ +#define SYSCHECKSYNCTHROW(call, name, retval) \ do { \ retval = call; \ if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ diff --git a/src/include/comm.h b/src/include/comm.h index 8275e0cb..672cdd95 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -15,7 +15,7 @@ struct mscclppBufferRegistration { - void *data; + void* data; uint64_t size; }; diff --git a/src/include/host_connection.hpp b/src/include/host_connection.hpp index 4a66c846..bdf49df4 100644 --- a/src/include/host_connection.hpp +++ b/src/include/host_connection.hpp @@ -1,19 +1,20 @@ #ifndef MSCCLPP_HOST_CONNECTION_HPP_ #define MSCCLPP_HOST_CONNECTION_HPP_ -#include "mscclpp.hpp" #include "mscclpp.h" +#include "mscclpp.hpp" namespace mscclpp { -struct HostConnection::Impl { +struct HostConnection::Impl +{ mscclppHostConn_t* hostConn; Impl(); ~Impl(); - void setup(mscclppHostConn_t *hostConn); + void setup(mscclppHostConn_t* hostConn); }; } // namespace mscclpp diff --git a/src/include/mscclpp.h 
b/src/include/mscclpp.h index 0e7f76e5..f8931834 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -191,7 +191,8 @@ struct mscclppHostConn { virtual ~mscclppHostConn() = default; virtual void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) = 0; - virtual void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, uint64_t dataSize) = 0; + virtual void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, + uint64_t dataSize) = 0; virtual void signal() = 0; virtual void wait() = 0; virtual void flush() = 0; @@ -247,8 +248,8 @@ typedef enum mscclppNumResults = 8 } mscclppResult_t; - -class Bootstrap { +class Bootstrap +{ public: Bootstrap(){}; virtual ~Bootstrap() = default; @@ -368,7 +369,8 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. */ -mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, mscclppTransport_t transportType, const char* ibDev = 0); +mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, + mscclppTransport_t transportType, const char* ibDev = 0); /* Register a buffer for use with a connection. * @@ -381,7 +383,8 @@ mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, * Outputs: * handle: a handle to the buffer registration */ -mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, uint64_t buffSize, mscclppBufferHandle_t *handle); +mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, uint64_t buffSize, + mscclppBufferHandle_t* handle); /* Establish all connections declared by mscclppConnect(). 
This function must be called after all mscclppConnect() * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index fbc96f43..1dbe180f 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -11,14 +11,15 @@ #define MSCCLPP_PROXY_FIFO_SIZE 128 #define MSCCLPP_PROXY_FIFO_FLUSH_COUNTER 4 -#include #include +#include #include namespace mscclpp { -struct alignas(16) SignalEpochId { +struct alignas(16) SignalEpochId +{ // every signal(), increaments this and either: // 1) proxy thread pushes it to the remote peer's localSignalEpochId->proxy // 2) gpu thread directly writes it to remoteSignalEpochId->device @@ -27,14 +28,15 @@ struct alignas(16) SignalEpochId { uint64_t proxy; }; -enum ChannelTriggerType : uint64_t { +enum ChannelTriggerType : uint64_t +{ channelTriggerData = 0x1, channelTriggerFlag = 0x2, channelTriggerSync = 0x4 }; // This is just a numeric ID. Each HostConnection will have an internal array indexed by these handles -// mapping to the actual +// mapping to the actual using BufferHandle = uint8_t; #define MSCCLPP_BITS_SIZE 32 @@ -58,14 +60,23 @@ union ChannelTrigger { uint64_t srcBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; uint64_t dstBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; uint64_t type : MSCCLPP_BITS_TYPE; - uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_TYPE); // ensure 64-bit alignment + uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_BUFFER_HANDLE - + MSCCLPP_BITS_TYPE); // ensure 64-bit alignment } fields; - ChannelTrigger() {} - ChannelTrigger(ProxyTrigger value) : value(value) {} - ChannelTrigger(ChannelTriggerType type, BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { + ChannelTrigger() + { + } + ChannelTrigger(ProxyTrigger value) : value(value) + { + } + 
ChannelTrigger(ChannelTriggerType type, BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, + uint64_t size) + { value.fst = ((srcOffset << MSCCLPP_BITS_SIZE) + size); - value.snd = (((((((uint64_t)type << MSCCLPP_BITS_BUFFER_HANDLE) + dst) << MSCCLPP_BITS_BUFFER_HANDLE) + src) << MSCCLPP_BITS_OFFSET) + dstOffset); + value.snd = (((((((uint64_t)type << MSCCLPP_BITS_BUFFER_HANDLE) + dst) << MSCCLPP_BITS_BUFFER_HANDLE) + src) + << MSCCLPP_BITS_OFFSET) + + dstOffset); } }; @@ -131,11 +142,13 @@ union ChannelTrigger { * The two endpoint can concurrently use the same connection provided they are writing (puts) on different * indices in the registered buffer. **************************************************************************************************************/ -struct DeviceConnection { +struct DeviceConnection +{ #ifdef __CUDACC__ // TODO: add buffer handles - __forceinline__ __device__ void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) + __forceinline__ __device__ void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, + uint64_t size) { fifo.push(ChannelTrigger(channelTriggerData, dst, dstOffset, src, srcOffset, size).value); } @@ -151,7 +164,8 @@ struct DeviceConnection { fifo.push(ChannelTrigger(channelTriggerFlag, 0, 0, 0, 0, 1).value); } - __forceinline__ __device__ void putWithSignal(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) + __forceinline__ __device__ void putWithSignal(BufferHandle dst, uint64_t dstOffset, BufferHandle src, + uint64_t srcOffset, uint64_t size) { epochIncrement(); fifo.push(ChannelTrigger(channelTriggerData | channelTriggerFlag, dst, dstOffset, src, srcOffset, size).value); @@ -162,16 +176,19 @@ struct DeviceConnection { putWithSignal(dst, offset, src, offset, size); } - __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t 
srcOffset, uint64_t size) + __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, uint64_t dstOffset, BufferHandle src, + uint64_t srcOffset, uint64_t size) { epochIncrement(); - uint64_t curFifoHead = fifo.push(channelTriggerData | channelTriggerFlag | channelTriggerSync, dstOffset, srcOffset, size); + uint64_t curFifoHead = + fifo.push(channelTriggerData | channelTriggerFlag | channelTriggerSync, dstOffset, srcOffset, size); while (*(volatile uint64_t*)&fifo.triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && *(volatile uint64_t*)fifo.triggerFifoTail <= curFifoHead) ; } - __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) + __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, BufferHandle src, uint64_t offset, + uint64_t size) { putWithSignalAndFlush(offset, offset, size); } @@ -217,7 +234,8 @@ struct DeviceConnection { ProxyFifo fifo; }; -class HostConnection { +class HostConnection +{ public: /* Register a region of GPU memory for use with this connection. Must be called before connectionSetup() * in the communicator. @@ -225,7 +243,7 @@ public: * Inputs: * data: base pointer to the memory * size: size of the memory region in bytes - * + * * Returns: a handle to the buffer */ BufferHandle registerBuffer(void* data, uint64_t size); @@ -240,7 +258,7 @@ public: * * Inputs: * index: the index of the handle to get - * + * * Returns: a handle to the buffer on the remote peer */ BufferHandle getRemoteBuffer(int index); @@ -248,10 +266,10 @@ public: /* Create a DeviceConnection paired with this HostConnection. A background proxy thread will * trigger operations on this HostConnection corresponding to put/signal/etc. calls made to the * DeviceConnection. 
- * + * * Inputs: * startProxyThread: whether to start the proxy thread (default is true) - * + * * Returns: the newly created DeviceConnection */ DeviceConnection toDevice(bool startProxyThread = true); @@ -269,7 +287,8 @@ private: }; #define MSCCLPP_UNIQUE_ID_BYTES 128 -struct UniqueId { +struct UniqueId +{ char internal[MSCCLPP_UNIQUE_ID_BYTES]; }; @@ -283,76 +302,78 @@ struct UniqueId { std::unique_ptr getUniqueId(); /* Transport Types */ -enum class TransportType : uint8_t { +enum class TransportType : uint8_t +{ P2P = 0, IB = 1, }; -class Communicator { +class Communicator +{ public: /* Initialize the communicator. nranks processes with rank 0 to nranks-1 need to call this function. - * - * Inputs: - * nranks: number of ranks in the communicator - * ipPortPair: a string of the form "ip:port" that represents the address of the root process - * rank: rank of the calling process - */ + * + * Inputs: + * nranks: number of ranks in the communicator + * ipPortPair: a string of the form "ip:port" that represents the address of the root process + * rank: rank of the calling process + */ void initRank(int nranks, const char* ipPortPair, int rank); - + /* Initialize the communicator from a given UniqueId. Same as mscclppCommInitRank() except that - * id is provided by the user by calling getUniqueId() - * - * Inputs: - * nranks: number of ranks in the communicator - * id: the unique ID to be used for communication - * rank: rank of the calling process - */ + * id is provided by the user by calling getUniqueId() + * + * Inputs: + * nranks: number of ranks in the communicator + * id: the unique ID to be used for communication + * rank: rank of the calling process + */ void initRankFromId(int nranks, UniqueId id, int rank); - + /* Ring-based AllGather through the bootstrap socket. 
- * - * Inputs: - * data: data array to be gathered where `[r*size, (r+1)*size)` is the data for rank `r` - * size: data size per rank - */ + * + * Inputs: + * data: data array to be gathered where `[r*size, (r+1)*size)` is the data for rank `r` + * size: data size per rank + */ void bootstrapAllGather(void* data, int size); /* A no-op function that is used to synchronize all processes via a bootstrap allgather*/ void bootstrapBarrier(); /* Connect to a remote rank. This function only prepares metadata for connection. The actual connection - * is made by a following call of mscclppConnectionSetup(). Note that this function is two-way and a connection - * from rank i to remote rank j needs to have a counterpart from rank j to rank i. - * Note that with IB, buffers are registered at a page level and if a buffer is spread through multiple pages - * and do not fully utilize all of them, IB's QP has to register for all involved pages. This potentially has - * security risks if the devConn's accesses are given to a malicious process. - * - * Inputs: - * remoteRank: the rank of the remote process - * tag: the tag of the connection. tag is copied into the corresponding mscclppDevConn_t, which can be - * used to identify the connection inside a GPU kernel. - * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) - * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. - */ + * is made by a following call of mscclppConnectionSetup(). Note that this function is two-way and a connection + * from rank i to remote rank j needs to have a counterpart from rank j to rank i. + * Note that with IB, buffers are registered at a page level and if a buffer is spread through multiple pages + * and do not fully utilize all of them, IB's QP has to register for all involved pages. This potentially has + * security risks if the devConn's accesses are given to a malicious process. 
+ * + * Inputs: + * remoteRank: the rank of the remote process + * tag: the tag of the connection. tag is copied into the corresponding mscclppDevConn_t, which can be + * used to identify the connection inside a GPU kernel. + * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) + * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. + */ std::shared_ptr connect(int remoteRank, int tag, TransportType transportType, const char* ibDev = 0); /* Establish all connections created by mscclppConnect(). This function must be called after all mscclppConnect() - * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. - */ + * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. + */ void connectionSetup(); - + /* Return the rank of the calling process. - * - * Outputs: - * rank: the rank of the calling process - */ + * + * Outputs: + * rank: the rank of the calling process + */ int rank(); /* Return the number of ranks of the communicator. - * - * Outputs: - * size: the number of ranks of the communicator - */ + * + * Outputs: + * size: the number of ranks of the communicator + */ int size(); private: diff --git a/src/include/mscclppfifo.hpp b/src/include/mscclppfifo.hpp index 27abd4c5..f602216d 100644 --- a/src/include/mscclppfifo.hpp +++ b/src/include/mscclppfifo.hpp @@ -1,12 +1,13 @@ #ifndef MSCCLPPFIFO_HPP_ #define MSCCLPPFIFO_HPP_ -#include #include +#include namespace mscclpp { -struct alignas(16) ProxyTrigger { +struct alignas(16) ProxyTrigger +{ uint64_t fst, snd; }; @@ -23,7 +24,8 @@ struct alignas(16) ProxyTrigger { * Why duplicating the tail is a good idea? The fifo is large engouh and we do not need frequent updates * for the tail as there is usually enough space for device threads to push their work into. 
*/ -struct ProxyFifo { +struct ProxyFifo +{ #ifdef __CUDACC__ __forceinline__ __device__ uint64_t push(ProxyTrigger element) { @@ -33,8 +35,8 @@ struct ProxyFifo { while (*(volatile uint64_t*)&this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0) ; uint64_t* valptr = (uint64_t*)&(this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE].value); - asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(valptr), - "l"(element.value[0]), "l"(element.value[1])); + asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(valptr), "l"(element.value[0]), + "l"(element.value[1])); return curFifoHead; } #endif // __CUDACC__ @@ -43,9 +45,9 @@ struct ProxyFifo { void stopProxyThread(); ProxyTrigger* triggerFifo; // Allocate on host via cudaHostAlloc. This space is used for pushing the workelements - uint64_t* triggerFifoTail; // Allocated on device. proxyState->fifoTailHost is the true tail on host and pused - // occasionally to device - uint64_t* triggerFifoHead; // Allocated on device. Only accessed by device + uint64_t* triggerFifoTail; // Allocated on device. proxyState->fifoTailHost is the true tail on host and pused + // occasionally to device + uint64_t* triggerFifoHead; // Allocated on device. 
Only accessed by device }; } // namespace mscclpp diff --git a/src/init.cc b/src/init.cc index 7c3b76b9..d30d2c17 100644 --- a/src/init.cc +++ b/src/init.cc @@ -326,7 +326,8 @@ struct mscclppHostP2PConn : mscclppHostConn { put(1, dstDataOffset, 1, srcDataOffset, dataSize); } - void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, uint64_t dataSize) + void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, + uint64_t dataSize) { void* srcBuff = (void*)((char*)conn->bufferRegistrations[src].data + srcDataOffset); void* dstBuff = (void*)((char*)conn->remoteBufferRegistrations[dst].data + dstDataOffset); @@ -364,7 +365,8 @@ struct mscclppHostIBConn : mscclppHostConn { put(1, dstDataOffset, 1, srcDataOffset, dataSize); } - void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, uint64_t dataSize) + void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, + uint64_t dataSize) { this->ibQp->stageSend(this->ibMrs[src], &this->remoteIbMrInfos[dst], (uint32_t)dataSize, /*wrId=*/0, /*srcOffset=*/srcDataOffset, /*dstOffset=*/dstDataOffset, /*signaled=*/false); @@ -423,7 +425,8 @@ struct mscclppHostIBConn : mscclppHostConn std::vector remoteIbMrInfos; }; -MSCCLPP_API mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, mscclppTransport_t transportType, const char* ibDev) +MSCCLPP_API mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, + mscclppTransport_t transportType, const char* ibDev) { // save this processes numa binding and set it to the one closest to the device // so that all the allocation are close to the device @@ -563,7 +566,8 @@ MSCCLPP_API mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int MSCCLPPCHECK(setNumaState(curProcessState)); mscclppBufferHandle_t 
signalHandle = -1; - MSCCLPPCHECK(mscclppRegisterBufferForConnection(comm, connId, conn->devConn->localSignalEpochId, sizeof(mscclppDevConnSignalEpochId), &signalHandle)); + MSCCLPPCHECK(mscclppRegisterBufferForConnection(comm, connId, conn->devConn->localSignalEpochId, + sizeof(mscclppDevConnSignalEpochId), &signalHandle)); if (signalHandle != 0) { WARN("signal handle should be 0"); return mscclppInternalError; @@ -592,7 +596,9 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i return mscclppSuccess; } -MSCCLPP_API mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, uint64_t buffSize, mscclppBufferHandle_t *handle) { +MSCCLPP_API mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, + uint64_t buffSize, mscclppBufferHandle_t* handle) +{ if (connIdx >= comm->nConns) { WARN("connIdx out of range"); return mscclppInvalidArgument; @@ -618,27 +624,32 @@ struct connInfo mscclppIbQpInfo infoQp; std::vector bufferInfos; - struct header { + struct header + { mscclppIbQpInfo infoQp; int numBufferInfos; }; - mscclppResult_t sendOverBootstrap(void* bootstrap, int remoteRank, int tag) { + mscclppResult_t sendOverBootstrap(void* bootstrap, int remoteRank, int tag) + { header h; h.infoQp = infoQp; h.numBufferInfos = bufferInfos.size(); MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, &h, sizeof(header))); - MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, bufferInfos.data(), bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); - return mscclppSuccess; + MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, bufferInfos.data(), + bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); + return mscclppSuccess; } - mscclppResult_t recvOverBootstrap(void* bootstrap, int remoteRank, int tag) { + mscclppResult_t recvOverBootstrap(void* bootstrap, int remoteRank, int tag) + { header h; MSCCLPPCHECK(bootstrapRecv(bootstrap, 
remoteRank, tag, &h, sizeof(header))); infoQp = h.infoQp; bufferInfos.resize(h.numBufferInfos); - MSCCLPPCHECK(bootstrapRecv(bootstrap, remoteRank, tag, bufferInfos.data(), bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); - return mscclppSuccess; + MSCCLPPCHECK(bootstrapRecv(bootstrap, remoteRank, tag, bufferInfos.data(), + bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); + return mscclppSuccess; } }; @@ -650,7 +661,7 @@ mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*input } // Add all registered buffers - for (const auto &bufReg : conn->bufferRegistrations) { + for (const auto& bufReg : conn->bufferRegistrations) { connInfo->bufferInfos.emplace_back(); CUDACHECK(cudaIpcGetMemHandle(&connInfo->bufferInfos.back().cudaHandle, bufReg.data)); connInfo->bufferInfos.back().size = bufReg.size; @@ -672,7 +683,8 @@ mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/ // Open all remote registered buffers for (size_t i = 0; i < connInfo->bufferInfos.size(); i++) { mscclppBufferRegistration newBufReg; - CUDACHECK(cudaIpcOpenMemHandle(&newBufReg.data, connInfo->bufferInfos[i].cudaHandle, cudaIpcMemLazyEnablePeerAccess)); + CUDACHECK( + cudaIpcOpenMemHandle(&newBufReg.data, connInfo->bufferInfos[i].cudaHandle, cudaIpcMemLazyEnablePeerAccess)); newBufReg.size = connInfo->bufferInfos[i].size; conn->remoteBufferRegistrations.push_back(newBufReg); } @@ -683,8 +695,8 @@ mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/ } conn->devConn->remoteSignalEpochId = (mscclppDevConnSignalEpochId*)conn->remoteBufferRegistrations[0].data; - // For backwards compatibility with the previous API that assumed one data buffer per connection, set the remote buffer - // to the first remote data buffer + // For backwards compatibility with the previous API that assumed one data buffer per connection, set the remote + // buffer to the first remote data buffer if 
(conn->remoteBufferRegistrations.size() > 1) { conn->devConn->remoteBuff = conn->remoteBufferRegistrations[1].data; } @@ -708,10 +720,10 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output } // Add all registered buffers - for (const auto &bufReg : conn->bufferRegistrations) { + for (const auto& bufReg : conn->bufferRegistrations) { hostConn->ibMrs.emplace_back(); - MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, bufReg.data, - sizeof(struct mscclppDevConnSignalEpochId), &hostConn->ibMrs.back())); + MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, bufReg.data, sizeof(struct mscclppDevConnSignalEpochId), + &hostConn->ibMrs.back())); connInfo->bufferInfos.emplace_back(); connInfo->bufferInfos.back().ibMrInfo = hostConn->ibMrs.back()->info; connInfo->bufferInfos.back().size = bufReg.size; @@ -764,7 +776,8 @@ MSCCLPP_API mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) MSCCLPPCHECK(mscclppIbConnectionSetupStart(&cInfo, conn)); } // TODO: from saemal: do we possibly deadlock if there are too many outstanding sends? 
- // MSCCLPPCHECK(bootstrapSend(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &cInfo, sizeof(cInfo))); + // MSCCLPPCHECK(bootstrapSend(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &cInfo, + // sizeof(cInfo))); MSCCLPPCHECK(cInfo.sendOverBootstrap(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag)); } diff --git a/tests/allgather_test_cpp.cu b/tests/allgather_test_cpp.cu index ca30945f..aa49ca2f 100644 --- a/tests/allgather_test_cpp.cu +++ b/tests/allgather_test_cpp.cu @@ -240,7 +240,7 @@ void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& co comm.connectionSetup(); assert(devConns.size() < sizeof(constDevConns) / sizeof(mscclpp::DeviceConnection)); - CUDACHECK(cudaMemcpyToSymbol(constDevConns, devConns.data(), sizeof(mscclpp::DeviceConnection) * devConns.size() )); + CUDACHECK(cudaMemcpyToSymbol(constDevConns, devConns.data(), sizeof(mscclpp::DeviceConnection) * devConns.size())); } void printUsage(const char* prog, bool isMpi) @@ -390,20 +390,20 @@ int main(int argc, const char* argv[]) } size_t nelemsPerGPU = dataSize / sizeof(int) / world_size; - try{ + try { mscclpp::Communicator comm; if (rank == 0) - printf("Initializing MSCCL++\n"); + printf("Initializing MSCCL++\n"); comm.initRank(world_size, ip_port, rank); if (rank == 0) - printf("Initializing data for allgather test\n"); + printf("Initializing data for allgather test\n"); initializeAndAllocateAllGatherData(rank, world_size, dataSize, nelemsPerGPU, &data_h, &data_d); if (rank == 0) - printf("Setting up the connection in MSCCL++\n"); + printf("Setting up the connection in MSCCL++\n"); setupMscclppConnections(rank, world_size, comm, data_d, dataSize); } catch (std::exception& e) { diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index c2ef61f0..2f59c070 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -6,53 +6,53 @@ int main() { - int rank, worldSize; - MPI_Init(NULL, NULL); - 
MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &worldSize); + int rank, worldSize; + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &worldSize); - std::shared_ptr bootstrap(new mscclppBootstrap(rank, worldSize)); - // bootstrap->Initialize("costsim-dev-00000A:50000"); - UniqueId id; - if (rank == 0) - id = bootstrap->GetUniqueId(); - MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->Initialize(id); + std::shared_ptr bootstrap(new mscclppBootstrap(rank, worldSize)); + // bootstrap->Initialize("costsim-dev-00000A:50000"); + UniqueId id; + if (rank == 0) + id = bootstrap->GetUniqueId(); + MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + bootstrap->Initialize(id); - std::vector tmp(worldSize, 0); - tmp[rank] = rank+1; - bootstrap->AllGather(tmp.data(), sizeof(int)); - for (int i = 0; i < worldSize; i++){ - if (tmp[i] != i+1) - printf("error AllGather: rank %d: tmp[%d] = %d\n", rank, i, tmp[i]); - } - printf("rank %d: AllGather test passed!\n", rank); + std::vector tmp(worldSize, 0); + tmp[rank] = rank + 1; + bootstrap->AllGather(tmp.data(), sizeof(int)); + for (int i = 0; i < worldSize; i++) { + if (tmp[i] != i + 1) + printf("error AllGather: rank %d: tmp[%d] = %d\n", rank, i, tmp[i]); + } + printf("rank %d: AllGather test passed!\n", rank); - bootstrap->Barrier(); - printf("rank %d: Barrier test passed!\n", rank); + bootstrap->Barrier(); + printf("rank %d: Barrier test passed!\n", rank); - for (int i = 0; i < worldSize; i++){ - if (i == rank) - continue; - int msg1 = (rank + 1)*2; - int msg2 = (rank + 1)*2+1; - bootstrap->Send(&msg1, sizeof(int), i, 0); - bootstrap->Send(&msg2, sizeof(int), i, 1); - } + for (int i = 0; i < worldSize; i++) { + if (i == rank) + continue; + int msg1 = (rank + 1) * 2; + int msg2 = (rank + 1) * 2 + 1; + bootstrap->Send(&msg1, sizeof(int), i, 0); + bootstrap->Send(&msg2, sizeof(int), i, 1); + } - for (int i = 0; i < worldSize; i++){ - if (i 
== rank) - continue; - int msg1 = 0; - int msg2 = 0; - // recv them in the opposite order to check correctness - bootstrap->Recv(&msg2, sizeof(int), i, 1); - bootstrap->Recv(&msg1, sizeof(int), i, 0); - if (msg1 != (i+1)*2 || msg2 != (i+1)*2+1) - printf("error Send/Recv: rank %d: msg1 = %d, msg2 = %d\n", rank, msg1, msg2); - } - printf("rank %d: Send/Recv test passed!\n", rank); + for (int i = 0; i < worldSize; i++) { + if (i == rank) + continue; + int msg1 = 0; + int msg2 = 0; + // recv them in the opposite order to check correctness + bootstrap->Recv(&msg2, sizeof(int), i, 1); + bootstrap->Recv(&msg1, sizeof(int), i, 0); + if (msg1 != (i + 1) * 2 || msg2 != (i + 1) * 2 + 1) + printf("error Send/Recv: rank %d: msg1 = %d, msg2 = %d\n", rank, msg1, msg2); + } + printf("rank %d: Send/Recv test passed!\n", rank); - MPI_Finalize(); - return 0; + MPI_Finalize(); + return 0; } \ No newline at end of file From 3fd95265fd742f89387767d4b010de6603abcfd9 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 24 Apr 2023 23:22:56 +0000 Subject: [PATCH 048/135] Revert "lint" This reverts commit 2c52ab37cebbf13af9cfc2506fbaaaf0993542d5. 
--- src/bootstrap/bootstrap.cc | 37 ++++---- src/communicator.cpp | 67 ++++++-------- src/host_connection.cpp | 50 +++++------ src/include/checks.hpp | 8 +- src/include/comm.h | 2 +- src/include/host_connection.hpp | 7 +- src/include/mscclpp.h | 13 ++- src/include/mscclpp.hpp | 155 ++++++++++++++------------------ src/include/mscclppfifo.hpp | 18 ++-- src/init.cc | 53 +++++------ tests/allgather_test_cpp.cu | 10 +-- tests/bootstrap_test_cpp.cc | 86 +++++++++--------- 12 files changed, 225 insertions(+), 281 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index a7f43267..b38d4b84 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -1,6 +1,6 @@ #include "bootstrap.h" -#include "checks.hpp" #include "utils.h" +#include "checks.hpp" #include #include @@ -95,9 +95,9 @@ private: void bootstrapCreateRoot(); void bootstrapRoot(mscclppSocket listenSock); void getRemoteAddresses(mscclppSocket* listenSock, std::vector& rankAddresses, - std::vector& rankAddressesRoot, int& rank); + std::vector& rankAddressesRoot, int& rank); void sendHandleToPeer(int peer, const std::vector& rankAddresses, - const std::vector& rankAddressesRoot); + const std::vector& rankAddressesRoot); void netInit(std::string ipPortPair); }; @@ -153,8 +153,9 @@ mscclppBootstrap::Impl::~Impl() } void mscclppBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, - std::vector& rankAddresses, - std::vector& rankAddressesRoot, int& rank) + std::vector& rankAddresses, + std::vector& rankAddressesRoot, + int& rank) { mscclppSocket sock; extInfo info; @@ -167,13 +168,11 @@ void mscclppBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, MSCCLPPTHROW(mscclppSocketClose(&sock)); if (this->nRanks_ != info.nRanks) { - throw std::runtime_error("Bootstrap Root : mismatch in rank count from procs " + std::to_string(this->nRanks_) + - " : " + std::to_string(info.nRanks)); + throw std::runtime_error("Bootstrap Root : mismatch in rank count from 
procs " + std::to_string(this->nRanks_) + " : " + std::to_string(info.nRanks)); } if (std::memcmp(&zero, &rankAddressesRoot[info.rank], sizeof(mscclppSocketAddress)) != 0) { - throw std::runtime_error("Bootstrap Root : rank " + std::to_string(info.rank) + " of " + - std::to_string(this->nRanks_) + " has already checked in"); + throw std::runtime_error("Bootstrap Root : rank " + std::to_string(info.rank) + " of " + std::to_string(this->nRanks_) + " has already checked in"); } // Save the connection handle for that rank @@ -182,8 +181,9 @@ void mscclppBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, rank = info.rank; } -void mscclppBootstrap::Impl::sendHandleToPeer(int peer, const std::vector& rankAddresses, - const std::vector& rankAddressesRoot) +void mscclppBootstrap::Impl::sendHandleToPeer(int peer, + const std::vector& rankAddresses, + const std::vector& rankAddressesRoot) { mscclppSocket sock; int next = (peer + 1) % this->nRanks_; @@ -210,7 +210,9 @@ void mscclppBootstrap::Impl::bootstrapCreateRoot() if (ret != mscclppSuccess) { throw std::runtime_error("Failed to get socket address"); } - auto lambda = [this, listenSock]() { this->bootstrapRoot(listenSock); }; + auto lambda = [this, listenSock]() { + this->bootstrapRoot(listenSock); + }; rootThread_ = std::thread(lambda); } @@ -250,8 +252,7 @@ void mscclppBootstrap::Impl::netInit(std::string ipPortPair) if (!ipPortPair.empty()) { union mscclppSocketAddress remoteAddr; if (mscclppSocketGetAddrFromString(&remoteAddr, ipPortPair.c_str()) != mscclppSuccess) { - throw std::runtime_error( - "Invalid ipPortPair, please use format: : or []: or :"); + throw std::runtime_error("Invalid ipPortPair, please use format: : or []: or :"); } if (mscclppFindInterfaceMatchSubnet(netIfName_, &netIfAddr_, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { throw std::runtime_error("NET/Socket : No usable listening interface found"); @@ -304,6 +305,7 @@ void mscclppBootstrap::Impl::establishConnections() 
randomSleep(this->rank_); } + char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2]; std::sprintf(line, " %s:", netIfName_); mscclppSocketToString(&this->uniqueId_.addr, line + strlen(line)); @@ -371,8 +373,7 @@ void mscclppBootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) int recvSize; MSCCLPPTHROW(mscclppSocketRecv(sock, &recvSize, sizeof(int))); if (recvSize > size) { - throw std::runtime_error("Message truncated : received " + std::to_string(recvSize) + " bytes instead of " + - std::to_string(size)); + throw std::runtime_error("Message truncated : received " + std::to_string(recvSize) + " bytes instead of " + std::to_string(size)); } MSCCLPPTHROW(mscclppSocketRecv(sock, data, std::min(recvSize, size))); } @@ -393,8 +394,8 @@ void mscclppBootstrap::Impl::send(void* data, int size, int peer, int tag) void mscclppBootstrap::Impl::recv(void* data, int size, int peer, int tag) { // search over all unexpected messages - for (auto it = unexpectedMessages_.begin(); it != unexpectedMessages_.end(); ++it) { - if (it->peer == peer && it->tag == tag) { + for (auto it = unexpectedMessages_.begin(); it != unexpectedMessages_.end(); ++it){ + if (it->peer == peer && it->tag == tag){ // found a match netRecv(it->sock.get(), data, size); MSCCLPPTHROW(mscclppSocketClose(it->sock.get())); diff --git a/src/communicator.cpp b/src/communicator.cpp index ee3b9cd1..73d82997 100644 --- a/src/communicator.cpp +++ b/src/communicator.cpp @@ -1,39 +1,33 @@ -#include "mscclpp.h" #include "mscclpp.hpp" +#include "mscclpp.h" namespace mscclpp { -mscclppTransport_t transportTypeToCStyle(TransportType type) -{ +mscclppTransport_t transportTypeToCStyle(TransportType type) { switch (type) { - case TransportType::IB: - return mscclppTransportIB; - case TransportType::P2P: - return mscclppTransportP2P; - default: - throw std::runtime_error("Unknown transport type"); + case TransportType::IB: + return mscclppTransportIB; + case TransportType::P2P: + return mscclppTransportP2P; + 
default: + throw std::runtime_error("Unknown transport type"); } } -struct Communicator::Impl -{ - mscclppComm_t comm; - std::vector> connections; +struct Communicator::Impl { + mscclppComm_t comm; + std::vector> connections; - Impl() : comm(nullptr) - { - } + Impl() : comm(nullptr) {} - ~Impl() - { - if (comm) { - mscclppCommDestroy(comm); + ~Impl() { + if (comm) { + mscclppCommDestroy(comm); + } } - } }; -void Communicator::initRank(int nranks, const char* ipPortPair, int rank) -{ +void Communicator::initRank(int nranks, const char* ipPortPair, int rank) { if (pimpl) { throw std::runtime_error("Communicator already initialized"); } @@ -41,30 +35,26 @@ void Communicator::initRank(int nranks, const char* ipPortPair, int rank) mscclppCommInitRank(&pimpl->comm, nranks, ipPortPair, rank); } -void Communicator::initRankFromId(int nranks, UniqueId id, int rank) -{ +void Communicator::initRankFromId(int nranks, UniqueId id, int rank) { if (pimpl) { throw std::runtime_error("Communicator already initialized"); } pimpl = std::make_unique(); static_assert(sizeof(mscclppUniqueId) == sizeof(UniqueId), "UniqueId size mismatch"); - mscclppUniqueId* cstyle_id = reinterpret_cast(&id); + mscclppUniqueId *cstyle_id = reinterpret_cast(&id); mscclppCommInitRankFromId(&pimpl->comm, nranks, *cstyle_id, rank); } -void Communicator::bootstrapAllGather(void* data, int size) -{ +void Communicator::bootstrapAllGather(void* data, int size) { mscclppBootstrapAllGather(pimpl->comm, data, size); } -void Communicator::bootstrapBarrier() -{ +void Communicator::bootstrapBarrier() { mscclppBootstrapBarrier(pimpl->comm); } -std::shared_ptr Communicator::connect(int remoteRank, int tag, TransportType transportType, - const char* ibDev = 0) -{ +std::shared_ptr Communicator::connect(int remoteRank, int tag, + TransportType transportType, const char* ibDev = 0) { mscclppConnectWithoutBuffer(pimpl->comm, remoteRank, tag, transportTypeToCStyle(transportType), ibDev); auto conn = std::make_shared(); auto 
connIdx = pimpl->connections.size(); @@ -72,10 +62,9 @@ std::shared_ptr Communicator::connect(int remoteRank, int tag, T return conn; } -void Communicator::connectionSetup() -{ +void Communicator::connectionSetup() { mscclppConnectionSetup(pimpl->comm); - mscclppHostConn_t* hostConns; + mscclppHostConn_t *hostConns; int numHostConns; mscclppGetAllHostConnections(pimpl->comm, &hostConns, &numHostConns); if (numHostConns != pimpl->connections.size()) { @@ -86,15 +75,13 @@ void Communicator::connectionSetup() } } -int Communicator::rank() -{ +int Communicator::rank() { int result; mscclppCommRank(pimpl->comm, &result); return result; } -int Communicator::size() -{ +int Communicator::size() { int result; mscclppCommSize(pimpl->comm, &result); return result; diff --git a/src/host_connection.cpp b/src/host_connection.cpp index d41c60d4..6a06de63 100644 --- a/src/host_connection.cpp +++ b/src/host_connection.cpp @@ -2,58 +2,54 @@ namespace mscclpp { -HostConnection::Impl::Impl() : hostConn(nullptr) -{ -} +HostConnection::Impl::Impl() : hostConn(nullptr) {} -HostConnection::Impl::~Impl() -{ +HostConnection::Impl::~Impl() { // TODO: figure out memory ownership. Does this deallocate the mscclppHostConn? Likely not. 
} -void HostConnection::Impl::setup(mscclppHostConn_t* hostConn) -{ +void HostConnection::Impl::setup(mscclppHostConn_t *hostConn) { this->hostConn = hostConn; } -BufferHandle HostConnection::registerBuffer(void* data, uint64_t size) -{ +BufferHandle HostConnection::registerBuffer(void* data, uint64_t size) { + } -int HostConnection::numRemoteBuffers() -{ +int HostConnection::numRemoteBuffers() { + } -BufferHandle HostConnection::getRemoteBuffer(int index) -{ +BufferHandle HostConnection::getRemoteBuffer(int index) { + } -DeviceConnection HostConnection::toDevice(bool startProxyThread = true) -{ +DeviceConnection HostConnection::toDevice(bool startProxyThread = true) { + } -void HostConnection::put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) -{ +void HostConnection::put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { + } -void HostConnection::put(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) -{ +void HostConnection::put(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) { + } -void HostConnection::signal() -{ +void HostConnection::signal() { + } -void HostConnection::flush() -{ +void HostConnection::flush() { + } -void HostConnection::wait() -{ +void HostConnection::wait() { + } -void HostConnection::epochIncrement() -{ +void HostConnection::epochIncrement() { + } } // namespace mscclpp \ No newline at end of file diff --git a/src/include/checks.hpp b/src/include/checks.hpp index bb88ebf6..ee5f7058 100644 --- a/src/include/checks.hpp +++ b/src/include/checks.hpp @@ -30,21 +30,21 @@ #include // Check system calls -#define SYSCHECKTHROW(call, name) \ +#define SYSCHECKTHROW(call, name) \ do { \ int retval; \ SYSCHECKVAL(call, name, retval); \ } while (false) -#define SYSCHECKVALTHROW(call, name, retval) \ +#define SYSCHECKVALTHROW(call, name, retval) \ do { \ SYSCHECKSYNC(call, name, retval); \ if (retval == -1) { \ - 
std::runtime_error(std::string("Call to " name " failed : ") + strerror(errno)); \ + std::runtime_error(std::string("Call to " name " failed : ") + strerror(errno)); \ } \ } while (false) -#define SYSCHECKSYNCTHROW(call, name, retval) \ +#define SYSCHECKSYNCTHROW(call, name, retval) \ do { \ retval = call; \ if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ diff --git a/src/include/comm.h b/src/include/comm.h index 672cdd95..8275e0cb 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -15,7 +15,7 @@ struct mscclppBufferRegistration { - void* data; + void *data; uint64_t size; }; diff --git a/src/include/host_connection.hpp b/src/include/host_connection.hpp index bdf49df4..4a66c846 100644 --- a/src/include/host_connection.hpp +++ b/src/include/host_connection.hpp @@ -1,20 +1,19 @@ #ifndef MSCCLPP_HOST_CONNECTION_HPP_ #define MSCCLPP_HOST_CONNECTION_HPP_ -#include "mscclpp.h" #include "mscclpp.hpp" +#include "mscclpp.h" namespace mscclpp { -struct HostConnection::Impl -{ +struct HostConnection::Impl { mscclppHostConn_t* hostConn; Impl(); ~Impl(); - void setup(mscclppHostConn_t* hostConn); + void setup(mscclppHostConn_t *hostConn); }; } // namespace mscclpp diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index f8931834..0e7f76e5 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -191,8 +191,7 @@ struct mscclppHostConn { virtual ~mscclppHostConn() = default; virtual void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) = 0; - virtual void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, - uint64_t dataSize) = 0; + virtual void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, uint64_t dataSize) = 0; virtual void signal() = 0; virtual void wait() = 0; virtual void flush() = 0; @@ -248,8 +247,8 @@ typedef enum mscclppNumResults = 8 } mscclppResult_t; -class Bootstrap -{ 
+ +class Bootstrap { public: Bootstrap(){}; virtual ~Bootstrap() = default; @@ -369,8 +368,7 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. */ -mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, - mscclppTransport_t transportType, const char* ibDev = 0); +mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, mscclppTransport_t transportType, const char* ibDev = 0); /* Register a buffer for use with a connection. * @@ -383,8 +381,7 @@ mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, * Outputs: * handle: a handle to the buffer registration */ -mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, uint64_t buffSize, - mscclppBufferHandle_t* handle); +mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, uint64_t buffSize, mscclppBufferHandle_t *handle); /* Establish all connections declared by mscclppConnect(). This function must be called after all mscclppConnect() * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. 
diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index 1dbe180f..fbc96f43 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -11,15 +11,14 @@ #define MSCCLPP_PROXY_FIFO_SIZE 128 #define MSCCLPP_PROXY_FIFO_FLUSH_COUNTER 4 -#include #include +#include #include namespace mscclpp { -struct alignas(16) SignalEpochId -{ +struct alignas(16) SignalEpochId { // every signal(), increaments this and either: // 1) proxy thread pushes it to the remote peer's localSignalEpochId->proxy // 2) gpu thread directly writes it to remoteSignalEpochId->device @@ -28,15 +27,14 @@ struct alignas(16) SignalEpochId uint64_t proxy; }; -enum ChannelTriggerType : uint64_t -{ +enum ChannelTriggerType : uint64_t { channelTriggerData = 0x1, channelTriggerFlag = 0x2, channelTriggerSync = 0x4 }; // This is just a numeric ID. Each HostConnection will have an internal array indexed by these handles -// mapping to the actual +// mapping to the actual using BufferHandle = uint8_t; #define MSCCLPP_BITS_SIZE 32 @@ -60,23 +58,14 @@ union ChannelTrigger { uint64_t srcBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; uint64_t dstBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; uint64_t type : MSCCLPP_BITS_TYPE; - uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_BUFFER_HANDLE - - MSCCLPP_BITS_TYPE); // ensure 64-bit alignment + uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_TYPE); // ensure 64-bit alignment } fields; - ChannelTrigger() - { - } - ChannelTrigger(ProxyTrigger value) : value(value) - { - } - ChannelTrigger(ChannelTriggerType type, BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, - uint64_t size) - { + ChannelTrigger() {} + ChannelTrigger(ProxyTrigger value) : value(value) {} + ChannelTrigger(ChannelTriggerType type, BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { value.fst = ((srcOffset << MSCCLPP_BITS_SIZE) + 
size); - value.snd = (((((((uint64_t)type << MSCCLPP_BITS_BUFFER_HANDLE) + dst) << MSCCLPP_BITS_BUFFER_HANDLE) + src) - << MSCCLPP_BITS_OFFSET) + - dstOffset); + value.snd = (((((((uint64_t)type << MSCCLPP_BITS_BUFFER_HANDLE) + dst) << MSCCLPP_BITS_BUFFER_HANDLE) + src) << MSCCLPP_BITS_OFFSET) + dstOffset); } }; @@ -142,13 +131,11 @@ union ChannelTrigger { * The two endpoint can concurrently use the same connection provided they are writing (puts) on different * indices in the registered buffer. **************************************************************************************************************/ -struct DeviceConnection -{ +struct DeviceConnection { #ifdef __CUDACC__ // TODO: add buffer handles - __forceinline__ __device__ void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, - uint64_t size) + __forceinline__ __device__ void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { fifo.push(ChannelTrigger(channelTriggerData, dst, dstOffset, src, srcOffset, size).value); } @@ -164,8 +151,7 @@ struct DeviceConnection fifo.push(ChannelTrigger(channelTriggerFlag, 0, 0, 0, 0, 1).value); } - __forceinline__ __device__ void putWithSignal(BufferHandle dst, uint64_t dstOffset, BufferHandle src, - uint64_t srcOffset, uint64_t size) + __forceinline__ __device__ void putWithSignal(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { epochIncrement(); fifo.push(ChannelTrigger(channelTriggerData | channelTriggerFlag, dst, dstOffset, src, srcOffset, size).value); @@ -176,19 +162,16 @@ struct DeviceConnection putWithSignal(dst, offset, src, offset, size); } - __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, uint64_t dstOffset, BufferHandle src, - uint64_t srcOffset, uint64_t size) + __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { 
epochIncrement(); - uint64_t curFifoHead = - fifo.push(channelTriggerData | channelTriggerFlag | channelTriggerSync, dstOffset, srcOffset, size); + uint64_t curFifoHead = fifo.push(channelTriggerData | channelTriggerFlag | channelTriggerSync, dstOffset, srcOffset, size); while (*(volatile uint64_t*)&fifo.triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && *(volatile uint64_t*)fifo.triggerFifoTail <= curFifoHead) ; } - __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, BufferHandle src, uint64_t offset, - uint64_t size) + __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) { putWithSignalAndFlush(offset, offset, size); } @@ -234,8 +217,7 @@ struct DeviceConnection ProxyFifo fifo; }; -class HostConnection -{ +class HostConnection { public: /* Register a region of GPU memory for use with this connection. Must be called before connectionSetup() * in the communicator. @@ -243,7 +225,7 @@ public: * Inputs: * data: base pointer to the memory * size: size of the memory region in bytes - * + * * Returns: a handle to the buffer */ BufferHandle registerBuffer(void* data, uint64_t size); @@ -258,7 +240,7 @@ public: * * Inputs: * index: the index of the handle to get - * + * * Returns: a handle to the buffer on the remote peer */ BufferHandle getRemoteBuffer(int index); @@ -266,10 +248,10 @@ public: /* Create a DeviceConnection paired with this HostConnection. A background proxy thread will * trigger operations on this HostConnection corresponding to put/signal/etc. calls made to the * DeviceConnection. 
- * + * * Inputs: * startProxyThread: whether to start the proxy thread (default is true) - * + * * Returns: the newly created DeviceConnection */ DeviceConnection toDevice(bool startProxyThread = true); @@ -287,8 +269,7 @@ private: }; #define MSCCLPP_UNIQUE_ID_BYTES 128 -struct UniqueId -{ +struct UniqueId { char internal[MSCCLPP_UNIQUE_ID_BYTES]; }; @@ -302,78 +283,76 @@ struct UniqueId std::unique_ptr getUniqueId(); /* Transport Types */ -enum class TransportType : uint8_t -{ +enum class TransportType : uint8_t { P2P = 0, IB = 1, }; -class Communicator -{ +class Communicator { public: /* Initialize the communicator. nranks processes with rank 0 to nranks-1 need to call this function. - * - * Inputs: - * nranks: number of ranks in the communicator - * ipPortPair: a string of the form "ip:port" that represents the address of the root process - * rank: rank of the calling process - */ + * + * Inputs: + * nranks: number of ranks in the communicator + * ipPortPair: a string of the form "ip:port" that represents the address of the root process + * rank: rank of the calling process + */ void initRank(int nranks, const char* ipPortPair, int rank); - + /* Initialize the communicator from a given UniqueId. Same as mscclppCommInitRank() except that - * id is provided by the user by calling getUniqueId() - * - * Inputs: - * nranks: number of ranks in the communicator - * id: the unique ID to be used for communication - * rank: rank of the calling process - */ + * id is provided by the user by calling getUniqueId() + * + * Inputs: + * nranks: number of ranks in the communicator + * id: the unique ID to be used for communication + * rank: rank of the calling process + */ void initRankFromId(int nranks, UniqueId id, int rank); - + /* Ring-based AllGather through the bootstrap socket. 
- * - * Inputs: - * data: data array to be gathered where `[r*size, (r+1)*size)` is the data for rank `r` - * size: data size per rank - */ + * + * Inputs: + * data: data array to be gathered where `[r*size, (r+1)*size)` is the data for rank `r` + * size: data size per rank + */ void bootstrapAllGather(void* data, int size); /* A no-op function that is used to synchronize all processes via a bootstrap allgather*/ void bootstrapBarrier(); /* Connect to a remote rank. This function only prepares metadata for connection. The actual connection - * is made by a following call of mscclppConnectionSetup(). Note that this function is two-way and a connection - * from rank i to remote rank j needs to have a counterpart from rank j to rank i. - * Note that with IB, buffers are registered at a page level and if a buffer is spread through multiple pages - * and do not fully utilize all of them, IB's QP has to register for all involved pages. This potentially has - * security risks if the devConn's accesses are given to a malicious process. - * - * Inputs: - * remoteRank: the rank of the remote process - * tag: the tag of the connection. tag is copied into the corresponding mscclppDevConn_t, which can be - * used to identify the connection inside a GPU kernel. - * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) - * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. - */ + * is made by a following call of mscclppConnectionSetup(). Note that this function is two-way and a connection + * from rank i to remote rank j needs to have a counterpart from rank j to rank i. + * Note that with IB, buffers are registered at a page level and if a buffer is spread through multiple pages + * and do not fully utilize all of them, IB's QP has to register for all involved pages. This potentially has + * security risks if the devConn's accesses are given to a malicious process. 
+ * + * Inputs: + * remoteRank: the rank of the remote process + * tag: the tag of the connection. tag is copied into the corresponding mscclppDevConn_t, which can be + * used to identify the connection inside a GPU kernel. + * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) + * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. + */ std::shared_ptr connect(int remoteRank, int tag, TransportType transportType, const char* ibDev = 0); /* Establish all connections created by mscclppConnect(). This function must be called after all mscclppConnect() - * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. - */ + * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. + */ void connectionSetup(); - + /* Return the rank of the calling process. - * - * Outputs: - * rank: the rank of the calling process - */ + * + * Outputs: + * rank: the rank of the calling process + */ int rank(); /* Return the number of ranks of the communicator. - * - * Outputs: - * size: the number of ranks of the communicator - */ + * + * Outputs: + * size: the number of ranks of the communicator + */ int size(); private: diff --git a/src/include/mscclppfifo.hpp b/src/include/mscclppfifo.hpp index f602216d..27abd4c5 100644 --- a/src/include/mscclppfifo.hpp +++ b/src/include/mscclppfifo.hpp @@ -1,13 +1,12 @@ #ifndef MSCCLPPFIFO_HPP_ #define MSCCLPPFIFO_HPP_ -#include #include +#include namespace mscclpp { -struct alignas(16) ProxyTrigger -{ +struct alignas(16) ProxyTrigger { uint64_t fst, snd; }; @@ -24,8 +23,7 @@ struct alignas(16) ProxyTrigger * Why duplicating the tail is a good idea? The fifo is large engouh and we do not need frequent updates * for the tail as there is usually enough space for device threads to push their work into. 
*/ -struct ProxyFifo -{ +struct ProxyFifo { #ifdef __CUDACC__ __forceinline__ __device__ uint64_t push(ProxyTrigger element) { @@ -35,8 +33,8 @@ struct ProxyFifo while (*(volatile uint64_t*)&this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0) ; uint64_t* valptr = (uint64_t*)&(this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE].value); - asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(valptr), "l"(element.value[0]), - "l"(element.value[1])); + asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(valptr), + "l"(element.value[0]), "l"(element.value[1])); return curFifoHead; } #endif // __CUDACC__ @@ -45,9 +43,9 @@ struct ProxyFifo void stopProxyThread(); ProxyTrigger* triggerFifo; // Allocate on host via cudaHostAlloc. This space is used for pushing the workelements - uint64_t* triggerFifoTail; // Allocated on device. proxyState->fifoTailHost is the true tail on host and pused - // occasionally to device - uint64_t* triggerFifoHead; // Allocated on device. Only accessed by device + uint64_t* triggerFifoTail; // Allocated on device. proxyState->fifoTailHost is the true tail on host and pused + // occasionally to device + uint64_t* triggerFifoHead; // Allocated on device. 
Only accessed by device }; } // namespace mscclpp diff --git a/src/init.cc b/src/init.cc index d30d2c17..7c3b76b9 100644 --- a/src/init.cc +++ b/src/init.cc @@ -326,8 +326,7 @@ struct mscclppHostP2PConn : mscclppHostConn { put(1, dstDataOffset, 1, srcDataOffset, dataSize); } - void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, - uint64_t dataSize) + void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, uint64_t dataSize) { void* srcBuff = (void*)((char*)conn->bufferRegistrations[src].data + srcDataOffset); void* dstBuff = (void*)((char*)conn->remoteBufferRegistrations[dst].data + dstDataOffset); @@ -365,8 +364,7 @@ struct mscclppHostIBConn : mscclppHostConn { put(1, dstDataOffset, 1, srcDataOffset, dataSize); } - void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, - uint64_t dataSize) + void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, uint64_t dataSize) { this->ibQp->stageSend(this->ibMrs[src], &this->remoteIbMrInfos[dst], (uint32_t)dataSize, /*wrId=*/0, /*srcOffset=*/srcDataOffset, /*dstOffset=*/dstDataOffset, /*signaled=*/false); @@ -425,8 +423,7 @@ struct mscclppHostIBConn : mscclppHostConn std::vector remoteIbMrInfos; }; -MSCCLPP_API mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, - mscclppTransport_t transportType, const char* ibDev) +MSCCLPP_API mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, mscclppTransport_t transportType, const char* ibDev) { // save this processes numa binding and set it to the one closest to the device // so that all the allocation are close to the device @@ -566,8 +563,7 @@ MSCCLPP_API mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int MSCCLPPCHECK(setNumaState(curProcessState)); mscclppBufferHandle_t 
signalHandle = -1; - MSCCLPPCHECK(mscclppRegisterBufferForConnection(comm, connId, conn->devConn->localSignalEpochId, - sizeof(mscclppDevConnSignalEpochId), &signalHandle)); + MSCCLPPCHECK(mscclppRegisterBufferForConnection(comm, connId, conn->devConn->localSignalEpochId, sizeof(mscclppDevConnSignalEpochId), &signalHandle)); if (signalHandle != 0) { WARN("signal handle should be 0"); return mscclppInternalError; @@ -596,9 +592,7 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i return mscclppSuccess; } -MSCCLPP_API mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, - uint64_t buffSize, mscclppBufferHandle_t* handle) -{ +MSCCLPP_API mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, uint64_t buffSize, mscclppBufferHandle_t *handle) { if (connIdx >= comm->nConns) { WARN("connIdx out of range"); return mscclppInvalidArgument; @@ -624,32 +618,27 @@ struct connInfo mscclppIbQpInfo infoQp; std::vector bufferInfos; - struct header - { + struct header { mscclppIbQpInfo infoQp; int numBufferInfos; }; - mscclppResult_t sendOverBootstrap(void* bootstrap, int remoteRank, int tag) - { + mscclppResult_t sendOverBootstrap(void* bootstrap, int remoteRank, int tag) { header h; h.infoQp = infoQp; h.numBufferInfos = bufferInfos.size(); MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, &h, sizeof(header))); - MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, bufferInfos.data(), - bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); - return mscclppSuccess; + MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, bufferInfos.data(), bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); + return mscclppSuccess; } - mscclppResult_t recvOverBootstrap(void* bootstrap, int remoteRank, int tag) - { + mscclppResult_t recvOverBootstrap(void* bootstrap, int remoteRank, int tag) { header h; MSCCLPPCHECK(bootstrapRecv(bootstrap, 
remoteRank, tag, &h, sizeof(header))); infoQp = h.infoQp; bufferInfos.resize(h.numBufferInfos); - MSCCLPPCHECK(bootstrapRecv(bootstrap, remoteRank, tag, bufferInfos.data(), - bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); - return mscclppSuccess; + MSCCLPPCHECK(bootstrapRecv(bootstrap, remoteRank, tag, bufferInfos.data(), bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); + return mscclppSuccess; } }; @@ -661,7 +650,7 @@ mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*input } // Add all registered buffers - for (const auto& bufReg : conn->bufferRegistrations) { + for (const auto &bufReg : conn->bufferRegistrations) { connInfo->bufferInfos.emplace_back(); CUDACHECK(cudaIpcGetMemHandle(&connInfo->bufferInfos.back().cudaHandle, bufReg.data)); connInfo->bufferInfos.back().size = bufReg.size; @@ -683,8 +672,7 @@ mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/ // Open all remote registered buffers for (size_t i = 0; i < connInfo->bufferInfos.size(); i++) { mscclppBufferRegistration newBufReg; - CUDACHECK( - cudaIpcOpenMemHandle(&newBufReg.data, connInfo->bufferInfos[i].cudaHandle, cudaIpcMemLazyEnablePeerAccess)); + CUDACHECK(cudaIpcOpenMemHandle(&newBufReg.data, connInfo->bufferInfos[i].cudaHandle, cudaIpcMemLazyEnablePeerAccess)); newBufReg.size = connInfo->bufferInfos[i].size; conn->remoteBufferRegistrations.push_back(newBufReg); } @@ -695,8 +683,8 @@ mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/ } conn->devConn->remoteSignalEpochId = (mscclppDevConnSignalEpochId*)conn->remoteBufferRegistrations[0].data; - // For backwards compatibility with the previous API that assumed one data buffer per connection, set the remote - // buffer to the first remote data buffer + // For backwards compatibility with the previous API that assumed one data buffer per connection, set the remote buffer + // to the first remote data buffer if 
(conn->remoteBufferRegistrations.size() > 1) { conn->devConn->remoteBuff = conn->remoteBufferRegistrations[1].data; } @@ -720,10 +708,10 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output } // Add all registered buffers - for (const auto& bufReg : conn->bufferRegistrations) { + for (const auto &bufReg : conn->bufferRegistrations) { hostConn->ibMrs.emplace_back(); - MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, bufReg.data, sizeof(struct mscclppDevConnSignalEpochId), - &hostConn->ibMrs.back())); + MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, bufReg.data, + sizeof(struct mscclppDevConnSignalEpochId), &hostConn->ibMrs.back())); connInfo->bufferInfos.emplace_back(); connInfo->bufferInfos.back().ibMrInfo = hostConn->ibMrs.back()->info; connInfo->bufferInfos.back().size = bufReg.size; @@ -776,8 +764,7 @@ MSCCLPP_API mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) MSCCLPPCHECK(mscclppIbConnectionSetupStart(&cInfo, conn)); } // TODO: from saemal: do we possibly deadlock if there are too many outstanding sends? 
- // MSCCLPPCHECK(bootstrapSend(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &cInfo, - // sizeof(cInfo))); + // MSCCLPPCHECK(bootstrapSend(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &cInfo, sizeof(cInfo))); MSCCLPPCHECK(cInfo.sendOverBootstrap(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag)); } diff --git a/tests/allgather_test_cpp.cu b/tests/allgather_test_cpp.cu index aa49ca2f..ca30945f 100644 --- a/tests/allgather_test_cpp.cu +++ b/tests/allgather_test_cpp.cu @@ -240,7 +240,7 @@ void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& co comm.connectionSetup(); assert(devConns.size() < sizeof(constDevConns) / sizeof(mscclpp::DeviceConnection)); - CUDACHECK(cudaMemcpyToSymbol(constDevConns, devConns.data(), sizeof(mscclpp::DeviceConnection) * devConns.size())); + CUDACHECK(cudaMemcpyToSymbol(constDevConns, devConns.data(), sizeof(mscclpp::DeviceConnection) * devConns.size() )); } void printUsage(const char* prog, bool isMpi) @@ -390,20 +390,20 @@ int main(int argc, const char* argv[]) } size_t nelemsPerGPU = dataSize / sizeof(int) / world_size; - try { + try{ mscclpp::Communicator comm; if (rank == 0) - printf("Initializing MSCCL++\n"); + printf("Initializing MSCCL++\n"); comm.initRank(world_size, ip_port, rank); if (rank == 0) - printf("Initializing data for allgather test\n"); + printf("Initializing data for allgather test\n"); initializeAndAllocateAllGatherData(rank, world_size, dataSize, nelemsPerGPU, &data_h, &data_d); if (rank == 0) - printf("Setting up the connection in MSCCL++\n"); + printf("Setting up the connection in MSCCL++\n"); setupMscclppConnections(rank, world_size, comm, data_d, dataSize); } catch (std::exception& e) { diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index 2f59c070..c2ef61f0 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -6,53 +6,53 @@ int main() { - int rank, worldSize; - MPI_Init(NULL, NULL); - 
MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &worldSize); + int rank, worldSize; + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &worldSize); - std::shared_ptr bootstrap(new mscclppBootstrap(rank, worldSize)); - // bootstrap->Initialize("costsim-dev-00000A:50000"); - UniqueId id; - if (rank == 0) - id = bootstrap->GetUniqueId(); - MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->Initialize(id); + std::shared_ptr bootstrap(new mscclppBootstrap(rank, worldSize)); + // bootstrap->Initialize("costsim-dev-00000A:50000"); + UniqueId id; + if (rank == 0) + id = bootstrap->GetUniqueId(); + MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + bootstrap->Initialize(id); - std::vector tmp(worldSize, 0); - tmp[rank] = rank + 1; - bootstrap->AllGather(tmp.data(), sizeof(int)); - for (int i = 0; i < worldSize; i++) { - if (tmp[i] != i + 1) - printf("error AllGather: rank %d: tmp[%d] = %d\n", rank, i, tmp[i]); - } - printf("rank %d: AllGather test passed!\n", rank); + std::vector tmp(worldSize, 0); + tmp[rank] = rank+1; + bootstrap->AllGather(tmp.data(), sizeof(int)); + for (int i = 0; i < worldSize; i++){ + if (tmp[i] != i+1) + printf("error AllGather: rank %d: tmp[%d] = %d\n", rank, i, tmp[i]); + } + printf("rank %d: AllGather test passed!\n", rank); - bootstrap->Barrier(); - printf("rank %d: Barrier test passed!\n", rank); + bootstrap->Barrier(); + printf("rank %d: Barrier test passed!\n", rank); - for (int i = 0; i < worldSize; i++) { - if (i == rank) - continue; - int msg1 = (rank + 1) * 2; - int msg2 = (rank + 1) * 2 + 1; - bootstrap->Send(&msg1, sizeof(int), i, 0); - bootstrap->Send(&msg2, sizeof(int), i, 1); - } + for (int i = 0; i < worldSize; i++){ + if (i == rank) + continue; + int msg1 = (rank + 1)*2; + int msg2 = (rank + 1)*2+1; + bootstrap->Send(&msg1, sizeof(int), i, 0); + bootstrap->Send(&msg2, sizeof(int), i, 1); + } - for (int i = 0; i < worldSize; i++) { - if (i 
== rank) - continue; - int msg1 = 0; - int msg2 = 0; - // recv them in the opposite order to check correctness - bootstrap->Recv(&msg2, sizeof(int), i, 1); - bootstrap->Recv(&msg1, sizeof(int), i, 0); - if (msg1 != (i + 1) * 2 || msg2 != (i + 1) * 2 + 1) - printf("error Send/Recv: rank %d: msg1 = %d, msg2 = %d\n", rank, msg1, msg2); - } - printf("rank %d: Send/Recv test passed!\n", rank); + for (int i = 0; i < worldSize; i++){ + if (i == rank) + continue; + int msg1 = 0; + int msg2 = 0; + // recv them in the opposite order to check correctness + bootstrap->Recv(&msg2, sizeof(int), i, 1); + bootstrap->Recv(&msg1, sizeof(int), i, 0); + if (msg1 != (i+1)*2 || msg2 != (i+1)*2+1) + printf("error Send/Recv: rank %d: msg1 = %d, msg2 = %d\n", rank, msg1, msg2); + } + printf("rank %d: Send/Recv test passed!\n", rank); - MPI_Finalize(); - return 0; + MPI_Finalize(); + return 0; } \ No newline at end of file From e4ee2eba25de399e4242b5ee9fd9f607b1b40e88 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 25 Apr 2023 00:41:45 +0000 Subject: [PATCH 049/135] WIP Connection in C++ --- src/communicator.cc | 38 ++++++++++++++-------- src/connection.cc | 54 +++++++++++++++++++++++++++++++ src/include/communicator.hpp | 12 +++---- src/include/connection.hpp | 48 +++++++++++++++++++++++++++ src/include/mscclpp.hpp | 38 +++++++++++----------- src/include/registered_memory.hpp | 46 ++++++++++++++++++++++++++ src/registered_memory.cc | 7 ++++ 7 files changed, 205 insertions(+), 38 deletions(-) create mode 100644 src/connection.cc create mode 100644 src/include/connection.hpp create mode 100644 src/include/registered_memory.hpp create mode 100644 src/registered_memory.cc diff --git a/src/communicator.cc b/src/communicator.cc index d12b20e4..a74923bb 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -17,9 +17,16 @@ Communicator::Impl::~Impl() { MSCCLPP_API_CPP Communicator::~Communicator() = default; -static mscclppTransport_t transportFlagsToCStyle(TransportFlags flags) { 
+static mscclppTransport_t transportToCStyle(TransportFlags flags) { switch (flags) { - case TransportIB: + case TransportIB0: + case TransportIB1: + case TransportIB2: + case TransportIB3: + case TransportIB4: + case TransportIB5: + case TransportIB6: + case TransportIB7: return mscclppTransportIB; case TransportCudaIpc: return mscclppTransportP2P; @@ -46,10 +53,23 @@ MSCCLPP_API_CPP void Communicator::bootstrapBarrier() { mscclppBootstrapBarrier(pimpl->comm); } -MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, TransportFlags transportFlags, const char* ibDev) { - mscclppConnectWithoutBuffer(pimpl->comm, remoteRank, tag, transportFlagsToCStyle(transportFlags), ibDev); +MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, TransportFlags transport) { + std::string ibDev; + switch (transport) { + case TransportIB0: + case TransportIB1: + case TransportIB2: + case TransportIB3: + case TransportIB4: + case TransportIB5: + case TransportIB6: + case TransportIB7: + ibDev = getIBDeviceName(transport); + break; + } + mscclppConnectWithoutBuffer(pimpl->comm, remoteRank, tag, transportToCStyle(transport), ibDev.c_str()); auto connIdx = pimpl->connections.size(); - auto conn = std::make_shared(std::make_unique(this, &pimpl->comm->conns[connIdx])); + auto conn = std::make_shared(std::make_unique(this, &pimpl->comm->conns[connIdx])); pimpl->connections.push_back(conn); return conn; } @@ -58,14 +78,6 @@ MSCCLPP_API_CPP void Communicator::connectionSetup() { mscclppConnectionSetup(pimpl->comm); } -MSCCLPP_API_CPP void Communicator::startProxying() { - pimpl->proxy.start(); -} - -MSCCLPP_API_CPP void Communicator::stopProxying() { - pimpl->proxy.stop(); -} - MSCCLPP_API_CPP int Communicator::rank() { int result; mscclppCommRank(pimpl->comm, &result); diff --git a/src/connection.cc b/src/connection.cc new file mode 100644 index 00000000..12ebee02 --- /dev/null +++ b/src/connection.cc @@ -0,0 +1,54 @@ +#include "connection.hpp" 
+#include "checks.hpp" +#include "registered_memory.hpp" + +namespace mscclpp { + +void validateTransport(RegisteredMemory mem, TransportFlags transport) { + if (mem.transports() & transport == TransportNone) { + throw std::runtime_error("mem does not support transport"); + } +} + +TransportFlags CudaIpcConnection::transport() { + return TransportCudaIpc; +} + +TransportFlags CudaIpcConnection::remoteTransport() { + return TransportCudaIpc; +} + +CudaIpcConnection::CudaIpcConnection() { + cudaStreamCreate(&stream); +} + +CudaIpcConnection::~CudaIpcConnection() { + cudaStreamDestroy(stream); +} + +void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) { + validateTransport(dst, remoteTransport()); + validateTransport(src, transport()); + + auto dstPtr = dst.impl->getTransportData(remoteTransport()); + auto srcPtr = src.impl->getTransportData(transport()); + CUDATHROW(cudaMemcpyAsync(dstPtr + dstOffset, srcPtr + srcOffset, size, cudaMemcpyDeviceToDevice, stream)); + npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)dataSize); +} + +void CudaIpcConnection::flush() { + CUDATHROW(cudaStreamSynchronize(stream)); + npkitCollectExitEvents(conn, NPKIT_EVENT_DMA_SEND_EXIT); +} + +IBConnection::IBConnection(TransportFlags transport) : transport_(transport), remoteTransport_(TransportNone) {} + +TransportFlags IBConnection::transport() { + return transport_; +} + +TransportFlags IBConnection::remoteTransport() { + return remoteTransport_; +} + +} // namespace mscclpp diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index f2816c1a..827b0281 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -9,15 +9,15 @@ namespace mscclpp { struct Communicator::Impl { - mscclppComm_t comm; - std::vector> connections; - Proxy proxy; + mscclppComm_t comm; + std::vector> connections; + Proxy proxy; - Impl(); + Impl(); - ~Impl(); + ~Impl(); - friend 
class HostConnection; + friend class Connection; }; } // namespace mscclpp diff --git a/src/include/connection.hpp b/src/include/connection.hpp new file mode 100644 index 00000000..048e2c6a --- /dev/null +++ b/src/include/connection.hpp @@ -0,0 +1,48 @@ +#ifndef MSCCLPP_CONNECTION_HPP_ +#define MSCCLPP_CONNECTION_HPP_ + +#include "mscclpp.hpp" +#include +#include "ib.h" + +namespace mscclpp { + +class CudaIpcConnection : public Connection { + cudaStream_t stream; +public: + + CudaIpcConnection(); + + virtual ~CudaIpcConnection(); + + virtual TransportFlags transport(); + + virtual TransportFlags remoteTransport(); + + virtual void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size); + + virtual void flush(); +}; + +class IBConnection : public Connection { + TransportFlags transport_; + TransportFlags remoteTransport_; + mscclppIbQp qp; +public: + + IBConnection(TransportFlags transport); + + virtual ~IBConnection(); + + virtual TransportFlags transport(); + + virtual TransportFlags remoteTransport(); + + virtual void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size); + + virtual void flush(); +}; + +} // namespace mscclpp + +#endif // MSCCLPP_CONNECTION_HPP_ diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index 67d40050..f4d73ab4 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -26,8 +26,9 @@ struct UniqueId { std::unique_ptr getUniqueId(); using TransportFlags = uint32_t; +const TransportFlags TransportNone = 0b0; const TransportFlags TransportCudaIpc = 0b1; -const TransportFlags TransportIB = 0b10; +const TransportFlags TransportIB0 = 0b10; const TransportFlags TransportIB1 = 0b100; const TransportFlags TransportIB2 = 0b1000; const TransportFlags TransportIB3 = 0b10000; @@ -37,7 +38,12 @@ const TransportFlags TransportIB6 = 0b10000000; const TransportFlags TransportIB7 = 0b100000000; const TransportFlags TransportAll = 
0b111111111; +int getIBDeviceCount(); +std::string getIBDeviceName(TransportFlags ibTransport); +TransportFlags getIBTransportByDeviceName(const std::string& ibDeviceName); + class Communicator; +class Connection; class RegisteredMemory { struct Impl; @@ -55,31 +61,20 @@ public: static RegisteredMemory deserialize(const std::vector& data); int rank(); - bool isLocal(); - bool isRemote(); + + friend class Connection; }; class Connection { - struct Impl; - std::unique_ptr pimpl; -public: + virtual ~Connection() = 0; - /* Connection can not be constructed from user code and must instead be created through Communicator::connect */ - Connection(std::unique_ptr); - ~Connection(); + virtual void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) = 0; - void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size); + virtual void flush() = 0; - void flush(); + virtual TransportFlags transport() = 0; - TransportFlags transport(); - TransportFlags remoteTransport(); // Good to have because different IB transports can still connect to each other - - // template void write(RegisteredPtr dst, RegisteredPtr src, uint64_t size) { - // write(dst.memory(), dst.offset() * sizeof(T), src.memory(), src.offset() * sizeof(T), size); - // } - - friend class Communicator; + virtual TransportFlags remoteTransport() = 0; }; class Communicator { @@ -145,6 +140,11 @@ public: */ std::shared_ptr connect(int remoteRank, int tag, TransportFlags transport); + /* Establish all connections declared by connect(). This function must be called after all connect() + * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. + */ + void connectionSetup(); + /* Return the rank of the calling process. 
* * Outputs: diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp new file mode 100644 index 00000000..82fe942e --- /dev/null +++ b/src/include/registered_memory.hpp @@ -0,0 +1,46 @@ +#ifndef MSCCLPP_REGISTERED_MEMORY_HPP_ +#define MSCCLPP_REGISTERED_MEMORY_HPP_ + +#include "mscclpp.hpp" +#include "ib.h" +#include +#include + +namespace mscclpp { + +struct IBTransportData { + mscclppIbMr localIbMr; + mscclppIbMrInfo remoteIbMrInfo; +}; + +struct TransportData { + TransportFlags transport; + union { + void* cudaIpcPtr; + IBTransportData ibData; + } +}; + +struct RegisteredMemory::Impl { + void* data; + size_t size; + TransportFlags transports; + std::vector transportData; + + Impl(void* data, size_t size, TransportFlags transports); + + ~Impl(); + + template T& getTransportData(TransportFlags transport) { + for (auto& data : transportData) { + if (data.transport == transport) { + return data; + } + } + throw std::runtime_error("Transport data not found"); + } +}; + +} // namespace mscclpp + +#endif // MSCCLPP_REGISTERED_MEMORY_HPP_ diff --git a/src/registered_memory.cc b/src/registered_memory.cc new file mode 100644 index 00000000..d491e72f --- /dev/null +++ b/src/registered_memory.cc @@ -0,0 +1,7 @@ +#include "registered_memory.hpp" + +namespace mscclpp { + + + +} // namespace mscclpp From 3546e80aa07b0751ec74d47b732dc7a7865cf209 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 25 Apr 2023 00:47:48 +0000 Subject: [PATCH 050/135] unique ptr for pimpl_ in bootstrap --- src/bootstrap/bootstrap.cc | 2 +- src/include/bootstrap.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index b38d4b84..3b5b0166 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -436,7 +436,7 @@ void mscclppBootstrap::Impl::close() mscclppBootstrap::mscclppBootstrap(int rank, int nRanks) { // pimpl_ = std::make_unique(ipPortPair, rank, nRanks, uniqueId); - pimpl_ 
= new Impl(rank, nRanks); + pimpl_ = std::make_unique(rank, nRanks); } UniqueId mscclppBootstrap::GetUniqueId() diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index ba69d170..e17da010 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -31,7 +31,7 @@ public: private: class Impl; - Impl* pimpl_; + std::unique_ptr pimpl_; }; // ------------------- Old bootstrap headers: to be removed ------------------- From 8428b49858c2c17fcebd49b3cb182e8231fb11e3 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 25 Apr 2023 01:51:47 +0000 Subject: [PATCH 051/135] a few minor changes --- src/bootstrap/bootstrap.cc | 80 +++++++++++++++++++------------------- src/include/bootstrap.h | 2 +- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 3b5b0166..fc9f0645 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -26,7 +26,7 @@ uint64_t hashUniqueId(const mscclppBootstrapHandle& id) mscclppResult_t setFilesLimit() { - struct rlimit filesLimit; + rlimit filesLimit; SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit"); filesLimit.rlim_cur = filesLimit.rlim_max; SYSCHECK(setrlimit(RLIMIT_NOFILE, &filesLimit), "setrlimit"); @@ -53,8 +53,8 @@ struct extInfo { int rank; int nRanks; - union mscclppSocketAddress extAddressListenRoot; - union mscclppSocketAddress extAddressListen; + mscclppSocketAddress extAddressListenRoot; + mscclppSocketAddress extAddressListen; }; class mscclppBootstrap::Impl @@ -62,15 +62,15 @@ class mscclppBootstrap::Impl public: Impl(int rank, int nRanks); ~Impl(); - void initialize(const UniqueId uniqueId); - void initialize(std::string ipPortPair); - void establishConnections(); - UniqueId getUniqueId(); - void allGather(void* allData, int size); - void send(void* data, int size, int peer, int tag); - void recv(void* data, int size, int peer, int tag); - void barrier(); - void close(); + void Initialize(const UniqueId uniqueId); 
+ void Initialize(std::string ipPortPair); + void EstablishConnections(); + UniqueId GetUniqueId(); + void AllGather(void* allData, int size); + void Send(void* data, int size, int peer, int tag); + void Recv(void* data, int size, int peer, int tag); + void Barrier(); + void Close(); UniqueId uniqueId_; @@ -87,7 +87,7 @@ private: volatile uint32_t* abortFlag_; std::thread rootThread_; char netIfName_[MAX_IF_NAME_SIZE + 1]; - union mscclppSocketAddress netIfAddr_; + mscclppSocketAddress netIfAddr_; void netSend(mscclppSocket* sock, const void* data, int size); void netRecv(mscclppSocket* sock, void* data, int size); @@ -109,17 +109,17 @@ mscclppBootstrap::Impl::Impl(int rank, int nRanks) { } -UniqueId mscclppBootstrap::Impl::getUniqueId() +UniqueId mscclppBootstrap::Impl::GetUniqueId() { netInit(""); MSCCLPPTHROW(getRandomData(&uniqueId_.magic, sizeof(uniqueId_.magic))); - std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(union mscclppSocketAddress)); + std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(mscclppSocketAddress)); bootstrapCreateRoot(); return uniqueId_; } -void mscclppBootstrap::Impl::initialize(const UniqueId uniqueId) +void mscclppBootstrap::Impl::Initialize(const UniqueId uniqueId) { netInit(""); @@ -127,22 +127,22 @@ void mscclppBootstrap::Impl::initialize(const UniqueId uniqueId) uniqueId_.addr = uniqueId.addr; // printf("addr = %s port = %d\n", inet_ntoa(uniqueId_.addr.sin.sin_addr), (int)ntohs(uniqueId_.addr.sin.sin_port)); - establishConnections(); + EstablishConnections(); } -void mscclppBootstrap::Impl::initialize(std::string ipPortPair) +void mscclppBootstrap::Impl::Initialize(std::string ipPortPair) { netInit(ipPortPair); uniqueId_.magic = 0xdeadbeef; - std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(union mscclppSocketAddress)); + std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(mscclppSocketAddress)); MSCCLPPTHROW(mscclppSocketGetAddrFromString(&uniqueId_.addr, ipPortPair.c_str())); if (rank_ == 0) { bootstrapCreateRoot(); } - 
establishConnections(); + EstablishConnections(); } mscclppBootstrap::Impl::~Impl() @@ -250,7 +250,7 @@ void mscclppBootstrap::Impl::netInit(std::string ipPortPair) if (netInitialized) return; if (!ipPortPair.empty()) { - union mscclppSocketAddress remoteAddr; + mscclppSocketAddress remoteAddr; if (mscclppSocketGetAddrFromString(&remoteAddr, ipPortPair.c_str()) != mscclppSuccess) { throw std::runtime_error("Invalid ipPortPair, please use format: : or []: or :"); } @@ -271,7 +271,7 @@ void mscclppBootstrap::Impl::netInit(std::string ipPortPair) netInitialized = true; } -void mscclppBootstrap::Impl::establishConnections() +void mscclppBootstrap::Impl::EstablishConnections() { mscclppSocketAddress nextAddr; mscclppSocket sock, listenSockRoot; @@ -295,7 +295,7 @@ void mscclppBootstrap::Impl::establishConnections() // stagger connection times to avoid an overload of the root auto randomSleep = [](int rank) { - struct timespec tv; + timespec tv; tv.tv_sec = rank / 1000; tv.tv_nsec = 1000000 * (rank % 1000); TRACE(MSCCLPP_INIT, "rank %d delaying connection to root by %ld msec", rank, rank); @@ -319,7 +319,7 @@ void mscclppBootstrap::Impl::establishConnections() // get info on my "next" rank in the bootstrap ring from root MSCCLPPTHROW(mscclppSocketInit(&sock)); MSCCLPPTHROW(mscclppSocketAccept(&sock, &listenSockRoot)); - netRecv(&sock, &nextAddr, sizeof(union mscclppSocketAddress)); + netRecv(&sock, &nextAddr, sizeof(mscclppSocketAddress)); MSCCLPPTHROW(mscclppSocketClose(&sock)); MSCCLPPTHROW(mscclppSocketClose(&listenSockRoot)); @@ -332,12 +332,12 @@ void mscclppBootstrap::Impl::establishConnections() // AllGather all listen handlers MSCCLPPTHROW(mscclppSocketGetAddr(&this->listenSock_, &this->peerCommAddresses_[rank_])); - allGather(this->peerCommAddresses_.data(), sizeof(union mscclppSocketAddress)); + AllGather(this->peerCommAddresses_.data(), sizeof(mscclppSocketAddress)); TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank_, nRanks_); } -void 
mscclppBootstrap::Impl::allGather(void* allData, int size) +void mscclppBootstrap::Impl::AllGather(void* allData, int size) { char* data = static_cast(allData); int rank = this->rank_; @@ -378,7 +378,7 @@ void mscclppBootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) MSCCLPPTHROW(mscclppSocketRecv(sock, data, std::min(recvSize, size))); } -void mscclppBootstrap::Impl::send(void* data, int size, int peer, int tag) +void mscclppBootstrap::Impl::Send(void* data, int size, int peer, int tag) { mscclppSocket sock; MSCCLPPTHROW(mscclppSocketInit(&sock, &this->peerCommAddresses_[peer], this->uniqueId_.magic, @@ -391,7 +391,7 @@ void mscclppBootstrap::Impl::send(void* data, int size, int peer, int tag) MSCCLPPTHROW(mscclppSocketClose(&sock)); } -void mscclppBootstrap::Impl::recv(void* data, int size, int peer, int tag) +void mscclppBootstrap::Impl::Recv(void* data, int size, int peer, int tag) { // search over all unexpected messages for (auto it = unexpectedMessages_.begin(); it != unexpectedMessages_.end(); ++it){ @@ -421,12 +421,12 @@ void mscclppBootstrap::Impl::recv(void* data, int size, int peer, int tag) } } -void mscclppBootstrap::Impl::barrier() +void mscclppBootstrap::Impl::Barrier() { - allGather(barrierArr_.data(), sizeof(int)); + AllGather(barrierArr_.data(), sizeof(int)); } -void mscclppBootstrap::Impl::close() +void mscclppBootstrap::Impl::Close() { MSCCLPPTHROW(mscclppSocketClose(&this->listenSock_)); MSCCLPPTHROW(mscclppSocketClose(&this->ringSendSocket_)); @@ -441,42 +441,42 @@ mscclppBootstrap::mscclppBootstrap(int rank, int nRanks) UniqueId mscclppBootstrap::GetUniqueId() { - return pimpl_->getUniqueId(); + return pimpl_->GetUniqueId(); } void mscclppBootstrap::Send(void* data, int size, int peer, int tag) { - pimpl_->send(data, size, peer, tag); + pimpl_->Send(data, size, peer, tag); } void mscclppBootstrap::Recv(void* data, int size, int peer, int tag) { - pimpl_->recv(data, size, peer, tag); + pimpl_->Recv(data, size, peer, tag); } 
void mscclppBootstrap::AllGather(void* allData, int size) { - pimpl_->allGather(allData, size); + pimpl_->AllGather(allData, size); } -void mscclppBootstrap::Initialize(const UniqueId uniqueId) +void mscclppBootstrap::Initialize(UniqueId uniqueId) { - pimpl_->initialize(uniqueId); + pimpl_->Initialize(uniqueId); } void mscclppBootstrap::Initialize(std::string ipPortPair) { - pimpl_->initialize(ipPortPair); + pimpl_->Initialize(ipPortPair); } void mscclppBootstrap::Barrier() { - pimpl_->barrier(); + pimpl_->Barrier(); } mscclppBootstrap::~mscclppBootstrap() { - pimpl_->close(); + pimpl_->Close(); } // ------------------- Old bootstrap functions ------------------- diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index e17da010..2a6b99ba 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -22,7 +22,7 @@ public: UniqueId GetUniqueId(); - void Initialize(const UniqueId uniqueId); + void Initialize(UniqueId uniqueId); void Initialize(std::string ipPortPair); void Send(void* data, int size, int peer, int tag) override; void Recv(void* data, int size, int peer, int tag) override; From 31f7897d5deb37ff6638ee19129a8a50336e47da Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 25 Apr 2023 11:47:58 +0000 Subject: [PATCH 052/135] integrate with new interfaces in mscclpp.hpp --- Makefile | 6 +- src/bootstrap/bootstrap.cc | 132 +++++++++++++++++++++--------------- src/include/bootstrap.h | 29 -------- src/include/mscclpp.h | 10 --- src/include/mscclpp.hpp | 44 ++++++++++-- tests/bootstrap_test_cpp.cc | 22 +++--- 6 files changed, 130 insertions(+), 113 deletions(-) diff --git a/Makefile b/Makefile index 881296f4..41004ce3 100644 --- a/Makefile +++ b/Makefile @@ -134,7 +134,7 @@ HEADERS := $(wildcard src/include/*.h) CPPSOURCES := $(shell find ./ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)' -not -path "./build/*" -not -path "./python/*") PYTHONCPPSOURCES := $(shell find ./python/src/ -regextype posix-extended -regex 
'.*\.(c|cpp|h|hpp|cc|cxx|cu)') -INCEXPORTS := mscclpp.h mscclppfifo.h +INCEXPORTS := mscclpp.h mscclppfifo.h mscclpp.hpp mscclppfifo.hpp INCTARGETS := $(INCEXPORTS:%=$(BUILDDIR)/$(INCDIR)/%) LIBNAME := libmscclpp.so @@ -198,7 +198,7 @@ $(BUILDDIR)/$(OBJDIR)/$(UTDIR)/%.o: $(UTDIR)/%.cc $(HEADERS) @mkdir -p $(@D) $(CXX) -o $@ $(INCLUDE) $(CXXFLAGS) -c $< -$(BUILDDIR)/$(INCDIR)/%.h: src/$(INCDIR)/%.h +$(BUILDDIR)/$(INCDIR)/%: src/$(INCDIR)/% @mkdir -p $(@D) cp $< $@ @@ -216,7 +216,7 @@ $(BUILDDIR)/$(BINDIR)/$(UTDIR)/%: $(BUILDDIR)/$(OBJDIR)/$(UTDIR)/%.o $(LIBOBJTAR # Compile .cc tests $(BUILDDIR)/$(OBJDIR)/$(TESTSDIR)/%.o: $(TESTSDIR)/%.cc $(INCTARGETS) @mkdir -p $(@D) - $(CXX) -o $@ -I$(BUILDDIR)/$(INCDIR) $(MPI_INC) $(CXXFLAGS) -Isrc/include -c $< $(MPI_MACRO) + $(CXX) -o $@ -I$(BUILDDIR)/$(INCDIR) $(MPI_INC) $(CXXFLAGS) -c $< $(MPI_MACRO) # Compile .cu tests $(BUILDDIR)/$(OBJDIR)/$(TESTSDIR)/%.o: $(TESTSDIR)/%.cu $(INCTARGETS) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index fc9f0645..2447489e 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -1,6 +1,8 @@ +#include "mscclpp.hpp" #include "bootstrap.h" #include "utils.h" #include "checks.hpp" +#include "api.h" #include #include @@ -11,6 +13,8 @@ #include #include +using namespace mscclpp; + namespace { uint64_t hashUniqueId(const mscclppBootstrapHandle& id) { @@ -57,24 +61,33 @@ struct extInfo mscclppSocketAddress extAddressListen; }; -class mscclppBootstrap::Impl +struct UniqueIdInternal +{ + uint64_t magic; + union mscclppSocketAddress addr; +}; +static_assert(sizeof(UniqueIdInternal) <= sizeof(UniqueId), + "UniqueIdInternal is too large to fit into UniqueId"); + +class DefaultBootstrap::Impl { public: Impl(int rank, int nRanks); ~Impl(); - void Initialize(const UniqueId uniqueId); - void Initialize(std::string ipPortPair); - void EstablishConnections(); - UniqueId GetUniqueId(); - void AllGather(void* allData, int size); - void Send(void* data, int size, int 
peer, int tag); - void Recv(void* data, int size, int peer, int tag); - void Barrier(); - void Close(); + void initialize(const UniqueId uniqueId); + void initialize(std::string ipPortPair); + void establishConnections(); + UniqueId createUniqueId(); + UniqueId getUniqueId() const; + void allGather(void* allData, int size); + void send(void* data, int size, int peer, int tag); + void recv(void* data, int size, int peer, int tag); + void barrier(); + void close(); - UniqueId uniqueId_; private: + UniqueIdInternal uniqueId_; int rank_; int nRanks_; bool netInitialized; @@ -103,34 +116,38 @@ private: // UniqueId MscclppBootstrap::Impl::uniqueId_; -mscclppBootstrap::Impl::Impl(int rank, int nRanks) +DefaultBootstrap::Impl::Impl(int rank, int nRanks) : rank_(rank), nRanks_(nRanks), netInitialized(false), peerCommAddresses_(nRanks, mscclppSocketAddress()), barrierArr_(nRanks, 0), abortFlag_(nullptr) { } -UniqueId mscclppBootstrap::Impl::GetUniqueId() +UniqueId DefaultBootstrap::Impl::getUniqueId() const +{ + UniqueId ret; + std::memcpy(&ret, &uniqueId_, sizeof(uniqueId_)); + return ret; +} + +UniqueId DefaultBootstrap::Impl::createUniqueId() { netInit(""); MSCCLPPTHROW(getRandomData(&uniqueId_.magic, sizeof(uniqueId_.magic))); std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(mscclppSocketAddress)); bootstrapCreateRoot(); - - return uniqueId_; + return getUniqueId(); } -void mscclppBootstrap::Impl::Initialize(const UniqueId uniqueId) +void DefaultBootstrap::Impl::initialize(const UniqueId uniqueId) { netInit(""); - uniqueId_.magic = uniqueId.magic; - uniqueId_.addr = uniqueId.addr; - // printf("addr = %s port = %d\n", inet_ntoa(uniqueId_.addr.sin.sin_addr), (int)ntohs(uniqueId_.addr.sin.sin_port)); + std::memcpy(&uniqueId_, &uniqueId, sizeof(uniqueId_)); - EstablishConnections(); + establishConnections(); } -void mscclppBootstrap::Impl::Initialize(std::string ipPortPair) +void DefaultBootstrap::Impl::initialize(std::string ipPortPair) { netInit(ipPortPair); @@ -142,17 
+159,17 @@ void mscclppBootstrap::Impl::Initialize(std::string ipPortPair) bootstrapCreateRoot(); } - EstablishConnections(); + establishConnections(); } -mscclppBootstrap::Impl::~Impl() +DefaultBootstrap::Impl::~Impl() { if (rootThread_.joinable()) { rootThread_.join(); } } -void mscclppBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, +void DefaultBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, std::vector& rankAddresses, std::vector& rankAddressesRoot, int& rank) @@ -181,7 +198,7 @@ void mscclppBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, rank = info.rank; } -void mscclppBootstrap::Impl::sendHandleToPeer(int peer, +void DefaultBootstrap::Impl::sendHandleToPeer(int peer, const std::vector& rankAddresses, const std::vector& rankAddressesRoot) { @@ -193,7 +210,7 @@ void mscclppBootstrap::Impl::sendHandleToPeer(int peer, MSCCLPPTHROW(mscclppSocketClose(&sock)); } -void mscclppBootstrap::Impl::bootstrapCreateRoot() +void DefaultBootstrap::Impl::bootstrapCreateRoot() { mscclppSocket listenSock; @@ -216,7 +233,7 @@ void mscclppBootstrap::Impl::bootstrapCreateRoot() rootThread_ = std::thread(lambda); } -void mscclppBootstrap::Impl::bootstrapRoot(mscclppSocket listenSock) +void DefaultBootstrap::Impl::bootstrapRoot(mscclppSocket listenSock) { int numCollected = 0; std::vector rankAddresses(this->nRanks_, mscclppSocketAddress()); @@ -245,7 +262,7 @@ void mscclppBootstrap::Impl::bootstrapRoot(mscclppSocket listenSock) TRACE(MSCCLPP_INIT, "DONE"); } -void mscclppBootstrap::Impl::netInit(std::string ipPortPair) +void DefaultBootstrap::Impl::netInit(std::string ipPortPair) { if (netInitialized) return; @@ -271,7 +288,7 @@ void mscclppBootstrap::Impl::netInit(std::string ipPortPair) netInitialized = true; } -void mscclppBootstrap::Impl::EstablishConnections() +void DefaultBootstrap::Impl::establishConnections() { mscclppSocketAddress nextAddr; mscclppSocket sock, listenSockRoot; @@ -332,12 +349,12 @@ void 
mscclppBootstrap::Impl::EstablishConnections() // AllGather all listen handlers MSCCLPPTHROW(mscclppSocketGetAddr(&this->listenSock_, &this->peerCommAddresses_[rank_])); - AllGather(this->peerCommAddresses_.data(), sizeof(mscclppSocketAddress)); + allGather(this->peerCommAddresses_.data(), sizeof(mscclppSocketAddress)); TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank_, nRanks_); } -void mscclppBootstrap::Impl::AllGather(void* allData, int size) +void DefaultBootstrap::Impl::allGather(void* allData, int size) { char* data = static_cast(allData); int rank = this->rank_; @@ -362,13 +379,13 @@ void mscclppBootstrap::Impl::AllGather(void* allData, int size) TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nRanks, size); } -void mscclppBootstrap::Impl::netSend(mscclppSocket* sock, const void* data, int size) +void DefaultBootstrap::Impl::netSend(mscclppSocket* sock, const void* data, int size) { MSCCLPPTHROW(mscclppSocketSend(sock, &size, sizeof(int))); MSCCLPPTHROW(mscclppSocketSend(sock, const_cast(data), size)); } -void mscclppBootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) +void DefaultBootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) { int recvSize; MSCCLPPTHROW(mscclppSocketRecv(sock, &recvSize, sizeof(int))); @@ -378,7 +395,7 @@ void mscclppBootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) MSCCLPPTHROW(mscclppSocketRecv(sock, data, std::min(recvSize, size))); } -void mscclppBootstrap::Impl::Send(void* data, int size, int peer, int tag) +void DefaultBootstrap::Impl::send(void* data, int size, int peer, int tag) { mscclppSocket sock; MSCCLPPTHROW(mscclppSocketInit(&sock, &this->peerCommAddresses_[peer], this->uniqueId_.magic, @@ -391,7 +408,7 @@ void mscclppBootstrap::Impl::Send(void* data, int size, int peer, int tag) MSCCLPPTHROW(mscclppSocketClose(&sock)); } -void mscclppBootstrap::Impl::Recv(void* data, int size, int peer, int tag) +void DefaultBootstrap::Impl::recv(void* data, int size, 
int peer, int tag) { // search over all unexpected messages for (auto it = unexpectedMessages_.begin(); it != unexpectedMessages_.end(); ++it){ @@ -421,62 +438,67 @@ void mscclppBootstrap::Impl::Recv(void* data, int size, int peer, int tag) } } -void mscclppBootstrap::Impl::Barrier() +void DefaultBootstrap::Impl::barrier() { - AllGather(barrierArr_.data(), sizeof(int)); + allGather(barrierArr_.data(), sizeof(int)); } -void mscclppBootstrap::Impl::Close() +void DefaultBootstrap::Impl::close() { MSCCLPPTHROW(mscclppSocketClose(&this->listenSock_)); MSCCLPPTHROW(mscclppSocketClose(&this->ringSendSocket_)); MSCCLPPTHROW(mscclppSocketClose(&this->ringRecvSocket_)); } -mscclppBootstrap::mscclppBootstrap(int rank, int nRanks) +MSCCLPP_API_CPP DefaultBootstrap::DefaultBootstrap(int rank, int nRanks) { // pimpl_ = std::make_unique(ipPortPair, rank, nRanks, uniqueId); pimpl_ = std::make_unique(rank, nRanks); } -UniqueId mscclppBootstrap::GetUniqueId() +MSCCLPP_API_CPP UniqueId DefaultBootstrap::createUniqueId() { - return pimpl_->GetUniqueId(); + return pimpl_->createUniqueId(); } -void mscclppBootstrap::Send(void* data, int size, int peer, int tag) +MSCCLPP_API_CPP UniqueId DefaultBootstrap::getUniqueId() const { - pimpl_->Send(data, size, peer, tag); + return pimpl_->getUniqueId(); } -void mscclppBootstrap::Recv(void* data, int size, int peer, int tag) +MSCCLPP_API_CPP void DefaultBootstrap::send(void* data, int size, int peer, int tag) { - pimpl_->Recv(data, size, peer, tag); + pimpl_->send(data, size, peer, tag); } -void mscclppBootstrap::AllGather(void* allData, int size) +MSCCLPP_API_CPP void DefaultBootstrap::recv(void* data, int size, int peer, int tag) { - pimpl_->AllGather(allData, size); + pimpl_->recv(data, size, peer, tag); } -void mscclppBootstrap::Initialize(UniqueId uniqueId) +MSCCLPP_API_CPP void DefaultBootstrap::allGather(void* allData, int size) { - pimpl_->Initialize(uniqueId); + pimpl_->allGather(allData, size); } -void 
mscclppBootstrap::Initialize(std::string ipPortPair) +MSCCLPP_API_CPP void DefaultBootstrap::initialize(UniqueId uniqueId) { - pimpl_->Initialize(ipPortPair); + pimpl_->initialize(uniqueId); } -void mscclppBootstrap::Barrier() +MSCCLPP_API_CPP void DefaultBootstrap::initialize(std::string ipPortPair) { - pimpl_->Barrier(); + pimpl_->initialize(ipPortPair); } -mscclppBootstrap::~mscclppBootstrap() +MSCCLPP_API_CPP void DefaultBootstrap::barrier() { - pimpl_->Close(); + pimpl_->barrier(); +} + +MSCCLPP_API_CPP DefaultBootstrap::~DefaultBootstrap() +{ + pimpl_->close(); } // ------------------- Old bootstrap functions ------------------- diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 2a6b99ba..6bb20f81 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -5,35 +5,6 @@ #include "comm.h" -struct UniqueId -{ - uint64_t magic; - union mscclppSocketAddress addr; -}; - -static_assert(sizeof(UniqueId) <= sizeof(mscclppUniqueId), - "Bootstrap handle is too large to fit inside MSCCLPP unique ID"); - -class __attribute__((visibility("default"))) mscclppBootstrap : public Bootstrap -{ -public: - mscclppBootstrap(int rank, int nRanks); - ~mscclppBootstrap(); - - UniqueId GetUniqueId(); - - void Initialize(UniqueId uniqueId); - void Initialize(std::string ipPortPair); - void Send(void* data, int size, int peer, int tag) override; - void Recv(void* data, int size, int peer, int tag) override; - void AllGather(void* allData, int size) override; - void Barrier() override; - -private: - class Impl; - std::unique_ptr pimpl_; -}; - // ------------------- Old bootstrap headers: to be removed ------------------- struct mscclppBootstrapHandle diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index a9675d1e..6f96af10 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -248,16 +248,6 @@ typedef enum } mscclppResult_t; -class Bootstrap { -public: - Bootstrap(){}; - virtual ~Bootstrap() = default; - virtual void Send(void* data, int 
size, int peer, int tag) = 0; - virtual void Recv(void* data, int size, int peer, int tag) = 0; - virtual void AllGather(void* allData, int size) = 0; - virtual void Barrier() = 0; -}; - /* Create a unique ID for communication. Only needs to be called by one process. * Use with mscclppCommInitRankFromId(). * All processes need to provide the same ID to mscclppCommInitRankFromId(). diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index e41e94b8..6a7230bd 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -13,12 +13,51 @@ #include #include +#include #include #include namespace mscclpp { +#define MSCCLPP_UNIQUE_ID_BYTES 128 +struct UniqueId { + char internal[MSCCLPP_UNIQUE_ID_BYTES]; +}; + +class Bootstrap +{ +public: + Bootstrap(){}; + virtual ~Bootstrap() = default; + virtual void send(void* data, int size, int peer, int tag) = 0; + virtual void recv(void* data, int size, int peer, int tag) = 0; + virtual void allGather(void* allData, int size) = 0; + virtual void barrier() = 0; +}; + +class DefaultBootstrap : public Bootstrap +{ +public: + DefaultBootstrap(int rank, int nRanks); + ~DefaultBootstrap(); + + UniqueId createUniqueId(); + UniqueId getUniqueId() const; + + void initialize(UniqueId uniqueId); + void initialize(std::string ipPortPair); + void send(void* data, int size, int peer, int tag) override; + void recv(void* data, int size, int peer, int tag) override; + void allGather(void* allData, int size) override; + void barrier() override; + +private: + class Impl; + std::unique_ptr pimpl_; +}; + + struct alignas(16) SignalEpochId { // every signal(), increaments this and either: // 1) proxy thread pushes it to the remote peer's localSignalEpochId->proxy @@ -381,11 +420,6 @@ struct SimpleDeviceConnection { BufferHandle src; }; -#define MSCCLPP_UNIQUE_ID_BYTES 128 -struct UniqueId { - char internal[MSCCLPP_UNIQUE_ID_BYTES]; -}; - /* Create a unique ID for communication. Only needs to be called by one process. 
* Use with mscclppCommInitRankFromId(). * All processes need to provide the same ID to mscclppCommInitRankFromId(). diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index c2ef61f0..534c4114 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -1,4 +1,4 @@ -#include "bootstrap.h" +#include "mscclpp.hpp" #include @@ -11,24 +11,24 @@ int main() MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &worldSize); - std::shared_ptr bootstrap(new mscclppBootstrap(rank, worldSize)); + std::shared_ptr bootstrap(new mscclpp::DefaultBootstrap(rank, worldSize)); // bootstrap->Initialize("costsim-dev-00000A:50000"); - UniqueId id; + mscclpp::UniqueId id; if (rank == 0) - id = bootstrap->GetUniqueId(); + id = bootstrap->createUniqueId(); MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->Initialize(id); + bootstrap->initialize(id); std::vector tmp(worldSize, 0); tmp[rank] = rank+1; - bootstrap->AllGather(tmp.data(), sizeof(int)); + bootstrap->allGather(tmp.data(), sizeof(int)); for (int i = 0; i < worldSize; i++){ if (tmp[i] != i+1) printf("error AllGather: rank %d: tmp[%d] = %d\n", rank, i, tmp[i]); } printf("rank %d: AllGather test passed!\n", rank); - bootstrap->Barrier(); + bootstrap->barrier(); printf("rank %d: Barrier test passed!\n", rank); for (int i = 0; i < worldSize; i++){ @@ -36,8 +36,8 @@ int main() continue; int msg1 = (rank + 1)*2; int msg2 = (rank + 1)*2+1; - bootstrap->Send(&msg1, sizeof(int), i, 0); - bootstrap->Send(&msg2, sizeof(int), i, 1); + bootstrap->send(&msg1, sizeof(int), i, 0); + bootstrap->send(&msg2, sizeof(int), i, 1); } for (int i = 0; i < worldSize; i++){ @@ -46,8 +46,8 @@ int main() int msg1 = 0; int msg2 = 0; // recv them in the opposite order to check correctness - bootstrap->Recv(&msg2, sizeof(int), i, 1); - bootstrap->Recv(&msg1, sizeof(int), i, 0); + bootstrap->recv(&msg2, sizeof(int), i, 1); + bootstrap->recv(&msg1, sizeof(int), i, 0); if (msg1 != (i+1)*2 || 
msg2 != (i+1)*2+1) printf("error Send/Recv: rank %d: msg1 = %d, msg2 = %d\n", rank, msg1, msg2); } From bb195b2f290a0745f412d230b2e1bb5eb0f79e0e Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 25 Apr 2023 11:57:02 +0000 Subject: [PATCH 053/135] PascalCase for type names --- src/bootstrap/bootstrap.cc | 60 +++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 2447489e..3cf955a9 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -46,14 +46,14 @@ enum bootstrapInterface_t dontCareIf = -2 }; -struct unexpectedMsg +struct UnexpectedMsg { int peer; int tag; std::shared_ptr sock; }; -struct extInfo +struct ExtInfo { int rank; int nRanks; @@ -95,7 +95,7 @@ private: mscclppSocket ringRecvSocket_; mscclppSocket ringSendSocket_; std::vector peerCommAddresses_; - std::list unexpectedMessages_; + std::list unexpectedMessages_; std::vector barrierArr_; volatile uint32_t* abortFlag_; std::thread rootThread_; @@ -175,7 +175,7 @@ void DefaultBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, int& rank) { mscclppSocket sock; - extInfo info; + ExtInfo info; mscclppSocketAddress zero; std::memset(&zero, 0, sizeof(mscclppSocketAddress)); @@ -292,7 +292,7 @@ void DefaultBootstrap::Impl::establishConnections() { mscclppSocketAddress nextAddr; mscclppSocket sock, listenSockRoot; - extInfo info; + ExtInfo info; TRACE(MSCCLPP_INIT, "rank %d nranks %d", rank_, nRanks_); @@ -502,7 +502,7 @@ MSCCLPP_API_CPP DefaultBootstrap::~DefaultBootstrap() } // ------------------- Old bootstrap functions ------------------- -struct bootstrapRootArgs +struct BootstrapRootArgs { struct mscclppSocket* listenSock; uint64_t magic; @@ -573,7 +573,7 @@ static mscclppResult_t bootstrapNetRecv(struct mscclppSocket* sock, void* data, return mscclppSuccess; } -// struct extInfo +// struct ExtInfo // { // int rank; // int nranks; @@ -594,12 +594,12 @@ static 
mscclppResult_t bootstrapNetRecv(struct mscclppSocket* sock, void* data, static void* bootstrapRoot(void* rargs) { - struct bootstrapRootArgs* args = (struct bootstrapRootArgs*)rargs; + struct BootstrapRootArgs* args = (struct BootstrapRootArgs*)rargs; struct mscclppSocket* listenSock = args->listenSock; uint64_t magic = args->magic; mscclppResult_t res = mscclppSuccess; int nranks = 0, c = 0; - struct extInfo info; + struct ExtInfo info; union mscclppSocketAddress* rankAddresses = NULL; union mscclppSocketAddress* rankAddressesRoot = NULL; // for initial rank <-> root information exchange union mscclppSocketAddress* zero = NULL; @@ -671,7 +671,7 @@ out: mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle) { struct mscclppSocket* listenSock; - struct bootstrapRootArgs* args; + struct BootstrapRootArgs* args; pthread_t thread; MSCCLPPCHECK(mscclppCalloc(&listenSock, 1)); @@ -722,22 +722,22 @@ mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool return mscclppSuccess; } -struct unexConn +struct UnexConn { int peer; int tag; struct mscclppSocket sock; - struct unexConn* next; + struct UnexConn* next; }; -struct bootstrapState +struct BootstrapState { struct mscclppSocket listenSock; struct mscclppSocket ringRecvSocket; struct mscclppSocket ringSendSocket; union mscclppSocketAddress* peerCommAddresses; union mscclppSocketAddress* peerProxyAddresses; - struct unexConn* unexpectedConnections; + struct UnexConn* unexpectedConnections; int cudaDev; int rank; int nranks; @@ -749,11 +749,11 @@ mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscc { int rank = comm->rank; int nranks = comm->nRanks; - struct bootstrapState* state; + struct BootstrapState* state; struct mscclppSocket* proxySocket; mscclppSocketAddress nextAddr; struct mscclppSocket sock, listenSockRoot; - struct extInfo info; + struct ExtInfo info; MSCCLPPCHECK(mscclppCalloc(&state, 1)); state->rank = rank; @@ -833,7 +833,7 @@ 
mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscc mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size) { - struct bootstrapState* state = (struct bootstrapState*)commState; + struct BootstrapState* state = (struct BootstrapState*)commState; char* data = (char*)allData; int rank = state->rank; int nranks = state->nranks; @@ -861,7 +861,7 @@ mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size) mscclppResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) { mscclppResult_t ret = mscclppSuccess; - struct bootstrapState* state = (struct bootstrapState*)commState; + struct BootstrapState* state = (struct BootstrapState*)commState; struct mscclppSocket sock; MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, state->peerCommAddresses + peer, state->magic, mscclppSocketTypeBootstrap, @@ -920,17 +920,17 @@ mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int* ranks, int ran return mscclppSuccess; } -mscclppResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock) +mscclppResult_t unexpectedEnqueue(struct BootstrapState* state, int peer, int tag, struct mscclppSocket* sock) { // New unex - struct unexConn* unex; + struct UnexConn* unex; MSCCLPPCHECK(mscclppCalloc(&unex, 1)); unex->peer = peer; unex->tag = tag; memcpy(&unex->sock, sock, sizeof(struct mscclppSocket)); // Enqueue - struct unexConn* list = state->unexpectedConnections; + struct UnexConn* list = state->unexpectedConnections; if (list == NULL) { state->unexpectedConnections = unex; return mscclppSuccess; @@ -941,11 +941,11 @@ mscclppResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int ta return mscclppSuccess; } -mscclppResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock, +mscclppResult_t unexpectedDequeue(struct BootstrapState* state, int peer, int tag, struct mscclppSocket* sock, int* found) { 
- struct unexConn* elem = state->unexpectedConnections; - struct unexConn* prev = NULL; + struct UnexConn* elem = state->unexpectedConnections; + struct UnexConn* prev = NULL; *found = 0; while (elem) { if (elem->peer == peer && elem->tag == tag) { @@ -965,10 +965,10 @@ mscclppResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int ta return mscclppSuccess; } -static void unexpectedFree(struct bootstrapState* state) +static void unexpectedFree(struct BootstrapState* state) { - struct unexConn* elem = state->unexpectedConnections; - struct unexConn* prev = NULL; + struct UnexConn* elem = state->unexpectedConnections; + struct UnexConn* prev = NULL; while (elem) { prev = elem; @@ -982,7 +982,7 @@ static void unexpectedFree(struct bootstrapState* state) mscclppResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) { mscclppResult_t ret = mscclppSuccess; - struct bootstrapState* state = (struct bootstrapState*)commState; + struct BootstrapState* state = (struct BootstrapState*)commState; struct mscclppSocket sock; int newPeer, newTag; @@ -1016,7 +1016,7 @@ fail: mscclppResult_t bootstrapClose(void* commState) { - struct bootstrapState* state = (struct bootstrapState*)commState; + struct BootstrapState* state = (struct BootstrapState*)commState; if (state->unexpectedConnections != NULL) { unexpectedFree(state); if (*state->abortFlag == 0) { @@ -1037,7 +1037,7 @@ mscclppResult_t bootstrapClose(void* commState) mscclppResult_t bootstrapAbort(void* commState) { - struct bootstrapState* state = (struct bootstrapState*)commState; + struct BootstrapState* state = (struct BootstrapState*)commState; if (commState == NULL) return mscclppSuccess; MSCCLPPCHECK(mscclppSocketClose(&state->listenSock)); From 4115559c2faae171a866e0cf9f5fe44bb84870e0 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 25 Apr 2023 12:25:08 +0000 Subject: [PATCH 054/135] cleanup --- src/bootstrap/bootstrap.cc | 45 +++++++++---------- 
tests/bootstrap_test_cpp.cc | 86 ++++++++++++++++++------------------- 2 files changed, 64 insertions(+), 67 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 3cf955a9..13a4df76 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -1,8 +1,8 @@ -#include "mscclpp.hpp" #include "bootstrap.h" -#include "utils.h" -#include "checks.hpp" #include "api.h" +#include "checks.hpp" +#include "mscclpp.hpp" +#include "utils.h" #include #include @@ -66,8 +66,7 @@ struct UniqueIdInternal uint64_t magic; union mscclppSocketAddress addr; }; -static_assert(sizeof(UniqueIdInternal) <= sizeof(UniqueId), - "UniqueIdInternal is too large to fit into UniqueId"); +static_assert(sizeof(UniqueIdInternal) <= sizeof(UniqueId), "UniqueIdInternal is too large to fit into UniqueId"); class DefaultBootstrap::Impl { @@ -85,7 +84,6 @@ public: void barrier(); void close(); - private: UniqueIdInternal uniqueId_; int rank_; @@ -108,9 +106,9 @@ private: void bootstrapCreateRoot(); void bootstrapRoot(mscclppSocket listenSock); void getRemoteAddresses(mscclppSocket* listenSock, std::vector& rankAddresses, - std::vector& rankAddressesRoot, int& rank); + std::vector& rankAddressesRoot, int& rank); void sendHandleToPeer(int peer, const std::vector& rankAddresses, - const std::vector& rankAddressesRoot); + const std::vector& rankAddressesRoot); void netInit(std::string ipPortPair); }; @@ -170,9 +168,8 @@ DefaultBootstrap::Impl::~Impl() } void DefaultBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, - std::vector& rankAddresses, - std::vector& rankAddressesRoot, - int& rank) + std::vector& rankAddresses, + std::vector& rankAddressesRoot, int& rank) { mscclppSocket sock; ExtInfo info; @@ -185,11 +182,13 @@ void DefaultBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, MSCCLPPTHROW(mscclppSocketClose(&sock)); if (this->nRanks_ != info.nRanks) { - throw std::runtime_error("Bootstrap Root : mismatch in rank count from procs " + 
std::to_string(this->nRanks_) + " : " + std::to_string(info.nRanks)); + throw std::runtime_error("Bootstrap Root : mismatch in rank count from procs " + std::to_string(this->nRanks_) + + " : " + std::to_string(info.nRanks)); } if (std::memcmp(&zero, &rankAddressesRoot[info.rank], sizeof(mscclppSocketAddress)) != 0) { - throw std::runtime_error("Bootstrap Root : rank " + std::to_string(info.rank) + " of " + std::to_string(this->nRanks_) + " has already checked in"); + throw std::runtime_error("Bootstrap Root : rank " + std::to_string(info.rank) + " of " + + std::to_string(this->nRanks_) + " has already checked in"); } // Save the connection handle for that rank @@ -198,9 +197,8 @@ void DefaultBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, rank = info.rank; } -void DefaultBootstrap::Impl::sendHandleToPeer(int peer, - const std::vector& rankAddresses, - const std::vector& rankAddressesRoot) +void DefaultBootstrap::Impl::sendHandleToPeer(int peer, const std::vector& rankAddresses, + const std::vector& rankAddressesRoot) { mscclppSocket sock; int next = (peer + 1) % this->nRanks_; @@ -227,9 +225,7 @@ void DefaultBootstrap::Impl::bootstrapCreateRoot() if (ret != mscclppSuccess) { throw std::runtime_error("Failed to get socket address"); } - auto lambda = [this, listenSock]() { - this->bootstrapRoot(listenSock); - }; + auto lambda = [this, listenSock]() { this->bootstrapRoot(listenSock); }; rootThread_ = std::thread(lambda); } @@ -269,7 +265,8 @@ void DefaultBootstrap::Impl::netInit(std::string ipPortPair) if (!ipPortPair.empty()) { mscclppSocketAddress remoteAddr; if (mscclppSocketGetAddrFromString(&remoteAddr, ipPortPair.c_str()) != mscclppSuccess) { - throw std::runtime_error("Invalid ipPortPair, please use format: : or []: or :"); + throw std::runtime_error( + "Invalid ipPortPair, please use format: : or []: or :"); } if (mscclppFindInterfaceMatchSubnet(netIfName_, &netIfAddr_, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { throw 
std::runtime_error("NET/Socket : No usable listening interface found"); @@ -322,7 +319,6 @@ void DefaultBootstrap::Impl::establishConnections() randomSleep(this->rank_); } - char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2]; std::sprintf(line, " %s:", netIfName_); mscclppSocketToString(&this->uniqueId_.addr, line + strlen(line)); @@ -390,7 +386,8 @@ void DefaultBootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) int recvSize; MSCCLPPTHROW(mscclppSocketRecv(sock, &recvSize, sizeof(int))); if (recvSize > size) { - throw std::runtime_error("Message truncated : received " + std::to_string(recvSize) + " bytes instead of " + std::to_string(size)); + throw std::runtime_error("Message truncated : received " + std::to_string(recvSize) + " bytes instead of " + + std::to_string(size)); } MSCCLPPTHROW(mscclppSocketRecv(sock, data, std::min(recvSize, size))); } @@ -411,8 +408,8 @@ void DefaultBootstrap::Impl::send(void* data, int size, int peer, int tag) void DefaultBootstrap::Impl::recv(void* data, int size, int peer, int tag) { // search over all unexpected messages - for (auto it = unexpectedMessages_.begin(); it != unexpectedMessages_.end(); ++it){ - if (it->peer == peer && it->tag == tag){ + for (auto it = unexpectedMessages_.begin(); it != unexpectedMessages_.end(); ++it) { + if (it->peer == peer && it->tag == tag) { // found a match netRecv(it->sock.get(), data, size); MSCCLPPTHROW(mscclppSocketClose(it->sock.get())); diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index 534c4114..d6e33be9 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -6,53 +6,53 @@ int main() { - int rank, worldSize; - MPI_Init(NULL, NULL); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &worldSize); + int rank, worldSize; + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &worldSize); - std::shared_ptr bootstrap(new mscclpp::DefaultBootstrap(rank, worldSize)); - 
// bootstrap->Initialize("costsim-dev-00000A:50000"); - mscclpp::UniqueId id; - if (rank == 0) - id = bootstrap->createUniqueId(); - MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->initialize(id); + std::shared_ptr bootstrap(new mscclpp::DefaultBootstrap(rank, worldSize)); + // bootstrap->Initialize("costsim-dev-00000A:50000"); + mscclpp::UniqueId id; + if (rank == 0) + id = bootstrap->createUniqueId(); + MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + bootstrap->initialize(id); - std::vector tmp(worldSize, 0); - tmp[rank] = rank+1; - bootstrap->allGather(tmp.data(), sizeof(int)); - for (int i = 0; i < worldSize; i++){ - if (tmp[i] != i+1) - printf("error AllGather: rank %d: tmp[%d] = %d\n", rank, i, tmp[i]); - } - printf("rank %d: AllGather test passed!\n", rank); + std::vector tmp(worldSize, 0); + tmp[rank] = rank + 1; + bootstrap->allGather(tmp.data(), sizeof(int)); + for (int i = 0; i < worldSize; i++) { + if (tmp[i] != i + 1) + printf("error AllGather: rank %d: tmp[%d] = %d\n", rank, i, tmp[i]); + } + printf("rank %d: AllGather test passed!\n", rank); - bootstrap->barrier(); - printf("rank %d: Barrier test passed!\n", rank); + bootstrap->barrier(); + printf("rank %d: Barrier test passed!\n", rank); - for (int i = 0; i < worldSize; i++){ - if (i == rank) - continue; - int msg1 = (rank + 1)*2; - int msg2 = (rank + 1)*2+1; - bootstrap->send(&msg1, sizeof(int), i, 0); - bootstrap->send(&msg2, sizeof(int), i, 1); - } + for (int i = 0; i < worldSize; i++) { + if (i == rank) + continue; + int msg1 = (rank + 1) * 2; + int msg2 = (rank + 1) * 2 + 1; + bootstrap->send(&msg1, sizeof(int), i, 0); + bootstrap->send(&msg2, sizeof(int), i, 1); + } - for (int i = 0; i < worldSize; i++){ - if (i == rank) - continue; - int msg1 = 0; - int msg2 = 0; - // recv them in the opposite order to check correctness - bootstrap->recv(&msg2, sizeof(int), i, 1); - bootstrap->recv(&msg1, sizeof(int), i, 0); - if (msg1 != (i+1)*2 || msg2 != (i+1)*2+1) - 
printf("error Send/Recv: rank %d: msg1 = %d, msg2 = %d\n", rank, msg1, msg2); - } - printf("rank %d: Send/Recv test passed!\n", rank); + for (int i = 0; i < worldSize; i++) { + if (i == rank) + continue; + int msg1 = 0; + int msg2 = 0; + // recv them in the opposite order to check correctness + bootstrap->recv(&msg2, sizeof(int), i, 1); + bootstrap->recv(&msg1, sizeof(int), i, 0); + if (msg1 != (i + 1) * 2 || msg2 != (i + 1) * 2 + 1) + printf("error Send/Recv: rank %d: msg1 = %d, msg2 = %d\n", rank, msg1, msg2); + } + printf("rank %d: Send/Recv test passed!\n", rank); - MPI_Finalize(); - return 0; + MPI_Finalize(); + return 0; } \ No newline at end of file From 71b075e0d74d4c68519b32611cbe6b0d39ed8b42 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 25 Apr 2023 12:29:32 +0000 Subject: [PATCH 055/135] Rename --- src/bootstrap/bootstrap.cc | 60 ++++++++++++++++++------------------- src/include/mscclpp.hpp | 12 ++++---- tests/bootstrap_test_cpp.cc | 2 +- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 13a4df76..852af3bb 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -68,7 +68,7 @@ struct UniqueIdInternal }; static_assert(sizeof(UniqueIdInternal) <= sizeof(UniqueId), "UniqueIdInternal is too large to fit into UniqueId"); -class DefaultBootstrap::Impl +class Bootstrap::Impl { public: Impl(int rank, int nRanks); @@ -114,20 +114,20 @@ private: // UniqueId MscclppBootstrap::Impl::uniqueId_; -DefaultBootstrap::Impl::Impl(int rank, int nRanks) +Bootstrap::Impl::Impl(int rank, int nRanks) : rank_(rank), nRanks_(nRanks), netInitialized(false), peerCommAddresses_(nRanks, mscclppSocketAddress()), barrierArr_(nRanks, 0), abortFlag_(nullptr) { } -UniqueId DefaultBootstrap::Impl::getUniqueId() const +UniqueId Bootstrap::Impl::getUniqueId() const { UniqueId ret; std::memcpy(&ret, &uniqueId_, sizeof(uniqueId_)); return ret; } -UniqueId 
DefaultBootstrap::Impl::createUniqueId() +UniqueId Bootstrap::Impl::createUniqueId() { netInit(""); MSCCLPPTHROW(getRandomData(&uniqueId_.magic, sizeof(uniqueId_.magic))); @@ -136,7 +136,7 @@ UniqueId DefaultBootstrap::Impl::createUniqueId() return getUniqueId(); } -void DefaultBootstrap::Impl::initialize(const UniqueId uniqueId) +void Bootstrap::Impl::initialize(const UniqueId uniqueId) { netInit(""); @@ -145,7 +145,7 @@ void DefaultBootstrap::Impl::initialize(const UniqueId uniqueId) establishConnections(); } -void DefaultBootstrap::Impl::initialize(std::string ipPortPair) +void Bootstrap::Impl::initialize(std::string ipPortPair) { netInit(ipPortPair); @@ -160,14 +160,14 @@ void DefaultBootstrap::Impl::initialize(std::string ipPortPair) establishConnections(); } -DefaultBootstrap::Impl::~Impl() +Bootstrap::Impl::~Impl() { if (rootThread_.joinable()) { rootThread_.join(); } } -void DefaultBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, +void Bootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, std::vector& rankAddresses, std::vector& rankAddressesRoot, int& rank) { @@ -197,7 +197,7 @@ void DefaultBootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, rank = info.rank; } -void DefaultBootstrap::Impl::sendHandleToPeer(int peer, const std::vector& rankAddresses, +void Bootstrap::Impl::sendHandleToPeer(int peer, const std::vector& rankAddresses, const std::vector& rankAddressesRoot) { mscclppSocket sock; @@ -208,7 +208,7 @@ void DefaultBootstrap::Impl::sendHandleToPeer(int peer, const std::vector rankAddresses(this->nRanks_, mscclppSocketAddress()); @@ -258,7 +258,7 @@ void DefaultBootstrap::Impl::bootstrapRoot(mscclppSocket listenSock) TRACE(MSCCLPP_INIT, "DONE"); } -void DefaultBootstrap::Impl::netInit(std::string ipPortPair) +void Bootstrap::Impl::netInit(std::string ipPortPair) { if (netInitialized) return; @@ -285,7 +285,7 @@ void DefaultBootstrap::Impl::netInit(std::string ipPortPair) netInitialized = true; } -void 
DefaultBootstrap::Impl::establishConnections() +void Bootstrap::Impl::establishConnections() { mscclppSocketAddress nextAddr; mscclppSocket sock, listenSockRoot; @@ -350,7 +350,7 @@ void DefaultBootstrap::Impl::establishConnections() TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank_, nRanks_); } -void DefaultBootstrap::Impl::allGather(void* allData, int size) +void Bootstrap::Impl::allGather(void* allData, int size) { char* data = static_cast(allData); int rank = this->rank_; @@ -375,13 +375,13 @@ void DefaultBootstrap::Impl::allGather(void* allData, int size) TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nRanks, size); } -void DefaultBootstrap::Impl::netSend(mscclppSocket* sock, const void* data, int size) +void Bootstrap::Impl::netSend(mscclppSocket* sock, const void* data, int size) { MSCCLPPTHROW(mscclppSocketSend(sock, &size, sizeof(int))); MSCCLPPTHROW(mscclppSocketSend(sock, const_cast(data), size)); } -void DefaultBootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) +void Bootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) { int recvSize; MSCCLPPTHROW(mscclppSocketRecv(sock, &recvSize, sizeof(int))); @@ -392,7 +392,7 @@ void DefaultBootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) MSCCLPPTHROW(mscclppSocketRecv(sock, data, std::min(recvSize, size))); } -void DefaultBootstrap::Impl::send(void* data, int size, int peer, int tag) +void Bootstrap::Impl::send(void* data, int size, int peer, int tag) { mscclppSocket sock; MSCCLPPTHROW(mscclppSocketInit(&sock, &this->peerCommAddresses_[peer], this->uniqueId_.magic, @@ -405,7 +405,7 @@ void DefaultBootstrap::Impl::send(void* data, int size, int peer, int tag) MSCCLPPTHROW(mscclppSocketClose(&sock)); } -void DefaultBootstrap::Impl::recv(void* data, int size, int peer, int tag) +void Bootstrap::Impl::recv(void* data, int size, int peer, int tag) { // search over all unexpected messages for (auto it = unexpectedMessages_.begin(); it != 
unexpectedMessages_.end(); ++it) { @@ -435,65 +435,65 @@ void DefaultBootstrap::Impl::recv(void* data, int size, int peer, int tag) } } -void DefaultBootstrap::Impl::barrier() +void Bootstrap::Impl::barrier() { allGather(barrierArr_.data(), sizeof(int)); } -void DefaultBootstrap::Impl::close() +void Bootstrap::Impl::close() { MSCCLPPTHROW(mscclppSocketClose(&this->listenSock_)); MSCCLPPTHROW(mscclppSocketClose(&this->ringSendSocket_)); MSCCLPPTHROW(mscclppSocketClose(&this->ringRecvSocket_)); } -MSCCLPP_API_CPP DefaultBootstrap::DefaultBootstrap(int rank, int nRanks) +MSCCLPP_API_CPP Bootstrap::Bootstrap(int rank, int nRanks) { // pimpl_ = std::make_unique(ipPortPair, rank, nRanks, uniqueId); pimpl_ = std::make_unique(rank, nRanks); } -MSCCLPP_API_CPP UniqueId DefaultBootstrap::createUniqueId() +MSCCLPP_API_CPP UniqueId Bootstrap::createUniqueId() { return pimpl_->createUniqueId(); } -MSCCLPP_API_CPP UniqueId DefaultBootstrap::getUniqueId() const +MSCCLPP_API_CPP UniqueId Bootstrap::getUniqueId() const { return pimpl_->getUniqueId(); } -MSCCLPP_API_CPP void DefaultBootstrap::send(void* data, int size, int peer, int tag) +MSCCLPP_API_CPP void Bootstrap::send(void* data, int size, int peer, int tag) { pimpl_->send(data, size, peer, tag); } -MSCCLPP_API_CPP void DefaultBootstrap::recv(void* data, int size, int peer, int tag) +MSCCLPP_API_CPP void Bootstrap::recv(void* data, int size, int peer, int tag) { pimpl_->recv(data, size, peer, tag); } -MSCCLPP_API_CPP void DefaultBootstrap::allGather(void* allData, int size) +MSCCLPP_API_CPP void Bootstrap::allGather(void* allData, int size) { pimpl_->allGather(allData, size); } -MSCCLPP_API_CPP void DefaultBootstrap::initialize(UniqueId uniqueId) +MSCCLPP_API_CPP void Bootstrap::initialize(UniqueId uniqueId) { pimpl_->initialize(uniqueId); } -MSCCLPP_API_CPP void DefaultBootstrap::initialize(std::string ipPortPair) +MSCCLPP_API_CPP void Bootstrap::initialize(std::string ipPortPair) { pimpl_->initialize(ipPortPair); } 
-MSCCLPP_API_CPP void DefaultBootstrap::barrier() +MSCCLPP_API_CPP void Bootstrap::barrier() { pimpl_->barrier(); } -MSCCLPP_API_CPP DefaultBootstrap::~DefaultBootstrap() +MSCCLPP_API_CPP Bootstrap::~Bootstrap() { pimpl_->close(); } diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index 6a7230bd..12ac7873 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -25,22 +25,22 @@ struct UniqueId { char internal[MSCCLPP_UNIQUE_ID_BYTES]; }; -class Bootstrap +class BaseBootstrap { public: - Bootstrap(){}; - virtual ~Bootstrap() = default; + BaseBootstrap(){}; + virtual ~BaseBootstrap() = default; virtual void send(void* data, int size, int peer, int tag) = 0; virtual void recv(void* data, int size, int peer, int tag) = 0; virtual void allGather(void* allData, int size) = 0; virtual void barrier() = 0; }; -class DefaultBootstrap : public Bootstrap +class Bootstrap : public BaseBootstrap { public: - DefaultBootstrap(int rank, int nRanks); - ~DefaultBootstrap(); + Bootstrap(int rank, int nRanks); + ~Bootstrap(); UniqueId createUniqueId(); UniqueId getUniqueId() const; diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index d6e33be9..0de6a2d5 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -11,7 +11,7 @@ int main() MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &worldSize); - std::shared_ptr bootstrap(new mscclpp::DefaultBootstrap(rank, worldSize)); + std::shared_ptr bootstrap(new mscclpp::Bootstrap(rank, worldSize)); // bootstrap->Initialize("costsim-dev-00000A:50000"); mscclpp::UniqueId id; if (rank == 0) From 8f2f053f2f9a2630131b150fb24da9e9e9345aa0 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 25 Apr 2023 21:08:49 +0000 Subject: [PATCH 056/135] more clean up --- src/bootstrap/bootstrap.cc | 15 +++------------ tests/bootstrap_test_cpp.cc | 10 +++++++--- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/bootstrap/bootstrap.cc 
b/src/bootstrap/bootstrap.cc index 852af3bb..2c83d8ea 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -213,18 +213,9 @@ void Bootstrap::Impl::bootstrapCreateRoot() mscclppSocket listenSock; // mscclppSocket* listenSock = new mscclppSocket(); // TODO(saemal) make this a shared ptr - auto ret = mscclppSocketInit(&listenSock, &uniqueId_.addr, uniqueId_.magic, mscclppSocketTypeBootstrap, nullptr, 0); - if (ret != mscclppSuccess) { - throw std::runtime_error("Failed to initialize socket"); - } - ret = mscclppSocketListen(&listenSock); - if (ret != mscclppSuccess) { - throw std::runtime_error("Failed to listen on socket"); - } - ret = mscclppSocketGetAddr(&listenSock, &uniqueId_.addr); - if (ret != mscclppSuccess) { - throw std::runtime_error("Failed to get socket address"); - } + MSCCLPPTHROW(mscclppSocketInit(&listenSock, &uniqueId_.addr, uniqueId_.magic, mscclppSocketTypeBootstrap, nullptr, 0)); + MSCCLPPTHROW(mscclppSocketListen(&listenSock)); + MSCCLPPTHROW(mscclppSocketGetAddr(&listenSock, &uniqueId_.addr)); auto lambda = [this, listenSock]() { this->bootstrapRoot(listenSock); }; rootThread_ = std::thread(lambda); } diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index 0de6a2d5..e7160edd 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -34,10 +34,12 @@ int main() for (int i = 0; i < worldSize; i++) { if (i == rank) continue; - int msg1 = (rank + 1) * 2; - int msg2 = (rank + 1) * 2 + 1; + int msg1 = (rank + 1) * 3; + int msg2 = (rank + 1) * 3 + 1; + int msg3 = (rank + 1) * 3 + 2; bootstrap->send(&msg1, sizeof(int), i, 0); bootstrap->send(&msg2, sizeof(int), i, 1); + bootstrap->send(&msg3, sizeof(int), i, 2); } for (int i = 0; i < worldSize; i++) { @@ -45,10 +47,12 @@ int main() continue; int msg1 = 0; int msg2 = 0; + int msg3 = 0; // recv them in the opposite order to check correctness bootstrap->recv(&msg2, sizeof(int), i, 1); + bootstrap->recv(&msg3, sizeof(int), i, 2); 
bootstrap->recv(&msg1, sizeof(int), i, 0); - if (msg1 != (i + 1) * 2 || msg2 != (i + 1) * 2 + 1) + if (msg1 != (i + 1) * 3 || msg2 != (i + 1) * 3 + 1 || msg3 != (i + 1) * 3 + 2) printf("error Send/Recv: rank %d: msg1 = %d, msg2 = %d\n", rank, msg1, msg2); } printf("rank %d: Send/Recv test passed!\n", rank); From b73b0132bafbc5a836afd2ff5ea956e3506ee716 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 25 Apr 2023 21:27:23 +0000 Subject: [PATCH 057/135] using find instead of searching --- src/bootstrap/bootstrap.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 2c83d8ea..51ac66d9 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -4,6 +4,7 @@ #include "mscclpp.hpp" #include "utils.h" +#include #include #include #include @@ -399,14 +400,14 @@ void Bootstrap::Impl::send(void* data, int size, int peer, int tag) void Bootstrap::Impl::recv(void* data, int size, int peer, int tag) { // search over all unexpected messages - for (auto it = unexpectedMessages_.begin(); it != unexpectedMessages_.end(); ++it) { - if (it->peer == peer && it->tag == tag) { - // found a match - netRecv(it->sock.get(), data, size); - MSCCLPPTHROW(mscclppSocketClose(it->sock.get())); - unexpectedMessages_.erase(it); - return; - } + auto lambda = [peer, tag](const UnexpectedMsg& msg) { return msg.peer == peer && msg.tag == tag; }; + auto it = std::find_if(unexpectedMessages_.begin(), unexpectedMessages_.end(), lambda); + if (it != unexpectedMessages_.end()) { + // found a match + netRecv(it->sock.get(), data, size); + MSCCLPPTHROW(mscclppSocketClose(it->sock.get())); + unexpectedMessages_.erase(it); + return; } // didn't find one while (true) { From 8fc822c8489fe6653cfbef1a2b9b5cf633522fc0 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 25 Apr 2023 22:26:48 +0000 Subject: [PATCH 058/135] more tests for bootstrap --- src/bootstrap/bootstrap.cc | 22 ++++++ 
src/include/mscclpp.hpp | 4 ++ tests/bootstrap_test_cpp.cc | 140 +++++++++++++++++++++++++++--------- 3 files changed, 133 insertions(+), 33 deletions(-) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 51ac66d9..dfce50b4 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -79,6 +79,8 @@ public: void establishConnections(); UniqueId createUniqueId(); UniqueId getUniqueId() const; + int getRank(); + int getNranks(); void allGather(void* allData, int size); void send(void* data, int size, int peer, int tag); void recv(void* data, int size, int peer, int tag); @@ -137,6 +139,16 @@ UniqueId Bootstrap::Impl::createUniqueId() return getUniqueId(); } +int Bootstrap::Impl::getRank() +{ + return rank_; +} + +int Bootstrap::Impl::getNranks() +{ + return nRanks_; +} + void Bootstrap::Impl::initialize(const UniqueId uniqueId) { netInit(""); @@ -455,6 +467,16 @@ MSCCLPP_API_CPP UniqueId Bootstrap::getUniqueId() const return pimpl_->getUniqueId(); } +MSCCLPP_API_CPP int Bootstrap::getRank() +{ + return pimpl_->getRank(); +} + +MSCCLPP_API_CPP int Bootstrap::getNranks() +{ + return pimpl_->getNranks(); +} + MSCCLPP_API_CPP void Bootstrap::send(void* data, int size, int peer, int tag) { pimpl_->send(data, size, peer, tag); diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index 12ac7873..bcbbf41d 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -30,6 +30,8 @@ class BaseBootstrap public: BaseBootstrap(){}; virtual ~BaseBootstrap() = default; + virtual int getRank() = 0; + virtual int getNranks() = 0; virtual void send(void* data, int size, int peer, int tag) = 0; virtual void recv(void* data, int size, int peer, int tag) = 0; virtual void allGather(void* allData, int size) = 0; @@ -47,6 +49,8 @@ public: void initialize(UniqueId uniqueId); void initialize(std::string ipPortPair); + int getRank() override; + int getNranks() override; void send(void* data, int size, int peer, int tag) override; void 
recv(void* data, int size, int peer, int tag) override; void allGather(void* allData, int size) override; diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index e7160edd..34e58b59 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -1,49 +1,41 @@ #include "mscclpp.hpp" #include - +#include +#include #include -int main() -{ - int rank, worldSize; - MPI_Init(NULL, NULL); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &worldSize); - - std::shared_ptr bootstrap(new mscclpp::Bootstrap(rank, worldSize)); - // bootstrap->Initialize("costsim-dev-00000A:50000"); - mscclpp::UniqueId id; - if (rank == 0) - id = bootstrap->createUniqueId(); - MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->initialize(id); - - std::vector tmp(worldSize, 0); - tmp[rank] = rank + 1; +void test_allgather(std::shared_ptr bootstrap){ + std::vector tmp(bootstrap->getNranks(), 0); + tmp[bootstrap->getRank()] = bootstrap->getRank() + 1; bootstrap->allGather(tmp.data(), sizeof(int)); - for (int i = 0; i < worldSize; i++) { - if (tmp[i] != i + 1) - printf("error AllGather: rank %d: tmp[%d] = %d\n", rank, i, tmp[i]); + for (int i = 0; i < bootstrap->getNranks(); i++) { + assert(tmp[i] == i + 1); } - printf("rank %d: AllGather test passed!\n", rank); + if (bootstrap->getRank() == 0) + std::cout << "AllGather test passed!" << std::endl; +} +void test_barrier(std::shared_ptr bootstrap){ bootstrap->barrier(); - printf("rank %d: Barrier test passed!\n", rank); + if (bootstrap->getRank() == 0) + std::cout << "Barrier test passed!" 
<< std::endl; +} - for (int i = 0; i < worldSize; i++) { - if (i == rank) +void test_sendrecv(std::shared_ptr bootstrap){ + for (int i = 0; i < bootstrap->getNranks(); i++) { + if (bootstrap->getRank() == 0) continue; - int msg1 = (rank + 1) * 3; - int msg2 = (rank + 1) * 3 + 1; - int msg3 = (rank + 1) * 3 + 2; + int msg1 = (bootstrap->getRank() + 1) * 3; + int msg2 = (bootstrap->getRank() + 1) * 3 + 1; + int msg3 = (bootstrap->getRank() + 1) * 3 + 2; bootstrap->send(&msg1, sizeof(int), i, 0); bootstrap->send(&msg2, sizeof(int), i, 1); bootstrap->send(&msg3, sizeof(int), i, 2); } - for (int i = 0; i < worldSize; i++) { - if (i == rank) + for (int i = 0; i < bootstrap->getNranks(); i++) { + if (i == bootstrap->getRank()) continue; int msg1 = 0; int msg2 = 0; @@ -52,10 +44,92 @@ int main() bootstrap->recv(&msg2, sizeof(int), i, 1); bootstrap->recv(&msg3, sizeof(int), i, 2); bootstrap->recv(&msg1, sizeof(int), i, 0); - if (msg1 != (i + 1) * 3 || msg2 != (i + 1) * 3 + 1 || msg3 != (i + 1) * 3 + 2) - printf("error Send/Recv: rank %d: msg1 = %d, msg2 = %d\n", rank, msg1, msg2); + assert(msg1 == (i + 1) * 3); + assert(msg2 == (i + 1) * 3 + 1); + assert(msg3 == (i + 1) * 3 + 2); } - printf("rank %d: Send/Recv test passed!\n", rank); + if (bootstrap->getRank() == 0) + std::cout << "Send/Recv test passed!" << std::endl; +} + +void test_all(std::shared_ptr bootstrap){ + test_allgather(bootstrap); + test_barrier(bootstrap); + // test_sendrecv(bootstrap); +} + +void test_mscclpp_bootstrap_with_id(int rank, int worldSize){ + std::shared_ptr bootstrap(new mscclpp::Bootstrap(rank, worldSize)); + mscclpp::UniqueId id; + if (bootstrap->getRank() == 0) + id = bootstrap->createUniqueId(); + MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + bootstrap->initialize(id); + + test_all(bootstrap); + if (bootstrap->getRank() == 0) + std::cout << "--- MSCCLPP::Bootstrap test with unique id passed! 
---" << std::endl; +} + +void test_mscclpp_bootstrap_with_ip_port_pair(int rank, int worldSize, char* ipPortPiar){ + std::shared_ptr bootstrap(new mscclpp::Bootstrap(rank, worldSize)); + bootstrap->initialize(ipPortPiar); + + test_all(bootstrap); + if (bootstrap->getRank() == 0) + std::cout << "--- MSCCLPP::Bootstrap test with ip_port pair passed! ---" << std::endl; +} + +class MPIBootstrap : public mscclpp::BaseBootstrap { +public: + MPIBootstrap() : BaseBootstrap() {} + int getRank() override { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + return rank; + } + int getNranks() override { + int worldSize; + MPI_Comm_size(MPI_COMM_WORLD, &worldSize); + return worldSize; + } + void allGather(void *sendbuf, int size) override { + MPI_Allgather(MPI_IN_PLACE, 0, MPI_BYTE, sendbuf, size, MPI_BYTE, MPI_COMM_WORLD); + } + void barrier() override { + MPI_Barrier(MPI_COMM_WORLD); + } + void send(void *sendbuf, int size, int dest, int tag) override { + MPI_Send(sendbuf, size, MPI_BYTE, dest, tag, MPI_COMM_WORLD); + } + void recv(void *recvbuf, int size, int source, int tag) override { + MPI_Recv(recvbuf, size, MPI_BYTE, source, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } +}; + +void test_mpi_bootstrap(){ + std::shared_ptr bootstrap(new MPIBootstrap()); + test_all(bootstrap); + if (bootstrap->getRank() == 0) + std::cout << "--- MPI Bootstrap test passed! 
---" << std::endl; +} + +int main(int argc, char **argv) +{ + int rank, worldSize; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &worldSize); + if (argc > 2){ + if (rank == 0) + std::cout << "Usage: " << argv[0] << " [ip:port]" << std::endl; + MPI_Finalize(); + return 0; + } + test_mscclpp_bootstrap_with_id(rank, worldSize); + if (argc == 2) + test_mscclpp_bootstrap_with_ip_port_pair(rank, worldSize, argv[1]); + test_mpi_bootstrap(); MPI_Finalize(); return 0; From 90a8860bcc45624a945dca33a480206fc861c41d Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 26 Apr 2023 03:04:56 +0000 Subject: [PATCH 059/135] Registered memory (de)serialization and Connection work --- src/connection.cc | 66 +++++++++++++++- src/include/connection.hpp | 6 +- src/include/mscclpp.hpp | 5 +- src/include/registered_memory.hpp | 29 +++---- src/registered_memory.cc | 121 ++++++++++++++++++++++++++++++ 5 files changed, 201 insertions(+), 26 deletions(-) diff --git a/src/connection.cc b/src/connection.cc index 12ebee02..48b2d197 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -1,6 +1,7 @@ #include "connection.hpp" #include "checks.hpp" #include "registered_memory.hpp" +#include "npkit.h" namespace mscclpp { @@ -10,6 +11,8 @@ void validateTransport(RegisteredMemory mem, TransportFlags transport) { } } +// CudaIpcConnection + TransportFlags CudaIpcConnection::transport() { return TransportCudaIpc; } @@ -30,17 +33,20 @@ void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, Register validateTransport(dst, remoteTransport()); validateTransport(src, transport()); - auto dstPtr = dst.impl->getTransportData(remoteTransport()); - auto srcPtr = src.impl->getTransportData(transport()); + auto dstPtr = dst.impl->data; + auto srcPtr = src.impl->data; + CUDATHROW(cudaMemcpyAsync(dstPtr + dstOffset, srcPtr + srcOffset, size, cudaMemcpyDeviceToDevice, stream)); - npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, 
(uint32_t)dataSize); + // npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)size); } void CudaIpcConnection::flush() { CUDATHROW(cudaStreamSynchronize(stream)); - npkitCollectExitEvents(conn, NPKIT_EVENT_DMA_SEND_EXIT); + // npkitCollectExitEvents(conn, NPKIT_EVENT_DMA_SEND_EXIT); } +// IBConnection + IBConnection::IBConnection(TransportFlags transport) : transport_(transport), remoteTransport_(TransportNone) {} TransportFlags IBConnection::transport() { @@ -51,4 +57,56 @@ TransportFlags IBConnection::remoteTransport() { return remoteTransport_; } +IBConnection::IBConnection(TransportFlags transport, Communicator::Impl& commImpl) : transport_(transport), remoteTransport_(TransportNone) { + MSCCLPPTHROW(mscclppIbContextCreateQp(commImpl.getIbContext(transport), &qp)); +} + +IBConnection::~IBConnection() { + // TODO: Destroy QP? +} + +void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) { + validateTransport(dst, remoteTransport()); + validateTransport(src, transport()); + + auto dstMrInfo = dst.impl->getTransportInfo(remoteTransport()); + auto srcMr = src.impl->getTransportInfo(transport()); + + qp->stageSend(srcMr, &dstMrInfo, (uint32_t)size, + /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, /*signaled=*/false); + int ret = qp->postSend(); + if (ret != 0) { + // Return value is errno. 
+ WARN("data postSend failed: errno %d", ret); + } + // npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_DATA_ENTRY, (uint32_t)size); +} + +void IBConnection::flush() { + bool isWaiting = true; + while (isWaiting) { + int wcNum = qp->pollCq(); + if (wcNum < 0) { + WARN("pollCq failed: errno %d", errno); + continue; + } + for (int i = 0; i < wcNum; ++i) { + struct ibv_wc* wc = &qp->wcs[i]; + if (wc->status != IBV_WC_SUCCESS) { + WARN("wc status %d", wc->status); + continue; + } + if (wc->qp_num != qp->qp->qp_num) { + WARN("got wc of unknown qp_num %d", wc->qp_num); + continue; + } + if (wc->opcode == IBV_WC_RDMA_WRITE) { + isWaiting = false; + break; + } + } + } + // npkitCollectExitEvents(conn, NPKIT_EVENT_IB_SEND_EXIT); +} + } // namespace mscclpp diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 048e2c6a..72f0eb90 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -7,6 +7,8 @@ namespace mscclpp { +// TODO: Add functionality to these classes for Communicator to do connectionSetup + class CudaIpcConnection : public Connection { cudaStream_t stream; public: @@ -27,10 +29,10 @@ public: class IBConnection : public Connection { TransportFlags transport_; TransportFlags remoteTransport_; - mscclppIbQp qp; + mscclppIbQp* qp; public: - IBConnection(TransportFlags transport); + IBConnection(TransportFlags transport, Communicator::Impl& commImpl); virtual ~IBConnection(); diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index f4d73ab4..52b0511b 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -36,7 +36,9 @@ const TransportFlags TransportIB4 = 0b100000; const TransportFlags TransportIB5 = 0b1000000; const TransportFlags TransportIB6 = 0b10000000; const TransportFlags TransportIB7 = 0b100000000; + const TransportFlags TransportAll = 0b111111111; +const TransportFlags TransportAllIB = 0b111111110; int getIBDeviceCount(); std::string getIBDeviceName(TransportFlags ibTransport); @@ -55,13 +57,12 
@@ public: void* data(); size_t size(); + int rank(); TransportFlags transports(); std::vector serialize(); static RegisteredMemory deserialize(const std::vector& data); - int rank(); - friend class Connection; }; diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index 82fe942e..24eed981 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -2,39 +2,32 @@ #define MSCCLPP_REGISTERED_MEMORY_HPP_ #include "mscclpp.hpp" +#include "mscclpp.h" #include "ib.h" #include #include namespace mscclpp { -struct IBTransportData { - mscclppIbMr localIbMr; - mscclppIbMrInfo remoteIbMrInfo; -}; - -struct TransportData { +struct TransportInfo { TransportFlags transport; - union { - void* cudaIpcPtr; - IBTransportData ibData; - } + std::variant data; }; struct RegisteredMemory::Impl { void* data; size_t size; + int rank; TransportFlags transports; - std::vector transportData; + std::vector transportInfos; - Impl(void* data, size_t size, TransportFlags transports); + Impl(void* data, size_t size, int rank, TransportFlags transports); + Impl(const std::vector& data); - ~Impl(); - - template T& getTransportData(TransportFlags transport) { - for (auto& data : transportData) { - if (data.transport == transport) { - return data; + template T& getTransportInfo(TransportFlags transport) { + for (auto& entry : transportInfos) { + if (entry.transport == transport) { + return std::get(entry.data); } } throw std::runtime_error("Transport data not found"); diff --git a/src/registered_memory.cc b/src/registered_memory.cc index d491e72f..eabb9e7d 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -2,6 +2,127 @@ namespace mscclpp { +RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags transports, Communicator& comm) : data(data), size(size), rank(rank), transports(transports) { + if (transports & TransportCudaIpc) { + TransportInfo transportInfo; + transportInfo.transport = 
TransportCudaIpc; + cudaIpcMemHandle_t handle; + CUDATHROW(cudaIpcGetMemHandle(&handle, data)); + transportInfo.data = handle; + this->transportInfos.push_back(transportInfo); + } + if (transports & TransportAllIB) { + auto addIb = [&](TransportFlags ibTransport) { + TransportInfo transportInfo; + transportInfo.transport = ibTransport; + mscclppIbMr* mr; + MSCCLPPTHROW(mscclppIbContextRegisterMr(comm.pimpl->getIbContext(ibTransport), data, size, &mr)); + transportInfo.data = mr; + this->transportInfos.push_back(transportInfo); + }; + if (transports & TransportIB0) addIb(TransportIB0); + if (transports & TransportIB1) addIb(TransportIB1); + if (transports & TransportIB2) addIb(TransportIB2); + if (transports & TransportIB3) addIb(TransportIB3); + if (transports & TransportIB4) addIb(TransportIB4); + if (transports & TransportIB5) addIb(TransportIB5); + if (transports & TransportIB6) addIb(TransportIB6); + if (transports & TransportIB7) addIb(TransportIB7); + } +} +RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : impl(pimpl) {} + +RegisteredMemory::~RegisteredMemory() = default; + +void* RegisteredMemory::data() { + return impl->data; +} + +size_t RegisteredMemory::size() { + return impl->size; +} + +int RegisteredMemory::rank() { + return impl->rank; +} + +TransportFlags RegisteredMemory::transports() { + return impl->transports; +} + +std::vector RegisteredMemory::serialize() { + std::vector result; + std::copy_n(reinterpret_cast(&impl->size), sizeof(impl->size), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&impl->rank), sizeof(impl->rank), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&impl->transports), sizeof(impl->transports), std::back_inserter(result)); + if (impl->transportInfos.size() > std::numeric_limits::max()) { + throw std::runtime_error("Too many transport info entries"); + } + int8_t transportCount = impl->transportInfos.size(); + std::copy_n(reinterpret_cast(&transportCount), sizeof(transportCount), 
std::back_inserter(result)); + for (auto& entry : impl->transportInfos) { + std::copy_n(reinterpret_cast(&entry.transport), sizeof(entry.transport), std::back_inserter(result)); + std::visit(overloaded{ + [&](std::monostate&){ + throw std::runtime_error("Transport info not set"); + }, + [&](cudaIpcMemHandle_t handle){ + std::copy_n(reinterpret_cast(&handle), sizeof(handle), std::back_inserter(result)); + }, + [&](mscclppIbMr* mr){ + std::copy_n(reinterpret_cast(&mr->info), sizeof(mr->info), std::back_inserter(result)); + }, + [&](mscclppIbMrInfo info){ + std::copy_n(reinterpret_cast(&info), sizeof(info), std::back_inserter(result)); + } + }, entry.data); + } + return result; +} + +static RegisteredMemory RegisteredMemory::deserialize(const std::vector& data) { + return RegisteredMemory(std::make_shared(data)); +} + +RegisteredMemory::Impl::Impl(const std::vector& data) { + auto it = data.begin(); + std::copy_n(it, sizeof(this->size), reinterpret_cast(&this->size)); + it += sizeof(this->size); + std::copy_n(it, sizeof(this->rank), reinterpret_cast(&this->rank)); + it += sizeof(this->rank); + std::copy_n(it, sizeof(this->transports), reinterpret_cast(&this->transports)); + it += sizeof(this->transports); + int8_t transportCount; + std::copy_n(it, sizeof(transportCount), reinterpret_cast(&transportCount)); + it += sizeof(transportCount); + for (int i = 0; i < transportCount; ++i) { + TransportInfo transportInfo; + std::copy_n(it, sizeof(transportInfo.transport), reinterpret_cast(&transportInfo.transport)); + it += sizeof(transportInfo.transport); + if (transportInfo.transport & TransportCudaIpc) { + cudaIpcMemHandle_t handle; + std::copy_n(it, sizeof(handle), reinterpret_cast(&handle)); + it += sizeof(handle); + transportInfo.data = handle; + } else if (transportInfo.transport & TransportAllIB) { + mscclppIbMrInfo info; + std::copy_n(it, sizeof(info), reinterpret_cast(&info)); + it += sizeof(info); + transportInfo.data = info; + } else { + throw 
std::runtime_error("Unknown transport"); + } + this->transportInfos.push_back(transportInfo); + } + if (it != data.end()) { + throw std::runtime_error("Deserialization failed"); + } + + if (transports & TransportCudaIpc) { + auto cudaIpcHandle = getTransportInfo(TransportCudaIpc); + CUDATHROW(cudaIpcOpenMemHandle(&data, cudaIpcHandle, cudaIpcMemLazyEnablePeerAccess)); + } +} } // namespace mscclpp From d746201287d63407ac110c5d6aae85a3aafddab2 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 26 Apr 2023 17:46:47 +0000 Subject: [PATCH 060/135] WIP builds, but doesn't link --- Makefile | 5 ++- src/communicator.cc | 50 ++++++++++++++-------- src/connection.cc | 53 +++++++++++++---------- src/include/communicator.hpp | 6 ++- src/include/connection.hpp | 21 ++++----- src/include/ib.hpp | 4 +- src/include/mscclpp.hpp | 13 +++--- src/include/registered_memory.hpp | 17 +++++--- src/registered_memory.cc | 71 +++++++++++++++---------------- 9 files changed, 136 insertions(+), 104 deletions(-) diff --git a/Makefile b/Makefile index e544aeee..9aaf34b8 100644 --- a/Makefile +++ b/Makefile @@ -120,7 +120,8 @@ LDFLAGS := $(NVLDFLAGS) $(GDRCOPY_LDFLAGS) -libverbs -lnuma LIBSRCS := $(addprefix src/,debug.cc utils.cc init.cc proxy.cc ib.cc config.cc) LIBSRCS += $(addprefix src/bootstrap/,bootstrap.cc socket.cc) -LIBSRCS += $(addprefix src/,communicator.cc fifo.cc host_connection.cc proxy_cpp.cc basic_proxy_handler.cc) +LIBSRCS += $(addprefix src/,communicator.cc connection.cc registered_memory.cc) +#LIBSRCS += $(addprefix src/,fifo.cc host_connection.cc proxy_cpp.cc basic_proxy_handler.cc) ifneq ($(NPKIT), 0) LIBSRCS += $(addprefix src/misc/,npkit.cc) endif @@ -148,7 +149,7 @@ UTOBJTARGETS := $(UTOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) UTBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(UTOBJS)) TESTSDIR := tests -TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test_standalone.cu allgather_test_cpp.cu) +TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc 
allgather_test_standalone.cu) # allgather_test_cpp.cu TESTSOBJS := $(patsubst %.cc,%.o,$(TESTSSRCS)) $(patsubst %.cu,%.o,$(TESTSSRCS)) TESTSOBJTARGETS := $(TESTSOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) TESTSBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(TESTSOBJS)) diff --git a/src/communicator.cc b/src/communicator.cc index a74923bb..316801de 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -4,17 +4,39 @@ #include "comm.h" #include "basic_proxy_handler.hpp" #include "api.h" +#include "utils.h" +#include "checks.hpp" +#include "debug.h" +#include "connection.hpp" namespace mscclpp { -Communicator::Impl::Impl() : comm(nullptr), proxy(makeBasicProxyHandler(*this)) {} +Communicator::Impl::Impl() : comm(nullptr) {} Communicator::Impl::~Impl() { + for (auto& entry : ibContexts) { + mscclppIbContextDestroy(entry.second); + } + ibContexts.clear(); if (comm) { mscclppCommDestroy(comm); } } +mscclppIbContext* Communicator::Impl::getIbContext(TransportFlags ibTransport) { + // Find IB context or create it + auto it = ibContexts.find(ibTransport); + if (it == ibContexts.end()) { + auto ibDev = getIBDeviceName(ibTransport); + mscclppIbContext* ibCtx; + MSCCLPPTHROW(mscclppIbContextCreate(&ibCtx, ibDev.c_str())); + ibContexts[ibTransport] = ibCtx; + return ibCtx; + } else { + return it->second; + } +} + MSCCLPP_API_CPP Communicator::~Communicator() = default; static mscclppTransport_t transportToCStyle(TransportFlags flags) { @@ -54,24 +76,16 @@ MSCCLPP_API_CPP void Communicator::bootstrapBarrier() { } MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, TransportFlags transport) { - std::string ibDev; - switch (transport) { - case TransportIB0: - case TransportIB1: - case TransportIB2: - case TransportIB3: - case TransportIB4: - case TransportIB5: - case TransportIB6: - case TransportIB7: - ibDev = getIBDeviceName(transport); - break; + std::shared_ptr conn; + if (transport | TransportCudaIpc) { + auto cudaIpcConn = std::make_shared(); + conn = 
cudaIpcConn; + } else if (transport | TransportAllIB) { + auto ibConn = std::make_shared(transport, *pimpl); + conn = ibConn; + } else { + throw std::runtime_error("Unsupported transport"); } - mscclppConnectWithoutBuffer(pimpl->comm, remoteRank, tag, transportToCStyle(transport), ibDev.c_str()); - auto connIdx = pimpl->connections.size(); - auto conn = std::make_shared(std::make_unique(this, &pimpl->comm->conns[connIdx])); - pimpl->connections.push_back(conn); - return conn; } MSCCLPP_API_CPP void Communicator::connectionSetup() { diff --git a/src/connection.cc b/src/connection.cc index 48b2d197..3e053cb3 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -1,26 +1,18 @@ #include "connection.hpp" #include "checks.hpp" #include "registered_memory.hpp" -#include "npkit.h" +#include "npkit/npkit.h" namespace mscclpp { void validateTransport(RegisteredMemory mem, TransportFlags transport) { - if (mem.transports() & transport == TransportNone) { + if ((mem.transports() & transport) == TransportNone) { throw std::runtime_error("mem does not support transport"); } } // CudaIpcConnection -TransportFlags CudaIpcConnection::transport() { - return TransportCudaIpc; -} - -TransportFlags CudaIpcConnection::remoteTransport() { - return TransportCudaIpc; -} - CudaIpcConnection::CudaIpcConnection() { cudaStreamCreate(&stream); } @@ -29,12 +21,20 @@ CudaIpcConnection::~CudaIpcConnection() { cudaStreamDestroy(stream); } +TransportFlags CudaIpcConnection::transport() { + return TransportCudaIpc; +} + +TransportFlags CudaIpcConnection::remoteTransport() { + return TransportCudaIpc; +} + void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) { validateTransport(dst, remoteTransport()); validateTransport(src, transport()); - auto dstPtr = dst.impl->data; - auto srcPtr = src.impl->data; + auto dstPtr = dst.data(); + auto srcPtr = src.data(); CUDATHROW(cudaMemcpyAsync(dstPtr + dstOffset, srcPtr + srcOffset, 
size, cudaMemcpyDeviceToDevice, stream)); // npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)size); @@ -47,7 +47,13 @@ void CudaIpcConnection::flush() { // IBConnection -IBConnection::IBConnection(TransportFlags transport) : transport_(transport), remoteTransport_(TransportNone) {} +IBConnection::IBConnection(TransportFlags transport, Communicator::Impl& commImpl) : transport_(transport), remoteTransport_(TransportNone) { + MSCCLPPTHROW(mscclppIbContextCreateQp(commImpl.getIbContext(transport), &qp)); +} + +IBConnection::~IBConnection() { + // TODO: Destroy QP? +} TransportFlags IBConnection::transport() { return transport_; @@ -57,20 +63,21 @@ TransportFlags IBConnection::remoteTransport() { return remoteTransport_; } -IBConnection::IBConnection(TransportFlags transport, Communicator::Impl& commImpl) : transport_(transport), remoteTransport_(TransportNone) { - MSCCLPPTHROW(mscclppIbContextCreateQp(commImpl.getIbContext(transport), &qp)); -} - -IBConnection::~IBConnection() { - // TODO: Destroy QP? 
-} - void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) { validateTransport(dst, remoteTransport()); validateTransport(src, transport()); - auto dstMrInfo = dst.impl->getTransportInfo(remoteTransport()); - auto srcMr = src.impl->getTransportInfo(transport()); + auto dstTransportInfo = getRegisteredMemoryImpl(dst)->getTransportInfo(remoteTransport()); + if (dstTransportInfo.ibLocal) { + throw std::runtime_error("dst is local, which is not supported"); + } + auto srcTransportInfo = getRegisteredMemoryImpl(src)->getTransportInfo(remoteTransport()); + if (!srcTransportInfo.ibLocal) { + throw std::runtime_error("src is remote, which is not supported"); + } + + auto dstMrInfo = dstTransportInfo.ibMrInfo; + auto srcMr = srcTransportInfo.ibMr; qp->stageSend(srcMr, &dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, /*signaled=*/false); diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index 827b0281..8eb0e202 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -5,19 +5,21 @@ #include "mscclpp.h" #include "channel.hpp" #include "proxy.hpp" +#include "ib.h" +#include namespace mscclpp { struct Communicator::Impl { mscclppComm_t comm; std::vector> connections; - Proxy proxy; + std::unordered_map ibContexts; Impl(); ~Impl(); - friend class Connection; + mscclppIbContext* getIbContext(TransportFlags ibTransport); }; } // namespace mscclpp diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 72f0eb90..94d727e7 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -4,6 +4,7 @@ #include "mscclpp.hpp" #include #include "ib.h" +#include "communicator.hpp" namespace mscclpp { @@ -15,15 +16,15 @@ public: CudaIpcConnection(); - virtual ~CudaIpcConnection(); + ~CudaIpcConnection(); - virtual TransportFlags transport(); + TransportFlags transport() override; - virtual TransportFlags 
remoteTransport(); + TransportFlags remoteTransport() override; - virtual void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size); + void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) override; - virtual void flush(); + void flush() override; }; class IBConnection : public Connection { @@ -34,15 +35,15 @@ public: IBConnection(TransportFlags transport, Communicator::Impl& commImpl); - virtual ~IBConnection(); + ~IBConnection(); - virtual TransportFlags transport(); + TransportFlags transport() override; - virtual TransportFlags remoteTransport(); + TransportFlags remoteTransport() override; - virtual void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size); + void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) override; - virtual void flush(); + void flush() override; }; } // namespace mscclpp diff --git a/src/include/ib.hpp b/src/include/ib.hpp index 4c58cfdc..85c92af7 100644 --- a/src/include/ib.hpp +++ b/src/include/ib.hpp @@ -48,8 +48,8 @@ public: IbQp* createQp(int port = -1); private: - bool IbCtx::isPortUsable(int port) const; - int IbCtx::getAnyActivePort() const; + bool isPortUsable(int port) const; + int getAnyActivePort() const; void* ctx; void* pd; diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index 52b0511b..9c699efb 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -67,8 +67,7 @@ public: }; class Connection { - virtual ~Connection() = 0; - +public: virtual void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) = 0; virtual void flush() = 0; @@ -76,13 +75,13 @@ class Connection { virtual TransportFlags transport() = 0; virtual TransportFlags remoteTransport() = 0; + +protected: + static std::shared_ptr 
getRegisteredMemoryImpl(RegisteredMemory&); }; class Communicator { - struct Impl; - std::unique_ptr pimpl; public: - /* Initialize the communicator. nranks processes with rank 0 to nranks-1 need to call this function. * * Inputs: @@ -159,6 +158,10 @@ public: * size: the number of ranks of the communicator */ int size(); + + struct Impl; +private: + std::unique_ptr pimpl; }; } // namespace mscclpp diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index 24eed981..7a0ab1d0 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -4,14 +4,21 @@ #include "mscclpp.hpp" #include "mscclpp.h" #include "ib.h" -#include +#include "communicator.hpp" #include namespace mscclpp { struct TransportInfo { TransportFlags transport; - std::variant data; + + // TODO: rewrite this using std::variant or something + bool ibLocal; + union { + cudaIpcMemHandle_t cudaIpcHandle; + mscclppIbMr* ibMr; + mscclppIbMrInfo ibMrInfo; + }; }; struct RegisteredMemory::Impl { @@ -21,13 +28,13 @@ struct RegisteredMemory::Impl { TransportFlags transports; std::vector transportInfos; - Impl(void* data, size_t size, int rank, TransportFlags transports); + Impl(void* data, size_t size, int rank, TransportFlags transports, Communicator::Impl& commImpl); Impl(const std::vector& data); - template T& getTransportInfo(TransportFlags transport) { + TransportInfo& getTransportInfo(TransportFlags transport) { for (auto& entry : transportInfos) { if (entry.transport == transport) { - return std::get(entry.data); + return entry; } } throw std::runtime_error("Transport data not found"); diff --git a/src/registered_memory.cc b/src/registered_memory.cc index eabb9e7d..7a5a0725 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -1,14 +1,16 @@ #include "registered_memory.hpp" +#include "checks.hpp" +#include namespace mscclpp { -RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags transports, Communicator& 
comm) : data(data), size(size), rank(rank), transports(transports) { +RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags transports, Communicator::Impl& commImpl) : data(data), size(size), rank(rank), transports(transports) { if (transports & TransportCudaIpc) { TransportInfo transportInfo; transportInfo.transport = TransportCudaIpc; cudaIpcMemHandle_t handle; CUDATHROW(cudaIpcGetMemHandle(&handle, data)); - transportInfo.data = handle; + transportInfo.cudaIpcHandle = handle; this->transportInfos.push_back(transportInfo); } if (transports & TransportAllIB) { @@ -16,8 +18,9 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags t TransportInfo transportInfo; transportInfo.transport = ibTransport; mscclppIbMr* mr; - MSCCLPPTHROW(mscclppIbContextRegisterMr(comm.pimpl->getIbContext(ibTransport), data, size, &mr)); - transportInfo.data = mr; + MSCCLPPTHROW(mscclppIbContextRegisterMr(commImpl.getIbContext(ibTransport), data, size, &mr)); + transportInfo.ibMr = mr; + transportInfo.ibLocal = true; this->transportInfos.push_back(transportInfo); }; if (transports & TransportIB0) addIb(TransportIB0); @@ -31,62 +34,55 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags t } } -RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : impl(pimpl) {} +RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : pimpl(pimpl) {} RegisteredMemory::~RegisteredMemory() = default; void* RegisteredMemory::data() { - return impl->data; + return pimpl->data; } size_t RegisteredMemory::size() { - return impl->size; + return pimpl->size; } int RegisteredMemory::rank() { - return impl->rank; + return pimpl->rank; } TransportFlags RegisteredMemory::transports() { - return impl->transports; + return pimpl->transports; } std::vector RegisteredMemory::serialize() { std::vector result; - std::copy_n(reinterpret_cast(&impl->size), sizeof(impl->size), std::back_inserter(result)); - 
std::copy_n(reinterpret_cast(&impl->rank), sizeof(impl->rank), std::back_inserter(result)); - std::copy_n(reinterpret_cast(&impl->transports), sizeof(impl->transports), std::back_inserter(result)); - if (impl->transportInfos.size() > std::numeric_limits::max()) { + std::copy_n(reinterpret_cast(&pimpl->size), sizeof(pimpl->size), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl->rank), sizeof(pimpl->rank), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl->transports), sizeof(pimpl->transports), std::back_inserter(result)); + if (pimpl->transportInfos.size() > std::numeric_limits::max()) { throw std::runtime_error("Too many transport info entries"); } - int8_t transportCount = impl->transportInfos.size(); + int8_t transportCount = pimpl->transportInfos.size(); std::copy_n(reinterpret_cast(&transportCount), sizeof(transportCount), std::back_inserter(result)); - for (auto& entry : impl->transportInfos) { + for (auto& entry : pimpl->transportInfos) { std::copy_n(reinterpret_cast(&entry.transport), sizeof(entry.transport), std::back_inserter(result)); - std::visit(overloaded{ - [&](std::monostate&){ - throw std::runtime_error("Transport info not set"); - }, - [&](cudaIpcMemHandle_t handle){ - std::copy_n(reinterpret_cast(&handle), sizeof(handle), std::back_inserter(result)); - }, - [&](mscclppIbMr* mr){ - std::copy_n(reinterpret_cast(&mr->info), sizeof(mr->info), std::back_inserter(result)); - }, - [&](mscclppIbMrInfo info){ - std::copy_n(reinterpret_cast(&info), sizeof(info), std::back_inserter(result)); - } - }, entry.data); + if (entry.transport == TransportCudaIpc) { + std::copy_n(reinterpret_cast(&entry.cudaIpcHandle), sizeof(entry.cudaIpcHandle), std::back_inserter(result)); + } else if (entry.transport & TransportAllIB) { + std::copy_n(reinterpret_cast(&entry.ibMrInfo), sizeof(entry.ibMrInfo), std::back_inserter(result)); + } else { + throw std::runtime_error("Unknown transport"); + } } return result; } -static RegisteredMemory 
RegisteredMemory::deserialize(const std::vector& data) { +RegisteredMemory RegisteredMemory::deserialize(const std::vector& data) { return RegisteredMemory(std::make_shared(data)); } -RegisteredMemory::Impl::Impl(const std::vector& data) { - auto it = data.begin(); +RegisteredMemory::Impl::Impl(const std::vector& serialization) { + auto it = serialization.begin(); std::copy_n(it, sizeof(this->size), reinterpret_cast(&this->size)); it += sizeof(this->size); std::copy_n(it, sizeof(this->rank), reinterpret_cast(&this->rank)); @@ -104,24 +100,25 @@ RegisteredMemory::Impl::Impl(const std::vector& data) { cudaIpcMemHandle_t handle; std::copy_n(it, sizeof(handle), reinterpret_cast(&handle)); it += sizeof(handle); - transportInfo.data = handle; + transportInfo.cudaIpcHandle = handle; } else if (transportInfo.transport & TransportAllIB) { mscclppIbMrInfo info; std::copy_n(it, sizeof(info), reinterpret_cast(&info)); it += sizeof(info); - transportInfo.data = info; + transportInfo.ibMrInfo = info; + transportInfo.ibLocal = false; } else { throw std::runtime_error("Unknown transport"); } this->transportInfos.push_back(transportInfo); } - if (it != data.end()) { + if (it != serialization.end()) { throw std::runtime_error("Deserialization failed"); } if (transports & TransportCudaIpc) { - auto cudaIpcHandle = getTransportInfo(TransportCudaIpc); - CUDATHROW(cudaIpcOpenMemHandle(&data, cudaIpcHandle, cudaIpcMemLazyEnablePeerAccess)); + auto entry = getTransportInfo(TransportCudaIpc); + CUDATHROW(cudaIpcOpenMemHandle(&data, entry.cudaIpcHandle, cudaIpcMemLazyEnablePeerAccess)); } } From 5443ed1ec22cedb7db7e0c55e9d80e555576ad06 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 26 Apr 2023 18:07:17 +0000 Subject: [PATCH 061/135] ConnectionSetup stuff --- src/communicator.cc | 12 +++++++++--- src/connection.cc | 14 +++++++++++--- src/include/communicator.hpp | 4 +++- src/include/connection.hpp | 18 +++++++++++++++--- 4 files changed, 38 insertions(+), 10 deletions(-) diff --git 
a/src/communicator.cc b/src/communicator.cc index 316801de..9ce5b779 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -76,20 +76,26 @@ MSCCLPP_API_CPP void Communicator::bootstrapBarrier() { } MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, TransportFlags transport) { - std::shared_ptr conn; + std::shared_ptr conn; if (transport | TransportCudaIpc) { auto cudaIpcConn = std::make_shared(); conn = cudaIpcConn; } else if (transport | TransportAllIB) { - auto ibConn = std::make_shared(transport, *pimpl); + auto ibConn = std::make_shared(remoteRank, tag, transport, *pimpl); conn = ibConn; } else { throw std::runtime_error("Unsupported transport"); } + pimpl->connections.push_back(conn); } MSCCLPP_API_CPP void Communicator::connectionSetup() { - mscclppConnectionSetup(pimpl->comm); + for (auto& conn : pimpl->connections) { + conn->startSetup(*this); + } + for (auto& conn : pimpl->connections) { + conn->endSetup(*this); + } } MSCCLPP_API_CPP int Communicator::rank() { diff --git a/src/connection.cc b/src/connection.cc index 3e053cb3..24482c7b 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -33,8 +33,8 @@ void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, Register validateTransport(dst, remoteTransport()); validateTransport(src, transport()); - auto dstPtr = dst.data(); - auto srcPtr = src.data(); + char* dstPtr = (char*)dst.data(); + char* srcPtr = (char*)src.data(); CUDATHROW(cudaMemcpyAsync(dstPtr + dstOffset, srcPtr + srcOffset, size, cudaMemcpyDeviceToDevice, stream)); // npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)size); @@ -47,7 +47,7 @@ void CudaIpcConnection::flush() { // IBConnection -IBConnection::IBConnection(TransportFlags transport, Communicator::Impl& commImpl) : transport_(transport), remoteTransport_(TransportNone) { +IBConnection::IBConnection(int remoteRank, int tag, TransportFlags transport, Communicator::Impl& commImpl) : remoteRank(remoteRank), 
tag(tag), transport_(transport), remoteTransport_(TransportNone) { MSCCLPPTHROW(mscclppIbContextCreateQp(commImpl.getIbContext(transport), &qp)); } @@ -116,4 +116,12 @@ void IBConnection::flush() { // npkitCollectExitEvents(conn, NPKIT_EVENT_IB_SEND_EXIT); } +void startSetup(Communicator& comm) { + // TODO: use bootstrapper from comm to send over QP info +} + +void endSetup(Communicator& comm) { + // TODO: use bootstrapper from comm to receive QP info and do the rtr/rts calls +} + } // namespace mscclpp diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index 8eb0e202..879501c0 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -10,9 +10,11 @@ namespace mscclpp { +class ConnectionBase; + struct Communicator::Impl { mscclppComm_t comm; - std::vector> connections; + std::vector> connections; std::unordered_map ibContexts; Impl(); diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 94d727e7..ac1dd6a1 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -10,7 +10,13 @@ namespace mscclpp { // TODO: Add functionality to these classes for Communicator to do connectionSetup -class CudaIpcConnection : public Connection { +class ConnectionBase : public Connection { +public: + virtual void startSetup(Communicator&) {}; + virtual void endSetup(Communicator&) {}; +}; + +class CudaIpcConnection : public ConnectionBase { cudaStream_t stream; public: @@ -27,13 +33,15 @@ public: void flush() override; }; -class IBConnection : public Connection { +class IBConnection : public ConnectionBase { + int remoteRank; + int tag; TransportFlags transport_; TransportFlags remoteTransport_; mscclppIbQp* qp; public: - IBConnection(TransportFlags transport, Communicator::Impl& commImpl); + IBConnection(int remoteRank, int tag, TransportFlags transport, Communicator::Impl& commImpl); ~IBConnection(); @@ -44,6 +52,10 @@ public: void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, 
uint64_t srcOffset, uint64_t size) override; void flush() override; + + void startSetup(Communicator& comm) override; + + void endSetup(Communicator& comm) override; }; } // namespace mscclpp From 9c6e68525353ef0d4ea450b816fefd0c34a0f45b Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 26 Apr 2023 23:46:22 +0000 Subject: [PATCH 062/135] connectionSetup() for IBConnection --- src/connection.cc | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/connection.cc b/src/connection.cc index 24482c7b..b682903a 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -116,12 +116,19 @@ void IBConnection::flush() { // npkitCollectExitEvents(conn, NPKIT_EVENT_IB_SEND_EXIT); } -void startSetup(Communicator& comm) { - // TODO: use bootstrapper from comm to send over QP info +void IBConnection::startSetup(Communicator& comm) { + comm.bootstrap().send(&qp->info, sizeof(qp->info), remoteRank, tag); } -void endSetup(Communicator& comm) { - // TODO: use bootstrapper from comm to receive QP info and do the rtr/rts calls +void IBConnection::endSetup(Communicator& comm) { + mscclppIbQpInfo qpInfo; + comm.bootstrap().recv(&qpInfo, sizeof(qpInfo), remoteRank, tag); + if (qp->rtr(&qpInfo) != 0) { + throw std::runtime_error("Failed to transition QP to RTR"); + } + if (qp->rts() != 0) { + throw std::runtime_error("Failed to transition QP to RTS"); + } } } // namespace mscclpp From 7c87ca300526663ff0121b98a29f3415c9aff87d Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 27 Apr 2023 00:01:38 +0000 Subject: [PATCH 063/135] Missing functions and TODOs --- TODO.md | 7 +++++ src/communicator.cc | 77 +++++++++++++++++++++++++++++++++++++++++++++ src/connection.cc | 6 ++++ 3 files changed, 90 insertions(+) create mode 100644 TODO.md diff --git a/TODO.md b/TODO.md new file mode 100644 index 00000000..5338c10e --- /dev/null +++ b/TODO.md @@ -0,0 +1,7 @@ +# Core API extraction + +- Add a test for host side Communicator/RegisteredMemory/Connection use. 
+- Implement a standalone "epoch" synchronization construct that can be used as a component in custom proxies. epoch.hpp/cc has the beginnings of this. +- Reimplement the "standard" proxy service + DeviceConnection on top of the new Communicator/RegisteredMemory/Connection core API. Remants of the old code is in channel.hpp, basic_proxy_handler.hpp/cc and host_connection.hpp/cc. +- Change the new IBConnection and Communicator to use the new C++ IbCtx and IbQp classes. +- Implement IbQp::~IbQp() \ No newline at end of file diff --git a/src/communicator.cc b/src/communicator.cc index 9ce5b779..ce26d64a 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -110,4 +110,81 @@ MSCCLPP_API_CPP int Communicator::size() { return result; } +// TODO: move these elsewhere + +int getIBDeviceCount() { + int num; + struct ibv_device** devices = ibv_get_device_list(&num); + return num; +} + +std::string getIBDeviceName(TransportFlags ibTransport) { + int num; + struct ibv_device** devices = ibv_get_device_list(&num); + int ibTransportIndex; + switch (ibTransport) { // TODO: get rid of this ugly switch + case TransportIB0: + ibTransportIndex = 0; + break; + case TransportIB1: + ibTransportIndex = 1; + break; + case TransportIB2: + ibTransportIndex = 2; + break; + case TransportIB3: + ibTransportIndex = 3; + break; + case TransportIB4: + ibTransportIndex = 4; + break; + case TransportIB5: + ibTransportIndex = 5; + break; + case TransportIB6: + ibTransportIndex = 6; + break; + case TransportIB7: + ibTransportIndex = 7; + break; + default: + throw std::runtime_error("Not an IB transport"); + } + if (ibTransportIndex >= num) { + throw std::runtime_error("IB transport out of range"); + } + return devices[ibTransportIndex]->name; +} + +TransportFlags getIBTransportByDeviceName(const std::string& ibDeviceName) { + int num; + struct ibv_device** devices = ibv_get_device_list(&num); + for (int i = 0; i < num; ++i) { + if (ibDeviceName == devices[i]->name) { + switch (i) { // TODO: 
get rid of this ugly switch + case 0: + return TransportIB0; + case 1: + return TransportIB1; + case 2: + return TransportIB2; + case 3: + return TransportIB3; + case 4: + return TransportIB4; + case 5: + return TransportIB5; + case 6: + return TransportIB6; + case 7: + return TransportIB7; + default: + throw std::runtime_error("IB device index out of range"); + } + } + } + throw std::runtime_error("IB device not found"); +} + + } // namespace mscclpp \ No newline at end of file diff --git a/src/connection.cc b/src/connection.cc index b682903a..8d1b5e11 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -11,6 +11,12 @@ void validateTransport(RegisteredMemory mem, TransportFlags transport) { } } +// Connection + +std::shared_ptr Connection::getRegisteredMemoryImpl(RegisteredMemory& mem) { + return mem.pimpl; +} + // CudaIpcConnection CudaIpcConnection::CudaIpcConnection() { From d096874d578d1f8c5e598002c3bd51b7b5972dc7 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 27 Apr 2023 00:22:29 +0000 Subject: [PATCH 064/135] TODO updates --- TODO.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TODO.md b/TODO.md index 5338c10e..63cb4eb7 100644 --- a/TODO.md +++ b/TODO.md @@ -2,6 +2,6 @@ - Add a test for host side Communicator/RegisteredMemory/Connection use. - Implement a standalone "epoch" synchronization construct that can be used as a component in custom proxies. epoch.hpp/cc has the beginnings of this. -- Reimplement the "standard" proxy service + DeviceConnection on top of the new Communicator/RegisteredMemory/Connection core API. Remants of the old code is in channel.hpp, basic_proxy_handler.hpp/cc and host_connection.hpp/cc. +- Reimplement the "standard" proxy service + DeviceConnection on top of the new Communicator/RegisteredMemory/Connection core API. Remants of the old code is in channel.hpp, basic_proxy_handler.hpp/cc and host_connection.hpp/cc. Probably need a manager class to wrap all of this. 
- Change the new IBConnection and Communicator to use the new C++ IbCtx and IbQp classes. - Implement IbQp::~IbQp() \ No newline at end of file From 0e9f6fadc73b36f0117ccc5845f72c0884d0ce52 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 27 Apr 2023 00:26:00 +0000 Subject: [PATCH 065/135] TODOs --- TODO.md | 3 ++- src/registered_memory.cc | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/TODO.md b/TODO.md index 63cb4eb7..677b46cf 100644 --- a/TODO.md +++ b/TODO.md @@ -4,4 +4,5 @@ - Implement a standalone "epoch" synchronization construct that can be used as a component in custom proxies. epoch.hpp/cc has the beginnings of this. - Reimplement the "standard" proxy service + DeviceConnection on top of the new Communicator/RegisteredMemory/Connection core API. Remants of the old code is in channel.hpp, basic_proxy_handler.hpp/cc and host_connection.hpp/cc. Probably need a manager class to wrap all of this. - Change the new IBConnection and Communicator to use the new C++ IbCtx and IbQp classes. -- Implement IbQp::~IbQp() \ No newline at end of file +- Implement IbQp::~IbQp() +- Fix RegisteredMemory::Impl::Impl to get the IPC handle from the base pointer, not the derived pointer. 
\ No newline at end of file diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 7a5a0725..d9476e4f 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -9,6 +9,7 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags t TransportInfo transportInfo; transportInfo.transport = TransportCudaIpc; cudaIpcMemHandle_t handle; + // TODO: translate data to a base pointer CUDATHROW(cudaIpcGetMemHandle(&handle, data)); transportInfo.cudaIpcHandle = handle; this->transportInfos.push_back(transportInfo); From 47d4606f130deb6311a8d23f482f59aa630957d6 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 27 Apr 2023 00:33:24 +0000 Subject: [PATCH 066/135] Add registerMemory --- src/communicator.cc | 7 ++++++- src/include/mscclpp.hpp | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/communicator.cc b/src/communicator.cc index ce26d64a..c34dbb31 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -8,6 +8,7 @@ #include "checks.hpp" #include "debug.h" #include "connection.hpp" +#include "registered_memory.hpp" namespace mscclpp { @@ -75,6 +76,10 @@ MSCCLPP_API_CPP void Communicator::bootstrapBarrier() { mscclppBootstrapBarrier(pimpl->comm); } +RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) { + return RegisteredMemory(std::make_shared(ptr, size, pimpl->comm->rank, transports, *pimpl)); +} + MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, TransportFlags transport) { std::shared_ptr conn; if (transport | TransportCudaIpc) { @@ -114,7 +119,7 @@ MSCCLPP_API_CPP int Communicator::size() { int getIBDeviceCount() { int num; - struct ibv_device** devices = ibv_get_device_list(&num); + ibv_get_device_list(&num); return num; } diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index f2d8667e..bd4bc067 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -102,6 +102,7 @@ public: static 
RegisteredMemory deserialize(const std::vector& data); friend class Connection; + friend class Communicator; }; class Connection { From 08e80f1754527fe9f72026d032c1b08301587a8d Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 27 Apr 2023 04:01:46 +0000 Subject: [PATCH 067/135] IB: completely replaced with C++ interfaces --- src/communicator.cc | 87 +---- src/connection.cc | 34 +- src/ib.cc | 612 +++++++++++++----------------- src/include/comm.h | 7 +- src/include/communicator.hpp | 6 +- src/include/connection.hpp | 4 +- src/include/ib.h | 69 ---- src/include/ib.hpp | 53 ++- src/include/mscclpp.h | 17 +- src/include/proxy.h | 2 +- src/include/registered_memory.hpp | 6 +- src/init.cc | 79 ++-- src/proxy.cc | 2 +- src/registered_memory.cc | 5 +- tests/unittests/ib_test.cc | 64 ++-- 15 files changed, 409 insertions(+), 638 deletions(-) delete mode 100644 src/include/ib.h diff --git a/src/communicator.cc b/src/communicator.cc index c34dbb31..6c501d70 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -16,7 +16,7 @@ Communicator::Impl::Impl() : comm(nullptr) {} Communicator::Impl::~Impl() { for (auto& entry : ibContexts) { - mscclppIbContextDestroy(entry.second); + delete entry.second; } ibContexts.clear(); if (comm) { @@ -24,13 +24,12 @@ Communicator::Impl::~Impl() { } } -mscclppIbContext* Communicator::Impl::getIbContext(TransportFlags ibTransport) { +IbCtx* Communicator::Impl::getIbContext(TransportFlags ibTransport) { // Find IB context or create it auto it = ibContexts.find(ibTransport); if (it == ibContexts.end()) { auto ibDev = getIBDeviceName(ibTransport); - mscclppIbContext* ibCtx; - MSCCLPPTHROW(mscclppIbContextCreate(&ibCtx, ibDev.c_str())); + IbCtx* ibCtx = new IbCtx(ibDev); ibContexts[ibTransport] = ibCtx; return ibCtx; } else { @@ -92,6 +91,7 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank throw std::runtime_error("Unsupported transport"); } pimpl->connections.push_back(conn); + return conn; } MSCCLPP_API_CPP void 
Communicator::connectionSetup() { @@ -115,81 +115,4 @@ MSCCLPP_API_CPP int Communicator::size() { return result; } -// TODO: move these elsewhere - -int getIBDeviceCount() { - int num; - ibv_get_device_list(&num); - return num; -} - -std::string getIBDeviceName(TransportFlags ibTransport) { - int num; - struct ibv_device** devices = ibv_get_device_list(&num); - int ibTransportIndex; - switch (ibTransport) { // TODO: get rid of this ugly switch - case TransportIB0: - ibTransportIndex = 0; - break; - case TransportIB1: - ibTransportIndex = 1; - break; - case TransportIB2: - ibTransportIndex = 2; - break; - case TransportIB3: - ibTransportIndex = 3; - break; - case TransportIB4: - ibTransportIndex = 4; - break; - case TransportIB5: - ibTransportIndex = 5; - break; - case TransportIB6: - ibTransportIndex = 6; - break; - case TransportIB7: - ibTransportIndex = 7; - break; - default: - throw std::runtime_error("Not an IB transport"); - } - if (ibTransportIndex >= num) { - throw std::runtime_error("IB transport out of range"); - } - return devices[ibTransportIndex]->name; -} - -TransportFlags getIBTransportByDeviceName(const std::string& ibDeviceName) { - int num; - struct ibv_device** devices = ibv_get_device_list(&num); - for (int i = 0; i < num; ++i) { - if (ibDeviceName == devices[i]->name) { - switch (i) { // TODO: get rid of this ugly switch - case 0: - return TransportIB0; - case 1: - return TransportIB1; - case 2: - return TransportIB2; - case 3: - return TransportIB3; - case 4: - return TransportIB4; - case 5: - return TransportIB5; - case 6: - return TransportIB6; - case 7: - return TransportIB7; - default: - throw std::runtime_error("IB device index out of range"); - } - } - } - throw std::runtime_error("IB device not found"); -} - - -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/connection.cc b/src/connection.cc index 8d1b5e11..1e21694c 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -2,6 +2,7 @@ #include 
"checks.hpp" #include "registered_memory.hpp" #include "npkit/npkit.h" +#include "infiniband/verbs.h" namespace mscclpp { @@ -54,7 +55,7 @@ void CudaIpcConnection::flush() { // IBConnection IBConnection::IBConnection(int remoteRank, int tag, TransportFlags transport, Communicator::Impl& commImpl) : remoteRank(remoteRank), tag(tag), transport_(transport), remoteTransport_(TransportNone) { - MSCCLPPTHROW(mscclppIbContextCreateQp(commImpl.getIbContext(transport), &qp)); + qp = commImpl.getIbContext(transport)->createQp(); } IBConnection::~IBConnection() { @@ -85,13 +86,8 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem auto dstMrInfo = dstTransportInfo.ibMrInfo; auto srcMr = srcTransportInfo.ibMr; - qp->stageSend(srcMr, &dstMrInfo, (uint32_t)size, - /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, /*signaled=*/false); - int ret = qp->postSend(); - if (ret != 0) { - // Return value is errno. - WARN("data postSend failed: errno %d", ret); - } + qp->stageSend(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, /*signaled=*/false); + qp->postSend(); // npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_DATA_ENTRY, (uint32_t)size); } @@ -104,15 +100,11 @@ void IBConnection::flush() { continue; } for (int i = 0; i < wcNum; ++i) { - struct ibv_wc* wc = &qp->wcs[i]; + const struct ibv_wc* wc = reinterpret_cast(qp->getWc(i)); if (wc->status != IBV_WC_SUCCESS) { WARN("wc status %d", wc->status); continue; } - if (wc->qp_num != qp->qp->qp_num) { - WARN("got wc of unknown qp_num %d", wc->qp_num); - continue; - } if (wc->opcode == IBV_WC_RDMA_WRITE) { isWaiting = false; break; @@ -123,18 +115,16 @@ void IBConnection::flush() { } void IBConnection::startSetup(Communicator& comm) { - comm.bootstrap().send(&qp->info, sizeof(qp->info), remoteRank, tag); + // TODO(chhwang): temporarily disabled to compile + // comm.bootstrap().send(&qp->getInfo(), sizeof(qp->getInfo()), remoteRank, tag); } void 
IBConnection::endSetup(Communicator& comm) { - mscclppIbQpInfo qpInfo; - comm.bootstrap().recv(&qpInfo, sizeof(qpInfo), remoteRank, tag); - if (qp->rtr(&qpInfo) != 0) { - throw std::runtime_error("Failed to transition QP to RTR"); - } - if (qp->rts() != 0) { - throw std::runtime_error("Failed to transition QP to RTS"); - } + IbQpInfo qpInfo; + // TODO(chhwang): temporarily disabled to compile + // comm.bootstrap().recv(&qpInfo, sizeof(qpInfo), remoteRank, tag); + qp->rtr(qpInfo); + qp->rts(); } } // namespace mscclpp diff --git a/src/ib.cc b/src/ib.cc index 4a094761..4dc0285b 100644 --- a/src/ib.cc +++ b/src/ib.cc @@ -4,370 +4,67 @@ #include #include #include -#include +#include "mscclpp.hpp" #include "alloc.h" #include "comm.h" #include "debug.h" -#include "ib.h" #include "ib.hpp" #include "checks.hpp" +#include +#include -mscclppResult_t mscclppIbContextCreate(struct mscclppIbContext** ctx, const char* ibDevName) -{ - struct mscclppIbContext* _ctx; - MSCCLPPCHECK(mscclppCalloc(&_ctx, 1)); +namespace mscclpp { - std::vector ports; - - int num; - struct ibv_device** devices = ibv_get_device_list(&num); - for (int i = 0; i < num; ++i) { - if (strncmp(devices[i]->name, ibDevName, IBV_SYSFS_NAME_MAX) == 0) { - _ctx->ctx = ibv_open_device(devices[i]); - break; - } - } - ibv_free_device_list(devices); - if (_ctx->ctx == nullptr) { - WARN("ibv_open_device failed (errno %d, device name %s)", errno, ibDevName); - goto fail; - } - - // Check available ports - struct ibv_device_attr devAttr; - if (ibv_query_device(_ctx->ctx, &devAttr) != 0) { - WARN("ibv_query_device failed (errno %d, device name %s)", errno, ibDevName); - goto fail; - } - - for (uint8_t i = 1; i <= devAttr.phys_port_cnt; ++i) { - struct ibv_port_attr portAttr; - if (ibv_query_port(_ctx->ctx, i, &portAttr) != 0) { - WARN("ibv_query_port failed (errno %d, port %d)", errno, i); - goto fail; - } - if (portAttr.state != IBV_PORT_ACTIVE) { - continue; - } - if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND && 
portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) { - continue; - } - ports.push_back((int)i); - } - if (ports.size() == 0) { - WARN("no active IB port found"); - goto fail; - } - MSCCLPPCHECK(mscclppCalloc(&_ctx->ports, ports.size())); - _ctx->nPorts = (int)ports.size(); - for (int i = 0; i < _ctx->nPorts; ++i) { - _ctx->ports[i] = ports[i]; - } - - _ctx->pd = ibv_alloc_pd(_ctx->ctx); - if (_ctx->pd == NULL) { - WARN("ibv_alloc_pd failed (errno %d)", errno); - goto fail; - } - - *ctx = _ctx; - return mscclppSuccess; -fail: - *ctx = NULL; - if (_ctx->ports != NULL) { - free(_ctx->ports); - } - free(_ctx); - return mscclppInternalError; -} - -mscclppResult_t mscclppIbContextDestroy(struct mscclppIbContext* ctx) -{ - for (int i = 0; i < ctx->nMrs; ++i) { - if (ctx->mrs[i].mr) { - ibv_dereg_mr(ctx->mrs[i].mr); - } - } - for (int i = 0; i < ctx->nQps; ++i) { - if (ctx->qps[i].qp) { - ibv_destroy_qp(ctx->qps[i].qp); - } - ibv_destroy_cq(ctx->qps[i].cq); - free(ctx->qps[i].wcs); - free(ctx->qps[i].sges); - free(ctx->qps[i].wrs); - } - if (ctx->pd != NULL) { - ibv_dealloc_pd(ctx->pd); - } - if (ctx->ctx != NULL) { - ibv_close_device(ctx->ctx); - } - free(ctx->mrs); - free(ctx->qps); - free(ctx->ports); - free(ctx); - return mscclppSuccess; -} - -mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext* ctx, struct mscclppIbQp** ibQp, int port /*=-1*/) -{ - if (port < 0) { - port = ctx->ports[0]; - } else { - bool found = false; - for (int i = 0; i < ctx->nPorts; ++i) { - if (ctx->ports[i] == port) { - found = true; - break; - } - } - if (!found) { - WARN("invalid IB port: %d", port); - return mscclppInternalError; - } - } - - struct ibv_cq* cq = ibv_create_cq(ctx->ctx, MSCCLPP_IB_CQ_SIZE, NULL, NULL, 0); - if (cq == NULL) { - WARN("ibv_create_cq failed (errno %d)", errno); - return mscclppInternalError; - } - - struct ibv_qp_init_attr qp_init_attr; - std::memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr)); - qp_init_attr.sq_sig_all = 0; - qp_init_attr.send_cq 
= cq; - qp_init_attr.recv_cq = cq; - qp_init_attr.qp_type = IBV_QPT_RC; - qp_init_attr.cap.max_send_wr = MAXCONNECTIONS * MSCCLPP_PROXY_FIFO_SIZE; - qp_init_attr.cap.max_recv_wr = MAXCONNECTIONS * MSCCLPP_PROXY_FIFO_SIZE; - qp_init_attr.cap.max_send_sge = 1; - qp_init_attr.cap.max_recv_sge = 1; - qp_init_attr.cap.max_inline_data = 0; - struct ibv_qp* qp = ibv_create_qp(ctx->pd, &qp_init_attr); - if (qp == nullptr) { - WARN("ibv_create_qp failed (errno %d)", errno); - return mscclppInternalError; - } - struct ibv_port_attr port_attr; - if (ibv_query_port(ctx->ctx, port, &port_attr) != 0) { - WARN("ibv_query_port failed (errno %d, port %d)", errno, port); - return mscclppInternalError; - } - - // Register QP to this ctx - qp->context = ctx->ctx; - if (qp->context == NULL) { - WARN("IB context is NULL"); - return mscclppInternalError; - } - ctx->nQps++; - if (ctx->qps == NULL) { - MSCCLPPCHECK(mscclppCalloc(&ctx->qps, MAXCONNECTIONS)); - ctx->maxQps = MAXCONNECTIONS; - } - if (ctx->maxQps < ctx->nQps) { - WARN("too many QPs"); - return mscclppInternalError; - } - struct mscclppIbQp* _ibQp = &ctx->qps[ctx->nQps - 1]; - _ibQp->qp = qp; - _ibQp->info.lid = port_attr.lid; - _ibQp->info.port = port; - _ibQp->info.linkLayer = port_attr.link_layer; - _ibQp->info.qpn = qp->qp_num; - _ibQp->info.mtu = port_attr.active_mtu; - if (port_attr.link_layer != IBV_LINK_LAYER_INFINIBAND) { - union ibv_gid gid; - if (ibv_query_gid(ctx->ctx, port, 0, &gid) != 0) { - WARN("ibv_query_gid failed (errno %d)", errno); - return mscclppInternalError; - } - _ibQp->info.spn = gid.global.subnet_prefix; - } - - struct ibv_qp_attr qp_attr; - std::memset(&qp_attr, 0, sizeof(struct ibv_qp_attr)); - qp_attr.qp_state = IBV_QPS_INIT; - qp_attr.pkey_index = 0; - qp_attr.port_num = port; - qp_attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; - if (ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS) != 0) { - WARN("ibv_modify_qp failed 
(errno %d)", errno); - return mscclppInternalError; - } - - MSCCLPPCHECK(mscclppCalloc(&_ibQp->wrs, MSCCLPP_IB_MAX_SENDS)); - MSCCLPPCHECK(mscclppCalloc(&_ibQp->sges, MSCCLPP_IB_MAX_SENDS)); - MSCCLPPCHECK(mscclppCalloc(&_ibQp->wcs, MSCCLPP_IB_CQ_POLL_NUM)); - _ibQp->cq = cq; - - *ibQp = _ibQp; - - return mscclppSuccess; -} - -mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext* ctx, void* buff, size_t size, - struct mscclppIbMr** ibMr) +IbMr::IbMr(void* pd, void* buff, std::size_t size) : buff(buff) { if (size == 0) { - WARN("invalid size: %zu", size); - return mscclppInvalidArgument; + throw std::runtime_error("invalid size: " + std::to_string(size)); } static __thread uintptr_t pageSize = 0; if (pageSize == 0) { pageSize = sysconf(_SC_PAGESIZE); } uintptr_t addr = reinterpret_cast(buff) & -pageSize; - size_t pages = (size + (reinterpret_cast(buff) - addr) + pageSize - 1) / pageSize; - struct ibv_mr* mr = - ibv_reg_mr(ctx->pd, reinterpret_cast(addr), pages * pageSize, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_RELAXED_ORDERING); - if (mr == nullptr) { - WARN("ibv_reg_mr failed (errno %d)", errno); - return mscclppInternalError; + std::size_t pages = (size + (reinterpret_cast(buff) - addr) + pageSize - 1) / pageSize; + struct ibv_pd* _pd = reinterpret_cast(pd); + struct ibv_mr* _mr = ibv_reg_mr(_pd, reinterpret_cast(addr), pages * pageSize, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_RELAXED_ORDERING); + if (_mr == nullptr) { + std::stringstream err; + err << "ibv_reg_mr failed (errno " << errno << ")"; + throw std::runtime_error(err.str()); } - ctx->nMrs++; - if (ctx->mrs == NULL) { - MSCCLPPCHECK(mscclppCalloc(&ctx->mrs, MAXCONNECTIONS)); - ctx->maxMrs = MAXCONNECTIONS; - } - if (ctx->maxMrs < ctx->nMrs) { - WARN("too many MRs"); - return mscclppInternalError; - } - struct mscclppIbMr* _ibMr = &ctx->mrs[ctx->nMrs - 1]; - _ibMr->mr = mr; - _ibMr->buff = buff; - 
_ibMr->info.addr = (uint64_t)buff; - _ibMr->info.rkey = mr->rkey; - *ibMr = _ibMr; - return mscclppSuccess; + this->mr = _mr; + this->size = pages * pageSize; } -////////////////////////////////////////////////////////////////////////////// - -int mscclppIbQp::rtr(const mscclppIbQpInfo* info) +IbMr::~IbMr() { - struct ibv_qp_attr qp_attr; - std::memset(&qp_attr, 0, sizeof(struct ibv_qp_attr)); - qp_attr.qp_state = IBV_QPS_RTR; - qp_attr.path_mtu = info->mtu; - qp_attr.dest_qp_num = info->qpn; - qp_attr.rq_psn = 0; - qp_attr.max_dest_rd_atomic = 1; - qp_attr.min_rnr_timer = 0x12; - if (info->linkLayer == IBV_LINK_LAYER_ETHERNET) { - qp_attr.ah_attr.is_global = 1; - qp_attr.ah_attr.grh.dgid.global.subnet_prefix = info->spn; - qp_attr.ah_attr.grh.dgid.global.interface_id = info->lid; - qp_attr.ah_attr.grh.flow_label = 0; - qp_attr.ah_attr.grh.sgid_index = 0; - qp_attr.ah_attr.grh.hop_limit = 255; - qp_attr.ah_attr.grh.traffic_class = 0; - } else { - qp_attr.ah_attr.is_global = 0; - qp_attr.ah_attr.dlid = info->lid; - } - qp_attr.ah_attr.sl = 0; - qp_attr.ah_attr.src_path_bits = 0; - qp_attr.ah_attr.port_num = info->port; - return ibv_modify_qp(this->qp, &qp_attr, - IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | - IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER); + ibv_dereg_mr(reinterpret_cast(this->mr)); } -int mscclppIbQp::rts() +IbMrInfo IbMr::getInfo() const { - struct ibv_qp_attr qp_attr; - std::memset(&qp_attr, 0, sizeof(struct ibv_qp_attr)); - qp_attr.qp_state = IBV_QPS_RTS; - qp_attr.timeout = 18; - qp_attr.retry_cnt = 7; - qp_attr.rnr_retry = 7; - qp_attr.sq_psn = 0; - qp_attr.max_rd_atomic = 1; - return ibv_modify_qp(this->qp, &qp_attr, - IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | - IBV_QP_MAX_QP_RD_ATOMIC); + IbMrInfo info; + info.addr = reinterpret_cast(this->buff); + info.rkey = reinterpret_cast(this->mr)->rkey; + return info; } -int mscclppIbQp::stageSend(struct mscclppIbMr* 
ibMr, const mscclppIbMrInfo* info, uint32_t size, uint64_t wrId, - uint64_t srcOffset, uint64_t dstOffset, bool signaled) +const void* IbMr::getBuff() const { - if (this->wrn >= MSCCLPP_IB_MAX_SENDS) { - return -1; - } - int wrn = this->wrn; - struct ibv_send_wr* wr_ = &this->wrs[wrn]; - struct ibv_sge* sge_ = &this->sges[wrn]; - // std::memset(wr_, 0, sizeof(struct ibv_send_wr)); - // std::memset(sge_, 0, sizeof(struct ibv_sge)); - wr_->wr_id = wrId; - wr_->sg_list = sge_; - wr_->num_sge = 1; - wr_->opcode = IBV_WR_RDMA_WRITE; - wr_->send_flags = signaled ? IBV_SEND_SIGNALED : 0; - wr_->wr.rdma.remote_addr = (uint64_t)(info->addr) + dstOffset; - wr_->wr.rdma.rkey = info->rkey; - wr_->next = nullptr; - sge_->addr = (uint64_t)(ibMr->buff) + srcOffset; - sge_->length = size; - sge_->lkey = ibMr->mr->lkey; - if (wrn > 0) { - this->wrs[wrn - 1].next = wr_; - } - this->wrn++; - return this->wrn; + return this->buff; } -int mscclppIbQp::stageSendWithImm(struct mscclppIbMr* ibMr, const mscclppIbMrInfo* info, uint32_t size, uint64_t wrId, - uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData) +uint32_t IbMr::getLkey() const { - int wrn = this->stageSend(ibMr, info, size, wrId, srcOffset, dstOffset, signaled); - this->wrs[wrn - 1].opcode = IBV_WR_RDMA_WRITE_WITH_IMM; - this->wrs[wrn - 1].imm_data = immData; - return wrn; + return reinterpret_cast(this->mr)->lkey; } -int mscclppIbQp::postSend() -{ - if (this->wrn == 0) { - return 0; - } - - struct ibv_send_wr* bad_wr; - int ret = ibv_post_send(this->qp, this->wrs, &bad_wr); - if (ret != 0) { - return ret; - } - this->wrn = 0; - return 0; -} - -int mscclppIbQp::postRecv(uint64_t wrId) -{ - struct ibv_recv_wr wr, *bad_wr; - wr.wr_id = wrId; - wr.sg_list = nullptr; - wr.num_sge = 0; - wr.next = nullptr; - return ibv_post_recv(this->qp, &wr, &bad_wr); -} - -int mscclppIbQp::pollCq() -{ - return ibv_poll_cq(this->cq, MSCCLPP_IB_CQ_POLL_NUM, this->wcs); -} - -namespace mscclpp { - IbQp::IbQp(void* ctx, 
void* pd, int port) { - struct ibv_context* _ctx = static_cast(ctx); - struct ibv_pd* _pd = static_cast(pd); + struct ibv_context* _ctx = reinterpret_cast(ctx); + struct ibv_pd* _pd = reinterpret_cast(pd); this->cq = ibv_create_cq(_ctx, MSCCLPP_IB_CQ_SIZE, nullptr, nullptr, 0); if (this->cq == nullptr) { @@ -379,8 +76,8 @@ IbQp::IbQp(void* ctx, void* pd, int port) struct ibv_qp_init_attr qpInitAttr; std::memset(&qpInitAttr, 0, sizeof(qpInitAttr)); qpInitAttr.sq_sig_all = 0; - qpInitAttr.send_cq = static_cast(this->cq); - qpInitAttr.recv_cq = static_cast(this->cq); + qpInitAttr.send_cq = reinterpret_cast(this->cq); + qpInitAttr.recv_cq = reinterpret_cast(this->cq); qpInitAttr.qp_type = IBV_QPT_RC; qpInitAttr.cap.max_send_wr = MAXCONNECTIONS * MSCCLPP_PROXY_FIFO_SIZE; qpInitAttr.cap.max_recv_wr = MAXCONNECTIONS * MSCCLPP_PROXY_FIFO_SIZE; @@ -428,14 +125,160 @@ IbQp::IbQp(void* ctx, void* pd, int port) throw std::runtime_error(err.str()); } this->qp = _qp; + MSCCLPPTHROW(mscclppCalloc(reinterpret_cast(&this->wrs), MSCCLPP_IB_MAX_SENDS)); + MSCCLPPTHROW(mscclppCalloc(reinterpret_cast(&this->sges), MSCCLPP_IB_MAX_SENDS)); + MSCCLPPTHROW(mscclppCalloc(reinterpret_cast(&this->wcs), MSCCLPP_IB_CQ_POLL_NUM)); } -IbCtx::IbCtx(const std::string& ibDevName) +IbQp::~IbQp() +{ + ibv_destroy_qp(reinterpret_cast(this->qp)); + ibv_destroy_cq(reinterpret_cast(this->cq)); + std::free(this->wrs); + std::free(this->sges); + std::free(this->wcs); +} + +void IbQp::rtr(const IbQpInfo& info) +{ + struct ibv_qp_attr qp_attr; + std::memset(&qp_attr, 0, sizeof(struct ibv_qp_attr)); + qp_attr.qp_state = IBV_QPS_RTR; + qp_attr.path_mtu = static_cast(info.mtu); + qp_attr.dest_qp_num = info.qpn; + qp_attr.rq_psn = 0; + qp_attr.max_dest_rd_atomic = 1; + qp_attr.min_rnr_timer = 0x12; + if (info.linkLayer == IBV_LINK_LAYER_ETHERNET) { + qp_attr.ah_attr.is_global = 1; + qp_attr.ah_attr.grh.dgid.global.subnet_prefix = info.spn; + qp_attr.ah_attr.grh.dgid.global.interface_id = info.lid; + 
qp_attr.ah_attr.grh.flow_label = 0; + qp_attr.ah_attr.grh.sgid_index = 0; + qp_attr.ah_attr.grh.hop_limit = 255; + qp_attr.ah_attr.grh.traffic_class = 0; + } else { + qp_attr.ah_attr.is_global = 0; + qp_attr.ah_attr.dlid = info.lid; + } + qp_attr.ah_attr.sl = 0; + qp_attr.ah_attr.src_path_bits = 0; + qp_attr.ah_attr.port_num = info.port; + int ret = ibv_modify_qp(reinterpret_cast(this->qp), &qp_attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER); + if (ret != 0) { + std::stringstream err; + err << "ibv_modify_qp failed (errno " << errno << ")"; + throw std::runtime_error(err.str()); + } +} + +void IbQp::rts() +{ + struct ibv_qp_attr qp_attr; + std::memset(&qp_attr, 0, sizeof(struct ibv_qp_attr)); + qp_attr.qp_state = IBV_QPS_RTS; + qp_attr.timeout = 18; + qp_attr.retry_cnt = 7; + qp_attr.rnr_retry = 7; + qp_attr.sq_psn = 0; + qp_attr.max_rd_atomic = 1; + int ret = ibv_modify_qp(reinterpret_cast(this->qp), &qp_attr, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC); + if (ret != 0) { + std::stringstream err; + err << "ibv_modify_qp failed (errno " << errno << ")"; + throw std::runtime_error(err.str()); + } +} + +int IbQp::stageSend(const IbMr *mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled) +{ + if (this->wrn >= MSCCLPP_IB_MAX_SENDS) { + return -1; + } + int wrn = this->wrn; + struct ibv_send_wr* wrs_ = reinterpret_cast(this->wrs); + struct ibv_sge* sges_ = reinterpret_cast(this->sges); + + struct ibv_send_wr* wr_ = &wrs_[wrn]; + struct ibv_sge* sge_ = &sges_[wrn]; + wr_->wr_id = wrId; + wr_->sg_list = sge_; + wr_->num_sge = 1; + wr_->opcode = IBV_WR_RDMA_WRITE; + wr_->send_flags = signaled ? 
IBV_SEND_SIGNALED : 0; + wr_->wr.rdma.remote_addr = (uint64_t)(info.addr) + dstOffset; + wr_->wr.rdma.rkey = info.rkey; + wr_->next = nullptr; + sge_->addr = (uint64_t)(mr->getBuff()) + srcOffset; + sge_->length = size; + sge_->lkey = mr->getLkey(); + if (wrn > 0) { + wrs_[wrn - 1].next = wr_; + } + this->wrn++; + return this->wrn; +} + +int IbQp::stageSendWithImm(const IbMr *mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData) +{ + int wrn = this->stageSend(mr, info, size, wrId, srcOffset, dstOffset, signaled); + struct ibv_send_wr* wrs_ = reinterpret_cast(this->wrs); + wrs_[wrn - 1].opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wrs_[wrn - 1].imm_data = immData; + return wrn; +} + +void IbQp::postSend() +{ + if (this->wrn == 0) { + return; + } + struct ibv_send_wr* bad_wr; + int ret = ibv_post_send(reinterpret_cast(this->qp), reinterpret_cast(this->wrs), &bad_wr); + if (ret != 0) { + std::stringstream err; + err << "ibv_post_send failed (errno " << errno << ")"; + throw std::runtime_error(err.str()); + } + this->wrn = 0; +} + +void IbQp::postRecv(uint64_t wrId) +{ + struct ibv_recv_wr wr, *bad_wr; + wr.wr_id = wrId; + wr.sg_list = nullptr; + wr.num_sge = 0; + wr.next = nullptr; + int ret = ibv_post_recv(reinterpret_cast(this->qp), &wr, &bad_wr); + if (ret != 0) { + std::stringstream err; + err << "ibv_post_recv failed (errno " << errno << ")"; + throw std::runtime_error(err.str()); + } +} + +int IbQp::pollCq() +{ + return ibv_poll_cq(reinterpret_cast(this->cq), MSCCLPP_IB_CQ_POLL_NUM, reinterpret_cast(this->wcs)); +} + +const IbQpInfo& IbQp::getInfo() const +{ + return this->info; +} + +const void* IbQp::getWc(int idx) const +{ + return &reinterpret_cast(this->wcs)[idx]; +} + +IbCtx::IbCtx(const std::string& devName) : devName(devName) { int num; struct ibv_device** devices = ibv_get_device_list(&num); for (int i = 0; i < num; ++i) { - if (std::string(devices[i]->name) == ibDevName) { + if 
(std::string(devices[i]->name) == devName) { this->ctx = ibv_open_device(devices[i]); break; } @@ -443,10 +286,10 @@ IbCtx::IbCtx(const std::string& ibDevName) ibv_free_device_list(devices); if (this->ctx == nullptr) { std::stringstream err; - err << "ibv_open_device failed (errno " << errno << ", device name << " << ibDevName << ")"; + err << "ibv_open_device failed (errno " << errno << ", device name << " << devName << ")"; throw std::runtime_error(err.str()); } - this->pd = ibv_alloc_pd(static_cast(this->ctx)); + this->pd = ibv_alloc_pd(reinterpret_cast(this->ctx)); if (this->pd == nullptr) { std::stringstream err; err << "ibv_alloc_pd failed (errno " << errno << ")"; @@ -456,18 +299,20 @@ IbCtx::IbCtx(const std::string& ibDevName) IbCtx::~IbCtx() { + this->mrs.clear(); + this->qps.clear(); if (this->pd != nullptr) { - ibv_dealloc_pd(static_cast(this->pd)); + ibv_dealloc_pd(reinterpret_cast(this->pd)); } if (this->ctx != nullptr) { - ibv_close_device(static_cast(this->ctx)); + ibv_close_device(reinterpret_cast(this->ctx)); } } bool IbCtx::isPortUsable(int port) const { struct ibv_port_attr portAttr; - if (ibv_query_port(static_cast(this->ctx), port, &portAttr) != 0) { + if (ibv_query_port(reinterpret_cast(this->ctx), port, &portAttr) != 0) { std::stringstream err; err << "ibv_query_port failed (errno " << errno << ", port << " << port << ")"; throw std::runtime_error(err.str()); @@ -479,7 +324,7 @@ bool IbCtx::isPortUsable(int port) const int IbCtx::getAnyActivePort() const { struct ibv_device_attr devAttr; - if (ibv_query_device(static_cast(this->ctx), &devAttr) != 0) { + if (ibv_query_device(reinterpret_cast(this->ctx), &devAttr) != 0) { std::stringstream err; err << "ibv_query_device failed (errno " << errno << ")"; throw std::runtime_error(err.str()); @@ -506,4 +351,89 @@ IbQp* IbCtx::createQp(int port /*=-1*/) return qps.back().get(); } +const IbMr* IbCtx::registerMr(void* buff, std::size_t size) +{ + mrs.emplace_back(new IbMr(this->pd, buff, size)); + 
return mrs.back().get(); +} + +const std::string& IbCtx::getDevName() const +{ + return this->devName; +} + +int getIBDeviceCount() { + int num; + ibv_get_device_list(&num); + return num; +} + +std::string getIBDeviceName(TransportFlags ibTransport) { + int num; + struct ibv_device** devices = ibv_get_device_list(&num); + int ibTransportIndex; + switch (ibTransport) { // TODO: get rid of this ugly switch + case TransportIB0: + ibTransportIndex = 0; + break; + case TransportIB1: + ibTransportIndex = 1; + break; + case TransportIB2: + ibTransportIndex = 2; + break; + case TransportIB3: + ibTransportIndex = 3; + break; + case TransportIB4: + ibTransportIndex = 4; + break; + case TransportIB5: + ibTransportIndex = 5; + break; + case TransportIB6: + ibTransportIndex = 6; + break; + case TransportIB7: + ibTransportIndex = 7; + break; + default: + throw std::runtime_error("Not an IB transport"); + } + if (ibTransportIndex >= num) { + throw std::runtime_error("IB transport out of range"); + } + return devices[ibTransportIndex]->name; +} + +TransportFlags getIBTransportByDeviceName(const std::string& ibDeviceName) { + int num; + struct ibv_device** devices = ibv_get_device_list(&num); + for (int i = 0; i < num; ++i) { + if (ibDeviceName == devices[i]->name) { + switch (i) { // TODO: get rid of this ugly switch + case 0: + return TransportIB0; + case 1: + return TransportIB1; + case 2: + return TransportIB2; + case 3: + return TransportIB3; + case 4: + return TransportIB4; + case 5: + return TransportIB5; + case 6: + return TransportIB6; + case 7: + return TransportIB7; + default: + throw std::runtime_error("IB device index out of range"); + } + } + } + throw std::runtime_error("IB device not found"); +} + } // namespace mscclpp diff --git a/src/include/comm.h b/src/include/comm.h index 8275e0cb..dce724fa 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -7,9 +7,10 @@ #ifndef MSCCLPP_COMM_H_ #define MSCCLPP_COMM_H_ -#include "ib.h" +#include "ib.hpp" #include 
"proxy.h" #include +#include #define MAXCONNECTIONS 64 @@ -31,7 +32,7 @@ struct mscclppConn std::vector bufferRegistrations; std::vector remoteBufferRegistrations; - struct mscclppIbContext* ibCtx; + mscclpp::IbCtx* ibCtx; #if defined(ENABLE_NPKIT) std::vector npkitUsedReqIds; std::vector npkitFreeReqIds; @@ -57,7 +58,7 @@ struct mscclppComm // Flag to ask MSCCLPP kernels to abort volatile uint32_t* abortFlag; - struct mscclppIbContext* ibContext[MSCCLPP_IB_MAX_DEVS]; + std::unique_ptr ibContext[MSCCLPP_IB_MAX_DEVS]; struct mscclppProxyState* proxyState[MSCCLPP_PROXY_MAX_NUM]; }; diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index 879501c0..37abb31b 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -5,7 +5,7 @@ #include "mscclpp.h" #include "channel.hpp" #include "proxy.hpp" -#include "ib.h" +#include "ib.hpp" #include namespace mscclpp { @@ -15,13 +15,13 @@ class ConnectionBase; struct Communicator::Impl { mscclppComm_t comm; std::vector> connections; - std::unordered_map ibContexts; + std::unordered_map ibContexts; Impl(); ~Impl(); - mscclppIbContext* getIbContext(TransportFlags ibTransport); + IbCtx* getIbContext(TransportFlags ibTransport); }; } // namespace mscclpp diff --git a/src/include/connection.hpp b/src/include/connection.hpp index ac1dd6a1..dcf21362 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -3,7 +3,7 @@ #include "mscclpp.hpp" #include -#include "ib.h" +#include "ib.hpp" #include "communicator.hpp" namespace mscclpp { @@ -38,7 +38,7 @@ class IBConnection : public ConnectionBase { int tag; TransportFlags transport_; TransportFlags remoteTransport_; - mscclppIbQp* qp; + IbQp* qp; public: IBConnection(int remoteRank, int tag, TransportFlags transport, Communicator::Impl& commImpl); diff --git a/src/include/ib.h b/src/include/ib.h deleted file mode 100644 index 7494ab11..00000000 --- a/src/include/ib.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef MSCCLPP_IB_H_ -#define 
MSCCLPP_IB_H_ - -#include "mscclpp.h" -#include -#include -#include -#include - -#define MSCCLPP_IB_CQ_SIZE 1024 -#define MSCCLPP_IB_CQ_POLL_NUM 4 -#define MSCCLPP_IB_MAX_SENDS 64 -#define MSCCLPP_IB_MAX_DEVS 8 - -// QP info to be shared with the remote peer -struct mscclppIbQpInfo -{ - uint16_t lid; - uint8_t port; - uint8_t linkLayer; - uint32_t qpn; - uint64_t spn; - ibv_mtu mtu; -}; - -// IB queue pair -struct mscclppIbQp -{ - struct ibv_qp* qp; - struct mscclppIbQpInfo info; - struct ibv_send_wr* wrs; - struct ibv_sge* sges; - struct ibv_cq* cq; - struct ibv_wc* wcs; - int wrn; - - int rtr(const mscclppIbQpInfo* info); - int rts(); - int stageSend(struct mscclppIbMr* ibMr, const mscclppIbMrInfo* info, uint32_t size, uint64_t wrId, uint64_t srcOffset, - uint64_t dstOffset, bool signaled); - int stageSendWithImm(struct mscclppIbMr* ibMr, const mscclppIbMrInfo* info, uint32_t size, uint64_t wrId, - uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData); - int postSend(); - int postRecv(uint64_t wrId); - int pollCq(); -}; - -// Holds resources of a single IB device. 
-struct mscclppIbContext -{ - struct ibv_context* ctx; - struct ibv_pd* pd; - int* ports; - int nPorts; - struct mscclppIbQp* qps; - int nQps; - int maxQps; - struct mscclppIbMr* mrs; - int nMrs; - int maxMrs; -}; - -mscclppResult_t mscclppIbContextCreate(struct mscclppIbContext** ctx, const char* ibDevName); -mscclppResult_t mscclppIbContextDestroy(struct mscclppIbContext* ctx); -mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext* ctx, struct mscclppIbQp** ibQp, int port = -1); -mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext* ctx, void* buff, size_t size, - struct mscclppIbMr** ibMr); - -#endif diff --git a/src/include/ib.hpp b/src/include/ib.hpp index 85c92af7..d04b75bd 100644 --- a/src/include/ib.hpp +++ b/src/include/ib.hpp @@ -5,8 +5,38 @@ #include #include +#define MSCCLPP_IB_CQ_SIZE 1024 +#define MSCCLPP_IB_CQ_POLL_NUM 1 +#define MSCCLPP_IB_MAX_SENDS 64 +#define MSCCLPP_IB_MAX_DEVS 8 + namespace mscclpp { +struct IbMrInfo +{ + uint64_t addr; + uint32_t rkey; +}; + +class IbMr +{ +public: + ~IbMr(); + + IbMrInfo getInfo() const; + const void* getBuff() const; + uint32_t getLkey() const; + +private: + IbMr(void* pd, void* buff, std::size_t size); + + void* mr; + void* buff; + std::size_t size; + + friend class IbCtx; +}; + // QP info to be shared with the remote peer struct IbQpInfo { @@ -15,7 +45,7 @@ struct IbQpInfo uint8_t linkLayer; uint32_t qpn; uint64_t spn; - uint32_t mtu; + int mtu; }; class IbQp @@ -23,11 +53,22 @@ class IbQp public: ~IbQp(); - IbQpInfo info; + void rtr(const IbQpInfo& info); + void rts(); + int stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled); + int stageSendWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData); + void postSend(); + void postRecv(uint64_t wrId); + int pollCq(); + + const IbQpInfo& getInfo() const; + 
const void* getWc(int idx) const; private: IbQp(void* ctx, void* pd, int port); + IbQpInfo info; + void* qp; void* cq; void* wcs; @@ -38,22 +79,26 @@ private: friend class IbCtx; }; - class IbCtx { public: - IbCtx(const std::string& ibDevName); + IbCtx(const std::string& devName); ~IbCtx(); IbQp* createQp(int port = -1); + const IbMr* registerMr(void* buff, std::size_t size); + + const std::string& getDevName() const; private: bool isPortUsable(int port) const; int getAnyActivePort() const; + const std::string devName; void* ctx; void* pd; std::list> qps; + std::list> mrs; }; } // namespace mscclpp diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 6f96af10..c01246ab 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -207,25 +207,10 @@ typedef struct char internal[MSCCLPP_UNIQUE_ID_BYTES]; } mscclppUniqueId; -// MR info to be shared with the remote peer -struct mscclppIbMrInfo -{ - uint64_t addr; - uint32_t rkey; -}; - -// IB memory region -struct mscclppIbMr -{ - struct ibv_mr* mr; - void* buff; - struct mscclppIbMrInfo info; -}; - struct mscclppRegisteredMemoryP2P { void* remoteBuff; - mscclppIbMr* IbMr; + const void* IbMr; }; struct mscclppRegisteredMemory diff --git a/src/include/proxy.h b/src/include/proxy.h index 3da0196c..3746806b 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -59,7 +59,7 @@ struct mscclppProxyState mscclppProxyRunState_t run; int numaNodeToBind; - struct mscclppIbContext* ibContext; // For IB connection only + mscclpp::IbCtx* ibContext; // For IB connection only cudaStream_t p2pStream; // for P2P DMA engine only struct mscclppProxyFifo fifo; diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index 7a0ab1d0..d2270d46 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -3,7 +3,7 @@ #include "mscclpp.hpp" #include "mscclpp.h" -#include "ib.h" +#include "ib.hpp" #include "communicator.hpp" #include @@ -16,8 +16,8 @@ struct 
TransportInfo { bool ibLocal; union { cudaIpcMemHandle_t cudaIpcHandle; - mscclppIbMr* ibMr; - mscclppIbMrInfo ibMrInfo; + const IbMr* ibMr; + IbMrInfo ibMrInfo; }; }; diff --git a/src/init.cc b/src/init.cc index 7cf159c8..c5b6a66b 100644 --- a/src/init.cc +++ b/src/init.cc @@ -7,6 +7,7 @@ #include "gdr.h" #endif #include "mscclpp.h" +#include "infiniband/verbs.h" #include #include #include @@ -191,7 +192,7 @@ MSCCLPP_API mscclppResult_t mscclppCommDestroy(mscclppComm_t comm) for (int i = 0; i < MSCCLPP_IB_MAX_DEVS; ++i) { if (comm->ibContext[i]) { - MSCCLPPCHECK(mscclppIbContextDestroy(comm->ibContext[i])); + comm->ibContext[i].reset(nullptr); } } @@ -366,24 +367,17 @@ struct mscclppHostIBConn : mscclppHostConn } void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, uint64_t dataSize) { - this->ibQp->stageSend(this->ibMrs[src], &this->remoteIbMrInfos[dst], (uint32_t)dataSize, + this->ibQp->stageSend(this->ibMrs[src], this->remoteIbMrInfos[dst], (uint32_t)dataSize, /*wrId=*/0, /*srcOffset=*/srcDataOffset, /*dstOffset=*/dstDataOffset, /*signaled=*/false); - int ret = this->ibQp->postSend(); - if (ret != 0) { - // Return value is errno. 
- WARN("data postSend failed: errno %d", ret); - } + this->ibQp->postSend(); npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_DATA_ENTRY, (uint32_t)dataSize); } void signal() { // My local device flag is copied to the remote's proxy flag - this->ibQp->stageSend(this->ibMrs[0], &this->remoteIbMrInfos[0], sizeof(uint64_t), + this->ibQp->stageSend(this->ibMrs[0], this->remoteIbMrInfos[0], sizeof(uint64_t), /*wrId=*/0, /*srcOffset=*/0, /*dstOffset=*/sizeof(uint64_t), /*signaled=*/true); - int ret = this->ibQp->postSend(); - if (ret != 0) { - WARN("flag postSend failed: errno %d", ret); - } + this->ibQp->postSend(); npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_FLAG_ENTRY, (uint32_t)sizeof(uint64_t)); } void wait() @@ -399,15 +393,11 @@ struct mscclppHostIBConn : mscclppHostConn continue; } for (int i = 0; i < wcNum; ++i) { - struct ibv_wc* wc = &this->ibQp->wcs[i]; + struct ibv_wc* wc = (struct ibv_wc*)this->ibQp->getWc(i); if (wc->status != IBV_WC_SUCCESS) { WARN("wc status %d", wc->status); continue; } - if (wc->qp_num != this->ibQp->qp->qp_num) { - WARN("got wc of unknown qp_num %d", wc->qp_num); - continue; - } if (wc->opcode == IBV_WC_RDMA_WRITE) { isWaiting = false; break; @@ -418,9 +408,9 @@ struct mscclppHostIBConn : mscclppHostConn } mscclppConn* conn; - struct mscclppIbQp* ibQp; - std::vector ibMrs; - std::vector remoteIbMrInfos; + mscclpp::IbQp* ibQp; + std::vector ibMrs; + std::vector remoteIbMrInfos; }; MSCCLPP_API mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, mscclppTransport_t transportType, const char* ibDev) @@ -458,7 +448,7 @@ MSCCLPP_API mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int if (firstNullIdx == -1) { firstNullIdx = i; } - } else if (strncmp(comm->ibContext[i]->ctx->device->name, ibDev, IBV_SYSFS_NAME_MAX) == 0) { + } else if (strncmp(comm->ibContext[i]->getDevName().c_str(), ibDev, IBV_SYSFS_NAME_MAX) == 0) { ibDevIdx = i; break; } @@ -468,13 +458,10 @@ MSCCLPP_API 
mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int if (ibDevIdx == -1) { // Create a new context. ibDevIdx = firstNullIdx; - if (mscclppIbContextCreate(&comm->ibContext[ibDevIdx], ibDev) != mscclppSuccess) { - WARN("Failed to create IB context"); - return mscclppInternalError; - } + comm->ibContext[ibDevIdx].reset(new mscclpp::IbCtx(std::string(ibDev))); } // Set the ib context for this conn - conn->ibCtx = comm->ibContext[ibDevIdx]; + conn->ibCtx = comm->ibContext[ibDevIdx].get(); } else if (transportType == mscclppTransportP2P) { // do the rest of the initialization later @@ -609,17 +596,17 @@ MSCCLPP_API mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t com struct mscclppBufferRegistrationInfo { cudaIpcMemHandle_t cudaHandle; - mscclppIbMrInfo ibMrInfo; + mscclpp::IbMrInfo ibMrInfo; uint64_t size; }; struct connInfo { - mscclppIbQpInfo infoQp; + mscclpp::IbQpInfo infoQp; std::vector bufferInfos; struct header { - mscclppIbQpInfo infoQp; + mscclpp::IbQpInfo infoQp; int numBufferInfos; }; @@ -702,22 +689,20 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output devConn->remoteBuff = NULL; devConn->remoteSignalEpochId = NULL; - struct mscclppIbContext* ibCtx = conn->ibCtx; + mscclpp::IbCtx* ibCtx = conn->ibCtx; if (hostConn->ibQp == NULL) { - MSCCLPPCHECK(mscclppIbContextCreateQp(ibCtx, &hostConn->ibQp)); + hostConn->ibQp = ibCtx->createQp(); } // Add all registered buffers for (const auto &bufReg : conn->bufferRegistrations) { - hostConn->ibMrs.emplace_back(); - MSCCLPPCHECK(mscclppIbContextRegisterMr(ibCtx, bufReg.data, - sizeof(struct mscclppDevConnSignalEpochId), &hostConn->ibMrs.back())); + hostConn->ibMrs.emplace_back(ibCtx->registerMr(bufReg.data, sizeof(struct mscclppDevConnSignalEpochId))); connInfo->bufferInfos.emplace_back(); - connInfo->bufferInfos.back().ibMrInfo = hostConn->ibMrs.back()->info; + connInfo->bufferInfos.back().ibMrInfo = hostConn->ibMrs.back()->getInfo(); 
connInfo->bufferInfos.back().size = bufReg.size; } - connInfo->infoQp = hostConn->ibQp->info; + connInfo->infoQp = hostConn->ibQp->getInfo(); return mscclppSuccess; } @@ -728,14 +713,8 @@ mscclppResult_t mscclppIbConnectionSetupEnd(struct connInfo* connInfo /*input*/, return mscclppInternalError; } struct mscclppHostIBConn* hostConn = (struct mscclppHostIBConn*)conn->hostConn; - if (hostConn->ibQp->rtr(&connInfo->infoQp) != 0) { - WARN("Failed to transition QP to RTR"); - return mscclppInvalidUsage; - } - if (hostConn->ibQp->rts() != 0) { - WARN("Failed to transition QP to RTS"); - return mscclppInvalidUsage; - } + hostConn->ibQp->rtr(connInfo->infoQp); + hostConn->ibQp->rts(); // No remote pointers to set with IB, so we just set the Mrs @@ -788,25 +767,25 @@ MSCCLPP_API mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) struct bufferInfo { cudaIpcMemHandle_t handleBuff; - mscclppIbMrInfo infoBuffMr; + mscclpp::IbMrInfo infoBuffMr; }; MSCCLPP_API mscclppResult_t mscclppRegisterBuffer(mscclppComm_t comm, void* local_memory, size_t size, mscclppRegisteredMemory* regMem) { - std::vector ibMrs; + std::vector ibMrs; for (int i = 0; i < comm->nConns; ++i) { struct mscclppConn* conn = &comm->conns[i]; struct bufferInfo bInfo; - struct mscclppIbMr* ibBuffMr; + const mscclpp::IbMr* ibBuffMr; // TODO: (conn->transport & mscclppTransportP2P) to support both P2P and IB if (conn->transport == mscclppTransportP2P) { CUDACHECK(cudaIpcGetMemHandle(&bInfo.handleBuff, local_memory)); } else if (conn->transport == mscclppTransportIB) { - MSCCLPPCHECK(mscclppIbContextRegisterMr(conn->ibCtx, local_memory, size, &ibBuffMr)); - bInfo.infoBuffMr = ibBuffMr->info; - ibMrs.push_back(ibBuffMr); + ibBuffMr = conn->ibCtx->registerMr(local_memory, size); + bInfo.infoBuffMr = ibBuffMr->getInfo(); + ibMrs.emplace_back(ibBuffMr); } MSCCLPPCHECK(bootstrapSend(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &bInfo, sizeof(bInfo))); diff --git a/src/proxy.cc b/src/proxy.cc 
index 6cfd799b..c8bf4414 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -2,7 +2,7 @@ #include "checks.h" #include "comm.h" #include "debug.h" -#include "ib.h" +#include "ib.hpp" #include "socket.h" #include diff --git a/src/registered_memory.cc b/src/registered_memory.cc index d9476e4f..f0db85ce 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -18,8 +18,7 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags t auto addIb = [&](TransportFlags ibTransport) { TransportInfo transportInfo; transportInfo.transport = ibTransport; - mscclppIbMr* mr; - MSCCLPPTHROW(mscclppIbContextRegisterMr(commImpl.getIbContext(ibTransport), data, size, &mr)); + const IbMr* mr = commImpl.getIbContext(ibTransport)->registerMr(data, size); transportInfo.ibMr = mr; transportInfo.ibLocal = true; this->transportInfos.push_back(transportInfo); @@ -103,7 +102,7 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) { it += sizeof(handle); transportInfo.cudaIpcHandle = handle; } else if (transportInfo.transport & TransportAllIB) { - mscclppIbMrInfo info; + IbMrInfo info; std::copy_n(it, sizeof(info), reinterpret_cast(&info)); it += sizeof(info); transportInfo.ibMrInfo = info; diff --git a/tests/unittests/ib_test.cc b/tests/unittests/ib_test.cc index 2c194eaf..6f84398f 100644 --- a/tests/unittests/ib_test.cc +++ b/tests/unittests/ib_test.cc @@ -1,8 +1,10 @@ #include "alloc.h" #include "checks.h" -#include "ib.h" -#include +#include "ib.hpp" +#include "infiniband/verbs.h" +#include "mscclpp.hpp" #include +#include // Measure current time in second. 
static double getTime(void) @@ -24,8 +26,8 @@ int main(int argc, const char* argv[]) printf("Usage: %s <0(recv)/1(send)> \n", argv[0]); return 1; } - const char* ip_port = argv[1]; - int is_send = atoi(argv[2]); + const char* ipPortPair = argv[1]; + int isSend = atoi(argv[2]); int cudaDevId = atoi(argv[3]); std::string ibDevName = "mlx5_ib" + std::string(argv[4]); @@ -35,51 +37,40 @@ int main(int argc, const char* argv[]) int nelem = 1; MSCCLPPCHECK(mscclppCudaCalloc(&data, nelem)); - mscclppComm_t comm; - MSCCLPPCHECK(mscclppCommInitRank(&comm, 2, ip_port, is_send)); + std::shared_ptr bootstrap(new mscclpp::Bootstrap(isSend, 2)); + bootstrap->initialize(ipPortPair); - struct mscclppIbContext* ctx; - struct mscclppIbQp* qp; - struct mscclppIbMr* mr; - MSCCLPPCHECK(mscclppIbContextCreate(&ctx, ibDevName.c_str())); - MSCCLPPCHECK(mscclppIbContextCreateQp(ctx, &qp)); - MSCCLPPCHECK(mscclppIbContextRegisterMr(ctx, data, sizeof(int) * nelem, &mr)); + mscclpp::IbCtx ctx(ibDevName); + mscclpp::IbQp* qp = ctx.createQp(); + const mscclpp::IbMr* mr = ctx.registerMr(data, sizeof(int) * nelem); - struct mscclppIbQpInfo* qpInfo; - MSCCLPPCHECK(mscclppCalloc(&qpInfo, 2)); - qpInfo[is_send] = qp->info; + std::array qpInfo; + qpInfo[isSend] = qp->getInfo(); - struct mscclppIbMrInfo* mrInfo; - MSCCLPPCHECK(mscclppCalloc(&mrInfo, 2)); - mrInfo[is_send] = mr->info; + std::array mrInfo; + mrInfo[isSend] = mr->getInfo(); - MSCCLPPCHECK(mscclppBootstrapAllGather(comm, qpInfo, sizeof(struct mscclppIbQpInfo))); - MSCCLPPCHECK(mscclppBootstrapAllGather(comm, mrInfo, sizeof(struct mscclppIbMrInfo))); + bootstrap->allGather(qpInfo.data(), sizeof(mscclpp::IbQpInfo)); + bootstrap->allGather(mrInfo.data(), sizeof(mscclpp::IbMrInfo)); - for (int i = 0; i < 2; ++i) { - if (i == is_send) + for (int i = 0; i < bootstrap->getNranks(); ++i) { + if (i == isSend) continue; - qp->rtr(&qpInfo[i]); + qp->rtr(qpInfo[i]); qp->rts(); break; } printf("connection succeed\n"); - // A simple barrier - int* tmp; 
- MSCCLPPCHECK(mscclppCalloc(&tmp, 2)); - MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int))); + bootstrap->barrier(); - if (is_send) { + if (isSend) { int maxIter = 100000; double start = getTime(); for (int iter = 0; iter < maxIter; ++iter) { - qp->stageSend(mr, &mrInfo[0], sizeof(int) * nelem, 0, 0, 0, true); - if (qp->postSend() != 0) { - WARN("postSend failed"); - return 1; - } + qp->stageSend(mr, mrInfo[0], sizeof(int) * nelem, 0, 0, 0, true); + qp->postSend(); bool waiting = true; while (waiting) { int wcNum = qp->pollCq(); @@ -88,7 +79,7 @@ int main(int argc, const char* argv[]) return 1; } for (int i = 0; i < wcNum; ++i) { - struct ibv_wc* wc = &qp->wcs[i]; + const struct ibv_wc* wc = reinterpret_cast(qp->getWc(i)); if (wc->status != IBV_WC_SUCCESS) { WARN("wc status %d", wc->status); return 1; @@ -103,10 +94,7 @@ int main(int argc, const char* argv[]) } // A simple barrier - MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int))); - - MSCCLPPCHECK(mscclppIbContextDestroy(ctx)); - MSCCLPPCHECK(mscclppCommDestroy(comm)); + bootstrap->barrier(); return 0; } From 76410382468b78d90a31917f9b09e03bec8847bc Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 27 Apr 2023 04:15:24 +0000 Subject: [PATCH 068/135] wip --- src/communicator.cc | 5 +---- src/include/communicator.hpp | 3 ++- src/include/mscclpp.hpp | 31 +++---------------------------- tests/bootstrap_test_cpp.cc | 4 ++-- 4 files changed, 8 insertions(+), 35 deletions(-) diff --git a/src/communicator.cc b/src/communicator.cc index 9ce5b779..d905748a 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -11,16 +11,13 @@ namespace mscclpp { -Communicator::Impl::Impl() : comm(nullptr) {} +Communicator::Impl::Impl(std::shared_ptr bootstrap) : bootstrap_(bootstrap) {} Communicator::Impl::~Impl() { for (auto& entry : ibContexts) { mscclppIbContextDestroy(entry.second); } ibContexts.clear(); - if (comm) { - mscclppCommDestroy(comm); - } } mscclppIbContext* 
Communicator::Impl::getIbContext(TransportFlags ibTransport) { diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index 879501c0..7c5289ad 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -16,8 +16,9 @@ struct Communicator::Impl { mscclppComm_t comm; std::vector> connections; std::unordered_map ibContexts; + std::shared_ptr bootstrap_; - Impl(); + Impl(std::shared_ptr bootstrap); ~Impl(); diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index f2d8667e..f7e15872 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -120,24 +120,13 @@ protected: class Communicator { public: - /* Initialize the communicator. nranks processes with rank 0 to nranks-1 need to call this function. + /* Initialize the communicator. * * Inputs: - * nranks: number of ranks in the communicator - * ipPortPair: a string of the form "ip:port" that represents the address of the root process - * rank: rank of the calling process + * bootstrap: an implementation of the of BaseBootstrap that the communicator will use */ - Communicator(int nranks, const char* ipPortPair, int rank); + Communicator(std::shared_ptr bootstrap); - /* Initialize the communicator from a given UniqueId. Same as mscclppCommInitRank() except that - * id is provided by the user by calling getUniqueId() - * - * Inputs: - * nranks: number of ranks in the communicator - * id: the unique ID to be used for communication - * rank: rank of the calling process - */ - Communicator(int nranks, UniqueId id, int rank); ~Communicator(); @@ -183,20 +172,6 @@ public: */ void connectionSetup(); - /* Return the rank of the calling process. - * - * Outputs: - * rank: the rank of the calling process - */ - int rank(); - - /* Return the number of ranks of the communicator. 
- * - * Outputs: - * size: the number of ranks of the communicator - */ - int size(); - struct Impl; private: std::unique_ptr pimpl; diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index 34e58b59..6c29e369 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -55,11 +55,11 @@ void test_sendrecv(std::shared_ptr bootstrap){ void test_all(std::shared_ptr bootstrap){ test_allgather(bootstrap); test_barrier(bootstrap); - // test_sendrecv(bootstrap); + test_sendrecv(bootstrap); } void test_mscclpp_bootstrap_with_id(int rank, int worldSize){ - std::shared_ptr bootstrap(new mscclpp::Bootstrap(rank, worldSize)); + auto bootstrap = std::make_shared(rank, worldSize); mscclpp::UniqueId id; if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId(); From c24896b62f4d7e906bf2121303837cbae0bd3abd Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 27 Apr 2023 04:23:44 +0000 Subject: [PATCH 069/135] bootstrap to the communicator --- src/communicator.cc | 22 +--------------------- tests/bootstrap_test_cpp.cc | 4 ++-- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/src/communicator.cc b/src/communicator.cc index 81753fb6..02ee7a87 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -54,15 +54,7 @@ static mscclppTransport_t transportToCStyle(TransportFlags flags) { } } -MSCCLPP_API_CPP Communicator::Communicator(int nranks, const char* ipPortPair, int rank) : pimpl(std::make_unique()) { - mscclppCommInitRank(&pimpl->comm, nranks, ipPortPair, rank); -} - -MSCCLPP_API_CPP Communicator::Communicator(int nranks, UniqueId id, int rank) : pimpl(std::make_unique()) { - static_assert(sizeof(mscclppUniqueId) == sizeof(UniqueId), "UniqueId size mismatch"); - mscclppUniqueId *cstyle_id = reinterpret_cast(&id); - mscclppCommInitRankFromId(&pimpl->comm, nranks, *cstyle_id, rank); -} +MSCCLPP_API_CPP Communicator::Communicator(std::shared_ptr bootstrap) : pimpl(std::make_unique(bootstrap)) {} MSCCLPP_API_CPP void 
Communicator::bootstrapAllGather(void* data, int size) { mscclppBootstrapAllGather(pimpl->comm, data, size); @@ -100,16 +92,4 @@ MSCCLPP_API_CPP void Communicator::connectionSetup() { } } -MSCCLPP_API_CPP int Communicator::rank() { - int result; - mscclppCommRank(pimpl->comm, &result); - return result; -} - -MSCCLPP_API_CPP int Communicator::size() { - int result; - mscclppCommSize(pimpl->comm, &result); - return result; -} - } // namespace mscclpp diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index 6c29e369..bdde8467 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -24,7 +24,7 @@ void test_barrier(std::shared_ptr bootstrap){ void test_sendrecv(std::shared_ptr bootstrap){ for (int i = 0; i < bootstrap->getNranks(); i++) { - if (bootstrap->getRank() == 0) + if (bootstrap->getRank() == i) continue; int msg1 = (bootstrap->getRank() + 1) * 3; int msg2 = (bootstrap->getRank() + 1) * 3 + 1; @@ -35,7 +35,7 @@ void test_sendrecv(std::shared_ptr bootstrap){ } for (int i = 0; i < bootstrap->getNranks(); i++) { - if (i == bootstrap->getRank()) + if (bootstrap->getRank() == i) continue; int msg1 = 0; int msg2 = 0; From b0c7e869099a5d45222dadda83ea4caee33d2f2b Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 27 Apr 2023 05:01:07 +0000 Subject: [PATCH 070/135] Communicator owns IB contexts --- src/communicator.cc | 10 +++------- src/include/communicator.hpp | 3 ++- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/communicator.cc b/src/communicator.cc index 02ee7a87..c24b0c5e 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -15,9 +15,6 @@ namespace mscclpp { Communicator::Impl::Impl(std::shared_ptr bootstrap) : bootstrap_(bootstrap) {} Communicator::Impl::~Impl() { - for (auto& entry : ibContexts) { - delete entry.second; - } ibContexts.clear(); } @@ -26,11 +23,10 @@ IbCtx* Communicator::Impl::getIbContext(TransportFlags ibTransport) { auto it = ibContexts.find(ibTransport); if (it == 
ibContexts.end()) { auto ibDev = getIBDeviceName(ibTransport); - IbCtx* ibCtx = new IbCtx(ibDev); - ibContexts[ibTransport] = ibCtx; - return ibCtx; + ibContexts[ibTransport] = std::make_unique(ibDev); + return ibContexts[ibTransport].get(); } else { - return it->second; + return it->second.get(); } } diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index 3c3737ae..53d0fd73 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -7,6 +7,7 @@ #include "proxy.hpp" #include "ib.hpp" #include +#include namespace mscclpp { @@ -15,7 +16,7 @@ class ConnectionBase; struct Communicator::Impl { mscclppComm_t comm; std::vector> connections; - std::unordered_map ibContexts; + std::unordered_map> ibContexts; std::shared_ptr bootstrap_; Impl(std::shared_ptr bootstrap); From df80d8854bdd8bf89773053a42209eff3784b11e Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 27 Apr 2023 05:26:08 +0000 Subject: [PATCH 071/135] connect test --- Makefile | 2 +- tests/communicator_test_cpp.cc | 48 ++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 tests/communicator_test_cpp.cc diff --git a/Makefile b/Makefile index b2d2cceb..950751d7 100644 --- a/Makefile +++ b/Makefile @@ -149,7 +149,7 @@ UTOBJTARGETS := $(UTOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) UTBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(UTOBJS)) TESTSDIR := tests -TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test_standalone.cu bootstrap_test_cpp.cc) # allgather_test_cpp.cu +TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test_standalone.cu bootstrap_test_cpp.cc communicator_test_cpp.cc) # allgather_test_cpp.cu TESTSOBJS := $(patsubst %.cc,%.o,$(TESTSSRCS)) $(patsubst %.cu,%.o,$(TESTSSRCS)) TESTSOBJTARGETS := $(TESTSOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) TESTSBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(TESTSOBJS)) diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc new file 
mode 100644 index 00000000..fc3a72e8 --- /dev/null +++ b/tests/communicator_test_cpp.cc @@ -0,0 +1,48 @@ +#include "mscclpp.hpp" + +#include +#include +#include +#include + +void test_communicator(int rank, int worldSize, int nranksPerNode){ + auto bootstrap = std::make_shared(rank, worldSize); + mscclpp::UniqueId id; + if (bootstrap->getRank() == 0) + id = bootstrap->createUniqueId(); + MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + bootstrap->initialize(id); + + auto communicator = std::make_shared(bootstrap); + for (int i = 0; i < worldSize; i++){ + if (i != rank){ + if (i % nranksPerNode == rank % nranksPerNode) + auto connect = communicator->connect(i, 0, mscclpp::TransportCudaIpc); + else + auto connect = communicator->connect(i, 0, mscclpp::TransportAllIB); + } + } + + if (bootstrap->getRank() == 0) + std::cout << "--- MSCCLPP::Communicator tests passed! ---" << std::endl; +} + + +int main(int argc, char **argv) +{ + int rank, worldSize; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &worldSize); + MPI_Comm shmcomm; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm); + int shmWorldSize; + MPI_Comm_size(shmcomm, &shmWorldSize); + int nranksPerNode = shmWorldSize; + MPI_Comm_free(&shmcomm); + + test_communicator(rank, worldSize, nranksPerNode); + + MPI_Finalize(); + return 0; +} \ No newline at end of file From 8eda6369ee2b71bfd92e34af021574f7356cfffe Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 27 Apr 2023 06:08:35 +0000 Subject: [PATCH 072/135] testing connection setup --- src/communicator.cc | 4 ++-- src/connection.cc | 10 +++++----- src/ib.cc | 2 +- src/include/connection.hpp | 12 ++++++------ src/include/ib.hpp | 2 +- tests/communicator_test_cpp.cc | 22 +++++++++++++++++++--- 6 files changed, 34 insertions(+), 18 deletions(-) diff --git a/src/communicator.cc b/src/communicator.cc index c24b0c5e..7e1348e8 100644 --- a/src/communicator.cc +++ 
b/src/communicator.cc @@ -81,10 +81,10 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank MSCCLPP_API_CPP void Communicator::connectionSetup() { for (auto& conn : pimpl->connections) { - conn->startSetup(*this); + conn->startSetup(pimpl->bootstrap_); } for (auto& conn : pimpl->connections) { - conn->endSetup(*this); + conn->endSetup(pimpl->bootstrap_); } } diff --git a/src/connection.cc b/src/connection.cc index 1e21694c..fc653c2a 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -54,7 +54,7 @@ void CudaIpcConnection::flush() { // IBConnection -IBConnection::IBConnection(int remoteRank, int tag, TransportFlags transport, Communicator::Impl& commImpl) : remoteRank(remoteRank), tag(tag), transport_(transport), remoteTransport_(TransportNone) { +IBConnection::IBConnection(int remoteRank, int tag, TransportFlags transport, Communicator::Impl& commImpl) : remoteRank_(remoteRank), tag_(tag), transport_(transport), remoteTransport_(TransportNone) { qp = commImpl.getIbContext(transport)->createQp(); } @@ -114,15 +114,15 @@ void IBConnection::flush() { // npkitCollectExitEvents(conn, NPKIT_EVENT_IB_SEND_EXIT); } -void IBConnection::startSetup(Communicator& comm) { +void IBConnection::startSetup(std::shared_ptr bootstrap) { // TODO(chhwang): temporarily disabled to compile - // comm.bootstrap().send(&qp->getInfo(), sizeof(qp->getInfo()), remoteRank, tag); + bootstrap->send(&qp->getInfo(), sizeof(qp->getInfo()), remoteRank_, tag_); } -void IBConnection::endSetup(Communicator& comm) { +void IBConnection::endSetup(std::shared_ptr bootstrap) { IbQpInfo qpInfo; // TODO(chhwang): temporarily disabled to compile - // comm.bootstrap().recv(&qpInfo, sizeof(qpInfo), remoteRank, tag); + bootstrap->recv(&qpInfo, sizeof(qpInfo), remoteRank_, tag_); qp->rtr(qpInfo); qp->rts(); } diff --git a/src/ib.cc b/src/ib.cc index 4dc0285b..fe3334a3 100644 --- a/src/ib.cc +++ b/src/ib.cc @@ -263,7 +263,7 @@ int IbQp::pollCq() return ibv_poll_cq(reinterpret_cast(this->cq), 
MSCCLPP_IB_CQ_POLL_NUM, reinterpret_cast(this->wcs)); } -const IbQpInfo& IbQp::getInfo() const +IbQpInfo& IbQp::getInfo() { return this->info; } diff --git a/src/include/connection.hpp b/src/include/connection.hpp index dcf21362..132726f7 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -12,8 +12,8 @@ namespace mscclpp { class ConnectionBase : public Connection { public: - virtual void startSetup(Communicator&) {}; - virtual void endSetup(Communicator&) {}; + virtual void startSetup(std::shared_ptr bootstrap) {}; + virtual void endSetup(std::shared_ptr bootstrap) {}; }; class CudaIpcConnection : public ConnectionBase { @@ -34,8 +34,8 @@ public: }; class IBConnection : public ConnectionBase { - int remoteRank; - int tag; + int remoteRank_; + int tag_; TransportFlags transport_; TransportFlags remoteTransport_; IbQp* qp; @@ -53,9 +53,9 @@ public: void flush() override; - void startSetup(Communicator& comm) override; + void startSetup(std::shared_ptr bootstrap) override; - void endSetup(Communicator& comm) override; + void endSetup(std::shared_ptr bootstrap) override; }; } // namespace mscclpp diff --git a/src/include/ib.hpp b/src/include/ib.hpp index d04b75bd..b1baeb75 100644 --- a/src/include/ib.hpp +++ b/src/include/ib.hpp @@ -61,7 +61,7 @@ public: void postRecv(uint64_t wrId); int pollCq(); - const IbQpInfo& getInfo() const; + IbQpInfo& getInfo(); const void* getWc(int idx) const; private: diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc index fc3a72e8..d3fe15b0 100644 --- a/tests/communicator_test_cpp.cc +++ b/tests/communicator_test_cpp.cc @@ -5,6 +5,20 @@ #include #include +mscclpp::TransportFlags findIb(int localRank){ + mscclpp::TransportFlags IBs[] = { + mscclpp::TransportIB0, + mscclpp::TransportIB1, + mscclpp::TransportIB2, + mscclpp::TransportIB3, + mscclpp::TransportIB4, + mscclpp::TransportIB5, + mscclpp::TransportIB6, + mscclpp::TransportIB7 + }; + return IBs[localRank]; +} + void 
test_communicator(int rank, int worldSize, int nranksPerNode){ auto bootstrap = std::make_shared(rank, worldSize); mscclpp::UniqueId id; @@ -16,12 +30,14 @@ void test_communicator(int rank, int worldSize, int nranksPerNode){ auto communicator = std::make_shared(bootstrap); for (int i = 0; i < worldSize; i++){ if (i != rank){ - if (i % nranksPerNode == rank % nranksPerNode) + if (i % nranksPerNode == rank % nranksPerNode){ auto connect = communicator->connect(i, 0, mscclpp::TransportCudaIpc); - else - auto connect = communicator->connect(i, 0, mscclpp::TransportAllIB); + } else { + auto connect = communicator->connect(i, 0, findIb(rank % nranksPerNode)); + } } } + communicator->connectionSetup(); if (bootstrap->getRank() == 0) std::cout << "--- MSCCLPP::Communicator tests passed! ---" << std::endl; From 06c6df2350427fb9b7d955075b95c2705c3326d3 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 27 Apr 2023 19:06:35 +0000 Subject: [PATCH 073/135] Separate out Transport and TransportFlags --- src/communicator.cc | 26 +---- src/connection.cc | 20 ++-- src/ib.cc | 36 +++---- src/include/communicator.hpp | 4 +- src/include/connection.hpp | 14 +-- src/include/mscclpp.hpp | 151 ++++++++++++++++++++++++++---- src/include/registered_memory.hpp | 4 +- src/registered_memory.cc | 36 +++---- tests/communicator_test_cpp.cc | 22 ++--- 9 files changed, 204 insertions(+), 109 deletions(-) diff --git a/src/communicator.cc b/src/communicator.cc index 7e1348e8..1420c51d 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -18,7 +18,7 @@ Communicator::Impl::~Impl() { ibContexts.clear(); } -IbCtx* Communicator::Impl::getIbContext(TransportFlags ibTransport) { +IbCtx* Communicator::Impl::getIbContext(Transport ibTransport) { // Find IB context or create it auto it = ibContexts.find(ibTransport); if (it == ibContexts.end()) { @@ -32,24 +32,6 @@ IbCtx* Communicator::Impl::getIbContext(TransportFlags ibTransport) { MSCCLPP_API_CPP Communicator::~Communicator() = default; 
-static mscclppTransport_t transportToCStyle(TransportFlags flags) { - switch (flags) { - case TransportIB0: - case TransportIB1: - case TransportIB2: - case TransportIB3: - case TransportIB4: - case TransportIB5: - case TransportIB6: - case TransportIB7: - return mscclppTransportIB; - case TransportCudaIpc: - return mscclppTransportP2P; - default: - throw std::runtime_error("Unsupported conversion"); - } -} - MSCCLPP_API_CPP Communicator::Communicator(std::shared_ptr bootstrap) : pimpl(std::make_unique(bootstrap)) {} MSCCLPP_API_CPP void Communicator::bootstrapAllGather(void* data, int size) { @@ -64,12 +46,12 @@ RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportF return RegisteredMemory(std::make_shared(ptr, size, pimpl->comm->rank, transports, *pimpl)); } -MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, TransportFlags transport) { +MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, Transport transport) { std::shared_ptr conn; - if (transport | TransportCudaIpc) { + if (transport == Transport::CudaIpc) { auto cudaIpcConn = std::make_shared(); conn = cudaIpcConn; - } else if (transport | TransportAllIB) { + } else if (AllIBTransports.has(transport)) { auto ibConn = std::make_shared(remoteRank, tag, transport, *pimpl); conn = ibConn; } else { diff --git a/src/connection.cc b/src/connection.cc index fc653c2a..031f63ec 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -6,8 +6,8 @@ namespace mscclpp { -void validateTransport(RegisteredMemory mem, TransportFlags transport) { - if ((mem.transports() & transport) == TransportNone) { +void validateTransport(RegisteredMemory mem, Transport transport) { + if (!mem.transports().has(transport)) { throw std::runtime_error("mem does not support transport"); } } @@ -28,12 +28,12 @@ CudaIpcConnection::~CudaIpcConnection() { cudaStreamDestroy(stream); } -TransportFlags CudaIpcConnection::transport() { - return TransportCudaIpc; 
+Transport CudaIpcConnection::transport() { + return Transport::CudaIpc; } -TransportFlags CudaIpcConnection::remoteTransport() { - return TransportCudaIpc; +Transport CudaIpcConnection::remoteTransport() { + return Transport::CudaIpc; } void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) { @@ -54,7 +54,7 @@ void CudaIpcConnection::flush() { // IBConnection -IBConnection::IBConnection(int remoteRank, int tag, TransportFlags transport, Communicator::Impl& commImpl) : remoteRank_(remoteRank), tag_(tag), transport_(transport), remoteTransport_(TransportNone) { +IBConnection::IBConnection(int remoteRank, int tag, Transport transport, Communicator::Impl& commImpl) : remoteRank_(remoteRank), tag_(tag), transport_(transport), remoteTransport_(Transport::Unknown) { qp = commImpl.getIbContext(transport)->createQp(); } @@ -62,11 +62,11 @@ IBConnection::~IBConnection() { // TODO: Destroy QP? } -TransportFlags IBConnection::transport() { +Transport IBConnection::transport() { return transport_; } -TransportFlags IBConnection::remoteTransport() { +Transport IBConnection::remoteTransport() { return remoteTransport_; } @@ -115,13 +115,11 @@ void IBConnection::flush() { } void IBConnection::startSetup(std::shared_ptr bootstrap) { - // TODO(chhwang): temporarily disabled to compile bootstrap->send(&qp->getInfo(), sizeof(qp->getInfo()), remoteRank_, tag_); } void IBConnection::endSetup(std::shared_ptr bootstrap) { IbQpInfo qpInfo; - // TODO(chhwang): temporarily disabled to compile bootstrap->recv(&qpInfo, sizeof(qpInfo), remoteRank_, tag_); qp->rtr(qpInfo); qp->rts(); diff --git a/src/ib.cc b/src/ib.cc index fe3334a3..88d14d8e 100644 --- a/src/ib.cc +++ b/src/ib.cc @@ -368,33 +368,33 @@ int getIBDeviceCount() { return num; } -std::string getIBDeviceName(TransportFlags ibTransport) { +std::string getIBDeviceName(Transport ibTransport) { int num; struct ibv_device** devices = ibv_get_device_list(&num); int 
ibTransportIndex; switch (ibTransport) { // TODO: get rid of this ugly switch - case TransportIB0: + case Transport::IB0: ibTransportIndex = 0; break; - case TransportIB1: + case Transport::IB1: ibTransportIndex = 1; break; - case TransportIB2: + case Transport::IB2: ibTransportIndex = 2; break; - case TransportIB3: + case Transport::IB3: ibTransportIndex = 3; break; - case TransportIB4: + case Transport::IB4: ibTransportIndex = 4; break; - case TransportIB5: + case Transport::IB5: ibTransportIndex = 5; break; - case TransportIB6: + case Transport::IB6: ibTransportIndex = 6; break; - case TransportIB7: + case Transport::IB7: ibTransportIndex = 7; break; default: @@ -406,28 +406,28 @@ std::string getIBDeviceName(TransportFlags ibTransport) { return devices[ibTransportIndex]->name; } -TransportFlags getIBTransportByDeviceName(const std::string& ibDeviceName) { +Transport getIBTransportByDeviceName(const std::string& ibDeviceName) { int num; struct ibv_device** devices = ibv_get_device_list(&num); for (int i = 0; i < num; ++i) { if (ibDeviceName == devices[i]->name) { switch (i) { // TODO: get rid of this ugly switch case 0: - return TransportIB0; + return Transport::IB0; case 1: - return TransportIB1; + return Transport::IB1; case 2: - return TransportIB2; + return Transport::IB2; case 3: - return TransportIB3; + return Transport::IB3; case 4: - return TransportIB4; + return Transport::IB4; case 5: - return TransportIB5; + return Transport::IB5; case 6: - return TransportIB6; + return Transport::IB6; case 7: - return TransportIB7; + return Transport::IB7; default: throw std::runtime_error("IB device index out of range"); } diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index 53d0fd73..8ca4e952 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -16,14 +16,14 @@ class ConnectionBase; struct Communicator::Impl { mscclppComm_t comm; std::vector> connections; - std::unordered_map> ibContexts; + std::unordered_map> 
ibContexts; std::shared_ptr bootstrap_; Impl(std::shared_ptr bootstrap); ~Impl(); - IbCtx* getIbContext(TransportFlags ibTransport); + IbCtx* getIbContext(Transport ibTransport); }; } // namespace mscclpp diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 132726f7..bd08802c 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -24,9 +24,9 @@ public: ~CudaIpcConnection(); - TransportFlags transport() override; + Transport transport() override; - TransportFlags remoteTransport() override; + Transport remoteTransport() override; void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) override; @@ -36,18 +36,18 @@ public: class IBConnection : public ConnectionBase { int remoteRank_; int tag_; - TransportFlags transport_; - TransportFlags remoteTransport_; + Transport transport_; + Transport remoteTransport_; IbQp* qp; public: - IBConnection(int remoteRank, int tag, TransportFlags transport, Communicator::Impl& commImpl); + IBConnection(int remoteRank, int tag, Transport transport, Communicator::Impl& commImpl); ~IBConnection(); - TransportFlags transport() override; + Transport transport() override; - TransportFlags remoteTransport() override; + Transport remoteTransport() override; void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) override; diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index f14e19c1..3b9c6d8d 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace mscclpp { @@ -63,24 +64,129 @@ private: */ std::unique_ptr getUniqueId(); -using TransportFlags = uint32_t; -const TransportFlags TransportNone = 0b0; -const TransportFlags TransportCudaIpc = 0b1; -const TransportFlags TransportIB0 = 0b10; -const TransportFlags TransportIB1 = 0b100; -const TransportFlags TransportIB2 = 0b1000; -const TransportFlags TransportIB3 
= 0b10000; -const TransportFlags TransportIB4 = 0b100000; -const TransportFlags TransportIB5 = 0b1000000; -const TransportFlags TransportIB6 = 0b10000000; -const TransportFlags TransportIB7 = 0b100000000; +enum class Transport { + Unknown, + CudaIpc, + IB0, + IB1, + IB2, + IB3, + IB4, + IB5, + IB6, + IB7, + NumTransports +}; -const TransportFlags TransportAll = 0b111111111; -const TransportFlags TransportAllIB = 0b111111110; +namespace detail { + const size_t TransportFlagsSize = 10; + static_assert(TransportFlagsSize == static_cast(Transport::NumTransports), "TransportFlagsSize must match the number of transports"); + using TransportFlagsBase = std::bitset; +} + +class TransportFlags : private detail::TransportFlagsBase { +public: + TransportFlags() = default; + TransportFlags(Transport transport) : detail::TransportFlagsBase(1 << static_cast(transport)) {} + + bool has(Transport transport) const { + return detail::TransportFlagsBase::test(static_cast(transport)); + } + + bool none() const { + return detail::TransportFlagsBase::none(); + } + + bool any() const { + return detail::TransportFlagsBase::any(); + } + + bool all() const { + return detail::TransportFlagsBase::all(); + } + + size_t count() const { + return detail::TransportFlagsBase::count(); + } + + TransportFlags& operator|=(TransportFlags other) { + detail::TransportFlagsBase::operator|=(other); + return *this; + } + + TransportFlags operator|(TransportFlags other) const { + return TransportFlags(*this) |= other; + } + + TransportFlags operator|(Transport transport) const { + return *this | TransportFlags(transport); + } + + TransportFlags& operator&=(TransportFlags other) { + detail::TransportFlagsBase::operator&=(other); + return *this; + } + + TransportFlags operator&(TransportFlags other) const { + return TransportFlags(*this) &= other; + } + + TransportFlags operator&(Transport transport) const { + return *this & TransportFlags(transport); + } + + TransportFlags& operator^=(TransportFlags other) { 
+ detail::TransportFlagsBase::operator^=(other); + return *this; + } + + TransportFlags operator^(TransportFlags other) const { + return TransportFlags(*this) ^= other; + } + + TransportFlags operator^(Transport transport) const { + return *this ^ TransportFlags(transport); + } + + TransportFlags operator~() const { + return TransportFlags(*this).flip(); + } + + bool operator==(TransportFlags other) const { + return detail::TransportFlagsBase::operator==(other); + } + + bool operator!=(TransportFlags other) const { + return detail::TransportFlagsBase::operator!=(other); + } + + detail::TransportFlagsBase toBitset() const { + return *this; + } + +private: + TransportFlags(detail::TransportFlagsBase bitset) : detail::TransportFlagsBase(bitset) {} +}; + +inline TransportFlags operator|(Transport transport1, Transport transport2) { + return TransportFlags(transport1) | transport2; +} + +inline TransportFlags operator&(Transport transport1, Transport transport2) { + return TransportFlags(transport1) & transport2; +} + +inline TransportFlags operator^(Transport transport1, Transport transport2) { + return TransportFlags(transport1) ^ transport2; +} + +const TransportFlags NoTransports = TransportFlags(); +const TransportFlags AllIBTransports = Transport::IB0 | Transport::IB1 | Transport::IB2 | Transport::IB3 | Transport::IB4 | Transport::IB5 | Transport::IB6 | Transport::IB7; +const TransportFlags AllTransports = AllIBTransports | Transport::CudaIpc; int getIBDeviceCount(); -std::string getIBDeviceName(TransportFlags ibTransport); -TransportFlags getIBTransportByDeviceName(const std::string& ibDeviceName); +std::string getIBDeviceName(Transport ibTransport); +Transport getIBTransportByDeviceName(const std::string& ibDeviceName); class Communicator; class Connection; @@ -111,9 +217,9 @@ public: virtual void flush() = 0; - virtual TransportFlags transport() = 0; + virtual Transport transport() = 0; - virtual TransportFlags remoteTransport() = 0; + virtual Transport 
remoteTransport() = 0; protected: static std::shared_ptr getRegisteredMemoryImpl(RegisteredMemory&); @@ -166,7 +272,7 @@ public: * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. */ - std::shared_ptr connect(int remoteRank, int tag, TransportFlags transport); + std::shared_ptr connect(int remoteRank, int tag, Transport transport); /* Establish all connections declared by connect(). This function must be called after all connect() * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. @@ -180,4 +286,13 @@ private: } // namespace mscclpp +namespace std { + template <> + struct hash { + size_t operator()(const mscclpp::TransportFlags& flags) const { + return hash()(flags.toBitset()); + } + }; +} + #endif // MSCCLPP_H_ diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index d2270d46..afe42da4 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -10,7 +10,7 @@ namespace mscclpp { struct TransportInfo { - TransportFlags transport; + Transport transport; // TODO: rewrite this using std::variant or something bool ibLocal; @@ -31,7 +31,7 @@ struct RegisteredMemory::Impl { Impl(void* data, size_t size, int rank, TransportFlags transports, Communicator::Impl& commImpl); Impl(const std::vector& data); - TransportInfo& getTransportInfo(TransportFlags transport) { + TransportInfo& getTransportInfo(Transport transport) { for (auto& entry : transportInfos) { if (entry.transport == transport) { return entry; diff --git a/src/registered_memory.cc b/src/registered_memory.cc index f0db85ce..b26ea2d5 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -5,17 +5,17 @@ namespace mscclpp { RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags transports, Communicator::Impl& commImpl) : data(data), 
size(size), rank(rank), transports(transports) { - if (transports & TransportCudaIpc) { + if (transports.has(Transport::CudaIpc)) { TransportInfo transportInfo; - transportInfo.transport = TransportCudaIpc; + transportInfo.transport = Transport::CudaIpc; cudaIpcMemHandle_t handle; // TODO: translate data to a base pointer CUDATHROW(cudaIpcGetMemHandle(&handle, data)); transportInfo.cudaIpcHandle = handle; this->transportInfos.push_back(transportInfo); } - if (transports & TransportAllIB) { - auto addIb = [&](TransportFlags ibTransport) { + if ((transports & AllIBTransports).any()) { + auto addIb = [&](Transport ibTransport) { TransportInfo transportInfo; transportInfo.transport = ibTransport; const IbMr* mr = commImpl.getIbContext(ibTransport)->registerMr(data, size); @@ -23,14 +23,14 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags t transportInfo.ibLocal = true; this->transportInfos.push_back(transportInfo); }; - if (transports & TransportIB0) addIb(TransportIB0); - if (transports & TransportIB1) addIb(TransportIB1); - if (transports & TransportIB2) addIb(TransportIB2); - if (transports & TransportIB3) addIb(TransportIB3); - if (transports & TransportIB4) addIb(TransportIB4); - if (transports & TransportIB5) addIb(TransportIB5); - if (transports & TransportIB6) addIb(TransportIB6); - if (transports & TransportIB7) addIb(TransportIB7); + if (transports.has(Transport::IB0)) addIb(Transport::IB0); + if (transports.has(Transport::IB1)) addIb(Transport::IB1); + if (transports.has(Transport::IB2)) addIb(Transport::IB2); + if (transports.has(Transport::IB3)) addIb(Transport::IB3); + if (transports.has(Transport::IB4)) addIb(Transport::IB4); + if (transports.has(Transport::IB5)) addIb(Transport::IB5); + if (transports.has(Transport::IB6)) addIb(Transport::IB6); + if (transports.has(Transport::IB7)) addIb(Transport::IB7); } } @@ -66,9 +66,9 @@ std::vector RegisteredMemory::serialize() { std::copy_n(reinterpret_cast(&transportCount), 
sizeof(transportCount), std::back_inserter(result)); for (auto& entry : pimpl->transportInfos) { std::copy_n(reinterpret_cast(&entry.transport), sizeof(entry.transport), std::back_inserter(result)); - if (entry.transport == TransportCudaIpc) { + if (entry.transport == Transport::CudaIpc) { std::copy_n(reinterpret_cast(&entry.cudaIpcHandle), sizeof(entry.cudaIpcHandle), std::back_inserter(result)); - } else if (entry.transport & TransportAllIB) { + } else if (AllIBTransports.has(entry.transport)) { std::copy_n(reinterpret_cast(&entry.ibMrInfo), sizeof(entry.ibMrInfo), std::back_inserter(result)); } else { throw std::runtime_error("Unknown transport"); @@ -96,12 +96,12 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) { TransportInfo transportInfo; std::copy_n(it, sizeof(transportInfo.transport), reinterpret_cast(&transportInfo.transport)); it += sizeof(transportInfo.transport); - if (transportInfo.transport & TransportCudaIpc) { + if (transportInfo.transport == Transport::CudaIpc) { cudaIpcMemHandle_t handle; std::copy_n(it, sizeof(handle), reinterpret_cast(&handle)); it += sizeof(handle); transportInfo.cudaIpcHandle = handle; - } else if (transportInfo.transport & TransportAllIB) { + } else if (AllIBTransports.has(transportInfo.transport)) { IbMrInfo info; std::copy_n(it, sizeof(info), reinterpret_cast(&info)); it += sizeof(info); @@ -116,8 +116,8 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) { throw std::runtime_error("Deserialization failed"); } - if (transports & TransportCudaIpc) { - auto entry = getTransportInfo(TransportCudaIpc); + if (transports.has(Transport::CudaIpc)) { + auto entry = getTransportInfo(Transport::CudaIpc); CUDATHROW(cudaIpcOpenMemHandle(&data, entry.cudaIpcHandle, cudaIpcMemLazyEnablePeerAccess)); } } diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc index d3fe15b0..9ca46988 100644 --- a/tests/communicator_test_cpp.cc +++ b/tests/communicator_test_cpp.cc @@ -5,16 +5,16 @@ 
#include #include -mscclpp::TransportFlags findIb(int localRank){ - mscclpp::TransportFlags IBs[] = { - mscclpp::TransportIB0, - mscclpp::TransportIB1, - mscclpp::TransportIB2, - mscclpp::TransportIB3, - mscclpp::TransportIB4, - mscclpp::TransportIB5, - mscclpp::TransportIB6, - mscclpp::TransportIB7 +mscclpp::Transport findIb(int localRank){ + mscclpp::Transport IBs[] = { + mscclpp::Transport::IB0, + mscclpp::Transport::IB1, + mscclpp::Transport::IB2, + mscclpp::Transport::IB3, + mscclpp::Transport::IB4, + mscclpp::Transport::IB5, + mscclpp::Transport::IB6, + mscclpp::Transport::IB7 }; return IBs[localRank]; } @@ -31,7 +31,7 @@ void test_communicator(int rank, int worldSize, int nranksPerNode){ for (int i = 0; i < worldSize; i++){ if (i != rank){ if (i % nranksPerNode == rank % nranksPerNode){ - auto connect = communicator->connect(i, 0, mscclpp::TransportCudaIpc); + auto connect = communicator->connect(i, 0, mscclpp::Transport::CudaIpc); } else { auto connect = communicator->connect(i, 0, findIb(rank % nranksPerNode)); } From aaa3f0e94521c5c3ec24a67b6a39e6aa31c71917 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 27 Apr 2023 19:17:19 +0000 Subject: [PATCH 074/135] host hashes in communicator --- src/communicator.cc | 20 +++++++++++++++++++- src/include/communicator.hpp | 1 + tests/communicator_test_cpp.cc | 3 ++- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/communicator.cc b/src/communicator.cc index 7e1348e8..6f458fe5 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -1,3 +1,5 @@ +#include + #include "mscclpp.hpp" #include "communicator.hpp" #include "host_connection.hpp" @@ -12,7 +14,13 @@ namespace mscclpp { -Communicator::Impl::Impl(std::shared_ptr bootstrap) : bootstrap_(bootstrap) {} +Communicator::Impl::Impl(std::shared_ptr bootstrap) : bootstrap_(bootstrap) { + rankToHash_.resize(bootstrap->getNranks()); + auto hostHash = getHostHash(); + INFO(MSCCLPP_INIT, "Host hash: %lx", hostHash); + 
rankToHash_[bootstrap->getRank()] = hostHash; + bootstrap->allGather(rankToHash_.data(), sizeof(uint64_t)); +} Communicator::Impl::~Impl() { ibContexts.clear(); @@ -67,11 +75,21 @@ RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportF MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, TransportFlags transport) { std::shared_ptr conn; if (transport | TransportCudaIpc) { + // sanity check: make sure the IPC connection is being made within a node + if (pimpl->rankToHash_[remoteRank] != pimpl->rankToHash_[pimpl->bootstrap_->getRank()]) { + std::stringstream ss; + ss << "Cuda IPC connection can only be made within a node: " << remoteRank << " != " << pimpl->bootstrap_->getRank(); + throw std::runtime_error(ss.str()); + } auto cudaIpcConn = std::make_shared(); conn = cudaIpcConn; + INFO(MSCCLPP_INIT, "Cuda IPC connection between %d(%lx) and %d(%lx) created", pimpl->bootstrap_->getRank(), pimpl->rankToHash_[pimpl->bootstrap_->getRank()], + remoteRank, pimpl->rankToHash_[remoteRank]); } else if (transport | TransportAllIB) { auto ibConn = std::make_shared(remoteRank, tag, transport, *pimpl); conn = ibConn; + INFO(MSCCLPP_INIT, "IB connection between %d(%lx) via %s and %d(%lx) created", pimpl->bootstrap_->getRank(), pimpl->rankToHash_[pimpl->bootstrap_->getRank()], + getIBDeviceName(transport).c_str(), remoteRank, pimpl->rankToHash_[remoteRank]); } else { throw std::runtime_error("Unsupported transport"); } diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index 53d0fd73..5be00a67 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -18,6 +18,7 @@ struct Communicator::Impl { std::vector> connections; std::unordered_map> ibContexts; std::shared_ptr bootstrap_; + std::vector rankToHash_; Impl(std::shared_ptr bootstrap); diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc index d3fe15b0..05595313 100644 --- a/tests/communicator_test_cpp.cc +++ 
b/tests/communicator_test_cpp.cc @@ -30,7 +30,8 @@ void test_communicator(int rank, int worldSize, int nranksPerNode){ auto communicator = std::make_shared(bootstrap); for (int i = 0; i < worldSize; i++){ if (i != rank){ - if (i % nranksPerNode == rank % nranksPerNode){ + if (i / nranksPerNode == rank / nranksPerNode){ + printf("i %d rank %d nranksPerNode %d\n", i, rank, nranksPerNode); auto connect = communicator->connect(i, 0, mscclpp::TransportCudaIpc); } else { auto connect = communicator->connect(i, 0, findIb(rank % nranksPerNode)); From afc5887da20a24a0d0ec03a2be6f309e12a44d54 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 27 Apr 2023 20:32:06 +0000 Subject: [PATCH 075/135] moving the debug info into other levels --- src/communicator.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/communicator.cc b/src/communicator.cc index 726efbc8..bdccf8eb 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -66,10 +66,12 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank } auto cudaIpcConn = std::make_shared(); conn = cudaIpcConn; + INFO(MSCCLPP_P2P, "Cuda IPC connection between rank %d(%lx) and remoteRank %d(%lx) created", pimpl->bootstrap_->getRank(), pimpl->rankToHash_[pimpl->bootstrap_->getRank()], + remoteRank, pimpl->rankToHash_[remoteRank]); } else if (AllIBTransports.has(transport)) { auto ibConn = std::make_shared(remoteRank, tag, transport, *pimpl); conn = ibConn; - INFO(MSCCLPP_INIT, "IB connection between %d(%lx) via %s and %d(%lx) created", pimpl->bootstrap_->getRank(), pimpl->rankToHash_[pimpl->bootstrap_->getRank()], + INFO(MSCCLPP_NET, "IB connection between rank %d(%lx) via %s and remoteRank %d(%lx) created", pimpl->bootstrap_->getRank(), pimpl->rankToHash_[pimpl->bootstrap_->getRank()], getIBDeviceName(transport).c_str(), remoteRank, pimpl->rankToHash_[remoteRank]); } else { throw std::runtime_error("Unsupported transport"); From 82c27625e604c7ccd3d138adefddf1778b0e0e09 Mon Sep 17 00:00:00 
2001 From: Saeed Maleki Date: Thu, 27 Apr 2023 21:33:15 +0000 Subject: [PATCH 076/135] ipc uses a base ptr now --- Makefile | 2 +- src/basic_proxy_handler.cc | 8 +- src/bootstrap/bootstrap.cc | 10 +- src/communicator.cc | 61 ++++++---- src/connection.cc | 60 ++++++---- src/epoch.cc | 13 +- src/fifo.cc | 29 +++-- src/host_connection.cc | 55 ++++++--- src/ib.cc | 131 +++++++++++--------- src/include/basic_proxy_handler.hpp | 4 +- src/include/channel.hpp | 70 +++++++---- src/include/checks.hpp | 11 ++ src/include/comm.h | 4 +- src/include/communicator.hpp | 11 +- src/include/connection.hpp | 27 +++-- src/include/epoch.hpp | 11 +- src/include/host_connection.hpp | 7 +- src/include/ib.hpp | 8 +- src/include/mscclpp.h | 10 +- src/include/mscclpp.hpp | 180 +++++++++++++++++----------- src/include/mscclppfifo.hpp | 25 ++-- src/include/proxy.h | 2 +- src/include/proxy.hpp | 10 +- src/include/registered_memory.hpp | 15 ++- src/include/registered_ptr.hpp | 34 ++++-- src/init.cc | 47 +++++--- src/proxy_cpp.cc | 28 +++-- src/registered_memory.cc | 60 +++++++--- tests/allgather_test_cpp.cu | 43 +++---- tests/bootstrap_test_cpp.cc | 52 +++++--- tests/communicator_test_cpp.cc | 32 ++--- tests/unittests/ib_test.cc | 2 +- 32 files changed, 650 insertions(+), 412 deletions(-) diff --git a/Makefile b/Makefile index 950751d7..41896041 100644 --- a/Makefile +++ b/Makefile @@ -61,7 +61,7 @@ endif NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xfatbin -compress-all # Use addprefix so that we can specify more than one path -NVLDFLAGS := -L$(CUDA_LIB) -lcudart -lrt +NVLDFLAGS := -L$(CUDA_LIB) -lcudart -lrt -lcuda ifeq ($(DEBUG), 0) NVCUFLAGS += -O3 diff --git a/src/basic_proxy_handler.cc b/src/basic_proxy_handler.cc index 482aa842..42470131 100644 --- a/src/basic_proxy_handler.cc +++ b/src/basic_proxy_handler.cc @@ -2,15 +2,17 @@ namespace mscclpp { -ProxyHandler makeBasicProxyHandler(Communicator::Impl &comm) { +ProxyHandler 
makeBasicProxyHandler(Communicator::Impl& comm) +{ return [&comm](ProxyTrigger triggerRaw) { - ChannelTrigger *trigger = reinterpret_cast(&triggerRaw); + ChannelTrigger* trigger = reinterpret_cast(&triggerRaw); HostConnection& conn = *comm.connections.at(trigger->fields.connId); auto result = ProxyHandlerResult::Continue; if (trigger->fields.type & mscclppData) { - conn.put(trigger->fields.dstBufferHandle, trigger->fields.dstOffset, trigger->fields.srcBufferHandle, trigger->fields.srcOffset, trigger->fields.size); + conn.put(trigger->fields.dstBufferHandle, trigger->fields.dstOffset, trigger->fields.srcBufferHandle, + trigger->fields.srcOffset, trigger->fields.size); } if (trigger->fields.type & mscclppFlag) { diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index dfce50b4..75225799 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -180,9 +180,8 @@ Bootstrap::Impl::~Impl() } } -void Bootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, - std::vector& rankAddresses, - std::vector& rankAddressesRoot, int& rank) +void Bootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, std::vector& rankAddresses, + std::vector& rankAddressesRoot, int& rank) { mscclppSocket sock; ExtInfo info; @@ -211,7 +210,7 @@ void Bootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, } void Bootstrap::Impl::sendHandleToPeer(int peer, const std::vector& rankAddresses, - const std::vector& rankAddressesRoot) + const std::vector& rankAddressesRoot) { mscclppSocket sock; int next = (peer + 1) % this->nRanks_; @@ -226,7 +225,8 @@ void Bootstrap::Impl::bootstrapCreateRoot() mscclppSocket listenSock; // mscclppSocket* listenSock = new mscclppSocket(); // TODO(saemal) make this a shared ptr - MSCCLPPTHROW(mscclppSocketInit(&listenSock, &uniqueId_.addr, uniqueId_.magic, mscclppSocketTypeBootstrap, nullptr, 0)); + MSCCLPPTHROW( + mscclppSocketInit(&listenSock, &uniqueId_.addr, uniqueId_.magic, mscclppSocketTypeBootstrap, nullptr, 0)); 
MSCCLPPTHROW(mscclppSocketListen(&listenSock)); MSCCLPPTHROW(mscclppSocketGetAddr(&listenSock, &uniqueId_.addr)); auto lambda = [this, listenSock]() { this->bootstrapRoot(listenSock); }; diff --git a/src/communicator.cc b/src/communicator.cc index bdccf8eb..78df252d 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -1,20 +1,21 @@ #include -#include "mscclpp.hpp" -#include "communicator.hpp" -#include "host_connection.hpp" -#include "comm.h" -#include "basic_proxy_handler.hpp" #include "api.h" -#include "utils.h" +#include "basic_proxy_handler.hpp" #include "checks.hpp" -#include "debug.h" +#include "comm.h" +#include "communicator.hpp" #include "connection.hpp" +#include "debug.h" +#include "host_connection.hpp" +#include "mscclpp.hpp" #include "registered_memory.hpp" +#include "utils.h" namespace mscclpp { -Communicator::Impl::Impl(std::shared_ptr bootstrap) : bootstrap_(bootstrap) { +Communicator::Impl::Impl(std::shared_ptr bootstrap) : bootstrap_(bootstrap) +{ rankToHash_.resize(bootstrap->getNranks()); auto hostHash = getHostHash(); INFO(MSCCLPP_INIT, "Host hash: %lx", hostHash); @@ -22,11 +23,13 @@ Communicator::Impl::Impl(std::shared_ptr bootstrap) : bootstrap_( bootstrap->allGather(rankToHash_.data(), sizeof(uint64_t)); } -Communicator::Impl::~Impl() { +Communicator::Impl::~Impl() +{ ibContexts.clear(); } -IbCtx* Communicator::Impl::getIbContext(Transport ibTransport) { +IbCtx* Communicator::Impl::getIbContext(Transport ibTransport) +{ // Find IB context or create it auto it = ibContexts.find(ibTransport); if (it == ibContexts.end()) { @@ -40,39 +43,50 @@ IbCtx* Communicator::Impl::getIbContext(Transport ibTransport) { MSCCLPP_API_CPP Communicator::~Communicator() = default; -MSCCLPP_API_CPP Communicator::Communicator(std::shared_ptr bootstrap) : pimpl(std::make_unique(bootstrap)) {} +MSCCLPP_API_CPP Communicator::Communicator(std::shared_ptr bootstrap) + : pimpl(std::make_unique(bootstrap)) +{ +} -MSCCLPP_API_CPP void 
Communicator::bootstrapAllGather(void* data, int size) { +MSCCLPP_API_CPP void Communicator::bootstrapAllGather(void* data, int size) +{ mscclppBootstrapAllGather(pimpl->comm, data, size); } -MSCCLPP_API_CPP void Communicator::bootstrapBarrier() { +MSCCLPP_API_CPP void Communicator::bootstrapBarrier() +{ mscclppBootstrapBarrier(pimpl->comm); } -RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) { +RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) +{ return RegisteredMemory(std::make_shared(ptr, size, pimpl->comm->rank, transports, *pimpl)); } -MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, Transport transport) { +MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, Transport transport) +{ std::shared_ptr conn; if (transport == Transport::CudaIpc) { // sanity check: make sure the IPC connection is being made within a node if (pimpl->rankToHash_[remoteRank] != pimpl->rankToHash_[pimpl->bootstrap_->getRank()]) { std::stringstream ss; - ss << "Cuda IPC connection can only be made within a node: " << remoteRank << "(" << std::hex << pimpl->rankToHash_[pimpl->bootstrap_->getRank()] << ")" << " != " - << pimpl->bootstrap_->getRank() << "(" << std::hex << pimpl->rankToHash_[pimpl->bootstrap_->getRank()] << ")"; + ss << "Cuda IPC connection can only be made within a node: " << remoteRank << "(" << std::hex + << pimpl->rankToHash_[pimpl->bootstrap_->getRank()] << ")" + << " != " << pimpl->bootstrap_->getRank() << "(" << std::hex + << pimpl->rankToHash_[pimpl->bootstrap_->getRank()] << ")"; throw std::runtime_error(ss.str()); - } + } auto cudaIpcConn = std::make_shared(); conn = cudaIpcConn; - INFO(MSCCLPP_P2P, "Cuda IPC connection between rank %d(%lx) and remoteRank %d(%lx) created", pimpl->bootstrap_->getRank(), pimpl->rankToHash_[pimpl->bootstrap_->getRank()], - remoteRank, pimpl->rankToHash_[remoteRank]); + 
INFO(MSCCLPP_P2P, "Cuda IPC connection between rank %d(%lx) and remoteRank %d(%lx) created", + pimpl->bootstrap_->getRank(), pimpl->rankToHash_[pimpl->bootstrap_->getRank()], remoteRank, + pimpl->rankToHash_[remoteRank]); } else if (AllIBTransports.has(transport)) { auto ibConn = std::make_shared(remoteRank, tag, transport, *pimpl); conn = ibConn; - INFO(MSCCLPP_NET, "IB connection between rank %d(%lx) via %s and remoteRank %d(%lx) created", pimpl->bootstrap_->getRank(), pimpl->rankToHash_[pimpl->bootstrap_->getRank()], - getIBDeviceName(transport).c_str(), remoteRank, pimpl->rankToHash_[remoteRank]); + INFO(MSCCLPP_NET, "IB connection between rank %d(%lx) via %s and remoteRank %d(%lx) created", + pimpl->bootstrap_->getRank(), pimpl->rankToHash_[pimpl->bootstrap_->getRank()], + getIBDeviceName(transport).c_str(), remoteRank, pimpl->rankToHash_[remoteRank]); } else { throw std::runtime_error("Unsupported transport"); } @@ -80,7 +94,8 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank return conn; } -MSCCLPP_API_CPP void Communicator::connectionSetup() { +MSCCLPP_API_CPP void Communicator::connectionSetup() +{ for (auto& conn : pimpl->connections) { conn->startSetup(pimpl->bootstrap_); } diff --git a/src/connection.cc b/src/connection.cc index 031f63ec..75a6ba79 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -1,12 +1,13 @@ #include "connection.hpp" #include "checks.hpp" -#include "registered_memory.hpp" -#include "npkit/npkit.h" #include "infiniband/verbs.h" +#include "npkit/npkit.h" +#include "registered_memory.hpp" namespace mscclpp { -void validateTransport(RegisteredMemory mem, Transport transport) { +void validateTransport(RegisteredMemory mem, Transport transport) +{ if (!mem.transports().has(transport)) { throw std::runtime_error("mem does not support transport"); } @@ -14,29 +15,36 @@ void validateTransport(RegisteredMemory mem, Transport transport) { // Connection -std::shared_ptr 
Connection::getRegisteredMemoryImpl(RegisteredMemory& mem) { +std::shared_ptr Connection::getRegisteredMemoryImpl(RegisteredMemory& mem) +{ return mem.pimpl; } // CudaIpcConnection -CudaIpcConnection::CudaIpcConnection() { +CudaIpcConnection::CudaIpcConnection() +{ cudaStreamCreate(&stream); } -CudaIpcConnection::~CudaIpcConnection() { +CudaIpcConnection::~CudaIpcConnection() +{ cudaStreamDestroy(stream); } -Transport CudaIpcConnection::transport() { +Transport CudaIpcConnection::transport() +{ return Transport::CudaIpc; } -Transport CudaIpcConnection::remoteTransport() { +Transport CudaIpcConnection::remoteTransport() +{ return Transport::CudaIpc; } -void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) { +void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, + uint64_t size) +{ validateTransport(dst, remoteTransport()); validateTransport(src, transport()); @@ -47,30 +55,38 @@ void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, Register // npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)size); } -void CudaIpcConnection::flush() { +void CudaIpcConnection::flush() +{ CUDATHROW(cudaStreamSynchronize(stream)); // npkitCollectExitEvents(conn, NPKIT_EVENT_DMA_SEND_EXIT); } // IBConnection -IBConnection::IBConnection(int remoteRank, int tag, Transport transport, Communicator::Impl& commImpl) : remoteRank_(remoteRank), tag_(tag), transport_(transport), remoteTransport_(Transport::Unknown) { +IBConnection::IBConnection(int remoteRank, int tag, Transport transport, Communicator::Impl& commImpl) + : remoteRank_(remoteRank), tag_(tag), transport_(transport), remoteTransport_(Transport::Unknown) +{ qp = commImpl.getIbContext(transport)->createQp(); } -IBConnection::~IBConnection() { +IBConnection::~IBConnection() +{ // TODO: Destroy QP? 
} -Transport IBConnection::transport() { +Transport IBConnection::transport() +{ return transport_; } -Transport IBConnection::remoteTransport() { +Transport IBConnection::remoteTransport() +{ return remoteTransport_; } -void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) { +void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, + uint64_t size) +{ validateTransport(dst, remoteTransport()); validateTransport(src, transport()); @@ -82,16 +98,18 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem if (!srcTransportInfo.ibLocal) { throw std::runtime_error("src is remote, which is not supported"); } - + auto dstMrInfo = dstTransportInfo.ibMrInfo; auto srcMr = srcTransportInfo.ibMr; - qp->stageSend(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, /*signaled=*/false); + qp->stageSend(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, + /*signaled=*/false); qp->postSend(); // npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_DATA_ENTRY, (uint32_t)size); } -void IBConnection::flush() { +void IBConnection::flush() +{ bool isWaiting = true; while (isWaiting) { int wcNum = qp->pollCq(); @@ -114,11 +132,13 @@ void IBConnection::flush() { // npkitCollectExitEvents(conn, NPKIT_EVENT_IB_SEND_EXIT); } -void IBConnection::startSetup(std::shared_ptr bootstrap) { +void IBConnection::startSetup(std::shared_ptr bootstrap) +{ bootstrap->send(&qp->getInfo(), sizeof(qp->getInfo()), remoteRank_, tag_); } -void IBConnection::endSetup(std::shared_ptr bootstrap) { +void IBConnection::endSetup(std::shared_ptr bootstrap) +{ IbQpInfo qpInfo; bootstrap->recv(&qpInfo, sizeof(qpInfo), remoteRank_, tag_); qp->rtr(qpInfo); diff --git a/src/epoch.cc b/src/epoch.cc index 1fee307e..f6c82731 100644 --- a/src/epoch.cc +++ b/src/epoch.cc @@ -3,20 +3,25 @@ 
namespace mscclpp { -struct Epoch::Impl { +struct Epoch::Impl +{ DeviceEpoch deviceEpoch; - Impl() { + Impl() + { MSCCLPPTHROW(mscclppCudaCalloc(&deviceEpoch.localSignalEpochId, 1)); MSCCLPPTHROW(mscclppCudaCalloc(&deviceEpoch.waitEpochId, 1)); } - ~Impl() { + ~Impl() + { MSCCLPPTHROW(mscclppCudaFree(deviceEpoch.localSignalEpochId)); MSCCLPPTHROW(mscclppCudaFree(deviceEpoch.waitEpochId)); } }; -Epoch::Epoch() : pimpl(std::make_unique()) {} +Epoch::Epoch() : pimpl(std::make_unique()) +{ +} } // namespace mscclpp \ No newline at end of file diff --git a/src/fifo.cc b/src/fifo.cc index fe7f12d3..c2fdd738 100644 --- a/src/fifo.cc +++ b/src/fifo.cc @@ -1,13 +1,14 @@ -#include "mscclppfifo.hpp" #include "alloc.h" #include "checks.hpp" +#include "mscclppfifo.hpp" #include -#include #include +#include namespace mscclpp { -struct HostProxyFifo::Impl { +struct HostProxyFifo::Impl +{ DeviceProxyFifo deviceFifo; // allocated on the host. Only accessed by the host. This is a copy of the @@ -23,7 +24,8 @@ struct HostProxyFifo::Impl { cudaStream_t stream; }; -HostProxyFifo::HostProxyFifo() { +HostProxyFifo::HostProxyFifo() +{ pimpl = std::make_unique(); MSCCLPPTHROW(mscclppCudaCalloc(&pimpl->deviceFifo.head, 1)); MSCCLPPTHROW(mscclppCudaHostCalloc(&pimpl->deviceFifo.triggers, MSCCLPP_PROXY_FIFO_SIZE)); @@ -32,35 +34,40 @@ HostProxyFifo::HostProxyFifo() { pimpl->hostTail = 0; } -HostProxyFifo::~HostProxyFifo() { +HostProxyFifo::~HostProxyFifo() +{ MSCCLPPTHROW(mscclppCudaFree(pimpl->deviceFifo.head)); MSCCLPPTHROW(mscclppCudaHostFree(pimpl->deviceFifo.triggers)); MSCCLPPTHROW(mscclppCudaFree(pimpl->deviceFifo.tailReplica)); CUDATHROW(cudaStreamDestroy(pimpl->stream)); } -void HostProxyFifo::poll(ProxyTrigger *trigger) { +void HostProxyFifo::poll(ProxyTrigger* trigger) +{ __m128i xmm0 = _mm_load_si128((__m128i*)&pimpl->deviceFifo.triggers[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]); _mm_store_si128((__m128i*)trigger, xmm0); } -void HostProxyFifo::pop() { +void HostProxyFifo::pop() 
+{ *(volatile uint64_t*)(&pimpl->deviceFifo.triggers[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0; (pimpl->hostTail)++; } -void HostProxyFifo::flushTail(bool sync) { +void HostProxyFifo::flushTail(bool sync) +{ // Flush the tail to device memory. This is either triggered every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER to make sure // that the fifo can make progress even if there is no request mscclppSync. However, mscclppSync type is for flush // request. - CUDATHROW( - cudaMemcpyAsync(pimpl->deviceFifo.tailReplica, &pimpl->hostTail, sizeof(uint64_t), cudaMemcpyHostToDevice, pimpl->stream)); + CUDATHROW(cudaMemcpyAsync(pimpl->deviceFifo.tailReplica, &pimpl->hostTail, sizeof(uint64_t), cudaMemcpyHostToDevice, + pimpl->stream)); if (sync) { CUDATHROW(cudaStreamSynchronize(pimpl->stream)); } } -DeviceProxyFifo HostProxyFifo::toDevice() { +DeviceProxyFifo HostProxyFifo::toDevice() +{ return pimpl->deviceFifo; } diff --git a/src/host_connection.cc b/src/host_connection.cc index 72e11ffc..e33069e2 100644 --- a/src/host_connection.cc +++ b/src/host_connection.cc @@ -1,52 +1,64 @@ #include "host_connection.hpp" -#include "communicator.hpp" +#include "api.h" #include "comm.h" +#include "communicator.hpp" #include "mscclpp.h" #include "mscclppfifo.h" -#include "api.h" namespace mscclpp { -HostConnection::Impl::Impl(Communicator* comm, mscclppConn* conn) : comm(comm), conn(conn) { +HostConnection::Impl::Impl(Communicator* comm, mscclppConn* conn) : comm(comm), conn(conn) +{ this->hostConn = conn->hostConn; } -HostConnection::Impl::~Impl() { +HostConnection::Impl::~Impl() +{ // TODO: figure out memory ownership. Does this deallocate the mscclppHostConn? Likely not. 
} MSCCLPP_API_CPP HostConnection::~HostConnection() = default; -MSCCLPP_API_CPP HostConnection::HostConnection(std::unique_ptr p) : pimpl(std::move(p)) {} +MSCCLPP_API_CPP HostConnection::HostConnection(std::unique_ptr p) : pimpl(std::move(p)) +{ +} -MSCCLPP_API_CPP int HostConnection::getId() { +MSCCLPP_API_CPP int HostConnection::getId() +{ return pimpl->conn->connId; } -MSCCLPP_API_CPP BufferHandle HostConnection::registerBuffer(void* data, uint64_t size) { +MSCCLPP_API_CPP BufferHandle HostConnection::registerBuffer(void* data, uint64_t size) +{ BufferHandle result; static_assert(sizeof(BufferHandle) == sizeof(mscclppBufferHandle_t)); - mscclppRegisterBufferForConnection(pimpl->comm->pimpl->comm, pimpl->conn->connId, data, size, reinterpret_cast(&result)); + mscclppRegisterBufferForConnection(pimpl->comm->pimpl->comm, pimpl->conn->connId, data, size, + reinterpret_cast(&result)); return result; } -MSCCLPP_API_CPP int HostConnection::numLocalBuffers() { +MSCCLPP_API_CPP int HostConnection::numLocalBuffers() +{ return pimpl->conn->bufferRegistrations.size() - 1; } -MSCCLPP_API_CPP BufferHandle HostConnection::getLocalBuffer(int index) { +MSCCLPP_API_CPP BufferHandle HostConnection::getLocalBuffer(int index) +{ return index + 1; } -MSCCLPP_API_CPP int HostConnection::numRemoteBuffers() { +MSCCLPP_API_CPP int HostConnection::numRemoteBuffers() +{ return pimpl->conn->remoteBufferRegistrations.size() - 1; } -MSCCLPP_API_CPP BufferHandle HostConnection::getRemoteBuffer(int index) { +MSCCLPP_API_CPP BufferHandle HostConnection::getRemoteBuffer(int index) +{ return index + 1; } -MSCCLPP_API_CPP ConnectionEpoch HostConnection::getEpoch() { +MSCCLPP_API_CPP ConnectionEpoch HostConnection::getEpoch() +{ ConnectionEpoch epoch; static_assert(sizeof(SignalEpochId) == sizeof(mscclppDevConnSignalEpochId)); epoch.localSignalEpochId = reinterpret_cast(pimpl->conn->devConn->localSignalEpochId); @@ -55,24 +67,29 @@ MSCCLPP_API_CPP ConnectionEpoch HostConnection::getEpoch() { return 
epoch; } - -MSCCLPP_API_CPP DeviceProxyFifo HostConnection::getDeviceFifo() { +MSCCLPP_API_CPP DeviceProxyFifo HostConnection::getDeviceFifo() +{ return pimpl->comm->pimpl->proxy.fifo().toDevice(); } -MSCCLPP_API_CPP void HostConnection::put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) { +MSCCLPP_API_CPP void HostConnection::put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, + uint64_t size) +{ pimpl->hostConn->put(dst, dstOffset, src, srcOffset, size); } -MSCCLPP_API_CPP void HostConnection::signal() { +MSCCLPP_API_CPP void HostConnection::signal() +{ pimpl->hostConn->signal(); } -MSCCLPP_API_CPP void HostConnection::flush() { +MSCCLPP_API_CPP void HostConnection::flush() +{ pimpl->hostConn->flush(); } -MSCCLPP_API_CPP void HostConnection::wait() { +MSCCLPP_API_CPP void HostConnection::wait() +{ pimpl->hostConn->wait(); } diff --git a/src/ib.cc b/src/ib.cc index 88d14d8e..ec7e95f2 100644 --- a/src/ib.cc +++ b/src/ib.cc @@ -1,16 +1,16 @@ #include #include #include -#include #include +#include #include -#include "mscclpp.hpp" #include "alloc.h" +#include "checks.hpp" #include "comm.h" #include "debug.h" #include "ib.hpp" -#include "checks.hpp" +#include "mscclpp.hpp" #include #include @@ -28,7 +28,9 @@ IbMr::IbMr(void* pd, void* buff, std::size_t size) : buff(buff) uintptr_t addr = reinterpret_cast(buff) & -pageSize; std::size_t pages = (size + (reinterpret_cast(buff) - addr) + pageSize - 1) / pageSize; struct ibv_pd* _pd = reinterpret_cast(pd); - struct ibv_mr* _mr = ibv_reg_mr(_pd, reinterpret_cast(addr), pages * pageSize, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_RELAXED_ORDERING); + struct ibv_mr* _mr = + ibv_reg_mr(_pd, reinterpret_cast(addr), pages * pageSize, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_RELAXED_ORDERING); if (_mr == nullptr) { std::stringstream err; err << "ibv_reg_mr failed (errno 
" << errno << ")"; @@ -164,7 +166,9 @@ void IbQp::rtr(const IbQpInfo& info) qp_attr.ah_attr.sl = 0; qp_attr.ah_attr.src_path_bits = 0; qp_attr.ah_attr.port_num = info.port; - int ret = ibv_modify_qp(reinterpret_cast(this->qp), &qp_attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER); + int ret = ibv_modify_qp(reinterpret_cast(this->qp), &qp_attr, + IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER); if (ret != 0) { std::stringstream err; err << "ibv_modify_qp failed (errno " << errno << ")"; @@ -182,7 +186,9 @@ void IbQp::rts() qp_attr.rnr_retry = 7; qp_attr.sq_psn = 0; qp_attr.max_rd_atomic = 1; - int ret = ibv_modify_qp(reinterpret_cast(this->qp), &qp_attr, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC); + int ret = ibv_modify_qp(reinterpret_cast(this->qp), &qp_attr, + IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC); if (ret != 0) { std::stringstream err; err << "ibv_modify_qp failed (errno " << errno << ")"; @@ -190,7 +196,8 @@ void IbQp::rts() } } -int IbQp::stageSend(const IbMr *mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled) +int IbQp::stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled) { if (this->wrn >= MSCCLPP_IB_MAX_SENDS) { return -1; @@ -219,7 +226,8 @@ int IbQp::stageSend(const IbMr *mr, const IbMrInfo& info, uint32_t size, uint64_ return this->wrn; } -int IbQp::stageSendWithImm(const IbMr *mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData) +int IbQp::stageSendWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t 
srcOffset, + uint64_t dstOffset, bool signaled, unsigned int immData) { int wrn = this->stageSend(mr, info, size, wrId, srcOffset, dstOffset, signaled); struct ibv_send_wr* wrs_ = reinterpret_cast(this->wrs); @@ -234,7 +242,8 @@ void IbQp::postSend() return; } struct ibv_send_wr* bad_wr; - int ret = ibv_post_send(reinterpret_cast(this->qp), reinterpret_cast(this->wrs), &bad_wr); + int ret = ibv_post_send(reinterpret_cast(this->qp), reinterpret_cast(this->wrs), + &bad_wr); if (ret != 0) { std::stringstream err; err << "ibv_post_send failed (errno " << errno << ")"; @@ -260,7 +269,8 @@ void IbQp::postRecv(uint64_t wrId) int IbQp::pollCq() { - return ibv_poll_cq(reinterpret_cast(this->cq), MSCCLPP_IB_CQ_POLL_NUM, reinterpret_cast(this->wcs)); + return ibv_poll_cq(reinterpret_cast(this->cq), MSCCLPP_IB_CQ_POLL_NUM, + reinterpret_cast(this->wcs)); } IbQpInfo& IbQp::getInfo() @@ -317,8 +327,8 @@ bool IbCtx::isPortUsable(int port) const err << "ibv_query_port failed (errno " << errno << ", port << " << port << ")"; throw std::runtime_error(err.str()); } - return portAttr.state == IBV_PORT_ACTIVE && (portAttr.link_layer == IBV_LINK_LAYER_ETHERNET || - portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND); + return portAttr.state == IBV_PORT_ACTIVE && + (portAttr.link_layer == IBV_LINK_LAYER_ETHERNET || portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND); } int IbCtx::getAnyActivePort() const @@ -362,43 +372,45 @@ const std::string& IbCtx::getDevName() const return this->devName; } -int getIBDeviceCount() { +int getIBDeviceCount() +{ int num; ibv_get_device_list(&num); return num; } -std::string getIBDeviceName(Transport ibTransport) { +std::string getIBDeviceName(Transport ibTransport) +{ int num; struct ibv_device** devices = ibv_get_device_list(&num); int ibTransportIndex; switch (ibTransport) { // TODO: get rid of this ugly switch - case Transport::IB0: - ibTransportIndex = 0; - break; - case Transport::IB1: - ibTransportIndex = 1; - break; - case Transport::IB2: - 
ibTransportIndex = 2; - break; - case Transport::IB3: - ibTransportIndex = 3; - break; - case Transport::IB4: - ibTransportIndex = 4; - break; - case Transport::IB5: - ibTransportIndex = 5; - break; - case Transport::IB6: - ibTransportIndex = 6; - break; - case Transport::IB7: - ibTransportIndex = 7; - break; - default: - throw std::runtime_error("Not an IB transport"); + case Transport::IB0: + ibTransportIndex = 0; + break; + case Transport::IB1: + ibTransportIndex = 1; + break; + case Transport::IB2: + ibTransportIndex = 2; + break; + case Transport::IB3: + ibTransportIndex = 3; + break; + case Transport::IB4: + ibTransportIndex = 4; + break; + case Transport::IB5: + ibTransportIndex = 5; + break; + case Transport::IB6: + ibTransportIndex = 6; + break; + case Transport::IB7: + ibTransportIndex = 7; + break; + default: + throw std::runtime_error("Not an IB transport"); } if (ibTransportIndex >= num) { throw std::runtime_error("IB transport out of range"); @@ -406,30 +418,31 @@ std::string getIBDeviceName(Transport ibTransport) { return devices[ibTransportIndex]->name; } -Transport getIBTransportByDeviceName(const std::string& ibDeviceName) { +Transport getIBTransportByDeviceName(const std::string& ibDeviceName) +{ int num; struct ibv_device** devices = ibv_get_device_list(&num); for (int i = 0; i < num; ++i) { if (ibDeviceName == devices[i]->name) { switch (i) { // TODO: get rid of this ugly switch - case 0: - return Transport::IB0; - case 1: - return Transport::IB1; - case 2: - return Transport::IB2; - case 3: - return Transport::IB3; - case 4: - return Transport::IB4; - case 5: - return Transport::IB5; - case 6: - return Transport::IB6; - case 7: - return Transport::IB7; - default: - throw std::runtime_error("IB device index out of range"); + case 0: + return Transport::IB0; + case 1: + return Transport::IB1; + case 2: + return Transport::IB2; + case 3: + return Transport::IB3; + case 4: + return Transport::IB4; + case 5: + return Transport::IB5; + case 6: + 
return Transport::IB6; + case 7: + return Transport::IB7; + default: + throw std::runtime_error("IB device index out of range"); } } } diff --git a/src/include/basic_proxy_handler.hpp b/src/include/basic_proxy_handler.hpp index 1c4b3f86..58e41930 100644 --- a/src/include/basic_proxy_handler.hpp +++ b/src/include/basic_proxy_handler.hpp @@ -1,12 +1,12 @@ #ifndef MSCCLPP_BASIC_PROXY_SERVICE_HPP_ #define MSCCLPP_BASIC_PROXY_SERVICE_HPP_ -#include "mscclpp.hpp" #include "communicator.hpp" +#include "mscclpp.hpp" namespace mscclpp { -ProxyHandler makeBasicProxyHandler(Communicator::Impl &comm); +ProxyHandler makeBasicProxyHandler(Communicator::Impl& comm); } diff --git a/src/include/channel.hpp b/src/include/channel.hpp index 10a5f601..2303a57c 100644 --- a/src/include/channel.hpp +++ b/src/include/channel.hpp @@ -1,8 +1,8 @@ #ifndef MSCCLPP_CHANNEL_HPP_ #define MSCCLPP_CHANNEL_HPP_ -#include "mscclpp.hpp" #include "epoch.hpp" +#include "mscclpp.hpp" #include "proxy.hpp" namespace mscclpp { @@ -18,7 +18,7 @@ const ChannelTriggerType channelTriggerFlag = 0x2; const ChannelTriggerType channelTriggerSync = 0x4; // This is just a numeric ID. 
Each HostConnection will have an internal array indexed by these handles -// mapping to the actual +// mapping to the actual using BufferHandle = uint32_t; #define MSCCLPP_BITS_SIZE 32 @@ -43,20 +43,32 @@ union ChannelTrigger { uint64_t dstBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; uint64_t type : MSCCLPP_BITS_TYPE; uint64_t connId : MSCCLPP_BITS_CONNID; - uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_TYPE); // ensure 64-bit alignment + uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_BUFFER_HANDLE - + MSCCLPP_BITS_TYPE); // ensure 64-bit alignment } fields; #ifdef __CUDACC__ - __device__ ChannelTrigger() {} - __device__ ChannelTrigger(ProxyTrigger value) : value(value) {} - __device__ ChannelTrigger(ChannelTriggerType type, BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size, int connectionId) { + __device__ ChannelTrigger() + { + } + __device__ ChannelTrigger(ProxyTrigger value) : value(value) + { + } + __device__ ChannelTrigger(ChannelTriggerType type, BufferHandle dst, uint64_t dstOffset, BufferHandle src, + uint64_t srcOffset, uint64_t size, int connectionId) + { value.fst = ((srcOffset << MSCCLPP_BITS_SIZE) + size); - value.snd = ((((((((connectionId << MSCCLPP_BITS_TYPE) + (uint64_t)type) << MSCCLPP_BITS_BUFFER_HANDLE) + dst) << MSCCLPP_BITS_BUFFER_HANDLE) + src) << MSCCLPP_BITS_OFFSET) + dstOffset); + value.snd = ((((((((connectionId << MSCCLPP_BITS_TYPE) + (uint64_t)type) << MSCCLPP_BITS_BUFFER_HANDLE) + dst) + << MSCCLPP_BITS_BUFFER_HANDLE) + + src) + << MSCCLPP_BITS_OFFSET) + + dstOffset); } #endif // __CUDACC__ }; -struct ConnectionEpoch { +struct ConnectionEpoch +{ #ifdef __CUDACC__ __forceinline__ __device__ void wait() { @@ -81,8 +93,10 @@ struct ConnectionEpoch { uint64_t* waitEpochId; }; -class HostConnection { +class HostConnection +{ struct Impl; + public: /* HostConnection can not be constructed from user code and 
must instead be created through Communicator::connect */ HostConnection(std::unique_ptr); @@ -103,7 +117,7 @@ public: * * Inputs: * index: the index of the handle to get - * + * * Returns: a handle to the buffer */ BufferHandle getLocalBuffer(int index); @@ -118,7 +132,7 @@ public: * * Inputs: * index: the index of the handle to get - * + * * Returns: a handle to the buffer on the remote peer */ BufferHandle getRemoteBuffer(int index); @@ -140,19 +154,22 @@ private: friend class Communicator; }; -struct DeviceConnection { +struct DeviceConnection +{ DeviceConnection() = default; DeviceConnection(HostConnection& hostConn) - : connectionId(hostConn.getId()), epoch(hostConn.getEpoch()), - fifo(hostConn.getDeviceFifo()) {} + : connectionId(hostConn.getId()), epoch(hostConn.getEpoch()), fifo(hostConn.getDeviceFifo()) + { + } DeviceConnection(const DeviceConnection& other) = default; DeviceConnection& operator=(DeviceConnection& other) = default; #ifdef __CUDACC__ - __forceinline__ __device__ void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) + __forceinline__ __device__ void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, + uint64_t size) { fifo.push(ChannelTrigger(channelTriggerData, dst, dstOffset, src, srcOffset, size, connectionId).value); } @@ -168,10 +185,13 @@ struct DeviceConnection { fifo.push(ChannelTrigger(channelTriggerFlag, 0, 0, 0, 0, 1, connectionId).value); } - __forceinline__ __device__ void putWithSignal(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) + __forceinline__ __device__ void putWithSignal(BufferHandle dst, uint64_t dstOffset, BufferHandle src, + uint64_t srcOffset, uint64_t size) { epochIncrement(); - fifo.push(ChannelTrigger(channelTriggerData | channelTriggerFlag, dst, dstOffset, src, srcOffset, size, connectionId).value); + fifo.push( + ChannelTrigger(channelTriggerData | channelTriggerFlag, dst, dstOffset, src, 
srcOffset, size, connectionId) + .value); } __forceinline__ __device__ void putWithSignal(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) @@ -179,16 +199,20 @@ struct DeviceConnection { putWithSignal(dst, offset, src, offset, size); } - __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size) + __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, uint64_t dstOffset, BufferHandle src, + uint64_t srcOffset, uint64_t size) { epochIncrement(); - uint64_t curFifoHead = fifo.push(ChannelTrigger(channelTriggerData | channelTriggerFlag | channelTriggerSync, dst, dstOffset, src, srcOffset, size, connectionId).value); + uint64_t curFifoHead = fifo.push(ChannelTrigger(channelTriggerData | channelTriggerFlag | channelTriggerSync, dst, + dstOffset, src, srcOffset, size, connectionId) + .value); while (*(volatile uint64_t*)&fifo.triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && *(volatile uint64_t*)fifo.tailReplica <= curFifoHead) ; } - __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) + __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, BufferHandle src, uint64_t offset, + uint64_t size) { putWithSignalAndFlush(dst, offset, src, offset, size); } @@ -223,10 +247,12 @@ struct DeviceConnection { DeviceProxyFifo fifo; }; -struct SimpleDeviceConnection { +struct SimpleDeviceConnection +{ SimpleDeviceConnection() = default; - SimpleDeviceConnection(HostConnection& hostConn) : devConn(hostConn) { + SimpleDeviceConnection(HostConnection& hostConn) : devConn(hostConn) + { dst = hostConn.getRemoteBuffer(0); src = hostConn.getLocalBuffer(0); } diff --git a/src/include/checks.hpp b/src/include/checks.hpp index ad985e76..69b222ee 100644 --- a/src/include/checks.hpp +++ b/src/include/checks.hpp @@ -8,6 +8,7 @@ #define MSCCLPP_CHECKS_HPP_ #include "debug.h" +#include 
#include #define MSCCLPPTHROW(call) \ @@ -26,4 +27,14 @@ } \ } while (false) +#define CUTHROW(cmd) \ + do { \ + CUresult err = cmd; \ + if (err != CUDA_SUCCESS) { \ + const char* errStr; \ + cuGetErrorString(err, &errStr); \ + throw std::runtime_error(std::string("Cu failure '") + std::string(errStr) + "'"); \ + } \ + } while (false) + #endif diff --git a/src/include/comm.h b/src/include/comm.h index dce724fa..e6a067d6 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -9,14 +9,14 @@ #include "ib.hpp" #include "proxy.h" -#include #include +#include #define MAXCONNECTIONS 64 struct mscclppBufferRegistration { - void *data; + void* data; uint64_t size; }; diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index e8e274b9..25fface7 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -1,19 +1,20 @@ #ifndef MSCCL_COMMUNICATOR_HPP_ #define MSCCL_COMMUNICATOR_HPP_ -#include "mscclpp.hpp" -#include "mscclpp.h" #include "channel.hpp" -#include "proxy.hpp" #include "ib.hpp" -#include +#include "mscclpp.h" +#include "mscclpp.hpp" +#include "proxy.hpp" #include +#include namespace mscclpp { class ConnectionBase; -struct Communicator::Impl { +struct Communicator::Impl +{ mscclppComm_t comm; std::vector> connections; std::unordered_map> ibContexts; diff --git a/src/include/connection.hpp b/src/include/connection.hpp index bd08802c..f957c8a1 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -1,25 +1,27 @@ #ifndef MSCCLPP_CONNECTION_HPP_ #define MSCCLPP_CONNECTION_HPP_ +#include "communicator.hpp" +#include "ib.hpp" #include "mscclpp.hpp" #include -#include "ib.hpp" -#include "communicator.hpp" namespace mscclpp { // TODO: Add functionality to these classes for Communicator to do connectionSetup -class ConnectionBase : public Connection { +class ConnectionBase : public Connection +{ public: - virtual void startSetup(std::shared_ptr bootstrap) {}; - virtual void endSetup(std::shared_ptr bootstrap) 
{}; + virtual void startSetup(std::shared_ptr bootstrap){}; + virtual void endSetup(std::shared_ptr bootstrap){}; }; -class CudaIpcConnection : public ConnectionBase { +class CudaIpcConnection : public ConnectionBase +{ cudaStream_t stream; -public: +public: CudaIpcConnection(); ~CudaIpcConnection(); @@ -28,19 +30,21 @@ public: Transport remoteTransport() override; - void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) override; + void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, + uint64_t size) override; void flush() override; }; -class IBConnection : public ConnectionBase { +class IBConnection : public ConnectionBase +{ int remoteRank_; int tag_; Transport transport_; Transport remoteTransport_; IbQp* qp; -public: +public: IBConnection(int remoteRank, int tag, Transport transport, Communicator::Impl& commImpl); ~IBConnection(); @@ -49,7 +53,8 @@ public: Transport remoteTransport() override; - void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) override; + void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, + uint64_t size) override; void flush() override; diff --git a/src/include/epoch.hpp b/src/include/epoch.hpp index 942edd8b..fd25b51f 100644 --- a/src/include/epoch.hpp +++ b/src/include/epoch.hpp @@ -5,7 +5,8 @@ namespace mscclpp { -struct alignas(16) SignalEpochId { +struct alignas(16) SignalEpochId +{ // every signal(), increaments this and either: // 1) proxy thread pushes it to the remote peer's localSignalEpochId->proxy // 2) gpu thread directly writes it to remoteSignalEpochId->device @@ -14,7 +15,8 @@ struct alignas(16) SignalEpochId { uint64_t proxy; }; -struct DeviceEpoch { +struct DeviceEpoch +{ #ifdef __CUDACC__ __forceinline__ __device__ void wait() { @@ -34,10 +36,11 @@ struct DeviceEpoch { uint64_t* waitEpochId; }; - -class Epoch { +class Epoch 
+{ struct Impl; std::unique_ptr pimpl; + public: Epoch(); ~Epoch(); diff --git a/src/include/host_connection.hpp b/src/include/host_connection.hpp index 495130d9..8ac5d9f1 100644 --- a/src/include/host_connection.hpp +++ b/src/include/host_connection.hpp @@ -1,13 +1,14 @@ #ifndef MSCCLPP_HOST_CONNECTION_HPP_ #define MSCCLPP_HOST_CONNECTION_HPP_ -#include "mscclpp.hpp" -#include "mscclpp.h" #include "comm.h" +#include "mscclpp.h" +#include "mscclpp.hpp" namespace mscclpp { -struct HostConnection::Impl { +struct HostConnection::Impl +{ Communicator* comm; mscclppConn* conn; mscclppHostConn_t* hostConn; diff --git a/src/include/ib.hpp b/src/include/ib.hpp index b1baeb75..78d31ce6 100644 --- a/src/include/ib.hpp +++ b/src/include/ib.hpp @@ -1,9 +1,9 @@ #ifndef MSCCLPP_IB_HPP_ #define MSCCLPP_IB_HPP_ +#include #include #include -#include #define MSCCLPP_IB_CQ_SIZE 1024 #define MSCCLPP_IB_CQ_POLL_NUM 1 @@ -55,8 +55,10 @@ public: void rtr(const IbQpInfo& info); void rts(); - int stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled); - int stageSendWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData); + int stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled); + int stageSendWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled, unsigned int immData); void postSend(); void postRecv(uint64_t wrId); int pollCq(); diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index c01246ab..4789b80f 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -191,7 +191,8 @@ struct mscclppHostConn { virtual ~mscclppHostConn() = default; virtual void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) = 0; - virtual void 
put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, uint64_t dataSize) = 0; + virtual void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, + uint64_t dataSize) = 0; virtual void signal() = 0; virtual void wait() = 0; virtual void flush() = 0; @@ -232,7 +233,6 @@ typedef enum mscclppNumResults = 8 } mscclppResult_t; - /* Create a unique ID for communication. Only needs to be called by one process. * Use with mscclppCommInitRankFromId(). * All processes need to provide the same ID to mscclppCommInitRankFromId(). @@ -343,7 +343,8 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. */ -mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, mscclppTransport_t transportType, const char* ibDev = 0); +mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, + mscclppTransport_t transportType, const char* ibDev = 0); /* Register a buffer for use with a connection. * @@ -356,7 +357,8 @@ mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, * Outputs: * handle: a handle to the buffer registration */ -mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, uint64_t buffSize, mscclppBufferHandle_t *handle); +mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, uint64_t buffSize, + mscclppBufferHandle_t* handle); /* Establish all connections declared by mscclppConnect(). This function must be called after all mscclppConnect() * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. 
diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index 3b9c6d8d..8a85ebc6 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -6,16 +6,16 @@ #define MSCCLPP_PATCH 0 #define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH) -#include +#include #include #include -#include - +#include namespace mscclpp { #define MSCCLPP_UNIQUE_ID_BYTES 128 -struct UniqueId { +struct UniqueId +{ char internal[MSCCLPP_UNIQUE_ID_BYTES]; }; @@ -64,7 +64,8 @@ private: */ std::unique_ptr getUniqueId(); -enum class Transport { +enum class Transport +{ Unknown, CudaIpc, IB0, @@ -79,109 +80,137 @@ enum class Transport { }; namespace detail { - const size_t TransportFlagsSize = 10; - static_assert(TransportFlagsSize == static_cast(Transport::NumTransports), "TransportFlagsSize must match the number of transports"); - using TransportFlagsBase = std::bitset; -} +const size_t TransportFlagsSize = 10; +static_assert(TransportFlagsSize == static_cast(Transport::NumTransports), + "TransportFlagsSize must match the number of transports"); +using TransportFlagsBase = std::bitset; +} // namespace detail -class TransportFlags : private detail::TransportFlagsBase { +class TransportFlags : private detail::TransportFlagsBase +{ public: TransportFlags() = default; - TransportFlags(Transport transport) : detail::TransportFlagsBase(1 << static_cast(transport)) {} + TransportFlags(Transport transport) : detail::TransportFlagsBase(1 << static_cast(transport)) + { + } - bool has(Transport transport) const { + bool has(Transport transport) const + { return detail::TransportFlagsBase::test(static_cast(transport)); } - bool none() const { + bool none() const + { return detail::TransportFlagsBase::none(); } - bool any() const { + bool any() const + { return detail::TransportFlagsBase::any(); } - bool all() const { + bool all() const + { return detail::TransportFlagsBase::all(); } - size_t count() const { + size_t count() const + { return 
detail::TransportFlagsBase::count(); } - TransportFlags& operator|=(TransportFlags other) { + TransportFlags& operator|=(TransportFlags other) + { detail::TransportFlagsBase::operator|=(other); return *this; } - TransportFlags operator|(TransportFlags other) const { + TransportFlags operator|(TransportFlags other) const + { return TransportFlags(*this) |= other; } - TransportFlags operator|(Transport transport) const { + TransportFlags operator|(Transport transport) const + { return *this | TransportFlags(transport); } - TransportFlags& operator&=(TransportFlags other) { + TransportFlags& operator&=(TransportFlags other) + { detail::TransportFlagsBase::operator&=(other); return *this; } - TransportFlags operator&(TransportFlags other) const { + TransportFlags operator&(TransportFlags other) const + { return TransportFlags(*this) &= other; } - TransportFlags operator&(Transport transport) const { + TransportFlags operator&(Transport transport) const + { return *this & TransportFlags(transport); } - TransportFlags& operator^=(TransportFlags other) { + TransportFlags& operator^=(TransportFlags other) + { detail::TransportFlagsBase::operator^=(other); return *this; } - TransportFlags operator^(TransportFlags other) const { + TransportFlags operator^(TransportFlags other) const + { return TransportFlags(*this) ^= other; } - TransportFlags operator^(Transport transport) const { + TransportFlags operator^(Transport transport) const + { return *this ^ TransportFlags(transport); } - TransportFlags operator~() const { + TransportFlags operator~() const + { return TransportFlags(*this).flip(); } - bool operator==(TransportFlags other) const { + bool operator==(TransportFlags other) const + { return detail::TransportFlagsBase::operator==(other); } - bool operator!=(TransportFlags other) const { + bool operator!=(TransportFlags other) const + { return detail::TransportFlagsBase::operator!=(other); } - detail::TransportFlagsBase toBitset() const { + detail::TransportFlagsBase 
toBitset() const + { return *this; } private: - TransportFlags(detail::TransportFlagsBase bitset) : detail::TransportFlagsBase(bitset) {} + TransportFlags(detail::TransportFlagsBase bitset) : detail::TransportFlagsBase(bitset) + { + } }; -inline TransportFlags operator|(Transport transport1, Transport transport2) { +inline TransportFlags operator|(Transport transport1, Transport transport2) +{ return TransportFlags(transport1) | transport2; } -inline TransportFlags operator&(Transport transport1, Transport transport2) { +inline TransportFlags operator&(Transport transport1, Transport transport2) +{ return TransportFlags(transport1) & transport2; } -inline TransportFlags operator^(Transport transport1, Transport transport2) { +inline TransportFlags operator^(Transport transport1, Transport transport2) +{ return TransportFlags(transport1) ^ transport2; } const TransportFlags NoTransports = TransportFlags(); -const TransportFlags AllIBTransports = Transport::IB0 | Transport::IB1 | Transport::IB2 | Transport::IB3 | Transport::IB4 | Transport::IB5 | Transport::IB6 | Transport::IB7; +const TransportFlags AllIBTransports = Transport::IB0 | Transport::IB1 | Transport::IB2 | Transport::IB3 | + Transport::IB4 | Transport::IB5 | Transport::IB6 | Transport::IB7; const TransportFlags AllTransports = AllIBTransports | Transport::CudaIpc; int getIBDeviceCount(); @@ -191,11 +220,12 @@ Transport getIBTransportByDeviceName(const std::string& ibDeviceName); class Communicator; class Connection; -class RegisteredMemory { +class RegisteredMemory +{ struct Impl; std::shared_ptr pimpl; -public: +public: RegisteredMemory(std::shared_ptr pimpl); ~RegisteredMemory(); @@ -211,9 +241,11 @@ public: friend class Communicator; }; -class Connection { +class Connection +{ public: - virtual void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) = 0; + virtual void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t 
srcOffset, + uint64_t size) = 0; virtual void flush() = 0; @@ -225,24 +257,24 @@ protected: static std::shared_ptr getRegisteredMemoryImpl(RegisteredMemory&); }; -class Communicator { +class Communicator +{ public: /* Initialize the communicator. - * - * Inputs: - * bootstrap: an implementation of the of BaseBootstrap that the communicator will use - */ + * + * Inputs: + * bootstrap: an implementation of the of BaseBootstrap that the communicator will use + */ Communicator(std::shared_ptr bootstrap); - ~Communicator(); - + /* Ring-based AllGather through the bootstrap socket. - * - * Inputs: - * data: data array to be gathered where `[r*size, (r+1)*size)` is the data for rank `r` - * size: data size per rank - */ + * + * Inputs: + * data: data array to be gathered where `[r*size, (r+1)*size)` is the data for rank `r` + * size: data size per rank + */ void bootstrapAllGather(void* data, int size); /* A no-op function that is used to synchronize all processes via a bootstrap allgather*/ @@ -253,33 +285,34 @@ public: * Inputs: * data: base pointer to the memory * size: size of the memory region in bytes - * + * * Returns: a handle to the buffer */ RegisteredMemory registerMemory(void* ptr, size_t size, TransportFlags transports); /* Connect to a remote rank. This function only prepares metadata for connection. The actual connection - * is made by a following call of mscclppConnectionSetup(). Note that this function is two-way and a connection - * from rank i to remote rank j needs to have a counterpart from rank j to rank i. - * Note that with IB, buffers are registered at a page level and if a buffer is spread through multiple pages - * and do not fully utilize all of them, IB's QP has to register for all involved pages. This potentially has - * security risks if the devConn's accesses are given to a malicious process. - * - * Inputs: - * remoteRank: the rank of the remote process - * tag: the tag of the connection. 
tag is copied into the corresponding mscclppDevConn_t, which can be - * used to identify the connection inside a GPU kernel. - * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) - * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. - */ + * is made by a following call of mscclppConnectionSetup(). Note that this function is two-way and a connection + * from rank i to remote rank j needs to have a counterpart from rank j to rank i. + * Note that with IB, buffers are registered at a page level and if a buffer is spread through multiple pages + * and do not fully utilize all of them, IB's QP has to register for all involved pages. This potentially has + * security risks if the devConn's accesses are given to a malicious process. + * + * Inputs: + * remoteRank: the rank of the remote process + * tag: the tag of the connection. tag is copied into the corresponding mscclppDevConn_t, which can be + * used to identify the connection inside a GPU kernel. + * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) + * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. + */ std::shared_ptr connect(int remoteRank, int tag, Transport transport); /* Establish all connections declared by connect(). This function must be called after all connect() - * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. - */ + * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. 
+ */ void connectionSetup(); struct Impl; + private: std::unique_ptr pimpl; }; @@ -287,12 +320,13 @@ private: } // namespace mscclpp namespace std { - template <> - struct hash { - size_t operator()(const mscclpp::TransportFlags& flags) const { - return hash()(flags.toBitset()); - } - }; -} +template <> struct hash +{ + size_t operator()(const mscclpp::TransportFlags& flags) const + { + return hash()(flags.toBitset()); + } +}; +} // namespace std #endif // MSCCLPP_H_ diff --git a/src/include/mscclppfifo.hpp b/src/include/mscclppfifo.hpp index b5f8ba4c..7e2820b0 100644 --- a/src/include/mscclppfifo.hpp +++ b/src/include/mscclppfifo.hpp @@ -1,13 +1,14 @@ #ifndef MSCCLPPFIFO_HPP_ #define MSCCLPPFIFO_HPP_ -#include #include #include +#include namespace mscclpp { -struct alignas(16) ProxyTrigger { +struct alignas(16) ProxyTrigger +{ uint64_t fst, snd; }; @@ -24,7 +25,8 @@ struct alignas(16) ProxyTrigger { * Why duplicating the tail is a good idea? The fifo is large engouh and we do not need frequent updates * for the tail as there is usually enough space for device threads to push their work into. */ -struct DeviceProxyFifo { +struct DeviceProxyFifo +{ #ifdef __CUDACC__ __forceinline__ __device__ uint64_t push(ProxyTrigger trigger) { @@ -34,29 +36,28 @@ struct DeviceProxyFifo { while (*(volatile uint64_t*)&this->triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0) ; ProxyTrigger* triggerPtr = (ProxyTrigger*)&(this->triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE]); - asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), - "l"(trigger.fst), "l"(trigger.snd)); + asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd)); return curFifoHead; } #endif // __CUDACC__ ProxyTrigger* triggers; // Allocate on host via cudaHostAlloc. This space is used for pushing the workelements - uint64_t* tailReplica; // Allocated on device. 
proxyState->fifoTailHost is the true tail on host and pused - // occasionally to device - uint64_t* head; // Allocated on device. Only accessed by device + uint64_t* tailReplica; // Allocated on device. proxyState->fifoTailHost is the true tail on host and pused + // occasionally to device + uint64_t* head; // Allocated on device. Only accessed by device }; class HostProxyFifo { public: HostProxyFifo(); - + ~HostProxyFifo(); - void poll(ProxyTrigger *trigger); - + void poll(ProxyTrigger* trigger); + void pop(); - + void flushTail(bool sync = false); DeviceProxyFifo toDevice(); diff --git a/src/include/proxy.h b/src/include/proxy.h index 3746806b..5bcb7da5 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -60,7 +60,7 @@ struct mscclppProxyState int numaNodeToBind; mscclpp::IbCtx* ibContext; // For IB connection only - cudaStream_t p2pStream; // for P2P DMA engine only + cudaStream_t p2pStream; // for P2P DMA engine only struct mscclppProxyFifo fifo; }; diff --git a/src/include/proxy.hpp b/src/include/proxy.hpp index 70b6ba49..ac4116b3 100644 --- a/src/include/proxy.hpp +++ b/src/include/proxy.hpp @@ -3,12 +3,13 @@ #include -#include #include +#include namespace mscclpp { -enum class ProxyHandlerResult { +enum class ProxyHandlerResult +{ Continue, FlushFifoTailAndContinue, Stop, @@ -17,7 +18,8 @@ enum class ProxyHandlerResult { class Proxy; using ProxyHandler = std::function; -class Proxy { +class Proxy +{ public: Proxy(ProxyHandler handler); @@ -26,7 +28,7 @@ public: void start(); void stop(); - + HostProxyFifo& fifo(); private: diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index afe42da4..1c37ff04 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -1,15 +1,16 @@ #ifndef MSCCLPP_REGISTERED_MEMORY_HPP_ #define MSCCLPP_REGISTERED_MEMORY_HPP_ -#include "mscclpp.hpp" -#include "mscclpp.h" -#include "ib.hpp" #include "communicator.hpp" +#include "ib.hpp" +#include "mscclpp.h" 
+#include "mscclpp.hpp" #include namespace mscclpp { -struct TransportInfo { +struct TransportInfo +{ Transport transport; // TODO: rewrite this using std::variant or something @@ -21,7 +22,8 @@ struct TransportInfo { }; }; -struct RegisteredMemory::Impl { +struct RegisteredMemory::Impl +{ void* data; size_t size; int rank; @@ -31,7 +33,8 @@ struct RegisteredMemory::Impl { Impl(void* data, size_t size, int rank, TransportFlags transports, Communicator::Impl& commImpl); Impl(const std::vector& data); - TransportInfo& getTransportInfo(Transport transport) { + TransportInfo& getTransportInfo(Transport transport) + { for (auto& entry : transportInfos) { if (entry.transport == transport) { return entry; diff --git a/src/include/registered_ptr.hpp b/src/include/registered_ptr.hpp index 7eadb6b0..4f03ea40 100644 --- a/src/include/registered_ptr.hpp +++ b/src/include/registered_ptr.hpp @@ -3,32 +3,44 @@ namespace mscclpp { -template -class RegisteredPtr { +template class RegisteredPtr +{ RegisteredMemory memory; size_t offset; -public: - RegisteredPtr(RegisteredMemory memory, size_t offset) : memory(memory), offset(offset) {} - RegisteredPtr(RegisteredMemory memory) : RegisteredPtr(memory, 0) {} - ~RegisteredPtr() {} - RegisteredMemory memory() { +public: + RegisteredPtr(RegisteredMemory memory, size_t offset) : memory(memory), offset(offset) + { + } + RegisteredPtr(RegisteredMemory memory) : RegisteredPtr(memory, 0) + { + } + ~RegisteredPtr() + { + } + + RegisteredMemory memory() + { return memory; } - T* data() { + T* data() + { return reinterpret_cast(memory.data()); } - size_t size() { + size_t size() + { return memory.size() / sizeof(T); } - size_t offset() { + size_t offset() + { return offset; } - RegisteredPtr operator+(size_t offset) { + RegisteredPtr operator+(size_t offset) + { return RegisteredPtr(memory, this->offset + offset); } diff --git a/src/init.cc b/src/init.cc index c5b6a66b..03f037c4 100644 --- a/src/init.cc +++ b/src/init.cc @@ -6,8 +6,8 @@ #if 
defined(MSCCLPP_USE_GDRCOPY) #include "gdr.h" #endif -#include "mscclpp.h" #include "infiniband/verbs.h" +#include "mscclpp.h" #include #include #include @@ -327,7 +327,8 @@ struct mscclppHostP2PConn : mscclppHostConn { put(1, dstDataOffset, 1, srcDataOffset, dataSize); } - void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, uint64_t dataSize) + void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, + uint64_t dataSize) { void* srcBuff = (void*)((char*)conn->bufferRegistrations[src].data + srcDataOffset); void* dstBuff = (void*)((char*)conn->remoteBufferRegistrations[dst].data + dstDataOffset); @@ -365,7 +366,8 @@ struct mscclppHostIBConn : mscclppHostConn { put(1, dstDataOffset, 1, srcDataOffset, dataSize); } - void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, uint64_t dataSize) + void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, + uint64_t dataSize) { this->ibQp->stageSend(this->ibMrs[src], this->remoteIbMrInfos[dst], (uint32_t)dataSize, /*wrId=*/0, /*srcOffset=*/srcDataOffset, /*dstOffset=*/dstDataOffset, /*signaled=*/false); @@ -413,7 +415,8 @@ struct mscclppHostIBConn : mscclppHostConn std::vector remoteIbMrInfos; }; -MSCCLPP_API mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, mscclppTransport_t transportType, const char* ibDev) +MSCCLPP_API mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, + mscclppTransport_t transportType, const char* ibDev) { // save this processes numa binding and set it to the one closest to the device // so that all the allocation are close to the device @@ -550,7 +553,8 @@ MSCCLPP_API mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int MSCCLPPCHECK(setNumaState(curProcessState)); mscclppBufferHandle_t 
signalHandle = -1; - MSCCLPPCHECK(mscclppRegisterBufferForConnection(comm, connId, conn->devConn->localSignalEpochId, sizeof(mscclppDevConnSignalEpochId), &signalHandle)); + MSCCLPPCHECK(mscclppRegisterBufferForConnection(comm, connId, conn->devConn->localSignalEpochId, + sizeof(mscclppDevConnSignalEpochId), &signalHandle)); if (signalHandle != 0) { WARN("signal handle should be 0"); return mscclppInternalError; @@ -579,7 +583,9 @@ MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, i return mscclppSuccess; } -MSCCLPP_API mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, uint64_t buffSize, mscclppBufferHandle_t *handle) { +MSCCLPP_API mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, + uint64_t buffSize, mscclppBufferHandle_t* handle) +{ if (connIdx >= comm->nConns) { WARN("connIdx out of range"); return mscclppInvalidArgument; @@ -605,26 +611,31 @@ struct connInfo mscclpp::IbQpInfo infoQp; std::vector bufferInfos; - struct header { + struct header + { mscclpp::IbQpInfo infoQp; int numBufferInfos; }; - mscclppResult_t sendOverBootstrap(void* bootstrap, int remoteRank, int tag) { + mscclppResult_t sendOverBootstrap(void* bootstrap, int remoteRank, int tag) + { header h; h.infoQp = infoQp; h.numBufferInfos = bufferInfos.size(); MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, &h, sizeof(header))); - MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, bufferInfos.data(), bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); + MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, bufferInfos.data(), + bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); return mscclppSuccess; } - mscclppResult_t recvOverBootstrap(void* bootstrap, int remoteRank, int tag) { + mscclppResult_t recvOverBootstrap(void* bootstrap, int remoteRank, int tag) + { header h; MSCCLPPCHECK(bootstrapRecv(bootstrap, remoteRank, tag, &h, 
sizeof(header))); infoQp = h.infoQp; bufferInfos.resize(h.numBufferInfos); - MSCCLPPCHECK(bootstrapRecv(bootstrap, remoteRank, tag, bufferInfos.data(), bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); + MSCCLPPCHECK(bootstrapRecv(bootstrap, remoteRank, tag, bufferInfos.data(), + bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); return mscclppSuccess; } }; @@ -637,7 +648,7 @@ mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*input } // Add all registered buffers - for (const auto &bufReg : conn->bufferRegistrations) { + for (const auto& bufReg : conn->bufferRegistrations) { connInfo->bufferInfos.emplace_back(); CUDACHECK(cudaIpcGetMemHandle(&connInfo->bufferInfos.back().cudaHandle, bufReg.data)); connInfo->bufferInfos.back().size = bufReg.size; @@ -659,7 +670,8 @@ mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/ // Open all remote registered buffers for (size_t i = 0; i < connInfo->bufferInfos.size(); i++) { mscclppBufferRegistration newBufReg; - CUDACHECK(cudaIpcOpenMemHandle(&newBufReg.data, connInfo->bufferInfos[i].cudaHandle, cudaIpcMemLazyEnablePeerAccess)); + CUDACHECK( + cudaIpcOpenMemHandle(&newBufReg.data, connInfo->bufferInfos[i].cudaHandle, cudaIpcMemLazyEnablePeerAccess)); newBufReg.size = connInfo->bufferInfos[i].size; conn->remoteBufferRegistrations.push_back(newBufReg); } @@ -670,8 +682,8 @@ mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/ } conn->devConn->remoteSignalEpochId = (mscclppDevConnSignalEpochId*)conn->remoteBufferRegistrations[0].data; - // For backwards compatibility with the previous API that assumed one data buffer per connection, set the remote buffer - // to the first remote data buffer + // For backwards compatibility with the previous API that assumed one data buffer per connection, set the remote + // buffer to the first remote data buffer if (conn->remoteBufferRegistrations.size() > 1) { conn->devConn->remoteBuff = 
conn->remoteBufferRegistrations[1].data; } @@ -695,7 +707,7 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output } // Add all registered buffers - for (const auto &bufReg : conn->bufferRegistrations) { + for (const auto& bufReg : conn->bufferRegistrations) { hostConn->ibMrs.emplace_back(ibCtx->registerMr(bufReg.data, sizeof(struct mscclppDevConnSignalEpochId))); connInfo->bufferInfos.emplace_back(); connInfo->bufferInfos.back().ibMrInfo = hostConn->ibMrs.back()->getInfo(); @@ -743,7 +755,8 @@ MSCCLPP_API mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) MSCCLPPCHECK(mscclppIbConnectionSetupStart(&cInfo, conn)); } // TODO: from saemal: do we possibly deadlock if there are too many outstanding sends? - // MSCCLPPCHECK(bootstrapSend(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &cInfo, sizeof(cInfo))); + // MSCCLPPCHECK(bootstrapSend(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &cInfo, + // sizeof(cInfo))); MSCCLPPCHECK(cInfo.sendOverBootstrap(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag)); } diff --git a/src/proxy_cpp.cc b/src/proxy_cpp.cc index 2d1cf098..b55d6995 100644 --- a/src/proxy_cpp.cc +++ b/src/proxy_cpp.cc @@ -1,8 +1,8 @@ +#include "api.h" #include "mscclpp.hpp" #include "utils.h" -#include "api.h" -#include #include +#include namespace mscclpp { @@ -10,26 +10,32 @@ const int ProxyStopCheckPeriod = 1000; const int ProxyFlushPeriod = 4; -struct Proxy::Impl { +struct Proxy::Impl +{ ProxyHandler handler; HostProxyFifo fifo; std::thread service; std::atomic_bool running; - Impl(ProxyHandler handler) : handler(handler), running(false) {} + Impl(ProxyHandler handler) : handler(handler), running(false) + { + } }; -MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler) { +MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler) +{ pimpl = std::make_unique(handler); } -MSCCLPP_API_CPP Proxy::~Proxy() { +MSCCLPP_API_CPP Proxy::~Proxy() +{ if (pimpl) { stop(); } } -MSCCLPP_API_CPP 
void Proxy::start() { +MSCCLPP_API_CPP void Proxy::start() +{ pimpl->running = true; pimpl->service = std::thread([this] { // from this point on, proxy thread will stay close to the device @@ -52,7 +58,7 @@ MSCCLPP_API_CPP void Proxy::start() { // Poll to see if we are ready to send anything fifo.poll(&trigger); if (trigger.fst == 0) { // TODO: this check is a potential pitfall for custom triggers - continue; // there is one in progress + continue; // there is one in progress } ProxyHandlerResult result = handler(trigger); @@ -83,14 +89,16 @@ MSCCLPP_API_CPP void Proxy::start() { }); } -MSCCLPP_API_CPP void Proxy::stop() { +MSCCLPP_API_CPP void Proxy::stop() +{ pimpl->running = false; if (pimpl->service.joinable()) { pimpl->service.join(); } } -MSCCLPP_API_CPP HostProxyFifo& Proxy::fifo() { +MSCCLPP_API_CPP HostProxyFifo& Proxy::fifo() +{ return pimpl->fifo; } diff --git a/src/registered_memory.cc b/src/registered_memory.cc index b26ea2d5..b9769dc9 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -1,10 +1,13 @@ #include "registered_memory.hpp" #include "checks.hpp" #include +#include namespace mscclpp { -RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags transports, Communicator::Impl& commImpl) : data(data), size(size), rank(rank), transports(transports) { +RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags transports, Communicator::Impl& commImpl) + : data(data), size(size), rank(rank), transports(transports) +{ if (transports.has(Transport::CudaIpc)) { TransportInfo transportInfo; transportInfo.transport = Transport::CudaIpc; @@ -23,38 +26,53 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags t transportInfo.ibLocal = true; this->transportInfos.push_back(transportInfo); }; - if (transports.has(Transport::IB0)) addIb(Transport::IB0); - if (transports.has(Transport::IB1)) addIb(Transport::IB1); - if (transports.has(Transport::IB2)) addIb(Transport::IB2); - 
if (transports.has(Transport::IB3)) addIb(Transport::IB3); - if (transports.has(Transport::IB4)) addIb(Transport::IB4); - if (transports.has(Transport::IB5)) addIb(Transport::IB5); - if (transports.has(Transport::IB6)) addIb(Transport::IB6); - if (transports.has(Transport::IB7)) addIb(Transport::IB7); + if (transports.has(Transport::IB0)) + addIb(Transport::IB0); + if (transports.has(Transport::IB1)) + addIb(Transport::IB1); + if (transports.has(Transport::IB2)) + addIb(Transport::IB2); + if (transports.has(Transport::IB3)) + addIb(Transport::IB3); + if (transports.has(Transport::IB4)) + addIb(Transport::IB4); + if (transports.has(Transport::IB5)) + addIb(Transport::IB5); + if (transports.has(Transport::IB6)) + addIb(Transport::IB6); + if (transports.has(Transport::IB7)) + addIb(Transport::IB7); } } -RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : pimpl(pimpl) {} +RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : pimpl(pimpl) +{ +} RegisteredMemory::~RegisteredMemory() = default; -void* RegisteredMemory::data() { +void* RegisteredMemory::data() +{ return pimpl->data; } -size_t RegisteredMemory::size() { +size_t RegisteredMemory::size() +{ return pimpl->size; } -int RegisteredMemory::rank() { +int RegisteredMemory::rank() +{ return pimpl->rank; } -TransportFlags RegisteredMemory::transports() { +TransportFlags RegisteredMemory::transports() +{ return pimpl->transports; } -std::vector RegisteredMemory::serialize() { +std::vector RegisteredMemory::serialize() +{ std::vector result; std::copy_n(reinterpret_cast(&pimpl->size), sizeof(pimpl->size), std::back_inserter(result)); std::copy_n(reinterpret_cast(&pimpl->rank), sizeof(pimpl->rank), std::back_inserter(result)); @@ -67,7 +85,8 @@ std::vector RegisteredMemory::serialize() { for (auto& entry : pimpl->transportInfos) { std::copy_n(reinterpret_cast(&entry.transport), sizeof(entry.transport), std::back_inserter(result)); if (entry.transport == Transport::CudaIpc) { - 
std::copy_n(reinterpret_cast(&entry.cudaIpcHandle), sizeof(entry.cudaIpcHandle), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&entry.cudaIpcHandle), sizeof(entry.cudaIpcHandle), + std::back_inserter(result)); } else if (AllIBTransports.has(entry.transport)) { std::copy_n(reinterpret_cast(&entry.ibMrInfo), sizeof(entry.ibMrInfo), std::back_inserter(result)); } else { @@ -77,11 +96,13 @@ std::vector RegisteredMemory::serialize() { return result; } -RegisteredMemory RegisteredMemory::deserialize(const std::vector& data) { +RegisteredMemory RegisteredMemory::deserialize(const std::vector& data) +{ return RegisteredMemory(std::make_shared(data)); } -RegisteredMemory::Impl::Impl(const std::vector& serialization) { +RegisteredMemory::Impl::Impl(const std::vector& serialization) +{ auto it = serialization.begin(); std::copy_n(it, sizeof(this->size), reinterpret_cast(&this->size)); it += sizeof(this->size); @@ -118,6 +139,9 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) { if (transports.has(Transport::CudaIpc)) { auto entry = getTransportInfo(Transport::CudaIpc); + void* baseDataPtr; + size_t baseDataSize; // dummy + CUTHROW(cuMemGetAddressRange((CUdeviceptr*)&baseDataPtr, &baseDataSize, (CUdeviceptr)data)); CUDATHROW(cudaIpcOpenMemHandle(&data, entry.cudaIpcHandle, cudaIpcMemLazyEnablePeerAccess)); } } diff --git a/tests/allgather_test_cpp.cu b/tests/allgather_test_cpp.cu index 9b056e84..908a24f4 100644 --- a/tests/allgather_test_cpp.cu +++ b/tests/allgather_test_cpp.cu @@ -4,14 +4,14 @@ #ifdef MSCCLPP_USE_MPI_FOR_TESTS #include "mpi.h" #endif // MSCCLPP_USE_MPI_FOR_TESTS +#include +#include #include #include #include #include #include #include -#include -#include static int nranksPerNode = 8; @@ -50,7 +50,8 @@ static double getTime(void) __constant__ mscclpp::SimpleDeviceConnection constDevConns[16]; -__device__ void allgather0(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU) 
+__device__ void allgather0(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int remoteRank, + size_t nelemsPerGPU) { // this allgather is really simple and implemented as an alltoall @@ -69,8 +70,8 @@ __device__ void allgather0(mscclpp::SimpleDeviceConnection devConn, int rank, in devConn.wait(); } -__device__ void localAllGather(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, int remoteRank, - uint64_t offset, uint64_t size) +__device__ void localAllGather(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, + int remoteRank, uint64_t offset, uint64_t size) { // this allgather algorithm works as follows: // Step 1: GPU rank i sends data to GPU rank (i+1) % nranksPerNode @@ -93,15 +94,15 @@ __device__ void localAllGather(mscclpp::SimpleDeviceConnection devConn, int rank } } -__device__ void allgather1(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, int remoteRank, - size_t nelemsPerGPU) +__device__ void allgather1(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, + int remoteRank, size_t nelemsPerGPU) { localAllGather(devConn, rank, world_size, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); } -__device__ void allgather2(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, int remoteRank, - size_t nelemsPerGPU) +__device__ void allgather2(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, + int remoteRank, size_t nelemsPerGPU) { // this allgather is a pipelined and hierarchical one and only works for two nodes // it is implemented as follows: @@ -243,13 +244,13 @@ void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& co comm.connectionSetup(); std::vector devConns; - std::transform(hostConns.begin(), hostConns.end(), std::back_inserter(devConns), - [](std::shared_ptr& 
hostConn) { - return mscclpp::SimpleDeviceConnection(*hostConn); - }); + std::transform( + hostConns.begin(), hostConns.end(), std::back_inserter(devConns), + [](std::shared_ptr& hostConn) { return mscclpp::SimpleDeviceConnection(*hostConn); }); assert(devConns.size() < sizeof(constDevConns) / sizeof(mscclpp::SimpleDeviceConnection)); - CUDACHECK(cudaMemcpyToSymbol(constDevConns, devConns.data(), sizeof(mscclpp::SimpleDeviceConnection) * devConns.size() )); + CUDACHECK( + cudaMemcpyToSymbol(constDevConns, devConns.data(), sizeof(mscclpp::SimpleDeviceConnection) * devConns.size())); } void printUsage(const char* prog, bool isMpi) @@ -399,17 +400,17 @@ int main(int argc, const char* argv[]) } size_t nelemsPerGPU = dataSize / sizeof(int) / world_size; - try{ + try { if (rank == 0) - printf("Initializing MSCCL++\n"); + printf("Initializing MSCCL++\n"); mscclpp::Communicator comm(world_size, ip_port, rank); if (rank == 0) - printf("Initializing data for allgather test\n"); + printf("Initializing data for allgather test\n"); initializeAndAllocateAllGatherData(rank, world_size, dataSize, nelemsPerGPU, &data_h, &data_d); if (rank == 0) - printf("Setting up the connection in MSCCL++\n"); + printf("Setting up the connection in MSCCL++\n"); setupMscclppConnections(rank, world_size, comm, data_d, dataSize); if (rank == 0) @@ -466,7 +467,7 @@ int main(int argc, const char* argv[]) int cudagraphwarmup = 10; if (rank == 0) printf("Warming up %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphwarmup, - cudagraphiter); + cudagraphiter); for (int i = 0; i < cudagraphwarmup; ++i) { cudaGraphLaunch(instance, stream); } @@ -476,7 +477,7 @@ int main(int argc, const char* argv[]) int cudagraphlaunch = 10; if (rank == 0) printf("Running %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphlaunch, - cudagraphiter); + cudagraphiter); comm.bootstrapAllGather(tmp, sizeof(int)); double t0, t1, ms, time_in_us; t0 = getTime(); @@ -489,7 +490,7 
@@ int main(int argc, const char* argv[]) ms = (t1 - t0) * 1000.0; time_in_us = ms * 1000. / (float)cudagraphlaunch / (float)cudagraphiter; printf("Rank %d report: size %lu time: %f us/iter algBW %f GBps\n", rank, dataSize, time_in_us, - (double)(dataSize) / 1e9 / (time_in_us / 1e6)); + (double)(dataSize) / 1e9 / (time_in_us / 1e6)); comm.bootstrapAllGather(tmp, sizeof(int)); if (rank == 0) diff --git a/tests/bootstrap_test_cpp.cc b/tests/bootstrap_test_cpp.cc index bdde8467..e4fe65bb 100644 --- a/tests/bootstrap_test_cpp.cc +++ b/tests/bootstrap_test_cpp.cc @@ -1,11 +1,12 @@ #include "mscclpp.hpp" -#include #include #include +#include #include -void test_allgather(std::shared_ptr bootstrap){ +void test_allgather(std::shared_ptr bootstrap) +{ std::vector tmp(bootstrap->getNranks(), 0); tmp[bootstrap->getRank()] = bootstrap->getRank() + 1; bootstrap->allGather(tmp.data(), sizeof(int)); @@ -16,13 +17,15 @@ void test_allgather(std::shared_ptr bootstrap){ std::cout << "AllGather test passed!" << std::endl; } -void test_barrier(std::shared_ptr bootstrap){ +void test_barrier(std::shared_ptr bootstrap) +{ bootstrap->barrier(); if (bootstrap->getRank() == 0) std::cout << "Barrier test passed!" << std::endl; } -void test_sendrecv(std::shared_ptr bootstrap){ +void test_sendrecv(std::shared_ptr bootstrap) +{ for (int i = 0; i < bootstrap->getNranks(); i++) { if (bootstrap->getRank() == i) continue; @@ -52,13 +55,15 @@ void test_sendrecv(std::shared_ptr bootstrap){ std::cout << "Send/Recv test passed!" 
<< std::endl; } -void test_all(std::shared_ptr bootstrap){ +void test_all(std::shared_ptr bootstrap) +{ test_allgather(bootstrap); test_barrier(bootstrap); test_sendrecv(bootstrap); } -void test_mscclpp_bootstrap_with_id(int rank, int worldSize){ +void test_mscclpp_bootstrap_with_id(int rank, int worldSize) +{ auto bootstrap = std::make_shared(rank, worldSize); mscclpp::UniqueId id; if (bootstrap->getRank() == 0) @@ -71,7 +76,8 @@ void test_mscclpp_bootstrap_with_id(int rank, int worldSize){ std::cout << "--- MSCCLPP::Bootstrap test with unique id passed! ---" << std::endl; } -void test_mscclpp_bootstrap_with_ip_port_pair(int rank, int worldSize, char* ipPortPiar){ +void test_mscclpp_bootstrap_with_ip_port_pair(int rank, int worldSize, char* ipPortPiar) +{ std::shared_ptr bootstrap(new mscclpp::Bootstrap(rank, worldSize)); bootstrap->initialize(ipPortPiar); @@ -80,47 +86,57 @@ void test_mscclpp_bootstrap_with_ip_port_pair(int rank, int worldSize, char* ipP std::cout << "--- MSCCLPP::Bootstrap test with ip_port pair passed! 
---" << std::endl; } -class MPIBootstrap : public mscclpp::BaseBootstrap { +class MPIBootstrap : public mscclpp::BaseBootstrap +{ public: - MPIBootstrap() : BaseBootstrap() {} - int getRank() override { + MPIBootstrap() : BaseBootstrap() + { + } + int getRank() override + { int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); return rank; } - int getNranks() override { + int getNranks() override + { int worldSize; MPI_Comm_size(MPI_COMM_WORLD, &worldSize); return worldSize; } - void allGather(void *sendbuf, int size) override { + void allGather(void* sendbuf, int size) override + { MPI_Allgather(MPI_IN_PLACE, 0, MPI_BYTE, sendbuf, size, MPI_BYTE, MPI_COMM_WORLD); } - void barrier() override { + void barrier() override + { MPI_Barrier(MPI_COMM_WORLD); } - void send(void *sendbuf, int size, int dest, int tag) override { + void send(void* sendbuf, int size, int dest, int tag) override + { MPI_Send(sendbuf, size, MPI_BYTE, dest, tag, MPI_COMM_WORLD); } - void recv(void *recvbuf, int size, int source, int tag) override { + void recv(void* recvbuf, int size, int source, int tag) override + { MPI_Recv(recvbuf, size, MPI_BYTE, source, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } }; -void test_mpi_bootstrap(){ +void test_mpi_bootstrap() +{ std::shared_ptr bootstrap(new MPIBootstrap()); test_all(bootstrap); if (bootstrap->getRank() == 0) std::cout << "--- MPI Bootstrap test passed! 
---" << std::endl; } -int main(int argc, char **argv) +int main(int argc, char** argv) { int rank, worldSize; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &worldSize); - if (argc > 2){ + if (argc > 2) { if (rank == 0) std::cout << "Usage: " << argv[0] << " [ip:port]" << std::endl; MPI_Finalize(); diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc index 1f14ca79..6864d97b 100644 --- a/tests/communicator_test_cpp.cc +++ b/tests/communicator_test_cpp.cc @@ -1,25 +1,20 @@ #include "mscclpp.hpp" -#include #include #include +#include #include -mscclpp::Transport findIb(int localRank){ - mscclpp::Transport IBs[] = { - mscclpp::Transport::IB0, - mscclpp::Transport::IB1, - mscclpp::Transport::IB2, - mscclpp::Transport::IB3, - mscclpp::Transport::IB4, - mscclpp::Transport::IB5, - mscclpp::Transport::IB6, - mscclpp::Transport::IB7 - }; +mscclpp::Transport findIb(int localRank) +{ + mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, + mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, + mscclpp::Transport::IB6, mscclpp::Transport::IB7}; return IBs[localRank]; } -void test_communicator(int rank, int worldSize, int nranksPerNode){ +void test_communicator(int rank, int worldSize, int nranksPerNode) +{ auto bootstrap = std::make_shared(rank, worldSize); mscclpp::UniqueId id; if (bootstrap->getRank() == 0) @@ -28,9 +23,9 @@ void test_communicator(int rank, int worldSize, int nranksPerNode){ bootstrap->initialize(id); auto communicator = std::make_shared(bootstrap); - for (int i = 0; i < worldSize; i++){ - if (i != rank){ - if (i / nranksPerNode == rank / nranksPerNode){ + for (int i = 0; i < worldSize; i++) { + if (i != rank) { + if (i / nranksPerNode == rank / nranksPerNode) { auto connect = communicator->connect(i, 0, mscclpp::Transport::CudaIpc); } else { auto connect = communicator->connect(i, 0, findIb(rank % nranksPerNode)); @@ 
-43,8 +38,7 @@ void test_communicator(int rank, int worldSize, int nranksPerNode){ std::cout << "--- MSCCLPP::Communicator tests passed! ---" << std::endl; } - -int main(int argc, char **argv) +int main(int argc, char** argv) { int rank, worldSize; MPI_Init(&argc, &argv); @@ -56,7 +50,7 @@ int main(int argc, char **argv) MPI_Comm_size(shmcomm, &shmWorldSize); int nranksPerNode = shmWorldSize; MPI_Comm_free(&shmcomm); - + test_communicator(rank, worldSize, nranksPerNode); MPI_Finalize(); diff --git a/tests/unittests/ib_test.cc b/tests/unittests/ib_test.cc index 6f84398f..3d99acb2 100644 --- a/tests/unittests/ib_test.cc +++ b/tests/unittests/ib_test.cc @@ -3,8 +3,8 @@ #include "ib.hpp" #include "infiniband/verbs.h" #include "mscclpp.hpp" -#include #include +#include // Measure current time in second. static double getTime(void) From 2ead25d8ebab548301e7c66d9283a1d55932750a Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 27 Apr 2023 21:36:13 +0000 Subject: [PATCH 077/135] INFO for IPC handle opened --- src/registered_memory.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/registered_memory.cc b/src/registered_memory.cc index b9769dc9..52cbb290 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -142,7 +142,8 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) void* baseDataPtr; size_t baseDataSize; // dummy CUTHROW(cuMemGetAddressRange((CUdeviceptr*)&baseDataPtr, &baseDataSize, (CUdeviceptr)data)); - CUDATHROW(cudaIpcOpenMemHandle(&data, entry.cudaIpcHandle, cudaIpcMemLazyEnablePeerAccess)); + CUDATHROW(cudaIpcOpenMemHandle(&baseDataPtr, entry.cudaIpcHandle, cudaIpcMemLazyEnablePeerAccess)); + INFO(MSCCLPP_P2P, "Opened CUDA IPC handle for base point of %p", data); } } From cbfc21851d14185734dc2d4125e42ac660a7c184 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 27 Apr 2023 22:25:03 +0000 Subject: [PATCH 078/135] registered buffer test --- src/communicator.cc | 2 +- src/registered_memory.cc | 5 
+++-- tests/communicator_test_cpp.cc | 26 +++++++++++++++++++++++++- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/communicator.cc b/src/communicator.cc index 78df252d..79e45f8d 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -58,7 +58,7 @@ MSCCLPP_API_CPP void Communicator::bootstrapBarrier() mscclppBootstrapBarrier(pimpl->comm); } -RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) +MSCCLPP_API_CPP RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) { return RegisteredMemory(std::make_shared(ptr, size, pimpl->comm->rank, transports, *pimpl)); } diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 52cbb290..42a03a8e 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -1,4 +1,5 @@ #include "registered_memory.hpp" +#include "api.h" #include "checks.hpp" #include #include @@ -45,11 +46,11 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags t } } -RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : pimpl(pimpl) +MSCCLPP_API_CPP RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : pimpl(pimpl) { } -RegisteredMemory::~RegisteredMemory() = default; +MSCCLPP_API_CPP RegisteredMemory::~RegisteredMemory() = default; void* RegisteredMemory::data() { diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc index 6864d97b..a05c8981 100644 --- a/tests/communicator_test_cpp.cc +++ b/tests/communicator_test_cpp.cc @@ -1,10 +1,19 @@ #include "mscclpp.hpp" #include +#include #include #include #include +#define CUDATHROW(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + throw std::runtime_error(std::string("Cuda failure '") + cudaGetErrorString(err) + "'"); \ + } \ + } while (false) + mscclpp::Transport findIb(int localRank) { mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, @@ 
-23,17 +32,32 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) bootstrap->initialize(id); auto communicator = std::make_shared(bootstrap); + if (bootstrap->getRank() == 0) + std::cout << "Communicator initialization passed" << std::endl; + + auto myIbDevice = findIb(rank % nranksPerNode); for (int i = 0; i < worldSize; i++) { if (i != rank) { if (i / nranksPerNode == rank / nranksPerNode) { auto connect = communicator->connect(i, 0, mscclpp::Transport::CudaIpc); } else { - auto connect = communicator->connect(i, 0, findIb(rank % nranksPerNode)); + auto connect = communicator->connect(i, 0, myIbDevice); } } } communicator->connectionSetup(); + if (bootstrap->getRank() == 0) + std::cout << "Connection setup passed" << std::endl; + + int* devicePtr; + int size = 1024; + CUDATHROW(cudaMalloc(&devicePtr, size)); + auto registeredMemory = communicator->registerMemory(devicePtr, size, mscclpp::Transport::CudaIpc | myIbDevice); + + if (bootstrap->getRank() == 0) + std::cout << "Memory registeration passed" << std::endl; + if (bootstrap->getRank() == 0) std::cout << "--- MSCCLPP::Communicator tests passed! 
---" << std::endl; } From 962e63b11abf207e41a2a2d57fcb4f2d330f054a Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 27 Apr 2023 23:57:51 +0000 Subject: [PATCH 079/135] deserializing registered memory is failing -- commented out --- src/registered_memory.cc | 16 ++++++++-------- tests/communicator_test_cpp.cc | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 42a03a8e..516a4c64 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -13,8 +13,11 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags t TransportInfo transportInfo; transportInfo.transport = Transport::CudaIpc; cudaIpcMemHandle_t handle; - // TODO: translate data to a base pointer - CUDATHROW(cudaIpcGetMemHandle(&handle, data)); + + void* baseDataPtr; + size_t baseDataSize; // dummy + CUTHROW(cuMemGetAddressRange((CUdeviceptr*)&baseDataPtr, &baseDataSize, (CUdeviceptr)data)); + CUDATHROW(cudaIpcGetMemHandle(&handle, baseDataPtr)); transportInfo.cudaIpcHandle = handle; this->transportInfos.push_back(transportInfo); } @@ -72,7 +75,7 @@ TransportFlags RegisteredMemory::transports() return pimpl->transports; } -std::vector RegisteredMemory::serialize() +MSCCLPP_API_CPP std::vector RegisteredMemory::serialize() { std::vector result; std::copy_n(reinterpret_cast(&pimpl->size), sizeof(pimpl->size), std::back_inserter(result)); @@ -97,7 +100,7 @@ std::vector RegisteredMemory::serialize() return result; } -RegisteredMemory RegisteredMemory::deserialize(const std::vector& data) +MSCCLPP_API_CPP RegisteredMemory RegisteredMemory::deserialize(const std::vector& data) { return RegisteredMemory(std::make_shared(data)); } @@ -140,10 +143,7 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) if (transports.has(Transport::CudaIpc)) { auto entry = getTransportInfo(Transport::CudaIpc); - void* baseDataPtr; - size_t baseDataSize; // dummy - 
CUTHROW(cuMemGetAddressRange((CUdeviceptr*)&baseDataPtr, &baseDataSize, (CUdeviceptr)data)); - CUDATHROW(cudaIpcOpenMemHandle(&baseDataPtr, entry.cudaIpcHandle, cudaIpcMemLazyEnablePeerAccess)); + CUDATHROW(cudaIpcOpenMemHandle(&data, entry.cudaIpcHandle, cudaIpcMemLazyEnablePeerAccess)); INFO(MSCCLPP_P2P, "Opened CUDA IPC handle for base point of %p", data); } } diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc index a05c8981..7fccf57b 100644 --- a/tests/communicator_test_cpp.cc +++ b/tests/communicator_test_cpp.cc @@ -55,6 +55,25 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) CUDATHROW(cudaMalloc(&devicePtr, size)); auto registeredMemory = communicator->registerMemory(devicePtr, size, mscclpp::Transport::CudaIpc | myIbDevice); + for (int i = 0; i < worldSize; i++) { + if (i != rank){ + auto serialized = registeredMemory.serialize(); + int serializedSize = serialized.size(); + bootstrap->send(&serializedSize, sizeof(int), i, 0); + bootstrap->send(serialized.data(), serializedSize, i, 1); + } + } + for (int i = 0; i < worldSize; i++) { + if (i != rank){ + int deserializedSize; + bootstrap->recv(&deserializedSize, sizeof(int), i, 0); + std::vector deserialized(deserializedSize); + bootstrap->recv(deserialized.data(), deserializedSize, i, 1); + // auto deserializedRegisteredMemory = mscclpp::RegisteredMemory::deserialize(deserialized); + } + } + + if (bootstrap->getRank() == 0) std::cout << "Memory registeration passed" << std::endl; From fa0fcb470e8e7910d3a1a2fedf33d4a4f1afdaee Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 28 Apr 2023 00:30:07 +0000 Subject: [PATCH 080/135] Lazy CUDA IPC handle opening --- src/include/registered_memory.hpp | 1 + src/registered_memory.cc | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index 1c37ff04..88c1005d 100644 --- a/src/include/registered_memory.hpp +++ 
b/src/include/registered_memory.hpp @@ -25,6 +25,7 @@ struct TransportInfo struct RegisteredMemory::Impl { void* data; + bool dataInitialized; size_t size; int rank; TransportFlags transports; diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 516a4c64..470e7c10 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -7,7 +7,7 @@ namespace mscclpp { RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags transports, Communicator::Impl& commImpl) - : data(data), size(size), rank(rank), transports(transports) + : data(data), dataInitialized(true), size(size), rank(rank), transports(transports) { if (transports.has(Transport::CudaIpc)) { TransportInfo transportInfo; @@ -57,6 +57,18 @@ MSCCLPP_API_CPP RegisteredMemory::~RegisteredMemory() = default; void* RegisteredMemory::data() { + if (!pimpl->dataInitialized) { + if (pimpl->transports.has(Transport::CudaIpc)) { + auto entry = pimpl->getTransportInfo(Transport::CudaIpc); + CUDATHROW(cudaIpcOpenMemHandle(&pimpl->data, entry.cudaIpcHandle, cudaIpcMemLazyEnablePeerAccess)); + INFO(MSCCLPP_P2P, "Opened CUDA IPC handle for base point of %p", data); + } + else + { + pimpl->data = nullptr; + } + pimpl->dataInitialized = true; + } return pimpl->data; } @@ -141,11 +153,7 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) throw std::runtime_error("Deserialization failed"); } - if (transports.has(Transport::CudaIpc)) { - auto entry = getTransportInfo(Transport::CudaIpc); - CUDATHROW(cudaIpcOpenMemHandle(&data, entry.cudaIpcHandle, cudaIpcMemLazyEnablePeerAccess)); - INFO(MSCCLPP_P2P, "Opened CUDA IPC handle for base point of %p", data); - } + dataInitialized = false; } } // namespace mscclpp From 821ba7a5281a4cbaf8102030d973d5d269155400 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 28 Apr 2023 00:30:36 +0000 Subject: [PATCH 081/135] Fix compilation --- src/registered_memory.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/registered_memory.cc b/src/registered_memory.cc index 470e7c10..3fae7a96 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -61,7 +61,7 @@ void* RegisteredMemory::data() if (pimpl->transports.has(Transport::CudaIpc)) { auto entry = pimpl->getTransportInfo(Transport::CudaIpc); CUDATHROW(cudaIpcOpenMemHandle(&pimpl->data, entry.cudaIpcHandle, cudaIpcMemLazyEnablePeerAccess)); - INFO(MSCCLPP_P2P, "Opened CUDA IPC handle for base point of %p", data); + INFO(MSCCLPP_P2P, "Opened CUDA IPC handle for base point of %p", pimpl->data); } else { From cbefe38fd40f4d9acbed6813e48465d8ca569be7 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 28 Apr 2023 09:12:21 +0000 Subject: [PATCH 082/135] aad conn write test --- src/communicator.cc | 1 + src/include/connection.hpp | 4 ++-- tests/communicator_test_cpp.cc | 43 ++++++++++++++++++++++++++++++---- 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/src/communicator.cc b/src/communicator.cc index 79e45f8d..35936862 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -21,6 +21,7 @@ Communicator::Impl::Impl(std::shared_ptr bootstrap) : bootstrap_( INFO(MSCCLPP_INIT, "Host hash: %lx", hostHash); rankToHash_[bootstrap->getRank()] = hostHash; bootstrap->allGather(rankToHash_.data(), sizeof(uint64_t)); + comm->rank = bootstrap->getRank(); } Communicator::Impl::~Impl() diff --git a/src/include/connection.hpp b/src/include/connection.hpp index f957c8a1..42ca6d47 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -13,8 +13,8 @@ namespace mscclpp { class ConnectionBase : public Connection { public: - virtual void startSetup(std::shared_ptr bootstrap){}; - virtual void endSetup(std::shared_ptr bootstrap){}; + virtual void startSetup(std::shared_ptr){}; + virtual void endSetup(std::shared_ptr){}; }; class CudaIpcConnection : public ConnectionBase diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc index 7fccf57b..a0b12e43 100644 --- 
a/tests/communicator_test_cpp.cc +++ b/tests/communicator_test_cpp.cc @@ -35,14 +35,17 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) if (bootstrap->getRank() == 0) std::cout << "Communicator initialization passed" << std::endl; + std::vector> connections; auto myIbDevice = findIb(rank % nranksPerNode); for (int i = 0; i < worldSize; i++) { if (i != rank) { + std::shared_ptr conn; if (i / nranksPerNode == rank / nranksPerNode) { - auto connect = communicator->connect(i, 0, mscclpp::Transport::CudaIpc); + conn = communicator->connect(i, 0, mscclpp::Transport::CudaIpc); } else { - auto connect = communicator->connect(i, 0, myIbDevice); + conn = communicator->connect(i, 0, myIbDevice); } + connections.push_back(conn); } } communicator->connectionSetup(); @@ -63,20 +66,52 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) bootstrap->send(serialized.data(), serializedSize, i, 1); } } + std::vector registeredMemories; for (int i = 0; i < worldSize; i++) { if (i != rank){ int deserializedSize; bootstrap->recv(&deserializedSize, sizeof(int), i, 0); std::vector deserialized(deserializedSize); bootstrap->recv(deserialized.data(), deserializedSize, i, 1); - // auto deserializedRegisteredMemory = mscclpp::RegisteredMemory::deserialize(deserialized); + auto deserializedRegisteredMemory = mscclpp::RegisteredMemory::deserialize(deserialized); + registeredMemories.push_back(std::move(deserializedRegisteredMemory)); } } + if (bootstrap->getRank() == 0) + std::cout << "Memory registration passed" << std::endl; + + assert(size % worldSize == 0); + size_t writeSize = size / worldSize; + size_t dataCount = size / sizeof(int); + // std::vector hostBuffer(dataCount, 0); + std::shared_ptr hostBuffer(new int[dataCount]); + for (int i = 0; i < dataCount; i++) { + hostBuffer[i] = rank; + } + CUDATHROW(cudaMemcpy(devicePtr, hostBuffer.get(), size, cudaMemcpyHostToDevice)); + + for (int i = 0; i < worldSize; i++) { + if (i != rank) { + int 
peerRankIndex = i < rank ? i : i - 1; + auto conn = connections[peerRankIndex]; + conn->write(registeredMemories[peerRankIndex], rank * writeSize, registeredMemory, rank * writeSize, writeSize); + } + } + CUDATHROW(cudaDeviceSynchronize()); + MPI_Barrier(MPI_COMM_WORLD); + CUDATHROW(cudaMemcpy(hostBuffer.get(), devicePtr, size, cudaMemcpyDeviceToHost)); + size_t dataPerRank = writeSize / sizeof(int); + for (int i = 0; i < dataCount; i++) { + if (hostBuffer[i] != i / dataPerRank) { + throw std::runtime_error("Data mismatch, connection write failed"); + } + } if (bootstrap->getRank() == 0) - std::cout << "Memory registeration passed" << std::endl; + std::cout << "Connection write passed" << std::endl; + CUDATHROW(cudaFree(devicePtr)); if (bootstrap->getRank() == 0) std::cout << "--- MSCCLPP::Communicator tests passed! ---" << std::endl; } From 750c40b98719e9ae97b1c7c402020f47ee08f9a7 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 28 Apr 2023 10:48:56 +0000 Subject: [PATCH 083/135] Fix --- src/communicator.cc | 4 ++-- src/registered_memory.cc | 2 +- tests/communicator_test_cpp.cc | 20 ++++++++++++-------- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/communicator.cc b/src/communicator.cc index 35936862..df213f8e 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -21,7 +21,6 @@ Communicator::Impl::Impl(std::shared_ptr bootstrap) : bootstrap_( INFO(MSCCLPP_INIT, "Host hash: %lx", hostHash); rankToHash_[bootstrap->getRank()] = hostHash; bootstrap->allGather(rankToHash_.data(), sizeof(uint64_t)); - comm->rank = bootstrap->getRank(); } Communicator::Impl::~Impl() @@ -61,7 +60,8 @@ MSCCLPP_API_CPP void Communicator::bootstrapBarrier() MSCCLPP_API_CPP RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) { - return RegisteredMemory(std::make_shared(ptr, size, pimpl->comm->rank, transports, *pimpl)); + return RegisteredMemory( + std::make_shared(ptr, size, pimpl->bootstrap_->getRank(), 
transports, *pimpl)); } MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, Transport transport) diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 3fae7a96..e298aee5 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -77,7 +77,7 @@ size_t RegisteredMemory::size() return pimpl->size; } -int RegisteredMemory::rank() +MSCCLPP_API_CPP int RegisteredMemory::rank() { return pimpl->rank; } diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc index a0b12e43..c1e812cd 100644 --- a/tests/communicator_test_cpp.cc +++ b/tests/communicator_test_cpp.cc @@ -5,6 +5,7 @@ #include #include #include +#include #define CUDATHROW(cmd) \ do { \ @@ -35,7 +36,7 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) if (bootstrap->getRank() == 0) std::cout << "Communicator initialization passed" << std::endl; - std::vector> connections; + std::unordered_map> connections; auto myIbDevice = findIb(rank % nranksPerNode); for (int i = 0; i < worldSize; i++) { if (i != rank) { @@ -45,7 +46,7 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) } else { conn = communicator->connect(i, 0, myIbDevice); } - connections.push_back(conn); + connections[i] = conn; } } communicator->connectionSetup(); @@ -66,7 +67,7 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) bootstrap->send(serialized.data(), serializedSize, i, 1); } } - std::vector registeredMemories; + std::unordered_map registeredMemories; for (int i = 0; i < worldSize; i++) { if (i != rank){ int deserializedSize; @@ -74,14 +75,15 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) std::vector deserialized(deserializedSize); bootstrap->recv(deserialized.data(), deserializedSize, i, 1); auto deserializedRegisteredMemory = mscclpp::RegisteredMemory::deserialize(deserialized); - registeredMemories.push_back(std::move(deserializedRegisteredMemory)); + 
registeredMemories.insert({deserializedRegisteredMemory.rank(), deserializedRegisteredMemory}); } } + MPI_Barrier(MPI_COMM_WORLD); if (bootstrap->getRank() == 0) std::cout << "Memory registration passed" << std::endl; - assert(size % worldSize == 0); + assert((size / sizeof(int)) % worldSize == 0); size_t writeSize = size / worldSize; size_t dataCount = size / sizeof(int); // std::vector hostBuffer(dataCount, 0); @@ -91,11 +93,13 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) } CUDATHROW(cudaMemcpy(devicePtr, hostBuffer.get(), size, cudaMemcpyHostToDevice)); + MPI_Barrier(MPI_COMM_WORLD); for (int i = 0; i < worldSize; i++) { if (i != rank) { - int peerRankIndex = i < rank ? i : i - 1; - auto conn = connections[peerRankIndex]; - conn->write(registeredMemories[peerRankIndex], rank * writeSize, registeredMemory, rank * writeSize, writeSize); + auto& conn = connections.at(i); + auto& peerMemory = registeredMemories.at(i); + // printf("write to rank: %d, rank is %d\n", peerMemory.rank(), rank); + conn->write(peerMemory, rank * writeSize, registeredMemory, rank * writeSize, writeSize); } } CUDATHROW(cudaDeviceSynchronize()); From 04e878489df3136b9acb1b6283c2022361339b8d Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 28 Apr 2023 22:50:38 +0000 Subject: [PATCH 084/135] Work on a channel service --- Makefile | 2 +- src/basic_proxy_handler.cc | 31 ---- src/communicator.cc | 13 +- src/connection.cc | 16 +- src/epoch.cc | 32 ++-- src/host_connection.cc | 96 ------------ src/include/channel.hpp | 280 ++++++++++++++++------------------- src/include/communicator.hpp | 1 - src/include/connection.hpp | 11 +- src/include/epoch.hpp | 34 ++--- src/include/mscclpp.hpp | 32 ++-- src/include/mscclppfifo.hpp | 7 +- src/include/proxy.hpp | 2 +- tests/allgather_test_cpp.cu | 26 ++-- 14 files changed, 227 insertions(+), 356 deletions(-) delete mode 100644 src/basic_proxy_handler.cc delete mode 100644 src/host_connection.cc diff --git a/Makefile b/Makefile 
index 41896041..782129c0 100644 --- a/Makefile +++ b/Makefile @@ -120,7 +120,7 @@ LDFLAGS := $(NVLDFLAGS) $(GDRCOPY_LDFLAGS) -libverbs -lnuma LIBSRCS := $(addprefix src/,debug.cc utils.cc init.cc proxy.cc ib.cc config.cc) LIBSRCS += $(addprefix src/bootstrap/,bootstrap.cc socket.cc) -LIBSRCS += $(addprefix src/,communicator.cc connection.cc registered_memory.cc) +LIBSRCS += $(addprefix src/,communicator.cc connection.cc registered_memory.cc epoch.cc) #LIBSRCS += $(addprefix src/,fifo.cc host_connection.cc proxy_cpp.cc basic_proxy_handler.cc) ifneq ($(NPKIT), 0) LIBSRCS += $(addprefix src/misc/,npkit.cc) diff --git a/src/basic_proxy_handler.cc b/src/basic_proxy_handler.cc deleted file mode 100644 index 42470131..00000000 --- a/src/basic_proxy_handler.cc +++ /dev/null @@ -1,31 +0,0 @@ -#include "basic_proxy_handler.hpp" - -namespace mscclpp { - -ProxyHandler makeBasicProxyHandler(Communicator::Impl& comm) -{ - return [&comm](ProxyTrigger triggerRaw) { - ChannelTrigger* trigger = reinterpret_cast(&triggerRaw); - HostConnection& conn = *comm.connections.at(trigger->fields.connId); - - auto result = ProxyHandlerResult::Continue; - - if (trigger->fields.type & mscclppData) { - conn.put(trigger->fields.dstBufferHandle, trigger->fields.dstOffset, trigger->fields.srcBufferHandle, - trigger->fields.srcOffset, trigger->fields.size); - } - - if (trigger->fields.type & mscclppFlag) { - conn.signal(); - } - - if (trigger->fields.type & mscclppSync) { - conn.flush(); - result = ProxyHandlerResult::FlushFifoTailAndContinue; - } - - return result; - }; -} - -} // namespace mscclpp diff --git a/src/communicator.cc b/src/communicator.cc index df213f8e..21faeaee 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -1,13 +1,11 @@ #include #include "api.h" -#include "basic_proxy_handler.hpp" #include "checks.hpp" #include "comm.h" #include "communicator.hpp" #include "connection.hpp" #include "debug.h" -#include "host_connection.hpp" #include "mscclpp.hpp" #include 
"registered_memory.hpp" #include "utils.h" @@ -48,14 +46,9 @@ MSCCLPP_API_CPP Communicator::Communicator(std::shared_ptr bootst { } -MSCCLPP_API_CPP void Communicator::bootstrapAllGather(void* data, int size) +MSCCLPP_API_CPP std::shared_ptr Communicator::bootstrapper() { - mscclppBootstrapAllGather(pimpl->comm, data, size); -} - -MSCCLPP_API_CPP void Communicator::bootstrapBarrier() -{ - mscclppBootstrapBarrier(pimpl->comm); + return pimpl->bootstrap_; } MSCCLPP_API_CPP RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) @@ -77,7 +70,7 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank << pimpl->rankToHash_[pimpl->bootstrap_->getRank()] << ")"; throw std::runtime_error(ss.str()); } - auto cudaIpcConn = std::make_shared(); + auto cudaIpcConn = std::make_shared(remoteRank, tag); conn = cudaIpcConn; INFO(MSCCLPP_P2P, "Cuda IPC connection between rank %d(%lx) and remoteRank %d(%lx) created", pimpl->bootstrap_->getRank(), pimpl->rankToHash_[pimpl->bootstrap_->getRank()], remoteRank, diff --git a/src/connection.cc b/src/connection.cc index 75a6ba79..4f8a4515 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -20,9 +20,17 @@ std::shared_ptr Connection::getRegisteredMemoryImpl(Regi return mem.pimpl; } +// ConnectionBase + +ConnectionBase::ConnectionBase(int remoteRank, int tag) : remoteRank_(remoteRank), tag_(tag) {} + +int ConnectionBase::remoteRank() { return remoteRank_; } + +int ConnectionBase::tag() { return tag_; } + // CudaIpcConnection -CudaIpcConnection::CudaIpcConnection() +CudaIpcConnection::CudaIpcConnection(int remoteRank, int tag) : ConnectionBase(remoteRank, tag) { cudaStreamCreate(&stream); } @@ -64,7 +72,7 @@ void CudaIpcConnection::flush() // IBConnection IBConnection::IBConnection(int remoteRank, int tag, Transport transport, Communicator::Impl& commImpl) - : remoteRank_(remoteRank), tag_(tag), transport_(transport), remoteTransport_(Transport::Unknown) + : 
ConnectionBase(remoteRank, tag), transport_(transport), remoteTransport_(Transport::Unknown) { qp = commImpl.getIbContext(transport)->createQp(); } @@ -134,13 +142,13 @@ void IBConnection::flush() void IBConnection::startSetup(std::shared_ptr bootstrap) { - bootstrap->send(&qp->getInfo(), sizeof(qp->getInfo()), remoteRank_, tag_); + bootstrap->send(&qp->getInfo(), sizeof(qp->getInfo()), remoteRank(), tag()); } void IBConnection::endSetup(std::shared_ptr bootstrap) { IbQpInfo qpInfo; - bootstrap->recv(&qpInfo, sizeof(qpInfo), remoteRank_, tag_); + bootstrap->recv(&qpInfo, sizeof(qpInfo), remoteRank(), tag()); qp->rtr(qpInfo); qp->rts(); } diff --git a/src/epoch.cc b/src/epoch.cc index f6c82731..7bcab9c8 100644 --- a/src/epoch.cc +++ b/src/epoch.cc @@ -1,27 +1,27 @@ #include "epoch.hpp" #include "checks.hpp" +#include "alloc.h" namespace mscclpp { -struct Epoch::Impl -{ - DeviceEpoch deviceEpoch; +Epoch::Epoch(Communicator& communicator, std::shared_ptr connection) : connection_(connection) { + MSCCLPPTHROW(mscclppCudaCalloc(&device_.epochIds_, 1)); + MSCCLPPTHROW(mscclppCudaCalloc(&device_.expectedInboundEpochId_, 1)); - Impl() - { - MSCCLPPTHROW(mscclppCudaCalloc(&deviceEpoch.localSignalEpochId, 1)); - MSCCLPPTHROW(mscclppCudaCalloc(&deviceEpoch.waitEpochId, 1)); - } + localEpochIdsRegMem_ = communicator.registerMemory(device_.epochIds_, sizeof(device_.epochIds_), connection->transport()); + communicator.bootstrapper()->send(localEpochIdsRegMem_.serialize(), connection->remoteRank(), connection->tag()); + std::vector serializedRemoteEpochIds; + communicator.bootstrapper()->recv(serializedRemoteEpochIds, connection->remoteRank(), connection->tag()); + remoteEpochIdsRegMem_ = RegisteredMemory::deserialize(serializedRemoteEpochIds); +} - ~Impl() - { - MSCCLPPTHROW(mscclppCudaFree(deviceEpoch.localSignalEpochId)); - MSCCLPPTHROW(mscclppCudaFree(deviceEpoch.waitEpochId)); - } -}; +Epoch::~Epoch() { + MSCCLPPTHROW(mscclppCudaFree(&device_.epochIds_)); + 
MSCCLPPTHROW(mscclppCudaFree(&device_.expectedInboundEpochId_)); +} -Epoch::Epoch() : pimpl(std::make_unique()) -{ +void Epoch::signal() { + connection_->write(localEpochIdsRegMem_, offsetof(EpochIds, outbound_), remoteEpochIdsRegMem_, offsetof(EpochIds, inboundReplica_), sizeof(device_.epochIds_)); } } // namespace mscclpp \ No newline at end of file diff --git a/src/host_connection.cc b/src/host_connection.cc deleted file mode 100644 index e33069e2..00000000 --- a/src/host_connection.cc +++ /dev/null @@ -1,96 +0,0 @@ -#include "host_connection.hpp" -#include "api.h" -#include "comm.h" -#include "communicator.hpp" -#include "mscclpp.h" -#include "mscclppfifo.h" - -namespace mscclpp { - -HostConnection::Impl::Impl(Communicator* comm, mscclppConn* conn) : comm(comm), conn(conn) -{ - this->hostConn = conn->hostConn; -} - -HostConnection::Impl::~Impl() -{ - // TODO: figure out memory ownership. Does this deallocate the mscclppHostConn? Likely not. -} - -MSCCLPP_API_CPP HostConnection::~HostConnection() = default; - -MSCCLPP_API_CPP HostConnection::HostConnection(std::unique_ptr p) : pimpl(std::move(p)) -{ -} - -MSCCLPP_API_CPP int HostConnection::getId() -{ - return pimpl->conn->connId; -} - -MSCCLPP_API_CPP BufferHandle HostConnection::registerBuffer(void* data, uint64_t size) -{ - BufferHandle result; - static_assert(sizeof(BufferHandle) == sizeof(mscclppBufferHandle_t)); - mscclppRegisterBufferForConnection(pimpl->comm->pimpl->comm, pimpl->conn->connId, data, size, - reinterpret_cast(&result)); - return result; -} - -MSCCLPP_API_CPP int HostConnection::numLocalBuffers() -{ - return pimpl->conn->bufferRegistrations.size() - 1; -} - -MSCCLPP_API_CPP BufferHandle HostConnection::getLocalBuffer(int index) -{ - return index + 1; -} - -MSCCLPP_API_CPP int HostConnection::numRemoteBuffers() -{ - return pimpl->conn->remoteBufferRegistrations.size() - 1; -} - -MSCCLPP_API_CPP BufferHandle HostConnection::getRemoteBuffer(int index) -{ - return index + 1; -} - 
-MSCCLPP_API_CPP ConnectionEpoch HostConnection::getEpoch() -{ - ConnectionEpoch epoch; - static_assert(sizeof(SignalEpochId) == sizeof(mscclppDevConnSignalEpochId)); - epoch.localSignalEpochId = reinterpret_cast(pimpl->conn->devConn->localSignalEpochId); - epoch.remoteSignalEpochId = reinterpret_cast(pimpl->conn->devConn->remoteSignalEpochId); - epoch.waitEpochId = pimpl->conn->devConn->waitEpochId; - return epoch; -} - -MSCCLPP_API_CPP DeviceProxyFifo HostConnection::getDeviceFifo() -{ - return pimpl->comm->pimpl->proxy.fifo().toDevice(); -} - -MSCCLPP_API_CPP void HostConnection::put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, - uint64_t size) -{ - pimpl->hostConn->put(dst, dstOffset, src, srcOffset, size); -} - -MSCCLPP_API_CPP void HostConnection::signal() -{ - pimpl->hostConn->signal(); -} - -MSCCLPP_API_CPP void HostConnection::flush() -{ - pimpl->hostConn->flush(); -} - -MSCCLPP_API_CPP void HostConnection::wait() -{ - pimpl->hostConn->wait(); -} - -} // namespace mscclpp \ No newline at end of file diff --git a/src/include/channel.hpp b/src/include/channel.hpp index 2303a57c..ace57661 100644 --- a/src/include/channel.hpp +++ b/src/include/channel.hpp @@ -4,26 +4,39 @@ #include "epoch.hpp" #include "mscclpp.hpp" #include "proxy.hpp" +#include "mscclppfifo.hpp" namespace mscclpp { +namespace channel { -// For every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER, a flush of the tail to device memory is triggered. -// As long as MSCCLPP_PROXY_FIFO_SIZE is large enough, having a stale tail is not a problem. 
-#define MSCCLPP_PROXY_FIFO_SIZE 128 -#define MSCCLPP_PROXY_FIFO_FLUSH_COUNTER 4 +// A Channel pairs a Connection with an Epoch +class Channel +{ +public: + Channel(std::shared_ptr connection) : connection_(connection), epoch_(std::make_shared()) {}; -using ChannelTriggerType = uint64_t; -const ChannelTriggerType channelTriggerData = 0x1; -const ChannelTriggerType channelTriggerFlag = 0x2; -const ChannelTriggerType channelTriggerSync = 0x4; + Connection& connection() { return *connection_; } + Epoch& epoch() { return *epoch_; } + +private: + std::shared_ptr connection_; + std::shared_ptr epoch_; +}; + +using ChannelId = uint32_t; + +using TriggerType = uint64_t; +const TriggerType TriggerData = 0x1; +const TriggerType TriggerFlag = 0x2; +const TriggerType TriggerSync = 0x4; // This is just a numeric ID. Each HostConnection will have an internal array indexed by these handles // mapping to the actual -using BufferHandle = uint32_t; +using MemoryId = uint32_t; #define MSCCLPP_BITS_SIZE 32 #define MSCCLPP_BITS_OFFSET 32 -#define MSCCLPP_BITS_BUFFER_HANDLE 8 +#define MSCCLPP_BITS_REGMEM_HANDLE 8 #define MSCCLPP_BITS_TYPE 3 #define MSCCLPP_BITS_CONNID 10 @@ -39,11 +52,11 @@ union ChannelTrigger { uint64_t : (64 - MSCCLPP_BITS_SIZE - MSCCLPP_BITS_OFFSET); // ensure 64-bit alignment // second 64 bits: value[1] uint64_t dstOffset : MSCCLPP_BITS_OFFSET; - uint64_t srcBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; - uint64_t dstBufferHandle : MSCCLPP_BITS_BUFFER_HANDLE; + uint64_t srcMemoryId : MSCCLPP_BITS_REGMEM_HANDLE; + uint64_t dstMemoryId : MSCCLPP_BITS_REGMEM_HANDLE; uint64_t type : MSCCLPP_BITS_TYPE; - uint64_t connId : MSCCLPP_BITS_CONNID; - uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_BUFFER_HANDLE - MSCCLPP_BITS_BUFFER_HANDLE - + uint64_t chanId : MSCCLPP_BITS_CONNID; + uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_REGMEM_HANDLE - MSCCLPP_BITS_REGMEM_HANDLE - MSCCLPP_BITS_TYPE); // ensure 64-bit alignment } fields; @@ -54,12 +67,12 @@ union ChannelTrigger 
{ __device__ ChannelTrigger(ProxyTrigger value) : value(value) { } - __device__ ChannelTrigger(ChannelTriggerType type, BufferHandle dst, uint64_t dstOffset, BufferHandle src, + __device__ ChannelTrigger(TriggerType type, MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset, uint64_t size, int connectionId) { value.fst = ((srcOffset << MSCCLPP_BITS_SIZE) + size); - value.snd = ((((((((connectionId << MSCCLPP_BITS_TYPE) + (uint64_t)type) << MSCCLPP_BITS_BUFFER_HANDLE) + dst) - << MSCCLPP_BITS_BUFFER_HANDLE) + + value.snd = ((((((((connectionId << MSCCLPP_BITS_TYPE) + (uint64_t)type) << MSCCLPP_BITS_REGMEM_HANDLE) + dst) + << MSCCLPP_BITS_REGMEM_HANDLE) + src) << MSCCLPP_BITS_OFFSET) + dstOffset); @@ -67,114 +80,24 @@ union ChannelTrigger { #endif // __CUDACC__ }; -struct ConnectionEpoch +struct DeviceChannel { -#ifdef __CUDACC__ - __forceinline__ __device__ void wait() - { - (*waitEpochId) += 1; - while (*(volatile uint64_t*)&(localSignalEpochId->proxy) < (*waitEpochId)) - ; - } + DeviceChannel() = default; - __forceinline__ __device__ void epochIncrement() - { - *(volatile uint64_t*)&(localSignalEpochId->device) += 1; - } -#endif // __CUDACC__ + DeviceChannel(ChannelId channelId, DeviceEpoch epoch, DeviceProxyFifo fifo) : channelId_(channelId), epoch_(epoch), fifo_(fifo) {} - SignalEpochId* localSignalEpochId; - // used by the signal() function directly from gpu - SignalEpochId* remoteSignalEpochId; + DeviceChannel(const DeviceChannel& other) = default; - // every wait(), increments this and then the gpu waits for either: - // 1) localSignalEpochId->proxy to be >= this in case of a proxy thread - // 2) remoteSignalEpochId->device to be >= this in case of a gpu thread - uint64_t* waitEpochId; -}; - -class HostConnection -{ - struct Impl; - -public: - /* HostConnection can not be constructed from user code and must instead be created through Communicator::connect */ - HostConnection(std::unique_ptr); - - ~HostConnection(); - - void write(); - - int 
getId(); - - /* Get the number of times registerBuffer(...) was called. - * - * Returns: the number of buffers registered - */ - int numLocalBuffers(); - - /* Get the BufferHandle returned by a call to registerBuffer(...) as identified by the index - * - * Inputs: - * index: the index of the handle to get - * - * Returns: a handle to the buffer - */ - BufferHandle getLocalBuffer(int index); - - /* Get the number of times registerBuffer(...) was called on the remote peer. - * - * Returns: the number of buffers registered on the remote peer - */ - int numRemoteBuffers(); - - /* Get the BufferHandle returned by a call to registerBuffer(...) on the remote peer as identified by the index - * - * Inputs: - * index: the index of the handle to get - * - * Returns: a handle to the buffer on the remote peer - */ - BufferHandle getRemoteBuffer(int index); - - ConnectionEpoch getEpoch(); - - DeviceProxyFifo getDeviceFifo(); - - void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, uint64_t size); - - void signal(); - - void flush(); - - void wait(); - -private: - std::unique_ptr pimpl; - friend class Communicator; -}; - -struct DeviceConnection -{ - DeviceConnection() = default; - - DeviceConnection(HostConnection& hostConn) - : connectionId(hostConn.getId()), epoch(hostConn.getEpoch()), fifo(hostConn.getDeviceFifo()) - { - } - - DeviceConnection(const DeviceConnection& other) = default; - - DeviceConnection& operator=(DeviceConnection& other) = default; + DeviceChannel& operator=(DeviceChannel& other) = default; #ifdef __CUDACC__ - __forceinline__ __device__ void put(BufferHandle dst, uint64_t dstOffset, BufferHandle src, uint64_t srcOffset, + __forceinline__ __device__ void put(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset, uint64_t size) { - fifo.push(ChannelTrigger(channelTriggerData, dst, dstOffset, src, srcOffset, size, connectionId).value); + fifo_.push(ChannelTrigger(TriggerData, dst, dstOffset, src, srcOffset, size, 
channelId_).value); } - __forceinline__ __device__ void put(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) + __forceinline__ __device__ void put(MemoryId dst, MemoryId src, uint64_t offset, uint64_t size) { put(dst, offset, src, offset, size); } @@ -182,36 +105,36 @@ struct DeviceConnection __forceinline__ __device__ void signal() { epochIncrement(); - fifo.push(ChannelTrigger(channelTriggerFlag, 0, 0, 0, 0, 1, connectionId).value); + fifo_.push(ChannelTrigger(TriggerFlag, 0, 0, 0, 0, 1, channelId_).value); } - __forceinline__ __device__ void putWithSignal(BufferHandle dst, uint64_t dstOffset, BufferHandle src, + __forceinline__ __device__ void putWithSignal(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset, uint64_t size) { epochIncrement(); - fifo.push( - ChannelTrigger(channelTriggerData | channelTriggerFlag, dst, dstOffset, src, srcOffset, size, connectionId) + fifo_.push( + ChannelTrigger(TriggerData | TriggerFlag, dst, dstOffset, src, srcOffset, size, channelId_) .value); } - __forceinline__ __device__ void putWithSignal(BufferHandle dst, BufferHandle src, uint64_t offset, uint64_t size) + __forceinline__ __device__ void putWithSignal(MemoryId dst, MemoryId src, uint64_t offset, uint64_t size) { putWithSignal(dst, offset, src, offset, size); } - __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, uint64_t dstOffset, BufferHandle src, + __forceinline__ __device__ void putWithSignalAndFlush(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset, uint64_t size) { epochIncrement(); - uint64_t curFifoHead = fifo.push(ChannelTrigger(channelTriggerData | channelTriggerFlag | channelTriggerSync, dst, - dstOffset, src, srcOffset, size, connectionId) + uint64_t curFifoHead = fifo_.push(ChannelTrigger(TriggerData | TriggerFlag | TriggerSync, dst, + dstOffset, src, srcOffset, size, channelId_) .value); - while (*(volatile uint64_t*)&fifo.triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && - 
*(volatile uint64_t*)fifo.tailReplica <= curFifoHead) + while (*(volatile uint64_t*)&fifo_.triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && + *(volatile uint64_t*)fifo_.tailReplica <= curFifoHead) ; } - __forceinline__ __device__ void putWithSignalAndFlush(BufferHandle dst, BufferHandle src, uint64_t offset, + __forceinline__ __device__ void putWithSignalAndFlush(MemoryId dst, MemoryId src, uint64_t offset, uint64_t size) { putWithSignalAndFlush(dst, offset, src, offset, size); @@ -219,53 +142,103 @@ struct DeviceConnection __forceinline__ __device__ void flush() { - uint64_t curFifoHead = fifo.push(ChannelTrigger(mscclppSync, 0, 0, 0, 0, 1, connectionId).value); + uint64_t curFifoHead = fifo_.push(ChannelTrigger(mscclppSync, 0, 0, 0, 0, 1, channelId_).value); // we need to wait for two conditions to be met to ensure the CPU is done flushing. (1) wait for the tail // to go pass by curFifoHead (this is safety net) and (2) wait for the work element value to change to 0. - while (*(volatile uint64_t*)&fifo.triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && - *(volatile uint64_t*)fifo.tailReplica <= curFifoHead) + while (*(volatile uint64_t*)&fifo_.triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && + *(volatile uint64_t*)fifo_.tailReplica <= curFifoHead) ; } __forceinline__ __device__ void wait() { - epoch.wait(); + epoch_.wait(); } __forceinline__ __device__ void epochIncrement() { - epoch.epochIncrement(); + epoch_.epochIncrement(); } #endif // __CUDACC__ - int connectionId; + ChannelId channelId_; - ConnectionEpoch epoch; + DeviceEpoch epoch_; // this is a concurrent fifo which is multiple threads from the device // can produce for and the sole proxy thread consumes it. 
- DeviceProxyFifo fifo; + DeviceProxyFifo fifo_; }; -struct SimpleDeviceConnection -{ - SimpleDeviceConnection() = default; +class DeviceChannelService; - SimpleDeviceConnection(HostConnection& hostConn) : devConn(hostConn) - { - dst = hostConn.getRemoteBuffer(0); - src = hostConn.getLocalBuffer(0); +inline ProxyHandler makeChannelProxyHandler(DeviceChannelService& channelService); + +class DeviceChannelService { +public: + DeviceChannelService() : proxy_([&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }) {} + + ChannelId addChannel(std::shared_ptr connection) { + channels_.push_back(Channel(connection)); + return channels_.size() - 1; } - SimpleDeviceConnection(const SimpleDeviceConnection& other) = default; + MemoryId addMemory(RegisteredMemory memory) { + memories_.push_back(memory); + return memories_.size() - 1; + } - SimpleDeviceConnection& operator=(SimpleDeviceConnection& other) = default; + Channel channel(ChannelId id) { return channels_[id]; } + DeviceChannel deviceChannel(ChannelId id) { return DeviceChannel(id, channels_[id].epoch().deviceEpoch(), proxy_.fifo().deviceFifo()); } + + void startProxy() { proxy_.start(); } + void stopProxy() { proxy_.stop(); } + +private: + std::vector channels_; + std::vector memories_; + Proxy proxy_; + + ProxyHandlerResult handleTrigger(ProxyTrigger triggerRaw) { + ChannelTrigger* trigger = reinterpret_cast(&triggerRaw); + Channel& channel = channels_[trigger->fields.chanId]; + + auto result = ProxyHandlerResult::Continue; + + if (trigger->fields.type & TriggerData) { + RegisteredMemory& dst = memories_[trigger->fields.dstMemoryId]; + RegisteredMemory& src = memories_[trigger->fields.srcMemoryId]; + channel.connection().write(dst, trigger->fields.dstOffset, src, trigger->fields.srcOffset, trigger->fields.size); + } + + if (trigger->fields.type & TriggerFlag) { + channel.epoch().signal(); + } + + if (trigger->fields.type & TriggerSync) { + channel.connection().flush(); + result = 
ProxyHandlerResult::FlushFifoTailAndContinue; + } + + return result; + } +}; + +struct SimpleDeviceChannel +{ + SimpleDeviceChannel() = default; + + SimpleDeviceChannel(DeviceChannel devChan, MemoryId dst, MemoryId src) : devChan_(devChan), dst_(dst), src_(src) {} + + SimpleDeviceChannel(const SimpleDeviceChannel& other) = default; + + SimpleDeviceChannel& operator=(SimpleDeviceChannel& other) = default; #ifdef __CUDACC__ __forceinline__ __device__ void put(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) { - devConn.put(dst, dstOffset, src, srcOffset, size); + devChan_.put(dst_, dstOffset, src_, srcOffset, size); } __forceinline__ __device__ void put(uint64_t offset, uint64_t size) @@ -275,12 +248,12 @@ struct SimpleDeviceConnection __forceinline__ __device__ void signal() { - devConn.signal(); + devChan_.signal(); } __forceinline__ __device__ void putWithSignal(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) { - devConn.putWithSignal(dst, dstOffset, src, srcOffset, size); + devChan_.putWithSignal(dst_, dstOffset, src_, srcOffset, size); } __forceinline__ __device__ void putWithSignal(uint64_t offset, uint64_t size) @@ -290,7 +263,7 @@ struct SimpleDeviceConnection __forceinline__ __device__ void putWithSignalAndFlush(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) { - devConn.putWithSignalAndFlush(dst, dstOffset, src, srcOffset, size); + devChan_.putWithSignalAndFlush(dst_, dstOffset, src_, srcOffset, size); } __forceinline__ __device__ void putWithSignalAndFlush(uint64_t offset, uint64_t size) @@ -300,26 +273,27 @@ struct SimpleDeviceConnection __forceinline__ __device__ void flush() { - devConn.flush(); + devChan_.flush(); } __forceinline__ __device__ void wait() { - devConn.wait(); + devChan_.wait(); } __forceinline__ __device__ void epochIncrement() { - devConn.epochIncrement(); + devChan_.epochIncrement(); } #endif // __CUDACC__ - DeviceConnection devConn; - BufferHandle dst; - BufferHandle src; + DeviceChannel devChan_; + MemoryId dst_; 
+ MemoryId src_; }; +} // namespace channel } // namespace mscclpp #endif // MSCCLPP_CHANNEL_HPP_ diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index 25fface7..b9b28f89 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -1,7 +1,6 @@ #ifndef MSCCL_COMMUNICATOR_HPP_ #define MSCCL_COMMUNICATOR_HPP_ -#include "channel.hpp" #include "ib.hpp" #include "mscclpp.h" #include "mscclpp.hpp" diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 42ca6d47..b28b5890 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -12,7 +12,14 @@ namespace mscclpp { class ConnectionBase : public Connection { + int remoteRank_; + int tag_; public: + ConnectionBase(int remoteRank, int tag); + + int remoteRank() override; + int tag() override; + virtual void startSetup(std::shared_ptr){}; virtual void endSetup(std::shared_ptr){}; }; @@ -22,7 +29,7 @@ class CudaIpcConnection : public ConnectionBase cudaStream_t stream; public: - CudaIpcConnection(); + CudaIpcConnection(int remoteRank, int tag); ~CudaIpcConnection(); @@ -38,8 +45,6 @@ public: class IBConnection : public ConnectionBase { - int remoteRank_; - int tag_; Transport transport_; Transport remoteTransport_; IbQp* qp; diff --git a/src/include/epoch.hpp b/src/include/epoch.hpp index fd25b51f..2c6e3296 100644 --- a/src/include/epoch.hpp +++ b/src/include/epoch.hpp @@ -5,14 +5,10 @@ namespace mscclpp { -struct alignas(16) SignalEpochId +struct alignas(16) EpochIds { - // every signal(), increaments this and either: - // 1) proxy thread pushes it to the remote peer's localSignalEpochId->proxy - // 2) gpu thread directly writes it to remoteSignalEpochId->device - uint64_t device; - // signal() function triggers the cpu proxy thread to write to it - uint64_t proxy; + uint64_t outbound_; + uint64_t inboundReplica_; }; struct DeviceEpoch @@ -20,34 +16,36 @@ struct DeviceEpoch #ifdef __CUDACC__ __forceinline__ __device__ void wait() { - 
(*waitEpochId) += 1; - while (*(volatile uint64_t*)&(localSignalEpochId->proxy) < (*waitEpochId)) - ; + (*expectedInboundEpochId_) += 1; + while (*(volatile uint64_t*)&(epochIds_->inboundReplica_) < (*expectedInboundEpochId_)); } __forceinline__ __device__ void epochIncrement() { - *(volatile uint64_t*)&(localSignalEpochId->device) += 1; + *(volatile uint64_t*)&(epochIds_->outbound_) += 1; } #endif // __CUDACC__ - SignalEpochId* localSignalEpochId; - SignalEpochId* remoteSignalEpochId; - uint64_t* waitEpochId; + EpochIds* epochIds_; + uint64_t* expectedInboundEpochId_; }; class Epoch { - struct Impl; - std::unique_ptr pimpl; + std::shared_ptr connection_; + DeviceEpoch device_; + RegisteredMemory localEpochIdsRegMem_; + RegisteredMemory remoteEpochIdsRegMem_; public: - Epoch(); + Epoch(Communicator& communicator, std::shared_ptr connection); ~Epoch(); void signal(); - DeviceEpoch& getDeviceEpoch(); + DeviceEpoch deviceEpoch() { + return device_; + } }; } // namespace mscclpp diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index 8a85ebc6..fde63180 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -30,6 +30,20 @@ public: virtual void recv(void* data, int size, int peer, int tag) = 0; virtual void allGather(void* allData, int size) = 0; virtual void barrier() = 0; + + // TODO: move implementations of these helpers out of this header + void send(const std::vector& data, int peer, int tag) + { + send((void*)data.size(), sizeof(size_t), peer, tag); + send((void*)data.data(), data.size(), peer, tag); + } + void recv(std::vector& data, int peer, int tag) + { + size_t size; + recv((void*)&size, sizeof(size_t), peer, tag); + data.resize(size); + recv((void*)data.data(), data.size(), peer, tag); + } }; class Bootstrap : public BaseBootstrap @@ -223,9 +237,11 @@ class Connection; class RegisteredMemory { struct Impl; + // A shared_ptr is used since RegisteredMemory is functionally immutable, although internally some state is populated lazily. 
std::shared_ptr pimpl; public: + RegisteredMemory() = default; RegisteredMemory(std::shared_ptr pimpl); ~RegisteredMemory(); @@ -249,6 +265,10 @@ public: virtual void flush() = 0; + virtual int remoteRank() = 0; + + virtual int tag() = 0; + virtual Transport transport() = 0; virtual Transport remoteTransport() = 0; @@ -269,16 +289,8 @@ public: ~Communicator(); - /* Ring-based AllGather through the bootstrap socket. - * - * Inputs: - * data: data array to be gathered where `[r*size, (r+1)*size)` is the data for rank `r` - * size: data size per rank - */ - void bootstrapAllGather(void* data, int size); - - /* A no-op function that is used to synchronize all processes via a bootstrap allgather*/ - void bootstrapBarrier(); + /* Return the bootstrapper held by this communicator. */ + std::shared_ptr bootstrapper(); /* Register a region of GPU memory for use in this communicator. * diff --git a/src/include/mscclppfifo.hpp b/src/include/mscclppfifo.hpp index 7e2820b0..c13e4fb8 100644 --- a/src/include/mscclppfifo.hpp +++ b/src/include/mscclppfifo.hpp @@ -7,6 +7,11 @@ namespace mscclpp { +// For every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER, a flush of the tail to device memory is triggered. +// As long as MSCCLPP_PROXY_FIFO_SIZE is large enough, having a stale tail is not a problem. 
+#define MSCCLPP_PROXY_FIFO_SIZE 128 +#define MSCCLPP_PROXY_FIFO_FLUSH_COUNTER 4 + struct alignas(16) ProxyTrigger { uint64_t fst, snd; @@ -60,7 +65,7 @@ public: void flushTail(bool sync = false); - DeviceProxyFifo toDevice(); + DeviceProxyFifo deviceFifo(); private: struct Impl; diff --git a/src/include/proxy.hpp b/src/include/proxy.hpp index ac4116b3..f913beac 100644 --- a/src/include/proxy.hpp +++ b/src/include/proxy.hpp @@ -1,7 +1,7 @@ #ifndef MSCCLPP_PROXY_HPP_ #define MSCCLPP_PROXY_HPP_ -#include +#include "mscclppfifo.hpp" #include #include diff --git a/tests/allgather_test_cpp.cu b/tests/allgather_test_cpp.cu index 908a24f4..8fb54733 100644 --- a/tests/allgather_test_cpp.cu +++ b/tests/allgather_test_cpp.cu @@ -1,5 +1,6 @@ #include "mscclpp.h" #include "mscclpp.hpp" +#include "channel.hpp" #ifdef MSCCLPP_USE_MPI_FOR_TESTS #include "mpi.h" @@ -48,9 +49,9 @@ static double getTime(void) return (tspec.tv_nsec / 1.0e9) + tspec.tv_sec; } -__constant__ mscclpp::SimpleDeviceConnection constDevConns[16]; +__constant__ mscclpp::channel::SimpleDeviceConnection constDevConns[16]; -__device__ void allgather0(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int remoteRank, +__device__ void allgather0(mscclpp::channel::SimpleDeviceConnection devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU) { // this allgather is really simple and implemented as an alltoall @@ -70,7 +71,7 @@ __device__ void allgather0(mscclpp::SimpleDeviceConnection devConn, int rank, in devConn.wait(); } -__device__ void localAllGather(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, +__device__ void localAllGather(mscclpp::channel::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, int remoteRank, uint64_t offset, uint64_t size) { // this allgather algorithm works as follows: @@ -94,14 +95,14 @@ __device__ void localAllGather(mscclpp::SimpleDeviceConnection devConn, int rank } } -__device__ void 
allgather1(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, +__device__ void allgather1(mscclpp::channel::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, int remoteRank, size_t nelemsPerGPU) { localAllGather(devConn, rank, world_size, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); } -__device__ void allgather2(mscclpp::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, +__device__ void allgather2(mscclpp::channel::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, int remoteRank, size_t nelemsPerGPU) { // this allgather is a pipelined and hierarchical one and only works for two nodes @@ -170,7 +171,7 @@ __global__ void kernel(int rank, int world_size, int nranksPerNode, size_t nelem int warpId = threadIdx.x / 32; int remoteRank = (warpId < rank) ? warpId : warpId + 1; // Each warp is responsible for one of the remote ranks - mscclpp::SimpleDeviceConnection devConn = constDevConns[warpId]; + mscclpp::channel::SimpleDeviceConnection devConn = constDevConns[warpId]; if (kernel == 0) allgather0(devConn, rank, world_size, remoteRank, nelemsPerGPU); @@ -222,21 +223,24 @@ void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& co int thisNode = rankToNode(rank); int cudaNum = rankToLocalRank(rank); std::string ibDevStr = "mlx5_ib" + std::to_string(cudaNum); - std::vector> hostConns; + mscclpp::Transport ibTransport = mscclpp::getIBTransportByDeviceName(ibDevStr); + mscclpp::channel::DeviceChannelService channelService; for (int r = 0; r < world_size; ++r) { if (r == rank) continue; - mscclpp::TransportType transportType; + mscclpp::Transport transport; const char* ibDev = ibDevStr.c_str(); if (rankToNode(r) == thisNode) { ibDev = NULL; - transportType = mscclpp::TransportType::P2P; + transportType = mscclpp::Transport::CudaIpc; } else { - transportType = mscclpp::TransportType::IB; + 
transportType = ibTransport; } // Connect with all other ranks - auto hostConn = comm.connect(r, 0, transportType, ibDev); + auto conn = comm.connect(r, 0, transportType); + channelService.addChannel(conn); + // TODO: WIP hostConn->registerBuffer(data_d, dataSize); hostConns.push_back(hostConn); } From 7d1f038181cc5f31c4b4610a8733c5b127c7a290 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sat, 29 Apr 2023 05:16:33 +0000 Subject: [PATCH 085/135] fixes for ib send/recv tests --- src/connection.cc | 4 +++- tests/communicator_test_cpp.cc | 30 ++++++++++++++++++++---------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/connection.cc b/src/connection.cc index 75a6ba79..439916eb 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -103,7 +103,7 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem auto srcMr = srcTransportInfo.ibMr; qp->stageSend(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, - /*signaled=*/false); + /*signaled=*/true); qp->postSend(); // npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_DATA_ENTRY, (uint32_t)size); } @@ -135,12 +135,14 @@ void IBConnection::flush() void IBConnection::startSetup(std::shared_ptr bootstrap) { bootstrap->send(&qp->getInfo(), sizeof(qp->getInfo()), remoteRank_, tag_); + bootstrap->send(&transport_, sizeof(transport_), remoteRank_, tag_); } void IBConnection::endSetup(std::shared_ptr bootstrap) { IbQpInfo qpInfo; bootstrap->recv(&qpInfo, sizeof(qpInfo), remoteRank_, tag_); + bootstrap->recv(&remoteTransport_, sizeof(remoteTransport_), remoteRank_, tag_); qp->rtr(qpInfo); qp->rts(); } diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc index c1e812cd..c4db0cf8 100644 --- a/tests/communicator_test_cpp.cc +++ b/tests/communicator_test_cpp.cc @@ -79,7 +79,7 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) } } - MPI_Barrier(MPI_COMM_WORLD); + bootstrap->barrier(); if 
(bootstrap->getRank() == 0) std::cout << "Memory registration passed" << std::endl; @@ -93,24 +93,34 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) } CUDATHROW(cudaMemcpy(devicePtr, hostBuffer.get(), size, cudaMemcpyHostToDevice)); - MPI_Barrier(MPI_COMM_WORLD); + bootstrap->barrier(); for (int i = 0; i < worldSize; i++) { if (i != rank) { auto& conn = connections.at(i); auto& peerMemory = registeredMemories.at(i); // printf("write to rank: %d, rank is %d\n", peerMemory.rank(), rank); conn->write(peerMemory, rank * writeSize, registeredMemory, rank * writeSize, writeSize); + conn->flush(); } } - CUDATHROW(cudaDeviceSynchronize()); - MPI_Barrier(MPI_COMM_WORLD); - CUDATHROW(cudaMemcpy(hostBuffer.get(), devicePtr, size, cudaMemcpyDeviceToHost)); - size_t dataPerRank = writeSize / sizeof(int); - for (int i = 0; i < dataCount; i++) { - if (hostBuffer[i] != i / dataPerRank) { - throw std::runtime_error("Data mismatch, connection write failed"); + bootstrap->barrier(); + // polling until it becomes ready + bool ready = false; + int niter = 0; + do { + ready = true; + CUDATHROW(cudaMemcpy(hostBuffer.get(), devicePtr, size, cudaMemcpyDeviceToHost)); + size_t dataPerRank = writeSize / sizeof(int); + for (int i = 0; i < dataCount; i++) { + if (hostBuffer[i] != i / dataPerRank) { + ready = false; + } } - } + if (niter == 10000){ + throw std::runtime_error("Polling is stuck."); + } + niter++; + } while (!ready); if (bootstrap->getRank() == 0) std::cout << "Connection write passed" << std::endl; From 88426ad83a33e165894a1265bb59c4c121a1f5b3 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 1 May 2023 21:07:12 +0000 Subject: [PATCH 086/135] bug fix for ib memory registeration --- src/connection.cc | 2 +- src/include/registered_memory.hpp | 6 ++++-- src/registered_memory.cc | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/connection.cc b/src/connection.cc index 5d9f508a..5289ab59 100644 --- a/src/connection.cc +++ 
b/src/connection.cc @@ -102,7 +102,7 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem if (dstTransportInfo.ibLocal) { throw std::runtime_error("dst is local, which is not supported"); } - auto srcTransportInfo = getRegisteredMemoryImpl(src)->getTransportInfo(remoteTransport()); + auto srcTransportInfo = getRegisteredMemoryImpl(src)->getTransportInfo(transport()); if (!srcTransportInfo.ibLocal) { throw std::runtime_error("src is remote, which is not supported"); } diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index 88c1005d..e95507f1 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -17,8 +17,10 @@ struct TransportInfo bool ibLocal; union { cudaIpcMemHandle_t cudaIpcHandle; - const IbMr* ibMr; - IbMrInfo ibMrInfo; + struct { + const IbMr* ibMr; + IbMrInfo ibMrInfo; + }; }; }; diff --git a/src/registered_memory.cc b/src/registered_memory.cc index e298aee5..1215c0e2 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -28,6 +28,7 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags t const IbMr* mr = commImpl.getIbContext(ibTransport)->registerMr(data, size); transportInfo.ibMr = mr; transportInfo.ibLocal = true; + transportInfo.ibMrInfo = mr->getInfo(); this->transportInfos.push_back(transportInfo); }; if (transports.has(Transport::IB0)) From 8a5a7873e05b150f659386cc86874141a5e73ab1 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 1 May 2023 21:40:18 +0000 Subject: [PATCH 087/135] test bug fix --- tests/communicator_test_cpp.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc index c4db0cf8..78bffaac 100644 --- a/tests/communicator_test_cpp.cc +++ b/tests/communicator_test_cpp.cc @@ -92,6 +92,7 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) hostBuffer[i] = rank; } CUDATHROW(cudaMemcpy(devicePtr, hostBuffer.get(), 
size, cudaMemcpyHostToDevice)); + CUDATHROW(cudaDeviceSynchronize()); bootstrap->barrier(); for (int i = 0; i < worldSize; i++) { @@ -122,6 +123,7 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) niter++; } while (!ready); + bootstrap->barrier(); if (bootstrap->getRank() == 0) std::cout << "Connection write passed" << std::endl; From 5b7e76cae41f6d3eeb58a5eed4bbd80120efa4b6 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 1 May 2023 22:25:14 +0000 Subject: [PATCH 088/135] all tests are passing with memory registeration --- src/connection.cc | 19 +++++++++++++++---- src/registered_memory.cc | 1 + 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/connection.cc b/src/connection.cc index 5289ab59..2cfa7205 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -1,3 +1,4 @@ +#include #include "connection.hpp" #include "checks.hpp" #include "infiniband/verbs.h" @@ -142,15 +143,25 @@ void IBConnection::flush() void IBConnection::startSetup(std::shared_ptr bootstrap) { - bootstrap->send(&qp->getInfo(), sizeof(qp->getInfo()), remoteRank(), tag()); - bootstrap->send(&transport_, sizeof(transport_), remoteRank(), tag()); + std::vector ibQpTransport; + std::copy_n(reinterpret_cast(&qp->getInfo()), sizeof(qp->getInfo()), std::back_inserter(ibQpTransport)); + std::copy_n(reinterpret_cast(&transport_), sizeof(transport_), std::back_inserter(ibQpTransport)); + + bootstrap->send(ibQpTransport.data(), ibQpTransport.size(), remoteRank(), tag()); } void IBConnection::endSetup(std::shared_ptr bootstrap) { + std::vector ibQpTransport(sizeof(IbQpInfo) + sizeof(Transport)); + bootstrap->recv(ibQpTransport.data(), ibQpTransport.size(), remoteRank(), tag()); + IbQpInfo qpInfo; - bootstrap->recv(&qpInfo, sizeof(qpInfo), remoteRank(), tag()); - bootstrap->recv(&remoteTransport_, sizeof(remoteTransport_), remoteRank(), tag()); + auto it = ibQpTransport.begin(); + std::copy_n(it, sizeof(qpInfo), reinterpret_cast(&qpInfo)); + it += sizeof(qpInfo); + 
std::copy_n(it, sizeof(remoteTransport_), reinterpret_cast(&remoteTransport_)); + it += sizeof(qpInfo); + qp->rtr(qpInfo); qp->rts(); } diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 1215c0e2..abf17a8b 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -30,6 +30,7 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags t transportInfo.ibLocal = true; transportInfo.ibMrInfo = mr->getInfo(); this->transportInfos.push_back(transportInfo); + INFO(MSCCLPP_NET, "IB mr for address %p with size %ld is registered", data, size); }; if (transports.has(Transport::IB0)) addIb(Transport::IB0); From 961f5b38ddf1cfe5eebfef40c4d5b81defb6daa4 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 2 May 2023 00:44:13 +0000 Subject: [PATCH 089/135] more debbuging info + testing 1000 memory registerations --- src/connection.cc | 3 + tests/communicator_test_cpp.cc | 177 ++++++++++++++++++++------------- 2 files changed, 109 insertions(+), 71 deletions(-) diff --git a/src/connection.cc b/src/connection.cc index 2cfa7205..e0c52419 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -61,6 +61,8 @@ void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, Register char* srcPtr = (char*)src.data(); CUDATHROW(cudaMemcpyAsync(dstPtr + dstOffset, srcPtr + srcOffset, size, cudaMemcpyDeviceToDevice, stream)); + INFO(MSCCLPP_P2P, "CudaIpcConnection write: from %p to %p, size %lu", srcPtr + srcOffset, dstPtr + dstOffset, size); + // npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)size); } @@ -114,6 +116,7 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem qp->stageSend(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, /*signaled=*/true); qp->postSend(); + INFO(MSCCLPP_NET, "IBConnection write: from %p to %p, size %lu", (uint8_t*)srcMr->getBuff() + srcOffset, (uint8_t*)dstMrInfo.addr + dstOffset, size); // 
npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_DATA_ENTRY, (uint32_t)size); } diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc index 78bffaac..6f7aa3e1 100644 --- a/tests/communicator_test_cpp.cc +++ b/tests/communicator_test_cpp.cc @@ -23,6 +23,55 @@ mscclpp::Transport findIb(int localRank) return IBs[localRank]; } +void register_all_memories(std::unique_ptr& communicator, int rank, int worldSize, void* devicePtr, size_t deviceBufferSize, mscclpp::Transport myIbDevice, mscclpp::RegisteredMemory& localMemory, std::unordered_map& remoteMemory){ + localMemory = communicator->registerMemory(devicePtr, deviceBufferSize, mscclpp::Transport::CudaIpc | myIbDevice); + int serializedSize = 0; + for (int i = 0; i < worldSize; i++) { + if (i != rank){ + auto serialized = localMemory.serialize(); + serializedSize = serialized.size(); + communicator->bootstrapper()->send(serialized.data(), serializedSize, i, 0); + } + } + if (serializedSize == 0) { + throw std::runtime_error("Serialized size should have been set to a non-zero value."); + } + for (int i = 0; i < worldSize; i++) { + if (i != rank){ + std::vector deserialized(serializedSize); + communicator->bootstrapper()->recv(deserialized.data(), serializedSize, i, 0); + auto remote = mscclpp::RegisteredMemory::deserialize(deserialized); + remoteMemory[i] = remote; + } + } +} + +void make_connections(std::unique_ptr& communicator, int rank, int worldSize, int nRanksPerNode, mscclpp::Transport myIbDevice, std::unordered_map>& connections){ + for (int i = 0; i < worldSize; i++) { + if (i != rank){ + if (i / nRanksPerNode == rank / nRanksPerNode) { + connections[i] = communicator->connect(i, 0, mscclpp::Transport::CudaIpc); + } else { + connections[i] = communicator->connect(i, 0, myIbDevice); + } + } + } + communicator->connectionSetup(); +} + +void write_remote(int rank, int worldSize, std::unordered_map>& connections, std::unordered_map& remoteRegisteredMemories, mscclpp::RegisteredMemory& 
registeredMemory, int writeSize){ + for (int i = 0; i < worldSize; i++) { + if (i != rank) { + auto& conn = connections.at(i); + auto& peerMemory = remoteRegisteredMemories.at(i); + // printf("write to rank: %d, rank is %d\n", peerMemory.rank(), rank); + conn->write(peerMemory, rank * writeSize, registeredMemory, rank * writeSize, writeSize); + conn->flush(); + } + } + +} + void test_communicator(int rank, int worldSize, int nranksPerNode) { auto bootstrap = std::make_shared(rank, worldSize); @@ -32,104 +81,90 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); bootstrap->initialize(id); - auto communicator = std::make_shared(bootstrap); + auto communicator = std::make_unique(bootstrap); if (bootstrap->getRank() == 0) std::cout << "Communicator initialization passed" << std::endl; std::unordered_map> connections; auto myIbDevice = findIb(rank % nranksPerNode); - for (int i = 0; i < worldSize; i++) { - if (i != rank) { - std::shared_ptr conn; - if (i / nranksPerNode == rank / nranksPerNode) { - conn = communicator->connect(i, 0, mscclpp::Transport::CudaIpc); - } else { - conn = communicator->connect(i, 0, myIbDevice); - } - connections[i] = conn; - } - } - communicator->connectionSetup(); + make_connections(communicator, rank, worldSize, nranksPerNode, myIbDevice, connections); if (bootstrap->getRank() == 0) std::cout << "Connection setup passed" << std::endl; - int* devicePtr; - int size = 1024; - CUDATHROW(cudaMalloc(&devicePtr, size)); - auto registeredMemory = communicator->registerMemory(devicePtr, size, mscclpp::Transport::CudaIpc | myIbDevice); + int numBuffers = 1000; + std::vector devicePtr(numBuffers); + int deviceBufferSize = 1024*1024; + + std::vector localMemory(numBuffers); + std::vector> remoteMemory(numBuffers); - for (int i = 0; i < worldSize; i++) { - if (i != rank){ - auto serialized = registeredMemory.serialize(); - int serializedSize = serialized.size(); - 
bootstrap->send(&serializedSize, sizeof(int), i, 0); - bootstrap->send(serialized.data(), serializedSize, i, 1); - } + for (int n = 0; n < numBuffers; n++) { + if (n % 100 == 0) + std::cout << "Registering memory for " << std::to_string(n) << " buffers" << std::endl; + CUDATHROW(cudaMalloc(&devicePtr[n], deviceBufferSize)); + register_all_memories(communicator, rank, worldSize, devicePtr[n], deviceBufferSize, myIbDevice, localMemory[n], remoteMemory[n]); } - std::unordered_map registeredMemories; - for (int i = 0; i < worldSize; i++) { - if (i != rank){ - int deserializedSize; - bootstrap->recv(&deserializedSize, sizeof(int), i, 0); - std::vector deserialized(deserializedSize); - bootstrap->recv(deserialized.data(), deserializedSize, i, 1); - auto deserializedRegisteredMemory = mscclpp::RegisteredMemory::deserialize(deserialized); - registeredMemories.insert({deserializedRegisteredMemory.rank(), deserializedRegisteredMemory}); - } - } - bootstrap->barrier(); if (bootstrap->getRank() == 0) - std::cout << "Memory registration passed" << std::endl; + std::cout << "Memory registration for " << std::to_string(numBuffers) << " buffers passed" << std::endl; - assert((size / sizeof(int)) % worldSize == 0); - size_t writeSize = size / worldSize; - size_t dataCount = size / sizeof(int); - // std::vector hostBuffer(dataCount, 0); - std::shared_ptr hostBuffer(new int[dataCount]); - for (int i = 0; i < dataCount; i++) { - hostBuffer[i] = rank; + + assert((deviceBufferSize / sizeof(int)) % worldSize == 0); + size_t writeSize = deviceBufferSize / worldSize; + size_t dataCount = deviceBufferSize / sizeof(int); + for (int n = 0; n < numBuffers; n++){ + std::vector hostBuffer(dataCount, 0); + for (int i = 0; i < dataCount; i++) { + hostBuffer[i] = rank + n * worldSize; + } + CUDATHROW(cudaMemcpy(devicePtr[n], hostBuffer.data(), deviceBufferSize, cudaMemcpyHostToDevice)); } - CUDATHROW(cudaMemcpy(devicePtr, hostBuffer.get(), size, cudaMemcpyHostToDevice)); 
CUDATHROW(cudaDeviceSynchronize()); bootstrap->barrier(); - for (int i = 0; i < worldSize; i++) { - if (i != rank) { - auto& conn = connections.at(i); - auto& peerMemory = registeredMemories.at(i); - // printf("write to rank: %d, rank is %d\n", peerMemory.rank(), rank); - conn->write(peerMemory, rank * writeSize, registeredMemory, rank * writeSize, writeSize); - conn->flush(); - } + if (bootstrap->getRank() == 0) + std::cout << "CUDA memory initialization passed" << std::endl; + + for (int n = 0; n < numBuffers; n++){ + write_remote(rank, worldSize, connections, remoteMemory[n], localMemory[n], writeSize); } bootstrap->barrier(); - // polling until it becomes ready - bool ready = false; - int niter = 0; - do { - ready = true; - CUDATHROW(cudaMemcpy(hostBuffer.get(), devicePtr, size, cudaMemcpyDeviceToHost)); - size_t dataPerRank = writeSize / sizeof(int); - for (int i = 0; i < dataCount; i++) { - if (hostBuffer[i] != i / dataPerRank) { - ready = false; + if (bootstrap->getRank() == 0) + std::cout << "RDMA write for " << std::to_string(numBuffers) << " buffers passed" << std::endl; + + for (int n = 0; n < numBuffers; n++){ + // polling until it becomes ready + bool ready = false; + int niter = 0; + std::vector hostBuffer(dataCount, 0); + do { + ready = true; + CUDATHROW(cudaMemcpy(hostBuffer.data(), devicePtr[n], deviceBufferSize, cudaMemcpyDeviceToHost)); + for (int i = 0; i < worldSize; i++) { + for (int j = i*writeSize/sizeof(int); j < (i+1)*writeSize/sizeof(int); j++) { + if (hostBuffer[j] != i + n * worldSize) { + ready = false; + } + } } - } - if (niter == 10000){ - throw std::runtime_error("Polling is stuck."); - } - niter++; - } while (!ready); + if (niter == 10000){ + throw std::runtime_error("Polling is stuck."); + } + niter++; + } while (!ready); + } bootstrap->barrier(); if (bootstrap->getRank() == 0) - std::cout << "Connection write passed" << std::endl; + std::cout << "Polling for " << std::to_string(numBuffers) << " buffers passed" << std::endl; - 
CUDATHROW(cudaFree(devicePtr)); if (bootstrap->getRank() == 0) std::cout << "--- MSCCLPP::Communicator tests passed! ---" << std::endl; + + for (int n = 0; n < numBuffers; n++){ + CUDATHROW(cudaFree(devicePtr[n])); + } } int main(int argc, char** argv) From 6aa023ed1e205934a7f450a15b1a8d97a81a7e68 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 2 May 2023 03:28:09 +0000 Subject: [PATCH 090/135] moving serializer outside --- tests/communicator_test_cpp.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc index 6f7aa3e1..7c6423b4 100644 --- a/tests/communicator_test_cpp.cc +++ b/tests/communicator_test_cpp.cc @@ -25,17 +25,13 @@ mscclpp::Transport findIb(int localRank) void register_all_memories(std::unique_ptr& communicator, int rank, int worldSize, void* devicePtr, size_t deviceBufferSize, mscclpp::Transport myIbDevice, mscclpp::RegisteredMemory& localMemory, std::unordered_map& remoteMemory){ localMemory = communicator->registerMemory(devicePtr, deviceBufferSize, mscclpp::Transport::CudaIpc | myIbDevice); - int serializedSize = 0; + auto serialized = localMemory.serialize(); + int serializedSize = serialized.size(); for (int i = 0; i < worldSize; i++) { if (i != rank){ - auto serialized = localMemory.serialize(); - serializedSize = serialized.size(); communicator->bootstrapper()->send(serialized.data(), serializedSize, i, 0); } } - if (serializedSize == 0) { - throw std::runtime_error("Serialized size should have been set to a non-zero value."); - } for (int i = 0; i < worldSize; i++) { if (i != rank){ std::vector deserialized(serializedSize); From fe2b778abcb6a9f181a509033ad0ffb0115fb0c1 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 2 May 2023 03:50:57 +0000 Subject: [PATCH 091/135] flushing the full cq --- src/connection.cc | 12 +++++------- src/epoch.cc | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/connection.cc b/src/connection.cc 
index e0c52419..e1b64072 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -126,18 +126,16 @@ void IBConnection::flush() while (isWaiting) { int wcNum = qp->pollCq(); if (wcNum < 0) { - WARN("pollCq failed: errno %d", errno); - continue; + throw std::runtime_error("pollCq failed: error no " + std::to_string(errno)); } + isWaiting = false; for (int i = 0; i < wcNum; ++i) { const struct ibv_wc* wc = reinterpret_cast(qp->getWc(i)); if (wc->status != IBV_WC_SUCCESS) { - WARN("wc status %d", wc->status); - continue; + throw std::runtime_error("pollCq failed: status " + std::to_string(wc->status)); } - if (wc->opcode == IBV_WC_RDMA_WRITE) { - isWaiting = false; - break; + if (wc->opcode != IBV_WC_RDMA_WRITE) { + isWaiting = true; } } } diff --git a/src/epoch.cc b/src/epoch.cc index 7bcab9c8..3d17c5a1 100644 --- a/src/epoch.cc +++ b/src/epoch.cc @@ -21,7 +21,7 @@ Epoch::~Epoch() { } void Epoch::signal() { - connection_->write(localEpochIdsRegMem_, offsetof(EpochIds, outbound_), remoteEpochIdsRegMem_, offsetof(EpochIds, inboundReplica_), sizeof(device_.epochIds_)); + connection_->write(remoteEpochIdsRegMem_, offsetof(EpochIds, inboundReplica_), localEpochIdsRegMem_, offsetof(EpochIds, outbound_), sizeof(device_.epochIds_)); } } // namespace mscclpp \ No newline at end of file From 358c3d62b818fc8d146986a772879c33e6fd9bb8 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 2 May 2023 20:06:30 +0000 Subject: [PATCH 092/135] Generalize connectionSetup() into setup() --- src/communicator.cc | 73 ++++++++++++++++++++++++++++----- src/connection.cc | 7 +--- src/epoch.cc | 10 ++--- src/include/communicator.hpp | 5 ++- src/include/connection.hpp | 9 +--- src/include/epoch.hpp | 6 +-- src/include/host_connection.hpp | 23 ----------- src/include/mscclpp.hpp | 41 ++++++++++++++++-- tests/allgather_test_cpp.cu | 12 +++--- tests/communicator_test_cpp.cc | 2 +- 10 files changed, 117 insertions(+), 71 deletions(-) delete mode 100644 src/include/host_connection.hpp diff --git 
a/src/communicator.cc b/src/communicator.cc index 21faeaee..7af88c73 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -23,17 +23,17 @@ Communicator::Impl::Impl(std::shared_ptr bootstrap) : bootstrap_( Communicator::Impl::~Impl() { - ibContexts.clear(); + ibContexts_.clear(); } IbCtx* Communicator::Impl::getIbContext(Transport ibTransport) { // Find IB context or create it - auto it = ibContexts.find(ibTransport); - if (it == ibContexts.end()) { + auto it = ibContexts_.find(ibTransport); + if (it == ibContexts_.end()) { auto ibDev = getIBDeviceName(ibTransport); - ibContexts[ibTransport] = std::make_unique(ibDev); - return ibContexts[ibTransport].get(); + ibContexts_[ibTransport] = std::make_unique(ibDev); + return ibContexts_[ibTransport].get(); } else { return it->second.get(); } @@ -57,6 +57,50 @@ MSCCLPP_API_CPP RegisteredMemory Communicator::registerMemory(void* ptr, size_t std::make_shared(ptr, size, pimpl->bootstrap_->getRank(), transports, *pimpl)); } +struct MemorySender : public Setuppable +{ + MemorySender(RegisteredMemory memory, int remoteRank, int tag) + : memory_(memory), remoteRank_(remoteRank), tag_(tag) {} + + void beginSetup(std::shared_ptr bootstrap) override + { + bootstrap->send(memory_.serialize(), remoteRank_, tag_); + } + + RegisteredMemory memory_; + int remoteRank_; + int tag_; +}; + +void Communicator::sendMemoryOnSetup(RegisteredMemory memory, int remoteRank, int tag) +{ + addSetup(std::make_shared(memory, remoteRank, tag)); +} + +struct MemoryReceiver : public Setuppable +{ + MemoryReceiver(int remoteRank, int tag) + : remoteRank_(remoteRank), tag_(tag) {} + + void endSetup(std::shared_ptr bootstrap) override + { + std::vector data; + bootstrap->recv(data, remoteRank_, tag_); + memoryPromise_.set_value(RegisteredMemory::deserialize(data)); + } + + std::promise memoryPromise_; + int remoteRank_; + int tag_; +}; + +NonblockingFuture Communicator::recvMemoryOnSetup(int remoteRank, int tag) +{ + auto memoryReceiver = 
std::make_shared(remoteRank, tag); + addSetup(memoryReceiver); + return memoryReceiver->memoryPromise_.get_future(); +} + MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, Transport transport) { std::shared_ptr conn; @@ -84,18 +128,25 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank } else { throw std::runtime_error("Unsupported transport"); } - pimpl->connections.push_back(conn); + pimpl->connections_.push_back(conn); + addSetup(conn); return conn; } -MSCCLPP_API_CPP void Communicator::connectionSetup() +MSCCLPP_API_CPP void Communicator::addSetup(std::shared_ptr setuppable) { - for (auto& conn : pimpl->connections) { - conn->startSetup(pimpl->bootstrap_); + pimpl->toSetup_.push_back(setuppable); +} + +MSCCLPP_API_CPP void Communicator::setup() +{ + for (auto& setuppable : pimpl->toSetup_) { + setuppable->beginSetup(pimpl->bootstrap_); } - for (auto& conn : pimpl->connections) { - conn->endSetup(pimpl->bootstrap_); + for (auto& setuppable : pimpl->toSetup_) { + setuppable->endSetup(pimpl->bootstrap_); } + pimpl->toSetup_.clear(); } } // namespace mscclpp diff --git a/src/connection.cc b/src/connection.cc index e1b64072..f1ab06f8 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -80,11 +80,6 @@ IBConnection::IBConnection(int remoteRank, int tag, Transport transport, Communi qp = commImpl.getIbContext(transport)->createQp(); } -IBConnection::~IBConnection() -{ - // TODO: Destroy QP? 
-} - Transport IBConnection::transport() { return transport_; @@ -142,7 +137,7 @@ void IBConnection::flush() // npkitCollectExitEvents(conn, NPKIT_EVENT_IB_SEND_EXIT); } -void IBConnection::startSetup(std::shared_ptr bootstrap) +void IBConnection::beginSetup(std::shared_ptr bootstrap) { std::vector ibQpTransport; std::copy_n(reinterpret_cast(&qp->getInfo()), sizeof(qp->getInfo()), std::back_inserter(ibQpTransport)); diff --git a/src/epoch.cc b/src/epoch.cc index 3d17c5a1..a14191fd 100644 --- a/src/epoch.cc +++ b/src/epoch.cc @@ -9,10 +9,8 @@ Epoch::Epoch(Communicator& communicator, std::shared_ptr connection) MSCCLPPTHROW(mscclppCudaCalloc(&device_.expectedInboundEpochId_, 1)); localEpochIdsRegMem_ = communicator.registerMemory(device_.epochIds_, sizeof(device_.epochIds_), connection->transport()); - communicator.bootstrapper()->send(localEpochIdsRegMem_.serialize(), connection->remoteRank(), connection->tag()); - std::vector serializedRemoteEpochIds; - communicator.bootstrapper()->recv(serializedRemoteEpochIds, connection->remoteRank(), connection->tag()); - remoteEpochIdsRegMem_ = RegisteredMemory::deserialize(serializedRemoteEpochIds); + communicator.sendMemoryOnSetup(localEpochIdsRegMem_, connection->remoteRank(), connection->tag()); + remoteEpochIdsRegMem_ = communicator.recvMemoryOnSetup(connection->remoteRank(), connection->tag()); } Epoch::~Epoch() { @@ -21,7 +19,7 @@ Epoch::~Epoch() { } void Epoch::signal() { - connection_->write(remoteEpochIdsRegMem_, offsetof(EpochIds, inboundReplica_), localEpochIdsRegMem_, offsetof(EpochIds, outbound_), sizeof(device_.epochIds_)); + connection_->write(remoteEpochIdsRegMem_.get(), offsetof(EpochIds, inboundReplica_), localEpochIdsRegMem_, offsetof(EpochIds, outbound_), sizeof(device_.epochIds_)); } -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index b9b28f89..32fb6e30 100644 --- a/src/include/communicator.hpp +++ 
b/src/include/communicator.hpp @@ -15,8 +15,9 @@ class ConnectionBase; struct Communicator::Impl { mscclppComm_t comm; - std::vector> connections; - std::unordered_map> ibContexts; + std::vector> connections_; + std::vector> toSetup_; + std::unordered_map> ibContexts_; std::shared_ptr bootstrap_; std::vector rankToHash_; diff --git a/src/include/connection.hpp b/src/include/connection.hpp index b28b5890..b380dbfd 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -10,7 +10,7 @@ namespace mscclpp { // TODO: Add functionality to these classes for Communicator to do connectionSetup -class ConnectionBase : public Connection +class ConnectionBase : public Connection, public Setuppable { int remoteRank_; int tag_; @@ -19,9 +19,6 @@ public: int remoteRank() override; int tag() override; - - virtual void startSetup(std::shared_ptr){}; - virtual void endSetup(std::shared_ptr){}; }; class CudaIpcConnection : public ConnectionBase @@ -52,8 +49,6 @@ class IBConnection : public ConnectionBase public: IBConnection(int remoteRank, int tag, Transport transport, Communicator::Impl& commImpl); - ~IBConnection(); - Transport transport() override; Transport remoteTransport() override; @@ -63,7 +58,7 @@ public: void flush() override; - void startSetup(std::shared_ptr bootstrap) override; + void beginSetup(std::shared_ptr bootstrap) override; void endSetup(std::shared_ptr bootstrap) override; }; diff --git a/src/include/epoch.hpp b/src/include/epoch.hpp index 2c6e3296..742db85c 100644 --- a/src/include/epoch.hpp +++ b/src/include/epoch.hpp @@ -35,7 +35,7 @@ class Epoch std::shared_ptr connection_; DeviceEpoch device_; RegisteredMemory localEpochIdsRegMem_; - RegisteredMemory remoteEpochIdsRegMem_; + NonblockingFuture remoteEpochIdsRegMem_; public: Epoch(Communicator& communicator, std::shared_ptr connection); @@ -43,9 +43,7 @@ public: void signal(); - DeviceEpoch deviceEpoch() { - return device_; - } + DeviceEpoch deviceEpoch() { return device_; } }; } // 
namespace mscclpp diff --git a/src/include/host_connection.hpp b/src/include/host_connection.hpp deleted file mode 100644 index 8ac5d9f1..00000000 --- a/src/include/host_connection.hpp +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef MSCCLPP_HOST_CONNECTION_HPP_ -#define MSCCLPP_HOST_CONNECTION_HPP_ - -#include "comm.h" -#include "mscclpp.h" -#include "mscclpp.hpp" - -namespace mscclpp { - -struct HostConnection::Impl -{ - Communicator* comm; - mscclppConn* conn; - mscclppHostConn_t* hostConn; - - Impl(Communicator* comm, mscclppConn* conn); - - ~Impl(); -}; - -} // namespace mscclpp - -#endif // MSCCLPP_HOST_CONNECTION_HPP_ \ No newline at end of file diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index fde63180..b4111da8 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace mscclpp { @@ -277,6 +278,33 @@ protected: static std::shared_ptr getRegisteredMemoryImpl(RegisteredMemory&); }; +struct Setuppable +{ + virtual void beginSetup(std::shared_ptr) {} + virtual void endSetup(std::shared_ptr) {} +}; + +template +class NonblockingFuture +{ + std::future future; +public: + NonblockingFuture() = default; + NonblockingFuture(std::future&& future) : future(std::move(future)) {} + + bool ready() const + { + return future.wait_for(std::chrono::seconds(0)) == std::future_status::ready; + } + + T get() + { + if (!ready()) + throw std::runtime_error("NonblockingFuture::get() called before ready"); + return future.get(); + } +}; + class Communicator { public: @@ -301,6 +329,10 @@ public: * Returns: a handle to the buffer */ RegisteredMemory registerMemory(void* ptr, size_t size, TransportFlags transports); + + void sendMemoryOnSetup(RegisteredMemory memory, int remoteRank, int tag); + + NonblockingFuture recvMemoryOnSetup(int remoteRank, int tag); /* Connect to a remote rank. This function only prepares metadata for connection. 
The actual connection * is made by a following call of mscclppConnectionSetup(). Note that this function is two-way and a connection @@ -318,10 +350,11 @@ public: */ std::shared_ptr connect(int remoteRank, int tag, Transport transport); - /* Establish all connections declared by connect(). This function must be called after all connect() - * calls are made. This function ensures that all remote ranks are ready to communicate when it returns. - */ - void connectionSetup(); + /* Add a custom Setuppable object to a list of objects to be setup later, when setup() is called. */ + void addSetup(std::shared_ptr setuppable); + + /* Setup all objects that have registered for setup. This includes any connections created by connect(). */ + void setup(); struct Impl; diff --git a/tests/allgather_test_cpp.cu b/tests/allgather_test_cpp.cu index 8fb54733..791e2ca9 100644 --- a/tests/allgather_test_cpp.cu +++ b/tests/allgather_test_cpp.cu @@ -224,7 +224,6 @@ void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& co int cudaNum = rankToLocalRank(rank); std::string ibDevStr = "mlx5_ib" + std::to_string(cudaNum); mscclpp::Transport ibTransport = mscclpp::getIBTransportByDeviceName(ibDevStr); - mscclpp::channel::DeviceChannelService channelService; for (int r = 0; r < world_size; ++r) { if (r == rank) @@ -238,14 +237,13 @@ void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& co transportType = ibTransport; } // Connect with all other ranks - auto conn = comm.connect(r, 0, transportType); - channelService.addChannel(conn); - // TODO: WIP - hostConn->registerBuffer(data_d, dataSize); - hostConns.push_back(hostConn); + auto connId = channelService.addChannel(comm.connect(r, 0, transportType)); + auto memoryId = channelService.addMemory(comm.registerMemory(data_d, dataSize, mscclpp::Transport::CudaIpc | ibTransport)); } - comm.connectionSetup(); + comm.setup(); + + mscclpp::channel::DeviceChannelService channelService; std::vector devConns; 
std::transform( diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc index 7c6423b4..c922eaae 100644 --- a/tests/communicator_test_cpp.cc +++ b/tests/communicator_test_cpp.cc @@ -52,7 +52,7 @@ void make_connections(std::unique_ptr& communicator, int } } } - communicator->connectionSetup(); + communicator->setup(); } void write_remote(int rank, int worldSize, std::unordered_map>& connections, std::unordered_map& remoteRegisteredMemories, mscclpp::RegisteredMemory& registeredMemory, int writeSize){ From c7b7d20d850d6c3f531707130bb4da36ce5276fd Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 2 May 2023 20:35:16 +0000 Subject: [PATCH 093/135] Export epoch header --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 782129c0..e8c5bb25 100644 --- a/Makefile +++ b/Makefile @@ -135,7 +135,7 @@ HEADERS := $(wildcard src/include/*.h) CPPSOURCES := $(shell find ./ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)' -not -path "./build/*" -not -path "./python/*") PYTHONCPPSOURCES := $(shell find ./python/src/ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)') -INCEXPORTS := mscclpp.h mscclppfifo.h mscclpp.hpp mscclppfifo.hpp +INCEXPORTS := mscclpp.h mscclppfifo.h mscclpp.hpp mscclppfifo.hpp epoch.hpp INCTARGETS := $(INCEXPORTS:%=$(BUILDDIR)/$(INCDIR)/%) LIBNAME := libmscclpp.so From 66ce01baf3ac14aba84799bc0ee135410015305e Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 2 May 2023 20:46:30 +0000 Subject: [PATCH 094/135] Make NonblockingFuture copyable --- src/include/mscclpp.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index b4111da8..5186fbc2 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -287,10 +287,11 @@ struct Setuppable template class NonblockingFuture { - std::future future; + std::shared_future future; public: NonblockingFuture() = default; - 
NonblockingFuture(std::future&& future) : future(std::move(future)) {} + NonblockingFuture(std::shared_future&& future) : future(std::move(future)) {} + NonblockingFuture(const NonblockingFuture&) = default; bool ready() const { From c44b48b361e0e36154c50ca10cfc6c42b715caad Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 2 May 2023 21:38:26 +0000 Subject: [PATCH 095/135] Epoch non-copyable --- src/include/epoch.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/include/epoch.hpp b/src/include/epoch.hpp index 742db85c..ffd7464d 100644 --- a/src/include/epoch.hpp +++ b/src/include/epoch.hpp @@ -39,6 +39,7 @@ class Epoch public: Epoch(Communicator& communicator, std::shared_ptr connection); + Epoch(const Epoch&) = delete; ~Epoch(); void signal(); From a4e6ffe2bc5f272e132705d0ad73001cd0921ef3 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 2 May 2023 21:39:43 +0000 Subject: [PATCH 096/135] epoch creation --- src/communicator.cc | 2 +- src/epoch.cc | 11 ++++++----- src/include/checks.hpp | 2 +- src/include/mscclpp.hpp | 7 ++++--- tests/communicator_test_cpp.cc | 12 +++++++++++- 5 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/communicator.cc b/src/communicator.cc index 7af88c73..2507c175 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -98,7 +98,7 @@ NonblockingFuture Communicator::recvMemoryOnSetup(int remoteRa { auto memoryReceiver = std::make_shared(remoteRank, tag); addSetup(memoryReceiver); - return memoryReceiver->memoryPromise_.get_future(); + return NonblockingFuture(memoryReceiver->memoryPromise_.get_future()); } MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, Transport transport) diff --git a/src/epoch.cc b/src/epoch.cc index a14191fd..9263fd1c 100644 --- a/src/epoch.cc +++ b/src/epoch.cc @@ -1,10 +1,11 @@ #include "epoch.hpp" #include "checks.hpp" #include "alloc.h" +#include "api.h" namespace mscclpp { -Epoch::Epoch(Communicator& communicator, std::shared_ptr connection) : 
connection_(connection) { +MSCCLPP_API_CPP Epoch::Epoch(Communicator& communicator, std::shared_ptr connection) : connection_(connection) { MSCCLPPTHROW(mscclppCudaCalloc(&device_.epochIds_, 1)); MSCCLPPTHROW(mscclppCudaCalloc(&device_.expectedInboundEpochId_, 1)); @@ -13,12 +14,12 @@ Epoch::Epoch(Communicator& communicator, std::shared_ptr connection) remoteEpochIdsRegMem_ = communicator.recvMemoryOnSetup(connection->remoteRank(), connection->tag()); } -Epoch::~Epoch() { - MSCCLPPTHROW(mscclppCudaFree(&device_.epochIds_)); - MSCCLPPTHROW(mscclppCudaFree(&device_.expectedInboundEpochId_)); +MSCCLPP_API_CPP Epoch::~Epoch() { + mscclppCudaFree(device_.epochIds_); + mscclppCudaFree(device_.expectedInboundEpochId_); } -void Epoch::signal() { +MSCCLPP_API_CPP void Epoch::signal() { connection_->write(remoteEpochIdsRegMem_.get(), offsetof(EpochIds, inboundReplica_), localEpochIdsRegMem_, offsetof(EpochIds, outbound_), sizeof(device_.epochIds_)); } diff --git a/src/include/checks.hpp b/src/include/checks.hpp index 69b222ee..6473c92f 100644 --- a/src/include/checks.hpp +++ b/src/include/checks.hpp @@ -17,7 +17,7 @@ if (res != mscclppSuccess && res != mscclppInProgress) { \ throw std::runtime_error(std::string("Call to " #call " failed with error code ") + mscclppGetErrorString(res)); \ } \ - } while (0); + } while (false) #define CUDATHROW(cmd) \ do { \ diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index 5186fbc2..4c26131c 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -35,15 +35,16 @@ public: // TODO: move implementations of these helpers out of this header void send(const std::vector& data, int peer, int tag) { - send((void*)data.size(), sizeof(size_t), peer, tag); - send((void*)data.data(), data.size(), peer, tag); + size_t size = data.size(); + send((void*)&size, sizeof(size_t), peer, tag); + send((void*)data.data(), data.size(), peer, tag+1); } void recv(std::vector& data, int peer, int tag) { size_t size; recv((void*)&size, 
sizeof(size_t), peer, tag); data.resize(size); - recv((void*)data.data(), data.size(), peer, tag); + recv((void*)data.data(), data.size(), peer, tag+1); } }; diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc index c922eaae..29712cd0 100644 --- a/tests/communicator_test_cpp.cc +++ b/tests/communicator_test_cpp.cc @@ -1,4 +1,5 @@ #include "mscclpp.hpp" +#include "epoch.hpp" #include #include @@ -88,7 +89,7 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) if (bootstrap->getRank() == 0) std::cout << "Connection setup passed" << std::endl; - int numBuffers = 1000; + int numBuffers = 1; std::vector devicePtr(numBuffers); int deviceBufferSize = 1024*1024; @@ -105,6 +106,15 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) if (bootstrap->getRank() == 0) std::cout << "Memory registration for " << std::to_string(numBuffers) << " buffers passed" << std::endl; + std::vector> epochs; + for (auto entry : connections) { + auto& conn = entry.second; + epochs.emplace_back(std::make_unique(*communicator, conn)); + } + communicator->setup(); + bootstrap->barrier(); + if (bootstrap->getRank() == 0) + std::cout << "Epochs are created" << std::endl; assert((deviceBufferSize / sizeof(int)) % worldSize == 0); size_t writeSize = deviceBufferSize / worldSize; From fc12947c5b01d397a7d78a27f2aa1b1f0be7c8c7 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 2 May 2023 21:42:25 +0000 Subject: [PATCH 097/135] fixing flush for IB --- src/connection.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/connection.cc b/src/connection.cc index f1ab06f8..fd7283fc 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -123,14 +123,13 @@ void IBConnection::flush() if (wcNum < 0) { throw std::runtime_error("pollCq failed: error no " + std::to_string(errno)); } - isWaiting = false; for (int i = 0; i < wcNum; ++i) { const struct ibv_wc* wc = reinterpret_cast(qp->getWc(i)); if (wc->status != IBV_WC_SUCCESS) { 
throw std::runtime_error("pollCq failed: status " + std::to_string(wc->status)); } - if (wc->opcode != IBV_WC_RDMA_WRITE) { - isWaiting = true; + if (wc->opcode == IBV_WC_RDMA_WRITE) { + isWaiting = false; } } } From 4ba851683274355697503d6d7a13ee2a9178f6fc Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Tue, 2 May 2023 23:14:13 +0000 Subject: [PATCH 098/135] allgather_test_cpp functional again --- Makefile | 6 +-- src/communicator.cc | 4 +- src/fifo.cc | 21 ++++---- src/ib.cc | 7 +-- src/include/channel.hpp | 8 +-- src/proxy_cpp.cc | 1 + tests/allgather_test_cpp.cu | 100 +++++++++++++++++++----------------- 7 files changed, 79 insertions(+), 68 deletions(-) diff --git a/Makefile b/Makefile index e8c5bb25..7b44e154 100644 --- a/Makefile +++ b/Makefile @@ -120,8 +120,8 @@ LDFLAGS := $(NVLDFLAGS) $(GDRCOPY_LDFLAGS) -libverbs -lnuma LIBSRCS := $(addprefix src/,debug.cc utils.cc init.cc proxy.cc ib.cc config.cc) LIBSRCS += $(addprefix src/bootstrap/,bootstrap.cc socket.cc) -LIBSRCS += $(addprefix src/,communicator.cc connection.cc registered_memory.cc epoch.cc) -#LIBSRCS += $(addprefix src/,fifo.cc host_connection.cc proxy_cpp.cc basic_proxy_handler.cc) +LIBSRCS += $(addprefix src/,communicator.cc connection.cc registered_memory.cc) +LIBSRCS += $(addprefix src/,epoch.cc proxy_cpp.cc fifo.cc) ifneq ($(NPKIT), 0) LIBSRCS += $(addprefix src/misc/,npkit.cc) endif @@ -149,7 +149,7 @@ UTOBJTARGETS := $(UTOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) UTBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(UTOBJS)) TESTSDIR := tests -TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test_standalone.cu bootstrap_test_cpp.cc communicator_test_cpp.cc) # allgather_test_cpp.cu +TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test_standalone.cu bootstrap_test_cpp.cc communicator_test_cpp.cc allgather_test_cpp.cu) TESTSOBJS := $(patsubst %.cc,%.o,$(TESTSSRCS)) $(patsubst %.cu,%.o,$(TESTSSRCS)) TESTSOBJTARGETS := $(TESTSOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) TESTSBINS := 
$(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(TESTSOBJS)) diff --git a/src/communicator.cc b/src/communicator.cc index 2507c175..074d127f 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -72,7 +72,7 @@ struct MemorySender : public Setuppable int tag_; }; -void Communicator::sendMemoryOnSetup(RegisteredMemory memory, int remoteRank, int tag) +MSCCLPP_API_CPP void Communicator::sendMemoryOnSetup(RegisteredMemory memory, int remoteRank, int tag) { addSetup(std::make_shared(memory, remoteRank, tag)); } @@ -94,7 +94,7 @@ struct MemoryReceiver : public Setuppable int tag_; }; -NonblockingFuture Communicator::recvMemoryOnSetup(int remoteRank, int tag) +MSCCLPP_API_CPP NonblockingFuture Communicator::recvMemoryOnSetup(int remoteRank, int tag) { auto memoryReceiver = std::make_shared(remoteRank, tag); addSetup(memoryReceiver); diff --git a/src/fifo.cc b/src/fifo.cc index c2fdd738..d5d70422 100644 --- a/src/fifo.cc +++ b/src/fifo.cc @@ -1,6 +1,7 @@ #include "alloc.h" #include "checks.hpp" #include "mscclppfifo.hpp" +#include "api.h" #include #include #include @@ -24,7 +25,7 @@ struct HostProxyFifo::Impl cudaStream_t stream; }; -HostProxyFifo::HostProxyFifo() +MSCCLPP_API_CPP HostProxyFifo::HostProxyFifo() { pimpl = std::make_unique(); MSCCLPPTHROW(mscclppCudaCalloc(&pimpl->deviceFifo.head, 1)); @@ -34,27 +35,27 @@ HostProxyFifo::HostProxyFifo() pimpl->hostTail = 0; } -HostProxyFifo::~HostProxyFifo() +MSCCLPP_API_CPP HostProxyFifo::~HostProxyFifo() { - MSCCLPPTHROW(mscclppCudaFree(pimpl->deviceFifo.head)); - MSCCLPPTHROW(mscclppCudaHostFree(pimpl->deviceFifo.triggers)); - MSCCLPPTHROW(mscclppCudaFree(pimpl->deviceFifo.tailReplica)); - CUDATHROW(cudaStreamDestroy(pimpl->stream)); + mscclppCudaFree(pimpl->deviceFifo.head); + mscclppCudaHostFree(pimpl->deviceFifo.triggers); + mscclppCudaFree(pimpl->deviceFifo.tailReplica); + cudaStreamDestroy(pimpl->stream); } -void HostProxyFifo::poll(ProxyTrigger* trigger) +MSCCLPP_API_CPP void HostProxyFifo::poll(ProxyTrigger* 
trigger) { __m128i xmm0 = _mm_load_si128((__m128i*)&pimpl->deviceFifo.triggers[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]); _mm_store_si128((__m128i*)trigger, xmm0); } -void HostProxyFifo::pop() +MSCCLPP_API_CPP void HostProxyFifo::pop() { *(volatile uint64_t*)(&pimpl->deviceFifo.triggers[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0; (pimpl->hostTail)++; } -void HostProxyFifo::flushTail(bool sync) +MSCCLPP_API_CPP void HostProxyFifo::flushTail(bool sync) { // Flush the tail to device memory. This is either triggered every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER to make sure // that the fifo can make progress even if there is no request mscclppSync. However, mscclppSync type is for flush @@ -66,7 +67,7 @@ void HostProxyFifo::flushTail(bool sync) } } -DeviceProxyFifo HostProxyFifo::toDevice() +MSCCLPP_API_CPP DeviceProxyFifo HostProxyFifo::deviceFifo() { return pimpl->deviceFifo; } diff --git a/src/ib.cc b/src/ib.cc index ec7e95f2..7e77b235 100644 --- a/src/ib.cc +++ b/src/ib.cc @@ -11,6 +11,7 @@ #include "debug.h" #include "ib.hpp" #include "mscclpp.hpp" +#include "api.h" #include #include @@ -372,14 +373,14 @@ const std::string& IbCtx::getDevName() const return this->devName; } -int getIBDeviceCount() +MSCCLPP_API_CPP int getIBDeviceCount() { int num; ibv_get_device_list(&num); return num; } -std::string getIBDeviceName(Transport ibTransport) +MSCCLPP_API_CPP std::string getIBDeviceName(Transport ibTransport) { int num; struct ibv_device** devices = ibv_get_device_list(&num); @@ -418,7 +419,7 @@ std::string getIBDeviceName(Transport ibTransport) return devices[ibTransportIndex]->name; } -Transport getIBTransportByDeviceName(const std::string& ibDeviceName) +MSCCLPP_API_CPP Transport getIBTransportByDeviceName(const std::string& ibDeviceName) { int num; struct ibv_device** devices = ibv_get_device_list(&num); diff --git a/src/include/channel.hpp b/src/include/channel.hpp index ace57661..42826f4f 100644 --- a/src/include/channel.hpp +++ b/src/include/channel.hpp @@ -13,7 
+13,8 @@ namespace channel { class Channel { public: - Channel(std::shared_ptr connection) : connection_(connection), epoch_(std::make_shared()) {}; + Channel(Communicator& communicator, std::shared_ptr connection) + : connection_(connection), epoch_(std::make_shared(communicator, connection)) {}; Connection& connection() { return *connection_; } Epoch& epoch() { return *epoch_; } @@ -176,10 +177,10 @@ inline ProxyHandler makeChannelProxyHandler(DeviceChannelService& channelService class DeviceChannelService { public: - DeviceChannelService() : proxy_([&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }) {} + DeviceChannelService(Communicator& communicator) : communicator_(communicator), proxy_([&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }) {} ChannelId addChannel(std::shared_ptr connection) { - channels_.push_back(Channel(connection)); + channels_.push_back(Channel(communicator_, connection)); return channels_.size() - 1; } @@ -195,6 +196,7 @@ public: void stopProxy() { proxy_.stop(); } private: + Communicator& communicator_; std::vector channels_; std::vector memories_; Proxy proxy_; diff --git a/src/proxy_cpp.cc b/src/proxy_cpp.cc index b55d6995..2fb8c2b0 100644 --- a/src/proxy_cpp.cc +++ b/src/proxy_cpp.cc @@ -1,3 +1,4 @@ +#include "proxy.hpp" #include "api.h" #include "mscclpp.hpp" #include "utils.h" diff --git a/tests/allgather_test_cpp.cu b/tests/allgather_test_cpp.cu index 791e2ca9..34050814 100644 --- a/tests/allgather_test_cpp.cu +++ b/tests/allgather_test_cpp.cu @@ -49,9 +49,9 @@ static double getTime(void) return (tspec.tv_nsec / 1.0e9) + tspec.tv_sec; } -__constant__ mscclpp::channel::SimpleDeviceConnection constDevConns[16]; +__constant__ mscclpp::channel::SimpleDeviceChannel constDevChans[16]; -__device__ void allgather0(mscclpp::channel::SimpleDeviceConnection devConn, int rank, int world_size, int remoteRank, +__device__ void allgather0(mscclpp::channel::SimpleDeviceChannel devChan, int rank, int world_size, int 
remoteRank, size_t nelemsPerGPU) { // this allgather is really simple and implemented as an alltoall @@ -59,19 +59,19 @@ __device__ void allgather0(mscclpp::channel::SimpleDeviceConnection devConn, int // this thread's role is a sender role // put your data asynchronously if ((threadIdx.x % 32) == 0) - devConn.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); + devChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); // make sure everyone is put their data before some thread randomly blocks everyone else in signal __syncthreads(); // push with flag and sync to make sure the data is received if ((threadIdx.x % 32) == 0) - devConn.flush(); + devChan.flush(); // this thread's role is a receiver role. wait on the semaphore to make sure the data is ready if ((threadIdx.x % 32) == 0) - devConn.wait(); + devChan.wait(); } -__device__ void localAllGather(mscclpp::channel::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, +__device__ void localAllGather(mscclpp::channel::SimpleDeviceChannel devChan, int rank, int world_size, int nranksPerNode, int remoteRank, uint64_t offset, uint64_t size) { // this allgather algorithm works as follows: @@ -84,25 +84,25 @@ __device__ void localAllGather(mscclpp::channel::SimpleDeviceConnection devConn, if ((remoteRank % nranksPerNode) == ((rank + i) % nranksPerNode)) { // put your data to GPU (rank+i) % nranksPerNode and signal in one call if ((threadIdx.x % 32) == 0) - devConn.putWithSignalAndFlush(offset, size); + devChan.putWithSignalAndFlush(offset, size); } // wait for the data from GPU (rank-i) % nranksPerNode to arrive if ((remoteRank % nranksPerNode) == ((rank - i + nranksPerNode) % nranksPerNode)) { if ((threadIdx.x % 32) == 0) - devConn.wait(); + devChan.wait(); } asm volatile("bar.sync %0, %1;" ::"r"(11), "r"((nranksPerNode - 1) * 32) : "memory"); } } -__device__ void allgather1(mscclpp::channel::SimpleDeviceConnection devConn, int rank, int 
world_size, int nranksPerNode, +__device__ void allgather1(mscclpp::channel::SimpleDeviceChannel devChan, int rank, int world_size, int nranksPerNode, int remoteRank, size_t nelemsPerGPU) { - localAllGather(devConn, rank, world_size, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + localAllGather(devChan, rank, world_size, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); } -__device__ void allgather2(mscclpp::channel::SimpleDeviceConnection devConn, int rank, int world_size, int nranksPerNode, +__device__ void allgather2(mscclpp::channel::SimpleDeviceChannel devChan, int rank, int world_size, int nranksPerNode, int remoteRank, size_t nelemsPerGPU) { // this allgather is a pipelined and hierarchical one and only works for two nodes @@ -120,17 +120,17 @@ __device__ void allgather2(mscclpp::channel::SimpleDeviceConnection devConn, int // Step 1 // local allgather if (remoteRank / nranksPerNode == rank / nranksPerNode) { - localAllGather(devConn, rank, world_size, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + localAllGather(devChan, rank, world_size, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); } // cross-node exchange if (remoteRank % nranksPerNode == rank % nranksPerNode) { // opposite side if ((threadIdx.x % 32) == 0) - devConn.putWithSignalAndFlush(rank * nelemsPerGPU * sizeof(int), + devChan.putWithSignalAndFlush(rank * nelemsPerGPU * sizeof(int), (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); if ((threadIdx.x % 32) == 0) - devConn.wait(); + devChan.wait(); } __syncthreads(); @@ -139,7 +139,7 @@ __device__ void allgather2(mscclpp::channel::SimpleDeviceConnection devConn, int // local allgather int otherNghr = (rank + nranksPerNode) % world_size; if (remoteRank / nranksPerNode == rank / nranksPerNode) { - localAllGather(devConn, rank, world_size, nranksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), + 
localAllGather(devChan, rank, world_size, nranksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); } @@ -147,11 +147,11 @@ __device__ void allgather2(mscclpp::channel::SimpleDeviceConnection devConn, int if (remoteRank % nranksPerNode == rank % nranksPerNode) { // opposite side if ((threadIdx.x % 32) == 0) - devConn.putWithSignalAndFlush((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * + devChan.putWithSignalAndFlush((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), nelemsPerGPU / pipelineSize * sizeof(int)); if ((threadIdx.x % 32) == 0) - devConn.wait(); + devChan.wait(); } __syncthreads(); @@ -159,7 +159,7 @@ __device__ void allgather2(mscclpp::channel::SimpleDeviceConnection devConn, int // Step 3 // local allgather if (remoteRank / nranksPerNode == rank / nranksPerNode) { - localAllGather(devConn, rank, world_size, nranksPerNode, remoteRank, + localAllGather(devChan, rank, world_size, nranksPerNode, remoteRank, (otherNghr * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), nelemsPerGPU / pipelineSize * sizeof(int)); } @@ -167,18 +167,18 @@ __device__ void allgather2(mscclpp::channel::SimpleDeviceConnection devConn, int __global__ void kernel(int rank, int world_size, int nranksPerNode, size_t nelemsPerGPU, int kernel) { - // find the mapping between remoteRank and devConns + // find the mapping between remoteRank and devChans int warpId = threadIdx.x / 32; int remoteRank = (warpId < rank) ? 
warpId : warpId + 1; // Each warp is responsible for one of the remote ranks - mscclpp::channel::SimpleDeviceConnection devConn = constDevConns[warpId]; + mscclpp::channel::SimpleDeviceChannel devChan = constDevChans[warpId]; if (kernel == 0) - allgather0(devConn, rank, world_size, remoteRank, nelemsPerGPU); + allgather0(devChan, rank, world_size, remoteRank, nelemsPerGPU); else if (kernel == 1) - allgather1(devConn, rank, world_size, nranksPerNode, remoteRank, nelemsPerGPU); + allgather1(devChan, rank, world_size, nranksPerNode, remoteRank, nelemsPerGPU); else if (kernel == 2) - allgather2(devConn, rank, world_size, nranksPerNode, remoteRank, nelemsPerGPU); + allgather2(devChan, rank, world_size, nranksPerNode, remoteRank, nelemsPerGPU); } int rankToLocalRank(int rank) @@ -218,41 +218,44 @@ void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSiz CUDACHECK(cudaMemcpy(*data_d, *data_h, dataSize, cudaMemcpyHostToDevice)); } -void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& comm, int* data_d, size_t dataSize) +void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& comm, mscclpp::channel::DeviceChannelService& channelService, int* data_d, size_t dataSize) { int thisNode = rankToNode(rank); int cudaNum = rankToLocalRank(rank); std::string ibDevStr = "mlx5_ib" + std::to_string(cudaNum); mscclpp::Transport ibTransport = mscclpp::getIBTransportByDeviceName(ibDevStr); + std::vector channelIds; + std::vector localMemories; + std::vector> remoteMemories; for (int r = 0; r < world_size; ++r) { if (r == rank) continue; mscclpp::Transport transport; - const char* ibDev = ibDevStr.c_str(); if (rankToNode(r) == thisNode) { - ibDev = NULL; - transportType = mscclpp::Transport::CudaIpc; + transport = mscclpp::Transport::CudaIpc; } else { - transportType = ibTransport; + transport = ibTransport; } // Connect with all other ranks - auto connId = channelService.addChannel(comm.connect(r, 0, transportType)); - 
auto memoryId = channelService.addMemory(comm.registerMemory(data_d, dataSize, mscclpp::Transport::CudaIpc | ibTransport)); + channelIds.push_back(channelService.addChannel(comm.connect(r, 0, transport))); + auto memory = comm.registerMemory(data_d, dataSize, mscclpp::Transport::CudaIpc | ibTransport); + localMemories.push_back(memory); + comm.sendMemoryOnSetup(memory, r, 0); + remoteMemories.push_back(comm.recvMemoryOnSetup(r, 0)); } comm.setup(); - mscclpp::channel::DeviceChannelService channelService; + std::vector devChannels; + for (size_t i = 0; i < channelIds.size(); ++i) { + devChannels.push_back(mscclpp::channel::SimpleDeviceChannel(channelService.deviceChannel(channelIds[i]), + channelService.addMemory(remoteMemories[i].get()), channelService.addMemory(localMemories[i]))); + } - std::vector devConns; - std::transform( - hostConns.begin(), hostConns.end(), std::back_inserter(devConns), - [](std::shared_ptr& hostConn) { return mscclpp::SimpleDeviceConnection(*hostConn); }); - - assert(devConns.size() < sizeof(constDevConns) / sizeof(mscclpp::SimpleDeviceConnection)); + assert(devChannels.size() < sizeof(constDevChans) / sizeof(mscclpp::channel::SimpleDeviceChannel)); CUDACHECK( - cudaMemcpyToSymbol(constDevConns, devConns.data(), sizeof(mscclpp::SimpleDeviceConnection) * devConns.size())); + cudaMemcpyToSymbol(constDevChans, devChannels.data(), sizeof(mscclpp::channel::SimpleDeviceChannel) * devChannels.size())); } void printUsage(const char* prog, bool isMpi) @@ -405,7 +408,10 @@ int main(int argc, const char* argv[]) try { if (rank == 0) printf("Initializing MSCCL++\n"); - mscclpp::Communicator comm(world_size, ip_port, rank); + auto bootstrapper = std::make_shared(rank, world_size); + bootstrapper->initialize(ip_port); + mscclpp::Communicator comm(bootstrapper); + mscclpp::channel::DeviceChannelService channelService(comm); if (rank == 0) printf("Initializing data for allgather test\n"); @@ -413,11 +419,11 @@ int main(int argc, const char* argv[]) if 
(rank == 0) printf("Setting up the connection in MSCCL++\n"); - setupMscclppConnections(rank, world_size, comm, data_d, dataSize); + setupMscclppConnections(rank, world_size, comm, channelService, data_d, dataSize); if (rank == 0) printf("Launching MSCCL++ proxy threads\n"); - comm.startProxying(); + channelService.startProxy(); if (rank == 0) printf("Testing the correctness of AllGather implementation\n"); @@ -437,7 +443,7 @@ int main(int argc, const char* argv[]) } int tmp[16]; // A simple barrier - comm.bootstrapAllGather(tmp, sizeof(int)); + bootstrapper->allGather(tmp, sizeof(int)); if (rank == 0) printf("Successfully checked the correctness\n"); @@ -446,12 +452,12 @@ int main(int argc, const char* argv[]) if (rank == 0) printf("Running %d iterations of the kernel without CUDA graph\n", iterwithoutcudagraph); CUDACHECK(cudaStreamSynchronize(stream)); - comm.bootstrapAllGather(tmp, sizeof(int)); + bootstrapper->allGather(tmp, sizeof(int)); for (int i = 0; i < iterwithoutcudagraph; ++i) { kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum); } CUDACHECK(cudaStreamSynchronize(stream)); - comm.bootstrapAllGather(tmp, sizeof(int)); + bootstrapper->allGather(tmp, sizeof(int)); // cudaGraph Capture int cudagraphiter = 10; @@ -480,7 +486,7 @@ int main(int argc, const char* argv[]) if (rank == 0) printf("Running %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphlaunch, cudagraphiter); - comm.bootstrapAllGather(tmp, sizeof(int)); + bootstrapper->allGather(tmp, sizeof(int)); double t0, t1, ms, time_in_us; t0 = getTime(); for (int i = 0; i < cudagraphlaunch; ++i) { @@ -493,11 +499,11 @@ int main(int argc, const char* argv[]) time_in_us = ms * 1000. 
/ (float)cudagraphlaunch / (float)cudagraphiter; printf("Rank %d report: size %lu time: %f us/iter algBW %f GBps\n", rank, dataSize, time_in_us, (double)(dataSize) / 1e9 / (time_in_us / 1e6)); - comm.bootstrapAllGather(tmp, sizeof(int)); + bootstrapper->allGather(tmp, sizeof(int)); if (rank == 0) printf("Stopping MSCCL++ proxy threads\n"); - comm.stopProxying(); + channelService.stopProxy(); } catch (std::exception& e) { // todo: throw exceptions in the implementation and process them here From 54d1e1872caf5918a8df2f4793e71f7595c485ab Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 2 May 2023 23:53:31 +0000 Subject: [PATCH 099/135] testing writes with signal is passing --- Makefile | 2 +- src/communicator.cc | 6 +- src/connection.cc | 8 +- src/include/connection.hpp | 1 + src/include/mscclpp.hpp | 2 +- tests/communicator_test_cpp.cc | 193 ---------------------- tests/communicator_test_cpp.cu | 289 +++++++++++++++++++++++++++++++++ 7 files changed, 299 insertions(+), 202 deletions(-) delete mode 100644 tests/communicator_test_cpp.cc create mode 100644 tests/communicator_test_cpp.cu diff --git a/Makefile b/Makefile index e8c5bb25..cb71ec86 100644 --- a/Makefile +++ b/Makefile @@ -149,7 +149,7 @@ UTOBJTARGETS := $(UTOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) UTBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(UTOBJS)) TESTSDIR := tests -TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test_standalone.cu bootstrap_test_cpp.cc communicator_test_cpp.cc) # allgather_test_cpp.cu +TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test_standalone.cu communicator_test_cpp.cu bootstrap_test_cpp.cc) # allgather_test_cpp.cu TESTSOBJS := $(patsubst %.cc,%.o,$(TESTSSRCS)) $(patsubst %.cu,%.o,$(TESTSSRCS)) TESTSOBJTARGETS := $(TESTSOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) TESTSBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(TESTSOBJS)) diff --git a/src/communicator.cc b/src/communicator.cc index 2507c175..1fd64132 100644 --- a/src/communicator.cc +++ 
b/src/communicator.cc @@ -72,7 +72,7 @@ struct MemorySender : public Setuppable int tag_; }; -void Communicator::sendMemoryOnSetup(RegisteredMemory memory, int remoteRank, int tag) +MSCCLPP_API_CPP void Communicator::sendMemoryOnSetup(RegisteredMemory memory, int remoteRank, int tag) { addSetup(std::make_shared(memory, remoteRank, tag)); } @@ -94,14 +94,14 @@ struct MemoryReceiver : public Setuppable int tag_; }; -NonblockingFuture Communicator::recvMemoryOnSetup(int remoteRank, int tag) +MSCCLPP_API_CPP NonblockingFuture Communicator::recvMemoryOnSetup(int remoteRank, int tag) { auto memoryReceiver = std::make_shared(remoteRank, tag); addSetup(memoryReceiver); return NonblockingFuture(memoryReceiver->memoryPromise_.get_future()); } -MSCCLPP_API_CPP std::shared_ptr Communicator::connect(int remoteRank, int tag, Transport transport) +MSCCLPP_API_CPP std::shared_ptr Communicator::connectOnSetup(int remoteRank, int tag, Transport transport) { std::shared_ptr conn; if (transport == Transport::CudaIpc) { diff --git a/src/connection.cc b/src/connection.cc index fd7283fc..66c54f06 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -75,7 +75,7 @@ void CudaIpcConnection::flush() // IBConnection IBConnection::IBConnection(int remoteRank, int tag, Transport transport, Communicator::Impl& commImpl) - : ConnectionBase(remoteRank, tag), transport_(transport), remoteTransport_(Transport::Unknown) + : ConnectionBase(remoteRank, tag), transport_(transport), remoteTransport_(Transport::Unknown), numSignaledSends(0) { qp = commImpl.getIbContext(transport)->createQp(); } @@ -110,6 +110,7 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem qp->stageSend(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, /*signaled=*/true); + numSignaledSends++; qp->postSend(); INFO(MSCCLPP_NET, "IBConnection write: from %p to %p, size %lu", (uint8_t*)srcMr->getBuff() + srcOffset, (uint8_t*)dstMrInfo.addr + dstOffset, 
size); // npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_DATA_ENTRY, (uint32_t)size); @@ -117,8 +118,7 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem void IBConnection::flush() { - bool isWaiting = true; - while (isWaiting) { + while (numSignaledSends) { int wcNum = qp->pollCq(); if (wcNum < 0) { throw std::runtime_error("pollCq failed: error no " + std::to_string(errno)); @@ -129,7 +129,7 @@ void IBConnection::flush() throw std::runtime_error("pollCq failed: status " + std::to_string(wc->status)); } if (wc->opcode == IBV_WC_RDMA_WRITE) { - isWaiting = false; + numSignaledSends--; } } } diff --git a/src/include/connection.hpp b/src/include/connection.hpp index b380dbfd..8d1dec87 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -45,6 +45,7 @@ class IBConnection : public ConnectionBase Transport transport_; Transport remoteTransport_; IbQp* qp; + int numSignaledSends; public: IBConnection(int remoteRank, int tag, Transport transport, Communicator::Impl& commImpl); diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index 4c26131c..47ca9437 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -350,7 +350,7 @@ public: * transportType: the type of transport to be used (mscclppTransportP2P or mscclppTransportIB) * ibDev: the name of the IB device to be used. Expects a null for mscclppTransportP2P. */ - std::shared_ptr connect(int remoteRank, int tag, Transport transport); + std::shared_ptr connectOnSetup(int remoteRank, int tag, Transport transport); /* Add a custom Setuppable object to a list of objects to be setup later, when setup() is called. 
*/ void addSetup(std::shared_ptr setuppable); diff --git a/tests/communicator_test_cpp.cc b/tests/communicator_test_cpp.cc deleted file mode 100644 index 29712cd0..00000000 --- a/tests/communicator_test_cpp.cc +++ /dev/null @@ -1,193 +0,0 @@ -#include "mscclpp.hpp" -#include "epoch.hpp" - -#include -#include -#include -#include -#include -#include - -#define CUDATHROW(cmd) \ - do { \ - cudaError_t err = cmd; \ - if (err != cudaSuccess) { \ - throw std::runtime_error(std::string("Cuda failure '") + cudaGetErrorString(err) + "'"); \ - } \ - } while (false) - -mscclpp::Transport findIb(int localRank) -{ - mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, - mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, - mscclpp::Transport::IB6, mscclpp::Transport::IB7}; - return IBs[localRank]; -} - -void register_all_memories(std::unique_ptr& communicator, int rank, int worldSize, void* devicePtr, size_t deviceBufferSize, mscclpp::Transport myIbDevice, mscclpp::RegisteredMemory& localMemory, std::unordered_map& remoteMemory){ - localMemory = communicator->registerMemory(devicePtr, deviceBufferSize, mscclpp::Transport::CudaIpc | myIbDevice); - auto serialized = localMemory.serialize(); - int serializedSize = serialized.size(); - for (int i = 0; i < worldSize; i++) { - if (i != rank){ - communicator->bootstrapper()->send(serialized.data(), serializedSize, i, 0); - } - } - for (int i = 0; i < worldSize; i++) { - if (i != rank){ - std::vector deserialized(serializedSize); - communicator->bootstrapper()->recv(deserialized.data(), serializedSize, i, 0); - auto remote = mscclpp::RegisteredMemory::deserialize(deserialized); - remoteMemory[i] = remote; - } - } -} - -void make_connections(std::unique_ptr& communicator, int rank, int worldSize, int nRanksPerNode, mscclpp::Transport myIbDevice, std::unordered_map>& connections){ - for (int i = 0; i < worldSize; i++) { - if (i != rank){ - if (i / nRanksPerNode == 
rank / nRanksPerNode) { - connections[i] = communicator->connect(i, 0, mscclpp::Transport::CudaIpc); - } else { - connections[i] = communicator->connect(i, 0, myIbDevice); - } - } - } - communicator->setup(); -} - -void write_remote(int rank, int worldSize, std::unordered_map>& connections, std::unordered_map& remoteRegisteredMemories, mscclpp::RegisteredMemory& registeredMemory, int writeSize){ - for (int i = 0; i < worldSize; i++) { - if (i != rank) { - auto& conn = connections.at(i); - auto& peerMemory = remoteRegisteredMemories.at(i); - // printf("write to rank: %d, rank is %d\n", peerMemory.rank(), rank); - conn->write(peerMemory, rank * writeSize, registeredMemory, rank * writeSize, writeSize); - conn->flush(); - } - } - -} - -void test_communicator(int rank, int worldSize, int nranksPerNode) -{ - auto bootstrap = std::make_shared(rank, worldSize); - mscclpp::UniqueId id; - if (bootstrap->getRank() == 0) - id = bootstrap->createUniqueId(); - MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->initialize(id); - - auto communicator = std::make_unique(bootstrap); - if (bootstrap->getRank() == 0) - std::cout << "Communicator initialization passed" << std::endl; - - std::unordered_map> connections; - auto myIbDevice = findIb(rank % nranksPerNode); - - make_connections(communicator, rank, worldSize, nranksPerNode, myIbDevice, connections); - if (bootstrap->getRank() == 0) - std::cout << "Connection setup passed" << std::endl; - - int numBuffers = 1; - std::vector devicePtr(numBuffers); - int deviceBufferSize = 1024*1024; - - std::vector localMemory(numBuffers); - std::vector> remoteMemory(numBuffers); - - for (int n = 0; n < numBuffers; n++) { - if (n % 100 == 0) - std::cout << "Registering memory for " << std::to_string(n) << " buffers" << std::endl; - CUDATHROW(cudaMalloc(&devicePtr[n], deviceBufferSize)); - register_all_memories(communicator, rank, worldSize, devicePtr[n], deviceBufferSize, myIbDevice, localMemory[n], remoteMemory[n]); - } - 
bootstrap->barrier(); - if (bootstrap->getRank() == 0) - std::cout << "Memory registration for " << std::to_string(numBuffers) << " buffers passed" << std::endl; - - std::vector> epochs; - for (auto entry : connections) { - auto& conn = entry.second; - epochs.emplace_back(std::make_unique(*communicator, conn)); - } - communicator->setup(); - bootstrap->barrier(); - if (bootstrap->getRank() == 0) - std::cout << "Epochs are created" << std::endl; - - assert((deviceBufferSize / sizeof(int)) % worldSize == 0); - size_t writeSize = deviceBufferSize / worldSize; - size_t dataCount = deviceBufferSize / sizeof(int); - for (int n = 0; n < numBuffers; n++){ - std::vector hostBuffer(dataCount, 0); - for (int i = 0; i < dataCount; i++) { - hostBuffer[i] = rank + n * worldSize; - } - CUDATHROW(cudaMemcpy(devicePtr[n], hostBuffer.data(), deviceBufferSize, cudaMemcpyHostToDevice)); - } - CUDATHROW(cudaDeviceSynchronize()); - - bootstrap->barrier(); - if (bootstrap->getRank() == 0) - std::cout << "CUDA memory initialization passed" << std::endl; - - for (int n = 0; n < numBuffers; n++){ - write_remote(rank, worldSize, connections, remoteMemory[n], localMemory[n], writeSize); - } - bootstrap->barrier(); - if (bootstrap->getRank() == 0) - std::cout << "RDMA write for " << std::to_string(numBuffers) << " buffers passed" << std::endl; - - for (int n = 0; n < numBuffers; n++){ - // polling until it becomes ready - bool ready = false; - int niter = 0; - std::vector hostBuffer(dataCount, 0); - do { - ready = true; - CUDATHROW(cudaMemcpy(hostBuffer.data(), devicePtr[n], deviceBufferSize, cudaMemcpyDeviceToHost)); - for (int i = 0; i < worldSize; i++) { - for (int j = i*writeSize/sizeof(int); j < (i+1)*writeSize/sizeof(int); j++) { - if (hostBuffer[j] != i + n * worldSize) { - ready = false; - } - } - } - if (niter == 10000){ - throw std::runtime_error("Polling is stuck."); - } - niter++; - } while (!ready); - } - - bootstrap->barrier(); - if (bootstrap->getRank() == 0) - std::cout << 
"Polling for " << std::to_string(numBuffers) << " buffers passed" << std::endl; - - if (bootstrap->getRank() == 0) - std::cout << "--- MSCCLPP::Communicator tests passed! ---" << std::endl; - - for (int n = 0; n < numBuffers; n++){ - CUDATHROW(cudaFree(devicePtr[n])); - } -} - -int main(int argc, char** argv) -{ - int rank, worldSize; - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &worldSize); - MPI_Comm shmcomm; - MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm); - int shmWorldSize; - MPI_Comm_size(shmcomm, &shmWorldSize); - int nranksPerNode = shmWorldSize; - MPI_Comm_free(&shmcomm); - - test_communicator(rank, worldSize, nranksPerNode); - - MPI_Finalize(); - return 0; -} \ No newline at end of file diff --git a/tests/communicator_test_cpp.cu b/tests/communicator_test_cpp.cu new file mode 100644 index 00000000..fcdd0f5a --- /dev/null +++ b/tests/communicator_test_cpp.cu @@ -0,0 +1,289 @@ +#include "mscclpp.hpp" +#include "epoch.hpp" + +#include +#include +#include +#include +#include +#include + +#define CUDATHROW(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + throw std::runtime_error(std::string("Cuda failure '") + cudaGetErrorString(err) + "'"); \ + } \ + } while (false) + +mscclpp::Transport findIb(int localRank) +{ + mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, + mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, + mscclpp::Transport::IB6, mscclpp::Transport::IB7}; + return IBs[localRank]; +} + +void register_all_memories(mscclpp::Communicator& communicator, int rank, int worldSize, void* devicePtr, size_t deviceBufferSize, mscclpp::Transport myIbDevice, mscclpp::RegisteredMemory& localMemory, std::unordered_map& remoteMemory){ + localMemory = communicator.registerMemory(devicePtr, deviceBufferSize, mscclpp::Transport::CudaIpc | myIbDevice); + std::unordered_map> 
futureRemoteMemory; + for (int i = 0; i < worldSize; i++) { + if (i != rank){ + communicator.sendMemoryOnSetup(localMemory, i, 0); + futureRemoteMemory[i] = communicator.recvMemoryOnSetup(i, 0); + } + } + communicator.setup(); + for (int i = 0; i < worldSize; i++) { + if (i != rank){ + remoteMemory[i] = futureRemoteMemory[i].get(); + } + } + + + // auto serialized = localMemory.serialize(); + // int serializedSize = serialized.size(); + // for (int i = 0; i < worldSize; i++) { + // if (i != rank){ + // communicator.bootstrapper()->send(serialized.data(), serializedSize, i, 0); + // } + // } + // for (int i = 0; i < worldSize; i++) { + // if (i != rank){ + // std::vector deserialized(serializedSize); + // communicator.bootstrapper()->recv(deserialized.data(), serializedSize, i, 0); + // auto remote = mscclpp::RegisteredMemory::deserialize(deserialized); + // remoteMemory[i] = remote; + // } + // } +} + +void make_connections(mscclpp::Communicator& communicator, int rank, int worldSize, int nRanksPerNode, mscclpp::Transport myIbDevice, std::unordered_map>& connections){ + for (int i = 0; i < worldSize; i++) { + if (i != rank){ + if (i / nRanksPerNode == rank / nRanksPerNode) { + connections[i] = communicator.connectOnSetup(i, 0, mscclpp::Transport::CudaIpc); + } else { + connections[i] = communicator.connectOnSetup(i, 0, myIbDevice); + } + } + } + communicator.setup(); +} + +void write_remote(int rank, int worldSize, std::unordered_map>& connections, + std::unordered_map& remoteRegisteredMemories, mscclpp::RegisteredMemory& registeredMemory, int dataCountPerRank){ + for (int i = 0; i < worldSize; i++) { + if (i != rank) { + auto& conn = connections.at(i); + auto& peerMemory = remoteRegisteredMemories.at(i); + conn->write(peerMemory, rank * dataCountPerRank * sizeof(int), registeredMemory, rank * dataCountPerRank*sizeof(int), dataCountPerRank*sizeof(int)); + conn->flush(); + } + } +} + +void device_buffer_init(int rank, int worldSize, int dataCount, std::vector& 
devicePtr){ + for (int n = 0; n < (int)devicePtr.size(); n++){ + std::vector hostBuffer(dataCount, 0); + for (int i = 0; i < dataCount; i++) { + hostBuffer[i] = rank + n * worldSize; + } + CUDATHROW(cudaMemcpy(devicePtr[n], hostBuffer.data(), dataCount*sizeof(int), cudaMemcpyHostToDevice)); + } + CUDATHROW(cudaDeviceSynchronize()); +} + +bool test_device_buffer_write_correctness(int worldSize, int dataCount, std::vector& devicePtr){ + for (int n = 0; n < (int)devicePtr.size(); n++){ + std::vector hostBuffer(dataCount, 0); + CUDATHROW(cudaMemcpy(hostBuffer.data(), devicePtr[n], dataCount*sizeof(int), cudaMemcpyDeviceToHost)); + for (int i = 0; i < worldSize; i++) { + for (int j = i*dataCount/worldSize; j < (i+1)*dataCount/worldSize; j++) { + if (hostBuffer[j] != i + n * worldSize) { + return false; + } + } + } + } + return true; +} + +void test_write(int rank, int worldSize, int deviceBufferSize, std::shared_ptr bootstrap, std::unordered_map>& connections, + std::vector>& remoteMemory, std::vector& localMemory, std::vector& devicePtr, int numBuffers){ + + assert((deviceBufferSize / sizeof(int)) % worldSize == 0); + size_t dataCount = deviceBufferSize / sizeof(int); + + device_buffer_init(rank, worldSize, dataCount, devicePtr); + bootstrap->barrier(); + if (bootstrap->getRank() == 0) + std::cout << "CUDA memory initialization passed" << std::endl; + + for (int n = 0; n < numBuffers; n++){ + write_remote(rank, worldSize, connections, remoteMemory[n], localMemory[n], dataCount / worldSize); + } + bootstrap->barrier(); + if (bootstrap->getRank() == 0) + std::cout << "RDMA write for " << std::to_string(numBuffers) << " buffers passed" << std::endl; + + // polling until it becomes ready + bool ready = false; + int niter = 0; + do { + ready = test_device_buffer_write_correctness(worldSize, dataCount, devicePtr); + niter++; + if (niter == 10000){ + throw std::runtime_error("Polling is stuck."); + } + } while (!ready); + + bootstrap->barrier(); + if (bootstrap->getRank() == 
0) + std::cout << "Polling for " << std::to_string(numBuffers) << " buffers passed" << std::endl; +} + +__global__ void increament_epochs(mscclpp::DeviceEpoch* deviceEpochs, int rank, int worldSize){ + int tid = threadIdx.x; + if (tid != rank && tid < worldSize){ + deviceEpochs[tid].epochIncrement(); + } +} + +__global__ void wait_epochs(mscclpp::DeviceEpoch* deviceEpochs, int rank, int worldSize){ + int tid = threadIdx.x; + if (tid != rank && tid < worldSize){ + deviceEpochs[tid].wait(); + } +} + +void test_write_with_epochs(int rank, int worldSize, int deviceBufferSize, std::shared_ptr bootstrap, std::unordered_map>& connections, + std::vector>& remoteMemory, std::vector& localMemory, std::vector& devicePtr, std::unordered_map> epochs, int numBuffers){ + + assert((deviceBufferSize / sizeof(int)) % worldSize == 0); + size_t dataCount = deviceBufferSize / sizeof(int); + + device_buffer_init(rank, worldSize, dataCount, devicePtr); + bootstrap->barrier(); + if (bootstrap->getRank() == 0) + std::cout << "CUDA memory initialization passed" << std::endl; + + mscclpp::DeviceEpoch* deviceEpochs; + CUDATHROW(cudaMalloc(&deviceEpochs, sizeof(mscclpp::DeviceEpoch) * worldSize)); + for (int i = 0; i < worldSize; i++){ + if (i != rank){ + mscclpp::DeviceEpoch deviceEpoch = epochs[i]->deviceEpoch(); + CUDATHROW(cudaMemcpy(&deviceEpochs[i], &deviceEpoch, sizeof(mscclpp::DeviceEpoch), cudaMemcpyHostToDevice)); + } + } + CUDATHROW(cudaDeviceSynchronize()); + + bootstrap->barrier(); + if (bootstrap->getRank() == 0) + std::cout << "CUDA device epochs are created" << std::endl; + + + for (int n = 0; n < numBuffers; n++){ + write_remote(rank, worldSize, connections, remoteMemory[n], localMemory[n], dataCount / worldSize); + } + + increament_epochs<<<1, worldSize>>>(deviceEpochs, rank, worldSize); + CUDATHROW(cudaDeviceSynchronize()); + + for (int i = 0; i < worldSize; i++){ + if (i != rank){ + epochs[i]->signal(); + } + } + + wait_epochs<<<1, worldSize>>>(deviceEpochs, rank, 
worldSize); + CUDATHROW(cudaDeviceSynchronize()); + + if (!test_device_buffer_write_correctness(worldSize, dataCount, devicePtr)){ + throw std::runtime_error("unexpected result."); + } + + bootstrap->barrier(); + if (bootstrap->getRank() == 0) + std::cout << "--- Testing writes with singal for " << std::to_string(numBuffers) << " buffers passed ---" << std::endl; +} + +void test_communicator(int rank, int worldSize, int nranksPerNode) +{ + auto bootstrap = std::make_shared(rank, worldSize); + mscclpp::UniqueId id; + if (bootstrap->getRank() == 0) + id = bootstrap->createUniqueId(); + MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + bootstrap->initialize(id); + + mscclpp::Communicator communicator(bootstrap); + if (bootstrap->getRank() == 0) + std::cout << "Communicator initialization passed" << std::endl; + + std::unordered_map> connections; + auto myIbDevice = findIb(rank % nranksPerNode); + + make_connections(communicator, rank, worldSize, nranksPerNode, myIbDevice, connections); + if (bootstrap->getRank() == 0) + std::cout << "Connection setup passed" << std::endl; + + int numBuffers = 10; + std::vector devicePtr(numBuffers); + int deviceBufferSize = 1024*1024; + + std::vector localMemory(numBuffers); + std::vector> remoteMemory(numBuffers); + + for (int n = 0; n < numBuffers; n++) { + if (n % 100 == 0) + std::cout << "Registering memory for " << std::to_string(n) << " buffers" << std::endl; + CUDATHROW(cudaMalloc(&devicePtr[n], deviceBufferSize)); + register_all_memories(communicator, rank, worldSize, devicePtr[n], deviceBufferSize, myIbDevice, localMemory[n], remoteMemory[n]); + } + bootstrap->barrier(); + if (bootstrap->getRank() == 0) + std::cout << "Memory registration for " << std::to_string(numBuffers) << " buffers passed" << std::endl; + + test_write(rank, worldSize, deviceBufferSize, bootstrap, connections, remoteMemory, localMemory, devicePtr, numBuffers); + if (bootstrap->getRank() == 0) + std::cout << "--- Testing vanialla writes passed 
---" << std::endl; + + std::unordered_map> epochs; + for (auto entry : connections) { + auto& conn = entry.second; + epochs.insert({entry.first, std::make_shared(communicator, conn)}); + } + communicator.setup(); + bootstrap->barrier(); + if (bootstrap->getRank() == 0) + std::cout << "Epochs are created" << std::endl; + + test_write_with_epochs(rank, worldSize, deviceBufferSize, bootstrap, connections, remoteMemory, localMemory, devicePtr, epochs, numBuffers); + + if (bootstrap->getRank() == 0) + std::cout << "--- MSCCLPP::Communicator tests passed! ---" << std::endl; + + for (int n = 0; n < numBuffers; n++){ + CUDATHROW(cudaFree(devicePtr[n])); + } +} + +int main(int argc, char** argv) +{ + int rank, worldSize; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &worldSize); + MPI_Comm shmcomm; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm); + int shmWorldSize; + MPI_Comm_size(shmcomm, &shmWorldSize); + int nranksPerNode = shmWorldSize; + MPI_Comm_free(&shmcomm); + + test_communicator(rank, worldSize, nranksPerNode); + + MPI_Finalize(); + return 0; +} \ No newline at end of file From 81e7d1b344af413a5b6a73c0ebab754d0dff7bf6 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 3 May 2023 17:11:25 +0000 Subject: [PATCH 100/135] Channels work --- Makefile | 2 +- src/channel.cc | 26 ++++++++++++++++++++ src/connection.cc | 4 ++- src/include/channel.hpp | 6 ++++- src/include/proxy.hpp | 3 +-- src/include/utils.hpp | 54 +++++++++++++++++++++++++++++++++++++++++ src/proxy_cpp.cc | 16 ++++++++---- 7 files changed, 101 insertions(+), 10 deletions(-) create mode 100644 src/channel.cc create mode 100644 src/include/utils.hpp diff --git a/Makefile b/Makefile index 78b993cf..2b80afb5 100644 --- a/Makefile +++ b/Makefile @@ -121,7 +121,7 @@ LDFLAGS := $(NVLDFLAGS) $(GDRCOPY_LDFLAGS) -libverbs -lnuma LIBSRCS := $(addprefix src/,debug.cc utils.cc init.cc proxy.cc ib.cc config.cc) LIBSRCS += 
$(addprefix src/bootstrap/,bootstrap.cc socket.cc) LIBSRCS += $(addprefix src/,communicator.cc connection.cc registered_memory.cc) -LIBSRCS += $(addprefix src/,epoch.cc proxy_cpp.cc fifo.cc) +LIBSRCS += $(addprefix src/,epoch.cc proxy_cpp.cc fifo.cc channel.cc) ifneq ($(NPKIT), 0) LIBSRCS += $(addprefix src/misc/,npkit.cc) endif diff --git a/src/channel.cc b/src/channel.cc new file mode 100644 index 00000000..42572390 --- /dev/null +++ b/src/channel.cc @@ -0,0 +1,26 @@ +#include "channel.hpp" +#include "utils.h" +#include "checks.hpp" +#include "api.h" +#include "debug.h" + +namespace mscclpp { +namespace channel { + +MSCCLPP_API_CPP DeviceChannelService::DeviceChannelService(Communicator& communicator) : communicator_(communicator), + proxy_([&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) { + int cudaDevice; + CUDATHROW(cudaGetDevice(&cudaDevice)); + MSCCLPPTHROW(getDeviceNumaNode(cudaDevice, &deviceNumaNode)); +} + +MSCCLPP_API_CPP void DeviceChannelService::bindThread() +{ + if (deviceNumaNode >= 0) { + MSCCLPPTHROW(numaBind(deviceNumaNode)); + INFO(MSCCLPP_INIT, "NUMA node of DeviceChannelService proxy thread is set to %d", deviceNumaNode); + } +} + +} // namespace channel +} // namespace mscclpp \ No newline at end of file diff --git a/src/connection.cc b/src/connection.cc index 66c54f06..0dee770b 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -4,6 +4,7 @@ #include "infiniband/verbs.h" #include "npkit/npkit.h" #include "registered_memory.hpp" +#include "utils.hpp" namespace mscclpp { @@ -33,7 +34,7 @@ int ConnectionBase::tag() { return tag_; } CudaIpcConnection::CudaIpcConnection(int remoteRank, int tag) : ConnectionBase(remoteRank, tag) { - cudaStreamCreate(&stream); + CUDATHROW(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); } CudaIpcConnection::~CudaIpcConnection() @@ -54,6 +55,7 @@ Transport CudaIpcConnection::remoteTransport() void CudaIpcConnection::write(RegisteredMemory dst, uint64_t 
dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) { + ScopedTimer timer("CudaIpcConnection::write"); validateTransport(dst, remoteTransport()); validateTransport(src, transport()); diff --git a/src/include/channel.hpp b/src/include/channel.hpp index 42826f4f..eb4bd9e7 100644 --- a/src/include/channel.hpp +++ b/src/include/channel.hpp @@ -5,6 +5,7 @@ #include "mscclpp.hpp" #include "proxy.hpp" #include "mscclppfifo.hpp" +#include "utils.hpp" namespace mscclpp { namespace channel { @@ -177,7 +178,7 @@ inline ProxyHandler makeChannelProxyHandler(DeviceChannelService& channelService class DeviceChannelService { public: - DeviceChannelService(Communicator& communicator) : communicator_(communicator), proxy_([&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }) {} + DeviceChannelService(Communicator& communicator); ChannelId addChannel(std::shared_ptr connection) { channels_.push_back(Channel(communicator_, connection)); @@ -200,6 +201,9 @@ private: std::vector channels_; std::vector memories_; Proxy proxy_; + int deviceNumaNode; + + void bindThread(); ProxyHandlerResult handleTrigger(ProxyTrigger triggerRaw) { ChannelTrigger* trigger = reinterpret_cast(&triggerRaw); diff --git a/src/include/proxy.hpp b/src/include/proxy.hpp index f913beac..51ae4752 100644 --- a/src/include/proxy.hpp +++ b/src/include/proxy.hpp @@ -21,12 +21,11 @@ using ProxyHandler = std::function; class Proxy { public: + Proxy(ProxyHandler handler, std::function threadInit); Proxy(ProxyHandler handler); - ~Proxy(); void start(); - void stop(); HostProxyFifo& fifo(); diff --git a/src/include/utils.hpp b/src/include/utils.hpp new file mode 100644 index 00000000..9abf9994 --- /dev/null +++ b/src/include/utils.hpp @@ -0,0 +1,54 @@ +#ifndef MSCCLPP_UTILS_HPP_ +#define MSCCLPP_UTILS_HPP_ + +#include +#include + +namespace mscclpp { + +struct Timer +{ + std::chrono::steady_clock::time_point start; + + Timer() + { + start = std::chrono::steady_clock::now(); + } + + int64_t 
elapsed() + { + auto end = std::chrono::steady_clock::now(); + return std::chrono::duration_cast(end - start).count(); + } + + void reset() + { + start = std::chrono::steady_clock::now(); + } + + void print(const char* name) + { + auto end = std::chrono::steady_clock::now(); + auto elapsed = std::chrono::duration_cast(end - start).count(); + printf("%s: %ld us\n", name, elapsed); + } +}; + +struct ScopedTimer +{ + Timer timer; + const char* name; + + ScopedTimer(const char* name) : name(name) + { + } + + ~ScopedTimer() + { + timer.print(name); + } +}; + +} // namespace mscclpp + +#endif // MSCCLPP_UTILS_HPP_ diff --git a/src/proxy_cpp.cc b/src/proxy_cpp.cc index 2fb8c2b0..b1626813 100644 --- a/src/proxy_cpp.cc +++ b/src/proxy_cpp.cc @@ -2,6 +2,7 @@ #include "api.h" #include "mscclpp.hpp" #include "utils.h" +#include "utils.hpp" #include #include @@ -14,18 +15,23 @@ const int ProxyFlushPeriod = 4; struct Proxy::Impl { ProxyHandler handler; + std::function threadInit; HostProxyFifo fifo; std::thread service; std::atomic_bool running; - Impl(ProxyHandler handler) : handler(handler), running(false) + Impl(ProxyHandler handler, std::function threadInit) : handler(handler), threadInit(threadInit), running(false) { } }; -MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler) +MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler, std::function threadInit) +{ + pimpl = std::make_unique(handler, threadInit); +} + +MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler) : Proxy(handler, [] {}) { - pimpl = std::make_unique(handler); } MSCCLPP_API_CPP Proxy::~Proxy() @@ -39,8 +45,8 @@ MSCCLPP_API_CPP void Proxy::start() { pimpl->running = true; pimpl->service = std::thread([this] { - // from this point on, proxy thread will stay close to the device - // PROXYMSCCLPPCHECK(numaBind(pimpl->comm->devNumaNode)); // TODO: reenable this + + pimpl->threadInit(); ProxyHandler handler = this->pimpl->handler; HostProxyFifo& fifo = this->pimpl->fifo; From 39666f999ffdbc1d5a1fa6cdaff7494548808fd6 Mon 
Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 3 May 2023 19:20:45 +0000 Subject: [PATCH 101/135] Quick fix --- src/communicator.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/communicator.cc b/src/communicator.cc index 1fd64132..469502b7 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -86,7 +86,9 @@ struct MemoryReceiver : public Setuppable { std::vector data; bootstrap->recv(data, remoteRank_, tag_); - memoryPromise_.set_value(RegisteredMemory::deserialize(data)); + auto memory = RegisteredMemory::deserialize(data); + memory.data(); + memoryPromise_.set_value(memory); } std::promise memoryPromise_; From 4a41c19e721ffad3e22fbf23611cb217548a93da Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 3 May 2023 19:40:23 +0000 Subject: [PATCH 102/135] Fix performance bug and base pointer offset --- src/communicator.cc | 4 +-- src/connection.cc | 1 - src/include/registered_memory.hpp | 7 ++-- src/registered_memory.cc | 57 ++++++++++++++++--------------- src/utils.cc | 9 ++++- 5 files changed, 44 insertions(+), 34 deletions(-) diff --git a/src/communicator.cc b/src/communicator.cc index 469502b7..1fd64132 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -86,9 +86,7 @@ struct MemoryReceiver : public Setuppable { std::vector data; bootstrap->recv(data, remoteRank_, tag_); - auto memory = RegisteredMemory::deserialize(data); - memory.data(); - memoryPromise_.set_value(memory); + memoryPromise_.set_value(RegisteredMemory::deserialize(data)); } std::promise memoryPromise_; diff --git a/src/connection.cc b/src/connection.cc index 0dee770b..dca3e662 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -55,7 +55,6 @@ Transport CudaIpcConnection::remoteTransport() void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) { - ScopedTimer timer("CudaIpcConnection::write"); validateTransport(dst, remoteTransport()); validateTransport(src, 
transport()); diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index e95507f1..bf4802ce 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -16,7 +16,10 @@ struct TransportInfo // TODO: rewrite this using std::variant or something bool ibLocal; union { - cudaIpcMemHandle_t cudaIpcHandle; + struct { + cudaIpcMemHandle_t cudaIpcBaseHandle; + size_t cudaIpcOffsetFromBase; + }; struct { const IbMr* ibMr; IbMrInfo ibMrInfo; @@ -27,9 +30,9 @@ struct TransportInfo struct RegisteredMemory::Impl { void* data; - bool dataInitialized; size_t size; int rank; + uint64_t hostHash; TransportFlags transports; std::vector transportInfos; diff --git a/src/registered_memory.cc b/src/registered_memory.cc index abf17a8b..fed732a0 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -1,13 +1,14 @@ #include "registered_memory.hpp" #include "api.h" #include "checks.hpp" +#include "utils.h" #include #include namespace mscclpp { RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags transports, Communicator::Impl& commImpl) - : data(data), dataInitialized(true), size(size), rank(rank), transports(transports) + : data(data), size(size), rank(rank), hostHash(commImpl.rankToHash_.at(rank)), transports(transports) { if (transports.has(Transport::CudaIpc)) { TransportInfo transportInfo; @@ -18,7 +19,9 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags t size_t baseDataSize; // dummy CUTHROW(cuMemGetAddressRange((CUdeviceptr*)&baseDataPtr, &baseDataSize, (CUdeviceptr)data)); CUDATHROW(cudaIpcGetMemHandle(&handle, baseDataPtr)); - transportInfo.cudaIpcHandle = handle; + // TODO: bug with offset of base? 
+ transportInfo.cudaIpcBaseHandle = handle; + transportInfo.cudaIpcOffsetFromBase = (char*)data - (char*)baseDataPtr; this->transportInfos.push_back(transportInfo); } if ((transports & AllIBTransports).any()) { @@ -57,24 +60,12 @@ MSCCLPP_API_CPP RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) MSCCLPP_API_CPP RegisteredMemory::~RegisteredMemory() = default; -void* RegisteredMemory::data() +MSCCLPP_API_CPP void* RegisteredMemory::data() { - if (!pimpl->dataInitialized) { - if (pimpl->transports.has(Transport::CudaIpc)) { - auto entry = pimpl->getTransportInfo(Transport::CudaIpc); - CUDATHROW(cudaIpcOpenMemHandle(&pimpl->data, entry.cudaIpcHandle, cudaIpcMemLazyEnablePeerAccess)); - INFO(MSCCLPP_P2P, "Opened CUDA IPC handle for base point of %p", pimpl->data); - } - else - { - pimpl->data = nullptr; - } - pimpl->dataInitialized = true; - } return pimpl->data; } -size_t RegisteredMemory::size() +MSCCLPP_API_CPP size_t RegisteredMemory::size() { return pimpl->size; } @@ -84,7 +75,7 @@ MSCCLPP_API_CPP int RegisteredMemory::rank() return pimpl->rank; } -TransportFlags RegisteredMemory::transports() +MSCCLPP_API_CPP TransportFlags RegisteredMemory::transports() { return pimpl->transports; } @@ -94,6 +85,7 @@ MSCCLPP_API_CPP std::vector RegisteredMemory::serialize() std::vector result; std::copy_n(reinterpret_cast(&pimpl->size), sizeof(pimpl->size), std::back_inserter(result)); std::copy_n(reinterpret_cast(&pimpl->rank), sizeof(pimpl->rank), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl->hostHash), sizeof(pimpl->hostHash), std::back_inserter(result)); std::copy_n(reinterpret_cast(&pimpl->transports), sizeof(pimpl->transports), std::back_inserter(result)); if (pimpl->transportInfos.size() > std::numeric_limits::max()) { throw std::runtime_error("Too many transport info entries"); @@ -103,7 +95,9 @@ MSCCLPP_API_CPP std::vector RegisteredMemory::serialize() for (auto& entry : pimpl->transportInfos) { 
std::copy_n(reinterpret_cast(&entry.transport), sizeof(entry.transport), std::back_inserter(result)); if (entry.transport == Transport::CudaIpc) { - std::copy_n(reinterpret_cast(&entry.cudaIpcHandle), sizeof(entry.cudaIpcHandle), + std::copy_n(reinterpret_cast(&entry.cudaIpcBaseHandle), sizeof(entry.cudaIpcBaseHandle), + std::back_inserter(result)); + std::copy_n(reinterpret_cast(&entry.cudaIpcOffsetFromBase), sizeof(entry.cudaIpcOffsetFromBase), std::back_inserter(result)); } else if (AllIBTransports.has(entry.transport)) { std::copy_n(reinterpret_cast(&entry.ibMrInfo), sizeof(entry.ibMrInfo), std::back_inserter(result)); @@ -126,6 +120,8 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) it += sizeof(this->size); std::copy_n(it, sizeof(this->rank), reinterpret_cast(&this->rank)); it += sizeof(this->rank); + std::copy_n(it, sizeof(this->hostHash), reinterpret_cast(&this->hostHash)); + it += sizeof(this->hostHash); std::copy_n(it, sizeof(this->transports), reinterpret_cast(&this->transports)); it += sizeof(this->transports); int8_t transportCount; @@ -136,15 +132,13 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) std::copy_n(it, sizeof(transportInfo.transport), reinterpret_cast(&transportInfo.transport)); it += sizeof(transportInfo.transport); if (transportInfo.transport == Transport::CudaIpc) { - cudaIpcMemHandle_t handle; - std::copy_n(it, sizeof(handle), reinterpret_cast(&handle)); - it += sizeof(handle); - transportInfo.cudaIpcHandle = handle; + std::copy_n(it, sizeof(transportInfo.cudaIpcBaseHandle), reinterpret_cast(&transportInfo.cudaIpcBaseHandle)); + it += sizeof(transportInfo.cudaIpcBaseHandle); + std::copy_n(it, sizeof(transportInfo.cudaIpcOffsetFromBase), reinterpret_cast(&transportInfo.cudaIpcOffsetFromBase)); + it += sizeof(transportInfo.cudaIpcOffsetFromBase); } else if (AllIBTransports.has(transportInfo.transport)) { - IbMrInfo info; - std::copy_n(it, sizeof(info), reinterpret_cast(&info)); - it += sizeof(info); - 
transportInfo.ibMrInfo = info; + std::copy_n(it, sizeof(transportInfo.ibMrInfo), reinterpret_cast(&transportInfo.ibMrInfo)); + it += sizeof(transportInfo.ibMrInfo); transportInfo.ibLocal = false; } else { throw std::runtime_error("Unknown transport"); @@ -155,7 +149,16 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) throw std::runtime_error("Deserialization failed"); } - dataInitialized = false; + if (transports.has(Transport::CudaIpc)) { + uint64_t localHostHash = getHostHash(); + if (localHostHash == this->hostHash) { + auto entry = getTransportInfo(Transport::CudaIpc); + void* base; + CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess)); + data = static_cast(base) + entry.cudaIpcOffsetFromBase; + INFO(MSCCLPP_P2P, "Opened CUDA IPC handle at pointer %p", data); + } + } } } // namespace mscclpp diff --git a/src/utils.cc b/src/utils.cc index ebd31bfe..6954a64f 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -9,6 +9,7 @@ #include #include #include +#include // Get current Compute Capability // int mscclppCudaCompCap() { @@ -112,7 +113,7 @@ uint64_t getHash(const char* string, int n) * This string can be overridden by using the MSCCLPP_HOSTID env var. 
*/ #define HOSTID_FILE "/proc/sys/kernel/random/boot_id" -uint64_t getHostHash(void) +uint64_t computeHostHash(void) { char hostHash[1024]; char* hostId; @@ -144,6 +145,12 @@ uint64_t getHostHash(void) return getHash(hostHash, strlen(hostHash)); } +uint64_t getHostHash(void) +{ + thread_local std::unique_ptr hostHash = std::make_unique(computeHostHash()); + return *hostHash; +} + /* Generate a hash of the unique identifying string for this process * that will be unique for both bare-metal and container instances * Equivalent of a hash of; From 7af687954c0b7f2efbdd3ed87d76bac1d12f753f Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 3 May 2023 20:23:51 +0000 Subject: [PATCH 103/135] removing old mscclppComm_t comm from communicator --- src/include/communicator.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index 32fb6e30..5b0c7485 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -14,7 +14,6 @@ class ConnectionBase; struct Communicator::Impl { - mscclppComm_t comm; std::vector> connections_; std::vector> toSetup_; std::unordered_map> ibContexts_; From 518f325225ccece587aba8a0874a277d22cb4cc8 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 3 May 2023 22:45:47 +0000 Subject: [PATCH 104/135] kernel 2 is also performant --- tests/allgather_test_cpp.cu | 14 +++++++++++--- tests/communicator_test_cpp.cu | 17 ----------------- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/tests/allgather_test_cpp.cu b/tests/allgather_test_cpp.cu index aaff931c..ad473f8f 100644 --- a/tests/allgather_test_cpp.cu +++ b/tests/allgather_test_cpp.cu @@ -84,7 +84,7 @@ __device__ void localAllGather(mscclpp::channel::SimpleDeviceChannel devChan, in if ((remoteRank % nranksPerNode) == ((rank + i) % nranksPerNode)) { // put your data to GPU (rank+i) % nranksPerNode and signal in one call if ((threadIdx.x % 32) == 0) - devChan.putWithSignalAndFlush(offset, size); + 
devChan.putWithSignal(offset, size); } // wait for the data from GPU (rank-i) % nranksPerNode to arrive if ((remoteRank % nranksPerNode) == ((rank - i + nranksPerNode) % nranksPerNode)) { @@ -100,6 +100,9 @@ __device__ void allgather1(mscclpp::channel::SimpleDeviceChannel devChan, int ra { localAllGather(devChan, rank, world_size, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); + if (remoteRank / nranksPerNode == rank / nranksPerNode) + if ((threadIdx.x % 32) == 0) + devChan.flush(); } __device__ void allgather2(mscclpp::channel::SimpleDeviceChannel devChan, int rank, int world_size, int nranksPerNode, @@ -127,7 +130,7 @@ __device__ void allgather2(mscclpp::channel::SimpleDeviceChannel devChan, int ra if (remoteRank % nranksPerNode == rank % nranksPerNode) { // opposite side if ((threadIdx.x % 32) == 0) - devChan.putWithSignalAndFlush(rank * nelemsPerGPU * sizeof(int), + devChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); if ((threadIdx.x % 32) == 0) devChan.wait(); @@ -147,7 +150,7 @@ __device__ void allgather2(mscclpp::channel::SimpleDeviceChannel devChan, int ra if (remoteRank % nranksPerNode == rank % nranksPerNode) { // opposite side if ((threadIdx.x % 32) == 0) - devChan.putWithSignalAndFlush((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * + devChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), nelemsPerGPU / pipelineSize * sizeof(int)); if ((threadIdx.x % 32) == 0) @@ -163,6 +166,11 @@ __device__ void allgather2(mscclpp::channel::SimpleDeviceChannel devChan, int ra (otherNghr * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), nelemsPerGPU / pipelineSize * sizeof(int)); } + + if (remoteRank / nranksPerNode == rank / nranksPerNode || remoteRank % nranksPerNode == rank % nranksPerNode) { + if ((threadIdx.x % 32) == 0) + 
devChan.flush(); + } } __global__ void kernel(int rank, int world_size, int nranksPerNode, size_t nelemsPerGPU, int kernel) diff --git a/tests/communicator_test_cpp.cu b/tests/communicator_test_cpp.cu index fcdd0f5a..56c8592e 100644 --- a/tests/communicator_test_cpp.cu +++ b/tests/communicator_test_cpp.cu @@ -39,23 +39,6 @@ void register_all_memories(mscclpp::Communicator& communicator, int rank, int wo remoteMemory[i] = futureRemoteMemory[i].get(); } } - - - // auto serialized = localMemory.serialize(); - // int serializedSize = serialized.size(); - // for (int i = 0; i < worldSize; i++) { - // if (i != rank){ - // communicator.bootstrapper()->send(serialized.data(), serializedSize, i, 0); - // } - // } - // for (int i = 0; i < worldSize; i++) { - // if (i != rank){ - // std::vector deserialized(serializedSize); - // communicator.bootstrapper()->recv(deserialized.data(), serializedSize, i, 0); - // auto remote = mscclpp::RegisteredMemory::deserialize(deserialized); - // remoteMemory[i] = remote; - // } - // } } void make_connections(mscclpp::Communicator& communicator, int rank, int worldSize, int nRanksPerNode, mscclpp::Transport myIbDevice, std::unordered_map>& connections){ From 503cdd5c7ee693e6b16a1aa75fa85d86321bf0f0 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 13 Apr 2023 00:23:25 +0000 Subject: [PATCH 105/135] CMake build system transition WIP --- CMakeLists.txt | 23 ++++++++++++++++++ cmake/modules/FindGDRCopy.cmake | 41 +++++++++++++++++++++++++++++++++ cmake/modules/FindIBVerbs.cmake | 41 +++++++++++++++++++++++++++++++++ cmake/modules/FindNUMA.cmake | 41 +++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 12 ++++++++++ tests/CMakeLists.txt | 5 ++++ 6 files changed, 163 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 cmake/modules/FindGDRCopy.cmake create mode 100644 cmake/modules/FindIBVerbs.cmake create mode 100644 cmake/modules/FindNUMA.cmake create mode 100644 src/CMakeLists.txt create mode 100644 
tests/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..87c9c24e --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,23 @@ +cmake_minimum_required(VERSION 3.26) + +project(mscclpp LANGUAGES CUDA CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CUDA_STANDARD 17) + +list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules) + +find_package(CUDAToolkit REQUIRED) +find_package(IBVerbs REQUIRED) +find_package(NUMA REQUIRED) +find_package(GDRCopy) + +option(USE_MPI_FOR_TESTS "Use MPI for tests" ON) +if(USE_MPI_FOR_TESTS) + find_package(MPI REQUIRED) +endif() + +include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + +add_subdirectory(src) +add_subdirectory(tests) \ No newline at end of file diff --git a/cmake/modules/FindGDRCopy.cmake b/cmake/modules/FindGDRCopy.cmake new file mode 100644 index 00000000..cde447ba --- /dev/null +++ b/cmake/modules/FindGDRCopy.cmake @@ -0,0 +1,41 @@ +# Find the GDRCopy libraries +# +# The following variables are optionally searched for defaults +# GDRCOPY_ROOT_DIR: Base directory where all GDRCopy components are found +# GDRCOPY_INCLUDE_DIR: Directory where GDRCopy headers are found +# GDRCOPY_LIB_DIR: Directory where GDRCopy libraries are found + +# The following are set after configuration is done: +# GDRCOPY_FOUND +# GDRCOPY_INCLUDE_DIRS +# GDRCOPY_LIBRARIES + +# An imported target MSCCLPP::gdrcopy is created if the library is found. 
+ +find_path(GDRCOPY_INCLUDE_DIRS + NAMES gdrapi.h + HINTS + ${GDRCOPY_INCLUDE_DIR} + ${GDRCOPY_ROOT_DIR} + ${GDRCOPY_ROOT_DIR}/include) + +find_library(GDRCOPY_LIBRARIES + NAMES gdrapi + HINTS + ${GDRCOPY_LIB_DIR} + ${GDRCOPY_ROOT_DIR} + ${GDRCOPY_ROOT_DIR}/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES) +mark_as_advanced(GDRCOPY_INCLUDE_DIR GDRCOPY_LIBRARIES) + +if(GDRCOPY_FOUND) + if(NOT TARGET MSCCLPP::gdrcopy) + add_library(MSCCLPP::gdrcopy UNKNOWN IMPORTED) + endif() + set_target_properties(MSCCLPP::gdrcopy PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${GDRCOPY_INCLUDE_DIR}" + IMPORTED_LINK_INTERFACE_LANGUAGES "C" + IMPORTED_LOCATION "${GDRCOPY_LIBRARIES}") +endif() \ No newline at end of file diff --git a/cmake/modules/FindIBVerbs.cmake b/cmake/modules/FindIBVerbs.cmake new file mode 100644 index 00000000..fc80b11c --- /dev/null +++ b/cmake/modules/FindIBVerbs.cmake @@ -0,0 +1,41 @@ +# Find the IB Verbs libraries +# +# The following variables are optionally searched for defaults +# IBVERBS_ROOT_DIR: Base directory where all ibverbs components are found +# IBVERBS_INCLUDE_DIR: Directory where ibverbs headers are found +# IBVERBS_LIB_DIR: Directory where ibverbs libraries are found + +# The following are set after configuration is done: +# IBVERBS_FOUND +# IBVERBS_INCLUDE_DIRS +# IBVERBS_LIBRARIES + +# An imported target MSCCLPP::ibverbs is created if the library is found. 
+ +find_path(IBVERBS_INCLUDE_DIRS + NAMES infiniband/verbs.h + HINTS + ${IBVERBS_INCLUDE_DIR} + ${IBVERBS_ROOT_DIR} + ${IBVERBS_ROOT_DIR}/include) + +find_library(IBVERBS_LIBRARIES + NAMES ibverbs + HINTS + ${IBVERBS_LIB_DIR} + ${IBVERBS_ROOT_DIR} + ${IBVERBS_ROOT_DIR}/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(IBVerbs DEFAULT_MSG IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES) +mark_as_advanced(IBVERBS_INCLUDE_DIR IBVERBS_LIBRARIES) + +if(IBVERBS_FOUND) + if(NOT TARGET MSCCLPP::ibverbs) + add_library(MSCCLPP::ibverbs UNKNOWN IMPORTED) + endif() + set_target_properties(MSCCLPP::ibverbs PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}" + IMPORTED_LINK_INTERFACE_LANGUAGES "C" + IMPORTED_LOCATION "${IBVERBS_LIBRARIES}") +endif() \ No newline at end of file diff --git a/cmake/modules/FindNUMA.cmake b/cmake/modules/FindNUMA.cmake new file mode 100644 index 00000000..70e04d53 --- /dev/null +++ b/cmake/modules/FindNUMA.cmake @@ -0,0 +1,41 @@ +# Find the numa libraries +# +# The following variables are optionally searched for defaults +# NUMA_ROOT_DIR: Base directory where all numa components are found +# NUMA_INCLUDE_DIR: Directory where numa headers are found +# NUMA_LIB_DIR: Directory where numa libraries are found + +# The following are set after configuration is done: +# NUMA_FOUND +# NUMA_INCLUDE_DIRS +# NUMA_LIBRARIES + +# An imported target MSCCLPP::numa is created if the library is found. 
+ +find_path(NUMA_INCLUDE_DIRS + NAMES numa.h + HINTS + ${NUMA_INCLUDE_DIR} + ${NUMA_ROOT_DIR} + ${NUMA_ROOT_DIR}/include) + +find_library(NUMA_LIBRARIES + NAMES numa + HINTS + ${NUMA_LIB_DIR} + ${NUMA_ROOT_DIR} + ${NUMA_ROOT_DIR}/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_INCLUDE_DIRS NUMA_LIBRARIES) +mark_as_advanced(NUMA_INCLUDE_DIR NUMA_LIBRARIES) + +if(NUMA_FOUND) + if(NOT TARGET MSCCLPP::numa) + add_library(MSCCLPP::numa UNKNOWN IMPORTED) + endif() + set_target_properties(MSCCLPP::numa PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}" + IMPORTED_LINK_INTERFACE_LANGUAGES "C" + IMPORTED_LOCATION "${NUMA_LIBRARIES}") +endif() \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 00000000..f6bf1bc3 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,12 @@ +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.h) +file(GLOB to_remove gdr.cc) +list(REMOVE_ITEM SOURCES ${to_remove}) + +add_library(mscclpp SHARED ${SOURCES}) +set_target_properties(mscclpp PROPERTIES LINKER_LANGUAGE CXX) +target_link_libraries(mscclpp PRIVATE MSCCLPP::ibverbs MSCCLPP::numa CUDA::cudart) +if(GDRCOPY_FOUND) + target_link_libraries(mscclpp PRIVATE MSCCLPP::gdrcopy) +endif() + +target_include_directories(mscclpp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 00000000..669c669d --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,5 @@ +add_executable(bootstrap_test bootstrap_test.cc) +target_link_libraries(bootstrap_test mscclpp) + +add_executable(allgather_test_standalone allgather_test_standalone.cu) +target_link_libraries(allgather_test_standalone mscclpp) \ No newline at end of file From 09d5f7c12ec6b0487d415ad10aa8eac086c58f58 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 4 May 2023 00:39:30 +0000 Subject: [PATCH 106/135] Fixes for cmake --- 
CMakeLists.txt | 1 + src/CMakeLists.txt | 2 +- tests/CMakeLists.txt | 7 +++++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 87c9c24e..68fa1b84 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,7 @@ find_package(GDRCopy) option(USE_MPI_FOR_TESTS "Use MPI for tests" ON) if(USE_MPI_FOR_TESTS) find_package(MPI REQUIRED) + add_definitions(-DMSCCLPP_USE_MPI_FOR_TESTS) endif() include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f6bf1bc3..1d989c6b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,7 +4,7 @@ list(REMOVE_ITEM SOURCES ${to_remove}) add_library(mscclpp SHARED ${SOURCES}) set_target_properties(mscclpp PROPERTIES LINKER_LANGUAGE CXX) -target_link_libraries(mscclpp PRIVATE MSCCLPP::ibverbs MSCCLPP::numa CUDA::cudart) +target_link_libraries(mscclpp PRIVATE MSCCLPP::ibverbs MSCCLPP::numa CUDA::cudart CUDA::cuda_driver) if(GDRCOPY_FOUND) target_link_libraries(mscclpp PRIVATE MSCCLPP::gdrcopy) endif() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 669c669d..fd02e658 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,8 @@ add_executable(bootstrap_test bootstrap_test.cc) -target_link_libraries(bootstrap_test mscclpp) +target_link_libraries(bootstrap_test mscclpp MPI::MPI_CXX) + +add_executable(allgather_test_cpp allgather_test_cpp.cu) +target_link_libraries(allgather_test_cpp mscclpp MPI::MPI_CXX) add_executable(allgather_test_standalone allgather_test_standalone.cu) -target_link_libraries(allgather_test_standalone mscclpp) \ No newline at end of file +target_link_libraries(allgather_test_standalone mscclpp MPI::MPI_CXX) From bd2121a2efa53d3ab4104f53cae24c71981dda2d Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 4 May 2023 00:53:50 +0000 Subject: [PATCH 107/135] CMake improvement --- CMakeLists.txt | 12 ++++++++++-- src/CMakeLists.txt | 9 +-------- 2 files changed, 11 
insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 68fa1b84..81f99cdb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,5 +20,13 @@ endif() include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -add_subdirectory(src) -add_subdirectory(tests) \ No newline at end of file +add_library(mscclpp SHARED) +add_subdirectory(src) # This adds the srouces to the mscclpp target +target_include_directories(mscclpp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/src/include) +set_target_properties(mscclpp PROPERTIES LINKER_LANGUAGE CXX) +target_link_libraries(mscclpp PRIVATE MSCCLPP::ibverbs MSCCLPP::numa CUDA::cudart CUDA::cuda_driver) +if(GDRCOPY_FOUND) + target_link_libraries(mscclpp PRIVATE MSCCLPP::gdrcopy) +endif() + +add_subdirectory(tests) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1d989c6b..5e583d45 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,11 +2,4 @@ file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.h) file(GLOB to_remove gdr.cc) list(REMOVE_ITEM SOURCES ${to_remove}) -add_library(mscclpp SHARED ${SOURCES}) -set_target_properties(mscclpp PROPERTIES LINKER_LANGUAGE CXX) -target_link_libraries(mscclpp PRIVATE MSCCLPP::ibverbs MSCCLPP::numa CUDA::cudart CUDA::cuda_driver) -if(GDRCOPY_FOUND) - target_link_libraries(mscclpp PRIVATE MSCCLPP::gdrcopy) -endif() - -target_include_directories(mscclpp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) \ No newline at end of file +target_sources(mscclpp PRIVATE ${SOURCES}) From d7103602acfa21723d022adc47d9dbc97057b80d Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 4 May 2023 00:55:35 +0000 Subject: [PATCH 108/135] Only build C++ tests in CMake --- tests/CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index fd02e658..457003e3 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,8 +1,8 @@ -add_executable(bootstrap_test bootstrap_test.cc) 
-target_link_libraries(bootstrap_test mscclpp MPI::MPI_CXX) +add_executable(bootstrap_test_cpp bootstrap_test_cpp.cc) +target_link_libraries(bootstrap_test_cpp mscclpp MPI::MPI_CXX) + +add_executable(communicator_test_cpp communicator_test_cpp.cu) +target_link_libraries(communicator_test_cpp mscclpp MPI::MPI_CXX) add_executable(allgather_test_cpp allgather_test_cpp.cu) target_link_libraries(allgather_test_cpp mscclpp MPI::MPI_CXX) - -add_executable(allgather_test_standalone allgather_test_standalone.cu) -target_link_libraries(allgather_test_standalone mscclpp MPI::MPI_CXX) From ddc9e681c8428a13ebd5f4f24cd4f4d4af3c3e31 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 4 May 2023 00:57:34 +0000 Subject: [PATCH 109/135] Add ib_test to CMake --- tests/CMakeLists.txt | 2 ++ tests/unittests/CMakeLists.txt | 2 ++ 2 files changed, 4 insertions(+) create mode 100644 tests/unittests/CMakeLists.txt diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 457003e3..b6ee63c7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -6,3 +6,5 @@ target_link_libraries(communicator_test_cpp mscclpp MPI::MPI_CXX) add_executable(allgather_test_cpp allgather_test_cpp.cu) target_link_libraries(allgather_test_cpp mscclpp MPI::MPI_CXX) + +add_subdirectory(unittests) diff --git a/tests/unittests/CMakeLists.txt b/tests/unittests/CMakeLists.txt new file mode 100644 index 00000000..85f87f52 --- /dev/null +++ b/tests/unittests/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(ib_test ib_test.cc) +target_link_libraries(ib_test mscclpp MPI::MPI_CXX CUDA::cudart) \ No newline at end of file From bb3239fd6b7fe91e277eb80a3a33b238caf45155 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 4 May 2023 11:03:45 +0000 Subject: [PATCH 110/135] Fix IB write issue --- src/ib.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ib.cc b/src/ib.cc index 7e77b235..1e3e0af6 100644 --- a/src/ib.cc +++ b/src/ib.cc @@ -128,6 +128,7 @@ IbQp::IbQp(void* ctx, void* pd, int port) throw 
std::runtime_error(err.str()); } this->qp = _qp; + this->wrn = 0; MSCCLPPTHROW(mscclppCalloc(reinterpret_cast(&this->wrs), MSCCLPP_IB_MAX_SENDS)); MSCCLPPTHROW(mscclppCalloc(reinterpret_cast(&this->sges), MSCCLPP_IB_MAX_SENDS)); MSCCLPPTHROW(mscclppCalloc(reinterpret_cast(&this->wcs), MSCCLPP_IB_CQ_POLL_NUM)); From 9fb29f9dfc5b4c0f0adc578682a8119e1c27a6e3 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 4 May 2023 17:48:24 +0000 Subject: [PATCH 111/135] timeout for flush --- src/connection.cc | 5 +++++ src/include/connection.hpp | 3 +++ 2 files changed, 8 insertions(+) diff --git a/src/connection.cc b/src/connection.cc index dca3e662..6a657e02 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -119,11 +119,16 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem void IBConnection::flush() { + Timer timer; while (numSignaledSends) { int wcNum = qp->pollCq(); if (wcNum < 0) { throw std::runtime_error("pollCq failed: error no " + std::to_string(errno)); } + + auto elapsed = timer.elapsed(); + if (elapsed > MSCCLPP_POLLING_WAIT) + throw std::runtime_error("pollCq is stuck: waited for " + std::to_string(elapsed) + " seconds. 
Expected " + std::to_string(numSignaledSends) + " signals"); for (int i = 0; i < wcNum; ++i) { const struct ibv_wc* wc = reinterpret_cast(qp->getWc(i)); if (wc->status != IBV_WC_SUCCESS) { diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 8d1dec87..e06c426a 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -1,6 +1,9 @@ #ifndef MSCCLPP_CONNECTION_HPP_ #define MSCCLPP_CONNECTION_HPP_ +// TODO(saemal): make this configurable +#define MSCCLPP_POLLING_WAIT 10000 // in microseconds + #include "communicator.hpp" #include "ib.hpp" #include "mscclpp.hpp" From 669c67b3de113b517d908670435eb6b07b629121 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 5 May 2023 08:42:25 +0000 Subject: [PATCH 112/135] enable github action on all ranches --- .github/workflows/cpplint.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cpplint.yml b/.github/workflows/cpplint.yml index 72da335e..0b002f44 100644 --- a/.github/workflows/cpplint.yml +++ b/.github/workflows/cpplint.yml @@ -2,9 +2,11 @@ name: C++ Lint on: push: - branches: [ main ] + branches: + - '**' pull_request: - branches: [ main ] + branches: + - '**' jobs: run-linters: From adaa75536dc7125d83c5f06cfa9d9e34d47666d0 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 5 May 2023 18:05:55 +0000 Subject: [PATCH 113/135] Add clang-format to CMake --- .clang-format | 139 +------------------------------------------------ CMakeLists.txt | 12 ++++- 2 files changed, 13 insertions(+), 138 deletions(-) diff --git a/.clang-format b/.clang-format index c3bb0335..bfd118eb 100644 --- a/.clang-format +++ b/.clang-format @@ -1,137 +1,2 @@ ---- -Language: Cpp -# BasedOnStyle: Microsoft -AccessModifierOffset: -2 -AlignAfterOpenBracket: Align -AlignConsecutiveMacros: false -AlignConsecutiveAssignments: false -AlignConsecutiveDeclarations: false -AlignEscapedNewlines: Right -AlignOperands: true -AlignTrailingComments: true 
-AllowAllArgumentsOnNextLine: true -AllowAllConstructorInitializersOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: Never -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: None -AllowShortLambdasOnASingleLine: All -AllowShortIfStatementsOnASingleLine: Never -AllowShortLoopsOnASingleLine: false -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakTemplateDeclarations: MultiLine -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterCaseLabel: false - AfterClass: true - AfterControlStatement: false # true - AfterEnum: true - AfterFunction: true - AfterNamespace: false # true - AfterObjCDeclaration: true - AfterStruct: true - AfterUnion: false - AfterExternBlock: true - BeforeCatch: false # true - BeforeElse: false # true - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakBeforeBinaryOperators: None -BreakBeforeBraces: Custom -BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: false -BreakStringLiterals: true -ColumnLimit: 120 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false -ConstructorInitializerAllOnOneLineOrOnePerLine: false -ConstructorInitializerIndentWidth: 2 -ContinuationIndentWidth: 2 -Cpp11BracedListStyle: true -DeriveLineEnding: true -DerivePointerAlignment: false -DisableFormat: false -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IncludeBlocks: Preserve -IncludeCategories: - - Regex: '^"(llvm|llvm-c|clang|clang-c)/' - Priority: 2 - SortPriority: 0 - - Regex: '^(<|"(gtest|gmock|isl|json)/)' - Priority: 3 - SortPriority: 0 - - Regex: '.*' - Priority: 1 - 
SortPriority: 0 -IncludeIsMainRegex: '(Test)?$' -IncludeIsMainSourceRegex: '' -IndentCaseLabels: false -IndentExternBlock: NoIndent -IndentGotoLabels: true -IndentPPDirectives: None -IndentWidth: 2 -IndentWrappedFunctionNames: false -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: true -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Auto -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 19 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 1000 -PointerAlignment: Left -ReflowComments: true -SortIncludes: true -SortUsingDeclarations: true -SpaceAfterCStyleCast: false -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceBeforeAssignmentOperators: true -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SpaceInEmptyBlock: false -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 1 -SpacesInAngles: false -SpacesInConditionalStatement: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -SpaceBeforeSquareBrackets: false -Standard: Latest -StatementMacros: - - Q_UNUSED - - QT_REQUIRE_VERSION -TabWidth: 2 -UseCRLF: false -UseTab: Never -... 
+BasedOnStyle: Google +ColumnLimit: 120 diff --git a/CMakeLists.txt b/CMakeLists.txt index 81f99cdb..a66535c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,6 @@ endif() include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) add_library(mscclpp SHARED) -add_subdirectory(src) # This adds the srouces to the mscclpp target target_include_directories(mscclpp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/src/include) set_target_properties(mscclpp PROPERTIES LINKER_LANGUAGE CXX) target_link_libraries(mscclpp PRIVATE MSCCLPP::ibverbs MSCCLPP::numa CUDA::cudart CUDA::cuda_driver) @@ -29,4 +28,15 @@ if(GDRCOPY_FOUND) target_link_libraries(mscclpp PRIVATE MSCCLPP::gdrcopy) endif() +find_program(CLANG_FORMAT clang-format) +if(CLANG_FORMAT) + message(STATUS "Found clang-format: ${CLANG_FORMAT}") + file(GLOB_RECURSE ALL_SOURCES include/*.h src/*.cpp src/*.h) + add_custom_target(check-format ALL COMMAND ${CLANG_FORMAT} --dry-run ${ALL_SOURCES}) + add_custom_target(format COMMAND ${CLANG_FORMAT} -i ${ALL_SOURCES}) +else() + message(STATUS "clang-format not found.") +endif() + +add_subdirectory(src) # This adds the sources to the mscclpp target add_subdirectory(tests) From 051643b4c2b9c18f7419f1e549810fcdb9c640df Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 5 May 2023 18:13:34 +0000 Subject: [PATCH 114/135] Fix clang-format glob --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a66535c9..106b605c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ endif() find_program(CLANG_FORMAT clang-format) if(CLANG_FORMAT) message(STATUS "Found clang-format: ${CLANG_FORMAT}") - file(GLOB_RECURSE ALL_SOURCES include/*.h src/*.cpp src/*.h) + file(GLOB_RECURSE ALL_SOURCES *.h *.hpp *.cc *.cpp *.cu) add_custom_target(check-format ALL COMMAND ${CLANG_FORMAT} --dry-run ${ALL_SOURCES}) add_custom_target(format COMMAND ${CLANG_FORMAT} -i ${ALL_SOURCES}) else() From 
86be901d98141188b36d19cc8ddee82f1f95cf45 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 5 May 2023 19:11:33 +0000 Subject: [PATCH 115/135] CMake improvements --- CMakeLists.txt | 38 +++++++++++++-------------- cmake/AddClangFormatTargets.cmake | 18 +++++++++++++ cmake/{modules => }/FindGDRCopy.cmake | 0 cmake/{modules => }/FindIBVerbs.cmake | 0 cmake/{modules => }/FindNUMA.cmake | 0 tests/CMakeLists.txt | 18 ++++++++----- tests/unittests/CMakeLists.txt | 4 +-- 7 files changed, 50 insertions(+), 28 deletions(-) create mode 100644 cmake/AddClangFormatTargets.cmake rename cmake/{modules => }/FindGDRCopy.cmake (100%) rename cmake/{modules => }/FindIBVerbs.cmake (100%) rename cmake/{modules => }/FindNUMA.cmake (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 106b605c..ad71de86 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,41 +1,41 @@ cmake_minimum_required(VERSION 3.26) - project(mscclpp LANGUAGES CUDA CXX) - set(CMAKE_CXX_STANDARD 17) set(CMAKE_CUDA_STANDARD 17) -list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules) +option(ENABLE_TRACE "Enable tracing" OFF) +option(USE_MPI_FOR_TESTS "Use MPI for tests" ON) +option(USE_NPKIT "Use NPKIT" ON) +option(ALLOW_GDRCOPY "Use GDRCopy, if available" OFF) + +list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) find_package(CUDAToolkit REQUIRED) find_package(IBVerbs REQUIRED) find_package(NUMA REQUIRED) -find_package(GDRCopy) - -option(USE_MPI_FOR_TESTS "Use MPI for tests" ON) if(USE_MPI_FOR_TESTS) find_package(MPI REQUIRED) - add_definitions(-DMSCCLPP_USE_MPI_FOR_TESTS) +endif() +if(ALLOW_GDRCOPY) + find_package(GDRCopy) endif() -include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +set(CLANG_FORMAT_SOURCE_DIRS src tests) +include(${PROJECT_SOURCE_DIR}/cmake/AddClangFormatTargets.cmake) add_library(mscclpp SHARED) target_include_directories(mscclpp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/src/include) set_target_properties(mscclpp PROPERTIES LINKER_LANGUAGE CXX) 
target_link_libraries(mscclpp PRIVATE MSCCLPP::ibverbs MSCCLPP::numa CUDA::cudart CUDA::cuda_driver) -if(GDRCOPY_FOUND) - target_link_libraries(mscclpp PRIVATE MSCCLPP::gdrcopy) +if(ENABLE_TRACE) + target_compile_definitions(mscclpp PRIVATE ENABLE_TRACE) endif() - -find_program(CLANG_FORMAT clang-format) -if(CLANG_FORMAT) - message(STATUS "Found clang-format: ${CLANG_FORMAT}") - file(GLOB_RECURSE ALL_SOURCES *.h *.hpp *.cc *.cpp *.cu) - add_custom_target(check-format ALL COMMAND ${CLANG_FORMAT} --dry-run ${ALL_SOURCES}) - add_custom_target(format COMMAND ${CLANG_FORMAT} -i ${ALL_SOURCES}) -else() - message(STATUS "clang-format not found.") +if(USE_NPKIT) + target_compile_definitions(mscclpp PRIVATE ENABLE_NPKIT) +endif() +if(ALLOW_GDRCOPY AND GDRCOPY_FOUND) + target_compile_definitions(mscclpp PRIVATE MSCCLPP_USE_GDRCOPY) + target_link_libraries(mscclpp PRIVATE MSCCLPP::gdrcopy) endif() add_subdirectory(src) # This adds the sources to the mscclpp target diff --git a/cmake/AddClangFormatTargets.cmake b/cmake/AddClangFormatTargets.cmake new file mode 100644 index 00000000..3bb3f73e --- /dev/null +++ b/cmake/AddClangFormatTargets.cmake @@ -0,0 +1,18 @@ +# Add targets to run clang-format + +find_program(CLANG_FORMAT clang-format) +if(CLANG_FORMAT) + message(STATUS "Found clang-format: ${CLANG_FORMAT}") + set(CLANG_FORMAT_FILE_TYPES *.h *.hpp *.c *.cc *.cpp *.cu) + # Produce combinations of source directories and file types + foreach(SOURCE_DIR ${CLANG_FORMAT_SOURCE_DIRS}) + foreach(FILE_TYPE ${CLANG_FORMAT_FILE_TYPES}) + list(APPEND CLANG_FORMAT_SOURCE_PATTERNS ${SOURCE_DIR}/${FILE_TYPE}) + endforeach() + endforeach() + file(GLOB_RECURSE CLANG_FORMAT_SOURCES ${CLANG_FORMAT_SOURCE_PATTERNS}) + add_custom_target(check-format ALL COMMAND ${CLANG_FORMAT} --dry-run ${CLANG_FORMAT_SOURCES}) + add_custom_target(format COMMAND ${CLANG_FORMAT} -i ${CLANG_FORMAT_SOURCES}) +else() + message(STATUS "clang-format not found.") +endif() diff --git a/cmake/modules/FindGDRCopy.cmake 
b/cmake/FindGDRCopy.cmake similarity index 100% rename from cmake/modules/FindGDRCopy.cmake rename to cmake/FindGDRCopy.cmake diff --git a/cmake/modules/FindIBVerbs.cmake b/cmake/FindIBVerbs.cmake similarity index 100% rename from cmake/modules/FindIBVerbs.cmake rename to cmake/FindIBVerbs.cmake diff --git a/cmake/modules/FindNUMA.cmake b/cmake/FindNUMA.cmake similarity index 100% rename from cmake/modules/FindNUMA.cmake rename to cmake/FindNUMA.cmake diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b6ee63c7..17875d69 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,10 +1,14 @@ -add_executable(bootstrap_test_cpp bootstrap_test_cpp.cc) -target_link_libraries(bootstrap_test_cpp mscclpp MPI::MPI_CXX) +function(add_test_executable name sources) + add_executable(${name} ${sources}) + target_link_libraries(${name} mscclpp) + if(USE_MPI_FOR_TESTS) + target_link_libraries(${name} MPI::MPI_CXX) + target_compile_definitions(${name} PRIVATE MSCCLPP_USE_MPI_FOR_TESTS) + endif() +endfunction() -add_executable(communicator_test_cpp communicator_test_cpp.cu) -target_link_libraries(communicator_test_cpp mscclpp MPI::MPI_CXX) - -add_executable(allgather_test_cpp allgather_test_cpp.cu) -target_link_libraries(allgather_test_cpp mscclpp MPI::MPI_CXX) +add_test_executable(bootstrap_test_cpp bootstrap_test_cpp.cc) +add_test_executable(communicator_test_cpp communicator_test_cpp.cu) +add_test_executable(allgather_test_cpp allgather_test_cpp.cu) add_subdirectory(unittests) diff --git a/tests/unittests/CMakeLists.txt b/tests/unittests/CMakeLists.txt index 85f87f52..44e405ad 100644 --- a/tests/unittests/CMakeLists.txt +++ b/tests/unittests/CMakeLists.txt @@ -1,2 +1,2 @@ -add_executable(ib_test ib_test.cc) -target_link_libraries(ib_test mscclpp MPI::MPI_CXX CUDA::cudart) \ No newline at end of file +add_test_executable(ib_test ib_test.cc) +target_link_libraries(ib_test CUDA::cudart) \ No newline at end of file From 4f528d29a0b4ad981e8a8f47866918afb98c076c Mon 
Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 5 May 2023 19:15:38 +0000 Subject: [PATCH 116/135] Make clang-format style file explicit --- cmake/AddClangFormatTargets.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/AddClangFormatTargets.cmake b/cmake/AddClangFormatTargets.cmake index 3bb3f73e..49e142a3 100644 --- a/cmake/AddClangFormatTargets.cmake +++ b/cmake/AddClangFormatTargets.cmake @@ -11,8 +11,8 @@ if(CLANG_FORMAT) endforeach() endforeach() file(GLOB_RECURSE CLANG_FORMAT_SOURCES ${CLANG_FORMAT_SOURCE_PATTERNS}) - add_custom_target(check-format ALL COMMAND ${CLANG_FORMAT} --dry-run ${CLANG_FORMAT_SOURCES}) - add_custom_target(format COMMAND ${CLANG_FORMAT} -i ${CLANG_FORMAT_SOURCES}) + add_custom_target(check-format ALL COMMAND ${CLANG_FORMAT} -style=file --dry-run ${CLANG_FORMAT_SOURCES}) + add_custom_target(format COMMAND ${CLANG_FORMAT} -style=file -i ${CLANG_FORMAT_SOURCES}) else() message(STATUS "clang-format not found.") endif() From 8650dbaff800ec8321daa46d29cf815abdc0f975 Mon Sep 17 00:00:00 2001 From: Binyang2014 Date: Sat, 6 May 2023 16:27:25 +0800 Subject: [PATCH 117/135] Add exception class for mscclpp (#67) Add exception class for mscclpp --- Makefile | 4 +- src/bootstrap/bootstrap.cc | 26 ++++--- src/channel.cc | 12 ++-- src/communicator.cc | 14 ++-- src/connection.cc | 36 ++++++---- src/epoch.cc | 18 +++-- src/errors.cc | 30 ++++++++ src/fifo.cc | 2 +- src/ib.cc | 44 ++++++------ src/include/channel.hpp | 78 +++++++++++++-------- src/include/checks.hpp | 9 ++- src/include/connection.hpp | 1 + src/include/epoch.hpp | 10 ++- src/include/errors.hpp | 46 +++++++++++++ src/include/mscclpp.hpp | 28 +++++--- src/include/registered_memory.hpp | 9 ++- src/include/utils.hpp | 70 +++++++++---------- src/proxy_cpp.cc | 8 +-- src/registered_memory.cc | 14 ++-- src/utils.cc | 2 +- tests/allgather_test_cpp.cu | 22 +++--- tests/communicator_test_cpp.cu | 109 ++++++++++++++++++------------ 22 files changed, 381 
insertions(+), 211 deletions(-) create mode 100644 src/errors.cc create mode 100644 src/include/errors.hpp diff --git a/Makefile b/Makefile index 2b80afb5..74d2c475 100644 --- a/Makefile +++ b/Makefile @@ -121,7 +121,7 @@ LDFLAGS := $(NVLDFLAGS) $(GDRCOPY_LDFLAGS) -libverbs -lnuma LIBSRCS := $(addprefix src/,debug.cc utils.cc init.cc proxy.cc ib.cc config.cc) LIBSRCS += $(addprefix src/bootstrap/,bootstrap.cc socket.cc) LIBSRCS += $(addprefix src/,communicator.cc connection.cc registered_memory.cc) -LIBSRCS += $(addprefix src/,epoch.cc proxy_cpp.cc fifo.cc channel.cc) +LIBSRCS += $(addprefix src/,epoch.cc proxy_cpp.cc fifo.cc channel.cc errors.cc) ifneq ($(NPKIT), 0) LIBSRCS += $(addprefix src/misc/,npkit.cc) endif @@ -135,7 +135,7 @@ HEADERS := $(wildcard src/include/*.h) CPPSOURCES := $(shell find ./ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)' -not -path "./build/*" -not -path "./python/*") PYTHONCPPSOURCES := $(shell find ./python/src/ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)') -INCEXPORTS := mscclpp.h mscclppfifo.h mscclpp.hpp mscclppfifo.hpp epoch.hpp +INCEXPORTS := mscclpp.h mscclppfifo.h mscclpp.hpp mscclppfifo.hpp epoch.hpp errors.hpp INCTARGETS := $(INCEXPORTS:%=$(BUILDDIR)/$(INCDIR)/%) LIBNAME := libmscclpp.so diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 75225799..50227234 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -194,13 +194,15 @@ void Bootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, std::vector< MSCCLPPTHROW(mscclppSocketClose(&sock)); if (this->nRanks_ != info.nRanks) { - throw std::runtime_error("Bootstrap Root : mismatch in rank count from procs " + std::to_string(this->nRanks_) + - " : " + std::to_string(info.nRanks)); + throw mscclpp::Error("Bootstrap Root : mismatch in rank count from procs " + std::to_string(this->nRanks_) + " : " + + std::to_string(info.nRanks), + mscclppInternalError); } if (std::memcmp(&zero, 
&rankAddressesRoot[info.rank], sizeof(mscclppSocketAddress)) != 0) { - throw std::runtime_error("Bootstrap Root : rank " + std::to_string(info.rank) + " of " + - std::to_string(this->nRanks_) + " has already checked in"); + throw mscclpp::Error("Bootstrap Root : rank " + std::to_string(info.rank) + " of " + std::to_string(this->nRanks_) + + " has already checked in", + mscclppInternalError); } // Save the connection handle for that rank @@ -269,16 +271,17 @@ void Bootstrap::Impl::netInit(std::string ipPortPair) if (!ipPortPair.empty()) { mscclppSocketAddress remoteAddr; if (mscclppSocketGetAddrFromString(&remoteAddr, ipPortPair.c_str()) != mscclppSuccess) { - throw std::runtime_error( - "Invalid ipPortPair, please use format: : or []: or :"); + throw mscclpp::Error( + "Invalid ipPortPair, please use format: : or []: or :", + mscclppInvalidArgument); } if (mscclppFindInterfaceMatchSubnet(netIfName_, &netIfAddr_, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { - throw std::runtime_error("NET/Socket : No usable listening interface found"); + throw mscclpp::Error("NET/Socket : No usable listening interface found", mscclppInternalError); } } else { int ret = mscclppFindInterfaces(netIfName_, &netIfAddr_, MAX_IF_NAME_SIZE, 1); if (ret <= 0) { - throw std::runtime_error("Bootstrap : no socket interface found"); + throw mscclpp::Error("Bootstrap : no socket interface found", mscclppInternalError); } } @@ -390,8 +393,9 @@ void Bootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) int recvSize; MSCCLPPTHROW(mscclppSocketRecv(sock, &recvSize, sizeof(int))); if (recvSize > size) { - throw std::runtime_error("Message truncated : received " + std::to_string(recvSize) + " bytes instead of " + - std::to_string(size)); + throw mscclpp::Error("Message truncated : received " + std::to_string(recvSize) + " bytes instead of " + + std::to_string(size), + mscclppInternalError); } MSCCLPPTHROW(mscclppSocketRecv(sock, data, std::min(recvSize, size))); } @@ -1058,4 +1062,4 @@ 
mscclppResult_t bootstrapAbort(void* commState) free(state->peerProxyAddresses); free(state); return mscclppSuccess; -} \ No newline at end of file +} diff --git a/src/channel.cc b/src/channel.cc index 42572390..33b679c2 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -1,14 +1,16 @@ #include "channel.hpp" -#include "utils.h" -#include "checks.hpp" #include "api.h" +#include "checks.hpp" #include "debug.h" +#include "utils.h" namespace mscclpp { namespace channel { -MSCCLPP_API_CPP DeviceChannelService::DeviceChannelService(Communicator& communicator) : communicator_(communicator), - proxy_([&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) { +MSCCLPP_API_CPP DeviceChannelService::DeviceChannelService(Communicator& communicator) + : communicator_(communicator), + proxy_([&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) +{ int cudaDevice; CUDATHROW(cudaGetDevice(&cudaDevice)); MSCCLPPTHROW(getDeviceNumaNode(cudaDevice, &deviceNumaNode)); @@ -23,4 +25,4 @@ MSCCLPP_API_CPP void DeviceChannelService::bindThread() } } // namespace channel -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/communicator.cc b/src/communicator.cc index 1fd64132..9b28cbb4 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -59,8 +59,9 @@ MSCCLPP_API_CPP RegisteredMemory Communicator::registerMemory(void* ptr, size_t struct MemorySender : public Setuppable { - MemorySender(RegisteredMemory memory, int remoteRank, int tag) - : memory_(memory), remoteRank_(remoteRank), tag_(tag) {} + MemorySender(RegisteredMemory memory, int remoteRank, int tag) : memory_(memory), remoteRank_(remoteRank), tag_(tag) + { + } void beginSetup(std::shared_ptr bootstrap) override { @@ -79,8 +80,9 @@ MSCCLPP_API_CPP void Communicator::sendMemoryOnSetup(RegisteredMemory memory, in struct MemoryReceiver : public Setuppable { - MemoryReceiver(int remoteRank, int tag) - : 
remoteRank_(remoteRank), tag_(tag) {} + MemoryReceiver(int remoteRank, int tag) : remoteRank_(remoteRank), tag_(tag) + { + } void endSetup(std::shared_ptr bootstrap) override { @@ -112,7 +114,7 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connectOnSetup(int rem << pimpl->rankToHash_[pimpl->bootstrap_->getRank()] << ")" << " != " << pimpl->bootstrap_->getRank() << "(" << std::hex << pimpl->rankToHash_[pimpl->bootstrap_->getRank()] << ")"; - throw std::runtime_error(ss.str()); + throw mscclpp::Error(ss.str(), mscclppInternalError); } auto cudaIpcConn = std::make_shared(remoteRank, tag); conn = cudaIpcConn; @@ -126,7 +128,7 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connectOnSetup(int rem pimpl->bootstrap_->getRank(), pimpl->rankToHash_[pimpl->bootstrap_->getRank()], getIBDeviceName(transport).c_str(), remoteRank, pimpl->rankToHash_[remoteRank]); } else { - throw std::runtime_error("Unsupported transport"); + throw mscclpp::Error("Unsupported transport", mscclppInvalidArgument); } pimpl->connections_.push_back(conn); addSetup(conn); diff --git a/src/connection.cc b/src/connection.cc index 6a657e02..60ff2291 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -1,17 +1,17 @@ -#include #include "connection.hpp" #include "checks.hpp" #include "infiniband/verbs.h" #include "npkit/npkit.h" #include "registered_memory.hpp" #include "utils.hpp" +#include namespace mscclpp { void validateTransport(RegisteredMemory mem, Transport transport) { if (!mem.transports().has(transport)) { - throw std::runtime_error("mem does not support transport"); + throw Error("RegisteredMemory does not support transport", mscclppInvalidArgument); } } @@ -24,11 +24,19 @@ std::shared_ptr Connection::getRegisteredMemoryImpl(Regi // ConnectionBase -ConnectionBase::ConnectionBase(int remoteRank, int tag) : remoteRank_(remoteRank), tag_(tag) {} +ConnectionBase::ConnectionBase(int remoteRank, int tag) : remoteRank_(remoteRank), tag_(tag) +{ +} -int ConnectionBase::remoteRank() { return 
remoteRank_; } +int ConnectionBase::remoteRank() +{ + return remoteRank_; +} -int ConnectionBase::tag() { return tag_; } +int ConnectionBase::tag() +{ + return tag_; +} // CudaIpcConnection @@ -99,11 +107,11 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem auto dstTransportInfo = getRegisteredMemoryImpl(dst)->getTransportInfo(remoteTransport()); if (dstTransportInfo.ibLocal) { - throw std::runtime_error("dst is local, which is not supported"); + throw Error("dst is local, which is not supported", mscclppInvalidArgument); } auto srcTransportInfo = getRegisteredMemoryImpl(src)->getTransportInfo(transport()); if (!srcTransportInfo.ibLocal) { - throw std::runtime_error("src is remote, which is not supported"); + throw Error("src is remote, which is not supported", mscclppInvalidArgument); } auto dstMrInfo = dstTransportInfo.ibMrInfo; @@ -113,7 +121,8 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem /*signaled=*/true); numSignaledSends++; qp->postSend(); - INFO(MSCCLPP_NET, "IBConnection write: from %p to %p, size %lu", (uint8_t*)srcMr->getBuff() + srcOffset, (uint8_t*)dstMrInfo.addr + dstOffset, size); + INFO(MSCCLPP_NET, "IBConnection write: from %p to %p, size %lu", (uint8_t*)srcMr->getBuff() + srcOffset, + (uint8_t*)dstMrInfo.addr + dstOffset, size); // npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_DATA_ENTRY, (uint32_t)size); } @@ -123,16 +132,19 @@ void IBConnection::flush() while (numSignaledSends) { int wcNum = qp->pollCq(); if (wcNum < 0) { - throw std::runtime_error("pollCq failed: error no " + std::to_string(errno)); + throw mscclpp::IbError("pollCq failed: error no " + std::to_string(errno), errno); } auto elapsed = timer.elapsed(); - if (elapsed > MSCCLPP_POLLING_WAIT) - throw std::runtime_error("pollCq is stuck: waited for " + std::to_string(elapsed) + " seconds. 
Expected " + std::to_string(numSignaledSends) + " signals"); + if (elapsed > MSCCLPP_POLLING_WAIT) { + throw Error("pollCq is stuck: waited for " + std::to_string(elapsed) + " seconds. Expected " + + std::to_string(numSignaledSends) + " signals", + mscclppInternalError); + } for (int i = 0; i < wcNum; ++i) { const struct ibv_wc* wc = reinterpret_cast(qp->getWc(i)); if (wc->status != IBV_WC_SUCCESS) { - throw std::runtime_error("pollCq failed: status " + std::to_string(wc->status)); + throw mscclpp::IbError("pollCq failed: status " + std::to_string(wc->status), wc->status); } if (wc->opcode == IBV_WC_RDMA_WRITE) { numSignaledSends--; diff --git a/src/epoch.cc b/src/epoch.cc index 9263fd1c..2e3a5166 100644 --- a/src/epoch.cc +++ b/src/epoch.cc @@ -1,26 +1,32 @@ #include "epoch.hpp" -#include "checks.hpp" #include "alloc.h" #include "api.h" +#include "checks.hpp" namespace mscclpp { -MSCCLPP_API_CPP Epoch::Epoch(Communicator& communicator, std::shared_ptr connection) : connection_(connection) { +MSCCLPP_API_CPP Epoch::Epoch(Communicator& communicator, std::shared_ptr connection) + : connection_(connection) +{ MSCCLPPTHROW(mscclppCudaCalloc(&device_.epochIds_, 1)); MSCCLPPTHROW(mscclppCudaCalloc(&device_.expectedInboundEpochId_, 1)); - localEpochIdsRegMem_ = communicator.registerMemory(device_.epochIds_, sizeof(device_.epochIds_), connection->transport()); + localEpochIdsRegMem_ = + communicator.registerMemory(device_.epochIds_, sizeof(device_.epochIds_), connection->transport()); communicator.sendMemoryOnSetup(localEpochIdsRegMem_, connection->remoteRank(), connection->tag()); remoteEpochIdsRegMem_ = communicator.recvMemoryOnSetup(connection->remoteRank(), connection->tag()); } -MSCCLPP_API_CPP Epoch::~Epoch() { +MSCCLPP_API_CPP Epoch::~Epoch() +{ mscclppCudaFree(device_.epochIds_); mscclppCudaFree(device_.expectedInboundEpochId_); } -MSCCLPP_API_CPP void Epoch::signal() { - connection_->write(remoteEpochIdsRegMem_.get(), offsetof(EpochIds, inboundReplica_), 
localEpochIdsRegMem_, offsetof(EpochIds, outbound_), sizeof(device_.epochIds_)); +MSCCLPP_API_CPP void Epoch::signal() +{ + connection_->write(remoteEpochIdsRegMem_.get(), offsetof(EpochIds, inboundReplica_), localEpochIdsRegMem_, + offsetof(EpochIds, outbound_), sizeof(device_.epochIds_)); } } // namespace mscclpp diff --git a/src/errors.cc b/src/errors.cc new file mode 100644 index 00000000..d893578c --- /dev/null +++ b/src/errors.cc @@ -0,0 +1,30 @@ +#include "errors.hpp" + +namespace mscclpp { + +BaseError::BaseError(std::string message, int errorCode) : std::runtime_error(message), errorCode_(errorCode) +{ +} + +int BaseError::getErrorCode() const +{ + return errorCode_; +} + +Error::Error(std::string message, int errorCode) : BaseError(message, errorCode) +{ +} + +CudaError::CudaError(std::string message, int errorCode) : BaseError(message, errorCode) +{ +} + +CuError::CuError(std::string message, int errorCode) : BaseError(message, errorCode) +{ +} + +IbError::IbError(std::string message, int errorCode) : BaseError(message, errorCode) +{ +} + +}; // namespace mscclpp diff --git a/src/fifo.cc b/src/fifo.cc index d5d70422..49902816 100644 --- a/src/fifo.cc +++ b/src/fifo.cc @@ -1,7 +1,7 @@ #include "alloc.h" +#include "api.h" #include "checks.hpp" #include "mscclppfifo.hpp" -#include "api.h" #include #include #include diff --git a/src/ib.cc b/src/ib.cc index 1e3e0af6..b95bfb43 100644 --- a/src/ib.cc +++ b/src/ib.cc @@ -6,12 +6,12 @@ #include #include "alloc.h" +#include "api.h" #include "checks.hpp" #include "comm.h" #include "debug.h" #include "ib.hpp" #include "mscclpp.hpp" -#include "api.h" #include #include @@ -20,7 +20,7 @@ namespace mscclpp { IbMr::IbMr(void* pd, void* buff, std::size_t size) : buff(buff) { if (size == 0) { - throw std::runtime_error("invalid size: " + std::to_string(size)); + throw std::invalid_argument("invalid size: " + std::to_string(size)); } static __thread uintptr_t pageSize = 0; if (pageSize == 0) { @@ -35,7 +35,7 @@ 
IbMr::IbMr(void* pd, void* buff, std::size_t size) : buff(buff) if (_mr == nullptr) { std::stringstream err; err << "ibv_reg_mr failed (errno " << errno << ")"; - throw std::runtime_error(err.str()); + throw mscclpp::IbError(err.str(), errno); } this->mr = _mr; this->size = pages * pageSize; @@ -73,7 +73,7 @@ IbQp::IbQp(void* ctx, void* pd, int port) if (this->cq == nullptr) { std::stringstream err; err << "ibv_create_cq failed (errno " << errno << ")"; - throw std::runtime_error(err.str()); + throw mscclpp::IbError(err.str(), errno); } struct ibv_qp_init_attr qpInitAttr; @@ -92,14 +92,14 @@ IbQp::IbQp(void* ctx, void* pd, int port) if (_qp == nullptr) { std::stringstream err; err << "ibv_create_qp failed (errno " << errno << ")"; - throw std::runtime_error(err.str()); + throw mscclpp::IbError(err.str(), errno); } struct ibv_port_attr portAttr; if (ibv_query_port(_ctx, port, &portAttr) != 0) { std::stringstream err; err << "ibv_query_port failed (errno " << errno << ")"; - throw std::runtime_error(err.str()); + throw mscclpp::IbError(err.str(), errno); } this->info.lid = portAttr.lid; this->info.port = port; @@ -111,7 +111,7 @@ IbQp::IbQp(void* ctx, void* pd, int port) if (ibv_query_gid(_ctx, port, 0, &gid) != 0) { std::stringstream err; err << "ibv_query_gid failed (errno " << errno << ")"; - throw std::runtime_error(err.str()); + throw mscclpp::IbError(err.str(), errno); } this->info.spn = gid.global.subnet_prefix; } @@ -125,7 +125,7 @@ IbQp::IbQp(void* ctx, void* pd, int port) if (ibv_modify_qp(_qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS) != 0) { std::stringstream err; err << "ibv_modify_qp failed (errno " << errno << ")"; - throw std::runtime_error(err.str()); + throw mscclpp::IbError(err.str(), errno); } this->qp = _qp; this->wrn = 0; @@ -174,7 +174,7 @@ void IbQp::rtr(const IbQpInfo& info) if (ret != 0) { std::stringstream err; err << "ibv_modify_qp failed (errno " << errno << ")"; - throw std::runtime_error(err.str()); 
+ throw mscclpp::IbError(err.str(), errno); } } @@ -194,7 +194,7 @@ void IbQp::rts() if (ret != 0) { std::stringstream err; err << "ibv_modify_qp failed (errno " << errno << ")"; - throw std::runtime_error(err.str()); + throw mscclpp::IbError(err.str(), errno); } } @@ -249,7 +249,7 @@ void IbQp::postSend() if (ret != 0) { std::stringstream err; err << "ibv_post_send failed (errno " << errno << ")"; - throw std::runtime_error(err.str()); + throw mscclpp::IbError(err.str(), errno); } this->wrn = 0; } @@ -265,7 +265,7 @@ void IbQp::postRecv(uint64_t wrId) if (ret != 0) { std::stringstream err; err << "ibv_post_recv failed (errno " << errno << ")"; - throw std::runtime_error(err.str()); + throw mscclpp::IbError(err.str(), errno); } } @@ -299,13 +299,13 @@ IbCtx::IbCtx(const std::string& devName) : devName(devName) if (this->ctx == nullptr) { std::stringstream err; err << "ibv_open_device failed (errno " << errno << ", device name << " << devName << ")"; - throw std::runtime_error(err.str()); + throw mscclpp::IbError(err.str(), errno); } this->pd = ibv_alloc_pd(reinterpret_cast(this->ctx)); if (this->pd == nullptr) { std::stringstream err; err << "ibv_alloc_pd failed (errno " << errno << ")"; - throw std::runtime_error(err.str()); + throw mscclpp::IbError(err.str(), errno); } } @@ -327,7 +327,7 @@ bool IbCtx::isPortUsable(int port) const if (ibv_query_port(reinterpret_cast(this->ctx), port, &portAttr) != 0) { std::stringstream err; err << "ibv_query_port failed (errno " << errno << ", port << " << port << ")"; - throw std::runtime_error(err.str()); + throw mscclpp::IbError(err.str(), errno); } return portAttr.state == IBV_PORT_ACTIVE && (portAttr.link_layer == IBV_LINK_LAYER_ETHERNET || portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND); @@ -339,7 +339,7 @@ int IbCtx::getAnyActivePort() const if (ibv_query_device(reinterpret_cast(this->ctx), &devAttr) != 0) { std::stringstream err; err << "ibv_query_device failed (errno " << errno << ")"; - throw 
std::runtime_error(err.str()); + throw mscclpp::IbError(err.str(), errno); } for (uint8_t port = 1; port <= devAttr.phys_port_cnt; ++port) { if (this->isPortUsable(port)) { @@ -354,10 +354,10 @@ IbQp* IbCtx::createQp(int port /*=-1*/) if (port == -1) { port = this->getAnyActivePort(); if (port == -1) { - throw std::runtime_error("No active port found"); + throw mscclpp::Error("No active port found", mscclppInternalError); } } else if (!this->isPortUsable(port)) { - throw std::runtime_error("invalid IB port: " + std::to_string(port)); + throw mscclpp::Error("invalid IB port: " + std::to_string(port), mscclppInternalError); } qps.emplace_back(new IbQp(this->ctx, this->pd, port)); return qps.back().get(); @@ -412,10 +412,10 @@ MSCCLPP_API_CPP std::string getIBDeviceName(Transport ibTransport) ibTransportIndex = 7; break; default: - throw std::runtime_error("Not an IB transport"); + throw std::invalid_argument("Not an IB transport"); } if (ibTransportIndex >= num) { - throw std::runtime_error("IB transport out of range"); + throw std::out_of_range("IB transport out of range"); } return devices[ibTransportIndex]->name; } @@ -444,11 +444,11 @@ MSCCLPP_API_CPP Transport getIBTransportByDeviceName(const std::string& ibDevice case 7: return Transport::IB7; default: - throw std::runtime_error("IB device index out of range"); + throw std::out_of_range("IB device index out of range"); } } } - throw std::runtime_error("IB device not found"); + throw std::invalid_argument("IB device not found"); } } // namespace mscclpp diff --git a/src/include/channel.hpp b/src/include/channel.hpp index eb4bd9e7..26d31731 100644 --- a/src/include/channel.hpp +++ b/src/include/channel.hpp @@ -3,8 +3,8 @@ #include "epoch.hpp" #include "mscclpp.hpp" -#include "proxy.hpp" #include "mscclppfifo.hpp" +#include "proxy.hpp" #include "utils.hpp" namespace mscclpp { @@ -15,10 +15,16 @@ class Channel { public: Channel(Communicator& communicator, std::shared_ptr connection) - : connection_(connection), 
epoch_(std::make_shared(communicator, connection)) {}; + : connection_(connection), epoch_(std::make_shared(communicator, connection)){}; - Connection& connection() { return *connection_; } - Epoch& epoch() { return *epoch_; } + Connection& connection() + { + return *connection_; + } + Epoch& epoch() + { + return *epoch_; + } private: std::shared_ptr connection_; @@ -69,8 +75,8 @@ union ChannelTrigger { __device__ ChannelTrigger(ProxyTrigger value) : value(value) { } - __device__ ChannelTrigger(TriggerType type, MemoryId dst, uint64_t dstOffset, MemoryId src, - uint64_t srcOffset, uint64_t size, int connectionId) + __device__ ChannelTrigger(TriggerType type, MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset, + uint64_t size, int connectionId) { value.fst = ((srcOffset << MSCCLPP_BITS_SIZE) + size); value.snd = ((((((((connectionId << MSCCLPP_BITS_TYPE) + (uint64_t)type) << MSCCLPP_BITS_REGMEM_HANDLE) + dst) @@ -86,15 +92,17 @@ struct DeviceChannel { DeviceChannel() = default; - DeviceChannel(ChannelId channelId, DeviceEpoch epoch, DeviceProxyFifo fifo) : channelId_(channelId), epoch_(epoch), fifo_(fifo) {} + DeviceChannel(ChannelId channelId, DeviceEpoch epoch, DeviceProxyFifo fifo) + : channelId_(channelId), epoch_(epoch), fifo_(fifo) + { + } DeviceChannel(const DeviceChannel& other) = default; DeviceChannel& operator=(DeviceChannel& other) = default; #ifdef __CUDACC__ - __forceinline__ __device__ void put(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset, - uint64_t size) + __forceinline__ __device__ void put(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset, uint64_t size) { fifo_.push(ChannelTrigger(TriggerData, dst, dstOffset, src, srcOffset, size, channelId_).value); } @@ -110,13 +118,11 @@ struct DeviceChannel fifo_.push(ChannelTrigger(TriggerFlag, 0, 0, 0, 0, 1, channelId_).value); } - __forceinline__ __device__ void putWithSignal(MemoryId dst, uint64_t dstOffset, MemoryId src, - uint64_t srcOffset, 
uint64_t size) + __forceinline__ __device__ void putWithSignal(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset, + uint64_t size) { epochIncrement(); - fifo_.push( - ChannelTrigger(TriggerData | TriggerFlag, dst, dstOffset, src, srcOffset, size, channelId_) - .value); + fifo_.push(ChannelTrigger(TriggerData | TriggerFlag, dst, dstOffset, src, srcOffset, size, channelId_).value); } __forceinline__ __device__ void putWithSignal(MemoryId dst, MemoryId src, uint64_t offset, uint64_t size) @@ -128,16 +134,14 @@ struct DeviceChannel uint64_t srcOffset, uint64_t size) { epochIncrement(); - uint64_t curFifoHead = fifo_.push(ChannelTrigger(TriggerData | TriggerFlag | TriggerSync, dst, - dstOffset, src, srcOffset, size, channelId_) - .value); + uint64_t curFifoHead = fifo_.push( + ChannelTrigger(TriggerData | TriggerFlag | TriggerSync, dst, dstOffset, src, srcOffset, size, channelId_).value); while (*(volatile uint64_t*)&fifo_.triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && *(volatile uint64_t*)fifo_.tailReplica <= curFifoHead) ; } - __forceinline__ __device__ void putWithSignalAndFlush(MemoryId dst, MemoryId src, uint64_t offset, - uint64_t size) + __forceinline__ __device__ void putWithSignalAndFlush(MemoryId dst, MemoryId src, uint64_t offset, uint64_t size) { putWithSignalAndFlush(dst, offset, src, offset, size); } @@ -176,25 +180,40 @@ class DeviceChannelService; inline ProxyHandler makeChannelProxyHandler(DeviceChannelService& channelService); -class DeviceChannelService { +class DeviceChannelService +{ public: DeviceChannelService(Communicator& communicator); - ChannelId addChannel(std::shared_ptr connection) { + ChannelId addChannel(std::shared_ptr connection) + { channels_.push_back(Channel(communicator_, connection)); return channels_.size() - 1; } - MemoryId addMemory(RegisteredMemory memory) { + MemoryId addMemory(RegisteredMemory memory) + { memories_.push_back(memory); return memories_.size() - 1; } - Channel channel(ChannelId id) { 
return channels_[id]; } - DeviceChannel deviceChannel(ChannelId id) { return DeviceChannel(id, channels_[id].epoch().deviceEpoch(), proxy_.fifo().deviceFifo()); } + Channel channel(ChannelId id) + { + return channels_[id]; + } + DeviceChannel deviceChannel(ChannelId id) + { + return DeviceChannel(id, channels_[id].epoch().deviceEpoch(), proxy_.fifo().deviceFifo()); + } - void startProxy() { proxy_.start(); } - void stopProxy() { proxy_.stop(); } + void startProxy() + { + proxy_.start(); + } + void stopProxy() + { + proxy_.stop(); + } private: Communicator& communicator_; @@ -205,7 +224,8 @@ private: void bindThread(); - ProxyHandlerResult handleTrigger(ProxyTrigger triggerRaw) { + ProxyHandlerResult handleTrigger(ProxyTrigger triggerRaw) + { ChannelTrigger* trigger = reinterpret_cast(&triggerRaw); Channel& channel = channels_[trigger->fields.chanId]; @@ -234,7 +254,9 @@ struct SimpleDeviceChannel { SimpleDeviceChannel() = default; - SimpleDeviceChannel(DeviceChannel devChan, MemoryId dst, MemoryId src) : devChan_(devChan), dst_(dst), src_(src) {} + SimpleDeviceChannel(DeviceChannel devChan, MemoryId dst, MemoryId src) : devChan_(devChan), dst_(dst), src_(src) + { + } SimpleDeviceChannel(const SimpleDeviceChannel& other) = default; diff --git a/src/include/checks.hpp b/src/include/checks.hpp index 6473c92f..b385d6d3 100644 --- a/src/include/checks.hpp +++ b/src/include/checks.hpp @@ -8,6 +8,8 @@ #define MSCCLPP_CHECKS_HPP_ #include "debug.h" +#include "errors.hpp" + #include #include @@ -15,7 +17,8 @@ do { \ mscclppResult_t res = call; \ if (res != mscclppSuccess && res != mscclppInProgress) { \ - throw std::runtime_error(std::string("Call to " #call " failed with error code ") + mscclppGetErrorString(res)); \ + throw mscclpp::Error(std::string("Call to " #call " failed with error code ") + mscclppGetErrorString(res), \ + res); \ } \ } while (false) @@ -23,7 +26,7 @@ do { \ cudaError_t err = cmd; \ if (err != cudaSuccess) { \ - throw 
std::runtime_error(std::string("Cuda failure '") + cudaGetErrorString(err) + "'"); \ + throw mscclpp::CudaError(std::string("Cuda failure '") + cudaGetErrorString(err) + "'", err); \ } \ } while (false) @@ -33,7 +36,7 @@ if (err != CUDA_SUCCESS) { \ const char* errStr; \ cuGetErrorString(err, &errStr); \ - throw std::runtime_error(std::string("Cu failure '") + std::string(errStr) + "'"); \ + throw mscclpp::CuError(std::string("Cu failure '") + std::string(errStr) + "'", err); \ } \ } while (false) diff --git a/src/include/connection.hpp b/src/include/connection.hpp index e06c426a..5f764b05 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -17,6 +17,7 @@ class ConnectionBase : public Connection, public Setuppable { int remoteRank_; int tag_; + public: ConnectionBase(int remoteRank, int tag); diff --git a/src/include/epoch.hpp b/src/include/epoch.hpp index ffd7464d..2566a273 100644 --- a/src/include/epoch.hpp +++ b/src/include/epoch.hpp @@ -17,7 +17,8 @@ struct DeviceEpoch __forceinline__ __device__ void wait() { (*expectedInboundEpochId_) += 1; - while (*(volatile uint64_t*)&(epochIds_->inboundReplica_) < (*expectedInboundEpochId_)); + while (*(volatile uint64_t*)&(epochIds_->inboundReplica_) < (*expectedInboundEpochId_)) + ; } __forceinline__ __device__ void epochIncrement() @@ -44,9 +45,12 @@ public: void signal(); - DeviceEpoch deviceEpoch() { return device_; } + DeviceEpoch deviceEpoch() + { + return device_; + } }; } // namespace mscclpp -#endif // MSCCLPP_EPOCH_HPP_ \ No newline at end of file +#endif // MSCCLPP_EPOCH_HPP_ diff --git a/src/include/errors.hpp b/src/include/errors.hpp new file mode 100644 index 00000000..5f58f766 --- /dev/null +++ b/src/include/errors.hpp @@ -0,0 +1,46 @@ +#ifndef MSCCLPP_ERRORS_HPP_ +#define MSCCLPP_ERRORS_HPP_ + +#include + +namespace mscclpp { +class BaseError : public std::runtime_error +{ +public: + BaseError(std::string message, int errorCode); + virtual ~BaseError() = default; + int 
getErrorCode() const; + +private: + int errorCode_; +}; + +class Error : public BaseError +{ +public: + Error(std::string message, int errorCode); + virtual ~Error() = default; +}; + +class CudaError : public BaseError +{ +public: + CudaError(std::string message, int errorCode); + virtual ~CudaError() = default; +}; + +class CuError : public BaseError +{ +public: + CuError(std::string message, int errorCode); + virtual ~CuError() = default; +}; + +class IbError : public BaseError +{ +public: + IbError(std::string message, int errorCode); + virtual ~IbError() = default; +}; +}; // namespace mscclpp +#endif // MSCCLPP_ERRORS_HPP diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index 47ca9437..a37195d3 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -7,10 +7,10 @@ #define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH) #include +#include #include #include #include -#include namespace mscclpp { @@ -37,14 +37,14 @@ public: { size_t size = data.size(); send((void*)&size, sizeof(size_t), peer, tag); - send((void*)data.data(), data.size(), peer, tag+1); + send((void*)data.data(), data.size(), peer, tag + 1); } void recv(std::vector& data, int peer, int tag) { size_t size; recv((void*)&size, sizeof(size_t), peer, tag); data.resize(size); - recv((void*)data.data(), data.size(), peer, tag+1); + recv((void*)data.data(), data.size(), peer, tag + 1); } }; @@ -239,7 +239,8 @@ class Connection; class RegisteredMemory { struct Impl; - // A shared_ptr is used since RegisteredMemory is functionally immutable, although internally some state is populated lazily. + // A shared_ptr is used since RegisteredMemory is functionally immutable, although internally some state is populated + // lazily. 
std::shared_ptr pimpl; public: @@ -281,17 +282,23 @@ protected: struct Setuppable { - virtual void beginSetup(std::shared_ptr) {} - virtual void endSetup(std::shared_ptr) {} + virtual void beginSetup(std::shared_ptr) + { + } + virtual void endSetup(std::shared_ptr) + { + } }; -template -class NonblockingFuture +template class NonblockingFuture { std::shared_future future; + public: NonblockingFuture() = default; - NonblockingFuture(std::shared_future&& future) : future(std::move(future)) {} + NonblockingFuture(std::shared_future&& future) : future(std::move(future)) + { + } NonblockingFuture(const NonblockingFuture&) = default; bool ready() const @@ -331,7 +338,7 @@ public: * Returns: a handle to the buffer */ RegisteredMemory registerMemory(void* ptr, size_t size, TransportFlags transports); - + void sendMemoryOnSetup(RegisteredMemory memory, int remoteRank, int tag); NonblockingFuture recvMemoryOnSetup(int remoteRank, int tag); @@ -363,7 +370,6 @@ public: private: std::unique_ptr pimpl; }; - } // namespace mscclpp namespace std { diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index bf4802ce..d1b7830d 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -2,6 +2,7 @@ #define MSCCLPP_REGISTERED_MEMORY_HPP_ #include "communicator.hpp" +#include "errors.hpp" #include "ib.hpp" #include "mscclpp.h" #include "mscclpp.hpp" @@ -16,11 +17,13 @@ struct TransportInfo // TODO: rewrite this using std::variant or something bool ibLocal; union { - struct { + struct + { cudaIpcMemHandle_t cudaIpcBaseHandle; size_t cudaIpcOffsetFromBase; }; - struct { + struct + { const IbMr* ibMr; IbMrInfo ibMrInfo; }; @@ -46,7 +49,7 @@ struct RegisteredMemory::Impl return entry; } } - throw std::runtime_error("Transport data not found"); + throw Error("Transport data not found", mscclppInternalError); } }; diff --git a/src/include/utils.hpp b/src/include/utils.hpp index 9abf9994..d1a1c7d8 100644 --- a/src/include/utils.hpp 
+++ b/src/include/utils.hpp @@ -8,45 +8,45 @@ namespace mscclpp { struct Timer { - std::chrono::steady_clock::time_point start; - - Timer() - { - start = std::chrono::steady_clock::now(); - } - - int64_t elapsed() - { - auto end = std::chrono::steady_clock::now(); - return std::chrono::duration_cast(end - start).count(); - } - - void reset() - { - start = std::chrono::steady_clock::now(); - } - - void print(const char* name) - { - auto end = std::chrono::steady_clock::now(); - auto elapsed = std::chrono::duration_cast(end - start).count(); - printf("%s: %ld us\n", name, elapsed); - } + std::chrono::steady_clock::time_point start; + + Timer() + { + start = std::chrono::steady_clock::now(); + } + + int64_t elapsed() + { + auto end = std::chrono::steady_clock::now(); + return std::chrono::duration_cast(end - start).count(); + } + + void reset() + { + start = std::chrono::steady_clock::now(); + } + + void print(const char* name) + { + auto end = std::chrono::steady_clock::now(); + auto elapsed = std::chrono::duration_cast(end - start).count(); + printf("%s: %ld us\n", name, elapsed); + } }; struct ScopedTimer { - Timer timer; - const char* name; - - ScopedTimer(const char* name) : name(name) - { - } - - ~ScopedTimer() - { - timer.print(name); - } + Timer timer; + const char* name; + + ScopedTimer(const char* name) : name(name) + { + } + + ~ScopedTimer() + { + timer.print(name); + } }; } // namespace mscclpp diff --git a/src/proxy_cpp.cc b/src/proxy_cpp.cc index b1626813..cd005e02 100644 --- a/src/proxy_cpp.cc +++ b/src/proxy_cpp.cc @@ -1,6 +1,6 @@ -#include "proxy.hpp" #include "api.h" #include "mscclpp.hpp" +#include "proxy.hpp" #include "utils.h" #include "utils.hpp" #include @@ -20,7 +20,8 @@ struct Proxy::Impl std::thread service; std::atomic_bool running; - Impl(ProxyHandler handler, std::function threadInit) : handler(handler), threadInit(threadInit), running(false) + Impl(ProxyHandler handler, std::function threadInit) + : handler(handler), 
threadInit(threadInit), running(false) { } }; @@ -45,7 +46,6 @@ MSCCLPP_API_CPP void Proxy::start() { pimpl->running = true; pimpl->service = std::thread([this] { - pimpl->threadInit(); ProxyHandler handler = this->pimpl->handler; @@ -109,4 +109,4 @@ MSCCLPP_API_CPP HostProxyFifo& Proxy::fifo() return pimpl->fifo; } -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/registered_memory.cc b/src/registered_memory.cc index fed732a0..3cb82fbf 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -88,7 +88,7 @@ MSCCLPP_API_CPP std::vector RegisteredMemory::serialize() std::copy_n(reinterpret_cast(&pimpl->hostHash), sizeof(pimpl->hostHash), std::back_inserter(result)); std::copy_n(reinterpret_cast(&pimpl->transports), sizeof(pimpl->transports), std::back_inserter(result)); if (pimpl->transportInfos.size() > std::numeric_limits::max()) { - throw std::runtime_error("Too many transport info entries"); + throw mscclpp::Error("Too many transport info entries", mscclppInternalError); } int8_t transportCount = pimpl->transportInfos.size(); std::copy_n(reinterpret_cast(&transportCount), sizeof(transportCount), std::back_inserter(result)); @@ -102,7 +102,7 @@ MSCCLPP_API_CPP std::vector RegisteredMemory::serialize() } else if (AllIBTransports.has(entry.transport)) { std::copy_n(reinterpret_cast(&entry.ibMrInfo), sizeof(entry.ibMrInfo), std::back_inserter(result)); } else { - throw std::runtime_error("Unknown transport"); + throw mscclpp::Error("Unknown transport", mscclppInternalError); } } return result; @@ -132,21 +132,23 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) std::copy_n(it, sizeof(transportInfo.transport), reinterpret_cast(&transportInfo.transport)); it += sizeof(transportInfo.transport); if (transportInfo.transport == Transport::CudaIpc) { - std::copy_n(it, sizeof(transportInfo.cudaIpcBaseHandle), reinterpret_cast(&transportInfo.cudaIpcBaseHandle)); + std::copy_n(it, 
sizeof(transportInfo.cudaIpcBaseHandle), + reinterpret_cast(&transportInfo.cudaIpcBaseHandle)); it += sizeof(transportInfo.cudaIpcBaseHandle); - std::copy_n(it, sizeof(transportInfo.cudaIpcOffsetFromBase), reinterpret_cast(&transportInfo.cudaIpcOffsetFromBase)); + std::copy_n(it, sizeof(transportInfo.cudaIpcOffsetFromBase), + reinterpret_cast(&transportInfo.cudaIpcOffsetFromBase)); it += sizeof(transportInfo.cudaIpcOffsetFromBase); } else if (AllIBTransports.has(transportInfo.transport)) { std::copy_n(it, sizeof(transportInfo.ibMrInfo), reinterpret_cast(&transportInfo.ibMrInfo)); it += sizeof(transportInfo.ibMrInfo); transportInfo.ibLocal = false; } else { - throw std::runtime_error("Unknown transport"); + throw mscclpp::Error("Unknown transport", mscclppInternalError); } this->transportInfos.push_back(transportInfo); } if (it != serialization.end()) { - throw std::runtime_error("Deserialization failed"); + throw mscclpp::Error("Serialization failed", mscclppInternalError); } if (transports.has(Transport::CudaIpc)) { diff --git a/src/utils.cc b/src/utils.cc index 6954a64f..d3957bb1 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -6,10 +6,10 @@ #include "utils.h" +#include #include #include #include -#include // Get current Compute Capability // int mscclppCudaCompCap() { diff --git a/tests/allgather_test_cpp.cu b/tests/allgather_test_cpp.cu index ad473f8f..ddfd51d8 100644 --- a/tests/allgather_test_cpp.cu +++ b/tests/allgather_test_cpp.cu @@ -1,5 +1,6 @@ #include "mscclpp.h" #include "mscclpp.hpp" + #include "channel.hpp" #ifdef MSCCLPP_USE_MPI_FOR_TESTS @@ -71,8 +72,8 @@ __device__ void allgather0(mscclpp::channel::SimpleDeviceChannel devChan, int ra devChan.wait(); } -__device__ void localAllGather(mscclpp::channel::SimpleDeviceChannel devChan, int rank, int world_size, int nranksPerNode, - int remoteRank, uint64_t offset, uint64_t size) +__device__ void localAllGather(mscclpp::channel::SimpleDeviceChannel devChan, int rank, int world_size, + int nranksPerNode, 
int remoteRank, uint64_t offset, uint64_t size) { // this allgather algorithm works as follows: // Step 1: GPU rank i sends data to GPU rank (i+1) % nranksPerNode @@ -131,7 +132,7 @@ __device__ void allgather2(mscclpp::channel::SimpleDeviceChannel devChan, int ra // opposite side if ((threadIdx.x % 32) == 0) devChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), - (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); if ((threadIdx.x % 32) == 0) devChan.wait(); } @@ -150,9 +151,8 @@ __device__ void allgather2(mscclpp::channel::SimpleDeviceChannel devChan, int ra if (remoteRank % nranksPerNode == rank % nranksPerNode) { // opposite side if ((threadIdx.x % 32) == 0) - devChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * - sizeof(int), - nelemsPerGPU / pipelineSize * sizeof(int)); + devChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), + nelemsPerGPU / pipelineSize * sizeof(int)); if ((threadIdx.x % 32) == 0) devChan.wait(); } @@ -226,7 +226,8 @@ void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSiz CUDACHECK(cudaMemcpy(*data_d, *data_h, dataSize, cudaMemcpyHostToDevice)); } -void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& comm, mscclpp::channel::DeviceChannelService& channelService, int* data_d, size_t dataSize) +void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& comm, + mscclpp::channel::DeviceChannelService& channelService, int* data_d, size_t dataSize) { int thisNode = rankToNode(rank); int cudaNum = rankToLocalRank(rank); @@ -258,12 +259,13 @@ void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& co std::vector devChannels; for (size_t i = 0; i < channelIds.size(); ++i) { 
devChannels.push_back(mscclpp::channel::SimpleDeviceChannel(channelService.deviceChannel(channelIds[i]), - channelService.addMemory(remoteMemories[i].get()), channelService.addMemory(localMemories[i]))); + channelService.addMemory(remoteMemories[i].get()), + channelService.addMemory(localMemories[i]))); } assert(devChannels.size() < sizeof(constDevChans) / sizeof(mscclpp::channel::SimpleDeviceChannel)); - CUDACHECK( - cudaMemcpyToSymbol(constDevChans, devChannels.data(), sizeof(mscclpp::channel::SimpleDeviceChannel) * devChannels.size())); + CUDACHECK(cudaMemcpyToSymbol(constDevChans, devChannels.data(), + sizeof(mscclpp::channel::SimpleDeviceChannel) * devChannels.size())); } void printUsage(const char* prog, bool isMpi) diff --git a/tests/communicator_test_cpp.cu b/tests/communicator_test_cpp.cu index 56c8592e..345ba1fc 100644 --- a/tests/communicator_test_cpp.cu +++ b/tests/communicator_test_cpp.cu @@ -1,5 +1,5 @@ -#include "mscclpp.hpp" #include "epoch.hpp" +#include "mscclpp.hpp" #include #include @@ -24,26 +24,33 @@ mscclpp::Transport findIb(int localRank) return IBs[localRank]; } -void register_all_memories(mscclpp::Communicator& communicator, int rank, int worldSize, void* devicePtr, size_t deviceBufferSize, mscclpp::Transport myIbDevice, mscclpp::RegisteredMemory& localMemory, std::unordered_map& remoteMemory){ +void register_all_memories(mscclpp::Communicator& communicator, int rank, int worldSize, void* devicePtr, + size_t deviceBufferSize, mscclpp::Transport myIbDevice, + mscclpp::RegisteredMemory& localMemory, + std::unordered_map& remoteMemory) +{ localMemory = communicator.registerMemory(devicePtr, deviceBufferSize, mscclpp::Transport::CudaIpc | myIbDevice); std::unordered_map> futureRemoteMemory; for (int i = 0; i < worldSize; i++) { - if (i != rank){ + if (i != rank) { communicator.sendMemoryOnSetup(localMemory, i, 0); futureRemoteMemory[i] = communicator.recvMemoryOnSetup(i, 0); } } communicator.setup(); for (int i = 0; i < worldSize; i++) { - if 
(i != rank){ + if (i != rank) { remoteMemory[i] = futureRemoteMemory[i].get(); } } } -void make_connections(mscclpp::Communicator& communicator, int rank, int worldSize, int nRanksPerNode, mscclpp::Transport myIbDevice, std::unordered_map>& connections){ +void make_connections(mscclpp::Communicator& communicator, int rank, int worldSize, int nRanksPerNode, + mscclpp::Transport myIbDevice, + std::unordered_map>& connections) +{ for (int i = 0; i < worldSize; i++) { - if (i != rank){ + if (i != rank) { if (i / nRanksPerNode == rank / nRanksPerNode) { connections[i] = communicator.connectOnSetup(i, 0, mscclpp::Transport::CudaIpc); } else { @@ -54,35 +61,40 @@ void make_connections(mscclpp::Communicator& communicator, int rank, int worldSi communicator.setup(); } -void write_remote(int rank, int worldSize, std::unordered_map>& connections, - std::unordered_map& remoteRegisteredMemories, mscclpp::RegisteredMemory& registeredMemory, int dataCountPerRank){ +void write_remote(int rank, int worldSize, std::unordered_map>& connections, + std::unordered_map& remoteRegisteredMemories, + mscclpp::RegisteredMemory& registeredMemory, int dataCountPerRank) +{ for (int i = 0; i < worldSize; i++) { if (i != rank) { auto& conn = connections.at(i); auto& peerMemory = remoteRegisteredMemories.at(i); - conn->write(peerMemory, rank * dataCountPerRank * sizeof(int), registeredMemory, rank * dataCountPerRank*sizeof(int), dataCountPerRank*sizeof(int)); + conn->write(peerMemory, rank * dataCountPerRank * sizeof(int), registeredMemory, + rank * dataCountPerRank * sizeof(int), dataCountPerRank * sizeof(int)); conn->flush(); } } } -void device_buffer_init(int rank, int worldSize, int dataCount, std::vector& devicePtr){ - for (int n = 0; n < (int)devicePtr.size(); n++){ +void device_buffer_init(int rank, int worldSize, int dataCount, std::vector& devicePtr) +{ + for (int n = 0; n < (int)devicePtr.size(); n++) { std::vector hostBuffer(dataCount, 0); for (int i = 0; i < dataCount; i++) { 
hostBuffer[i] = rank + n * worldSize; } - CUDATHROW(cudaMemcpy(devicePtr[n], hostBuffer.data(), dataCount*sizeof(int), cudaMemcpyHostToDevice)); + CUDATHROW(cudaMemcpy(devicePtr[n], hostBuffer.data(), dataCount * sizeof(int), cudaMemcpyHostToDevice)); } CUDATHROW(cudaDeviceSynchronize()); } -bool test_device_buffer_write_correctness(int worldSize, int dataCount, std::vector& devicePtr){ - for (int n = 0; n < (int)devicePtr.size(); n++){ +bool test_device_buffer_write_correctness(int worldSize, int dataCount, std::vector& devicePtr) +{ + for (int n = 0; n < (int)devicePtr.size(); n++) { std::vector hostBuffer(dataCount, 0); - CUDATHROW(cudaMemcpy(hostBuffer.data(), devicePtr[n], dataCount*sizeof(int), cudaMemcpyDeviceToHost)); + CUDATHROW(cudaMemcpy(hostBuffer.data(), devicePtr[n], dataCount * sizeof(int), cudaMemcpyDeviceToHost)); for (int i = 0; i < worldSize; i++) { - for (int j = i*dataCount/worldSize; j < (i+1)*dataCount/worldSize; j++) { + for (int j = i * dataCount / worldSize; j < (i + 1) * dataCount / worldSize; j++) { if (hostBuffer[j] != i + n * worldSize) { return false; } @@ -92,8 +104,11 @@ bool test_device_buffer_write_correctness(int worldSize, int dataCount, std::vec return true; } -void test_write(int rank, int worldSize, int deviceBufferSize, std::shared_ptr bootstrap, std::unordered_map>& connections, - std::vector>& remoteMemory, std::vector& localMemory, std::vector& devicePtr, int numBuffers){ +void test_write(int rank, int worldSize, int deviceBufferSize, std::shared_ptr bootstrap, + std::unordered_map>& connections, + std::vector>& remoteMemory, + std::vector& localMemory, std::vector& devicePtr, int numBuffers) +{ assert((deviceBufferSize / sizeof(int)) % worldSize == 0); size_t dataCount = deviceBufferSize / sizeof(int); @@ -102,8 +117,8 @@ void test_write(int rank, int worldSize, int deviceBufferSize, std::shared_ptrbarrier(); if (bootstrap->getRank() == 0) std::cout << "CUDA memory initialization passed" << std::endl; - - for (int n = 0; 
n < numBuffers; n++){ + + for (int n = 0; n < numBuffers; n++) { write_remote(rank, worldSize, connections, remoteMemory[n], localMemory[n], dataCount / worldSize); } bootstrap->barrier(); @@ -116,7 +131,7 @@ void test_write(int rank, int worldSize, int deviceBufferSize, std::shared_ptr bootstrap, std::unordered_map>& connections, - std::vector>& remoteMemory, std::vector& localMemory, std::vector& devicePtr, std::unordered_map> epochs, int numBuffers){ +void test_write_with_epochs(int rank, int worldSize, int deviceBufferSize, + std::shared_ptr bootstrap, + std::unordered_map>& connections, + std::vector>& remoteMemory, + std::vector& localMemory, std::vector& devicePtr, + std::unordered_map> epochs, int numBuffers) +{ assert((deviceBufferSize / sizeof(int)) % worldSize == 0); size_t dataCount = deviceBufferSize / sizeof(int); @@ -153,8 +175,8 @@ void test_write_with_epochs(int rank, int worldSize, int deviceBufferSize, std:: mscclpp::DeviceEpoch* deviceEpochs; CUDATHROW(cudaMalloc(&deviceEpochs, sizeof(mscclpp::DeviceEpoch) * worldSize)); - for (int i = 0; i < worldSize; i++){ - if (i != rank){ + for (int i = 0; i < worldSize; i++) { + if (i != rank) { mscclpp::DeviceEpoch deviceEpoch = epochs[i]->deviceEpoch(); CUDATHROW(cudaMemcpy(&deviceEpochs[i], &deviceEpoch, sizeof(mscclpp::DeviceEpoch), cudaMemcpyHostToDevice)); } @@ -165,16 +187,15 @@ void test_write_with_epochs(int rank, int worldSize, int deviceBufferSize, std:: if (bootstrap->getRank() == 0) std::cout << "CUDA device epochs are created" << std::endl; - - for (int n = 0; n < numBuffers; n++){ + for (int n = 0; n < numBuffers; n++) { write_remote(rank, worldSize, connections, remoteMemory[n], localMemory[n], dataCount / worldSize); } increament_epochs<<<1, worldSize>>>(deviceEpochs, rank, worldSize); CUDATHROW(cudaDeviceSynchronize()); - for (int i = 0; i < worldSize; i++){ - if (i != rank){ + for (int i = 0; i < worldSize; i++) { + if (i != rank) { epochs[i]->signal(); } } @@ -182,13 +203,14 @@ void 
test_write_with_epochs(int rank, int worldSize, int deviceBufferSize, std:: wait_epochs<<<1, worldSize>>>(deviceEpochs, rank, worldSize); CUDATHROW(cudaDeviceSynchronize()); - if (!test_device_buffer_write_correctness(worldSize, dataCount, devicePtr)){ + if (!test_device_buffer_write_correctness(worldSize, dataCount, devicePtr)) { throw std::runtime_error("unexpected result."); } bootstrap->barrier(); if (bootstrap->getRank() == 0) - std::cout << "--- Testing writes with singal for " << std::to_string(numBuffers) << " buffers passed ---" << std::endl; + std::cout << "--- Testing writes with singal for " << std::to_string(numBuffers) << " buffers passed ---" + << std::endl; } void test_communicator(int rank, int worldSize, int nranksPerNode) @@ -213,8 +235,8 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) int numBuffers = 10; std::vector devicePtr(numBuffers); - int deviceBufferSize = 1024*1024; - + int deviceBufferSize = 1024 * 1024; + std::vector localMemory(numBuffers); std::vector> remoteMemory(numBuffers); @@ -222,13 +244,15 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) if (n % 100 == 0) std::cout << "Registering memory for " << std::to_string(n) << " buffers" << std::endl; CUDATHROW(cudaMalloc(&devicePtr[n], deviceBufferSize)); - register_all_memories(communicator, rank, worldSize, devicePtr[n], deviceBufferSize, myIbDevice, localMemory[n], remoteMemory[n]); + register_all_memories(communicator, rank, worldSize, devicePtr[n], deviceBufferSize, myIbDevice, localMemory[n], + remoteMemory[n]); } bootstrap->barrier(); if (bootstrap->getRank() == 0) std::cout << "Memory registration for " << std::to_string(numBuffers) << " buffers passed" << std::endl; - test_write(rank, worldSize, deviceBufferSize, bootstrap, connections, remoteMemory, localMemory, devicePtr, numBuffers); + test_write(rank, worldSize, deviceBufferSize, bootstrap, connections, remoteMemory, localMemory, devicePtr, + numBuffers); if (bootstrap->getRank() 
== 0) std::cout << "--- Testing vanialla writes passed ---" << std::endl; @@ -242,12 +266,13 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) if (bootstrap->getRank() == 0) std::cout << "Epochs are created" << std::endl; - test_write_with_epochs(rank, worldSize, deviceBufferSize, bootstrap, connections, remoteMemory, localMemory, devicePtr, epochs, numBuffers); + test_write_with_epochs(rank, worldSize, deviceBufferSize, bootstrap, connections, remoteMemory, localMemory, + devicePtr, epochs, numBuffers); if (bootstrap->getRank() == 0) std::cout << "--- MSCCLPP::Communicator tests passed! ---" << std::endl; - for (int n = 0; n < numBuffers; n++){ + for (int n = 0; n < numBuffers; n++) { CUDATHROW(cudaFree(devicePtr[n])); } } @@ -269,4 +294,4 @@ int main(int argc, char** argv) MPI_Finalize(); return 0; -} \ No newline at end of file +} From 8b384600a92b47a89874d3a260e88756ce75a439 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 9 May 2023 22:17:43 +0000 Subject: [PATCH 118/135] host epoch works --- src/epoch.cc | 62 +++++++++++++--- src/include/channel.hpp | 12 ++-- src/include/epoch.hpp | 77 ++++++++++++-------- tests/communicator_test_cpp.cu | 127 ++++++++++++++++++++++++--------- 4 files changed, 200 insertions(+), 78 deletions(-) diff --git a/src/epoch.cc b/src/epoch.cc index 9263fd1c..36ed3186 100644 --- a/src/epoch.cc +++ b/src/epoch.cc @@ -5,22 +5,62 @@ namespace mscclpp { -MSCCLPP_API_CPP Epoch::Epoch(Communicator& communicator, std::shared_ptr connection) : connection_(connection) { - MSCCLPPTHROW(mscclppCudaCalloc(&device_.epochIds_, 1)); - MSCCLPPTHROW(mscclppCudaCalloc(&device_.expectedInboundEpochId_, 1)); +BaseEpoch::BaseEpoch(std::shared_ptr connection) : connection_(connection){} - localEpochIdsRegMem_ = communicator.registerMemory(device_.epochIds_, sizeof(device_.epochIds_), connection->transport()); - communicator.sendMemoryOnSetup(localEpochIdsRegMem_, connection->remoteRank(), connection->tag()); - remoteEpochIdsRegMem_ 
= communicator.recvMemoryOnSetup(connection->remoteRank(), connection->tag()); +void BaseEpoch::setup(Communicator& communicator) { + localEpochIdsRegMem_ = communicator.registerMemory(epochIds_, sizeof(epochIds_), connection_->transport()); + communicator.sendMemoryOnSetup(localEpochIdsRegMem_, connection_->remoteRank(), connection_->tag()); + remoteEpochIdsRegMem_ = communicator.recvMemoryOnSetup(connection_->remoteRank(), connection_->tag()); } -MSCCLPP_API_CPP Epoch::~Epoch() { - mscclppCudaFree(device_.epochIds_); - mscclppCudaFree(device_.expectedInboundEpochId_); +void BaseEpoch::signal() { + connection_->write(remoteEpochIdsRegMem_.get(), offsetof(EpochIds, inboundReplica), localEpochIdsRegMem_, offsetof(EpochIds, outbound), sizeof(epochIds_)); } -MSCCLPP_API_CPP void Epoch::signal() { - connection_->write(remoteEpochIdsRegMem_.get(), offsetof(EpochIds, inboundReplica_), localEpochIdsRegMem_, offsetof(EpochIds, outbound_), sizeof(device_.epochIds_)); +MSCCLPP_API_CPP DeviceEpoch::DeviceEpoch(Communicator& communicator, std::shared_ptr connection) : BaseEpoch(connection) { + MSCCLPPTHROW(mscclppCudaCalloc(&epochIds_, 1)); + MSCCLPPTHROW(mscclppCudaCalloc(&expectedInboundEpochId_, 1)); + setup(communicator); +} + +MSCCLPP_API_CPP DeviceEpoch::~DeviceEpoch() { + mscclppCudaFree(epochIds_); + mscclppCudaFree(expectedInboundEpochId_); +} + +MSCCLPP_API_CPP void DeviceEpoch::signal() { + BaseEpoch::signal(); +} + +MSCCLPP_API_CPP DeviceEpoch::DeviceHandle DeviceEpoch::deviceHandle() { + DeviceEpoch::DeviceHandle device; + device.epochIds = epochIds_; + device.expectedInboundEpochId = expectedInboundEpochId_; + return device; +} + +MSCCLPP_API_CPP HostEpoch::HostEpoch(Communicator& communicator, std::shared_ptr connection) : BaseEpoch(connection) { + if (connection->transport() == Transport::CudaIpc){ + throw std::runtime_error("HostEpoch cannot be used with CudaIpc transport"); + } + epochIds_ = new EpochIds(); + expectedInboundEpochId_ = new uint64_t(); + 
setup(communicator); +} + +MSCCLPP_API_CPP HostEpoch::~HostEpoch() { + delete epochIds_; + delete expectedInboundEpochId_; +} + +MSCCLPP_API_CPP void HostEpoch::increamentAndSignal() { + *(volatile uint64_t*)&(epochIds_->outbound) += 1; + signal(); +} + +MSCCLPP_API_CPP void HostEpoch::wait(){ + (*expectedInboundEpochId_) += 1; + while (*(volatile uint64_t*)&(epochIds_->inboundReplica) < (*expectedInboundEpochId_)); } } // namespace mscclpp diff --git a/src/include/channel.hpp b/src/include/channel.hpp index eb4bd9e7..37c66537 100644 --- a/src/include/channel.hpp +++ b/src/include/channel.hpp @@ -15,14 +15,14 @@ class Channel { public: Channel(Communicator& communicator, std::shared_ptr connection) - : connection_(connection), epoch_(std::make_shared(communicator, connection)) {}; + : connection_(connection), epoch_(std::make_shared(communicator, connection)) {}; Connection& connection() { return *connection_; } - Epoch& epoch() { return *epoch_; } + DeviceEpoch& epoch() { return *epoch_; } private: std::shared_ptr connection_; - std::shared_ptr epoch_; + std::shared_ptr epoch_; }; using ChannelId = uint32_t; @@ -86,7 +86,7 @@ struct DeviceChannel { DeviceChannel() = default; - DeviceChannel(ChannelId channelId, DeviceEpoch epoch, DeviceProxyFifo fifo) : channelId_(channelId), epoch_(epoch), fifo_(fifo) {} + DeviceChannel(ChannelId channelId, DeviceEpoch::DeviceHandle epoch, DeviceProxyFifo fifo) : channelId_(channelId), epoch_(epoch), fifo_(fifo) {} DeviceChannel(const DeviceChannel& other) = default; @@ -165,7 +165,7 @@ struct DeviceChannel ChannelId channelId_; - DeviceEpoch epoch_; + DeviceEpoch::DeviceHandle epoch_; // this is a concurrent fifo which is multiple threads from the device // can produce for and the sole proxy thread consumes it. 
@@ -191,7 +191,7 @@ public: } Channel channel(ChannelId id) { return channels_[id]; } - DeviceChannel deviceChannel(ChannelId id) { return DeviceChannel(id, channels_[id].epoch().deviceEpoch(), proxy_.fifo().deviceFifo()); } + DeviceChannel deviceChannel(ChannelId id) { return DeviceChannel(id, channels_[id].epoch().deviceHandle(), proxy_.fifo().deviceFifo()); } void startProxy() { proxy_.start(); } void stopProxy() { proxy_.stop(); } diff --git a/src/include/epoch.hpp b/src/include/epoch.hpp index ffd7464d..65813bdc 100644 --- a/src/include/epoch.hpp +++ b/src/include/epoch.hpp @@ -7,44 +7,65 @@ namespace mscclpp { struct alignas(16) EpochIds { - uint64_t outbound_; - uint64_t inboundReplica_; + uint64_t outbound; + uint64_t inboundReplica; }; -struct DeviceEpoch -{ -#ifdef __CUDACC__ - __forceinline__ __device__ void wait() - { - (*expectedInboundEpochId_) += 1; - while (*(volatile uint64_t*)&(epochIds_->inboundReplica_) < (*expectedInboundEpochId_)); - } - - __forceinline__ __device__ void epochIncrement() - { - *(volatile uint64_t*)&(epochIds_->outbound_) += 1; - } -#endif // __CUDACC__ - - EpochIds* epochIds_; - uint64_t* expectedInboundEpochId_; -}; - -class Epoch +class BaseEpoch { +private: std::shared_ptr connection_; - DeviceEpoch device_; RegisteredMemory localEpochIdsRegMem_; NonblockingFuture remoteEpochIdsRegMem_; - +protected: + EpochIds* epochIds_; + uint64_t* expectedInboundEpochId_; public: - Epoch(Communicator& communicator, std::shared_ptr connection); - Epoch(const Epoch&) = delete; - ~Epoch(); + BaseEpoch(std::shared_ptr connection); + void setup(Communicator& communicator); + BaseEpoch(const BaseEpoch&) = delete; + void signal(); +}; +class DeviceEpoch : BaseEpoch +{ +public: + DeviceEpoch(Communicator& communicator, std::shared_ptr connection); + DeviceEpoch(const DeviceEpoch&) = delete; + ~DeviceEpoch(); void signal(); - DeviceEpoch deviceEpoch() { return device_; } + struct DeviceHandle + { + #ifdef __CUDACC__ + __forceinline__ __device__ 
void wait() + { + (*expectedInboundEpochId) += 1; + while (*(volatile uint64_t*)&(epochIds->inboundReplica) < (*expectedInboundEpochId)); + } + + __forceinline__ __device__ void epochIncrement() + { + *(volatile uint64_t*)&(epochIds->outbound) += 1; + } + #endif // __CUDACC__ + + EpochIds* epochIds; + uint64_t* expectedInboundEpochId; + }; + + DeviceHandle deviceHandle(); +}; + +class HostEpoch : BaseEpoch +{ +public: + HostEpoch(Communicator& communicator, std::shared_ptr connection); + HostEpoch(const HostEpoch&) = delete; + ~HostEpoch(); + + void increamentAndSignal(); + void wait(); }; } // namespace mscclpp diff --git a/tests/communicator_test_cpp.cu b/tests/communicator_test_cpp.cu index 56c8592e..2bf14256 100644 --- a/tests/communicator_test_cpp.cu +++ b/tests/communicator_test_cpp.cu @@ -77,11 +77,14 @@ void device_buffer_init(int rank, int worldSize, int dataCount, std::vector& devicePtr){ +bool test_device_buffer_write_correctness(int rank, int worldSize, int nRanksPerNode, int dataCount, std::vector& devicePtr, bool skipLocal = false){ for (int n = 0; n < (int)devicePtr.size(); n++){ std::vector hostBuffer(dataCount, 0); CUDATHROW(cudaMemcpy(hostBuffer.data(), devicePtr[n], dataCount*sizeof(int), cudaMemcpyDeviceToHost)); for (int i = 0; i < worldSize; i++) { + if (i / nRanksPerNode == rank / nRanksPerNode && skipLocal) { + continue; + } for (int j = i*dataCount/worldSize; j < (i+1)*dataCount/worldSize; j++) { if (hostBuffer[j] != i + n * worldSize) { return false; @@ -92,7 +95,7 @@ bool test_device_buffer_write_correctness(int worldSize, int dataCount, std::vec return true; } -void test_write(int rank, int worldSize, int deviceBufferSize, std::shared_ptr bootstrap, std::unordered_map>& connections, +void test_write(int rank, int worldSize, int nRanksPerNode, int deviceBufferSize, std::shared_ptr bootstrap, std::unordered_map>& connections, std::vector>& remoteMemory, std::vector& localMemory, std::vector& devicePtr, int numBuffers){ 
assert((deviceBufferSize / sizeof(int)) % worldSize == 0); @@ -114,7 +117,7 @@ void test_write(int rank, int worldSize, int deviceBufferSize, std::shared_ptrbarrier(); if (bootstrap->getRank() == 0) std::cout << "Polling for " << std::to_string(numBuffers) << " buffers passed" << std::endl; + + if (bootstrap->getRank() == 0) + std::cout << "--- Testing vanialla writes passed ---" << std::endl; } -__global__ void increament_epochs(mscclpp::DeviceEpoch* deviceEpochs, int rank, int worldSize){ +__global__ void increament_epochs(mscclpp::DeviceEpoch::DeviceHandle* deviceEpochs, int rank, int worldSize){ int tid = threadIdx.x; if (tid != rank && tid < worldSize){ deviceEpochs[tid].epochIncrement(); } } -__global__ void wait_epochs(mscclpp::DeviceEpoch* deviceEpochs, int rank, int worldSize){ +__global__ void wait_epochs(mscclpp::DeviceEpoch::DeviceHandle* deviceEpochs, int rank, int worldSize){ int tid = threadIdx.x; if (tid != rank && tid < worldSize){ deviceEpochs[tid].wait(); } } -void test_write_with_epochs(int rank, int worldSize, int deviceBufferSize, std::shared_ptr bootstrap, std::unordered_map>& connections, - std::vector>& remoteMemory, std::vector& localMemory, std::vector& devicePtr, std::unordered_map> epochs, int numBuffers){ +void test_write_with_device_epochs(int rank, int worldSize, int nRanksPerNode, int deviceBufferSize, mscclpp::Communicator& communicator, std::shared_ptr bootstrap, std::unordered_map>& connections, + std::vector>& remoteMemory, std::vector& localMemory, std::vector& devicePtr, int numBuffers){ + + std::unordered_map> epochs; + for (auto entry : connections) { + auto& conn = entry.second; + epochs.insert({entry.first, std::make_shared(communicator, conn)}); + } + communicator.setup(); + bootstrap->barrier(); + if (bootstrap->getRank() == 0) + std::cout << "Epochs are created" << std::endl; assert((deviceBufferSize / sizeof(int)) % worldSize == 0); size_t dataCount = deviceBufferSize / sizeof(int); @@ -151,12 +167,12 @@ void 
test_write_with_epochs(int rank, int worldSize, int deviceBufferSize, std:: if (bootstrap->getRank() == 0) std::cout << "CUDA memory initialization passed" << std::endl; - mscclpp::DeviceEpoch* deviceEpochs; - CUDATHROW(cudaMalloc(&deviceEpochs, sizeof(mscclpp::DeviceEpoch) * worldSize)); + mscclpp::DeviceEpoch::DeviceHandle* deviceEpochHandles; + CUDATHROW(cudaMalloc(&deviceEpochHandles, sizeof(mscclpp::DeviceEpoch::DeviceHandle) * worldSize)); for (int i = 0; i < worldSize; i++){ if (i != rank){ - mscclpp::DeviceEpoch deviceEpoch = epochs[i]->deviceEpoch(); - CUDATHROW(cudaMemcpy(&deviceEpochs[i], &deviceEpoch, sizeof(mscclpp::DeviceEpoch), cudaMemcpyHostToDevice)); + mscclpp::DeviceEpoch::DeviceHandle deviceHandle = epochs[i]->deviceHandle(); + CUDATHROW(cudaMemcpy(&deviceEpochHandles[i], &deviceHandle, sizeof(mscclpp::DeviceEpoch::DeviceHandle), cudaMemcpyHostToDevice)); } } CUDATHROW(cudaDeviceSynchronize()); @@ -170,7 +186,7 @@ void test_write_with_epochs(int rank, int worldSize, int deviceBufferSize, std:: write_remote(rank, worldSize, connections, remoteMemory[n], localMemory[n], dataCount / worldSize); } - increament_epochs<<<1, worldSize>>>(deviceEpochs, rank, worldSize); + increament_epochs<<<1, worldSize>>>(deviceEpochHandles, rank, worldSize); CUDATHROW(cudaDeviceSynchronize()); for (int i = 0; i < worldSize; i++){ @@ -179,19 +195,74 @@ void test_write_with_epochs(int rank, int worldSize, int deviceBufferSize, std:: } } - wait_epochs<<<1, worldSize>>>(deviceEpochs, rank, worldSize); + wait_epochs<<<1, worldSize>>>(deviceEpochHandles, rank, worldSize); CUDATHROW(cudaDeviceSynchronize()); - if (!test_device_buffer_write_correctness(worldSize, dataCount, devicePtr)){ + if (!test_device_buffer_write_correctness(rank, worldSize, nRanksPerNode, dataCount, devicePtr)){ throw std::runtime_error("unexpected result."); } bootstrap->barrier(); if (bootstrap->getRank() == 0) - std::cout << "--- Testing writes with singal for " << std::to_string(numBuffers) << " 
buffers passed ---" << std::endl; + std::cout << "--- Testing writes with device epochs for " << std::to_string(numBuffers) << " buffers passed ---" << std::endl; } -void test_communicator(int rank, int worldSize, int nranksPerNode) +void test_write_with_host_epochs(int rank, int worldSize, int nRanksPerNode, int deviceBufferSize, mscclpp::Communicator& communicator, std::shared_ptr bootstrap, std::unordered_map>& connections, + std::vector>& remoteMemory, std::vector& localMemory, std::vector& devicePtr, int numBuffers){ + + std::unordered_map> epochs; + for (auto entry : connections) { + auto& conn = entry.second; + if (conn->transport() == mscclpp::Transport::CudaIpc) + continue; + epochs.insert({entry.first, std::make_shared(communicator, conn)}); + } + communicator.setup(); + bootstrap->barrier(); + if (bootstrap->getRank() == 0) + std::cout << "Epochs are created" << std::endl; + + assert((deviceBufferSize / sizeof(int)) % worldSize == 0); + size_t dataCount = deviceBufferSize / sizeof(int); + + device_buffer_init(rank, worldSize, dataCount, devicePtr); + bootstrap->barrier(); + if (bootstrap->getRank() == 0) + std::cout << "CUDA memory initialization passed" << std::endl; + + bootstrap->barrier(); + if (bootstrap->getRank() == 0) + std::cout << "Host epochs are created" << std::endl; + + + for (int n = 0; n < numBuffers; n++){ + write_remote(rank, worldSize, connections, remoteMemory[n], localMemory[n], dataCount / worldSize); + } + + for (int i = 0; i < worldSize; i++){ + if (i != rank && connections[i]->transport() != mscclpp::Transport::CudaIpc){ + epochs[i]->increamentAndSignal(); + } + } + + + for (int i = 0; i < worldSize; i++){ + if (i != rank && connections[i]->transport() != mscclpp::Transport::CudaIpc){ + epochs[i]->wait(); + } + } + + if (!test_device_buffer_write_correctness(rank, worldSize, nRanksPerNode, dataCount, devicePtr, true)){ + throw std::runtime_error("unexpected result."); + } + + bootstrap->barrier(); + if (bootstrap->getRank() == 0) 
+ std::cout << "--- Testing writes with host epochs for " << std::to_string(numBuffers) << " buffers passed ---" << std::endl; +} + + +void test_communicator(int rank, int worldSize, int nRanksPerNode) { auto bootstrap = std::make_shared(rank, worldSize); mscclpp::UniqueId id; @@ -205,9 +276,9 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) std::cout << "Communicator initialization passed" << std::endl; std::unordered_map> connections; - auto myIbDevice = findIb(rank % nranksPerNode); + auto myIbDevice = findIb(rank % nRanksPerNode); - make_connections(communicator, rank, worldSize, nranksPerNode, myIbDevice, connections); + make_connections(communicator, rank, worldSize, nRanksPerNode, myIbDevice, connections); if (bootstrap->getRank() == 0) std::cout << "Connection setup passed" << std::endl; @@ -228,21 +299,11 @@ void test_communicator(int rank, int worldSize, int nranksPerNode) if (bootstrap->getRank() == 0) std::cout << "Memory registration for " << std::to_string(numBuffers) << " buffers passed" << std::endl; - test_write(rank, worldSize, deviceBufferSize, bootstrap, connections, remoteMemory, localMemory, devicePtr, numBuffers); - if (bootstrap->getRank() == 0) - std::cout << "--- Testing vanialla writes passed ---" << std::endl; + test_write(rank, worldSize, nRanksPerNode, deviceBufferSize, bootstrap, connections, remoteMemory, localMemory, devicePtr, numBuffers); - std::unordered_map> epochs; - for (auto entry : connections) { - auto& conn = entry.second; - epochs.insert({entry.first, std::make_shared(communicator, conn)}); - } - communicator.setup(); - bootstrap->barrier(); - if (bootstrap->getRank() == 0) - std::cout << "Epochs are created" << std::endl; + test_write_with_device_epochs(rank, worldSize, nRanksPerNode, deviceBufferSize, communicator, bootstrap, connections, remoteMemory, localMemory, devicePtr, numBuffers); - test_write_with_epochs(rank, worldSize, deviceBufferSize, bootstrap, connections, remoteMemory, localMemory, 
devicePtr, epochs, numBuffers); + test_write_with_host_epochs(rank, worldSize, nRanksPerNode, deviceBufferSize, communicator, bootstrap, connections, remoteMemory, localMemory, devicePtr, numBuffers); if (bootstrap->getRank() == 0) std::cout << "--- MSCCLPP::Communicator tests passed! ---" << std::endl; @@ -262,10 +323,10 @@ int main(int argc, char** argv) MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm); int shmWorldSize; MPI_Comm_size(shmcomm, &shmWorldSize); - int nranksPerNode = shmWorldSize; + int nRanksPerNode = shmWorldSize; MPI_Comm_free(&shmcomm); - test_communicator(rank, worldSize, nranksPerNode); + test_communicator(rank, worldSize, nRanksPerNode); MPI_Finalize(); return 0; From 75a2af8de2a3c24e90f71967dd9e0bfafa1eed19 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 10 May 2023 18:46:55 +0000 Subject: [PATCH 119/135] Add GoogleTest with CTest integration + some tests Also rename addSetup to onSetup to unify naming. --- CMakeLists.txt | 6 ++++ src/communicator.cc | 8 +++--- src/include/mscclpp.hpp | 2 +- tests/CMakeLists.txt | 9 ++++-- tests/{unittests => }/ib_test.cc | 0 tests/unit/CMakeLists.txt | 3 ++ tests/unit/core_tests.cc | 49 ++++++++++++++++++++++++++++++++ tests/unittests/CMakeLists.txt | 2 -- 8 files changed, 70 insertions(+), 9 deletions(-) rename tests/{unittests => }/ib_test.cc (100%) create mode 100644 tests/unit/CMakeLists.txt create mode 100644 tests/unit/core_tests.cc delete mode 100644 tests/unittests/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index ad71de86..2597313f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,12 @@ if(ALLOW_GDRCOPY) find_package(GDRCopy) endif() +include(CTest) +include(FetchContent) +FetchContent_Declare(googletest URL https://github.com/google/googletest/archive/b796f7d44681514f58a683a3a71ff17c94edb0c1.zip) +FetchContent_MakeAvailable(googletest) +include(GoogleTest) + set(CLANG_FORMAT_SOURCE_DIRS src tests) 
include(${PROJECT_SOURCE_DIR}/cmake/AddClangFormatTargets.cmake) diff --git a/src/communicator.cc b/src/communicator.cc index 603d053d..c4abf818 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -75,7 +75,7 @@ struct MemorySender : public Setuppable MSCCLPP_API_CPP void Communicator::sendMemoryOnSetup(RegisteredMemory memory, int remoteRank, int tag) { - addSetup(std::make_shared(memory, remoteRank, tag)); + onSetup(std::make_shared(memory, remoteRank, tag)); } struct MemoryReceiver : public Setuppable @@ -99,7 +99,7 @@ struct MemoryReceiver : public Setuppable MSCCLPP_API_CPP NonblockingFuture Communicator::recvMemoryOnSetup(int remoteRank, int tag) { auto memoryReceiver = std::make_shared(remoteRank, tag); - addSetup(memoryReceiver); + onSetup(memoryReceiver); return NonblockingFuture(memoryReceiver->memoryPromise_.get_future()); } @@ -131,11 +131,11 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connectOnSetup(int rem throw mscclpp::Error("Unsupported transport", ErrorCode::InternalError); } pimpl->connections_.push_back(conn); - addSetup(conn); + onSetup(conn); return conn; } -MSCCLPP_API_CPP void Communicator::addSetup(std::shared_ptr setuppable) +MSCCLPP_API_CPP void Communicator::onSetup(std::shared_ptr setuppable) { pimpl->toSetup_.push_back(setuppable); } diff --git a/src/include/mscclpp.hpp b/src/include/mscclpp.hpp index a242d2bc..7ca8503b 100644 --- a/src/include/mscclpp.hpp +++ b/src/include/mscclpp.hpp @@ -361,7 +361,7 @@ public: std::shared_ptr connectOnSetup(int remoteRank, int tag, Transport transport); /* Add a custom Setuppable object to a list of objects to be setup later, when setup() is called. */ - void addSetup(std::shared_ptr setuppable); + void onSetup(std::shared_ptr setuppable); /* Setup all objects that have registered for setup. This includes any connections created by connect(). 
*/ void setup(); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 17875d69..3eec4226 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,6 +1,6 @@ function(add_test_executable name sources) add_executable(${name} ${sources}) - target_link_libraries(${name} mscclpp) + target_link_libraries(${name} mscclpp CUDA::cudart CUDA::cuda_driver) if(USE_MPI_FOR_TESTS) target_link_libraries(${name} MPI::MPI_CXX) target_compile_definitions(${name} PRIVATE MSCCLPP_USE_MPI_FOR_TESTS) @@ -10,5 +10,10 @@ endfunction() add_test_executable(bootstrap_test_cpp bootstrap_test_cpp.cc) add_test_executable(communicator_test_cpp communicator_test_cpp.cu) add_test_executable(allgather_test_cpp allgather_test_cpp.cu) +add_test_executable(ib_test ib_test.cc) -add_subdirectory(unittests) +# Unit tests +add_executable(unit_tests) +target_link_libraries(unit_tests GTest::gtest_main GTest::gmock_main mscclpp) +add_subdirectory(unit) # This adds the sources to the mscclpp target +gtest_discover_tests(unit_tests DISCOVERY_MODE PRE_TEST) diff --git a/tests/unittests/ib_test.cc b/tests/ib_test.cc similarity index 100% rename from tests/unittests/ib_test.cc rename to tests/ib_test.cc diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt new file mode 100644 index 00000000..a5b2f4b9 --- /dev/null +++ b/tests/unit/CMakeLists.txt @@ -0,0 +1,3 @@ +target_sources(unit_tests PRIVATE + core_tests.cc +) diff --git a/tests/unit/core_tests.cc b/tests/unit/core_tests.cc new file mode 100644 index 00000000..df4c165e --- /dev/null +++ b/tests/unit/core_tests.cc @@ -0,0 +1,49 @@ +#include +#include +#include "mscclpp.hpp" + +class LocalCommunicatorTest : public ::testing::Test { + protected: + void SetUp() override { + bootstrap = std::make_shared(0, 1); + comm = std::make_shared(bootstrap); + } + + std::shared_ptr bootstrap; + std::shared_ptr comm; +}; + +class MockSetuppable : public mscclpp::Setuppable { + public: + MOCK_METHOD(void, beginSetup, (std::shared_ptr 
bootstrap), (override)); + MOCK_METHOD(void, endSetup, (std::shared_ptr bootstrap), (override)); +}; + +TEST_F(LocalCommunicatorTest, OnSetup) { + auto mockSetuppable = std::make_shared(); + comm->onSetup(mockSetuppable); + EXPECT_CALL(*mockSetuppable, beginSetup(std::dynamic_pointer_cast(bootstrap))); + EXPECT_CALL(*mockSetuppable, endSetup(std::dynamic_pointer_cast(bootstrap))); + comm->setup(); +} + +TEST_F(LocalCommunicatorTest, RegisterMemory) { + int dummy[42]; + auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports); + EXPECT_EQ(memory.data(), &dummy); + EXPECT_EQ(memory.size(), sizeof(dummy)); + EXPECT_EQ(memory.rank(), 0); + EXPECT_EQ(memory.transports(), mscclpp::NoTransports); +} + +TEST_F(LocalCommunicatorTest, SendMemoryToSelf) { + int dummy[42]; + auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports); + comm->sendMemoryOnSetup(memory, 0, 0); + auto memoryFuture = comm->recvMemoryOnSetup(0, 0); + comm->setup(); + auto sameMemory = memoryFuture.get(); + EXPECT_EQ(sameMemory.size(), memory.size()); + EXPECT_EQ(sameMemory.rank(), memory.rank()); + EXPECT_EQ(sameMemory.transports(), memory.transports()); +} diff --git a/tests/unittests/CMakeLists.txt b/tests/unittests/CMakeLists.txt deleted file mode 100644 index 44e405ad..00000000 --- a/tests/unittests/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_test_executable(ib_test ib_test.cc) -target_link_libraries(ib_test CUDA::cudart) \ No newline at end of file From f4ecae7c96c9a7dce65a7ba4158c0d7ca98b3a07 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 10 May 2023 18:49:02 +0000 Subject: [PATCH 120/135] Rename tests/ to test/ --- CMakeLists.txt | 2 +- {tests => test}/CMakeLists.txt | 0 {tests => test}/allgather_test.cu | 0 {tests => test}/allgather_test_cpp.cu | 0 {tests => test}/allgather_test_standalone.cu | 0 {tests => test}/allreduce_allpairs_test.cu | 0 {tests => test}/bootstrap_test.cc | 0 {tests => test}/bootstrap_test_cpp.cc | 0 {tests => 
test}/common.cu | 0 {tests => test}/common.h | 0 {tests => test}/communicator_test_cpp.cu | 0 {tests => test}/ib_test.cc | 0 {tests => test}/p2p_test.cu | 0 {tests => test}/unit/CMakeLists.txt | 0 {tests => test}/unit/core_tests.cc | 0 15 files changed, 1 insertion(+), 1 deletion(-) rename {tests => test}/CMakeLists.txt (100%) rename {tests => test}/allgather_test.cu (100%) rename {tests => test}/allgather_test_cpp.cu (100%) rename {tests => test}/allgather_test_standalone.cu (100%) rename {tests => test}/allreduce_allpairs_test.cu (100%) rename {tests => test}/bootstrap_test.cc (100%) rename {tests => test}/bootstrap_test_cpp.cc (100%) rename {tests => test}/common.cu (100%) rename {tests => test}/common.h (100%) rename {tests => test}/communicator_test_cpp.cu (100%) rename {tests => test}/ib_test.cc (100%) rename {tests => test}/p2p_test.cu (100%) rename {tests => test}/unit/CMakeLists.txt (100%) rename {tests => test}/unit/core_tests.cc (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2597313f..270a1ed8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,4 +45,4 @@ if(ALLOW_GDRCOPY AND GDRCOPY_FOUND) endif() add_subdirectory(src) # This adds the sources to the mscclpp target -add_subdirectory(tests) +add_subdirectory(test) diff --git a/tests/CMakeLists.txt b/test/CMakeLists.txt similarity index 100% rename from tests/CMakeLists.txt rename to test/CMakeLists.txt diff --git a/tests/allgather_test.cu b/test/allgather_test.cu similarity index 100% rename from tests/allgather_test.cu rename to test/allgather_test.cu diff --git a/tests/allgather_test_cpp.cu b/test/allgather_test_cpp.cu similarity index 100% rename from tests/allgather_test_cpp.cu rename to test/allgather_test_cpp.cu diff --git a/tests/allgather_test_standalone.cu b/test/allgather_test_standalone.cu similarity index 100% rename from tests/allgather_test_standalone.cu rename to test/allgather_test_standalone.cu diff --git a/tests/allreduce_allpairs_test.cu b/test/allreduce_allpairs_test.cu 
similarity index 100% rename from tests/allreduce_allpairs_test.cu rename to test/allreduce_allpairs_test.cu diff --git a/tests/bootstrap_test.cc b/test/bootstrap_test.cc similarity index 100% rename from tests/bootstrap_test.cc rename to test/bootstrap_test.cc diff --git a/tests/bootstrap_test_cpp.cc b/test/bootstrap_test_cpp.cc similarity index 100% rename from tests/bootstrap_test_cpp.cc rename to test/bootstrap_test_cpp.cc diff --git a/tests/common.cu b/test/common.cu similarity index 100% rename from tests/common.cu rename to test/common.cu diff --git a/tests/common.h b/test/common.h similarity index 100% rename from tests/common.h rename to test/common.h diff --git a/tests/communicator_test_cpp.cu b/test/communicator_test_cpp.cu similarity index 100% rename from tests/communicator_test_cpp.cu rename to test/communicator_test_cpp.cu diff --git a/tests/ib_test.cc b/test/ib_test.cc similarity index 100% rename from tests/ib_test.cc rename to test/ib_test.cc diff --git a/tests/p2p_test.cu b/test/p2p_test.cu similarity index 100% rename from tests/p2p_test.cu rename to test/p2p_test.cu diff --git a/tests/unit/CMakeLists.txt b/test/unit/CMakeLists.txt similarity index 100% rename from tests/unit/CMakeLists.txt rename to test/unit/CMakeLists.txt diff --git a/tests/unit/core_tests.cc b/test/unit/core_tests.cc similarity index 100% rename from tests/unit/core_tests.cc rename to test/unit/core_tests.cc From 33eb4093ac8a99f4079cfa370543c2ab60f75ec6 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 10 May 2023 20:24:33 +0000 Subject: [PATCH 121/135] timeout fix --- src/connection.cc | 2 +- src/include/connection.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/connection.cc b/src/connection.cc index 9666e4c1..d6cf3284 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -137,7 +137,7 @@ void IBConnection::flush() auto elapsed = timer.elapsed(); if (elapsed > MSCCLPP_POLLING_WAIT) { - throw Error("pollCq is stuck: waited for " + 
std::to_string(elapsed) + " seconds. Expected " + + throw Error("pollCq is stuck: waited for " + std::to_string(elapsed/1e6) + " seconds. Expected " + std::to_string(numSignaledSends) + " signals", ErrorCode::InternalError); } diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 5f764b05..4b2b5907 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -2,7 +2,7 @@ #define MSCCLPP_CONNECTION_HPP_ // TODO(saemal): make this configurable -#define MSCCLPP_POLLING_WAIT 10000 // in microseconds +#define MSCCLPP_POLLING_WAIT 3e7 // in microseconds #include "communicator.hpp" #include "ib.hpp" From beaf2aea39582568979bd945f212f7fd863ed705 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 10 May 2023 20:46:49 +0000 Subject: [PATCH 122/135] Move public headers under include/ --- CMakeLists.txt | 4 ++-- {src/include => include/mscclpp}/channel.hpp | 11 +++++------ src/include/mscclpp.hpp => include/mscclpp/core.hpp | 8 ++++---- {src/include => include/mscclpp}/epoch.hpp | 2 +- {src/include => include/mscclpp}/errors.hpp | 4 +++- .../mscclppfifo.hpp => include/mscclpp/fifo.hpp | 6 +++--- {src/include => include/mscclpp}/proxy.hpp | 2 +- src/bootstrap/bootstrap.cc | 2 +- src/channel.cc | 2 +- src/communicator.cc | 2 +- src/epoch.cc | 2 +- src/errors.cc | 2 +- src/fifo.cc | 2 +- src/ib.cc | 2 +- src/include/basic_proxy_handler.hpp | 2 +- src/include/checks.hpp | 2 +- src/include/communicator.hpp | 4 ++-- src/include/connection.hpp | 2 +- src/include/registered_memory.hpp | 4 ++-- src/proxy_cpp.cc | 4 ++-- test/CMakeLists.txt | 1 + test/allgather_test_cpp.cu | 5 ++--- test/bootstrap_test_cpp.cc | 2 +- test/communicator_test_cpp.cu | 4 ++-- test/ib_test.cc | 2 +- test/unit/core_tests.cc | 2 +- 26 files changed, 43 insertions(+), 42 deletions(-) rename {src/include => include/mscclpp}/channel.hpp (98%) rename src/include/mscclpp.hpp => include/mscclpp/core.hpp (98%) rename {src/include => include/mscclpp}/epoch.hpp (98%) rename 
{src/include => include/mscclpp}/errors.hpp (96%) rename src/include/mscclppfifo.hpp => include/mscclpp/fifo.hpp (97%) rename {src/include => include/mscclpp}/proxy.hpp (95%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 270a1ed8..5470cd32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,11 +26,11 @@ FetchContent_Declare(googletest URL https://github.com/google/googletest/archive FetchContent_MakeAvailable(googletest) include(GoogleTest) -set(CLANG_FORMAT_SOURCE_DIRS src tests) +set(CLANG_FORMAT_SOURCE_DIRS include src tests) include(${PROJECT_SOURCE_DIR}/cmake/AddClangFormatTargets.cmake) add_library(mscclpp SHARED) -target_include_directories(mscclpp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/src/include) +target_include_directories(mscclpp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/include) set_target_properties(mscclpp PROPERTIES LINKER_LANGUAGE CXX) target_link_libraries(mscclpp PRIVATE MSCCLPP::ibverbs MSCCLPP::numa CUDA::cudart CUDA::cuda_driver) if(ENABLE_TRACE) diff --git a/src/include/channel.hpp b/include/mscclpp/channel.hpp similarity index 98% rename from src/include/channel.hpp rename to include/mscclpp/channel.hpp index 05582726..9aa50902 100644 --- a/src/include/channel.hpp +++ b/include/mscclpp/channel.hpp @@ -1,11 +1,10 @@ #ifndef MSCCLPP_CHANNEL_HPP_ #define MSCCLPP_CHANNEL_HPP_ -#include "epoch.hpp" -#include "mscclpp.hpp" -#include "mscclppfifo.hpp" -#include "proxy.hpp" -#include "utils.hpp" +#include +#include +#include +#include namespace mscclpp { namespace channel { @@ -148,7 +147,7 @@ struct DeviceChannel __forceinline__ __device__ void flush() { - uint64_t curFifoHead = fifo_.push(ChannelTrigger(mscclppSync, 0, 0, 0, 0, 1, channelId_).value); + uint64_t curFifoHead = fifo_.push(ChannelTrigger(TriggerSync, 0, 0, 0, 0, 1, channelId_).value); // we need to wait for two conditions to be met to ensure the CPU is done flushing. 
(1) wait for the tail // to go pass by curFifoHead (this is safety net) and (2) wait for the work element value to change to 0. while (*(volatile uint64_t*)&fifo_.triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && diff --git a/src/include/mscclpp.hpp b/include/mscclpp/core.hpp similarity index 98% rename from src/include/mscclpp.hpp rename to include/mscclpp/core.hpp index 7ca8503b..aeb692e6 100644 --- a/src/include/mscclpp.hpp +++ b/include/mscclpp/core.hpp @@ -1,12 +1,12 @@ -#ifndef MSCCLPP_HPP_ -#define MSCCLPP_HPP_ +#ifndef MSCCLPP_CORE_HPP_ +#define MSCCLPP_CORE_HPP_ #define MSCCLPP_MAJOR 0 #define MSCCLPP_MINOR 1 #define MSCCLPP_PATCH 0 #define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH) -#include "errors.hpp" +#include #include #include #include @@ -383,4 +383,4 @@ template <> struct hash }; } // namespace std -#endif // MSCCLPP_H_ +#endif // MSCCLPP_CORE_HPP_ diff --git a/src/include/epoch.hpp b/include/mscclpp/epoch.hpp similarity index 98% rename from src/include/epoch.hpp rename to include/mscclpp/epoch.hpp index daba1ec1..cbd3478a 100644 --- a/src/include/epoch.hpp +++ b/include/mscclpp/epoch.hpp @@ -1,7 +1,7 @@ #ifndef MSCCLPP_EPOCH_HPP_ #define MSCCLPP_EPOCH_HPP_ -#include "mscclpp.hpp" +#include namespace mscclpp { diff --git a/src/include/errors.hpp b/include/mscclpp/errors.hpp similarity index 96% rename from src/include/errors.hpp rename to include/mscclpp/errors.hpp index 2425970f..eb18f98f 100644 --- a/src/include/errors.hpp +++ b/include/mscclpp/errors.hpp @@ -50,5 +50,7 @@ public: IbError(std::string message, int errorCode); virtual ~IbError() = default; }; + }; // namespace mscclpp -#endif // MSCCLPP_ERRORS_HPP + +#endif // MSCCLPP_ERRORS_HPP_ diff --git a/src/include/mscclppfifo.hpp b/include/mscclpp/fifo.hpp similarity index 97% rename from src/include/mscclppfifo.hpp rename to include/mscclpp/fifo.hpp index c13e4fb8..e3172dca 100644 --- a/src/include/mscclppfifo.hpp +++ b/include/mscclpp/fifo.hpp @@ 
-1,5 +1,5 @@ -#ifndef MSCCLPPFIFO_HPP_ -#define MSCCLPPFIFO_HPP_ +#ifndef MSCCLPP_FIFO_HPP_ +#define MSCCLPP_FIFO_HPP_ #include #include @@ -74,4 +74,4 @@ private: } // namespace mscclpp -#endif // MSCCLPPFIFO_H_ +#endif // MSCCLPP_FIFO_HPP_ diff --git a/src/include/proxy.hpp b/include/mscclpp/proxy.hpp similarity index 95% rename from src/include/proxy.hpp rename to include/mscclpp/proxy.hpp index 51ae4752..37decafb 100644 --- a/src/include/proxy.hpp +++ b/include/mscclpp/proxy.hpp @@ -1,7 +1,7 @@ #ifndef MSCCLPP_PROXY_HPP_ #define MSCCLPP_PROXY_HPP_ -#include "mscclppfifo.hpp" +#include #include #include diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 7c884726..b6311948 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -1,7 +1,7 @@ #include "bootstrap.h" #include "api.h" #include "checks.hpp" -#include "mscclpp.hpp" +#include #include "utils.h" #include diff --git a/src/channel.cc b/src/channel.cc index 33b679c2..bf5e6da6 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -1,4 +1,4 @@ -#include "channel.hpp" +#include #include "api.h" #include "checks.hpp" #include "debug.h" diff --git a/src/communicator.cc b/src/communicator.cc index c4abf818..1d670fa6 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -6,7 +6,7 @@ #include "communicator.hpp" #include "connection.hpp" #include "debug.h" -#include "mscclpp.hpp" +#include #include "registered_memory.hpp" #include "utils.h" diff --git a/src/epoch.cc b/src/epoch.cc index d358ca40..afdbf8c2 100644 --- a/src/epoch.cc +++ b/src/epoch.cc @@ -1,4 +1,4 @@ -#include "epoch.hpp" +#include #include "alloc.h" #include "api.h" #include "checks.hpp" diff --git a/src/errors.cc b/src/errors.cc index 40c9d9c0..c3a0a7b7 100644 --- a/src/errors.cc +++ b/src/errors.cc @@ -1,4 +1,4 @@ -#include "errors.hpp" +#include #include "api.h" namespace mscclpp { diff --git a/src/fifo.cc b/src/fifo.cc index 49902816..2c4ebf7a 100644 --- a/src/fifo.cc +++ b/src/fifo.cc @@ -1,7 +1,7 
@@ #include "alloc.h" #include "api.h" #include "checks.hpp" -#include "mscclppfifo.hpp" +#include #include #include #include diff --git a/src/ib.cc b/src/ib.cc index 9d1c3203..e6c91eb3 100644 --- a/src/ib.cc +++ b/src/ib.cc @@ -11,7 +11,7 @@ #include "comm.h" #include "debug.h" #include "ib.hpp" -#include "mscclpp.hpp" +#include #include #include diff --git a/src/include/basic_proxy_handler.hpp b/src/include/basic_proxy_handler.hpp index 58e41930..c1dc1038 100644 --- a/src/include/basic_proxy_handler.hpp +++ b/src/include/basic_proxy_handler.hpp @@ -2,7 +2,7 @@ #define MSCCLPP_BASIC_PROXY_SERVICE_HPP_ #include "communicator.hpp" -#include "mscclpp.hpp" +#include namespace mscclpp { diff --git a/src/include/checks.hpp b/src/include/checks.hpp index e64c07d1..8332847b 100644 --- a/src/include/checks.hpp +++ b/src/include/checks.hpp @@ -8,7 +8,7 @@ #define MSCCLPP_CHECKS_HPP_ #include "debug.h" -#include "errors.hpp" +#include #include #include diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index 5b0c7485..cc464618 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -3,8 +3,8 @@ #include "ib.hpp" #include "mscclpp.h" -#include "mscclpp.hpp" -#include "proxy.hpp" +#include +#include #include #include diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 5f764b05..54f9b316 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -6,7 +6,7 @@ #include "communicator.hpp" #include "ib.hpp" -#include "mscclpp.hpp" +#include #include namespace mscclpp { diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index 23b71f50..7c26f4b4 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -2,10 +2,10 @@ #define MSCCLPP_REGISTERED_MEMORY_HPP_ #include "communicator.hpp" -#include "errors.hpp" +#include #include "ib.hpp" #include "mscclpp.h" -#include "mscclpp.hpp" +#include #include namespace mscclpp { diff --git 
a/src/proxy_cpp.cc b/src/proxy_cpp.cc index cd005e02..060bbfb0 100644 --- a/src/proxy_cpp.cc +++ b/src/proxy_cpp.cc @@ -1,6 +1,6 @@ #include "api.h" -#include "mscclpp.hpp" -#include "proxy.hpp" +#include +#include #include "utils.h" #include "utils.hpp" #include diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3eec4226..d7e59bc6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,6 +1,7 @@ function(add_test_executable name sources) add_executable(${name} ${sources}) target_link_libraries(${name} mscclpp CUDA::cudart CUDA::cuda_driver) + target_include_directories(${name} PRIVATE ${PROJECT_SOURCE_DIR}/src/include) if(USE_MPI_FOR_TESTS) target_link_libraries(${name} MPI::MPI_CXX) target_compile_definitions(${name} PRIVATE MSCCLPP_USE_MPI_FOR_TESTS) diff --git a/test/allgather_test_cpp.cu b/test/allgather_test_cpp.cu index ddfd51d8..60652a0f 100644 --- a/test/allgather_test_cpp.cu +++ b/test/allgather_test_cpp.cu @@ -1,7 +1,6 @@ -#include "mscclpp.h" -#include "mscclpp.hpp" +#include -#include "channel.hpp" +#include #ifdef MSCCLPP_USE_MPI_FOR_TESTS #include "mpi.h" diff --git a/test/bootstrap_test_cpp.cc b/test/bootstrap_test_cpp.cc index e4fe65bb..b32d83fa 100644 --- a/test/bootstrap_test_cpp.cc +++ b/test/bootstrap_test_cpp.cc @@ -1,4 +1,4 @@ -#include "mscclpp.hpp" +#include #include #include diff --git a/test/communicator_test_cpp.cu b/test/communicator_test_cpp.cu index 74f9aadb..cda4d712 100644 --- a/test/communicator_test_cpp.cu +++ b/test/communicator_test_cpp.cu @@ -1,5 +1,5 @@ -#include "epoch.hpp" -#include "mscclpp.hpp" +#include +#include #include #include diff --git a/test/ib_test.cc b/test/ib_test.cc index 3d99acb2..753d6fa4 100644 --- a/test/ib_test.cc +++ b/test/ib_test.cc @@ -2,7 +2,7 @@ #include "checks.h" #include "ib.hpp" #include "infiniband/verbs.h" -#include "mscclpp.hpp" +#include #include #include diff --git a/test/unit/core_tests.cc b/test/unit/core_tests.cc index df4c165e..e3bf7265 100644 --- 
a/test/unit/core_tests.cc +++ b/test/unit/core_tests.cc @@ -1,6 +1,6 @@ #include #include -#include "mscclpp.hpp" +#include class LocalCommunicatorTest : public ::testing::Test { protected: From ccf45b33a22baaba986f1061160ef03b824247c9 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Wed, 10 May 2023 22:03:42 +0000 Subject: [PATCH 123/135] Delete old init code and other C-style code --- src/CMakeLists.txt | 5 +- src/bootstrap/bootstrap.cc | 562 +-------------- src/c_style_remnants.cc | 45 ++ src/communicator.cc | 1 - src/gdr.cc | 75 -- src/ib.cc | 3 +- src/include/bootstrap.h | 26 - src/include/comm.h | 65 -- src/include/gdr.h | 156 ----- src/include/registered_ptr.hpp | 52 -- src/init.cc | 920 ------------------------- src/{misc => npkit}/npkit.cc | 2 +- src/{include => }/npkit/npkit.h | 4 +- src/{include => }/npkit/npkit_event.h | 0 src/{include => }/npkit/npkit_struct.h | 0 src/proxy.cc | 257 +++---- src/proxy_cpp.cc | 112 --- 17 files changed, 131 insertions(+), 2154 deletions(-) create mode 100644 src/c_style_remnants.cc delete mode 100644 src/gdr.cc delete mode 100644 src/include/bootstrap.h delete mode 100644 src/include/comm.h delete mode 100644 src/include/gdr.h delete mode 100644 src/include/registered_ptr.hpp delete mode 100644 src/init.cc rename src/{misc => npkit}/npkit.cc (99%) rename src/{include => }/npkit/npkit.h (96%) rename src/{include => }/npkit/npkit_event.h (100%) rename src/{include => }/npkit/npkit_struct.h (100%) delete mode 100644 src/proxy_cpp.cc diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5e583d45..dc86f638 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,2 @@ -file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.h) -file(GLOB to_remove gdr.cc) -list(REMOVE_ITEM SOURCES ${to_remove}) - +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc) target_sources(mscclpp PRIVATE ${SOURCES}) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index b6311948..d3020030 100644 --- 
a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -1,8 +1,8 @@ -#include "bootstrap.h" #include "api.h" #include "checks.hpp" #include #include "utils.h" +#include "socket.h" #include #include @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -17,17 +18,6 @@ using namespace mscclpp; namespace { -uint64_t hashUniqueId(const mscclppBootstrapHandle& id) -{ - const char* bytes = (const char*)&id; - uint64_t h = 0xdeadbeef; - for (int i = 0; i < (int)sizeof(mscclppBootstrapHandle); i++) { - h ^= h >> 32; - h *= 0x8db3db47fa2994ad; - h += bytes[i]; - } - return h; -} mscclppResult_t setFilesLimit() { @@ -515,551 +505,3 @@ MSCCLPP_API_CPP Bootstrap::~Bootstrap() { pimpl_->close(); } - -// ------------------- Old bootstrap functions ------------------- -struct BootstrapRootArgs -{ - struct mscclppSocket* listenSock; - uint64_t magic; -}; - -/* Init functions */ -static char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1]; -static union mscclppSocketAddress bootstrapNetIfAddr; -static int bootstrapNetInitDone = 0; -pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; - -mscclppResult_t bootstrapNetInit(const char* ip_port_pair) -{ - if (bootstrapNetInitDone == 0) { - pthread_mutex_lock(&bootstrapNetLock); - if (bootstrapNetInitDone == 0) { - const char* env; - if (ip_port_pair) { - env = ip_port_pair; - } else { - env = getenv("MSCCLPP_COMM_ID"); - } - if (env) { - union mscclppSocketAddress remoteAddr; - if (mscclppSocketGetAddrFromString(&remoteAddr, env) != mscclppSuccess) { - WARN("Invalid MSCCLPP_COMM_ID, please use format: : or []: or :"); - return mscclppInvalidArgument; - } - if (mscclppFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, - 1) <= 0) { - WARN("NET/Socket : No usable listening interface found"); - return mscclppSystemError; - } - } else { - int nIfs = mscclppFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); - if (nIfs <= 0) { - 
WARN("Bootstrap : no socket interface found"); - return mscclppInternalError; - } - } - char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2]; - sprintf(line, " %s:", bootstrapNetIfName); - mscclppSocketToString(&bootstrapNetIfAddr, line + strlen(line)); - INFO(MSCCLPP_INIT, "Bootstrap : Using%s", line); - bootstrapNetInitDone = 1; - } - pthread_mutex_unlock(&bootstrapNetLock); - } - return mscclppSuccess; -} - -// Additional sync functions -static mscclppResult_t bootstrapNetSend(struct mscclppSocket* sock, void* data, int size) -{ - MSCCLPPCHECK(mscclppSocketSend(sock, &size, sizeof(int))); - MSCCLPPCHECK(mscclppSocketSend(sock, data, size)); - return mscclppSuccess; -} -static mscclppResult_t bootstrapNetRecv(struct mscclppSocket* sock, void* data, int size) -{ - int recvSize; - MSCCLPPCHECK(mscclppSocketRecv(sock, &recvSize, sizeof(int))); - if (recvSize > size) { - WARN("Message truncated : received %d bytes instead of %d", recvSize, size); - return mscclppInternalError; - } - MSCCLPPCHECK(mscclppSocketRecv(sock, data, std::min(recvSize, size))); - return mscclppSuccess; -} - -// struct ExtInfo -// { -// int rank; -// int nranks; -// union mscclppSocketAddress extAddressListenRoot; -// union mscclppSocketAddress extAddressListen; -// }; - -#include - -// static mscclppResult_t setFilesLimit() -// { -// struct rlimit filesLimit; -// SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit"); -// filesLimit.rlim_cur = filesLimit.rlim_max; -// SYSCHECK(setrlimit(RLIMIT_NOFILE, &filesLimit), "setrlimit"); -// return mscclppSuccess; -// } - -static void* bootstrapRoot(void* rargs) -{ - struct BootstrapRootArgs* args = (struct BootstrapRootArgs*)rargs; - struct mscclppSocket* listenSock = args->listenSock; - uint64_t magic = args->magic; - mscclppResult_t res = mscclppSuccess; - int nranks = 0, c = 0; - struct ExtInfo info; - union mscclppSocketAddress* rankAddresses = NULL; - union mscclppSocketAddress* rankAddressesRoot = NULL; // for initial rank <-> root 
information exchange - union mscclppSocketAddress* zero = NULL; - MSCCLPPCHECKGOTO(mscclppCalloc(&zero, 1), res, out); - setFilesLimit(); - - TRACE(MSCCLPP_INIT, "BEGIN"); - /* Receive addresses from all ranks */ - do { - struct mscclppSocket sock; - MSCCLPPCHECKGOTO(mscclppSocketInit(&sock), res, out); - MSCCLPPCHECKGOTO(mscclppSocketAccept(&sock, listenSock), res, out); - MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out); - MSCCLPPCHECKGOTO(mscclppSocketClose(&sock), res, out); - - if (c == 0) { - nranks = info.nRanks; - MSCCLPPCHECKGOTO(mscclppCalloc(&rankAddresses, nranks), res, out); - MSCCLPPCHECKGOTO(mscclppCalloc(&rankAddressesRoot, nranks), res, out); - } - - if (nranks != info.nRanks) { - WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nRanks); - goto out; - } - - if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union mscclppSocketAddress)) != 0) { - WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks); - goto out; - } - - // Save the connection handle for that rank - memcpy(rankAddressesRoot + info.rank, &info.extAddressListenRoot, sizeof(union mscclppSocketAddress)); - memcpy(rankAddresses + info.rank, &info.extAddressListen, sizeof(union mscclppSocketAddress)); - - ++c; - TRACE(MSCCLPP_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks); - } while (c < nranks); - TRACE(MSCCLPP_INIT, "COLLECTED ALL %d HANDLES", nranks); - - // Send the connect handle for the next rank in the AllGather ring - for (int r = 0; r < nranks; ++r) { - int next = (r + 1) % nranks; - struct mscclppSocket sock; - MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, rankAddressesRoot + r, magic, mscclppSocketTypeBootstrap), res, out); - MSCCLPPCHECKGOTO(mscclppSocketConnect(&sock), res, out); - MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, rankAddresses + next, sizeof(union mscclppSocketAddress)), res, out); - MSCCLPPCHECKGOTO(mscclppSocketClose(&sock), res, out); - } - 
TRACE(MSCCLPP_INIT, "SENT OUT ALL %d HANDLES", nranks); - -out: - if (listenSock != NULL) { - mscclppSocketClose(listenSock); - free(listenSock); - } - if (rankAddresses) - free(rankAddresses); - if (rankAddressesRoot) - free(rankAddressesRoot); - if (zero) - free(zero); - free(rargs); - - TRACE(MSCCLPP_INIT, "DONE"); - return NULL; -} - -mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle) -{ - struct mscclppSocket* listenSock; - struct BootstrapRootArgs* args; - pthread_t thread; - - MSCCLPPCHECK(mscclppCalloc(&listenSock, 1)); - MSCCLPPCHECK(mscclppSocketInit(listenSock, &handle->addr, handle->magic, mscclppSocketTypeBootstrap, NULL, 0)); - MSCCLPPCHECK(mscclppSocketListen(listenSock)); - MSCCLPPCHECK(mscclppSocketGetAddr(listenSock, &handle->addr)); - - MSCCLPPCHECK(mscclppCalloc(&args, 1)); - args->listenSock = listenSock; - args->magic = handle->magic; - NEQCHECK(pthread_create(&thread, NULL, bootstrapRoot, (void*)args), 0); - mscclppSetThreadName(thread, "MSCCLPP BootstrapR"); - NEQCHECK(pthread_detach(thread), 0); // will not be pthread_join()'d - return mscclppSuccess; -} - -// #include -// #include - -mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot, const char* ip_port_pair) -{ - memset(handle, 0, sizeof(mscclppBootstrapHandle)); - const char* env = NULL; - - if (ip_port_pair) { - env = ip_port_pair; - } else { - env = getenv("MSCCLPP_COMM_ID"); - } - if (env) { - handle->magic = 0xdeadbeef; - - INFO(MSCCLPP_ENV, "MSCCLPP_COMM_ID set by environment to %s", env); - if (mscclppSocketGetAddrFromString(&handle->addr, env) != mscclppSuccess) { - WARN("Invalid MSCCLPP_COMM_ID, please use format: : or []: or :"); - return mscclppInvalidArgument; - } - if (isRoot) - MSCCLPPCHECK(bootstrapCreateRoot(handle)); - } else { - MSCCLPPCHECK(getRandomData(&handle->magic, sizeof(handle->magic))); - memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union mscclppSocketAddress)); - 
MSCCLPPCHECK(bootstrapCreateRoot(handle)); - } - printf("addr = %s port = %d\n", inet_ntoa(handle->addr.sin.sin_addr), (int)ntohs(handle->addr.sin.sin_port)); - // printf("addr = %s\n", inet_ntoa((*(struct sockaddr_in*)&handle->addr.sa).sin_addr)); - - return mscclppSuccess; -} - -struct UnexConn -{ - int peer; - int tag; - struct mscclppSocket sock; - struct UnexConn* next; -}; - -struct BootstrapState -{ - struct mscclppSocket listenSock; - struct mscclppSocket ringRecvSocket; - struct mscclppSocket ringSendSocket; - union mscclppSocketAddress* peerCommAddresses; - union mscclppSocketAddress* peerProxyAddresses; - struct UnexConn* unexpectedConnections; - int cudaDev; - int rank; - int nranks; - uint64_t magic; - volatile uint32_t* abortFlag; -}; - -mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscclppComm* comm) -{ - int rank = comm->rank; - int nranks = comm->nRanks; - struct BootstrapState* state; - struct mscclppSocket* proxySocket; - mscclppSocketAddress nextAddr; - struct mscclppSocket sock, listenSockRoot; - struct ExtInfo info; - - MSCCLPPCHECK(mscclppCalloc(&state, 1)); - state->rank = rank; - state->nranks = nranks; - state->abortFlag = comm->abortFlag; - comm->bootstrap = state; - comm->magic = state->magic = handle->magic; - - TRACE(MSCCLPP_INIT, "rank %d nranks %d", rank, nranks); - - info.rank = rank; - info.nRanks = nranks; - - // Create socket for other ranks to contact me - MSCCLPPCHECK(mscclppSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeBootstrap, - comm->abortFlag)); - MSCCLPPCHECK(mscclppSocketListen(&state->listenSock)); - MSCCLPPCHECK(mscclppSocketGetAddr(&state->listenSock, &info.extAddressListen)); - - // Create socket for root to contact me - MSCCLPPCHECK( - mscclppSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag)); - MSCCLPPCHECK(mscclppSocketListen(&listenSockRoot)); - 
MSCCLPPCHECK(mscclppSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot)); - - // stagger connection times to avoid an overload of the root - if (nranks > 128) { - long msec = rank; - struct timespec tv; - tv.tv_sec = msec / 1000; - tv.tv_nsec = 1000000 * (msec % 1000); - TRACE(MSCCLPP_INIT, "rank %d delaying connection to root by %ld msec", rank, msec); - (void)nanosleep(&tv, NULL); - } - - // send info on my listening socket to root - MSCCLPPCHECK(mscclppSocketInit(&sock, &handle->addr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag)); - MSCCLPPCHECK(mscclppSocketConnect(&sock)); - MSCCLPPCHECK(bootstrapNetSend(&sock, &info, sizeof(info))); - MSCCLPPCHECK(mscclppSocketClose(&sock)); - - // get info on my "next" rank in the bootstrap ring from root - MSCCLPPCHECK(mscclppSocketInit(&sock)); - MSCCLPPCHECK(mscclppSocketAccept(&sock, &listenSockRoot)); - MSCCLPPCHECK(bootstrapNetRecv(&sock, &nextAddr, sizeof(union mscclppSocketAddress))); - MSCCLPPCHECK(mscclppSocketClose(&sock)); - MSCCLPPCHECK(mscclppSocketClose(&listenSockRoot)); - - MSCCLPPCHECK( - mscclppSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag)); - MSCCLPPCHECK(mscclppSocketConnect(&state->ringSendSocket)); - // Accept the connect request from the previous rank in the AllGather ring - MSCCLPPCHECK(mscclppSocketInit(&state->ringRecvSocket)); - MSCCLPPCHECK(mscclppSocketAccept(&state->ringRecvSocket, &state->listenSock)); - - // AllGather all listen handlers - MSCCLPPCHECK(mscclppCalloc(&state->peerCommAddresses, nranks)); - MSCCLPPCHECK(mscclppSocketGetAddr(&state->listenSock, state->peerCommAddresses + rank)); - MSCCLPPCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union mscclppSocketAddress))); - - // Create the service proxy - MSCCLPPCHECK(mscclppCalloc(&state->peerProxyAddresses, nranks)); - - // proxy is aborted through a message; don't set abortFlag - MSCCLPPCHECK(mscclppCalloc(&proxySocket, 1)); - MSCCLPPCHECK( 
- mscclppSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeProxy, comm->abortFlag)); - MSCCLPPCHECK(mscclppSocketListen(proxySocket)); - MSCCLPPCHECK(mscclppSocketGetAddr(proxySocket, state->peerProxyAddresses + rank)); - MSCCLPPCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union mscclppSocketAddress))); - // MSCCLPPCHECK(mscclppProxyInit(comm, proxySocket, state->peerProxyAddresses)); - - TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank, nranks); - - return mscclppSuccess; -} - -mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size) -{ - struct BootstrapState* state = (struct BootstrapState*)commState; - char* data = (char*)allData; - int rank = state->rank; - int nranks = state->nranks; - - TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d", rank, nranks, size); - - /* Simple ring based AllGather - * At each step i receive data from (rank-i-1) from left - * and send previous step's data from (rank-i) to right - */ - for (int i = 0; i < nranks - 1; i++) { - size_t rslice = (rank - i - 1 + nranks) % nranks; - size_t sslice = (rank - i + nranks) % nranks; - - // Send slice to the right - MSCCLPPCHECK(bootstrapNetSend(&state->ringSendSocket, data + sslice * size, size)); - // Recv slice from the left - MSCCLPPCHECK(bootstrapNetRecv(&state->ringRecvSocket, data + rslice * size, size)); - } - - TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); - return mscclppSuccess; -} - -mscclppResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) -{ - mscclppResult_t ret = mscclppSuccess; - struct BootstrapState* state = (struct BootstrapState*)commState; - struct mscclppSocket sock; - - MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, state->peerCommAddresses + peer, state->magic, mscclppSocketTypeBootstrap, - state->abortFlag), - ret, fail); - MSCCLPPCHECKGOTO(mscclppSocketConnect(&sock), ret, fail); - MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, &state->rank, 
sizeof(int)), ret, fail); - MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail); - MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, fail); - -exit: - MSCCLPPCHECK(mscclppSocketClose(&sock)); - return ret; -fail: - goto exit; -} - -mscclppResult_t bootstrapBarrier(void* commState, int* ranks, int rank, int nranks, int tag) -{ - if (nranks == 1) - return mscclppSuccess; - TRACE(MSCCLPP_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag); - - /* Simple intra process barrier - * - * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet, - * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988" - */ - int data[1]; - for (int mask = 1; mask < nranks; mask <<= 1) { - int src = (rank - mask + nranks) % nranks; - int dst = (rank + mask) % nranks; - MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], tag, data, sizeof(data))); - MSCCLPPCHECK(bootstrapRecv(commState, ranks[src], tag, data, sizeof(data))); - } - - TRACE(MSCCLPP_INIT, "rank %d nranks %d tag %x - DONE", rank, nranks, tag); - return mscclppSuccess; -} - -mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int* ranks, int rank, int nranks, void* allData, int size) -{ - if (nranks == 1) - return mscclppSuccess; - char* data = (char*)allData; - TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - ENTER", rank, nranks, size); - - for (int i = 1; i < nranks; i++) { - int src = (rank - i + nranks) % nranks; - int dst = (rank + i) % nranks; - MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], /*tag=*/i, data + rank * size, size)); - MSCCLPPCHECK(bootstrapRecv(commState, ranks[src], /*tag=*/i, data + src * size, size)); - } - - TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); - return mscclppSuccess; -} - -mscclppResult_t unexpectedEnqueue(struct BootstrapState* state, int peer, int tag, struct mscclppSocket* sock) -{ - // New unex - struct UnexConn* unex; - 
MSCCLPPCHECK(mscclppCalloc(&unex, 1)); - unex->peer = peer; - unex->tag = tag; - memcpy(&unex->sock, sock, sizeof(struct mscclppSocket)); - - // Enqueue - struct UnexConn* list = state->unexpectedConnections; - if (list == NULL) { - state->unexpectedConnections = unex; - return mscclppSuccess; - } - while (list->next) - list = list->next; - list->next = unex; - return mscclppSuccess; -} - -mscclppResult_t unexpectedDequeue(struct BootstrapState* state, int peer, int tag, struct mscclppSocket* sock, - int* found) -{ - struct UnexConn* elem = state->unexpectedConnections; - struct UnexConn* prev = NULL; - *found = 0; - while (elem) { - if (elem->peer == peer && elem->tag == tag) { - if (prev == NULL) { - state->unexpectedConnections = elem->next; - } else { - prev->next = elem->next; - } - memcpy(sock, &elem->sock, sizeof(struct mscclppSocket)); - free(elem); - *found = 1; - return mscclppSuccess; - } - prev = elem; - elem = elem->next; - } - return mscclppSuccess; -} - -static void unexpectedFree(struct BootstrapState* state) -{ - struct UnexConn* elem = state->unexpectedConnections; - struct UnexConn* prev = NULL; - - while (elem) { - prev = elem; - elem = elem->next; - free(prev); - } - return; -} - -// We can't know who we'll receive from, so we need to receive everything at once -mscclppResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) -{ - mscclppResult_t ret = mscclppSuccess; - struct BootstrapState* state = (struct BootstrapState*)commState; - struct mscclppSocket sock; - int newPeer, newTag; - - // Search unexpected connections first - int found; - MSCCLPPCHECK(unexpectedDequeue(state, peer, tag, &sock, &found)); - if (found) { - MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail); - goto exit; - } - - // Then look for new connections - while (1) { - MSCCLPPCHECKGOTO(mscclppSocketInit(&sock), ret, fail); - MSCCLPPCHECKGOTO(mscclppSocketAccept(&sock, &state->listenSock), ret, fail); - 
MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, &newPeer, sizeof(int)), ret, fail); - MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, &newTag, sizeof(int)), ret, fail); - if (newPeer == peer && newTag == tag) { - MSCCLPPCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail); - goto exit; - } - // Unexpected connection. Save for later. - MSCCLPPCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, &sock), ret, fail); - } -exit: - MSCCLPPCHECK(mscclppSocketClose(&sock)); - return ret; -fail: - goto exit; -} - -mscclppResult_t bootstrapClose(void* commState) -{ - struct BootstrapState* state = (struct BootstrapState*)commState; - if (state->unexpectedConnections != NULL) { - unexpectedFree(state); - if (*state->abortFlag == 0) { - WARN("Unexpected connections are not empty"); - return mscclppInternalError; - } - } - - MSCCLPPCHECK(mscclppSocketClose(&state->listenSock)); - MSCCLPPCHECK(mscclppSocketClose(&state->ringSendSocket)); - MSCCLPPCHECK(mscclppSocketClose(&state->ringRecvSocket)); - - free(state->peerCommAddresses); - free(state); - - return mscclppSuccess; -} - -mscclppResult_t bootstrapAbort(void* commState) -{ - struct BootstrapState* state = (struct BootstrapState*)commState; - if (commState == NULL) - return mscclppSuccess; - MSCCLPPCHECK(mscclppSocketClose(&state->listenSock)); - MSCCLPPCHECK(mscclppSocketClose(&state->ringSendSocket)); - MSCCLPPCHECK(mscclppSocketClose(&state->ringRecvSocket)); - free(state->peerCommAddresses); - free(state->peerProxyAddresses); - free(state); - return mscclppSuccess; -} diff --git a/src/c_style_remnants.cc b/src/c_style_remnants.cc new file mode 100644 index 00000000..613ff7ee --- /dev/null +++ b/src/c_style_remnants.cc @@ -0,0 +1,45 @@ +#include "mscclpp.h" +#include "debug.h" +#include "config.h" +#include "api.h" + +MSCCLPP_API void mscclppDefaultLogHandler(const char* msg) +{ + mscclppDebugDefaultLogHandler(msg); +} + +MSCCLPP_API mscclppResult_t mscclppSetLogHandler(mscclppLogHandler_t handler) +{ + return 
mscclppDebugSetLogHandler(handler); +} + +MSCCLPP_API mscclppResult_t mscclppSetBootstrapConnTimeout(int timeout) +{ + mscclppConfig* config = mscclppConfig::getInstance(); + config->setBootstrapConnectionTimeoutConfig(timeout); + return mscclppSuccess; +} + +MSCCLPP_API const char* mscclppGetErrorString(mscclppResult_t code) +{ + switch (code) { + case mscclppSuccess: + return "no error"; + case mscclppUnhandledCudaError: + return "unhandled cuda error"; + case mscclppSystemError: + return "unhandled system error"; + case mscclppInternalError: + return "internal error"; + case mscclppInvalidArgument: + return "invalid argument"; + case mscclppInvalidUsage: + return "invalid usage"; + case mscclppRemoteError: + return "remote process exited or there was a network error"; + case mscclppInProgress: + return "MSCCL++ operation in progress"; + default: + return "unknown result code"; + } +} diff --git a/src/communicator.cc b/src/communicator.cc index 1d670fa6..8b721232 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -2,7 +2,6 @@ #include "api.h" #include "checks.hpp" -#include "comm.h" #include "communicator.hpp" #include "connection.hpp" #include "debug.h" diff --git a/src/gdr.cc b/src/gdr.cc deleted file mode 100644 index 95cd6870..00000000 --- a/src/gdr.cc +++ /dev/null @@ -1,75 +0,0 @@ -#include "gdr.h" - -// Used to make the GDR library calls thread safe -pthread_mutex_t gdrLock = PTHREAD_MUTEX_INITIALIZER; - -gdr_t wrap_gdr_open(void) -{ - return gdr_open(); -} - -mscclppResult_t wrap_gdr_close(gdr_t g) -{ - int ret = gdr_close(g); - if (ret != 0) { - WARN("gdr_close() failed: %d", ret); - return mscclppSystemError; - } - return mscclppSuccess; -} - -mscclppResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, - gdr_mh_t* handle) -{ - int ret; - GDRLOCKCALL(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle), ret); - if (ret != 0) { - WARN("gdr_pin_buffer(addr %lx, size %zi) failed: %d", 
addr, size, ret); - return mscclppSystemError; - } - return mscclppSuccess; -} - -mscclppResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) -{ - int ret; - GDRLOCKCALL(gdr_unpin_buffer(g, handle), ret); - if (ret != 0) { - WARN("gdr_unpin_buffer(handle %lx) failed: %d", handle.h, ret); - return mscclppSystemError; - } - return mscclppSuccess; -} - -mscclppResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t* info) -{ - int ret; - GDRLOCKCALL(gdr_get_info(g, handle, info), ret); - if (ret != 0) { - WARN("gdr_get_info(handle %lx) failed: %d", handle.h, ret); - return mscclppSystemError; - } - return mscclppSuccess; -} - -mscclppResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void** va, size_t size) -{ - int ret; - GDRLOCKCALL(gdr_map(g, handle, va, size), ret); - if (ret != 0) { - WARN("gdr_map(handle %lx, size %zi) failed: %d", handle.h, size, ret); - return mscclppSystemError; - } - return mscclppSuccess; -} - -mscclppResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void* va, size_t size) -{ - int ret; - GDRLOCKCALL(gdr_unmap(g, handle, va, size), ret); - if (ret != 0) { - WARN("gdr_unmap(handle %lx, va %p, size %zi) failed: %d", handle.h, va, size, ret); - return mscclppSystemError; - } - return mscclppSuccess; -} diff --git a/src/ib.cc b/src/ib.cc index e6c91eb3..32db71bb 100644 --- a/src/ib.cc +++ b/src/ib.cc @@ -8,13 +8,14 @@ #include "alloc.h" #include "api.h" #include "checks.hpp" -#include "comm.h" #include "debug.h" #include "ib.hpp" #include #include #include +#define MAXCONNECTIONS 64 + namespace mscclpp { IbMr::IbMr(void* pd, void* buff, std::size_t size) : buff(buff) diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h deleted file mode 100644 index 6bb20f81..00000000 --- a/src/include/bootstrap.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include "mscclpp.h" -#include "socket.h" - -#include "comm.h" - -// ------------------- Old bootstrap headers: to be removed ------------------- - -struct mscclppBootstrapHandle -{ - 
uint64_t magic; - union mscclppSocketAddress addr; -}; -mscclppResult_t bootstrapNetInit(const char* ip_port_pair = NULL); -mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle); -mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot = true, - const char* ip_port_pair = NULL); -mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscclppComm* comm); -mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size); -mscclppResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); -mscclppResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); -mscclppResult_t bootstrapBarrier(void* commState, int* ranks, int rank, int nranks, int tag); -mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int* ranks, int rank, int nranks, void* allData, int size); -mscclppResult_t bootstrapClose(void* commState); -mscclppResult_t bootstrapAbort(void* commState); diff --git a/src/include/comm.h b/src/include/comm.h deleted file mode 100644 index e6a067d6..00000000 --- a/src/include/comm.h +++ /dev/null @@ -1,65 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef MSCCLPP_COMM_H_ -#define MSCCLPP_COMM_H_ - -#include "ib.hpp" -#include "proxy.h" -#include -#include - -#define MAXCONNECTIONS 64 - -struct mscclppBufferRegistration -{ - void* data; - uint64_t size; -}; - -struct mscclppConn -{ - int connId; - mscclppTransport_t transport; - int remoteRank; - uint64_t buffSize; - struct mscclppDevConn* devConn; - struct mscclppHostConn* hostConn; - - std::vector bufferRegistrations; - std::vector remoteBufferRegistrations; - - mscclpp::IbCtx* ibCtx; -#if defined(ENABLE_NPKIT) - std::vector npkitUsedReqIds; - std::vector npkitFreeReqIds; -#endif -}; - -struct mscclppComm -{ - struct mscclppConn conns[MAXCONNECTIONS]; - struct mscclppDevConn devConns[MAXCONNECTIONS]; - int nConns; - - void* bootstrap; - - // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. - uint64_t magic; - - int rank; // my rank in the communicator - int nRanks; // number of GPUs in communicator - int cudaDev; // my cuda device index - int devNumaNode; // my device's NUMA node - - // Flag to ask MSCCLPP kernels to abort - volatile uint32_t* abortFlag; - - std::unique_ptr ibContext[MSCCLPP_IB_MAX_DEVS]; - struct mscclppProxyState* proxyState[MSCCLPP_PROXY_MAX_NUM]; -}; - -#endif diff --git a/src/include/gdr.h b/src/include/gdr.h deleted file mode 100644 index d7e0269a..00000000 --- a/src/include/gdr.h +++ /dev/null @@ -1,156 +0,0 @@ -#ifndef MSCCLPP_GDR_H_ -#define MSCCLPP_GDR_H_ - -#include "align.h" -#include "alloc.h" -#include "checks.h" -#include "debug.h" -#include "gdrapi.h" - -// These can be used if the GDR library isn't thread safe -#include -extern pthread_mutex_t gdrLock; -#define GDRLOCK() pthread_mutex_lock(&gdrLock) -#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock) -#define GDRLOCKCALL(cmd, ret) \ - do { \ - GDRLOCK(); \ - ret = cmd; \ - GDRUNLOCK(); \ - } while 
(false) - -#define GDRCHECK(cmd) \ - do { \ - int e; \ - /* GDRLOCKCALL(cmd, e); */ \ - e = cmd; \ - if (e != 0) { \ - WARN("GDRCOPY failure %d", e); \ - return mscclppSystemError; \ - } \ - } while (false) - -gdr_t wrap_gdr_open(void); -mscclppResult_t wrap_gdr_close(gdr_t g); -mscclppResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, - gdr_mh_t* handle); -mscclppResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle); -mscclppResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t* info); -mscclppResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void** va, size_t size); -mscclppResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void* va, size_t size); - -// Global GDR driver handle -extern gdr_t mscclppGdrCopy; - -typedef struct gdr_mem_desc -{ - void* gdrDevMem; - void* gdrMap; - size_t gdrOffset; - size_t gdrMapSize; - gdr_mh_t gdrMh; -} gdr_mem_desc_t; - -static gdr_t mscclppGdrInit() -{ - // int libMajor, libMinor, drvMajor, drvMinor; - gdr_t handle = wrap_gdr_open(); - - // if (handle != NULL) { - // mscclppResult_t res; - - // // Query the version of libgdrapi - // MSCCLPPCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error); - - // // Query the version of gdrdrv driver - // MSCCLPPCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error); - - // // Only support GDRAPI 2.1 and later - // if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) { - // goto error; - // } - // else - // INFO(MSCCLPP_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor); - // } - return handle; - // error: - // if (handle != NULL) (void) wrap_gdr_close(handle); - // return NULL; -} - -template -mscclppResult_t mscclppGdrCudaCallocDebug(T** ptr, T** devPtr, size_t nelem, void** gdrDesc, const char* filefunc, - int line) -{ - mscclppResult_t result = mscclppSuccess; - cudaStreamCaptureMode 
mode = cudaStreamCaptureModeRelaxed; - *ptr = nullptr; - *devPtr = nullptr; - *gdrDesc = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - - gdr_info_t info; - size_t mapSize; - gdr_mh_t mh; - char* devMem; - void* gdrMap; - ssize_t off; - gdr_mem_desc_t* md; - uint64_t alignedAddr; - size_t align; - - mapSize = sizeof(T) * nelem; - - // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE - ALIGN_SIZE(mapSize, GPU_PAGE_SIZE); - // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too - MSCCLPPCHECKGOTO(mscclppCudaCalloc(&devMem, mapSize + GPU_PAGE_SIZE - 1), result, finish); - alignedAddr = (((uint64_t)devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK; - align = alignedAddr - (uint64_t)devMem; - MSCCLPPCHECKGOTO(wrap_gdr_pin_buffer(mscclppGdrCopy, alignedAddr, mapSize, 0, 0, &mh), result, finish); - - MSCCLPPCHECKGOTO(wrap_gdr_map(mscclppGdrCopy, mh, &gdrMap, mapSize), result, finish); - - MSCCLPPCHECKGOTO(wrap_gdr_get_info(mscclppGdrCopy, mh, &info), result, finish); - - // Will offset ever be non zero ? - off = info.va - alignedAddr; - - MSCCLPPCHECKGOTO(mscclppCalloc(&md, 1), result, finish); - md->gdrDevMem = devMem; - md->gdrMap = gdrMap; - md->gdrMapSize = mapSize; - md->gdrOffset = off + align; - md->gdrMh = mh; - *gdrDesc = md; - - *ptr = (T*)((char*)gdrMap + off); - if (devPtr) - *devPtr = (T*)(devMem + off + align); - - TRACE(mscclpp_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p", md->gdrDevMem, - md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr); - - return mscclppSuccess; - -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) - WARN("Failed to CUDA calloc %ld bytes", nelem * sizeof(T)); - INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr); - return result; -} -#define mscclppGdrCudaCalloc(...) 
mscclppGdrCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -static mscclppResult_t mscclppGdrCudaFree(void* gdrDesc) -{ - gdr_mem_desc_t* md = (gdr_mem_desc_t*)gdrDesc; - MSCCLPPCHECK(wrap_gdr_unmap(mscclppGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize)); - MSCCLPPCHECK(wrap_gdr_unpin_buffer(mscclppGdrCopy, md->gdrMh)); - CUDACHECK(cudaFree(md->gdrDevMem)); - free(md); - - return mscclppSuccess; -} - -#endif diff --git a/src/include/registered_ptr.hpp b/src/include/registered_ptr.hpp deleted file mode 100644 index 4f03ea40..00000000 --- a/src/include/registered_ptr.hpp +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef MSCCLPP_REGISTERED_PTR_HPP_ -#define MSCCLPP_REGISTERED_PTR_HPP_ - -namespace mscclpp { - -template class RegisteredPtr -{ - RegisteredMemory memory; - size_t offset; - -public: - RegisteredPtr(RegisteredMemory memory, size_t offset) : memory(memory), offset(offset) - { - } - RegisteredPtr(RegisteredMemory memory) : RegisteredPtr(memory, 0) - { - } - ~RegisteredPtr() - { - } - - RegisteredMemory memory() - { - return memory; - } - - T* data() - { - return reinterpret_cast(memory.data()); - } - - size_t size() - { - return memory.size() / sizeof(T); - } - - size_t offset() - { - return offset; - } - - RegisteredPtr operator+(size_t offset) - { - return RegisteredPtr(memory, this->offset + offset); - } - - // TODO: all other relevant overloads -}; - -} // namespace mscclpp - -#endif // MSCCLPP_REGISTERED_PTR_HPP_ \ No newline at end of file diff --git a/src/init.cc b/src/init.cc deleted file mode 100644 index 03f037c4..00000000 --- a/src/init.cc +++ /dev/null @@ -1,920 +0,0 @@ -#include "alloc.h" -#include "api.h" -#include "bootstrap.h" -#include "checks.h" -#include "config.h" -#if defined(MSCCLPP_USE_GDRCOPY) -#include "gdr.h" -#endif -#include "infiniband/verbs.h" -#include "mscclpp.h" -#include -#include -#include -#if defined(ENABLE_NPKIT) -#include "npkit/npkit.h" -#endif - -static uint64_t hashUniqueId(mscclppUniqueId const& id) -{ - char const* bytes = 
(char const*)&id; - uint64_t h = 0xdeadbeef; - for (int i = 0; i < (int)sizeof(mscclppUniqueId); i++) { - h ^= h >> 32; - h *= 0x8db3db47fa2994ad; - h += bytes[i]; - } - return h; -} - -pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; -static bool initialized = false; -// static size_t maxLocalSizeBytes = 0; - -#if defined(MSCCLPP_USE_GDRCOPY) - -gdr_t mscclppGdrCopy = NULL; - -mscclppResult_t initGdrCopy() -{ - if (mscclppGdrCopy == NULL) { - mscclppGdrCopy = mscclppGdrInit(); - if (mscclppGdrCopy == NULL) { - WARN("GDR init failed"); - return mscclppSystemError; - } - } - return mscclppSuccess; -} - -#endif - -static mscclppResult_t mscclppInit() -{ - if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) - return mscclppSuccess; - pthread_mutex_lock(&initLock); - if (!initialized) { - // Always initialize bootstrap network - MSCCLPPCHECK(bootstrapNetInit()); - - __atomic_store_n(&initialized, true, __ATOMIC_RELEASE); - } - pthread_mutex_unlock(&initLock); - return mscclppSuccess; -} - -static std::string mscclppShmFileName(mscclppComm_t comm, int rank) -{ - std::stringstream ss; - ss << "mscclpp." << std::hex << comm->magic << "." 
<< rank; - return ss.str(); -} - -MSCCLPP_API mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* out) -{ - MSCCLPPCHECK(mscclppInit()); - // mscclppCHECK(PtrCheck(out, "GetUniqueId", "out")); - mscclppResult_t res = bootstrapGetUniqueId((struct mscclppBootstrapHandle*)out); - TRACE_CALL("mscclppGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out)); - return res; -} - -MSCCLPP_API mscclppResult_t mscclppBootstrapAllGather(mscclppComm_t comm, void* data, int size) -{ - MSCCLPPCHECK(bootstrapAllGather(comm->bootstrap, data, size)); - return mscclppSuccess; -} - -MSCCLPP_API mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char* ipPortPair, int rank) -{ -#if defined(MSCCLPP_USE_GDRCOPY) - MSCCLPPCHECK(initGdrCopy()); -#endif - - mscclppResult_t res = mscclppSuccess; - mscclppComm_t _comm = NULL; - // uint64_t hash = getHostHash(); - // uint64_t *hashes; - // std::map hashToNode; - - MSCCLPPCHECKGOTO(mscclppCalloc(&_comm, 1), res, fail); - _comm->rank = rank; - _comm->nRanks = nranks; - _comm->devNumaNode = -1; - // We assume that the user has set the device to the intended one already - CUDACHECK(cudaGetDevice(&_comm->cudaDev)); - - MSCCLPPCHECK(bootstrapNetInit(ipPortPair)); - mscclppBootstrapHandle handle; - MSCCLPPCHECK(bootstrapGetUniqueId(&handle, rank == 0, ipPortPair)); - _comm->magic = handle.magic; - - MSCCLPPCHECKGOTO(mscclppCudaHostCalloc((uint32_t**)&_comm->abortFlag, 1), res, fail); - MSCCLPPCHECK(bootstrapInit(&handle, _comm)); - -#if defined(ENABLE_NPKIT) - // Init NPKit - MSCCLPPCHECK(NpKit::Init(_comm->rank)); -#endif - - *comm = _comm; - return res; -fail: - if (_comm) { - if (_comm->abortFlag) - mscclppCudaHostFree((void*)_comm->abortFlag); - free(_comm); - } - if (comm) - *comm = NULL; - return res; -} - -MSCCLPP_API mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank) -{ -#if defined(MSCCLPP_USE_GDRCOPY) - MSCCLPPCHECK(initGdrCopy()); -#endif - - mscclppResult_t 
res = mscclppSuccess; - mscclppComm_t _comm = NULL; - mscclppBootstrapHandle* handle = (mscclppBootstrapHandle*)&id; - - MSCCLPPCHECKGOTO(mscclppCalloc(&_comm, 1), res, fail); - _comm->rank = rank; - _comm->nRanks = nranks; - // We assume that the user has set the device to the intended one already - CUDACHECK(cudaGetDevice(&_comm->cudaDev)); - - MSCCLPPCHECK(bootstrapNetInit()); - _comm->magic = handle->magic; - - MSCCLPPCHECKGOTO(mscclppCudaHostCalloc((uint32_t**)&_comm->abortFlag, 1), res, fail); - MSCCLPPCHECK(bootstrapInit(handle, _comm)); - -#if defined(ENABLE_NPKIT) - // Init NPKit - MSCCLPPCHECK(NpKit::Init(_comm->rank)); -#endif - - *comm = _comm; - return res; -fail: - if (_comm) { - if (_comm->abortFlag) - mscclppCudaHostFree((void*)_comm->abortFlag); - free(_comm); - } - if (comm) - *comm = NULL; - return res; -} - -MSCCLPP_API mscclppResult_t mscclppCommDestroy(mscclppComm_t comm) -{ -#if defined(ENABLE_NPKIT) - const char* npkitDumpDir = nullptr; -#endif - - if (comm == NULL) - return mscclppSuccess; - - for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) { - struct mscclppProxyState* proxyState = comm->proxyState[i]; - if (proxyState) { - MSCCLPPCHECK(proxyState->fifo.destroy()); - if (proxyState->p2pStream) - CUDACHECK(cudaStreamDestroy(proxyState->p2pStream)); - free(proxyState); - } - } - - for (int i = 0; i < MSCCLPP_IB_MAX_DEVS; ++i) { - if (comm->ibContext[i]) { - comm->ibContext[i].reset(nullptr); - } - } - - for (int i = 0; i < comm->nConns; i++) { - struct mscclppConn* conn = &comm->conns[i]; - if (conn) { - MSCCLPPCHECK(mscclppCudaFree(conn->devConn->localSignalEpochId)); - MSCCLPPCHECK(mscclppCudaFree(conn->devConn->waitEpochId)); - if (conn->hostConn) - delete conn->hostConn; - } - } - - if (comm->bootstrap) - MSCCLPPCHECK(bootstrapClose(comm->bootstrap)); - - mscclppCudaHostFree((void*)comm->abortFlag); - free(comm); - -#if defined(ENABLE_NPKIT) - // Dump NPKit events and shutdown - npkitDumpDir = getenv("NPKIT_DUMP_DIR"); - if (npkitDumpDir 
== nullptr) { - WARN("NPKIT_DUMP_DIR is empty"); - } else { - MSCCLPPCHECK(NpKit::Dump(npkitDumpDir)); - } - MSCCLPPCHECK(NpKit::Shutdown()); -#endif - - return mscclppSuccess; -} - -MSCCLPP_API const char* mscclppGetErrorString(mscclppResult_t code) -{ - switch (code) { - case mscclppSuccess: - return "no error"; - case mscclppUnhandledCudaError: - return "unhandled cuda error"; - case mscclppSystemError: - return "unhandled system error"; - case mscclppInternalError: - return "internal error"; - case mscclppInvalidArgument: - return "invalid argument"; - case mscclppInvalidUsage: - return "invalid usage"; - case mscclppRemoteError: - return "remote process exited or there was a network error"; - case mscclppInProgress: - return "MSCCL++ operation in progress"; - default: - return "unknown result code"; - } -} - -MSCCLPP_API mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, int tag, - mscclppDevConn_t** devConn) -{ - for (int i = 0; i < comm->nConns; i++) { - if (comm->devConns[i].remoteRank == remoteRank && comm->devConns[i].tag == tag) { - *devConn = &comm->devConns[i]; - return mscclppSuccess; - } - } - - return mscclppInvalidArgument; -} - -MSCCLPP_API mscclppResult_t mscclppGetAllDeviceConnections(mscclppComm_t comm, mscclppDevConn_t** devConns, int* nConns) -{ - *nConns = comm->nConns; - *devConns = comm->devConns; - return mscclppSuccess; -} - -#if defined(ENABLE_NPKIT) - -static void npkitInitReqIds(struct mscclppComm* comm) -{ - for (int i = 0; i < comm->nConns; i++) { - struct mscclppConn* conn = &comm->conns[i]; - conn->npkitUsedReqIds.resize(0); - conn->npkitFreeReqIds.resize(MSCCLPP_IB_MAX_SENDS); - for (uint64_t j = 0; j < MSCCLPP_IB_MAX_SENDS; j++) { - conn->npkitFreeReqIds[j] = MSCCLPP_IB_MAX_SENDS - j - 1; - } - } -} - -static void npkitCollectEntryEvent(struct mscclppConn* conn, uint8_t type, uint32_t size) -{ - uint64_t reqId = 0; - if (conn->npkitFreeReqIds.size() == 0) { - reqId = conn->npkitUsedReqIds.size(); - } 
else { - reqId = conn->npkitFreeReqIds.back(); - conn->npkitFreeReqIds.pop_back(); - } - conn->npkitUsedReqIds.push_back(reqId); - NpKit::CollectCpuEvent(type, size, (uint32_t)reqId, NpKit::GetCpuTimestamp(), conn->connId); -} - -static void npkitCollectExitEvents(struct mscclppConn* conn, uint8_t type) -{ - while (conn->npkitUsedReqIds.size()) { - uint64_t reqId = conn->npkitUsedReqIds.back(); - NpKit::CollectCpuEvent(type, 0, (uint32_t)reqId, NpKit::GetCpuTimestamp(), conn->connId); - conn->npkitFreeReqIds.push_back(reqId); - conn->npkitUsedReqIds.pop_back(); - } -} - -#else - -#define npkitInitReqIds(comm) - -#define npkitCollectEntryEvent(conn, type, size) - -#define npkitCollectExitEvents(conn, type) - -#endif - -struct mscclppHostP2PConn : mscclppHostConn -{ - mscclppHostP2PConn(mscclppConn* _conn, cudaStream_t _stream) : conn(_conn), p2pStream(_stream) - { - } - - void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) - { - put(1, dstDataOffset, 1, srcDataOffset, dataSize); - } - void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, - uint64_t dataSize) - { - void* srcBuff = (void*)((char*)conn->bufferRegistrations[src].data + srcDataOffset); - void* dstBuff = (void*)((char*)conn->remoteBufferRegistrations[dst].data + dstDataOffset); - CUDACHECKNORET(cudaMemcpyAsync(dstBuff, srcBuff, dataSize, cudaMemcpyDeviceToDevice, p2pStream)); - npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)dataSize); - } - void signal() - { - CUDACHECKNORET(cudaMemcpyAsync(&conn->devConn->remoteSignalEpochId->proxy, - &(conn->devConn->localSignalEpochId->device), sizeof(uint64_t), - cudaMemcpyDeviceToDevice, p2pStream)); - npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_FLAG_ENTRY, (uint32_t)sizeof(uint64_t)); - } - void wait() - { - } - void flush() - { - CUDACHECKNORET(cudaStreamSynchronize(p2pStream)); - npkitCollectExitEvents(conn, NPKIT_EVENT_DMA_SEND_EXIT); - } - - 
mscclppConn* conn; - cudaStream_t p2pStream; -}; - -struct mscclppHostIBConn : mscclppHostConn -{ - mscclppHostIBConn(mscclppConn* conn) : conn(conn) - { - this->ibQp = NULL; - } - - void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) - { - put(1, dstDataOffset, 1, srcDataOffset, dataSize); - } - void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, - uint64_t dataSize) - { - this->ibQp->stageSend(this->ibMrs[src], this->remoteIbMrInfos[dst], (uint32_t)dataSize, - /*wrId=*/0, /*srcOffset=*/srcDataOffset, /*dstOffset=*/dstDataOffset, /*signaled=*/false); - this->ibQp->postSend(); - npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_DATA_ENTRY, (uint32_t)dataSize); - } - void signal() - { - // My local device flag is copied to the remote's proxy flag - this->ibQp->stageSend(this->ibMrs[0], this->remoteIbMrInfos[0], sizeof(uint64_t), - /*wrId=*/0, /*srcOffset=*/0, /*dstOffset=*/sizeof(uint64_t), /*signaled=*/true); - this->ibQp->postSend(); - npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_FLAG_ENTRY, (uint32_t)sizeof(uint64_t)); - } - void wait() - { - } - void flush() - { - bool isWaiting = true; - while (isWaiting) { - int wcNum = this->ibQp->pollCq(); - if (wcNum < 0) { - WARN("pollCq failed: errno %d", errno); - continue; - } - for (int i = 0; i < wcNum; ++i) { - struct ibv_wc* wc = (struct ibv_wc*)this->ibQp->getWc(i); - if (wc->status != IBV_WC_SUCCESS) { - WARN("wc status %d", wc->status); - continue; - } - if (wc->opcode == IBV_WC_RDMA_WRITE) { - isWaiting = false; - break; - } - } - } - npkitCollectExitEvents(conn, NPKIT_EVENT_IB_SEND_EXIT); - } - - mscclppConn* conn; - mscclpp::IbQp* ibQp; - std::vector ibMrs; - std::vector remoteIbMrInfos; -}; - -MSCCLPP_API mscclppResult_t mscclppConnectWithoutBuffer(mscclppComm_t comm, int remoteRank, int tag, - mscclppTransport_t transportType, const char* ibDev) -{ - // save this processes numa binding and set it to the one closest to the 
device - // so that all the allocation are close to the device - if (comm->devNumaNode == -1) { - // in case this is our first time - MSCCLPPCHECK(getDeviceNumaNode(comm->cudaDev, &comm->devNumaNode)); - INFO(MSCCLPP_INIT, "NUMA node of device %d is set to %d", comm->cudaDev, comm->devNumaNode); - } - // save numa node bitmask to change it back to user's numa node - mscclppNumaState curProcessState; - MSCCLPPCHECK(getNumaState(&curProcessState)); - // change to device's numa node so that the following allocation are close to the device - MSCCLPPCHECK(numaBind(comm->devNumaNode)); - - if (comm->nConns == MAXCONNECTIONS) { - WARN("Too many connections made"); - return mscclppInternalError; - } - int connId = comm->nConns; - struct mscclppConn* conn = &comm->conns[connId]; - conn->connId = connId; - conn->transport = transportType; - conn->buffSize = 0; - - conn->ibCtx = NULL; - int ibDevIdx = -1; - if (transportType == mscclppTransportIB) { - // Check if an IB context exists - int firstNullIdx = -1; - for (int i = 0; i < MSCCLPP_IB_MAX_DEVS; ++i) { - if (comm->ibContext[i] == NULL) { - if (firstNullIdx == -1) { - firstNullIdx = i; - } - } else if (strncmp(comm->ibContext[i]->getDevName().c_str(), ibDev, IBV_SYSFS_NAME_MAX) == 0) { - ibDevIdx = i; - break; - } - } - - // If not, create a new one - if (ibDevIdx == -1) { - // Create a new context. 
- ibDevIdx = firstNullIdx; - comm->ibContext[ibDevIdx].reset(new mscclpp::IbCtx(std::string(ibDev))); - } - // Set the ib context for this conn - conn->ibCtx = comm->ibContext[ibDevIdx].get(); - - } else if (transportType == mscclppTransportP2P) { - // do the rest of the initialization later - } else if (transportType == mscclppTransportSHM) { - WARN("Shared memory interconnection is not implemented yet!"); - return mscclppInternalError; - } else { - WARN("Unexpected connection type!"); - return mscclppInvalidUsage; - } - - // Find/create a proxy state for the given connection - struct mscclppProxyState* proxyState = NULL; - // First see if there is a matching context - // If not, find the first empty proxy - int firstEmptyProxyIndex = -1; - for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) { - struct mscclppProxyState* curProxy = comm->proxyState[i]; - if (curProxy && (curProxy->transportType == transportType)) { - if ((transportType == mscclppTransportIB && curProxy->ibContext == conn->ibCtx) || - (transportType == mscclppTransportP2P)) { - proxyState = curProxy; - break; // we found the matching context - } - } - if (curProxy == NULL && firstEmptyProxyIndex == -1) { - firstEmptyProxyIndex = i; - } - } - - if (proxyState == NULL && firstEmptyProxyIndex == -1) { - WARN("Too many proxies have been allocated!"); - return mscclppInvalidUsage; - } - - // If we couldn't find a matching context, create one - if (proxyState == NULL) { - MSCCLPPCHECK(mscclppCalloc(&proxyState, 1)); - MSCCLPPCHECK(proxyState->fifo.create()); - - if (transportType == mscclppTransportIB) { - proxyState->ibContext = conn->ibCtx; - proxyState->p2pStream = NULL; - } else if (transportType == mscclppTransportP2P) { - proxyState->ibContext = NULL; - CUDACHECK(cudaStreamCreateWithFlags(&proxyState->p2pStream, cudaStreamNonBlocking)); - } - proxyState->numaNodeToBind = comm->devNumaNode; - - // INFO(MSCCLPP_INIT, "NUMA node for device %d is %d", cudaDev, *numaNode); - proxyState->transportType = 
transportType; - comm->proxyState[firstEmptyProxyIndex] = proxyState; - } - if (proxyState == NULL) { - // Cannot reach - WARN("Proxy allocation failed!"); - return mscclppInternalError; - } - - if (transportType == mscclppTransportIB) { - conn->hostConn = new mscclppHostIBConn(conn); - } else if (transportType == mscclppTransportP2P) { - conn->hostConn = new mscclppHostP2PConn(conn, proxyState->p2pStream); - } - - struct mscclppDevConn* devConn = &comm->devConns[connId]; - - conn->devConn = devConn; - conn->devConn->localBuff = nullptr; - MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->localSignalEpochId, 1)); - MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->waitEpochId, 1)); - conn->devConn->remoteRank = remoteRank; - conn->devConn->tag = tag; - conn->devConn->fifo.connId = connId; -#if defined(MSCCLPP_USE_GDRCOPY) - conn->devConn->fifo.triggerFifo = proxyState->fifo.triggerFifoDev; -#else - conn->devConn->fifo.triggerFifo = proxyState->fifo.triggerFifo; -#endif - conn->devConn->fifo.triggerFifoHead = proxyState->fifo.fifoHead; - conn->devConn->fifo.triggerFifoTail = proxyState->fifo.fifoTailDev; - - comm->nConns++; - - // change the numa binding back to user's - MSCCLPPCHECK(setNumaState(curProcessState)); - - mscclppBufferHandle_t signalHandle = -1; - MSCCLPPCHECK(mscclppRegisterBufferForConnection(comm, connId, conn->devConn->localSignalEpochId, - sizeof(mscclppDevConnSignalEpochId), &signalHandle)); - if (signalHandle != 0) { - WARN("signal handle should be 0"); - return mscclppInternalError; - } - - return mscclppSuccess; -} - -MSCCLPP_API mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void* localBuff, - uint64_t buffSize, mscclppTransport_t transportType, const char* ibDev) -{ - int connId = comm->nConns; - MSCCLPPCHECK(mscclppConnectWithoutBuffer(comm, remoteRank, tag, transportType, ibDev)); - struct mscclppConn* conn = &comm->conns[connId]; - - conn->buffSize = buffSize; - conn->devConn->localBuff = localBuff; - - 
mscclppBufferHandle_t localBuffHandle = -1; - MSCCLPPCHECK(mscclppRegisterBufferForConnection(comm, connId, localBuff, buffSize, &localBuffHandle)); - if (localBuffHandle != 1) { - WARN("data buffer handle should be 1"); - return mscclppInternalError; - } - - return mscclppSuccess; -} - -MSCCLPP_API mscclppResult_t mscclppRegisterBufferForConnection(mscclppComm_t comm, int connIdx, void* localBuff, - uint64_t buffSize, mscclppBufferHandle_t* handle) -{ - if (connIdx >= comm->nConns) { - WARN("connIdx out of range"); - return mscclppInvalidArgument; - } - mscclppConn& conn = comm->conns[connIdx]; - *handle = conn.bufferRegistrations.size(); - conn.bufferRegistrations.emplace_back(); - conn.bufferRegistrations.back().data = localBuff; - conn.bufferRegistrations.back().size = buffSize; - - return mscclppSuccess; -} - -struct mscclppBufferRegistrationInfo -{ - cudaIpcMemHandle_t cudaHandle; - mscclpp::IbMrInfo ibMrInfo; - uint64_t size; -}; - -struct connInfo -{ - mscclpp::IbQpInfo infoQp; - std::vector bufferInfos; - - struct header - { - mscclpp::IbQpInfo infoQp; - int numBufferInfos; - }; - - mscclppResult_t sendOverBootstrap(void* bootstrap, int remoteRank, int tag) - { - header h; - h.infoQp = infoQp; - h.numBufferInfos = bufferInfos.size(); - MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, &h, sizeof(header))); - MSCCLPPCHECK(bootstrapSend(bootstrap, remoteRank, tag, bufferInfos.data(), - bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); - return mscclppSuccess; - } - - mscclppResult_t recvOverBootstrap(void* bootstrap, int remoteRank, int tag) - { - header h; - MSCCLPPCHECK(bootstrapRecv(bootstrap, remoteRank, tag, &h, sizeof(header))); - infoQp = h.infoQp; - bufferInfos.resize(h.numBufferInfos); - MSCCLPPCHECK(bootstrapRecv(bootstrap, remoteRank, tag, bufferInfos.data(), - bufferInfos.size() * sizeof(mscclppBufferRegistrationInfo))); - return mscclppSuccess; - } -}; - -mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo 
/*input*/, struct mscclppConn* conn /*input*/) -{ - if (conn == NULL) { - WARN("connection cannot be null"); - return mscclppInternalError; - } - - // Add all registered buffers - for (const auto& bufReg : conn->bufferRegistrations) { - connInfo->bufferInfos.emplace_back(); - CUDACHECK(cudaIpcGetMemHandle(&connInfo->bufferInfos.back().cudaHandle, bufReg.data)); - connInfo->bufferInfos.back().size = bufReg.size; - } - return mscclppSuccess; -} - -mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/, struct mscclppConn* conn /*output*/) -{ - if (connInfo == NULL || conn == NULL) { - WARN("ipcHandles or connection cannot be null"); - return mscclppInternalError; - } - if (connInfo->bufferInfos.size() < 1) { - WARN("at least 1 buffer info expected"); - return mscclppInternalError; - } - - // Open all remote registered buffers - for (size_t i = 0; i < connInfo->bufferInfos.size(); i++) { - mscclppBufferRegistration newBufReg; - CUDACHECK( - cudaIpcOpenMemHandle(&newBufReg.data, connInfo->bufferInfos[i].cudaHandle, cudaIpcMemLazyEnablePeerAccess)); - newBufReg.size = connInfo->bufferInfos[i].size; - conn->remoteBufferRegistrations.push_back(newBufReg); - } - - if (conn->remoteBufferRegistrations[0].size != sizeof(mscclppDevConnSignalEpochId)) { - WARN("buffer registration zero size doesn't match sizeof(mscclppDevConnSignalEpochId)"); - return mscclppInternalError; - } - conn->devConn->remoteSignalEpochId = (mscclppDevConnSignalEpochId*)conn->remoteBufferRegistrations[0].data; - - // For backwards compatibility with the previous API that assumed one data buffer per connection, set the remote - // buffer to the first remote data buffer - if (conn->remoteBufferRegistrations.size() > 1) { - conn->devConn->remoteBuff = conn->remoteBufferRegistrations[1].data; - } - return mscclppSuccess; -} - -mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output*/, struct mscclppConn* conn /*input*/) -{ - if (connInfo == NULL || conn == 
NULL) { - WARN("connInfo or connection cannot be null"); - return mscclppInternalError; - } - struct mscclppDevConn* devConn = conn->devConn; - struct mscclppHostIBConn* hostConn = (struct mscclppHostIBConn*)conn->hostConn; - devConn->remoteBuff = NULL; - devConn->remoteSignalEpochId = NULL; - - mscclpp::IbCtx* ibCtx = conn->ibCtx; - if (hostConn->ibQp == NULL) { - hostConn->ibQp = ibCtx->createQp(); - } - - // Add all registered buffers - for (const auto& bufReg : conn->bufferRegistrations) { - hostConn->ibMrs.emplace_back(ibCtx->registerMr(bufReg.data, sizeof(struct mscclppDevConnSignalEpochId))); - connInfo->bufferInfos.emplace_back(); - connInfo->bufferInfos.back().ibMrInfo = hostConn->ibMrs.back()->getInfo(); - connInfo->bufferInfos.back().size = bufReg.size; - } - - connInfo->infoQp = hostConn->ibQp->getInfo(); - return mscclppSuccess; -} - -mscclppResult_t mscclppIbConnectionSetupEnd(struct connInfo* connInfo /*input*/, struct mscclppConn* conn /*output*/) -{ - if (connInfo == NULL || conn == NULL) { - WARN("ipcHandles or connection cannot be null"); - return mscclppInternalError; - } - struct mscclppHostIBConn* hostConn = (struct mscclppHostIBConn*)conn->hostConn; - hostConn->ibQp->rtr(connInfo->infoQp); - hostConn->ibQp->rts(); - - // No remote pointers to set with IB, so we just set the Mrs - - // Push the Mrs for all the remote registered buffers - for (size_t i = 1; i < connInfo->bufferInfos.size(); i++) { - hostConn->remoteIbMrInfos.push_back(connInfo->bufferInfos[i].ibMrInfo); - - mscclppBufferRegistration newBufReg; - newBufReg.data = nullptr; - newBufReg.size = connInfo->bufferInfos[i].size; - conn->remoteBufferRegistrations.push_back(newBufReg); - } - return mscclppSuccess; -} - -MSCCLPP_API mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) -{ - // Send info to peers - for (int i = 0; i < comm->nConns; ++i) { - struct mscclppConn* conn = &comm->conns[i]; - - struct connInfo cInfo; - if (conn->transport == mscclppTransportP2P) { - 
MSCCLPPCHECK(mscclppP2pConnectionSetupStart(&cInfo, conn)); - } else if (conn->transport == mscclppTransportIB) { - MSCCLPPCHECK(mscclppIbConnectionSetupStart(&cInfo, conn)); - } - // TODO: from saemal: do we possibly deadlock if there are too many outstanding sends? - // MSCCLPPCHECK(bootstrapSend(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &cInfo, - // sizeof(cInfo))); - MSCCLPPCHECK(cInfo.sendOverBootstrap(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag)); - } - - // Recv info from peers - for (int i = 0; i < comm->nConns; ++i) { - struct mscclppConn* conn = &comm->conns[i]; - struct connInfo cInfo; - MSCCLPPCHECK(cInfo.recvOverBootstrap(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag)); - if (conn->transport == mscclppTransportP2P) { - MSCCLPPCHECK(mscclppP2pConnectionSetupEnd(&cInfo, conn)); - } else if (conn->transport == mscclppTransportIB) { - MSCCLPPCHECK(mscclppIbConnectionSetupEnd(&cInfo, conn)); - } - } - - // a barrier to ensure setup on all gpus are done and we can return to the user - MSCCLPPCHECK(mscclppBootstrapBarrier(comm)); - return mscclppSuccess; -} - -struct bufferInfo -{ - cudaIpcMemHandle_t handleBuff; - mscclpp::IbMrInfo infoBuffMr; -}; - -MSCCLPP_API mscclppResult_t mscclppRegisterBuffer(mscclppComm_t comm, void* local_memory, size_t size, - mscclppRegisteredMemory* regMem) -{ - std::vector ibMrs; - for (int i = 0; i < comm->nConns; ++i) { - struct mscclppConn* conn = &comm->conns[i]; - struct bufferInfo bInfo; - const mscclpp::IbMr* ibBuffMr; - - // TODO: (conn->transport & mscclppTransportP2P) to support both P2P and IB - if (conn->transport == mscclppTransportP2P) { - CUDACHECK(cudaIpcGetMemHandle(&bInfo.handleBuff, local_memory)); - } else if (conn->transport == mscclppTransportIB) { - ibBuffMr = conn->ibCtx->registerMr(local_memory, size); - bInfo.infoBuffMr = ibBuffMr->getInfo(); - ibMrs.emplace_back(ibBuffMr); - } - - MSCCLPPCHECK(bootstrapSend(comm->bootstrap, 
conn->devConn->remoteRank, conn->devConn->tag, &bInfo, sizeof(bInfo))); - } - - // Recv info from peers - for (int i = 0; i < comm->nConns; ++i) { - struct mscclppConn* conn = &comm->conns[i]; - struct bufferInfo bInfo; - - mscclppRegisteredMemoryP2P p2p; - p2p.IbMr = NULL; - p2p.remoteBuff = NULL; - MSCCLPPCHECK(bootstrapRecv(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &bInfo, sizeof(bInfo))); - - // TODO: (conn->transport & mscclppTransportP2P) to support both P2P and IB - if (conn->transport == mscclppTransportP2P) { - CUDACHECK(cudaIpcOpenMemHandle((void**)&p2p.remoteBuff, bInfo.handleBuff, cudaIpcMemLazyEnablePeerAccess)); - } else if (conn->transport == mscclppTransportIB) { - p2p.IbMr = ibMrs[i]; - } - regMem->p2p.push_back(p2p); - } - return mscclppSuccess; -} - -MSCCLPP_API mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, mscclppRegisteredMemory* regMem, - void* srcBuff, size_t size, uint32_t srcOffset, - uint32_t dstOffset, int64_t stream) -{ - int ret = 0; - // TODO: transport should be an argument too so user can decide which transport to use - for (int i = 0; i < comm->nConns; ++i) { - struct mscclppConn* conn = &comm->conns[i]; - // TODO: (conn->transport & mscclppTransportP2P) to support both P2P and IB - if (conn->transport == mscclppTransportP2P) { - void* dstBuff = regMem->p2p[i].remoteBuff; - CUDACHECK(cudaMemcpyAsync(dstBuff, srcBuff, size, cudaMemcpyDeviceToDevice, (cudaStream_t)stream)); - } else { - WARN("mscclppRegisteredBufferWrite not implemented for IB"); - return mscclppInternalError; - // TODO: fix the following (Olli: probably by including the relevant ibBuffMr in the mscclppRegisteredMemory) - // struct mscclppHostIBConn* hostConn = (struct mscclppHostIBConn*)conn->hostConn; - // hostConn->ibQp->stageSend(hostConn->ibBuffMr, &hostConn->ibBuffMrRemoteInfo, (uint32_t)size, - // /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, /*signaled=*/false); - // if ((ret = 
hostConn->ibQp->postSend()) != 0) { - // // Return value is errno. - // WARN("data postSend failed: errno %d", ret); - // } - // // ?? - // // npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_ENTRY, (uint32_t)trigger.fields.dataSize, - // // trigger.fields.connId); - } - } - return mscclppSuccess; -} - -// TODO: destroy registered buffer - -MSCCLPP_API mscclppResult_t mscclppProxyLaunch(mscclppComm_t comm) -{ - npkitInitReqIds(comm); - MSCCLPPCHECK(mscclppProxyCreate(comm)); - return mscclppSuccess; -} - -MSCCLPP_API mscclppResult_t mscclppBootstrapBarrier(mscclppComm_t comm) -{ - int* tmp = new int[comm->nRanks]; - MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int))); - delete[] tmp; - return mscclppSuccess; -} - -MSCCLPP_API mscclppResult_t mscclppProxyStop(mscclppComm_t comm) -{ - // a barrier to make sure all ranks are done with their work before stopping the proxy - MSCCLPPCHECK(mscclppBootstrapBarrier(comm)); - - MSCCLPPCHECK(mscclppProxyDestroy(comm)); - return mscclppSuccess; -} - -MSCCLPP_API mscclppResult_t mscclppCommRank(mscclppComm_t comm, int* rank) -{ - if (comm == NULL || rank == NULL) { - WARN("comm or rank cannot be null"); - return mscclppInvalidUsage; - } - *rank = comm->rank; - return mscclppSuccess; -} - -MSCCLPP_API mscclppResult_t mscclppCommSize(mscclppComm_t comm, int* size) -{ - if (comm == NULL || size == NULL) { - WARN("comm or size cannot be null"); - return mscclppInvalidUsage; - } - *size = comm->nRanks; - return mscclppSuccess; -} - -MSCCLPP_API void mscclppDefaultLogHandler(const char* msg) -{ - mscclppDebugDefaultLogHandler(msg); -} - -MSCCLPP_API mscclppResult_t mscclppSetLogHandler(mscclppLogHandler_t handler) -{ - return mscclppDebugSetLogHandler(handler); -} - -MSCCLPP_API mscclppResult_t mscclppSetBootstrapConnTimeout(int timeout) -{ - mscclppConfig* config = mscclppConfig::getInstance(); - config->setBootstrapConnectionTimeoutConfig(timeout); - return mscclppSuccess; -} diff --git a/src/misc/npkit.cc 
b/src/npkit/npkit.cc similarity index 99% rename from src/misc/npkit.cc rename to src/npkit/npkit.cc index 30914810..e7fe78f8 100644 --- a/src/misc/npkit.cc +++ b/src/npkit/npkit.cc @@ -3,7 +3,7 @@ #include #include "alloc.h" -#include "npkit/npkit.h" +#include "npkit.h" #include uint64_t NpKit::rank_ = 0; diff --git a/src/include/npkit/npkit.h b/src/npkit/npkit.h similarity index 96% rename from src/include/npkit/npkit.h rename to src/npkit/npkit.h index f0a72dfc..c0cc4710 100644 --- a/src/include/npkit/npkit.h +++ b/src/npkit/npkit.h @@ -3,8 +3,8 @@ #include -#include "npkit/npkit_event.h" -#include "npkit/npkit_struct.h" +#include "npkit_event.h" +#include "npkit_struct.h" class NpKit { diff --git a/src/include/npkit/npkit_event.h b/src/npkit/npkit_event.h similarity index 100% rename from src/include/npkit/npkit_event.h rename to src/npkit/npkit_event.h diff --git a/src/include/npkit/npkit_struct.h b/src/npkit/npkit_struct.h similarity index 100% rename from src/include/npkit/npkit_struct.h rename to src/npkit/npkit_struct.h diff --git a/src/proxy.cc b/src/proxy.cc index c8bf4414..060bbfb0 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -1,213 +1,112 @@ -#include "alloc.h" -#include "checks.h" -#include "comm.h" -#include "debug.h" -#include "ib.hpp" -#include "socket.h" - -#include -#include -#include +#include "api.h" +#include +#include +#include "utils.h" +#include "utils.hpp" +#include #include -#define MSCCLPP_PROXY_RUN_STATE_CHECK_PERIOD 100 +namespace mscclpp { -#define PROXYCUDACHECK(cmd) \ - do { \ - cudaError_t err = cmd; \ - if (err != cudaSuccess) { \ - WARN("CUDA error from proxy: %s", cudaGetErrorString(err)); \ - return NULL; \ - } \ - } while (false) +const int ProxyStopCheckPeriod = 1000; -#define PROXYMSCCLPPCHECK(call) \ - do { \ - mscclppResult_t res = call; \ - if (res != mscclppSuccess && res != mscclppInProgress) { \ - /* Print the back trace*/ \ - if (mscclppDebugNoWarn == 0) \ - INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, 
res); \ - return NULL; \ - } \ - } while (0); +const int ProxyFlushPeriod = 4; -struct proxyArgs +struct Proxy::Impl { - struct mscclppComm* comm; - struct mscclppProxyState* proxyState; + ProxyHandler handler; + std::function threadInit; + HostProxyFifo fifo; + std::thread service; + std::atomic_bool running; + + Impl(ProxyHandler handler, std::function threadInit) + : handler(handler), threadInit(threadInit), running(false) + { + } }; -mscclppResult_t mscclppProxyFifo::create() +MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler, std::function threadInit) { - MSCCLPPCHECK(mscclppCudaCalloc(&this->fifoHead, 1)); -#if defined(MSCCLPP_USE_GDRCOPY) - MSCCLPPCHECK( - mscclppGdrCudaCalloc(&this->triggerFifo, &this->triggerFifoDev, MSCCLPP_PROXY_FIFO_SIZE, &this->triggerFifoDesc)); - MSCCLPPCHECK(mscclppGdrCudaCalloc(&this->fifoTailDevHostPtr, &this->fifoTailDev, 1, &this->fifoTailDesc)); -#else - MSCCLPPCHECK(mscclppCudaHostCalloc(&this->triggerFifo, MSCCLPP_PROXY_FIFO_SIZE)); - MSCCLPPCHECK(mscclppCudaCalloc(&this->fifoTailDev, 1)); -#endif - CUDACHECK(cudaStreamCreateWithFlags(&this->stream, cudaStreamNonBlocking)); - this->fifoTailHost = 0; - return mscclppSuccess; + pimpl = std::make_unique(handler, threadInit); } -mscclppResult_t mscclppProxyFifo::destroy() +MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler) : Proxy(handler, [] {}) { - MSCCLPPCHECK(mscclppCudaFree(this->fifoHead)); -#if defined(MSCCLPP_USE_GDRCOPY) - MSCCLPPCHECK(mscclppGdrCudaFree(this->triggerFifoDesc)); - MSCCLPPCHECK(mscclppGdrCudaFree(this->fifoTailDesc)); -#else - MSCCLPPCHECK(mscclppCudaHostFree(this->triggerFifo)); - MSCCLPPCHECK(mscclppCudaFree(this->fifoTailDev)); -#endif - CUDACHECK(cudaStreamDestroy(this->stream)); - return mscclppSuccess; } -// return true if the trigger is valid -mscclppResult_t mscclppProxyFifo::poll(mscclppTrigger* trigger) +MSCCLPP_API_CPP Proxy::~Proxy() { - __m128i xmm0 = _mm_load_si128((__m128i*)&this->triggerFifo[this->fifoTailHost % MSCCLPP_PROXY_FIFO_SIZE]); 
- _mm_store_si128((__m128i*)trigger, xmm0); - return mscclppSuccess; -} - -mscclppResult_t mscclppProxyFifo::pop() -{ - *(volatile uint64_t*)(&this->triggerFifo[this->fifoTailHost % MSCCLPP_PROXY_FIFO_SIZE]) = 0; - (this->fifoTailHost)++; - return mscclppSuccess; -} - -mscclppResult_t mscclppProxyFifo::flushTail(bool sync) -{ - // Flush the tail to device memory. This is either triggered every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER to make sure - // that the fifo can make progress even if there is no request mscclppSync. However, mscclppSync type is for flush - // request. -#if defined(MSCCLPP_USE_GDRCOPY) - *(volatile uint64_t*)(this->fifoTailDevHostPtr) = this->fifoTailHost; -#else - CUDACHECK( - cudaMemcpyAsync(this->fifoTailDev, &(this->fifoTailHost), sizeof(uint64_t), cudaMemcpyHostToDevice, this->stream)); - if (sync) { - CUDACHECK(cudaStreamSynchronize(this->stream)); - } -#endif - return mscclppSuccess; -} - -static void processTrigger(const mscclppTrigger trigger, mscclppConn* conn) -{ - // Iterate over what send is needed - if (trigger.fields.type & mscclppData) { - conn->hostConn->put(trigger.fields.dstDataOffset, trigger.fields.srcDataOffset, trigger.fields.dataSize); - } - - if (trigger.fields.type & mscclppFlag) { - conn->hostConn->signal(); - } - - // Wait for completion - if (trigger.fields.type & mscclppSync) { - conn->hostConn->flush(); + if (pimpl) { + stop(); } } -void* mscclppProxyService(void* _args) +MSCCLPP_API_CPP void Proxy::start() { - struct proxyArgs* args = (struct proxyArgs*)_args; - struct mscclppComm* comm = args->comm; - struct mscclppProxyState* proxyState = args->proxyState; - free(_args); // allocated in mscclppProxyCreate + pimpl->running = true; + pimpl->service = std::thread([this] { + pimpl->threadInit(); - // from this point on, proxy thread will stay close to the device - PROXYMSCCLPPCHECK(numaBind(comm->devNumaNode)); + ProxyHandler handler = this->pimpl->handler; + HostProxyFifo& fifo = this->pimpl->fifo; + std::atomic_bool& 
running = this->pimpl->running; + ProxyTrigger trigger; - struct mscclppProxyFifo* fifo = &proxyState->fifo; - volatile mscclppProxyRunState_t* run = &proxyState->run; - mscclppTrigger trigger; + int runCnt = ProxyStopCheckPeriod; + uint64_t flushCnt = 0; + for (;;) { + if (runCnt-- == 0) { + runCnt = ProxyStopCheckPeriod; + if (!running) { + break; + } + } + // Poll to see if we are ready to send anything + fifo.poll(&trigger); + if (trigger.fst == 0) { // TODO: this check is a potential pitfall for custom triggers + continue; // there is one in progress + } - int runCnt = MSCCLPP_PROXY_RUN_STATE_CHECK_PERIOD; - uint64_t flushCnt = 0; - for (;;) { - if (runCnt-- == 0) { - runCnt = MSCCLPP_PROXY_RUN_STATE_CHECK_PERIOD; - if (*run != MSCCLPP_PROXY_RUN_STATE_RUNNING) { + ProxyHandlerResult result = handler(trigger); + + // Send completion: reset only the high 64 bits + fifo.pop(); + // Flush the tail to device memory. This is either triggered every ProxyFlushPeriod to make sure + // that the fifo can make progress even if there is no request mscclppSync. However, mscclppSync type is for flush + // request. + if ((++flushCnt % ProxyFlushPeriod) == 0 || result == ProxyHandlerResult::FlushFifoTailAndContinue) { + // TODO: relocate this check: || (trigger.fields.type & mscclppSync) + fifo.flushTail(); + } + + if (result == ProxyHandlerResult::Stop) { break; } } - // Poll to see if we are ready to send anything - PROXYMSCCLPPCHECK(fifo->poll(&trigger)); - if (trigger.value[0] == 0) { - continue; // there is one in progreess - } - mscclppConn* conn = &comm->conns[trigger.fields.connId]; - processTrigger(trigger, conn); - - // Send completion: reset only the high 64 bits - PROXYMSCCLPPCHECK(fifo->pop()); - // Flush the tail to device memory. This is either triggered every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER to make sure - // that the fifo can make progress even if there is no request mscclppSync. However, mscclppSync type is for flush - // request. 
- if (((++flushCnt % MSCCLPP_PROXY_FIFO_FLUSH_COUNTER) == 0) || (trigger.fields.type & mscclppSync)) { - PROXYMSCCLPPCHECK(fifo->flushTail()); - } - } - - // make sure the tail is flushed before we shut the proxy - PROXYMSCCLPPCHECK(fifo->flushTail(/*sync=*/true)); - bool isP2pProxy = (proxyState->ibContext == nullptr); - if (isP2pProxy) { - cudaStream_t p2pStream = proxyState->p2pStream; - PROXYCUDACHECK(cudaStreamSynchronize(p2pStream)); - } - *run = MSCCLPP_PROXY_RUN_STATE_IDLE; - return NULL; + // make sure the tail is flushed before we shut the proxy + fifo.flushTail(/*sync=*/true); + // TODO: do these need to run? + // bool isP2pProxy = (proxyState->ibContext == nullptr); + // if (isP2pProxy) { + // cudaStream_t p2pStream = proxyState->p2pStream; + // PROXYCUDACHECK(cudaStreamSynchronize(p2pStream)); + // } + }); } -mscclppResult_t mscclppProxyCreate(struct mscclppComm* comm) +MSCCLPP_API_CPP void Proxy::stop() { - for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) { - struct mscclppProxyState* proxyState = comm->proxyState[i]; - if (proxyState == NULL) - break; - - struct proxyArgs* args; - MSCCLPPCHECK(mscclppCalloc(&args, 1)); - args->comm = comm; - args->proxyState = proxyState; - - proxyState->run = MSCCLPP_PROXY_RUN_STATE_RUNNING; - pthread_create(&proxyState->thread, NULL, mscclppProxyService, args); - if (proxyState->transportType == mscclppTransportP2P) { - mscclppSetThreadName(proxyState->thread, "MSCCLPP Service P2P - %02d", comm->cudaDev); - } else if (proxyState->transportType == mscclppTransportIB) { - mscclppSetThreadName(proxyState->thread, "MSCCLPP Service IB - %02d", i); - } + pimpl->running = false; + if (pimpl->service.joinable()) { + pimpl->service.join(); } - return mscclppSuccess; } -mscclppResult_t mscclppProxyDestroy(struct mscclppComm* comm) +MSCCLPP_API_CPP HostProxyFifo& Proxy::fifo() { - for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) { - struct mscclppProxyState* proxyState = comm->proxyState[i]; - if (proxyState == NULL) - break; - 
- volatile int* run = (volatile int*)&proxyState->run; - if (*run == MSCCLPP_PROXY_RUN_STATE_IDLE) { - continue; - } - *run = MSCCLPP_PROXY_RUN_STATE_EXITING; - while (*run == MSCCLPP_PROXY_RUN_STATE_EXITING && *comm->abortFlag == 0) { - usleep(1000); - } - } - return mscclppSuccess; + return pimpl->fifo; } + +} // namespace mscclpp diff --git a/src/proxy_cpp.cc b/src/proxy_cpp.cc deleted file mode 100644 index 060bbfb0..00000000 --- a/src/proxy_cpp.cc +++ /dev/null @@ -1,112 +0,0 @@ -#include "api.h" -#include -#include -#include "utils.h" -#include "utils.hpp" -#include -#include - -namespace mscclpp { - -const int ProxyStopCheckPeriod = 1000; - -const int ProxyFlushPeriod = 4; - -struct Proxy::Impl -{ - ProxyHandler handler; - std::function threadInit; - HostProxyFifo fifo; - std::thread service; - std::atomic_bool running; - - Impl(ProxyHandler handler, std::function threadInit) - : handler(handler), threadInit(threadInit), running(false) - { - } -}; - -MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler, std::function threadInit) -{ - pimpl = std::make_unique(handler, threadInit); -} - -MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler) : Proxy(handler, [] {}) -{ -} - -MSCCLPP_API_CPP Proxy::~Proxy() -{ - if (pimpl) { - stop(); - } -} - -MSCCLPP_API_CPP void Proxy::start() -{ - pimpl->running = true; - pimpl->service = std::thread([this] { - pimpl->threadInit(); - - ProxyHandler handler = this->pimpl->handler; - HostProxyFifo& fifo = this->pimpl->fifo; - std::atomic_bool& running = this->pimpl->running; - ProxyTrigger trigger; - - int runCnt = ProxyStopCheckPeriod; - uint64_t flushCnt = 0; - for (;;) { - if (runCnt-- == 0) { - runCnt = ProxyStopCheckPeriod; - if (!running) { - break; - } - } - // Poll to see if we are ready to send anything - fifo.poll(&trigger); - if (trigger.fst == 0) { // TODO: this check is a potential pitfall for custom triggers - continue; // there is one in progress - } - - ProxyHandlerResult result = handler(trigger); - - // Send 
completion: reset only the high 64 bits - fifo.pop(); - // Flush the tail to device memory. This is either triggered every ProxyFlushPeriod to make sure - // that the fifo can make progress even if there is no request mscclppSync. However, mscclppSync type is for flush - // request. - if ((++flushCnt % ProxyFlushPeriod) == 0 || result == ProxyHandlerResult::FlushFifoTailAndContinue) { - // TODO: relocate this check: || (trigger.fields.type & mscclppSync) - fifo.flushTail(); - } - - if (result == ProxyHandlerResult::Stop) { - break; - } - } - - // make sure the tail is flushed before we shut the proxy - fifo.flushTail(/*sync=*/true); - // TODO: do these need to run? - // bool isP2pProxy = (proxyState->ibContext == nullptr); - // if (isP2pProxy) { - // cudaStream_t p2pStream = proxyState->p2pStream; - // PROXYCUDACHECK(cudaStreamSynchronize(p2pStream)); - // } - }); -} - -MSCCLPP_API_CPP void Proxy::stop() -{ - pimpl->running = false; - if (pimpl->service.joinable()) { - pimpl->service.join(); - } -} - -MSCCLPP_API_CPP HostProxyFifo& Proxy::fifo() -{ - return pimpl->fifo; -} - -} // namespace mscclpp From 9f6c48cbf92637f397b9a3043ef7fa8c387f8ab4 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 11 May 2023 00:23:14 +0000 Subject: [PATCH 124/135] Format all files --- include/mscclpp/channel.hpp | 179 ++++++------------ include/mscclpp/core.hpp | 212 ++++++--------------- include/mscclpp/epoch.hpp | 39 ++-- include/mscclpp/errors.hpp | 34 ++-- include/mscclpp/fifo.hpp | 33 ++-- include/mscclpp/proxy.hpp | 17 +- src/bootstrap/bootstrap.cc | 198 +++++++------------- src/bootstrap/socket.cc | 256 ++++++++++---------------- src/c_style_remnants.cc | 56 +++--- src/channel.cc | 13 +- src/communicator.cc | 68 +++---- src/config.cc | 15 +- src/connection.cc | 84 +++------ src/debug.cc | 65 +++---- src/epoch.cc | 39 ++-- src/errors.cc | 28 +-- src/fifo.cc | 35 ++-- src/ib.cc | 212 +++++++++------------ src/include/align.h | 12 +- src/include/alloc.h | 72 ++++---- 
src/include/api.h | 2 +- src/include/basic_proxy_handler.hpp | 3 +- src/include/checks.h | 275 ++++++++++++++-------------- src/include/checks.hpp | 49 ++--- src/include/communicator.hpp | 14 +- src/include/config.h | 9 +- src/include/connection.hpp | 27 ++- src/include/debug.h | 17 +- src/include/ib.hpp | 31 ++-- src/include/mscclpp.h | 67 +++---- src/include/mscclppfifo.h | 34 ++-- src/include/proxy.h | 23 ++- src/include/registered_memory.hpp | 27 ++- src/include/socket.h | 14 +- src/include/utils.h | 24 +-- src/include/utils.hpp | 38 ++-- src/npkit/npkit.cc | 30 ++- src/npkit/npkit.h | 13 +- src/npkit/npkit_struct.h | 6 +- src/proxy.cc | 41 ++--- src/registered_memory.cc | 71 +++---- src/utils.cc | 89 +++------ 42 files changed, 990 insertions(+), 1581 deletions(-) diff --git a/include/mscclpp/channel.hpp b/include/mscclpp/channel.hpp index 9aa50902..474244ce 100644 --- a/include/mscclpp/channel.hpp +++ b/include/mscclpp/channel.hpp @@ -1,8 +1,8 @@ #ifndef MSCCLPP_CHANNEL_HPP_ #define MSCCLPP_CHANNEL_HPP_ -#include #include +#include #include #include @@ -10,22 +10,15 @@ namespace mscclpp { namespace channel { // A Channel pairs a Connection with an Epoch -class Channel -{ -public: +class Channel { + public: Channel(Communicator& communicator, std::shared_ptr connection) - : connection_(connection), epoch_(std::make_shared(communicator, connection)){}; + : connection_(connection), epoch_(std::make_shared(communicator, connection)){}; - Connection& connection() - { - return *connection_; - } - DeviceEpoch& epoch() - { - return *epoch_; - } + Connection& connection() { return *connection_; } + DeviceEpoch& epoch() { return *epoch_; } -private: + private: std::shared_ptr connection_; std::shared_ptr epoch_; }; @@ -51,12 +44,11 @@ using MemoryId = uint32_t; // the summation of number of bits must be 128 or less union ChannelTrigger { ProxyTrigger value; - struct - { + struct { // first 64 bits: value[0] uint64_t size : MSCCLPP_BITS_SIZE; uint64_t srcOffset : 
MSCCLPP_BITS_OFFSET; - uint64_t : (64 - MSCCLPP_BITS_SIZE - MSCCLPP_BITS_OFFSET); // ensure 64-bit alignment + uint64_t : (64 - MSCCLPP_BITS_SIZE - MSCCLPP_BITS_OFFSET); // ensure 64-bit alignment // second 64 bits: value[1] uint64_t dstOffset : MSCCLPP_BITS_OFFSET; uint64_t srcMemoryId : MSCCLPP_BITS_REGMEM_HANDLE; @@ -64,19 +56,14 @@ union ChannelTrigger { uint64_t type : MSCCLPP_BITS_TYPE; uint64_t chanId : MSCCLPP_BITS_CONNID; uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_REGMEM_HANDLE - MSCCLPP_BITS_REGMEM_HANDLE - - MSCCLPP_BITS_TYPE); // ensure 64-bit alignment + MSCCLPP_BITS_TYPE); // ensure 64-bit alignment } fields; #ifdef __CUDACC__ - __device__ ChannelTrigger() - { - } - __device__ ChannelTrigger(ProxyTrigger value) : value(value) - { - } + __device__ ChannelTrigger() {} + __device__ ChannelTrigger(ProxyTrigger value) : value(value) {} __device__ ChannelTrigger(TriggerType type, MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset, - uint64_t size, int connectionId) - { + uint64_t size, int connectionId) { value.fst = ((srcOffset << MSCCLPP_BITS_SIZE) + size); value.snd = ((((((((connectionId << MSCCLPP_BITS_TYPE) + (uint64_t)type) << MSCCLPP_BITS_REGMEM_HANDLE) + dst) << MSCCLPP_BITS_REGMEM_HANDLE) + @@ -84,69 +71,60 @@ union ChannelTrigger { << MSCCLPP_BITS_OFFSET) + dstOffset); } -#endif // __CUDACC__ +#endif // __CUDACC__ }; -struct DeviceChannel -{ +struct DeviceChannel { DeviceChannel() = default; DeviceChannel(ChannelId channelId, DeviceEpoch::DeviceHandle epoch, DeviceProxyFifo fifo) - : channelId_(channelId), epoch_(epoch), fifo_(fifo) - { - } + : channelId_(channelId), epoch_(epoch), fifo_(fifo) {} DeviceChannel(const DeviceChannel& other) = default; DeviceChannel& operator=(DeviceChannel& other) = default; #ifdef __CUDACC__ - __forceinline__ __device__ void put(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset, uint64_t size) - { + __forceinline__ __device__ void put(MemoryId dst, uint64_t dstOffset, 
MemoryId src, uint64_t srcOffset, + uint64_t size) { fifo_.push(ChannelTrigger(TriggerData, dst, dstOffset, src, srcOffset, size, channelId_).value); } - __forceinline__ __device__ void put(MemoryId dst, MemoryId src, uint64_t offset, uint64_t size) - { + __forceinline__ __device__ void put(MemoryId dst, MemoryId src, uint64_t offset, uint64_t size) { put(dst, offset, src, offset, size); } - __forceinline__ __device__ void signal() - { + __forceinline__ __device__ void signal() { epochIncrement(); fifo_.push(ChannelTrigger(TriggerFlag, 0, 0, 0, 0, 1, channelId_).value); } __forceinline__ __device__ void putWithSignal(MemoryId dst, uint64_t dstOffset, MemoryId src, uint64_t srcOffset, - uint64_t size) - { + uint64_t size) { epochIncrement(); fifo_.push(ChannelTrigger(TriggerData | TriggerFlag, dst, dstOffset, src, srcOffset, size, channelId_).value); } - __forceinline__ __device__ void putWithSignal(MemoryId dst, MemoryId src, uint64_t offset, uint64_t size) - { + __forceinline__ __device__ void putWithSignal(MemoryId dst, MemoryId src, uint64_t offset, uint64_t size) { putWithSignal(dst, offset, src, offset, size); } __forceinline__ __device__ void putWithSignalAndFlush(MemoryId dst, uint64_t dstOffset, MemoryId src, - uint64_t srcOffset, uint64_t size) - { + uint64_t srcOffset, uint64_t size) { epochIncrement(); uint64_t curFifoHead = fifo_.push( - ChannelTrigger(TriggerData | TriggerFlag | TriggerSync, dst, dstOffset, src, srcOffset, size, channelId_).value); + ChannelTrigger(TriggerData | TriggerFlag | TriggerSync, dst, dstOffset, src, srcOffset, size, channelId_) + .value); while (*(volatile uint64_t*)&fifo_.triggers[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && *(volatile uint64_t*)fifo_.tailReplica <= curFifoHead) ; } - __forceinline__ __device__ void putWithSignalAndFlush(MemoryId dst, MemoryId src, uint64_t offset, uint64_t size) - { + __forceinline__ __device__ void putWithSignalAndFlush(MemoryId dst, MemoryId src, uint64_t offset, uint64_t size) { 
putWithSignalAndFlush(dst, offset, src, offset, size); } - __forceinline__ __device__ void flush() - { + __forceinline__ __device__ void flush() { uint64_t curFifoHead = fifo_.push(ChannelTrigger(TriggerSync, 0, 0, 0, 0, 1, channelId_).value); // we need to wait for two conditions to be met to ensure the CPU is done flushing. (1) wait for the tail // to go pass by curFifoHead (this is safety net) and (2) wait for the work element value to change to 0. @@ -155,16 +133,10 @@ struct DeviceChannel ; } - __forceinline__ __device__ void wait() - { - epoch_.wait(); - } + __forceinline__ __device__ void wait() { epoch_.wait(); } - __forceinline__ __device__ void epochIncrement() - { - epoch_.epochIncrement(); - } -#endif // __CUDACC__ + __forceinline__ __device__ void epochIncrement() { epoch_.epochIncrement(); } +#endif // __CUDACC__ ChannelId channelId_; @@ -179,42 +151,29 @@ class DeviceChannelService; inline ProxyHandler makeChannelProxyHandler(DeviceChannelService& channelService); -class DeviceChannelService -{ -public: +class DeviceChannelService { + public: DeviceChannelService(Communicator& communicator); - ChannelId addChannel(std::shared_ptr connection) - { + ChannelId addChannel(std::shared_ptr connection) { channels_.push_back(Channel(communicator_, connection)); return channels_.size() - 1; } - MemoryId addMemory(RegisteredMemory memory) - { + MemoryId addMemory(RegisteredMemory memory) { memories_.push_back(memory); return memories_.size() - 1; } - Channel channel(ChannelId id) - { - return channels_[id]; - } - DeviceChannel deviceChannel(ChannelId id) - { + Channel channel(ChannelId id) { return channels_[id]; } + DeviceChannel deviceChannel(ChannelId id) { return DeviceChannel(id, channels_[id].epoch().deviceHandle(), proxy_.fifo().deviceFifo()); } - void startProxy() - { - proxy_.start(); - } - void stopProxy() - { - proxy_.stop(); - } + void startProxy() { proxy_.start(); } + void stopProxy() { proxy_.stop(); } -private: + private: Communicator& 
communicator_; std::vector channels_; std::vector memories_; @@ -223,8 +182,7 @@ private: void bindThread(); - ProxyHandlerResult handleTrigger(ProxyTrigger triggerRaw) - { + ProxyHandlerResult handleTrigger(ProxyTrigger triggerRaw) { ChannelTrigger* trigger = reinterpret_cast(&triggerRaw); Channel& channel = channels_[trigger->fields.chanId]; @@ -249,13 +207,10 @@ private: } }; -struct SimpleDeviceChannel -{ +struct SimpleDeviceChannel { SimpleDeviceChannel() = default; - SimpleDeviceChannel(DeviceChannel devChan, MemoryId dst, MemoryId src) : devChan_(devChan), dst_(dst), src_(src) - { - } + SimpleDeviceChannel(DeviceChannel devChan, MemoryId dst, MemoryId src) : devChan_(devChan), dst_(dst), src_(src) {} SimpleDeviceChannel(const SimpleDeviceChannel& other) = default; @@ -263,64 +218,42 @@ struct SimpleDeviceChannel #ifdef __CUDACC__ - __forceinline__ __device__ void put(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) - { + __forceinline__ __device__ void put(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) { devChan_.put(dst_, dstOffset, src_, srcOffset, size); } - __forceinline__ __device__ void put(uint64_t offset, uint64_t size) - { - put(offset, offset, size); - } + __forceinline__ __device__ void put(uint64_t offset, uint64_t size) { put(offset, offset, size); } - __forceinline__ __device__ void signal() - { - devChan_.signal(); - } + __forceinline__ __device__ void signal() { devChan_.signal(); } - __forceinline__ __device__ void putWithSignal(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) - { + __forceinline__ __device__ void putWithSignal(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) { devChan_.putWithSignal(dst_, dstOffset, src_, srcOffset, size); } - __forceinline__ __device__ void putWithSignal(uint64_t offset, uint64_t size) - { - putWithSignal(offset, offset, size); - } + __forceinline__ __device__ void putWithSignal(uint64_t offset, uint64_t size) { putWithSignal(offset, offset, size); } - __forceinline__ __device__ 
void putWithSignalAndFlush(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) - { + __forceinline__ __device__ void putWithSignalAndFlush(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) { devChan_.putWithSignalAndFlush(dst_, dstOffset, src_, srcOffset, size); } - __forceinline__ __device__ void putWithSignalAndFlush(uint64_t offset, uint64_t size) - { + __forceinline__ __device__ void putWithSignalAndFlush(uint64_t offset, uint64_t size) { putWithSignalAndFlush(offset, offset, size); } - __forceinline__ __device__ void flush() - { - devChan_.flush(); - } + __forceinline__ __device__ void flush() { devChan_.flush(); } - __forceinline__ __device__ void wait() - { - devChan_.wait(); - } + __forceinline__ __device__ void wait() { devChan_.wait(); } - __forceinline__ __device__ void epochIncrement() - { - devChan_.epochIncrement(); - } + __forceinline__ __device__ void epochIncrement() { devChan_.epochIncrement(); } -#endif // __CUDACC__ +#endif // __CUDACC__ DeviceChannel devChan_; MemoryId dst_; MemoryId src_; }; -} // namespace channel -} // namespace mscclpp +} // namespace channel +} // namespace mscclpp -#endif // MSCCLPP_CHANNEL_HPP_ +#endif // MSCCLPP_CHANNEL_HPP_ diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index aeb692e6..b6249bfd 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -6,24 +6,22 @@ #define MSCCLPP_PATCH 0 #define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH) -#include #include #include #include +#include #include #include namespace mscclpp { #define MSCCLPP_UNIQUE_ID_BYTES 128 -struct UniqueId -{ +struct UniqueId { char internal[MSCCLPP_UNIQUE_ID_BYTES]; }; -class BaseBootstrap -{ -public: +class BaseBootstrap { + public: BaseBootstrap(){}; virtual ~BaseBootstrap() = default; virtual int getRank() = 0; @@ -34,14 +32,12 @@ public: virtual void barrier() = 0; // TODO: move implementations of these helpers out of this header - void send(const std::vector& data, int 
peer, int tag) - { + void send(const std::vector& data, int peer, int tag) { size_t size = data.size(); send((void*)&size, sizeof(size_t), peer, tag); send((void*)data.data(), data.size(), peer, tag + 1); } - void recv(std::vector& data, int peer, int tag) - { + void recv(std::vector& data, int peer, int tag) { size_t size; recv((void*)&size, sizeof(size_t), peer, tag); data.resize(size); @@ -49,9 +45,8 @@ public: } }; -class Bootstrap : public BaseBootstrap -{ -public: +class Bootstrap : public BaseBootstrap { + public: Bootstrap(int rank, int nRanks); ~Bootstrap(); @@ -67,7 +62,7 @@ public: void allGather(void* allData, int size) override; void barrier() override; -private: + private: class Impl; std::unique_ptr pimpl_; }; @@ -81,147 +76,78 @@ private: */ std::unique_ptr getUniqueId(); -enum class Transport -{ - Unknown, - CudaIpc, - IB0, - IB1, - IB2, - IB3, - IB4, - IB5, - IB6, - IB7, - NumTransports -}; +enum class Transport { Unknown, CudaIpc, IB0, IB1, IB2, IB3, IB4, IB5, IB6, IB7, NumTransports }; namespace detail { const size_t TransportFlagsSize = 10; static_assert(TransportFlagsSize == static_cast(Transport::NumTransports), "TransportFlagsSize must match the number of transports"); using TransportFlagsBase = std::bitset; -} // namespace detail +} // namespace detail -class TransportFlags : private detail::TransportFlagsBase -{ -public: +class TransportFlags : private detail::TransportFlagsBase { + public: TransportFlags() = default; - TransportFlags(Transport transport) : detail::TransportFlagsBase(1 << static_cast(transport)) - { - } + TransportFlags(Transport transport) : detail::TransportFlagsBase(1 << static_cast(transport)) {} - bool has(Transport transport) const - { - return detail::TransportFlagsBase::test(static_cast(transport)); - } + bool has(Transport transport) const { return detail::TransportFlagsBase::test(static_cast(transport)); } - bool none() const - { - return detail::TransportFlagsBase::none(); - } + bool none() const { return 
detail::TransportFlagsBase::none(); } - bool any() const - { - return detail::TransportFlagsBase::any(); - } + bool any() const { return detail::TransportFlagsBase::any(); } - bool all() const - { - return detail::TransportFlagsBase::all(); - } + bool all() const { return detail::TransportFlagsBase::all(); } - size_t count() const - { - return detail::TransportFlagsBase::count(); - } + size_t count() const { return detail::TransportFlagsBase::count(); } - TransportFlags& operator|=(TransportFlags other) - { + TransportFlags& operator|=(TransportFlags other) { detail::TransportFlagsBase::operator|=(other); return *this; } - TransportFlags operator|(TransportFlags other) const - { - return TransportFlags(*this) |= other; - } + TransportFlags operator|(TransportFlags other) const { return TransportFlags(*this) |= other; } - TransportFlags operator|(Transport transport) const - { - return *this | TransportFlags(transport); - } + TransportFlags operator|(Transport transport) const { return *this | TransportFlags(transport); } - TransportFlags& operator&=(TransportFlags other) - { + TransportFlags& operator&=(TransportFlags other) { detail::TransportFlagsBase::operator&=(other); return *this; } - TransportFlags operator&(TransportFlags other) const - { - return TransportFlags(*this) &= other; - } + TransportFlags operator&(TransportFlags other) const { return TransportFlags(*this) &= other; } - TransportFlags operator&(Transport transport) const - { - return *this & TransportFlags(transport); - } + TransportFlags operator&(Transport transport) const { return *this & TransportFlags(transport); } - TransportFlags& operator^=(TransportFlags other) - { + TransportFlags& operator^=(TransportFlags other) { detail::TransportFlagsBase::operator^=(other); return *this; } - TransportFlags operator^(TransportFlags other) const - { - return TransportFlags(*this) ^= other; - } + TransportFlags operator^(TransportFlags other) const { return TransportFlags(*this) ^= other; } - 
TransportFlags operator^(Transport transport) const - { - return *this ^ TransportFlags(transport); - } + TransportFlags operator^(Transport transport) const { return *this ^ TransportFlags(transport); } - TransportFlags operator~() const - { - return TransportFlags(*this).flip(); - } + TransportFlags operator~() const { return TransportFlags(*this).flip(); } - bool operator==(TransportFlags other) const - { - return detail::TransportFlagsBase::operator==(other); - } + bool operator==(TransportFlags other) const { return detail::TransportFlagsBase::operator==(other); } - bool operator!=(TransportFlags other) const - { - return detail::TransportFlagsBase::operator!=(other); - } + bool operator!=(TransportFlags other) const { return detail::TransportFlagsBase::operator!=(other); } - detail::TransportFlagsBase toBitset() const - { - return *this; - } + detail::TransportFlagsBase toBitset() const { return *this; } -private: - TransportFlags(detail::TransportFlagsBase bitset) : detail::TransportFlagsBase(bitset) - { - } + private: + TransportFlags(detail::TransportFlagsBase bitset) : detail::TransportFlagsBase(bitset) {} }; -inline TransportFlags operator|(Transport transport1, Transport transport2) -{ +inline TransportFlags operator|(Transport transport1, Transport transport2) { return TransportFlags(transport1) | transport2; } -inline TransportFlags operator&(Transport transport1, Transport transport2) -{ +inline TransportFlags operator&(Transport transport1, Transport transport2) { return TransportFlags(transport1) & transport2; } -inline TransportFlags operator^(Transport transport1, Transport transport2) -{ +inline TransportFlags operator^(Transport transport1, Transport transport2) { return TransportFlags(transport1) ^ transport2; } @@ -237,14 +163,13 @@ Transport getIBTransportByDeviceName(const std::string& ibDeviceName); class Communicator; class Connection; -class RegisteredMemory -{ +class RegisteredMemory { struct Impl; // A shared_ptr is used since 
RegisteredMemory is functionally immutable, although internally some state is populated // lazily. std::shared_ptr pimpl; -public: + public: RegisteredMemory() = default; RegisteredMemory(std::shared_ptr pimpl); ~RegisteredMemory(); @@ -261,9 +186,8 @@ public: friend class Communicator; }; -class Connection -{ -public: +class Connection { + public: virtual void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) = 0; @@ -277,47 +201,34 @@ public: virtual Transport remoteTransport() = 0; -protected: + protected: static std::shared_ptr getRegisteredMemoryImpl(RegisteredMemory&); }; -struct Setuppable -{ - virtual void beginSetup(std::shared_ptr) - { - } - virtual void endSetup(std::shared_ptr) - { - } +struct Setuppable { + virtual void beginSetup(std::shared_ptr) {} + virtual void endSetup(std::shared_ptr) {} }; -template class NonblockingFuture -{ +template +class NonblockingFuture { std::shared_future future; -public: + public: NonblockingFuture() = default; - NonblockingFuture(std::shared_future&& future) : future(std::move(future)) - { - } + NonblockingFuture(std::shared_future&& future) : future(std::move(future)) {} NonblockingFuture(const NonblockingFuture&) = default; - bool ready() const - { - return future.wait_for(std::chrono::seconds(0)) == std::future_status::ready; - } + bool ready() const { return future.wait_for(std::chrono::seconds(0)) == std::future_status::ready; } - T get() - { - if (!ready()) - throw Error("NonblockingFuture::get() called before ready", ErrorCode::InvalidUsage); + T get() { + if (!ready()) throw Error("NonblockingFuture::get() called before ready", ErrorCode::InvalidUsage); return future.get(); } }; -class Communicator -{ -public: +class Communicator { + public: /* Initialize the communicator. 
* * Inputs: @@ -368,19 +279,18 @@ public: struct Impl; -private: + private: std::unique_ptr pimpl; }; -} // namespace mscclpp +} // namespace mscclpp namespace std { -template <> struct hash -{ - size_t operator()(const mscclpp::TransportFlags& flags) const - { +template <> +struct hash { + size_t operator()(const mscclpp::TransportFlags& flags) const { return hash()(flags.toBitset()); } }; -} // namespace std +} // namespace std -#endif // MSCCLPP_CORE_HPP_ +#endif // MSCCLPP_CORE_HPP_ diff --git a/include/mscclpp/epoch.hpp b/include/mscclpp/epoch.hpp index cbd3478a..539ad03f 100644 --- a/include/mscclpp/epoch.hpp +++ b/include/mscclpp/epoch.hpp @@ -5,53 +5,45 @@ namespace mscclpp { -struct alignas(16) EpochIds -{ +struct alignas(16) EpochIds { uint64_t outbound; uint64_t inboundReplica; }; -class BaseEpoch -{ -private: +class BaseEpoch { + private: std::shared_ptr connection_; RegisteredMemory localEpochIdsRegMem_; NonblockingFuture remoteEpochIdsRegMem_; -protected: + protected: EpochIds* epochIds_; uint64_t* expectedInboundEpochId_; -public: + public: BaseEpoch(std::shared_ptr connection); void setup(Communicator& communicator); BaseEpoch(const BaseEpoch&) = delete; void signal(); }; -class DeviceEpoch : BaseEpoch -{ -public: +class DeviceEpoch : BaseEpoch { + public: DeviceEpoch(Communicator& communicator, std::shared_ptr connection); DeviceEpoch(const DeviceEpoch&) = delete; ~DeviceEpoch(); void signal(); - struct DeviceHandle - { + struct DeviceHandle { #ifdef __CUDACC__ - __forceinline__ __device__ void wait() - { + __forceinline__ __device__ void wait() { (*expectedInboundEpochId) += 1; while (*(volatile uint64_t*)&(epochIds->inboundReplica) < (*expectedInboundEpochId)) ; } - __forceinline__ __device__ void epochIncrement() - { - *(volatile uint64_t*)&(epochIds->outbound) += 1; - } -#endif // __CUDACC__ + __forceinline__ __device__ void epochIncrement() { *(volatile uint64_t*)&(epochIds->outbound) += 1; } +#endif // __CUDACC__ EpochIds* epochIds; uint64_t* 
expectedInboundEpochId; @@ -60,9 +52,8 @@ public: DeviceHandle deviceHandle(); }; -class HostEpoch : BaseEpoch -{ -public: +class HostEpoch : BaseEpoch { + public: HostEpoch(Communicator& communicator, std::shared_ptr connection); HostEpoch(const HostEpoch&) = delete; ~HostEpoch(); @@ -71,6 +62,6 @@ public: void wait(); }; -} // namespace mscclpp +} // namespace mscclpp -#endif // MSCCLPP_EPOCH_HPP_ +#endif // MSCCLPP_EPOCH_HPP_ diff --git a/include/mscclpp/errors.hpp b/include/mscclpp/errors.hpp index eb18f98f..3497f783 100644 --- a/include/mscclpp/errors.hpp +++ b/include/mscclpp/errors.hpp @@ -5,52 +5,46 @@ namespace mscclpp { -enum class ErrorCode -{ +enum class ErrorCode { SystemError, InternalError, InvalidUsage, }; -class BaseError : public std::runtime_error -{ -public: +class BaseError : public std::runtime_error { + public: BaseError(std::string message, int errorCode); virtual ~BaseError() = default; int getErrorCode() const; -private: + private: int errorCode_; }; -class Error : public BaseError -{ -public: +class Error : public BaseError { + public: Error(std::string message, ErrorCode errorCode); virtual ~Error() = default; }; -class CudaError : public BaseError -{ -public: +class CudaError : public BaseError { + public: CudaError(std::string message, int errorCode); virtual ~CudaError() = default; }; -class CuError : public BaseError -{ -public: +class CuError : public BaseError { + public: CuError(std::string message, int errorCode); virtual ~CuError() = default; }; -class IbError : public BaseError -{ -public: +class IbError : public BaseError { + public: IbError(std::string message, int errorCode); virtual ~IbError() = default; }; -}; // namespace mscclpp +}; // namespace mscclpp -#endif // MSCCLPP_ERRORS_HPP_ +#endif // MSCCLPP_ERRORS_HPP_ diff --git a/include/mscclpp/fifo.hpp b/include/mscclpp/fifo.hpp index e3172dca..aff86f8f 100644 --- a/include/mscclpp/fifo.hpp +++ b/include/mscclpp/fifo.hpp @@ -1,9 +1,10 @@ #ifndef MSCCLPP_FIFO_HPP_ #define 
MSCCLPP_FIFO_HPP_ +#include + #include #include -#include namespace mscclpp { @@ -12,8 +13,7 @@ namespace mscclpp { #define MSCCLPP_PROXY_FIFO_SIZE 128 #define MSCCLPP_PROXY_FIFO_FLUSH_COUNTER 4 -struct alignas(16) ProxyTrigger -{ +struct alignas(16) ProxyTrigger { uint64_t fst, snd; }; @@ -30,11 +30,9 @@ struct alignas(16) ProxyTrigger * Why duplicating the tail is a good idea? The fifo is large engouh and we do not need frequent updates * for the tail as there is usually enough space for device threads to push their work into. */ -struct DeviceProxyFifo -{ +struct DeviceProxyFifo { #ifdef __CUDACC__ - __forceinline__ __device__ uint64_t push(ProxyTrigger trigger) - { + __forceinline__ __device__ uint64_t push(ProxyTrigger trigger) { uint64_t curFifoHead = atomicAdd((unsigned long long int*)this->head, 1); while (curFifoHead >= MSCCLPP_PROXY_FIFO_SIZE + *((volatile uint64_t*)this->tailReplica)) ; @@ -44,17 +42,16 @@ struct DeviceProxyFifo asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd)); return curFifoHead; } -#endif // __CUDACC__ +#endif // __CUDACC__ - ProxyTrigger* triggers; // Allocate on host via cudaHostAlloc. This space is used for pushing the workelements - uint64_t* tailReplica; // Allocated on device. proxyState->fifoTailHost is the true tail on host and pused - // occasionally to device - uint64_t* head; // Allocated on device. Only accessed by device + ProxyTrigger* triggers; // Allocate on host via cudaHostAlloc. This space is used for pushing the workelements + uint64_t* tailReplica; // Allocated on device. proxyState->fifoTailHost is the true tail on host and pused + // occasionally to device + uint64_t* head; // Allocated on device. 
Only accessed by device }; -class HostProxyFifo -{ -public: +class HostProxyFifo { + public: HostProxyFifo(); ~HostProxyFifo(); @@ -67,11 +64,11 @@ public: DeviceProxyFifo deviceFifo(); -private: + private: struct Impl; std::unique_ptr pimpl; }; -} // namespace mscclpp +} // namespace mscclpp -#endif // MSCCLPP_FIFO_HPP_ +#endif // MSCCLPP_FIFO_HPP_ diff --git a/include/mscclpp/proxy.hpp b/include/mscclpp/proxy.hpp index 37decafb..4e89e56b 100644 --- a/include/mscclpp/proxy.hpp +++ b/include/mscclpp/proxy.hpp @@ -1,15 +1,13 @@ #ifndef MSCCLPP_PROXY_HPP_ #define MSCCLPP_PROXY_HPP_ -#include - #include #include +#include namespace mscclpp { -enum class ProxyHandlerResult -{ +enum class ProxyHandlerResult { Continue, FlushFifoTailAndContinue, Stop, @@ -18,9 +16,8 @@ enum class ProxyHandlerResult class Proxy; using ProxyHandler = std::function; -class Proxy -{ -public: +class Proxy { + public: Proxy(ProxyHandler handler, std::function threadInit); Proxy(ProxyHandler handler); ~Proxy(); @@ -30,11 +27,11 @@ public: HostProxyFifo& fifo(); -private: + private: struct Impl; std::unique_ptr pimpl; }; -} // namespace mscclpp +} // namespace mscclpp -#endif // MSCCLPP_PROXY_HPP_ \ No newline at end of file +#endif // MSCCLPP_PROXY_HPP_ \ No newline at end of file diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index d3020030..7efe46ae 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -1,26 +1,25 @@ -#include "api.h" -#include "checks.hpp" -#include -#include "utils.h" -#include "socket.h" +#include +#include #include #include +#include +#include #include #include #include #include -#include -#include -#include +#include "api.h" +#include "checks.hpp" +#include "socket.h" +#include "utils.h" using namespace mscclpp; namespace { -mscclppResult_t setFilesLimit() -{ +mscclppResult_t setFilesLimit() { rlimit filesLimit; SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit"); filesLimit.rlim_cur = filesLimit.rlim_max; @@ -28,40 
+27,32 @@ mscclppResult_t setFilesLimit() return mscclppSuccess; } -} // namespace +} // namespace /* Socket Interface Selection type */ -enum bootstrapInterface_t -{ - findSubnetIf = -1, - dontCareIf = -2 -}; +enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 }; -struct UnexpectedMsg -{ +struct UnexpectedMsg { int peer; int tag; std::shared_ptr sock; }; -struct ExtInfo -{ +struct ExtInfo { int rank; int nRanks; mscclppSocketAddress extAddressListenRoot; mscclppSocketAddress extAddressListen; }; -struct UniqueIdInternal -{ +struct UniqueIdInternal { uint64_t magic; union mscclppSocketAddress addr; }; static_assert(sizeof(UniqueIdInternal) <= sizeof(UniqueId), "UniqueIdInternal is too large to fit into UniqueId"); -class Bootstrap::Impl -{ -public: +class Bootstrap::Impl { + public: Impl(int rank, int nRanks); ~Impl(); void initialize(const UniqueId uniqueId); @@ -77,7 +68,7 @@ public: void barrier(); void close(); -private: + private: UniqueIdInternal uniqueId_; int rank_; int nRanks_; @@ -108,20 +99,20 @@ private: // UniqueId MscclppBootstrap::Impl::uniqueId_; Bootstrap::Impl::Impl(int rank, int nRanks) - : rank_(rank), nRanks_(nRanks), netInitialized(false), peerCommAddresses_(nRanks, mscclppSocketAddress()), - barrierArr_(nRanks, 0), abortFlag_(nullptr) -{ -} + : rank_(rank), + nRanks_(nRanks), + netInitialized(false), + peerCommAddresses_(nRanks, mscclppSocketAddress()), + barrierArr_(nRanks, 0), + abortFlag_(nullptr) {} -UniqueId Bootstrap::Impl::getUniqueId() const -{ +UniqueId Bootstrap::Impl::getUniqueId() const { UniqueId ret; std::memcpy(&ret, &uniqueId_, sizeof(uniqueId_)); return ret; } -UniqueId Bootstrap::Impl::createUniqueId() -{ +UniqueId Bootstrap::Impl::createUniqueId() { netInit(""); MSCCLPPTHROW(getRandomData(&uniqueId_.magic, sizeof(uniqueId_.magic))); std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(mscclppSocketAddress)); @@ -129,18 +120,11 @@ UniqueId Bootstrap::Impl::createUniqueId() return getUniqueId(); } -int 
Bootstrap::Impl::getRank() -{ - return rank_; -} +int Bootstrap::Impl::getRank() { return rank_; } -int Bootstrap::Impl::getNranks() -{ - return nRanks_; -} +int Bootstrap::Impl::getNranks() { return nRanks_; } -void Bootstrap::Impl::initialize(const UniqueId uniqueId) -{ +void Bootstrap::Impl::initialize(const UniqueId uniqueId) { netInit(""); std::memcpy(&uniqueId_, &uniqueId, sizeof(uniqueId_)); @@ -148,8 +132,7 @@ void Bootstrap::Impl::initialize(const UniqueId uniqueId) establishConnections(); } -void Bootstrap::Impl::initialize(std::string ipPortPair) -{ +void Bootstrap::Impl::initialize(std::string ipPortPair) { netInit(ipPortPair); uniqueId_.magic = 0xdeadbeef; @@ -163,16 +146,14 @@ void Bootstrap::Impl::initialize(std::string ipPortPair) establishConnections(); } -Bootstrap::Impl::~Impl() -{ +Bootstrap::Impl::~Impl() { if (rootThread_.joinable()) { rootThread_.join(); } } void Bootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, std::vector& rankAddresses, - std::vector& rankAddressesRoot, int& rank) -{ + std::vector& rankAddressesRoot, int& rank) { mscclppSocket sock; ExtInfo info; @@ -185,13 +166,13 @@ void Bootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, std::vector< if (this->nRanks_ != info.nRanks) { throw mscclpp::Error("Bootstrap Root : mismatch in rank count from procs " + std::to_string(this->nRanks_) + " : " + - std::to_string(info.nRanks), + std::to_string(info.nRanks), ErrorCode::InternalError); } if (std::memcmp(&zero, &rankAddressesRoot[info.rank], sizeof(mscclppSocketAddress)) != 0) { throw mscclpp::Error("Bootstrap Root : rank " + std::to_string(info.rank) + " of " + std::to_string(this->nRanks_) + - " has already checked in", + " has already checked in", ErrorCode::InternalError); } @@ -202,8 +183,7 @@ void Bootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, std::vector< } void Bootstrap::Impl::sendHandleToPeer(int peer, const std::vector& rankAddresses, - const std::vector& rankAddressesRoot) -{ + const 
std::vector& rankAddressesRoot) { mscclppSocket sock; int next = (peer + 1) % this->nRanks_; MSCCLPPTHROW(mscclppSocketInit(&sock, &rankAddressesRoot[peer], this->uniqueId_.magic, mscclppSocketTypeBootstrap)); @@ -212,21 +192,19 @@ void Bootstrap::Impl::sendHandleToPeer(int peer, const std::vectorbootstrapRoot(listenSock); }; rootThread_ = std::thread(lambda); } -void Bootstrap::Impl::bootstrapRoot(mscclppSocket listenSock) -{ +void Bootstrap::Impl::bootstrapRoot(mscclppSocket listenSock) { int numCollected = 0; std::vector rankAddresses(this->nRanks_, mscclppSocketAddress()); // for initial rank <-> root information exchange @@ -254,16 +232,14 @@ void Bootstrap::Impl::bootstrapRoot(mscclppSocket listenSock) TRACE(MSCCLPP_INIT, "DONE"); } -void Bootstrap::Impl::netInit(std::string ipPortPair) -{ - if (netInitialized) - return; +void Bootstrap::Impl::netInit(std::string ipPortPair) { + if (netInitialized) return; if (!ipPortPair.empty()) { mscclppSocketAddress remoteAddr; if (mscclppSocketGetAddrFromString(&remoteAddr, ipPortPair.c_str()) != mscclppSuccess) { throw mscclpp::Error( - "Invalid ipPortPair, please use format: : or []: or :", - ErrorCode::InvalidUsage); + "Invalid ipPortPair, please use format: : or []: or :", + ErrorCode::InvalidUsage); } if (mscclppFindInterfaceMatchSubnet(netIfName_, &netIfAddr_, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { throw mscclpp::Error("NET/Socket : No usable listening interface found", ErrorCode::InternalError); @@ -282,8 +258,7 @@ void Bootstrap::Impl::netInit(std::string ipPortPair) netInitialized = true; } -void Bootstrap::Impl::establishConnections() -{ +void Bootstrap::Impl::establishConnections() { mscclppSocketAddress nextAddr; mscclppSocket sock, listenSockRoot; ExtInfo info; @@ -334,7 +309,7 @@ void Bootstrap::Impl::establishConnections() MSCCLPPTHROW(mscclppSocketClose(&listenSockRoot)); MSCCLPPTHROW( - mscclppSocketInit(&this->ringSendSocket_, &nextAddr, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); + 
mscclppSocketInit(&this->ringSendSocket_, &nextAddr, magic, mscclppSocketTypeBootstrap, this->abortFlag_)); MSCCLPPTHROW(mscclppSocketConnect(&this->ringSendSocket_)); // Accept the connect request from the previous rank in the AllGather ring MSCCLPPTHROW(mscclppSocketInit(&this->ringRecvSocket_)); @@ -347,8 +322,7 @@ void Bootstrap::Impl::establishConnections() TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank_, nRanks_); } -void Bootstrap::Impl::allGather(void* allData, int size) -{ +void Bootstrap::Impl::allGather(void* allData, int size) { char* data = static_cast(allData); int rank = this->rank_; int nRanks = this->nRanks_; @@ -372,26 +346,23 @@ void Bootstrap::Impl::allGather(void* allData, int size) TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nRanks, size); } -void Bootstrap::Impl::netSend(mscclppSocket* sock, const void* data, int size) -{ +void Bootstrap::Impl::netSend(mscclppSocket* sock, const void* data, int size) { MSCCLPPTHROW(mscclppSocketSend(sock, &size, sizeof(int))); MSCCLPPTHROW(mscclppSocketSend(sock, const_cast(data), size)); } -void Bootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) -{ +void Bootstrap::Impl::netRecv(mscclppSocket* sock, void* data, int size) { int recvSize; MSCCLPPTHROW(mscclppSocketRecv(sock, &recvSize, sizeof(int))); if (recvSize > size) { - throw mscclpp::Error("Message truncated : received " + std::to_string(recvSize) + " bytes instead of " + - std::to_string(size), - ErrorCode::InvalidUsage); + throw mscclpp::Error( + "Message truncated : received " + std::to_string(recvSize) + " bytes instead of " + std::to_string(size), + ErrorCode::InvalidUsage); } MSCCLPPTHROW(mscclppSocketRecv(sock, data, std::min(recvSize, size))); } -void Bootstrap::Impl::send(void* data, int size, int peer, int tag) -{ +void Bootstrap::Impl::send(void* data, int size, int peer, int tag) { mscclppSocket sock; MSCCLPPTHROW(mscclppSocketInit(&sock, &this->peerCommAddresses_[peer], this->uniqueId_.magic, 
mscclppSocketTypeBootstrap, this->abortFlag_)); @@ -403,8 +374,7 @@ void Bootstrap::Impl::send(void* data, int size, int peer, int tag) MSCCLPPTHROW(mscclppSocketClose(&sock)); } -void Bootstrap::Impl::recv(void* data, int size, int peer, int tag) -{ +void Bootstrap::Impl::recv(void* data, int size, int peer, int tag) { // search over all unexpected messages auto lambda = [peer, tag](const UnexpectedMsg& msg) { return msg.peer == peer && msg.tag == tag; }; auto it = std::find_if(unexpectedMessages_.begin(), unexpectedMessages_.end(), lambda); @@ -433,75 +403,37 @@ void Bootstrap::Impl::recv(void* data, int size, int peer, int tag) } } -void Bootstrap::Impl::barrier() -{ - allGather(barrierArr_.data(), sizeof(int)); -} +void Bootstrap::Impl::barrier() { allGather(barrierArr_.data(), sizeof(int)); } -void Bootstrap::Impl::close() -{ +void Bootstrap::Impl::close() { MSCCLPPTHROW(mscclppSocketClose(&this->listenSock_)); MSCCLPPTHROW(mscclppSocketClose(&this->ringSendSocket_)); MSCCLPPTHROW(mscclppSocketClose(&this->ringRecvSocket_)); } -MSCCLPP_API_CPP Bootstrap::Bootstrap(int rank, int nRanks) -{ +MSCCLPP_API_CPP Bootstrap::Bootstrap(int rank, int nRanks) { // pimpl_ = std::make_unique(ipPortPair, rank, nRanks, uniqueId); pimpl_ = std::make_unique(rank, nRanks); } -MSCCLPP_API_CPP UniqueId Bootstrap::createUniqueId() -{ - return pimpl_->createUniqueId(); -} +MSCCLPP_API_CPP UniqueId Bootstrap::createUniqueId() { return pimpl_->createUniqueId(); } -MSCCLPP_API_CPP UniqueId Bootstrap::getUniqueId() const -{ - return pimpl_->getUniqueId(); -} +MSCCLPP_API_CPP UniqueId Bootstrap::getUniqueId() const { return pimpl_->getUniqueId(); } -MSCCLPP_API_CPP int Bootstrap::getRank() -{ - return pimpl_->getRank(); -} +MSCCLPP_API_CPP int Bootstrap::getRank() { return pimpl_->getRank(); } -MSCCLPP_API_CPP int Bootstrap::getNranks() -{ - return pimpl_->getNranks(); -} +MSCCLPP_API_CPP int Bootstrap::getNranks() { return pimpl_->getNranks(); } -MSCCLPP_API_CPP void 
Bootstrap::send(void* data, int size, int peer, int tag) -{ - pimpl_->send(data, size, peer, tag); -} +MSCCLPP_API_CPP void Bootstrap::send(void* data, int size, int peer, int tag) { pimpl_->send(data, size, peer, tag); } -MSCCLPP_API_CPP void Bootstrap::recv(void* data, int size, int peer, int tag) -{ - pimpl_->recv(data, size, peer, tag); -} +MSCCLPP_API_CPP void Bootstrap::recv(void* data, int size, int peer, int tag) { pimpl_->recv(data, size, peer, tag); } -MSCCLPP_API_CPP void Bootstrap::allGather(void* allData, int size) -{ - pimpl_->allGather(allData, size); -} +MSCCLPP_API_CPP void Bootstrap::allGather(void* allData, int size) { pimpl_->allGather(allData, size); } -MSCCLPP_API_CPP void Bootstrap::initialize(UniqueId uniqueId) -{ - pimpl_->initialize(uniqueId); -} +MSCCLPP_API_CPP void Bootstrap::initialize(UniqueId uniqueId) { pimpl_->initialize(uniqueId); } -MSCCLPP_API_CPP void Bootstrap::initialize(std::string ipPortPair) -{ - pimpl_->initialize(ipPortPair); -} +MSCCLPP_API_CPP void Bootstrap::initialize(std::string ipPortPair) { pimpl_->initialize(ipPortPair); } -MSCCLPP_API_CPP void Bootstrap::barrier() -{ - pimpl_->barrier(); -} +MSCCLPP_API_CPP void Bootstrap::barrier() { pimpl_->barrier(); } -MSCCLPP_API_CPP Bootstrap::~Bootstrap() -{ - pimpl_->close(); -} +MSCCLPP_API_CPP Bootstrap::~Bootstrap() { pimpl_->close(); } diff --git a/src/bootstrap/socket.cc b/src/bootstrap/socket.cc index 4241390d..b60815b6 100644 --- a/src/bootstrap/socket.cc +++ b/src/bootstrap/socket.cc @@ -5,25 +5,23 @@ ************************************************************************/ #include "socket.h" -#include "config.h" -#include "utils.h" - -#include #include #include +#include #include +#include "config.h" +#include "utils.h" + static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset, - int block, int* closed) -{ + int block, int* closed) { int bytes = 0; *closed = 0; char* data = (char*)ptr; char 
line[SOCKET_NAME_MAXLEN + 1]; do { - if (op == MSCCLPP_SOCKET_RECV) - bytes = recv(sock->fd, data + (*offset), size - (*offset), block ? 0 : MSG_DONTWAIT); + if (op == MSCCLPP_SOCKET_RECV) bytes = recv(sock->fd, data + (*offset), size - (*offset), block ? 0 : MSG_DONTWAIT); if (op == MSCCLPP_SOCKET_SEND) bytes = send(sock->fd, data + (*offset), size - (*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL); if (op == MSCCLPP_SOCKET_RECV && bytes == 0) { @@ -48,8 +46,7 @@ static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, voi return mscclppSuccess; } -static mscclppResult_t socketProgress(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset) -{ +static mscclppResult_t socketProgress(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset) { int closed; MSCCLPPCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0, &closed)); if (closed) { @@ -60,10 +57,8 @@ static mscclppResult_t socketProgress(int op, struct mscclppSocket* sock, void* return mscclppSuccess; } -static mscclppResult_t socketWait(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset) -{ - while (*offset < size) - MSCCLPPCHECK(socketProgress(op, sock, ptr, size, offset)); +static mscclppResult_t socketWait(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset) { + while (*offset < size) MSCCLPPCHECK(socketProgress(op, sock, ptr, size, offset)); return mscclppSuccess; } @@ -71,10 +66,8 @@ static mscclppResult_t socketWait(int op, struct mscclppSocket* sock, void* ptr, * * Output: "IPv4/IPv6 address" */ -const char* mscclppSocketToString(union mscclppSocketAddress* addr, char* buf, const int numericHostForm /*= 1*/) -{ - if (buf == NULL || addr == NULL) - return NULL; +const char* mscclppSocketToString(union mscclppSocketAddress* addr, char* buf, const int numericHostForm /*= 1*/) { + if (buf == NULL || addr == NULL) return NULL; struct sockaddr* saddr = &addr->sa; if (saddr->sa_family != AF_INET && 
saddr->sa_family != AF_INET6) { buf[0] = '\0'; @@ -90,68 +83,58 @@ const char* mscclppSocketToString(union mscclppSocketAddress* addr, char* buf, c return buf; } -static uint16_t socketToPort(union mscclppSocketAddress* addr) -{ +static uint16_t socketToPort(union mscclppSocketAddress* addr) { struct sockaddr* saddr = &addr->sa; return ntohs(saddr->sa_family == AF_INET ? addr->sin.sin_port : addr->sin6.sin6_port); } /* Allow the user to force the IPv4/IPv6 interface selection */ -static int envSocketFamily(void) -{ - int family = -1; // Family selection is not forced, will use first one found +static int envSocketFamily(void) { + int family = -1; // Family selection is not forced, will use first one found char* env = getenv("MSCCLPP_SOCKET_FAMILY"); - if (env == NULL) - return family; + if (env == NULL) return family; INFO(MSCCLPP_ENV, "MSCCLPP_SOCKET_FAMILY set by environment to %s", env); if (strcmp(env, "AF_INET") == 0) - family = AF_INET; // IPv4 + family = AF_INET; // IPv4 else if (strcmp(env, "AF_INET6") == 0) - family = AF_INET6; // IPv6 + family = AF_INET6; // IPv6 return family; } static int findInterfaces(const char* prefixList, char* names, union mscclppSocketAddress* addrs, int sock_family, - int maxIfNameSize, int maxIfs) -{ + int maxIfNameSize, int maxIfs) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN + 1]; #endif struct netIf userIfs[MAX_IFS]; bool searchNot = prefixList && prefixList[0] == '^'; - if (searchNot) - prefixList++; + if (searchNot) prefixList++; bool searchExact = prefixList && prefixList[0] == '='; - if (searchExact) - prefixList++; + if (searchExact) prefixList++; int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); int found = 0; struct ifaddrs *interfaces, *interface; getifaddrs(&interfaces); for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) { - if (interface->ifa_addr == NULL) - continue; + if (interface->ifa_addr == NULL) continue; /* We only support IPv4 & IPv6 */ int family = 
interface->ifa_addr->sa_family; - if (family != AF_INET && family != AF_INET6) - continue; + if (family != AF_INET && family != AF_INET6) continue; TRACE(MSCCLPP_INIT | MSCCLPP_NET, "Found interface %s:%s", interface->ifa_name, mscclppSocketToString((union mscclppSocketAddress*)interface->ifa_addr, line)); /* Allow the caller to force the socket family type */ - if (sock_family != -1 && family != sock_family) - continue; + if (sock_family != -1 && family != sock_family) continue; /* We also need to skip IPv6 loopback interfaces */ if (family == AF_INET6) { struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr); - if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) - continue; + if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue; } // check against user specified interfaces @@ -183,8 +166,7 @@ static int findInterfaces(const char* prefixList, char* names, union mscclppSock return found; } -static bool matchSubnet(struct ifaddrs local_if, union mscclppSocketAddress* remote) -{ +static bool matchSubnet(struct ifaddrs local_if, union mscclppSocketAddress* remote) { /* Check family first */ int family = local_if.ifa_addr->sa_family; if (family != remote->sa.sa_family) { @@ -207,8 +189,8 @@ static bool matchSubnet(struct ifaddrs local_if, union mscclppSocketAddress* rem struct in6_addr& mask_in6 = mask->sin6_addr; struct in6_addr& remote_in6 = remote_addr.sin6_addr; bool same = true; - int len = 16; // IPv6 address is 16 unsigned char - for (int c = 0; c < len; c++) { // Network byte order is big-endian + int len = 16; // IPv6 address is 16 unsigned char + for (int c = 0; c < len; c++) { // Network byte order is big-endian char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c]; char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c]; if (c1 ^ c2) { @@ -228,8 +210,7 @@ static bool matchSubnet(struct ifaddrs local_if, union mscclppSocketAddress* rem } int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* localAddrs, - union mscclppSocketAddress* 
remoteAddr, int ifNameMaxSize, int maxIfs) -{ + union mscclppSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN + 1]; #endif @@ -238,13 +219,11 @@ int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* l struct ifaddrs *interfaces, *interface; getifaddrs(&interfaces); for (interface = interfaces; interface && !found; interface = interface->ifa_next) { - if (interface->ifa_addr == NULL) - continue; + if (interface->ifa_addr == NULL) continue; /* We only support IPv4 & IPv6 */ int family = interface->ifa_addr->sa_family; - if (family != AF_INET && family != AF_INET6) - continue; + if (family != AF_INET && family != AF_INET6) continue; // check against user specified interfaces if (!matchSubnet(*interface, remoteAddr)) { @@ -262,8 +241,7 @@ int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* l interface->ifa_name, mscclppSocketToString(localAddrs + found, line), mscclppSocketToString(remoteAddr, line_a)); found++; - if (found == maxIfs) - break; + if (found == maxIfs) break; } if (found == 0) { @@ -273,8 +251,7 @@ int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* l return found; } -mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, const char* ip_port_pair) -{ +mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, const char* ip_port_pair) { if (!(ip_port_pair && strlen(ip_port_pair) > 1)) { WARN("Net : string is null"); return mscclppInvalidArgument; @@ -305,36 +282,34 @@ mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, c if (p->ai_family == AF_INET) { struct sockaddr_in& sin = ua->sin; memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in)); - sin.sin_family = AF_INET; // IPv4 + sin.sin_family = AF_INET; // IPv4 // inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address - sin.sin_port = htons(ni.port); // port + sin.sin_port = htons(ni.port); // 
port } else if (p->ai_family == AF_INET6) { struct sockaddr_in6& sin6 = ua->sin6; memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6)); - sin6.sin6_family = AF_INET6; // IPv6 - sin6.sin6_port = htons(ni.port); // port - sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete - sin6.sin6_scope_id = 0; // should be global scope, set to 0 + sin6.sin6_family = AF_INET6; // IPv6 + sin6.sin6_port = htons(ni.port); // port + sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete + sin6.sin6_scope_id = 0; // should be global scope, set to 0 } else { WARN("Net : unsupported IP family"); return mscclppInvalidArgument; } - freeaddrinfo(p); // all done with this structure + freeaddrinfo(p); // all done with this structure } else { int i, j = -1, len = strlen(ip_port_pair); for (i = 1; i < len; i++) { - if (ip_port_pair[i] == '%') - j = i; - if (ip_port_pair[i] == ']') - break; + if (ip_port_pair[i] == '%') j = i; + if (ip_port_pair[i] == ']') break; } if (i == len) { WARN("Net : No valid [IPv6]:port pair found"); return mscclppInvalidArgument; } - bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope + bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ]; memset(ip_str, '\0', sizeof(ip_str)); @@ -343,21 +318,19 @@ mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, c strncpy(ip_str, ip_port_pair + 1, global_scope ? 
i - 1 : j - 1); strncpy(port_str, ip_port_pair + i + 2, len - i - 1); int port = atoi(port_str); - if (!global_scope) - strncpy(if_name, ip_port_pair + j + 1, i - j - 1); // If not global scope, we need the intf name + if (!global_scope) strncpy(if_name, ip_port_pair + j + 1, i - j - 1); // If not global scope, we need the intf name struct sockaddr_in6& sin6 = ua->sin6; - sin6.sin6_family = AF_INET6; // IPv6 - inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address - sin6.sin6_port = htons(port); // port - sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete - sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope + sin6.sin6_family = AF_INET6; // IPv6 + inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address + sin6.sin6_port = htons(port); // port + sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete + sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope } return mscclppSuccess; } -int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress* ifAddrs, int ifNameMaxSize, int maxIfs) -{ +int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress* ifAddrs, int ifNameMaxSize, int maxIfs) { static int shownIfName = 0; int nIfs = 0; // Allow user to force the INET socket family selection @@ -367,8 +340,7 @@ int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress* ifAddrs, in if (env && strlen(env) > 1) { INFO(MSCCLPP_ENV, "MSCCLPP_SOCKET_IFNAME set by environment to %s", env); // Specified by user : find or fail - if (shownIfName++ == 0) - INFO(MSCCLPP_NET, "MSCCLPP_SOCKET_IFNAME set to %s", env); + if (shownIfName++ == 0) INFO(MSCCLPP_NET, "MSCCLPP_SOCKET_IFNAME set to %s", env); nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); } else { // Try to automatically pick the right one @@ -386,19 +358,15 @@ int mscclppFindInterfaces(char* ifNames, union 
mscclppSocketAddress* ifAddrs, in } } // Then look for anything else (but not docker or lo) - if (nIfs == 0) - nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); // Finally look for docker, then lo. - if (nIfs == 0) - nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); - if (nIfs == 0) - nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); } return nIfs; } -mscclppResult_t mscclppSocketListen(struct mscclppSocket* sock) -{ +mscclppResult_t mscclppSocketListen(struct mscclppSocket* sock) { if (sock == NULL) { WARN("mscclppSocketListen: pass NULL socket"); return mscclppInvalidArgument; @@ -438,20 +406,17 @@ mscclppResult_t mscclppSocketListen(struct mscclppSocket* sock) return mscclppSuccess; } -mscclppResult_t mscclppSocketGetAddr(struct mscclppSocket* sock, union mscclppSocketAddress* addr) -{ +mscclppResult_t mscclppSocketGetAddr(struct mscclppSocket* sock, union mscclppSocketAddress* addr) { if (sock == NULL) { WARN("mscclppSocketGetAddr: pass NULL socket"); return mscclppInvalidArgument; } - if (sock->state != mscclppSocketStateReady) - return mscclppInternalError; + if (sock->state != mscclppSocketStateReady) return mscclppInternalError; memcpy(addr, &sock->addr, sizeof(union mscclppSocketAddress)); return mscclppSuccess; } -static mscclppResult_t socketTryAccept(struct mscclppSocket* sock) -{ +static mscclppResult_t socketTryAccept(struct mscclppSocket* sock) { static bool timeInitialized = false; static mscclppTime_t initTime; if (!timeInitialized) { @@ -482,14 +447,12 @@ static mscclppResult_t socketTryAccept(struct mscclppSocket* sock) return 
mscclppSuccess; } -static mscclppResult_t socketFinalizeAccept(struct mscclppSocket* sock) -{ +static mscclppResult_t socketFinalizeAccept(struct mscclppSocket* sock) { uint64_t magic; enum mscclppSocketType type; int received = 0; MSCCLPPCHECK(mscclppSocketProgress(MSCCLPP_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); - if (received == 0) - return mscclppSuccess; + if (received == 0) return mscclppSuccess; MSCCLPPCHECK(socketWait(MSCCLPP_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); if (magic != sock->magic) { WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic); @@ -514,8 +477,7 @@ static mscclppResult_t socketFinalizeAccept(struct mscclppSocket* sock) return mscclppSuccess; } -static mscclppResult_t socketStartConnect(struct mscclppSocket* sock) -{ +static mscclppResult_t socketStartConnect(struct mscclppSocket* sock) { static bool timeInitialized = false; static mscclppTime_t initTime; if (!timeInitialized) { @@ -543,8 +505,7 @@ static mscclppResult_t socketStartConnect(struct mscclppSocket* sock) return mscclppRemoteError; } usleep(SLEEP_INT); - if (++sock->connectRetries % 1000 == 0) - INFO(MSCCLPP_ALL, "Call to connect returned %s, retrying", strerror(errno)); + if (++sock->connectRetries % 1000 == 0) INFO(MSCCLPP_ALL, "Call to connect returned %s, retrying", strerror(errno)); return mscclppSuccess; } else { char line[SOCKET_NAME_MAXLEN + 1]; @@ -555,8 +516,7 @@ static mscclppResult_t socketStartConnect(struct mscclppSocket* sock) } } -static mscclppResult_t socketPollConnect(struct mscclppSocket* sock) -{ +static mscclppResult_t socketPollConnect(struct mscclppSocket* sock) { static bool timeInitialized = false; static mscclppTime_t initTime; if (!timeInitialized) { @@ -608,8 +568,7 @@ static mscclppResult_t socketPollConnect(struct mscclppSocket* sock) return mscclppSuccess; } -mscclppResult_t mscclppSocketPollConnect(struct mscclppSocket* sock) -{ +mscclppResult_t mscclppSocketPollConnect(struct mscclppSocket* sock) { 
if (sock == NULL) { WARN("mscclppSocketPollConnect: pass NULL socket"); return mscclppInvalidArgument; @@ -618,12 +577,10 @@ mscclppResult_t mscclppSocketPollConnect(struct mscclppSocket* sock) return mscclppSuccess; } -static mscclppResult_t socketFinalizeConnect(struct mscclppSocket* sock) -{ +static mscclppResult_t socketFinalizeConnect(struct mscclppSocket* sock) { int sent = 0; MSCCLPPCHECK(socketProgress(MSCCLPP_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent)); - if (sent == 0) - return mscclppSuccess; + if (sent == 0) return mscclppSuccess; MSCCLPPCHECK(socketWait(MSCCLPP_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent)); sent = 0; MSCCLPPCHECK(socketWait(MSCCLPP_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent)); @@ -631,8 +588,7 @@ static mscclppResult_t socketFinalizeConnect(struct mscclppSocket* sock) return mscclppSuccess; } -static mscclppResult_t socketProgressState(struct mscclppSocket* sock) -{ +static mscclppResult_t socketProgressState(struct mscclppSocket* sock) { if (sock->state == mscclppSocketStateAccepting) { MSCCLPPCHECK(socketTryAccept(sock)); } @@ -668,8 +624,7 @@ static mscclppResult_t socketProgressState(struct mscclppSocket* sock) // return mscclppSuccess; // } -mscclppResult_t mscclppSocketConnect(struct mscclppSocket* sock) -{ +mscclppResult_t mscclppSocketConnect(struct mscclppSocket* sock) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN + 1]; #endif @@ -686,8 +641,7 @@ mscclppResult_t mscclppSocketConnect(struct mscclppSocket* sock) if (sock->state != mscclppSocketStateInitialized) { WARN("mscclppSocketConnect: wrong socket state %d", sock->state); - if (sock->state == mscclppSocketStateError) - return mscclppRemoteError; + if (sock->state == mscclppSocketStateError) return mscclppRemoteError; return mscclppInternalError; } TRACE(MSCCLPP_INIT | MSCCLPP_NET, "Connecting to socket %s", mscclppSocketToString(&sock->addr, line)); @@ -701,25 +655,23 @@ mscclppResult_t mscclppSocketConnect(struct 
mscclppSocket* sock) (sock->state == mscclppSocketStateConnecting || sock->state == mscclppSocketStateConnectPolling || sock->state == mscclppSocketStateConnected)); - if (sock->abortFlag && *sock->abortFlag != 0) - return mscclppInternalError; + if (sock->abortFlag && *sock->abortFlag != 0) return mscclppInternalError; switch (sock->state) { - case mscclppSocketStateConnecting: - case mscclppSocketStateConnectPolling: - case mscclppSocketStateConnected: - case mscclppSocketStateReady: - return mscclppSuccess; - case mscclppSocketStateError: - return mscclppSystemError; - default: - WARN("mscclppSocketConnect: wrong socket state %d", sock->state); - return mscclppInternalError; + case mscclppSocketStateConnecting: + case mscclppSocketStateConnectPolling: + case mscclppSocketStateConnected: + case mscclppSocketStateReady: + return mscclppSuccess; + case mscclppSocketStateError: + return mscclppSystemError; + default: + WARN("mscclppSocketConnect: wrong socket state %d", sock->state); + return mscclppInternalError; } } -mscclppResult_t mscclppSocketAccept(struct mscclppSocket* sock, struct mscclppSocket* listenSock) -{ +mscclppResult_t mscclppSocketAccept(struct mscclppSocket* sock, struct mscclppSocket* listenSock) { mscclppResult_t ret = mscclppSuccess; if (listenSock == NULL || sock == NULL) { @@ -747,22 +699,21 @@ mscclppResult_t mscclppSocketAccept(struct mscclppSocket* sock, struct mscclppSo } while (sock->asyncFlag == 0 && (sock->abortFlag == NULL || *sock->abortFlag == 0) && (sock->state == mscclppSocketStateAccepting || sock->state == mscclppSocketStateAccepted)); - if (sock->abortFlag && *sock->abortFlag != 0) - return mscclppInternalError; + if (sock->abortFlag && *sock->abortFlag != 0) return mscclppInternalError; switch (sock->state) { - case mscclppSocketStateAccepting: - case mscclppSocketStateAccepted: - case mscclppSocketStateReady: - ret = mscclppSuccess; - break; - case mscclppSocketStateError: - ret = mscclppSystemError; - break; - default: - 
WARN("mscclppSocketAccept: wrong socket state %d", sock->state); - ret = mscclppInternalError; - break; + case mscclppSocketStateAccepting: + case mscclppSocketStateAccepted: + case mscclppSocketStateReady: + ret = mscclppSuccess; + break; + case mscclppSocketStateError: + ret = mscclppSystemError; + break; + default: + WARN("mscclppSocketAccept: wrong socket state %d", sock->state); + ret = mscclppInternalError; + break; } exit: @@ -770,12 +721,10 @@ exit: } mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, const mscclppSocketAddress* addr, uint64_t magic, - enum mscclppSocketType type, volatile uint32_t* abortFlag, int asyncFlag) -{ + enum mscclppSocketType type, volatile uint32_t* abortFlag, int asyncFlag) { mscclppResult_t ret = mscclppSuccess; - if (sock == NULL) - goto exit; + if (sock == NULL) goto exit; sock->connectRetries = 0; sock->acceptRetries = 0; sock->abortFlag = abortFlag; @@ -824,8 +773,7 @@ fail: goto exit; } -mscclppResult_t mscclppSocketProgress(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset) -{ +mscclppResult_t mscclppSocketProgress(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset) { if (sock == NULL) { WARN("mscclppSocketProgress: pass NULL socket"); return mscclppInvalidArgument; @@ -843,8 +791,7 @@ mscclppResult_t mscclppSocketProgress(int op, struct mscclppSocket* sock, void* // return mscclppSuccess; // } -mscclppResult_t mscclppSocketSend(struct mscclppSocket* sock, void* ptr, int size) -{ +mscclppResult_t mscclppSocketSend(struct mscclppSocket* sock, void* ptr, int size) { int offset = 0; if (sock == NULL) { WARN("mscclppSocketSend: pass NULL socket"); @@ -858,8 +805,7 @@ mscclppResult_t mscclppSocketSend(struct mscclppSocket* sock, void* ptr, int siz return mscclppSuccess; } -mscclppResult_t mscclppSocketRecv(struct mscclppSocket* sock, void* ptr, int size) -{ +mscclppResult_t mscclppSocketRecv(struct mscclppSocket* sock, void* ptr, int size) { int offset = 0; if (sock == NULL) { 
WARN("mscclppSocketRecv: pass NULL socket"); @@ -888,11 +834,9 @@ mscclppResult_t mscclppSocketRecv(struct mscclppSocket* sock, void* ptr, int siz // return mscclppSuccess; // } -mscclppResult_t mscclppSocketClose(struct mscclppSocket* sock) -{ +mscclppResult_t mscclppSocketClose(struct mscclppSocket* sock) { if (sock != NULL) { - if (sock->fd >= 0) - close(sock->fd); + if (sock->fd >= 0) close(sock->fd); sock->state = mscclppSocketStateClosed; sock->fd = -1; } diff --git a/src/c_style_remnants.cc b/src/c_style_remnants.cc index 613ff7ee..98b6273f 100644 --- a/src/c_style_remnants.cc +++ b/src/c_style_remnants.cc @@ -1,45 +1,39 @@ -#include "mscclpp.h" -#include "debug.h" -#include "config.h" #include "api.h" +#include "config.h" +#include "debug.h" +#include "mscclpp.h" -MSCCLPP_API void mscclppDefaultLogHandler(const char* msg) -{ - mscclppDebugDefaultLogHandler(msg); -} +MSCCLPP_API void mscclppDefaultLogHandler(const char* msg) { mscclppDebugDefaultLogHandler(msg); } -MSCCLPP_API mscclppResult_t mscclppSetLogHandler(mscclppLogHandler_t handler) -{ +MSCCLPP_API mscclppResult_t mscclppSetLogHandler(mscclppLogHandler_t handler) { return mscclppDebugSetLogHandler(handler); } -MSCCLPP_API mscclppResult_t mscclppSetBootstrapConnTimeout(int timeout) -{ +MSCCLPP_API mscclppResult_t mscclppSetBootstrapConnTimeout(int timeout) { mscclppConfig* config = mscclppConfig::getInstance(); config->setBootstrapConnectionTimeoutConfig(timeout); return mscclppSuccess; } -MSCCLPP_API const char* mscclppGetErrorString(mscclppResult_t code) -{ +MSCCLPP_API const char* mscclppGetErrorString(mscclppResult_t code) { switch (code) { - case mscclppSuccess: - return "no error"; - case mscclppUnhandledCudaError: - return "unhandled cuda error"; - case mscclppSystemError: - return "unhandled system error"; - case mscclppInternalError: - return "internal error"; - case mscclppInvalidArgument: - return "invalid argument"; - case mscclppInvalidUsage: - return "invalid usage"; - case 
mscclppRemoteError: - return "remote process exited or there was a network error"; - case mscclppInProgress: - return "MSCCL++ operation in progress"; - default: - return "unknown result code"; + case mscclppSuccess: + return "no error"; + case mscclppUnhandledCudaError: + return "unhandled cuda error"; + case mscclppSystemError: + return "unhandled system error"; + case mscclppInternalError: + return "internal error"; + case mscclppInvalidArgument: + return "invalid argument"; + case mscclppInvalidUsage: + return "invalid usage"; + case mscclppRemoteError: + return "remote process exited or there was a network error"; + case mscclppInProgress: + return "MSCCL++ operation in progress"; + default: + return "unknown result code"; } } diff --git a/src/channel.cc b/src/channel.cc index bf5e6da6..f9564f79 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -1,4 +1,5 @@ #include + #include "api.h" #include "checks.hpp" #include "debug.h" @@ -8,21 +9,19 @@ namespace mscclpp { namespace channel { MSCCLPP_API_CPP DeviceChannelService::DeviceChannelService(Communicator& communicator) - : communicator_(communicator), - proxy_([&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) -{ + : communicator_(communicator), + proxy_([&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) { int cudaDevice; CUDATHROW(cudaGetDevice(&cudaDevice)); MSCCLPPTHROW(getDeviceNumaNode(cudaDevice, &deviceNumaNode)); } -MSCCLPP_API_CPP void DeviceChannelService::bindThread() -{ +MSCCLPP_API_CPP void DeviceChannelService::bindThread() { if (deviceNumaNode >= 0) { MSCCLPPTHROW(numaBind(deviceNumaNode)); INFO(MSCCLPP_INIT, "NUMA node of DeviceChannelService proxy thread is set to %d", deviceNumaNode); } } -} // namespace channel -} // namespace mscclpp +} // namespace channel +} // namespace mscclpp diff --git a/src/communicator.cc b/src/communicator.cc index 8b721232..c324093f 100644 --- a/src/communicator.cc +++ 
b/src/communicator.cc @@ -1,18 +1,18 @@ +#include "communicator.hpp" + +#include #include #include "api.h" #include "checks.hpp" -#include "communicator.hpp" #include "connection.hpp" #include "debug.h" -#include #include "registered_memory.hpp" #include "utils.h" namespace mscclpp { -Communicator::Impl::Impl(std::shared_ptr bootstrap) : bootstrap_(bootstrap) -{ +Communicator::Impl::Impl(std::shared_ptr bootstrap) : bootstrap_(bootstrap) { rankToHash_.resize(bootstrap->getNranks()); auto hostHash = getHostHash(); INFO(MSCCLPP_INIT, "Host hash: %lx", hostHash); @@ -20,13 +20,9 @@ Communicator::Impl::Impl(std::shared_ptr bootstrap) : bootstrap_( bootstrap->allGather(rankToHash_.data(), sizeof(uint64_t)); } -Communicator::Impl::~Impl() -{ - ibContexts_.clear(); -} +Communicator::Impl::~Impl() { ibContexts_.clear(); } -IbCtx* Communicator::Impl::getIbContext(Transport ibTransport) -{ +IbCtx* Communicator::Impl::getIbContext(Transport ibTransport) { // Find IB context or create it auto it = ibContexts_.find(ibTransport); if (it == ibContexts_.end()) { @@ -41,29 +37,20 @@ IbCtx* Communicator::Impl::getIbContext(Transport ibTransport) MSCCLPP_API_CPP Communicator::~Communicator() = default; MSCCLPP_API_CPP Communicator::Communicator(std::shared_ptr bootstrap) - : pimpl(std::make_unique(bootstrap)) -{ -} + : pimpl(std::make_unique(bootstrap)) {} -MSCCLPP_API_CPP std::shared_ptr Communicator::bootstrapper() -{ - return pimpl->bootstrap_; -} +MSCCLPP_API_CPP std::shared_ptr Communicator::bootstrapper() { return pimpl->bootstrap_; } -MSCCLPP_API_CPP RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) -{ +MSCCLPP_API_CPP RegisteredMemory Communicator::registerMemory(void* ptr, size_t size, TransportFlags transports) { return RegisteredMemory( - std::make_shared(ptr, size, pimpl->bootstrap_->getRank(), transports, *pimpl)); + std::make_shared(ptr, size, pimpl->bootstrap_->getRank(), transports, *pimpl)); } -struct MemorySender : 
public Setuppable -{ - MemorySender(RegisteredMemory memory, int remoteRank, int tag) : memory_(memory), remoteRank_(remoteRank), tag_(tag) - { - } +struct MemorySender : public Setuppable { + MemorySender(RegisteredMemory memory, int remoteRank, int tag) + : memory_(memory), remoteRank_(remoteRank), tag_(tag) {} - void beginSetup(std::shared_ptr bootstrap) override - { + void beginSetup(std::shared_ptr bootstrap) override { bootstrap->send(memory_.serialize(), remoteRank_, tag_); } @@ -72,19 +59,14 @@ struct MemorySender : public Setuppable int tag_; }; -MSCCLPP_API_CPP void Communicator::sendMemoryOnSetup(RegisteredMemory memory, int remoteRank, int tag) -{ +MSCCLPP_API_CPP void Communicator::sendMemoryOnSetup(RegisteredMemory memory, int remoteRank, int tag) { onSetup(std::make_shared(memory, remoteRank, tag)); } -struct MemoryReceiver : public Setuppable -{ - MemoryReceiver(int remoteRank, int tag) : remoteRank_(remoteRank), tag_(tag) - { - } +struct MemoryReceiver : public Setuppable { + MemoryReceiver(int remoteRank, int tag) : remoteRank_(remoteRank), tag_(tag) {} - void endSetup(std::shared_ptr bootstrap) override - { + void endSetup(std::shared_ptr bootstrap) override { std::vector data; bootstrap->recv(data, remoteRank_, tag_); memoryPromise_.set_value(RegisteredMemory::deserialize(data)); @@ -95,15 +77,13 @@ struct MemoryReceiver : public Setuppable int tag_; }; -MSCCLPP_API_CPP NonblockingFuture Communicator::recvMemoryOnSetup(int remoteRank, int tag) -{ +MSCCLPP_API_CPP NonblockingFuture Communicator::recvMemoryOnSetup(int remoteRank, int tag) { auto memoryReceiver = std::make_shared(remoteRank, tag); onSetup(memoryReceiver); return NonblockingFuture(memoryReceiver->memoryPromise_.get_future()); } -MSCCLPP_API_CPP std::shared_ptr Communicator::connectOnSetup(int remoteRank, int tag, Transport transport) -{ +MSCCLPP_API_CPP std::shared_ptr Communicator::connectOnSetup(int remoteRank, int tag, Transport transport) { std::shared_ptr conn; if (transport == 
Transport::CudaIpc) { // sanity check: make sure the IPC connection is being made within a node @@ -134,13 +114,11 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connectOnSetup(int rem return conn; } -MSCCLPP_API_CPP void Communicator::onSetup(std::shared_ptr setuppable) -{ +MSCCLPP_API_CPP void Communicator::onSetup(std::shared_ptr setuppable) { pimpl->toSetup_.push_back(setuppable); } -MSCCLPP_API_CPP void Communicator::setup() -{ +MSCCLPP_API_CPP void Communicator::setup() { for (auto& setuppable : pimpl->toSetup_) { setuppable->beginSetup(pimpl->bootstrap_); } @@ -150,4 +128,4 @@ MSCCLPP_API_CPP void Communicator::setup() pimpl->toSetup_.clear(); } -} // namespace mscclpp +} // namespace mscclpp diff --git a/src/config.cc b/src/config.cc index 069bfbe0..e4640216 100644 --- a/src/config.cc +++ b/src/config.cc @@ -2,17 +2,8 @@ mscclppConfig mscclppConfig::_instance; -mscclppConfig* mscclppConfig::getInstance() -{ - return &_instance; -} +mscclppConfig* mscclppConfig::getInstance() { return &_instance; } -time_t mscclppConfig::getBootstrapConnectionTimeoutConfig() -{ - return bootstrapConnectionTimeout; -} +time_t mscclppConfig::getBootstrapConnectionTimeoutConfig() { return bootstrapConnectionTimeout; } -void mscclppConfig::setBootstrapConnectionTimeoutConfig(time_t timeout) -{ - bootstrapConnectionTimeout = timeout; -} +void mscclppConfig::setBootstrapConnectionTimeoutConfig(time_t timeout) { bootstrapConnectionTimeout = timeout; } diff --git a/src/connection.cc b/src/connection.cc index d6cf3284..10ca79ee 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -1,15 +1,16 @@ #include "connection.hpp" + +#include + #include "checks.hpp" #include "infiniband/verbs.h" #include "npkit/npkit.h" #include "registered_memory.hpp" #include "utils.hpp" -#include namespace mscclpp { -void validateTransport(RegisteredMemory mem, Transport transport) -{ +void validateTransport(RegisteredMemory mem, Transport transport) { if (!mem.transports().has(transport)) { throw 
Error("RegisteredMemory does not support this transport", ErrorCode::InvalidUsage); } @@ -17,52 +18,30 @@ void validateTransport(RegisteredMemory mem, Transport transport) // Connection -std::shared_ptr Connection::getRegisteredMemoryImpl(RegisteredMemory& mem) -{ - return mem.pimpl; -} +std::shared_ptr Connection::getRegisteredMemoryImpl(RegisteredMemory& mem) { return mem.pimpl; } // ConnectionBase -ConnectionBase::ConnectionBase(int remoteRank, int tag) : remoteRank_(remoteRank), tag_(tag) -{ -} +ConnectionBase::ConnectionBase(int remoteRank, int tag) : remoteRank_(remoteRank), tag_(tag) {} -int ConnectionBase::remoteRank() -{ - return remoteRank_; -} +int ConnectionBase::remoteRank() { return remoteRank_; } -int ConnectionBase::tag() -{ - return tag_; -} +int ConnectionBase::tag() { return tag_; } // CudaIpcConnection -CudaIpcConnection::CudaIpcConnection(int remoteRank, int tag) : ConnectionBase(remoteRank, tag) -{ +CudaIpcConnection::CudaIpcConnection(int remoteRank, int tag) : ConnectionBase(remoteRank, tag) { CUDATHROW(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); } -CudaIpcConnection::~CudaIpcConnection() -{ - cudaStreamDestroy(stream); -} +CudaIpcConnection::~CudaIpcConnection() { cudaStreamDestroy(stream); } -Transport CudaIpcConnection::transport() -{ - return Transport::CudaIpc; -} +Transport CudaIpcConnection::transport() { return Transport::CudaIpc; } -Transport CudaIpcConnection::remoteTransport() -{ - return Transport::CudaIpc; -} +Transport CudaIpcConnection::remoteTransport() { return Transport::CudaIpc; } void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, - uint64_t size) -{ + uint64_t size) { validateTransport(dst, remoteTransport()); validateTransport(src, transport()); @@ -75,8 +54,7 @@ void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, Register // npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)size); } -void 
CudaIpcConnection::flush() -{ +void CudaIpcConnection::flush() { CUDATHROW(cudaStreamSynchronize(stream)); // npkitCollectExitEvents(conn, NPKIT_EVENT_DMA_SEND_EXIT); } @@ -84,24 +62,19 @@ void CudaIpcConnection::flush() // IBConnection IBConnection::IBConnection(int remoteRank, int tag, Transport transport, Communicator::Impl& commImpl) - : ConnectionBase(remoteRank, tag), transport_(transport), remoteTransport_(Transport::Unknown), numSignaledSends(0) -{ + : ConnectionBase(remoteRank, tag), + transport_(transport), + remoteTransport_(Transport::Unknown), + numSignaledSends(0) { qp = commImpl.getIbContext(transport)->createQp(); } -Transport IBConnection::transport() -{ - return transport_; -} +Transport IBConnection::transport() { return transport_; } -Transport IBConnection::remoteTransport() -{ - return remoteTransport_; -} +Transport IBConnection::remoteTransport() { return remoteTransport_; } void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, - uint64_t size) -{ + uint64_t size) { validateTransport(dst, remoteTransport()); validateTransport(src, transport()); @@ -126,8 +99,7 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem // npkitCollectEntryEvent(conn, NPKIT_EVENT_IB_SEND_DATA_ENTRY, (uint32_t)size); } -void IBConnection::flush() -{ +void IBConnection::flush() { Timer timer; while (numSignaledSends) { int wcNum = qp->pollCq(); @@ -137,8 +109,8 @@ void IBConnection::flush() auto elapsed = timer.elapsed(); if (elapsed > MSCCLPP_POLLING_WAIT) { - throw Error("pollCq is stuck: waited for " + std::to_string(elapsed/1e6) + " seconds. Expected " + - std::to_string(numSignaledSends) + " signals", + throw Error("pollCq is stuck: waited for " + std::to_string(elapsed / 1e6) + " seconds. 
Expected " + + std::to_string(numSignaledSends) + " signals", ErrorCode::InternalError); } for (int i = 0; i < wcNum; ++i) { @@ -154,8 +126,7 @@ void IBConnection::flush() // npkitCollectExitEvents(conn, NPKIT_EVENT_IB_SEND_EXIT); } -void IBConnection::beginSetup(std::shared_ptr bootstrap) -{ +void IBConnection::beginSetup(std::shared_ptr bootstrap) { std::vector ibQpTransport; std::copy_n(reinterpret_cast(&qp->getInfo()), sizeof(qp->getInfo()), std::back_inserter(ibQpTransport)); std::copy_n(reinterpret_cast(&transport_), sizeof(transport_), std::back_inserter(ibQpTransport)); @@ -163,8 +134,7 @@ void IBConnection::beginSetup(std::shared_ptr bootstrap) bootstrap->send(ibQpTransport.data(), ibQpTransport.size(), remoteRank(), tag()); } -void IBConnection::endSetup(std::shared_ptr bootstrap) -{ +void IBConnection::endSetup(std::shared_ptr bootstrap) { std::vector ibQpTransport(sizeof(IbQpInfo) + sizeof(Transport)); bootstrap->recv(ibQpTransport.data(), ibQpTransport.size(), remoteRank(), tag()); @@ -179,4 +149,4 @@ void IBConnection::endSetup(std::shared_ptr bootstrap) qp->rts(); } -} // namespace mscclpp +} // namespace mscclpp diff --git a/src/debug.cc b/src/debug.cc index d6d29262..9841a4b7 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -5,6 +5,7 @@ ************************************************************************/ #include "debug.h" + #include #include #include @@ -15,8 +16,8 @@ int mscclppDebugLevel = -1; static int pid = -1; static char hostname[1024]; thread_local int mscclppDebugNoWarn = 0; -char mscclppLastError[1024] = ""; // Global string for the last error in human readable form -uint64_t mscclppDebugMask = MSCCLPP_INIT; // Default debug sub-system mask is INIT +char mscclppLastError[1024] = ""; // Global string for the last error in human readable form +uint64_t mscclppDebugMask = MSCCLPP_INIT; // Default debug sub-system mask is INIT FILE* mscclppDebugFile = stdout; mscclppLogHandler_t mscclppDebugLogHandler = NULL; pthread_mutex_t 
mscclppDebugLock = PTHREAD_MUTEX_INITIALIZER; @@ -24,13 +25,9 @@ std::chrono::steady_clock::time_point mscclppEpoch; static __thread int tid = -1; -void mscclppDebugDefaultLogHandler(const char* msg) -{ - fwrite(msg, 1, strlen(msg), mscclppDebugFile); -} +void mscclppDebugDefaultLogHandler(const char* msg) { fwrite(msg, 1, strlen(msg), mscclppDebugFile); } -void mscclppDebugInit() -{ +void mscclppDebugInit() { pthread_mutex_lock(&mscclppDebugLock); if (mscclppDebugLevel != -1) { pthread_mutex_unlock(&mscclppDebugLock); @@ -121,33 +118,32 @@ void mscclppDebugInit() continue; } switch (mscclppDebugFileEnv[c++]) { - case '%': // Double % - *dfn++ = '%'; - break; - case 'h': // %h = hostname - dfn += snprintf(dfn, PATH_MAX, "%s", hostname); - break; - case 'p': // %p = pid - dfn += snprintf(dfn, PATH_MAX, "%d", pid); - break; - default: // Echo everything we don't understand - *dfn++ = '%'; - *dfn++ = mscclppDebugFileEnv[c - 1]; - break; + case '%': // Double % + *dfn++ = '%'; + break; + case 'h': // %h = hostname + dfn += snprintf(dfn, PATH_MAX, "%s", hostname); + break; + case 'p': // %p = pid + dfn += snprintf(dfn, PATH_MAX, "%d", pid); + break; + default: // Echo everything we don't understand + *dfn++ = '%'; + *dfn++ = mscclppDebugFileEnv[c - 1]; + break; } } *dfn = '\0'; if (debugFn[0] != '\0') { FILE* file = fopen(debugFn, "w"); if (file != nullptr) { - setbuf(file, nullptr); // disable buffering + setbuf(file, nullptr); // disable buffering mscclppDebugFile = file; } } } - if (mscclppDebugLogHandler == NULL) - mscclppDebugLogHandler = mscclppDefaultLogHandler; + if (mscclppDebugLogHandler == NULL) mscclppDebugLogHandler = mscclppDefaultLogHandler; mscclppEpoch = std::chrono::steady_clock::now(); __atomic_store_n(&mscclppDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE); @@ -159,10 +155,8 @@ void mscclppDebugInit() * they can share the debugging mechanisms and output files */ void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char* 
filefunc, int line, const char* fmt, - ...) -{ - if (__atomic_load_n(&mscclppDebugLevel, __ATOMIC_ACQUIRE) == -1) - mscclppDebugInit(); + ...) { + if (__atomic_load_n(&mscclppDebugLevel, __ATOMIC_ACQUIRE) == -1) mscclppDebugInit(); if (mscclppDebugNoWarn != 0 && level == MSCCLPP_LOG_WARN) { level = MSCCLPP_LOG_INFO; flags = mscclppDebugNoWarn; @@ -176,8 +170,7 @@ void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char va_end(vargs); pthread_mutex_unlock(&mscclppDebugLock); } - if (mscclppDebugLevel < level || ((flags & mscclppDebugMask) == 0)) - return; + if (mscclppDebugLevel < level || ((flags & mscclppDebugMask) == 0)) return; if (tid == -1) { tid = syscall(SYS_gettid); @@ -218,20 +211,16 @@ void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char } } -mscclppResult_t mscclppDebugSetLogHandler(mscclppLogHandler_t handler) -{ - if (__atomic_load_n(&mscclppDebugLevel, __ATOMIC_ACQUIRE) == -1) - mscclppDebugInit(); - if (handler == NULL) - return mscclppInvalidArgument; +mscclppResult_t mscclppDebugSetLogHandler(mscclppLogHandler_t handler) { + if (__atomic_load_n(&mscclppDebugLevel, __ATOMIC_ACQUIRE) == -1) mscclppDebugInit(); + if (handler == NULL) return mscclppInvalidArgument; pthread_mutex_lock(&mscclppDebugLock); mscclppDebugLogHandler = handler; pthread_mutex_unlock(&mscclppDebugLock); return mscclppSuccess; } -void mscclppSetThreadName(pthread_t thread, const char* fmt, ...) -{ +void mscclppSetThreadName(pthread_t thread, const char* fmt, ...) 
{ // pthread_setname_np is nonstandard GNU extension // needs the following feature test macro #ifdef _GNU_SOURCE diff --git a/src/epoch.cc b/src/epoch.cc index afdbf8c2..7f29c92e 100644 --- a/src/epoch.cc +++ b/src/epoch.cc @@ -1,48 +1,39 @@ #include + #include "alloc.h" #include "api.h" #include "checks.hpp" namespace mscclpp { -BaseEpoch::BaseEpoch(std::shared_ptr connection) : connection_(connection) -{ -} +BaseEpoch::BaseEpoch(std::shared_ptr connection) : connection_(connection) {} -void BaseEpoch::setup(Communicator& communicator) -{ +void BaseEpoch::setup(Communicator& communicator) { localEpochIdsRegMem_ = communicator.registerMemory(epochIds_, sizeof(epochIds_), connection_->transport()); communicator.sendMemoryOnSetup(localEpochIdsRegMem_, connection_->remoteRank(), connection_->tag()); remoteEpochIdsRegMem_ = communicator.recvMemoryOnSetup(connection_->remoteRank(), connection_->tag()); } -void BaseEpoch::signal() -{ +void BaseEpoch::signal() { connection_->write(remoteEpochIdsRegMem_.get(), offsetof(EpochIds, inboundReplica), localEpochIdsRegMem_, offsetof(EpochIds, outbound), sizeof(epochIds_)); } MSCCLPP_API_CPP DeviceEpoch::DeviceEpoch(Communicator& communicator, std::shared_ptr connection) - : BaseEpoch(connection) -{ + : BaseEpoch(connection) { MSCCLPPTHROW(mscclppCudaCalloc(&epochIds_, 1)); MSCCLPPTHROW(mscclppCudaCalloc(&expectedInboundEpochId_, 1)); setup(communicator); } -MSCCLPP_API_CPP DeviceEpoch::~DeviceEpoch() -{ +MSCCLPP_API_CPP DeviceEpoch::~DeviceEpoch() { mscclppCudaFree(epochIds_); mscclppCudaFree(expectedInboundEpochId_); } -MSCCLPP_API_CPP void DeviceEpoch::signal() -{ - BaseEpoch::signal(); -} +MSCCLPP_API_CPP void DeviceEpoch::signal() { BaseEpoch::signal(); } -MSCCLPP_API_CPP DeviceEpoch::DeviceHandle DeviceEpoch::deviceHandle() -{ +MSCCLPP_API_CPP DeviceEpoch::DeviceHandle DeviceEpoch::deviceHandle() { DeviceEpoch::DeviceHandle device; device.epochIds = epochIds_; device.expectedInboundEpochId = expectedInboundEpochId_; @@ 
-50,8 +41,7 @@ MSCCLPP_API_CPP DeviceEpoch::DeviceHandle DeviceEpoch::deviceHandle() } MSCCLPP_API_CPP HostEpoch::HostEpoch(Communicator& communicator, std::shared_ptr connection) - : BaseEpoch(connection) -{ + : BaseEpoch(connection) { if (connection->transport() == Transport::CudaIpc) { throw Error("HostEpoch cannot be used with CudaIpc transport", ErrorCode::InvalidUsage); } @@ -60,23 +50,20 @@ MSCCLPP_API_CPP HostEpoch::HostEpoch(Communicator& communicator, std::shared_ptr setup(communicator); } -MSCCLPP_API_CPP HostEpoch::~HostEpoch() -{ +MSCCLPP_API_CPP HostEpoch::~HostEpoch() { delete epochIds_; delete expectedInboundEpochId_; } -MSCCLPP_API_CPP void HostEpoch::increamentAndSignal() -{ +MSCCLPP_API_CPP void HostEpoch::increamentAndSignal() { *(volatile uint64_t*)&(epochIds_->outbound) += 1; signal(); } -MSCCLPP_API_CPP void HostEpoch::wait() -{ +MSCCLPP_API_CPP void HostEpoch::wait() { (*expectedInboundEpochId_) += 1; while (*(volatile uint64_t*)&(epochIds_->inboundReplica) < (*expectedInboundEpochId_)) ; } -} // namespace mscclpp +} // namespace mscclpp diff --git a/src/errors.cc b/src/errors.cc index c3a0a7b7..c32accf8 100644 --- a/src/errors.cc +++ b/src/errors.cc @@ -1,31 +1,19 @@ #include + #include "api.h" namespace mscclpp { -BaseError::BaseError(std::string message, int errorCode) : std::runtime_error(message), errorCode_(errorCode) -{ -} +BaseError::BaseError(std::string message, int errorCode) : std::runtime_error(message), errorCode_(errorCode) {} -int BaseError::getErrorCode() const -{ - return errorCode_; -} +int BaseError::getErrorCode() const { return errorCode_; } -MSCCLPP_API_CPP Error::Error(std::string message, ErrorCode errorCode) : BaseError(message, -1) -{ -} +MSCCLPP_API_CPP Error::Error(std::string message, ErrorCode errorCode) : BaseError(message, -1) {} -MSCCLPP_API_CPP CudaError::CudaError(std::string message, int errorCode) : BaseError(message, errorCode) -{ -} +MSCCLPP_API_CPP CudaError::CudaError(std::string message, int 
errorCode) : BaseError(message, errorCode) {} -MSCCLPP_API_CPP CuError::CuError(std::string message, int errorCode) : BaseError(message, errorCode) -{ -} +MSCCLPP_API_CPP CuError::CuError(std::string message, int errorCode) : BaseError(message, errorCode) {} -MSCCLPP_API_CPP IbError::IbError(std::string message, int errorCode) : BaseError(message, errorCode) -{ -} +MSCCLPP_API_CPP IbError::IbError(std::string message, int errorCode) : BaseError(message, errorCode) {} -}; // namespace mscclpp +}; // namespace mscclpp diff --git a/src/fifo.cc b/src/fifo.cc index 2c4ebf7a..e4571254 100644 --- a/src/fifo.cc +++ b/src/fifo.cc @@ -1,15 +1,16 @@ +#include +#include + +#include +#include + #include "alloc.h" #include "api.h" #include "checks.hpp" -#include -#include -#include -#include namespace mscclpp { -struct HostProxyFifo::Impl -{ +struct HostProxyFifo::Impl { DeviceProxyFifo deviceFifo; // allocated on the host. Only accessed by the host. This is a copy of the @@ -25,8 +26,7 @@ struct HostProxyFifo::Impl cudaStream_t stream; }; -MSCCLPP_API_CPP HostProxyFifo::HostProxyFifo() -{ +MSCCLPP_API_CPP HostProxyFifo::HostProxyFifo() { pimpl = std::make_unique(); MSCCLPPTHROW(mscclppCudaCalloc(&pimpl->deviceFifo.head, 1)); MSCCLPPTHROW(mscclppCudaHostCalloc(&pimpl->deviceFifo.triggers, MSCCLPP_PROXY_FIFO_SIZE)); @@ -35,28 +35,24 @@ MSCCLPP_API_CPP HostProxyFifo::HostProxyFifo() pimpl->hostTail = 0; } -MSCCLPP_API_CPP HostProxyFifo::~HostProxyFifo() -{ +MSCCLPP_API_CPP HostProxyFifo::~HostProxyFifo() { mscclppCudaFree(pimpl->deviceFifo.head); mscclppCudaHostFree(pimpl->deviceFifo.triggers); mscclppCudaFree(pimpl->deviceFifo.tailReplica); cudaStreamDestroy(pimpl->stream); } -MSCCLPP_API_CPP void HostProxyFifo::poll(ProxyTrigger* trigger) -{ +MSCCLPP_API_CPP void HostProxyFifo::poll(ProxyTrigger* trigger) { __m128i xmm0 = _mm_load_si128((__m128i*)&pimpl->deviceFifo.triggers[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]); _mm_store_si128((__m128i*)trigger, xmm0); } -MSCCLPP_API_CPP 
void HostProxyFifo::pop() -{ +MSCCLPP_API_CPP void HostProxyFifo::pop() { *(volatile uint64_t*)(&pimpl->deviceFifo.triggers[pimpl->hostTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0; (pimpl->hostTail)++; } -MSCCLPP_API_CPP void HostProxyFifo::flushTail(bool sync) -{ +MSCCLPP_API_CPP void HostProxyFifo::flushTail(bool sync) { // Flush the tail to device memory. This is either triggered every MSCCLPP_PROXY_FIFO_FLUSH_COUNTER to make sure // that the fifo can make progress even if there is no request mscclppSync. However, mscclppSync type is for flush // request. @@ -67,9 +63,6 @@ MSCCLPP_API_CPP void HostProxyFifo::flushTail(bool sync) } } -MSCCLPP_API_CPP DeviceProxyFifo HostProxyFifo::deviceFifo() -{ - return pimpl->deviceFifo; -} +MSCCLPP_API_CPP DeviceProxyFifo HostProxyFifo::deviceFifo() { return pimpl->deviceFifo; } -} // namespace mscclpp +} // namespace mscclpp diff --git a/src/ib.cc b/src/ib.cc index 32db71bb..99450863 100644 --- a/src/ib.cc +++ b/src/ib.cc @@ -1,25 +1,26 @@ +#include "ib.hpp" + +#include +#include +#include + #include #include #include -#include +#include #include -#include +#include #include "alloc.h" #include "api.h" #include "checks.hpp" #include "debug.h" -#include "ib.hpp" -#include -#include -#include #define MAXCONNECTIONS 64 namespace mscclpp { -IbMr::IbMr(void* pd, void* buff, std::size_t size) : buff(buff) -{ +IbMr::IbMr(void* pd, void* buff, std::size_t size) : buff(buff) { if (size == 0) { throw std::invalid_argument("invalid size: " + std::to_string(size)); } @@ -30,9 +31,9 @@ IbMr::IbMr(void* pd, void* buff, std::size_t size) : buff(buff) uintptr_t addr = reinterpret_cast(buff) & -pageSize; std::size_t pages = (size + (reinterpret_cast(buff) - addr) + pageSize - 1) / pageSize; struct ibv_pd* _pd = reinterpret_cast(pd); - struct ibv_mr* _mr = - ibv_reg_mr(_pd, reinterpret_cast(addr), pages * pageSize, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_RELAXED_ORDERING); + struct ibv_mr* _mr = 
ibv_reg_mr( + _pd, reinterpret_cast(addr), pages * pageSize, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_RELAXED_ORDERING); if (_mr == nullptr) { std::stringstream err; err << "ibv_reg_mr failed (errno " << errno << ")"; @@ -42,31 +43,20 @@ IbMr::IbMr(void* pd, void* buff, std::size_t size) : buff(buff) this->size = pages * pageSize; } -IbMr::~IbMr() -{ - ibv_dereg_mr(reinterpret_cast(this->mr)); -} +IbMr::~IbMr() { ibv_dereg_mr(reinterpret_cast(this->mr)); } -IbMrInfo IbMr::getInfo() const -{ +IbMrInfo IbMr::getInfo() const { IbMrInfo info; info.addr = reinterpret_cast(this->buff); info.rkey = reinterpret_cast(this->mr)->rkey; return info; } -const void* IbMr::getBuff() const -{ - return this->buff; -} +const void* IbMr::getBuff() const { return this->buff; } -uint32_t IbMr::getLkey() const -{ - return reinterpret_cast(this->mr)->lkey; -} +uint32_t IbMr::getLkey() const { return reinterpret_cast(this->mr)->lkey; } -IbQp::IbQp(void* ctx, void* pd, int port) -{ +IbQp::IbQp(void* ctx, void* pd, int port) { struct ibv_context* _ctx = reinterpret_cast(ctx); struct ibv_pd* _pd = reinterpret_cast(pd); @@ -135,8 +125,7 @@ IbQp::IbQp(void* ctx, void* pd, int port) MSCCLPPTHROW(mscclppCalloc(reinterpret_cast(&this->wcs), MSCCLPP_IB_CQ_POLL_NUM)); } -IbQp::~IbQp() -{ +IbQp::~IbQp() { ibv_destroy_qp(reinterpret_cast(this->qp)); ibv_destroy_cq(reinterpret_cast(this->cq)); std::free(this->wrs); @@ -144,8 +133,7 @@ IbQp::~IbQp() std::free(this->wcs); } -void IbQp::rtr(const IbQpInfo& info) -{ +void IbQp::rtr(const IbQpInfo& info) { struct ibv_qp_attr qp_attr; std::memset(&qp_attr, 0, sizeof(struct ibv_qp_attr)); qp_attr.qp_state = IBV_QPS_RTR; @@ -171,7 +159,7 @@ void IbQp::rtr(const IbQpInfo& info) qp_attr.ah_attr.port_num = info.port; int ret = ibv_modify_qp(reinterpret_cast(this->qp), &qp_attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | - IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER); + 
IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER); if (ret != 0) { std::stringstream err; err << "ibv_modify_qp failed (errno " << errno << ")"; @@ -179,8 +167,7 @@ void IbQp::rtr(const IbQpInfo& info) } } -void IbQp::rts() -{ +void IbQp::rts() { struct ibv_qp_attr qp_attr; std::memset(&qp_attr, 0, sizeof(struct ibv_qp_attr)); qp_attr.qp_state = IBV_QPS_RTS; @@ -189,9 +176,9 @@ void IbQp::rts() qp_attr.rnr_retry = 7; qp_attr.sq_psn = 0; qp_attr.max_rd_atomic = 1; - int ret = ibv_modify_qp(reinterpret_cast(this->qp), &qp_attr, - IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | - IBV_QP_MAX_QP_RD_ATOMIC); + int ret = ibv_modify_qp( + reinterpret_cast(this->qp), &qp_attr, + IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC); if (ret != 0) { std::stringstream err; err << "ibv_modify_qp failed (errno " << errno << ")"; @@ -200,8 +187,7 @@ void IbQp::rts() } int IbQp::stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, - uint64_t dstOffset, bool signaled) -{ + uint64_t dstOffset, bool signaled) { if (this->wrn >= MSCCLPP_IB_MAX_SENDS) { return -1; } @@ -230,8 +216,7 @@ int IbQp::stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_ } int IbQp::stageSendWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, - uint64_t dstOffset, bool signaled, unsigned int immData) -{ + uint64_t dstOffset, bool signaled, unsigned int immData) { int wrn = this->stageSend(mr, info, size, wrId, srcOffset, dstOffset, signaled); struct ibv_send_wr* wrs_ = reinterpret_cast(this->wrs); wrs_[wrn - 1].opcode = IBV_WR_RDMA_WRITE_WITH_IMM; @@ -239,8 +224,7 @@ int IbQp::stageSendWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, return wrn; } -void IbQp::postSend() -{ +void IbQp::postSend() { if (this->wrn == 0) { return; } @@ -255,8 +239,7 @@ void IbQp::postSend() this->wrn = 0; } -void 
IbQp::postRecv(uint64_t wrId) -{ +void IbQp::postRecv(uint64_t wrId) { struct ibv_recv_wr wr, *bad_wr; wr.wr_id = wrId; wr.sg_list = nullptr; @@ -270,24 +253,16 @@ void IbQp::postRecv(uint64_t wrId) } } -int IbQp::pollCq() -{ +int IbQp::pollCq() { return ibv_poll_cq(reinterpret_cast(this->cq), MSCCLPP_IB_CQ_POLL_NUM, reinterpret_cast(this->wcs)); } -IbQpInfo& IbQp::getInfo() -{ - return this->info; -} +IbQpInfo& IbQp::getInfo() { return this->info; } -const void* IbQp::getWc(int idx) const -{ - return &reinterpret_cast(this->wcs)[idx]; -} +const void* IbQp::getWc(int idx) const { return &reinterpret_cast(this->wcs)[idx]; } -IbCtx::IbCtx(const std::string& devName) : devName(devName) -{ +IbCtx::IbCtx(const std::string& devName) : devName(devName) { int num; struct ibv_device** devices = ibv_get_device_list(&num); for (int i = 0; i < num; ++i) { @@ -310,8 +285,7 @@ IbCtx::IbCtx(const std::string& devName) : devName(devName) } } -IbCtx::~IbCtx() -{ +IbCtx::~IbCtx() { this->mrs.clear(); this->qps.clear(); if (this->pd != nullptr) { @@ -322,8 +296,7 @@ IbCtx::~IbCtx() } } -bool IbCtx::isPortUsable(int port) const -{ +bool IbCtx::isPortUsable(int port) const { struct ibv_port_attr portAttr; if (ibv_query_port(reinterpret_cast(this->ctx), port, &portAttr) != 0) { std::stringstream err; @@ -334,8 +307,7 @@ bool IbCtx::isPortUsable(int port) const (portAttr.link_layer == IBV_LINK_LAYER_ETHERNET || portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND); } -int IbCtx::getAnyActivePort() const -{ +int IbCtx::getAnyActivePort() const { struct ibv_device_attr devAttr; if (ibv_query_device(reinterpret_cast(this->ctx), &devAttr) != 0) { std::stringstream err; @@ -350,8 +322,7 @@ int IbCtx::getAnyActivePort() const return -1; } -IbQp* IbCtx::createQp(int port /*=-1*/) -{ +IbQp* IbCtx::createQp(int port /*=-1*/) { if (port == -1) { port = this->getAnyActivePort(); if (port == -1) { @@ -364,56 +335,50 @@ IbQp* IbCtx::createQp(int port /*=-1*/) return qps.back().get(); } -const IbMr* 
IbCtx::registerMr(void* buff, std::size_t size) -{ +const IbMr* IbCtx::registerMr(void* buff, std::size_t size) { mrs.emplace_back(new IbMr(this->pd, buff, size)); return mrs.back().get(); } -const std::string& IbCtx::getDevName() const -{ - return this->devName; -} +const std::string& IbCtx::getDevName() const { return this->devName; } -MSCCLPP_API_CPP int getIBDeviceCount() -{ +MSCCLPP_API_CPP int getIBDeviceCount() { int num; ibv_get_device_list(&num); return num; } -MSCCLPP_API_CPP std::string getIBDeviceName(Transport ibTransport) -{ +MSCCLPP_API_CPP std::string getIBDeviceName(Transport ibTransport) { int num; struct ibv_device** devices = ibv_get_device_list(&num); int ibTransportIndex; - switch (ibTransport) { // TODO: get rid of this ugly switch - case Transport::IB0: - ibTransportIndex = 0; - break; - case Transport::IB1: - ibTransportIndex = 1; - break; - case Transport::IB2: - ibTransportIndex = 2; - break; - case Transport::IB3: - ibTransportIndex = 3; - break; - case Transport::IB4: - ibTransportIndex = 4; - break; - case Transport::IB5: - ibTransportIndex = 5; - break; - case Transport::IB6: - ibTransportIndex = 6; - break; - case Transport::IB7: - ibTransportIndex = 7; - break; - default: - throw std::invalid_argument("Not an IB transport"); + switch (ibTransport) { // TODO: get rid of this ugly switch + case Transport::IB0: + ibTransportIndex = 0; + break; + case Transport::IB1: + ibTransportIndex = 1; + break; + case Transport::IB2: + ibTransportIndex = 2; + break; + case Transport::IB3: + ibTransportIndex = 3; + break; + case Transport::IB4: + ibTransportIndex = 4; + break; + case Transport::IB5: + ibTransportIndex = 5; + break; + case Transport::IB6: + ibTransportIndex = 6; + break; + case Transport::IB7: + ibTransportIndex = 7; + break; + default: + throw std::invalid_argument("Not an IB transport"); } if (ibTransportIndex >= num) { throw std::out_of_range("IB transport out of range"); @@ -421,35 +386,34 @@ MSCCLPP_API_CPP std::string 
getIBDeviceName(Transport ibTransport) return devices[ibTransportIndex]->name; } -MSCCLPP_API_CPP Transport getIBTransportByDeviceName(const std::string& ibDeviceName) -{ +MSCCLPP_API_CPP Transport getIBTransportByDeviceName(const std::string& ibDeviceName) { int num; struct ibv_device** devices = ibv_get_device_list(&num); for (int i = 0; i < num; ++i) { if (ibDeviceName == devices[i]->name) { - switch (i) { // TODO: get rid of this ugly switch - case 0: - return Transport::IB0; - case 1: - return Transport::IB1; - case 2: - return Transport::IB2; - case 3: - return Transport::IB3; - case 4: - return Transport::IB4; - case 5: - return Transport::IB5; - case 6: - return Transport::IB6; - case 7: - return Transport::IB7; - default: - throw std::out_of_range("IB device index out of range"); + switch (i) { // TODO: get rid of this ugly switch + case 0: + return Transport::IB0; + case 1: + return Transport::IB1; + case 2: + return Transport::IB2; + case 3: + return Transport::IB3; + case 4: + return Transport::IB4; + case 5: + return Transport::IB5; + case 6: + return Transport::IB6; + case 7: + return Transport::IB7; + default: + throw std::out_of_range("IB device index out of range"); } } } throw std::invalid_argument("IB device not found"); } -} // namespace mscclpp +} // namespace mscclpp diff --git a/src/include/align.h b/src/include/align.h index 008d2b44..981d943d 100644 --- a/src/include/align.h +++ b/src/include/align.h @@ -22,19 +22,19 @@ #endif #endif -template __host__ __device__ constexpr Z divUp(X x, Y y) -{ +template +__host__ __device__ constexpr Z divUp(X x, Y y) { return (x + y - 1) / y; } -template __host__ __device__ constexpr Z roundUp(X x, Y y) -{ +template +__host__ __device__ constexpr Z roundUp(X x, Y y) { return (x + y - 1) - (x + y - 1) % y; } // assumes second argument is a power of 2 -template __host__ __device__ constexpr Z alignUp(X x, int a) -{ +template +__host__ __device__ constexpr Z alignUp(X x, int a) { return (x + a - 1) & Z(-a); } 
diff --git a/src/include/alloc.h b/src/include/alloc.h index 5c696e6e..5de23e87 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -7,16 +7,17 @@ #ifndef MSCCLPP_ALLOC_H_ #define MSCCLPP_ALLOC_H_ -#include "align.h" -#include "checks.h" -#include "mscclpp.h" -#include "utils.h" #include #include #include -template mscclppResult_t mscclppCudaHostCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) -{ +#include "align.h" +#include "checks.h" +#include "mscclpp.h" +#include "utils.h" + +template +mscclppResult_t mscclppCudaHostCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) { mscclppResult_t result = mscclppSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; @@ -26,21 +27,19 @@ template mscclppResult_t mscclppCudaHostCallocDebug(T** ptr, size_t memset(*ptr, 0, nelem * sizeof(T)); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) - WARN("Failed to CUDA host alloc %ld bytes", nelem * sizeof(T)); + if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem * sizeof(T)); INFO(MSCCLPP_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr); return result; } #define mscclppCudaHostCalloc(...) mscclppCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) -inline mscclppResult_t mscclppCudaHostFree(void* ptr) -{ +inline mscclppResult_t mscclppCudaHostFree(void* ptr) { CUDACHECK(cudaFreeHost(ptr)); return mscclppSuccess; } -template mscclppResult_t mscclppCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) -{ +template +mscclppResult_t mscclppCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) { void* p = malloc(nelem * sizeof(T)); if (p == NULL) { WARN("Failed to malloc %ld bytes", nelem * sizeof(T)); @@ -53,12 +52,10 @@ template mscclppResult_t mscclppCallocDebug(T** ptr, size_t nelem, } #define mscclppCalloc(...) 
mscclppCallocDebug(__VA_ARGS__, __FILE__, __LINE__) -template mscclppResult_t mscclppRealloc(T** ptr, size_t oldNelem, size_t nelem) -{ - if (nelem < oldNelem) - return mscclppInternalError; - if (nelem == oldNelem) - return mscclppSuccess; +template +mscclppResult_t mscclppRealloc(T** ptr, size_t oldNelem, size_t nelem) { + if (nelem < oldNelem) return mscclppInternalError; + if (nelem == oldNelem) return mscclppSuccess; T* oldp = *ptr; T* p = (T*)malloc(nelem * sizeof(T)); @@ -75,8 +72,8 @@ template mscclppResult_t mscclppRealloc(T** ptr, size_t oldNelem, s return mscclppSuccess; } -template mscclppResult_t mscclppCudaMallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) -{ +template +mscclppResult_t mscclppCudaMallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) { mscclppResult_t result = mscclppSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; @@ -84,15 +81,14 @@ template mscclppResult_t mscclppCudaMallocDebug(T** ptr, size_t nel CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) - WARN("Failed to CUDA malloc %ld bytes", nelem * sizeof(T)); + if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem * sizeof(T)); INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr); return result; } #define mscclppCudaMalloc(...) 
mscclppCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__) -template mscclppResult_t mscclppCudaCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) -{ +template +mscclppResult_t mscclppCudaCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) { mscclppResult_t result = mscclppSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; @@ -106,16 +102,15 @@ template mscclppResult_t mscclppCudaCallocDebug(T** ptr, size_t nel CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) - WARN("Failed to CUDA calloc %ld bytes", nelem * sizeof(T)); + if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem * sizeof(T)); INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr); return result; } #define mscclppCudaCalloc(...) mscclppCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) template -mscclppResult_t mscclppCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char* filefunc, int line) -{ +mscclppResult_t mscclppCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char* filefunc, + int line) { mscclppResult_t result = mscclppSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; @@ -124,15 +119,14 @@ mscclppResult_t mscclppCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) - WARN("Failed to CUDA calloc async %ld bytes", nelem * sizeof(T)); + if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem * sizeof(T)); INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr); return result; } #define mscclppCudaCallocAsync(...) 
mscclppCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__) -template mscclppResult_t mscclppCudaMemcpy(T* dst, T* src, size_t nelem) -{ +template +mscclppResult_t mscclppCudaMemcpy(T* dst, T* src, size_t nelem) { mscclppResult_t result = mscclppSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); @@ -147,8 +141,8 @@ finish: return result; } -template mscclppResult_t mscclppCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) -{ +template +mscclppResult_t mscclppCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) { mscclppResult_t result = mscclppSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); @@ -158,8 +152,8 @@ finish: return result; } -template mscclppResult_t mscclppCudaFree(T* ptr) -{ +template +mscclppResult_t mscclppCudaFree(T* ptr) { mscclppResult_t result = mscclppSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); @@ -172,14 +166,12 @@ finish: // Allocate memory to be potentially ibv_reg_mr'd. 
This needs to be // allocated on separate pages as those pages will be marked DONTFORK // and if they are shared, that could cause a crash in a child process -inline mscclppResult_t mscclppIbMallocDebug(void** ptr, size_t size, const char* filefunc, int line) -{ +inline mscclppResult_t mscclppIbMallocDebug(void** ptr, size_t size, const char* filefunc, int line) { size_t page_size = sysconf(_SC_PAGESIZE); void* p; int size_aligned = ROUNDUP(size, page_size); int ret = posix_memalign(&p, page_size, size_aligned); - if (ret != 0) - return mscclppSystemError; + if (ret != 0) return mscclppSystemError; memset(p, 0, size); *ptr = p; INFO(MSCCLPP_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr); diff --git a/src/include/api.h b/src/include/api.h index cf546e39..cb2cac81 100644 --- a/src/include/api.h +++ b/src/include/api.h @@ -4,4 +4,4 @@ #define MSCCLPP_API extern "C" __attribute__((visibility("default"))) #define MSCCLPP_API_CPP __attribute__((visibility("default"))) -#endif // MSCCLPP_API_H_ +#endif // MSCCLPP_API_H_ diff --git a/src/include/basic_proxy_handler.hpp b/src/include/basic_proxy_handler.hpp index c1dc1038..2d22a309 100644 --- a/src/include/basic_proxy_handler.hpp +++ b/src/include/basic_proxy_handler.hpp @@ -1,9 +1,10 @@ #ifndef MSCCLPP_BASIC_PROXY_SERVICE_HPP_ #define MSCCLPP_BASIC_PROXY_SERVICE_HPP_ -#include "communicator.hpp" #include +#include "communicator.hpp" + namespace mscclpp { ProxyHandler makeBasicProxyHandler(Communicator::Impl& comm); diff --git a/src/include/checks.h b/src/include/checks.h index 7422e384..c877cdea 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -7,187 +7,182 @@ #ifndef MSCCLPP_CHECKS_H_ #define MSCCLPP_CHECKS_H_ -#include "debug.h" #include +#include "debug.h" + // Check CUDA RT calls -#define CUDACHECK(cmd) \ - do { \ - cudaError_t err = cmd; \ - if (err != cudaSuccess) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ - return mscclppUnhandledCudaError; \ - } \ +#define 
CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ + return mscclppUnhandledCudaError; \ + } \ } while (false) -#define CUDACHECKNORET(cmd) \ - do { \ - cudaError_t err = cmd; \ - if (err != cudaSuccess) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ - return; \ - } \ +#define CUDACHECKNORET(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ + return; \ + } \ } while (false) -#define CUDACHECKGOTO(cmd, res, label) \ - do { \ - cudaError_t err = cmd; \ - if (err != cudaSuccess) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ - res = mscclppUnhandledCudaError; \ - goto label; \ - } \ +#define CUDACHECKGOTO(cmd, res, label) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ + res = mscclppUnhandledCudaError; \ + goto label; \ + } \ } while (false) // Report failure but clear error and continue -#define CUDACHECKIGNORE(cmd) \ - do { \ - cudaError_t err = cmd; \ - if (err != cudaSuccess) { \ - INFO(MSCCLPP_ALL, "%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \ - (void)cudaGetLastError(); \ - } \ +#define CUDACHECKIGNORE(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + INFO(MSCCLPP_ALL, "%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \ + (void)cudaGetLastError(); \ + } \ } while (false) #include // Check system calls -#define SYSCHECK(call, name) \ - do { \ - int retval; \ - SYSCHECKVAL(call, name, retval); \ +#define SYSCHECK(call, name) \ + do { \ + int retval; \ + SYSCHECKVAL(call, name, retval); \ } while (false) -#define SYSCHECKVAL(call, name, retval) \ - do { \ - SYSCHECKSYNC(call, name, retval); \ - if (retval == -1) { \ - WARN("Call to " name " failed : %s", strerror(errno)); \ - return mscclppSystemError; \ - } \ +#define 
SYSCHECKVAL(call, name, retval) \ + do { \ + SYSCHECKSYNC(call, name, retval); \ + if (retval == -1) { \ + WARN("Call to " name " failed : %s", strerror(errno)); \ + return mscclppSystemError; \ + } \ } while (false) -#define SYSCHECKSYNC(call, name, retval) \ - do { \ - retval = call; \ - if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ - INFO(MSCCLPP_ALL, "Call to " name " returned %s, retrying", strerror(errno)); \ - } else { \ - break; \ - } \ +#define SYSCHECKSYNC(call, name, retval) \ + do { \ + retval = call; \ + if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + INFO(MSCCLPP_ALL, "Call to " name " returned %s, retrying", strerror(errno)); \ + } else { \ + break; \ + } \ } while (true) -#define SYSCHECKGOTO(statement, res, label) \ - do { \ - if ((statement) == -1) { \ - /* Print the back trace*/ \ - res = mscclppSystemError; \ - INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \ - goto label; \ - } \ +#define SYSCHECKGOTO(statement, res, label) \ + do { \ + if ((statement) == -1) { \ + /* Print the back trace*/ \ + res = mscclppSystemError; \ + INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ } while (0); -#define NEQCHECK(statement, value) \ - do { \ - if ((statement) != value) { \ - /* Print the back trace*/ \ - INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, mscclppSystemError); \ - return mscclppSystemError; \ - } \ +#define NEQCHECK(statement, value) \ + do { \ + if ((statement) != value) { \ + /* Print the back trace*/ \ + INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, mscclppSystemError); \ + return mscclppSystemError; \ + } \ } while (0); -#define NEQCHECKGOTO(statement, value, res, label) \ - do { \ - if ((statement) != value) { \ - /* Print the back trace*/ \ - res = mscclppSystemError; \ - INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \ - goto label; \ - } \ +#define NEQCHECKGOTO(statement, value, res, label) 
\ + do { \ + if ((statement) != value) { \ + /* Print the back trace*/ \ + res = mscclppSystemError; \ + INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ } while (0); -#define EQCHECK(statement, value) \ - do { \ - if ((statement) == value) { \ - /* Print the back trace*/ \ - INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, mscclppSystemError); \ - return mscclppSystemError; \ - } \ +#define EQCHECK(statement, value) \ + do { \ + if ((statement) == value) { \ + /* Print the back trace*/ \ + INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, mscclppSystemError); \ + return mscclppSystemError; \ + } \ } while (0); -#define EQCHECKGOTO(statement, value, res, label) \ - do { \ - if ((statement) == value) { \ - /* Print the back trace*/ \ - res = mscclppSystemError; \ - INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \ - goto label; \ - } \ +#define EQCHECKGOTO(statement, value, res, label) \ + do { \ + if ((statement) == value) { \ + /* Print the back trace*/ \ + res = mscclppSystemError; \ + INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ } while (0); // Propagate errors up -#define MSCCLPPCHECK(call) \ - do { \ - mscclppResult_t res = call; \ - if (res != mscclppSuccess && res != mscclppInProgress) { \ - /* Print the back trace*/ \ - if (mscclppDebugNoWarn == 0) \ - INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \ - return res; \ - } \ +#define MSCCLPPCHECK(call) \ + do { \ + mscclppResult_t res = call; \ + if (res != mscclppSuccess && res != mscclppInProgress) { \ + /* Print the back trace*/ \ + if (mscclppDebugNoWarn == 0) INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \ + return res; \ + } \ } while (0); -#define MSCCLPPCHECKGOTO(call, res, label) \ - do { \ - res = call; \ - if (res != mscclppSuccess && res != mscclppInProgress) { \ - /* Print the back trace*/ \ - if (mscclppDebugNoWarn == 0) \ - INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); 
\ - goto label; \ - } \ +#define MSCCLPPCHECKGOTO(call, res, label) \ + do { \ + res = call; \ + if (res != mscclppSuccess && res != mscclppInProgress) { \ + /* Print the back trace*/ \ + if (mscclppDebugNoWarn == 0) INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ } while (0); -#define MSCCLPPWAIT(call, cond, abortFlagPtr) \ - do { \ - volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ - mscclppResult_t res = call; \ - if (res != mscclppSuccess && res != mscclppInProgress) { \ - if (mscclppDebugNoWarn == 0) \ - INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \ - return mscclppInternalError; \ - } \ - if (tmpAbortFlag) \ - NEQCHECK(*tmpAbortFlag, 0); \ +#define MSCCLPPWAIT(call, cond, abortFlagPtr) \ + do { \ + volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ + mscclppResult_t res = call; \ + if (res != mscclppSuccess && res != mscclppInProgress) { \ + if (mscclppDebugNoWarn == 0) INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \ + return mscclppInternalError; \ + } \ + if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \ } while (!(cond)); -#define MSCCLPPWAITGOTO(call, cond, abortFlagPtr, res, label) \ - do { \ - volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ - res = call; \ - if (res != mscclppSuccess && res != mscclppInProgress) { \ - if (mscclppDebugNoWarn == 0) \ - INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \ - goto label; \ - } \ - if (tmpAbortFlag) \ - NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \ +#define MSCCLPPWAITGOTO(call, cond, abortFlagPtr, res, label) \ + do { \ + volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ + res = call; \ + if (res != mscclppSuccess && res != mscclppInProgress) { \ + if (mscclppDebugNoWarn == 0) INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ + if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \ } while (!(cond)); -#define MSCCLPPCHECKTHREAD(a, args) \ - do { \ - if (((args)->ret = (a)) != 
mscclppSuccess && (args)->ret != mscclppInProgress) { \ - INFO(MSCCLPP_INIT, "%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \ - return args; \ - } \ +#define MSCCLPPCHECKTHREAD(a, args) \ + do { \ + if (((args)->ret = (a)) != mscclppSuccess && (args)->ret != mscclppInProgress) { \ + INFO(MSCCLPP_INIT, "%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \ + return args; \ + } \ } while (0) -#define CUDACHECKTHREAD(a) \ - do { \ - if ((a) != cudaSuccess) { \ - INFO(MSCCLPP_INIT, "%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ - args->ret = mscclppUnhandledCudaError; \ - return args; \ - } \ +#define CUDACHECKTHREAD(a) \ + do { \ + if ((a) != cudaSuccess) { \ + INFO(MSCCLPP_INIT, "%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ + args->ret = mscclppUnhandledCudaError; \ + return args; \ + } \ } while (0) #endif diff --git a/src/include/checks.hpp b/src/include/checks.hpp index 8332847b..05204bef 100644 --- a/src/include/checks.hpp +++ b/src/include/checks.hpp @@ -7,37 +7,38 @@ #ifndef MSCCLPP_CHECKS_HPP_ #define MSCCLPP_CHECKS_HPP_ -#include "debug.h" -#include - #include #include -#define MSCCLPPTHROW(call) \ - do { \ - mscclppResult_t res = call; \ - if (res != mscclppSuccess && res != mscclppInProgress) { \ - throw mscclpp::Error(std::string("Call to " #call " failed with error code ") + mscclppGetErrorString(res), \ - ErrorCode::InvalidUsage); \ - } \ +#include + +#include "debug.h" + +#define MSCCLPPTHROW(call) \ + do { \ + mscclppResult_t res = call; \ + if (res != mscclppSuccess && res != mscclppInProgress) { \ + throw mscclpp::Error(std::string("Call to " #call " failed with error code ") + mscclppGetErrorString(res), \ + ErrorCode::InvalidUsage); \ + } \ } while (false) -#define CUDATHROW(cmd) \ - do { \ - cudaError_t err = cmd; \ - if (err != cudaSuccess) { \ - throw mscclpp::CudaError(std::string("Cuda failure '") + cudaGetErrorString(err) + "'", err); \ - } \ +#define CUDATHROW(cmd) \ + do { \ + 
cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + throw mscclpp::CudaError(std::string("Cuda failure '") + cudaGetErrorString(err) + "'", err); \ + } \ } while (false) -#define CUTHROW(cmd) \ - do { \ - CUresult err = cmd; \ - if (err != CUDA_SUCCESS) { \ - const char* errStr; \ - cuGetErrorString(err, &errStr); \ - throw mscclpp::CuError(std::string("Cu failure '") + std::string(errStr) + "'", err); \ - } \ +#define CUTHROW(cmd) \ + do { \ + CUresult err = cmd; \ + if (err != CUDA_SUCCESS) { \ + const char* errStr; \ + cuGetErrorString(err, &errStr); \ + throw mscclpp::CuError(std::string("Cu failure '") + std::string(errStr) + "'", err); \ + } \ } while (false) #endif diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index cc464618..eaf05a32 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -1,19 +1,19 @@ #ifndef MSCCL_COMMUNICATOR_HPP_ #define MSCCL_COMMUNICATOR_HPP_ -#include "ib.hpp" -#include "mscclpp.h" +#include #include #include -#include #include +#include "ib.hpp" +#include "mscclpp.h" + namespace mscclpp { class ConnectionBase; -struct Communicator::Impl -{ +struct Communicator::Impl { std::vector> connections_; std::vector> toSetup_; std::unordered_map> ibContexts_; @@ -27,6 +27,6 @@ struct Communicator::Impl IbCtx* getIbContext(Transport ibTransport); }; -} // namespace mscclpp +} // namespace mscclpp -#endif // MSCCL_COMMUNICATOR_HPP_ +#endif // MSCCL_COMMUNICATOR_HPP_ diff --git a/src/include/config.h b/src/include/config.h index 49f8cb57..60fe3e3e 100644 --- a/src/include/config.h +++ b/src/include/config.h @@ -3,16 +3,15 @@ #include -class mscclppConfig -{ -public: +class mscclppConfig { + public: time_t bootstrapConnectionTimeout = 30; static mscclppConfig* getInstance(); time_t getBootstrapConnectionTimeoutConfig(); void setBootstrapConnectionTimeoutConfig(time_t timeout); -private: + private: mscclppConfig() = default; mscclppConfig(const mscclppConfig&) = delete; mscclppConfig& 
operator=(const mscclppConfig&) = delete; @@ -20,4 +19,4 @@ private: static mscclppConfig _instance; }; -#endif // end include guard +#endif // end include guard diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 8d9dd270..8516871a 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -2,34 +2,34 @@ #define MSCCLPP_CONNECTION_HPP_ // TODO(saemal): make this configurable -#define MSCCLPP_POLLING_WAIT 3e7 // in microseconds +#define MSCCLPP_POLLING_WAIT 3e7 // in microseconds + +#include + +#include #include "communicator.hpp" #include "ib.hpp" -#include -#include namespace mscclpp { // TODO: Add functionality to these classes for Communicator to do connectionSetup -class ConnectionBase : public Connection, public Setuppable -{ +class ConnectionBase : public Connection, public Setuppable { int remoteRank_; int tag_; -public: + public: ConnectionBase(int remoteRank, int tag); int remoteRank() override; int tag() override; }; -class CudaIpcConnection : public ConnectionBase -{ +class CudaIpcConnection : public ConnectionBase { cudaStream_t stream; -public: + public: CudaIpcConnection(int remoteRank, int tag); ~CudaIpcConnection(); @@ -44,14 +44,13 @@ public: void flush() override; }; -class IBConnection : public ConnectionBase -{ +class IBConnection : public ConnectionBase { Transport transport_; Transport remoteTransport_; IbQp* qp; int numSignaledSends; -public: + public: IBConnection(int remoteRank, int tag, Transport transport, Communicator::Impl& commImpl); Transport transport() override; @@ -68,6 +67,6 @@ public: void endSetup(std::shared_ptr bootstrap) override; }; -} // namespace mscclpp +} // namespace mscclpp -#endif // MSCCLPP_CONNECTION_HPP_ +#endif // MSCCLPP_CONNECTION_HPP_ diff --git a/src/include/debug.h b/src/include/debug.h index dd548cbb..64b37297 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -7,20 +7,20 @@ #ifndef MSCCLPP_DEBUG_H_ #define MSCCLPP_DEBUG_H_ -#include "mscclpp.h" -#include 
-#include -#include - #include #include +#include #include +#include +#include + +#include "mscclpp.h" + // Conform to pthread and NVTX standard #define MSCCLPP_THREAD_NAMELEN 16 -typedef enum -{ +typedef enum { MSCCLPP_LOG_NONE = 0, MSCCLPP_LOG_VERSION = 1, MSCCLPP_LOG_WARN = 2, @@ -28,8 +28,7 @@ typedef enum MSCCLPP_LOG_ABORT = 4, MSCCLPP_LOG_TRACE = 5 } mscclppDebugLogLevel; -typedef enum -{ +typedef enum { MSCCLPP_INIT = 1, MSCCLPP_COLL = 2, MSCCLPP_P2P = 4, diff --git a/src/include/ib.hpp b/src/include/ib.hpp index 78d31ce6..fea25615 100644 --- a/src/include/ib.hpp +++ b/src/include/ib.hpp @@ -12,22 +12,20 @@ namespace mscclpp { -struct IbMrInfo -{ +struct IbMrInfo { uint64_t addr; uint32_t rkey; }; -class IbMr -{ -public: +class IbMr { + public: ~IbMr(); IbMrInfo getInfo() const; const void* getBuff() const; uint32_t getLkey() const; -private: + private: IbMr(void* pd, void* buff, std::size_t size); void* mr; @@ -38,8 +36,7 @@ private: }; // QP info to be shared with the remote peer -struct IbQpInfo -{ +struct IbQpInfo { uint16_t lid; uint8_t port; uint8_t linkLayer; @@ -48,9 +45,8 @@ struct IbQpInfo int mtu; }; -class IbQp -{ -public: +class IbQp { + public: ~IbQp(); void rtr(const IbQpInfo& info); @@ -66,7 +62,7 @@ public: IbQpInfo& getInfo(); const void* getWc(int idx) const; -private: + private: IbQp(void* ctx, void* pd, int port); IbQpInfo info; @@ -81,9 +77,8 @@ private: friend class IbCtx; }; -class IbCtx -{ -public: +class IbCtx { + public: IbCtx(const std::string& devName); ~IbCtx(); @@ -92,7 +87,7 @@ public: const std::string& getDevName() const; -private: + private: bool isPortUsable(int port) const; int getAnyActivePort() const; @@ -103,6 +98,6 @@ private: std::list> mrs; }; -} // namespace mscclpp +} // namespace mscclpp -#endif // MSCCLPP_IB_HPP_ +#endif // MSCCLPP_IB_HPP_ diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 4789b80f..b74f5a8d 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -12,6 +12,7 @@ #define 
MSCCLPP_PROXY_FIFO_FLUSH_COUNTER 4 #include + #include // #includa @@ -19,8 +20,7 @@ extern "C" { #endif -struct alignas(16) mscclppDevConnSignalEpochId -{ +struct alignas(16) mscclppDevConnSignalEpochId { // every signal(), increaments this and either: // 1) proxy thread pushes it to the remote peer's localSignalEpochId->proxy // 2) gpu thread directly writes it to remoteSignalEpochId->device @@ -93,39 +93,30 @@ using mscclppBufferHandle_t = uint32_t; * The two endpoint can concurrently use the same connection provided they are writing (puts) on different * indices in the registered buffer. **************************************************************************************************************/ -struct mscclppDevConn -{ +struct mscclppDevConn { #ifdef __CUDACC__ - __forceinline__ __device__ void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) - { + __forceinline__ __device__ void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) { fifo.push(mscclppData, dstDataOffset, srcDataOffset, dataSize); } - __forceinline__ __device__ void put(uint64_t dataOffset, uint64_t dataSize) - { - put(dataOffset, dataOffset, dataSize); - } + __forceinline__ __device__ void put(uint64_t dataOffset, uint64_t dataSize) { put(dataOffset, dataOffset, dataSize); } - __forceinline__ __device__ void signal() - { + __forceinline__ __device__ void signal() { epochIncrement(); fifo.push(mscclppFlag, 0, 0, 1); } - __forceinline__ __device__ void putWithSignal(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) - { + __forceinline__ __device__ void putWithSignal(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) { epochIncrement(); fifo.push(mscclppData | mscclppFlag, dstDataOffset, srcDataOffset, dataSize); } - __forceinline__ __device__ void putWithSignal(uint64_t dataOffset, uint64_t dataSize) - { + __forceinline__ __device__ void putWithSignal(uint64_t dataOffset, uint64_t dataSize) { putWithSignal(dataOffset, 
dataOffset, dataSize); } __forceinline__ __device__ void putWithSignalAndFlush(uint64_t dstDataOffset, uint64_t srcDataOffset, - uint64_t dataSize) - { + uint64_t dataSize) { epochIncrement(); uint64_t curFifoHead = fifo.push(mscclppData | mscclppFlag | mscclppSync, dstDataOffset, srcDataOffset, dataSize); while (*(volatile uint64_t*)&fifo.triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0 && @@ -133,13 +124,11 @@ struct mscclppDevConn ; } - __forceinline__ __device__ void putWithSignalAndFlush(uint64_t dataOffset, uint64_t dataSize) - { + __forceinline__ __device__ void putWithSignalAndFlush(uint64_t dataOffset, uint64_t dataSize) { putWithSignalAndFlush(dataOffset, dataOffset, dataSize); } - __forceinline__ __device__ void flush() - { + __forceinline__ __device__ void flush() { uint64_t curFifoHead = fifo.push(mscclppSync, 0, 0, 1); // we need to wait for two conditions to be met to ensure the CPU is done flushing. (1) wait for the tail // to go pass by curFifoHead (this is safety net) and (2) wait for the work element value to change to 0. @@ -148,19 +137,15 @@ struct mscclppDevConn ; } - __forceinline__ __device__ void wait() - { + __forceinline__ __device__ void wait() { (*waitEpochId) += 1; while (*(volatile uint64_t*)&(localSignalEpochId->proxy) < (*waitEpochId)) ; } - __forceinline__ __device__ void epochIncrement() - { - *(volatile uint64_t*)&(localSignalEpochId->device) += 1; - } + __forceinline__ __device__ void epochIncrement() { *(volatile uint64_t*)&(localSignalEpochId->device) += 1; } -#endif // __CUDACC__ +#endif // __CUDACC__ // this is a concurrent fifo which is multiple threads from the device // can produce for and the sole proxy thread consumes it. 
@@ -187,8 +172,7 @@ struct mscclppDevConn }; // Host interface for mscclppDevCon functionality -struct mscclppHostConn -{ +struct mscclppHostConn { virtual ~mscclppHostConn() = default; virtual void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) = 0; virtual void put(mscclppBufferHandle_t dst, uint64_t dstDataOffset, mscclppBufferHandle_t src, uint64_t srcDataOffset, @@ -203,25 +187,21 @@ typedef struct mscclppDevConn mscclppDevConn_t; typedef struct mscclppHostConn mscclppHostConn_t; #define MSCCLPP_UNIQUE_ID_BYTES 128 -typedef struct -{ +typedef struct { char internal[MSCCLPP_UNIQUE_ID_BYTES]; } mscclppUniqueId; -struct mscclppRegisteredMemoryP2P -{ +struct mscclppRegisteredMemoryP2P { void* remoteBuff; const void* IbMr; }; -struct mscclppRegisteredMemory -{ +struct mscclppRegisteredMemory { std::vector p2p; }; /* Error type */ -typedef enum -{ +typedef enum { mscclppSuccess = 0, mscclppUnhandledCudaError = 1, mscclppSystemError = 2, @@ -243,10 +223,9 @@ typedef enum mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* uniqueId); /* Transport Types */ -typedef enum -{ +typedef enum { mscclppTransportP2P = 0, - mscclppTransportSHM = 1, // TODO(chhwang): not implemented yet + mscclppTransportSHM = 1, // TODO(chhwang): not implemented yet mscclppTransportIB = 2, } mscclppTransport_t; @@ -484,7 +463,7 @@ mscclppResult_t mscclppRegisteredBufferWrite(mscclppComm_t comm, mscclppRegister size_t size, uint32_t srcOffset, uint32_t dstOffset, int64_t stream); #ifdef __cplusplus -} // end extern "C" +} // end extern "C" #endif -#endif // MSCCLPP_H_ +#endif // MSCCLPP_H_ diff --git a/src/include/mscclppfifo.h b/src/include/mscclppfifo.h index 341025b5..030220dd 100644 --- a/src/include/mscclppfifo.h +++ b/src/include/mscclppfifo.h @@ -7,12 +7,7 @@ extern "C" { #endif -typedef enum : uint64_t -{ - mscclppData = 0x1, - mscclppFlag = 0x2, - mscclppSync = 0x4 -} mscclppTriggerType_t; +typedef enum : uint64_t { mscclppData = 0x1, mscclppFlag = 0x2, 
mscclppSync = 0x4 } mscclppTriggerType_t; #define MSCCLPP_BITS_SIZE 32 #define MSCCLPP_BITS_OFFSET 32 @@ -23,17 +18,16 @@ typedef enum : uint64_t // the summation of number of bits must be 128 or less union alignas(16) mscclppTrigger { uint64_t value[2]; - struct - { + struct { // first 64 bits: value[0] uint64_t dataSize : MSCCLPP_BITS_SIZE; uint64_t srcDataOffset : MSCCLPP_BITS_OFFSET; - uint64_t : (64 - MSCCLPP_BITS_SIZE - MSCCLPP_BITS_OFFSET); // ensure 64-bit alignment + uint64_t : (64 - MSCCLPP_BITS_SIZE - MSCCLPP_BITS_OFFSET); // ensure 64-bit alignment // second 64 bits: value[1] uint64_t dstDataOffset : MSCCLPP_BITS_OFFSET; uint64_t connId : MSCCLPP_BITS_CONNID; uint64_t type : MSCCLPP_BITS_TYPE; - uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_CONNID - MSCCLPP_BITS_TYPE); // ensure 64-bit alignment + uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_CONNID - MSCCLPP_BITS_TYPE); // ensure 64-bit alignment } fields; }; @@ -52,13 +46,11 @@ typedef mscclppTrigger* mscclppTrigger_t; * Why duplicating the tail is a good idea? The fifo is large engouh and we do not need frequent updates * for the tail as there is usually enough space for device threads to push their work into. */ -struct mscclppConcurrentFifo -{ +struct mscclppConcurrentFifo { #ifdef __CUDACC__ __forceinline__ __device__ uint64_t push(uint64_t type, uint64_t dstDataOffset, uint64_t srcDataOffset, - uint64_t dataSize) - { + uint64_t dataSize) { uint64_t curFifoHead = atomicAdd((unsigned long long int*)this->triggerFifoHead, 1); while (curFifoHead >= MSCCLPP_PROXY_FIFO_SIZE + *((volatile uint64_t*)this->triggerFifoTail)) ; @@ -71,16 +63,16 @@ struct mscclppConcurrentFifo return curFifoHead; } -#endif // __CUDACC__ - mscclppTrigger* triggerFifo; // Allocate on host via cudaHostAlloc. This space is used for pushing the workelements - uint64_t* triggerFifoTail; // Allocated on device. 
proxyState->fifoTailHost is the true tail on host and pused - // occasionally to device - uint64_t* triggerFifoHead; // Allocated on device. Only accessed by device +#endif // __CUDACC__ + mscclppTrigger* triggerFifo; // Allocate on host via cudaHostAlloc. This space is used for pushing the workelements + uint64_t* triggerFifoTail; // Allocated on device. proxyState->fifoTailHost is the true tail on host and pused + // occasionally to device + uint64_t* triggerFifoHead; // Allocated on device. Only accessed by device int connId; }; #ifdef __cplusplus -} // end extern "C" +} // end extern "C" #endif -#endif // MSCCLPPFIFO_H_ +#endif // MSCCLPPFIFO_H_ diff --git a/src/include/proxy.h b/src/include/proxy.h index 5bcb7da5..17e92dfd 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -1,23 +1,23 @@ #ifndef MSCCLPP_PROXY_H_ #define MSCCLPP_PROXY_H_ -#include "comm.h" -#include "mscclpp.h" -#include #include #include -#define MSCCLPP_PROXY_MAX_NUM (MSCCLPP_IB_MAX_DEVS + 1) // One is for a P2P proxy. +#include -typedef enum -{ +#include "comm.h" +#include "mscclpp.h" + +#define MSCCLPP_PROXY_MAX_NUM (MSCCLPP_IB_MAX_DEVS + 1) // One is for a P2P proxy. 
+ +typedef enum { MSCCLPP_PROXY_RUN_STATE_IDLE = 0, MSCCLPP_PROXY_RUN_STATE_RUNNING, MSCCLPP_PROXY_RUN_STATE_EXITING, } mscclppProxyRunState_t; -struct mscclppProxyFifo -{ +struct mscclppProxyFifo { mscclppResult_t create(); mscclppResult_t destroy(); mscclppResult_t poll(mscclppTrigger* trigger); @@ -52,15 +52,14 @@ struct mscclppProxyFifo cudaStream_t stream; }; -struct mscclppProxyState -{ +struct mscclppProxyState { mscclppTransport_t transportType; pthread_t thread; mscclppProxyRunState_t run; int numaNodeToBind; - mscclpp::IbCtx* ibContext; // For IB connection only - cudaStream_t p2pStream; // for P2P DMA engine only + mscclpp::IbCtx* ibContext; // For IB connection only + cudaStream_t p2pStream; // for P2P DMA engine only struct mscclppProxyFifo fifo; }; diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index 7c26f4b4..be32e25a 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -1,37 +1,35 @@ #ifndef MSCCLPP_REGISTERED_MEMORY_HPP_ #define MSCCLPP_REGISTERED_MEMORY_HPP_ -#include "communicator.hpp" +#include + +#include #include + +#include "communicator.hpp" #include "ib.hpp" #include "mscclpp.h" -#include -#include namespace mscclpp { -struct TransportInfo -{ +struct TransportInfo { Transport transport; // TODO: rewrite this using std::variant or something bool ibLocal; union { - struct - { + struct { cudaIpcMemHandle_t cudaIpcBaseHandle; size_t cudaIpcOffsetFromBase; }; - struct - { + struct { const IbMr* ibMr; IbMrInfo ibMrInfo; }; }; }; -struct RegisteredMemory::Impl -{ +struct RegisteredMemory::Impl { void* data; size_t size; int rank; @@ -42,8 +40,7 @@ struct RegisteredMemory::Impl Impl(void* data, size_t size, int rank, TransportFlags transports, Communicator::Impl& commImpl); Impl(const std::vector& data); - TransportInfo& getTransportInfo(Transport transport) - { + TransportInfo& getTransportInfo(Transport transport) { for (auto& entry : transportInfos) { if (entry.transport == 
transport) { return entry; @@ -53,6 +50,6 @@ struct RegisteredMemory::Impl } }; -} // namespace mscclpp +} // namespace mscclpp -#endif // MSCCLPP_REGISTERED_MEMORY_HPP_ +#endif // MSCCLPP_REGISTERED_MEMORY_HPP_ diff --git a/src/include/socket.h b/src/include/socket.h index 53bdd98d..f17f74a8 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -7,7 +7,6 @@ #ifndef MSCCLPP_SOCKET_H_ #define MSCCLPP_SOCKET_H_ -#include "mscclpp.h" #include #include #include @@ -16,9 +15,11 @@ #include #include +#include "mscclpp.h" + #define MAX_IFS 16 #define MAX_IF_NAME_SIZE 16 -#define SLEEP_INT 1000 // connection retry sleep interval in usec +#define SLEEP_INT 1000 // connection retry sleep interval in usec #define SOCKET_NAME_MAXLEN (NI_MAXHOST + NI_MAXSERV) #define MSCCLPP_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL @@ -29,8 +30,7 @@ union mscclppSocketAddress { struct sockaddr_in6 sin6; }; -enum mscclppSocketState -{ +enum mscclppSocketState { mscclppSocketStateNone = 0, mscclppSocketStateInitialized = 1, mscclppSocketStateAccepting = 2, @@ -44,8 +44,7 @@ enum mscclppSocketState mscclppSocketStateNum = 10 }; -enum mscclppSocketType -{ +enum mscclppSocketType { mscclppSocketTypeUnknown = 0, mscclppSocketTypeBootstrap = 1, mscclppSocketTypeProxy = 2, @@ -53,8 +52,7 @@ enum mscclppSocketType mscclppSocketTypeNetIb = 4 }; -struct mscclppSocket -{ +struct mscclppSocket { int fd; int acceptFd; int connectRetries; diff --git a/src/include/utils.h b/src/include/utils.h index 59b35407..07a16684 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -7,10 +7,12 @@ #ifndef MSCCLPP_UTILS_H_ #define MSCCLPP_UTILS_H_ +#include + +#include + #include "alloc.h" #include "mscclpp.h" -#include -#include // int mscclppCudaCompCap(); @@ -27,8 +29,7 @@ uint64_t getHostHash(); uint64_t getPidHash(); mscclppResult_t getRandomData(void* buffer, size_t bytes); -struct netIf -{ +struct netIf { char prefix[64]; int port; }; @@ -36,11 +37,9 @@ struct netIf int parseStringList(const char* 
string, struct netIf* ifList, int maxList); bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); -static long log2i(long n) -{ +static long log2i(long n) { long l = 0; - while (n >>= 1) - l++; + while (n >>= 1) l++; return l; } @@ -50,16 +49,13 @@ int64_t elapsedClock(mscclppTime_t start, mscclppTime_t end); /* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else * return -1 */ -inline mscclppResult_t getRandomData(void* buffer, size_t bytes) -{ +inline mscclppResult_t getRandomData(void* buffer, size_t bytes) { mscclppResult_t ret = mscclppSuccess; if (bytes > 0) { const size_t one = 1UL; FILE* fp = fopen("/dev/urandom", "r"); - if (buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one) - ret = mscclppSystemError; - if (fp) - fclose(fp); + if (buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one) ret = mscclppSystemError; + if (fp) fclose(fp); } return ret; } diff --git a/src/include/utils.hpp b/src/include/utils.hpp index d1a1c7d8..536d1d29 100644 --- a/src/include/utils.hpp +++ b/src/include/utils.hpp @@ -1,54 +1,40 @@ #ifndef MSCCLPP_UTILS_HPP_ #define MSCCLPP_UTILS_HPP_ -#include #include +#include + namespace mscclpp { -struct Timer -{ +struct Timer { std::chrono::steady_clock::time_point start; - Timer() - { - start = std::chrono::steady_clock::now(); - } + Timer() { start = std::chrono::steady_clock::now(); } - int64_t elapsed() - { + int64_t elapsed() { auto end = std::chrono::steady_clock::now(); return std::chrono::duration_cast(end - start).count(); } - void reset() - { - start = std::chrono::steady_clock::now(); - } + void reset() { start = std::chrono::steady_clock::now(); } - void print(const char* name) - { + void print(const char* name) { auto end = std::chrono::steady_clock::now(); auto elapsed = std::chrono::duration_cast(end - start).count(); printf("%s: %ld us\n", name, elapsed); } }; -struct ScopedTimer -{ +struct ScopedTimer { Timer timer; 
const char* name; - ScopedTimer(const char* name) : name(name) - { - } + ScopedTimer(const char* name) : name(name) {} - ~ScopedTimer() - { - timer.print(name); - } + ~ScopedTimer() { timer.print(name); } }; -} // namespace mscclpp +} // namespace mscclpp -#endif // MSCCLPP_UTILS_HPP_ +#endif // MSCCLPP_UTILS_HPP_ diff --git a/src/npkit/npkit.cc b/src/npkit/npkit.cc index e7fe78f8..49ee7a12 100644 --- a/src/npkit/npkit.cc +++ b/src/npkit/npkit.cc @@ -1,10 +1,12 @@ -#include -#include +#include "npkit.h" + +#include #include +#include +#include + #include "alloc.h" -#include "npkit.h" -#include uint64_t NpKit::rank_ = 0; @@ -16,8 +18,7 @@ NpKitEventCollectContext* NpKit::cpu_collect_contexts_ = nullptr; uint64_t NpKit::cpu_base_system_timestamp_ = 0; uint64_t NpKit::cpu_base_steady_timestamp_ = 0; -mscclppResult_t NpKit::Init(int rank) -{ +mscclppResult_t NpKit::Init(int rank) { uint64_t i = 0; NpKitEventCollectContext ctx; ctx.event_buffer_head = 0; @@ -47,8 +48,7 @@ mscclppResult_t NpKit::Init(int rank) return mscclppSuccess; } -mscclppResult_t NpKit::Dump(const std::string& dump_dir) -{ +mscclppResult_t NpKit::Dump(const std::string& dump_dir) { uint64_t i = 0; std::string dump_file_path; @@ -113,8 +113,7 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) return mscclppSuccess; } -mscclppResult_t NpKit::Shutdown() -{ +mscclppResult_t NpKit::Shutdown() { uint64_t i = 0; // Free CPU event data structures @@ -134,13 +133,9 @@ mscclppResult_t NpKit::Shutdown() return mscclppSuccess; } -NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() -{ - return gpu_collect_contexts_; -} +NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() { return gpu_collect_contexts_; } -void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id) -{ +void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id) { uint64_t event_buffer_head = 
cpu_collect_contexts_[channel_id].event_buffer_head; if (event_buffer_head < kMaxNumCpuEventsPerBuffer) { NpKitEvent& event = cpu_collect_contexts_[channel_id].event_buffer[event_buffer_head]; @@ -152,8 +147,7 @@ void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t } } -uint64_t NpKit::GetCpuTimestamp() -{ +uint64_t NpKit::GetCpuTimestamp() { uint64_t cpu_curr_steady_timestamp_ = std::chrono::steady_clock::now().time_since_epoch().count(); return cpu_base_steady_timestamp_ + (cpu_curr_steady_timestamp_ - cpu_base_steady_timestamp_); } diff --git a/src/npkit/npkit.h b/src/npkit/npkit.h index c0cc4710..c15bb812 100644 --- a/src/npkit/npkit.h +++ b/src/npkit/npkit.h @@ -3,12 +3,12 @@ #include +#include "mscclpp.h" #include "npkit_event.h" #include "npkit_struct.h" -class NpKit -{ -public: +class NpKit { + public: static const uint64_t kNumGpuEventBuffers = 512; static const uint64_t kNumCpuEventBuffers = 32; @@ -21,9 +21,9 @@ public: static NpKitEventCollectContext* GetGpuEventCollectContexts(); +#ifdef __CUDACC__ static inline __device__ void CollectGpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, - NpKitEventCollectContext* ctx) - { + NpKitEventCollectContext* ctx) { uint64_t event_buffer_head = ctx->event_buffer_head; if (event_buffer_head < kMaxNumGpuEventsPerBuffer) { NpKitEvent& event = ctx->event_buffer[event_buffer_head]; @@ -34,12 +34,13 @@ public: ctx->event_buffer_head++; } } +#endif // __CUDACC__ static void CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id); static uint64_t GetCpuTimestamp(); -private: + private: // 64K * 512 * 16B = 512MB per GPU static const uint64_t kMaxNumGpuEventsPerBuffer = 1ULL << 16; diff --git a/src/npkit/npkit_struct.h b/src/npkit/npkit_struct.h index 2fc19821..a18e8798 100644 --- a/src/npkit/npkit_struct.h +++ b/src/npkit/npkit_struct.h @@ -7,8 +7,7 @@ union NpKitEvent { uint64_t bits[2]; - struct - { + struct { uint64_t type : 8; 
uint64_t size : 32; uint64_t rsvd : 24; @@ -16,8 +15,7 @@ union NpKitEvent { } fields; }; -struct NpKitEventCollectContext -{ +struct NpKitEventCollectContext { NpKitEvent* event_buffer; uint64_t event_buffer_head; }; diff --git a/src/proxy.cc b/src/proxy.cc index 060bbfb0..8a066279 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -1,10 +1,11 @@ -#include "api.h" +#include #include #include +#include + +#include "api.h" #include "utils.h" #include "utils.hpp" -#include -#include namespace mscclpp { @@ -12,8 +13,7 @@ const int ProxyStopCheckPeriod = 1000; const int ProxyFlushPeriod = 4; -struct Proxy::Impl -{ +struct Proxy::Impl { ProxyHandler handler; std::function threadInit; HostProxyFifo fifo; @@ -21,29 +21,22 @@ struct Proxy::Impl std::atomic_bool running; Impl(ProxyHandler handler, std::function threadInit) - : handler(handler), threadInit(threadInit), running(false) - { - } + : handler(handler), threadInit(threadInit), running(false) {} }; -MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler, std::function threadInit) -{ +MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler, std::function threadInit) { pimpl = std::make_unique(handler, threadInit); } -MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler) : Proxy(handler, [] {}) -{ -} +MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler) : Proxy(handler, [] {}) {} -MSCCLPP_API_CPP Proxy::~Proxy() -{ +MSCCLPP_API_CPP Proxy::~Proxy() { if (pimpl) { stop(); } } -MSCCLPP_API_CPP void Proxy::start() -{ +MSCCLPP_API_CPP void Proxy::start() { pimpl->running = true; pimpl->service = std::thread([this] { pimpl->threadInit(); @@ -64,8 +57,8 @@ MSCCLPP_API_CPP void Proxy::start() } // Poll to see if we are ready to send anything fifo.poll(&trigger); - if (trigger.fst == 0) { // TODO: this check is a potential pitfall for custom triggers - continue; // there is one in progress + if (trigger.fst == 0) { // TODO: this check is a potential pitfall for custom triggers + continue; // there is one in progress } ProxyHandlerResult result = 
handler(trigger); @@ -96,17 +89,13 @@ MSCCLPP_API_CPP void Proxy::start() }); } -MSCCLPP_API_CPP void Proxy::stop() -{ +MSCCLPP_API_CPP void Proxy::stop() { pimpl->running = false; if (pimpl->service.joinable()) { pimpl->service.join(); } } -MSCCLPP_API_CPP HostProxyFifo& Proxy::fifo() -{ - return pimpl->fifo; -} +MSCCLPP_API_CPP HostProxyFifo& Proxy::fifo() { return pimpl->fifo; } -} // namespace mscclpp +} // namespace mscclpp diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 603caa67..4781ba61 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -1,22 +1,24 @@ #include "registered_memory.hpp" + +#include + +#include + #include "api.h" #include "checks.hpp" #include "utils.h" -#include -#include namespace mscclpp { RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags transports, Communicator::Impl& commImpl) - : data(data), size(size), rank(rank), hostHash(commImpl.rankToHash_.at(rank)), transports(transports) -{ + : data(data), size(size), rank(rank), hostHash(commImpl.rankToHash_.at(rank)), transports(transports) { if (transports.has(Transport::CudaIpc)) { TransportInfo transportInfo; transportInfo.transport = Transport::CudaIpc; cudaIpcMemHandle_t handle; void* baseDataPtr; - size_t baseDataSize; // dummy + size_t baseDataSize; // dummy CUTHROW(cuMemGetAddressRange((CUdeviceptr*)&baseDataPtr, &baseDataSize, (CUdeviceptr)data)); CUDATHROW(cudaIpcGetMemHandle(&handle, baseDataPtr)); // TODO: bug with offset of base? 
@@ -35,53 +37,30 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, int rank, TransportFlags t this->transportInfos.push_back(transportInfo); INFO(MSCCLPP_NET, "IB mr for address %p with size %ld is registered", data, size); }; - if (transports.has(Transport::IB0)) - addIb(Transport::IB0); - if (transports.has(Transport::IB1)) - addIb(Transport::IB1); - if (transports.has(Transport::IB2)) - addIb(Transport::IB2); - if (transports.has(Transport::IB3)) - addIb(Transport::IB3); - if (transports.has(Transport::IB4)) - addIb(Transport::IB4); - if (transports.has(Transport::IB5)) - addIb(Transport::IB5); - if (transports.has(Transport::IB6)) - addIb(Transport::IB6); - if (transports.has(Transport::IB7)) - addIb(Transport::IB7); + if (transports.has(Transport::IB0)) addIb(Transport::IB0); + if (transports.has(Transport::IB1)) addIb(Transport::IB1); + if (transports.has(Transport::IB2)) addIb(Transport::IB2); + if (transports.has(Transport::IB3)) addIb(Transport::IB3); + if (transports.has(Transport::IB4)) addIb(Transport::IB4); + if (transports.has(Transport::IB5)) addIb(Transport::IB5); + if (transports.has(Transport::IB6)) addIb(Transport::IB6); + if (transports.has(Transport::IB7)) addIb(Transport::IB7); } } -MSCCLPP_API_CPP RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : pimpl(pimpl) -{ -} +MSCCLPP_API_CPP RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : pimpl(pimpl) {} MSCCLPP_API_CPP RegisteredMemory::~RegisteredMemory() = default; -MSCCLPP_API_CPP void* RegisteredMemory::data() -{ - return pimpl->data; -} +MSCCLPP_API_CPP void* RegisteredMemory::data() { return pimpl->data; } -MSCCLPP_API_CPP size_t RegisteredMemory::size() -{ - return pimpl->size; -} +MSCCLPP_API_CPP size_t RegisteredMemory::size() { return pimpl->size; } -MSCCLPP_API_CPP int RegisteredMemory::rank() -{ - return pimpl->rank; -} +MSCCLPP_API_CPP int RegisteredMemory::rank() { return pimpl->rank; } -MSCCLPP_API_CPP TransportFlags RegisteredMemory::transports() -{ - 
return pimpl->transports; -} +MSCCLPP_API_CPP TransportFlags RegisteredMemory::transports() { return pimpl->transports; } -MSCCLPP_API_CPP std::vector RegisteredMemory::serialize() -{ +MSCCLPP_API_CPP std::vector RegisteredMemory::serialize() { std::vector result; std::copy_n(reinterpret_cast(&pimpl->size), sizeof(pimpl->size), std::back_inserter(result)); std::copy_n(reinterpret_cast(&pimpl->rank), sizeof(pimpl->rank), std::back_inserter(result)); @@ -108,13 +87,11 @@ MSCCLPP_API_CPP std::vector RegisteredMemory::serialize() return result; } -MSCCLPP_API_CPP RegisteredMemory RegisteredMemory::deserialize(const std::vector& data) -{ +MSCCLPP_API_CPP RegisteredMemory RegisteredMemory::deserialize(const std::vector& data) { return RegisteredMemory(std::make_shared(data)); } -RegisteredMemory::Impl::Impl(const std::vector& serialization) -{ +RegisteredMemory::Impl::Impl(const std::vector& serialization) { auto it = serialization.begin(); std::copy_n(it, sizeof(this->size), reinterpret_cast(&this->size)); it += sizeof(this->size); @@ -163,4 +140,4 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) } } -} // namespace mscclpp +} // namespace mscclpp diff --git a/src/utils.cc b/src/utils.cc index d3957bb1..6e9e1970 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -6,9 +6,10 @@ #include "utils.h" -#include #include #include + +#include #include // Get current Compute Capability @@ -21,20 +22,17 @@ // return ccMajor*10+ccMinor; // } -mscclppResult_t int64ToBusId(int64_t id, char* busId) -{ +mscclppResult_t int64ToBusId(int64_t id, char* busId) { sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf)); return mscclppSuccess; } -mscclppResult_t busIdToInt64(const char* busId, int64_t* id) -{ - char hexStr[17]; // Longest possible int64 hex string + null terminator. +mscclppResult_t busIdToInt64(const char* busId, int64_t* id) { + char hexStr[17]; // Longest possible int64 hex string + null terminator. 
int hexOffset = 0; for (int i = 0; hexOffset < sizeof(hexStr) - 1; i++) { char c = busId[i]; - if (c == '.' || c == ':') - continue; + if (c == '.' || c == ':') continue; if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) { hexStr[hexOffset++] = busId[i]; } else @@ -46,8 +44,7 @@ mscclppResult_t busIdToInt64(const char* busId, int64_t* id) } // Convert a logical cudaDev index to the NVML device minor number -mscclppResult_t getBusId(int cudaDev, std::string* busId) -{ +mscclppResult_t getBusId(int cudaDev, std::string* busId) { // On most systems, the PCI bus ID comes back as in the 0000:00:00.0 // format. Still need to allocate proper space in case PCI domain goes // higher. @@ -61,8 +58,7 @@ mscclppResult_t getBusId(int cudaDev, std::string* busId) return mscclppSuccess; } -mscclppResult_t getDeviceNumaNode(int cudaDev, int* numaNode) -{ +mscclppResult_t getDeviceNumaNode(int cudaDev, int* numaNode) { std::string busId; MSCCLPPCHECK(getBusId(cudaDev, &busId)); @@ -81,21 +77,18 @@ mscclppResult_t getDeviceNumaNode(int cudaDev, int* numaNode) return mscclppSuccess; } -mscclppResult_t getHostName(char* hostname, int maxlen, const char delim) -{ +mscclppResult_t getHostName(char* hostname, int maxlen, const char delim) { if (gethostname(hostname, maxlen) != 0) { strncpy(hostname, "unknown", maxlen); return mscclppSystemError; } int i = 0; - while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen - 1)) - i++; + while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen - 1)) i++; hostname[i] = '\0'; return mscclppSuccess; } -uint64_t getHash(const char* string, int n) -{ +uint64_t getHash(const char* string, int n) { // Based on DJB2a, result = result * 33 ^ char uint64_t result = 5381; for (int c = 0; c < n; c++) { @@ -113,8 +106,7 @@ uint64_t getHash(const char* string, int n) * This string can be overridden by using the MSCCLPP_HOSTID env var. 
*/ #define HOSTID_FILE "/proc/sys/kernel/random/boot_id" -uint64_t computeHostHash(void) -{ +uint64_t computeHostHash(void) { char hostHash[1024]; char* hostId; @@ -145,8 +137,7 @@ uint64_t computeHostHash(void) return getHash(hostHash, strlen(hostHash)); } -uint64_t getHostHash(void) -{ +uint64_t getHostHash(void) { thread_local std::unique_ptr hostHash = std::make_unique(computeHostHash()); return *hostHash; } @@ -157,15 +148,13 @@ uint64_t getHostHash(void) * * $$ $(readlink /proc/self/ns/pid) */ -uint64_t getPidHash(void) -{ +uint64_t getPidHash(void) { char pname[1024]; // Start off with our pid ($$) sprintf(pname, "%ld", (long)getpid()); int plen = strlen(pname); int len = readlink("/proc/self/ns/pid", pname + plen, sizeof(pname) - 1 - plen); - if (len < 0) - len = 0; + if (len < 0) len = 0; pname[plen + len] = '\0'; TRACE(MSCCLPP_INIT, "unique PID '%s'", pname); @@ -173,10 +162,8 @@ uint64_t getPidHash(void) return getHash(pname, strlen(pname)); } -int parseStringList(const char* string, struct netIf* ifList, int maxList) -{ - if (!string) - return 0; +int parseStringList(const char* string, struct netIf* ifList, int maxList) { + if (!string) return 0; const char* ptr = string; @@ -192,8 +179,7 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) ifNum++; ifC = 0; } - while (c != ',' && c != '\0') - c = *(++ptr); + while (c != ',' && c != '\0') c = *(++ptr); } else if (c == ',' || c == '\0') { if (ifC > 0) { ifList[ifNum].prefix[ifC] = '\0'; @@ -210,29 +196,22 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) return ifNum; } -static bool matchIf(const char* string, const char* ref, bool matchExact) -{ +static bool matchIf(const char* string, const char* ref, bool matchExact) { // Make sure to include '\0' in the exact case int matchLen = matchExact ? 
strlen(string) + 1 : strlen(ref); return strncmp(string, ref, matchLen) == 0; } -static bool matchPort(const int port1, const int port2) -{ - if (port1 == -1) - return true; - if (port2 == -1) - return true; - if (port1 == port2) - return true; +static bool matchPort(const int port1, const int port2) { + if (port1 == -1) return true; + if (port2 == -1) return true; + if (port1 == port2) return true; return false; } -bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) -{ +bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) { // Make an exception for the case where no user list is defined - if (listSize == 0) - return true; + if (listSize == 0) return true; for (int i = 0; i < listSize; i++) { if (matchIf(string, ifList[i].prefix, matchExact) && matchPort(port, ifList[i].port)) { @@ -242,8 +221,7 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz return false; } -mscclppResult_t numaBind(int node) -{ +mscclppResult_t numaBind(int node) { int totalNumNumaNodes = numa_num_configured_nodes(); if (node < 0 || node >= totalNumNumaNodes) { WARN("Invalid NUMA node %d, must be between 0 and %d", node, totalNumNumaNodes); @@ -256,9 +234,7 @@ mscclppResult_t numaBind(int node) return mscclppSuccess; } -mscclppResult_t getNumaState(mscclppNumaState* state) -{ - +mscclppResult_t getNumaState(mscclppNumaState* state) { mscclppNumaState state_ = numa_get_run_node_mask(); if (state_ == NULL) { WARN("Failed to get NUMA node mask of the running process"); @@ -268,8 +244,7 @@ mscclppResult_t getNumaState(mscclppNumaState* state) return mscclppSuccess; } -mscclppResult_t setNumaState(mscclppNumaState state) -{ +mscclppResult_t setNumaState(mscclppNumaState state) { if (state == NULL) { WARN("Invalid NUMA state"); return mscclppInvalidUsage; @@ -278,12 +253,8 @@ mscclppResult_t setNumaState(mscclppNumaState state) return mscclppSuccess; } -mscclppTime_t 
getClock() -{ - return std::chrono::steady_clock::now(); -} +mscclppTime_t getClock() { return std::chrono::steady_clock::now(); } -int64_t elapsedClock(mscclppTime_t start, mscclppTime_t end) -{ +int64_t elapsedClock(mscclppTime_t start, mscclppTime_t end) { return std::chrono::duration_cast(end - start).count(); } From 96a0c45fb4cac0b8a1c611812c482502b9cc130f Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Thu, 11 May 2023 00:23:21 +0000 Subject: [PATCH 125/135] Remove makefile --- Makefile | 238 ------------------------------------------------------- 1 file changed, 238 deletions(-) delete mode 100644 Makefile diff --git a/Makefile b/Makefile deleted file mode 100644 index 74d2c475..00000000 --- a/Makefile +++ /dev/null @@ -1,238 +0,0 @@ -######## VERSION -MSCCLPP_MAJOR := 0 -MSCCLPP_MINOR := 1 -MSCCLPP_PATCH := 0 - -######## COMPILE OPTIONS -DEBUG ?= 0 -VERBOSE ?= 1 -TRACE ?= 0 -NPKIT ?= 0 -GDRCOPY ?= 0 -USE_MPI_FOR_TESTS ?= 1 - -######## CUDA -CUDA_HOME ?= /usr/local/cuda -CUDA_LIB ?= $(CUDA_HOME)/lib64 -CUDA_INC ?= $(CUDA_HOME)/include -NVCC = $(CUDA_HOME)/bin/nvcc -CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) -CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) -CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) -# You should define NVCC_GENCODE in your environment to the minimal set -# of archs to reduce compile time. 
-CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_61,code=sm_61 -CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 -CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 -CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90 - -CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 -CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 -CUDA11_PTX = -gencode=arch=compute_80,code=compute_80 -CUDA12_PTX = -gencode=arch=compute_90,code=compute_90 - -######## CXX/NVCC -CXX := g++ -NVTX ?= 1 - -ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) -# Include Hopper support if we're using CUDA11.8 or above - NVCC_GENCODE ?= $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX) -else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) - NVCC_GENCODE ?= $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX) -# Include Volta support if we're using CUDA9 or above -else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0) - NVCC_GENCODE ?= $(CUDA9_GENCODE) $(CUDA9_PTX) -else - NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX) -endif -$(info NVCC_GENCODE is ${NVCC_GENCODE}) - -CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \ - -Wall -Wno-unused-function -Wno-sign-compare -std=c++14 -Wvla \ - -I $(CUDA_INC) \ - $(CXXFLAGS) - -ifneq ($(TRACE), 0) -CXXFLAGS += -DENABLE_TRACE -endif - -NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xfatbin -compress-all -# Use addprefix so that we can specify more than one path -NVLDFLAGS := -L$(CUDA_LIB) -lcudart -lrt -lcuda - -ifeq ($(DEBUG), 0) -NVCUFLAGS += -O3 -CXXFLAGS += -O3 -g -else -NVCUFLAGS += -O0 -G -g -CXXFLAGS += -O0 -g -ggdb3 -endif - -ifneq ($(VERBOSE), 0) -NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -CXXFLAGS += -Wall -Wextra -else -.SILENT: -endif - -ifeq ($(NVTX), 0) -CXXFLAGS += -DNVTX_DISABLE 
-endif - -#### MPI (only for test code) -ifeq ($(USE_MPI_FOR_TESTS), 1) -MPI_HOME ?= /usr/local/mpi -MPI_INC := -I$(MPI_HOME)/include -MPI_LDFLAGS := -L$(MPI_HOME)/lib -lmpi -MPI_MACRO := -D MSCCLPP_USE_MPI_FOR_TESTS -else -MPI_HOME := -MPI_INC := -MPI_LDFLAGS := -MPI_MACRO := -endif - -#### GDRCOPY -ifeq ($(GDRCOPY), 1) -GDRCOPY_LDFLAGS := -lgdrapi -CXXFLAGS += -DMSCCLPP_USE_GDRCOPY -NVCUFLAGS += -DMSCCLPP_USE_GDRCOPY -else -GDRCOPY_LDFLAGS := -endif - -#### MSCCL++ -BUILDDIR ?= $(abspath ./build) -INCDIR := include -LIBDIR := lib -OBJDIR := obj -BINDIR := bin - -ifneq ($(NPKIT), 0) -CXXFLAGS += -DENABLE_NPKIT -NVCUFLAGS += -DENABLE_NPKIT -endif - -LDFLAGS := $(NVLDFLAGS) $(GDRCOPY_LDFLAGS) -libverbs -lnuma - -LIBSRCS := $(addprefix src/,debug.cc utils.cc init.cc proxy.cc ib.cc config.cc) -LIBSRCS += $(addprefix src/bootstrap/,bootstrap.cc socket.cc) -LIBSRCS += $(addprefix src/,communicator.cc connection.cc registered_memory.cc) -LIBSRCS += $(addprefix src/,epoch.cc proxy_cpp.cc fifo.cc channel.cc errors.cc) -ifneq ($(NPKIT), 0) -LIBSRCS += $(addprefix src/misc/,npkit.cc) -endif -ifeq ($(GDRCOPY), 1) -LIBSRCS += $(addprefix src/,gdr.cc) -endif -LIBOBJS := $(patsubst %.cc,%.o,$(LIBSRCS)) -LIBOBJTARGETS := $(LIBOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) - -HEADERS := $(wildcard src/include/*.h) -CPPSOURCES := $(shell find ./ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)' -not -path "./build/*" -not -path "./python/*") -PYTHONCPPSOURCES := $(shell find ./python/src/ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)') - -INCEXPORTS := mscclpp.h mscclppfifo.h mscclpp.hpp mscclppfifo.hpp epoch.hpp errors.hpp -INCTARGETS := $(INCEXPORTS:%=$(BUILDDIR)/$(INCDIR)/%) - -LIBNAME := libmscclpp.so -LIBSONAME := $(LIBNAME).$(MSCCLPP_MAJOR) -LIBTARGET := $(BUILDDIR)/$(LIBDIR)/$(LIBNAME).$(MSCCLPP_MAJOR).$(MSCCLPP_MINOR).$(MSCCLPP_PATCH) - -UTDIR := tests/unittests -UTSRCS := $(addprefix $(UTDIR)/,ib_test.cc) -UTOBJS := $(patsubst %.cc,%.o,$(UTSRCS)) 
-UTOBJTARGETS := $(UTOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) -UTBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(UTOBJS)) - -TESTSDIR := tests -TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test_standalone.cu communicator_test_cpp.cu bootstrap_test_cpp.cc allgather_test_cpp.cu) -TESTSOBJS := $(patsubst %.cc,%.o,$(TESTSSRCS)) $(patsubst %.cu,%.o,$(TESTSSRCS)) -TESTSOBJTARGETS := $(TESTSOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) -TESTSBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(TESTSOBJS)) - -MSCLLPPTESTSOBJSDIR:= $(BUILDDIR)/$(OBJDIR)/$(TESTSDIR) -MSCLLPPTESTBINFILESLIST := allgather_test -MSCLLPPTESTBINS := $(MSCLLPPTESTBINFILESLIST:%=$(BUILDDIR)/$(BINDIR)/$(TESTSDIR)/%_perf) - -INCLUDE := -Isrc -Isrc/include - -.PHONY: all build lib unittests tests mscclpp-test cpplint cpplint-autofix cpplint-file-autofix clean - -all: build - -build: lib tests -ifeq ($(USE_MPI_FOR_TESTS), 0) -build += mscclpp-test -endif - -lib: $(LIBOBJTARGETS) $(INCTARGETS) $(LIBTARGET) - -unittests: $(UTBINS) - -tests: unittests $(TESTSBINS) - -mscclpp-test: $(LIBTARGET) $(MSCLLPPTESTBINS) - -cpplint: - clang-format-12 -style=file --verbose --Werror --dry-run $(CPPSOURCES) - clang-format-12 --dry-run $(CPPSOURCES) - -cpplint-autofix: - clang-format-12 -style=file --verbose --Werror -i $(CPPSOURCES) - clang-format-12 -i $(PYTHONCPPSOURCES) - -# Run cpplint on a single file, example: make cpplint-file-autofix INPUTFILE=src/bootstrap/bootstrap.cc -cpplint-file-autofix: - clang-format-12 -style=file --verbose --Werror -i $(INPUTFILE) - -# Compile libobjs -$(BUILDDIR)/$(OBJDIR)/%.o: %.cc $(HEADERS) - @mkdir -p $(@D) - $(CXX) -o $@ $(INCLUDE) $(CXXFLAGS) -c $< - -# Compile utobjs -$(BUILDDIR)/$(OBJDIR)/$(UTDIR)/%.o: $(UTDIR)/%.cc $(HEADERS) - @mkdir -p $(@D) - $(CXX) -o $@ $(INCLUDE) $(CXXFLAGS) -c $< - -$(BUILDDIR)/$(INCDIR)/%: src/$(INCDIR)/% - @mkdir -p $(@D) - cp $< $@ - -$(LIBTARGET): $(LIBOBJTARGETS) - @mkdir -p $(@D) - $(CXX) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $^ 
$(CXXFLAGS) $(LDFLAGS) - ln -sf $(LIBTARGET) $(BUILDDIR)/$(LIBDIR)/$(LIBNAME) - ln -sf $(LIBTARGET) $(BUILDDIR)/$(LIBDIR)/$(LIBSONAME) - -# UT bins -$(BUILDDIR)/$(BINDIR)/$(UTDIR)/%: $(BUILDDIR)/$(OBJDIR)/$(UTDIR)/%.o $(LIBOBJTARGETS) - @mkdir -p $(@D) - $(NVCC) -o $@ $+ $(MPI_LDFLAGS) $(LDFLAGS) - -# Compile .cc tests -$(BUILDDIR)/$(OBJDIR)/$(TESTSDIR)/%.o: $(TESTSDIR)/%.cc $(INCTARGETS) - @mkdir -p $(@D) - $(CXX) -o $@ -I$(BUILDDIR)/$(INCDIR) $(MPI_INC) $(CXXFLAGS) -c $< $(MPI_MACRO) - -# Compile .cu tests -$(BUILDDIR)/$(OBJDIR)/$(TESTSDIR)/%.o: $(TESTSDIR)/%.cu $(INCTARGETS) - @mkdir -p $(@D) - $(NVCC) -o $@ -I$(BUILDDIR)/$(INCDIR) $(MPI_INC) $(NVCUFLAGS) $(INCLUDE) -c $< $(MPI_MACRO) - -# Test bins -$(BUILDDIR)/$(BINDIR)/$(TESTSDIR)/%: $(BUILDDIR)/$(OBJDIR)/$(TESTSDIR)/%.o $(LIBTARGET) - @mkdir -p $(@D) - $(NVCC) -o $@ $< $(MPI_LDFLAGS) -L$(BUILDDIR)/$(LIBDIR) -lmscclpp - -# Compile mscclpp_test -$(BUILDDIR)/$(BINDIR)/$(TESTSDIR)/%_perf: $(MSCLLPPTESTSOBJSDIR)/%.o $(MSCLLPPTESTSOBJSDIR)/common.o - @mkdir -p $(@D) - $(NVCC) -o $@ $^ $(MPI_LDFLAGS) -L$(BUILDDIR)/$(LIBDIR) -lmscclpp - -clean: - rm -rf $(BUILDDIR) From 785a973ace091c57a4d20b0e876901b262636f2e Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 11 May 2023 08:25:25 +0000 Subject: [PATCH 126/135] refine exception --- CMakeLists.txt | 1 + include/mscclpp/errors.hpp | 14 ++++++++++-- src/errors.cc | 39 ++++++++++++++++++++++++++++----- src/include/checks.hpp | 44 +++++++++++++++++++------------------- 4 files changed, 69 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5470cd32..5f8ddeb2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ if(ALLOW_GDRCOPY) find_package(GDRCopy) endif() +include_directories(${CUDAToolkit_INCLUDE_DIRS}) include(CTest) include(FetchContent) FetchContent_Declare(googletest URL https://github.com/google/googletest/archive/b796f7d44681514f58a683a3a71ff17c94edb0c1.zip) diff --git a/include/mscclpp/errors.hpp 
b/include/mscclpp/errors.hpp index 3497f783..cdaf9ed8 100644 --- a/include/mscclpp/errors.hpp +++ b/include/mscclpp/errors.hpp @@ -1,6 +1,9 @@ #ifndef MSCCLPP_ERRORS_HPP_ #define MSCCLPP_ERRORS_HPP_ +#include +#include + #include namespace mscclpp { @@ -11,14 +14,21 @@ enum class ErrorCode { InvalidUsage, }; +std::string errorToString(enum ErrorCode error); + class BaseError : public std::runtime_error { public: BaseError(std::string message, int errorCode); + explicit BaseError(int errorCode); virtual ~BaseError() = default; int getErrorCode() const; + const char* what() const noexcept override; private: int errorCode_; + + protected: + std::string message_; }; class Error : public BaseError { @@ -29,13 +39,13 @@ class Error : public BaseError { class CudaError : public BaseError { public: - CudaError(std::string message, int errorCode); + CudaError(std::string message, cudaError_t errorCode); virtual ~CudaError() = default; }; class CuError : public BaseError { public: - CuError(std::string message, int errorCode); + CuError(std::string message, CUresult errorCode); virtual ~CuError() = default; }; diff --git a/src/errors.cc b/src/errors.cc index c32accf8..50d7a2ef 100644 --- a/src/errors.cc +++ b/src/errors.cc @@ -1,19 +1,48 @@ +#include #include #include "api.h" namespace mscclpp { -BaseError::BaseError(std::string message, int errorCode) : std::runtime_error(message), errorCode_(errorCode) {} +std::string errorToString(enum ErrorCode error) { + switch (error) { + case ErrorCode::SystemError: + return "SystemError"; + case ErrorCode::InternalError: + return "InternalError"; + case ErrorCode::InvalidUsage: + return "InvalidUsage"; + default: + return "UnknownError"; + } +} + +BaseError::BaseError(std::string message, int errorCode) + : std::runtime_error(""), message_(message), errorCode_(errorCode) {} + +BaseError::BaseError(int errorCode) : std::runtime_error(""), errorCode_(errorCode) {} int BaseError::getErrorCode() const { return errorCode_; } 
-MSCCLPP_API_CPP Error::Error(std::string message, ErrorCode errorCode) : BaseError(message, -1) {} +const char* BaseError::what() const noexcept { return message_.c_str(); } -MSCCLPP_API_CPP CudaError::CudaError(std::string message, int errorCode) : BaseError(message, errorCode) {} +MSCCLPP_API_CPP Error::Error(std::string message, ErrorCode errorCode) : BaseError(static_cast(errorCode)) { + message_ = message + " (Mscclpp failure: " + errorToString(errorCode) + ")"; +} -MSCCLPP_API_CPP CuError::CuError(std::string message, int errorCode) : BaseError(message, errorCode) {} +MSCCLPP_API_CPP CudaError::CudaError(std::string message, cudaError_t errorCode) : BaseError(errorCode) { + message_ = message + " (Cuda failure: " + cudaGetErrorString(errorCode) + ")"; +} -MSCCLPP_API_CPP IbError::IbError(std::string message, int errorCode) : BaseError(message, errorCode) {} +MSCCLPP_API_CPP CuError::CuError(std::string message, CUresult errorCode) : BaseError(errorCode) { + const char* errStr; + cuGetErrorString(errorCode, &errStr); + message_ = message + " (Cu failure: " + errStr + ")"; +} + +MSCCLPP_API_CPP IbError::IbError(std::string message, int errorCode) : BaseError(errorCode) { + message_ = message + " (Ib failure: " + std::strerror(errorCode) + ")"; +} }; // namespace mscclpp diff --git a/src/include/checks.hpp b/src/include/checks.hpp index 05204bef..00acc2f3 100644 --- a/src/include/checks.hpp +++ b/src/include/checks.hpp @@ -1,9 +1,3 @@ -/************************************************************************* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - #ifndef MSCCLPP_CHECKS_HPP_ #define MSCCLPP_CHECKS_HPP_ @@ -17,28 +11,34 @@ #define MSCCLPPTHROW(call) \ do { \ mscclppResult_t res = call; \ + mscclpp::ErrorCode err = mscclpp::ErrorCode::InternalError; \ if (res != mscclppSuccess && res != mscclppInProgress) { \ - throw mscclpp::Error(std::string("Call to " #call " failed with error code ") + mscclppGetErrorString(res), \ - ErrorCode::InvalidUsage); \ + if (res == mscclppInvalidUsage) { \ + err = mscclpp::ErrorCode::InvalidUsage; \ + } else if (res == mscclppSystemError) { \ + err = mscclpp::ErrorCode::SystemError; \ + } \ + throw mscclpp::Error(std::string("Call to " #call " failed. ") + __FILE__ + ":" + std::to_string(__LINE__), \ + err); \ } \ } while (false) -#define CUDATHROW(cmd) \ - do { \ - cudaError_t err = cmd; \ - if (err != cudaSuccess) { \ - throw mscclpp::CudaError(std::string("Cuda failure '") + cudaGetErrorString(err) + "'", err); \ - } \ +#define CUDATHROW(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + throw mscclpp::CudaError(std::string("Call to " #cmd " failed. 
") + __FILE__ + ":" + std::to_string(__LINE__), \ + err); \ + } \ } while (false) -#define CUTHROW(cmd) \ - do { \ - CUresult err = cmd; \ - if (err != CUDA_SUCCESS) { \ - const char* errStr; \ - cuGetErrorString(err, &errStr); \ - throw mscclpp::CuError(std::string("Cu failure '") + std::string(errStr) + "'", err); \ - } \ +#define CUTHROW(cmd) \ + do { \ + CUresult err = cmd; \ + if (err != CUDA_SUCCESS) { \ + throw mscclpp::CuError(std::string("Call to " #cmd " failed.") + __FILE__ + ":" + std::to_string(__LINE__), \ + err); \ + } \ } while (false) #endif From 1487596dc895151c01bf5268e6a6ad94203a8ef8 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 11 May 2023 08:34:57 +0000 Subject: [PATCH 127/135] update cpplint --- .github/workflows/cpplint.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cpplint.yml b/.github/workflows/cpplint.yml index 0b002f44..b9f9b202 100644 --- a/.github/workflows/cpplint.yml +++ b/.github/workflows/cpplint.yml @@ -25,4 +25,8 @@ jobs: run: sudo apt-get install -y clang-format-12 - name: Run cpplint - run: make cpplint + run: | + CPPSOURCES=$(find ./ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)' -not -path "./build/*" -not -path "./python/*") + PYTHONCPPSOURCES=$(find ./python/src/ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)') + clang-format-12 -style=file --verbose --Werror --dry-run $(CPPSOURCES) + clang-format-12 --dry-run $(PYTHONCPPSOURCES) From 5704fb7c6afedea6a38aa197ae2cd32f8f978b4a Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 11 May 2023 08:38:00 +0000 Subject: [PATCH 128/135] update --- .github/workflows/cpplint.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cpplint.yml b/.github/workflows/cpplint.yml index b9f9b202..c2e6cb43 100644 --- a/.github/workflows/cpplint.yml +++ b/.github/workflows/cpplint.yml @@ -26,7 +26,7 @@ jobs: - name: Run cpplint run: | - CPPSOURCES=$(find ./ -regextype 
posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)' -not -path "./build/*" -not -path "./python/*") + CPPSOURCES=$(find ./ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)' -not -path "./build/*" -not -path "./python/*" -not -path "./test/*") PYTHONCPPSOURCES=$(find ./python/src/ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)') - clang-format-12 -style=file --verbose --Werror --dry-run $(CPPSOURCES) - clang-format-12 --dry-run $(PYTHONCPPSOURCES) + clang-format-12 -style=file --verbose --Werror --dry-run ${CPPSOURCES} + clang-format-12 --dry-run ${PYTHONCPPSOURCES} From ef558a42e8660ca5f4b2652b59845d51ec050595 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 12 May 2023 05:54:32 +0000 Subject: [PATCH 129/135] wip --- test/CMakeLists.txt | 1 + test/allgather_test_host_offloading.cu | 282 +++++++++++++++++++++++++ 2 files changed, 283 insertions(+) create mode 100644 test/allgather_test_host_offloading.cu diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d7e59bc6..4ce78e68 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -11,6 +11,7 @@ endfunction() add_test_executable(bootstrap_test_cpp bootstrap_test_cpp.cc) add_test_executable(communicator_test_cpp communicator_test_cpp.cu) add_test_executable(allgather_test_cpp allgather_test_cpp.cu) +add_test_executable(allgather_test_host_offloading allgather_test_host_offloading.cu) add_test_executable(ib_test ib_test.cc) # Unit tests diff --git a/test/allgather_test_host_offloading.cu b/test/allgather_test_host_offloading.cu new file mode 100644 index 00000000..c7a80611 --- /dev/null +++ b/test/allgather_test_host_offloading.cu @@ -0,0 +1,282 @@ +#include +#include +#include +#include + +#ifdef MSCCLPP_USE_MPI_FOR_TESTS +#include "mpi.h" +#endif // MSCCLPP_USE_MPI_FOR_TESTS +#include +#include +#include +#include +#include +#include + +int nranksPerNode; +int rank; +int world_size; + +// Propagate errors up + +#define MSCCLPPCHECK(call) \ + do { \ + mscclppResult_t res 
= call; \ + if (res != mscclppSuccess && res != mscclppInProgress) { \ + /* Print the back trace*/ \ + printf("Failure at %s:%d -> %s\n", __FILE__, __LINE__, mscclppGetErrorString(res)); \ + return res; \ + } \ + } while (0) + +// Check CUDA RT calls +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (false) + +// Measure current time in second. +static double getTime(void) +{ + struct timespec tspec; + if (clock_gettime(CLOCK_MONOTONIC, &tspec) == -1) { + printf("clock_gettime failed\n"); + exit(EXIT_FAILURE); + } + return (tspec.tv_nsec / 1.0e9) + tspec.tv_sec; +} + + +__global__ void kernel(int r, int nranks, mscclpp::DeviceProxyFifo fifo, mscclpp::DeviceEpoch::DeviceHandle* handles) +{ + int tid = threadIdx.x; + if (tid != r) + handles[tid].epochIncrement(); + if (tid == 0){ + mscclpp::ProxyTrigger trigger; + trigger.fst = 1; + fifo.push(trigger); + } + if (tid != r) + handles[tid].wait(); +} + +int rankToLocalRank(int rank) +{ + return rank % nranksPerNode; +} + +int rankToNode(int rank) +{ + return rank / nranksPerNode; +} + +void print_usage(const char* prog) +{ +#ifdef MSCCLPP_USE_MPI_FOR_TESTS + printf("usage: %s IP:PORT [rank nranks]\n", prog); +#else + printf("usage: %s IP:PORT rank nranks\n", prog); +#endif +} + +void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, size_t nelemsPerGPU, int** data_h, + int** data_d) +{ + CUDACHECK(cudaMalloc(data_d, dataSize)); + CUDACHECK(cudaMemset(*data_d, 0, dataSize)); + + *data_h = new int[nelemsPerGPU * world_size]; + for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { + int val = i + 1; + if (i / nelemsPerGPU == (size_t)rank) { + (*data_h)[i] = val; + } else { + (*data_h)[i] = 0; + } + } + CUDACHECK(cudaMemcpy(*data_d, *data_h, dataSize, cudaMemcpyHostToDevice)); +} + +class MyProxyService { +private: + int 
deviceNumaNode; +public: + MyProxyService() : remoteMemories(world_size), connections(world_size), + proxy([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) { + // int cudaDevice; + // CUDACHECK(cudaGetDevice(&cudaDevice)); + // getDeviceNumaNode(cudaDevice, &deviceNumaNode); + } + + void bindThread() { + // if (deviceNumaNode >= 0) { + // numaBind(deviceNumaNode); + // INFO(MSCCLPP_INIT, "NUMA node of DeviceChannelService proxy thread is set to %d", deviceNumaNode); + // } + } + + mscclpp::ProxyHandlerResult handleTrigger(mscclpp::ProxyTrigger triggerRaw) { + // do something with it. + return mscclpp::ProxyHandlerResult::FlushFifoTailAndContinue; + } + mscclpp::Proxy proxy; + std::vector remoteMemories; + mscclpp::RegisteredMemory localMemory; + std::vector> hostEpochs; + std::vector> deviceEpochs; + std::vector> connections; +}; + +void setupProxyService(mscclpp::Communicator& comm, MyProxyService& proxyService, int* data_d, int dataSize) +{ + int thisNode = rankToNode(rank); + int cudaNum = rankToLocalRank(rank); + std::string ibDevStr = "mlx5_ib" + std::to_string(cudaNum); + mscclpp::Transport ibTransport = mscclpp::getIBTransportByDeviceName(ibDevStr); + std::vector> remoteMemories(world_size); + + proxyService.localMemory = comm.registerMemory(data_d, dataSize, mscclpp::Transport::CudaIpc | ibTransport); + for (int r = 0; r < world_size; ++r) { + if (r == rank){ + proxyService.hostEpochs.emplace_back(nullptr); + proxyService.deviceEpochs.emplace_back(nullptr); + continue; + } + mscclpp::Transport transport; + if (rankToNode(r) == thisNode) { + transport = mscclpp::Transport::CudaIpc; + } else { + transport = ibTransport; + } + // Connect with all other ranks + proxyService.connections[r] = comm.connectOnSetup(r, 0, transport); + proxyService.hostEpochs.emplace_back(std::make_shared(comm, proxyService.connections[r])); + proxyService.deviceEpochs.emplace_back(std::make_shared(comm, 
proxyService.connections[r])); + comm.sendMemoryOnSetup(proxyService.localMemory, r, 0); + + remoteMemories[r] = comm.recvMemoryOnSetup(r, 0); + } + + comm.setup(); + for (int r = 0; r < world_size; ++r) { + if (r == rank){ + continue; + } + proxyService.remoteMemories[r] = remoteMemories[r].get(); + } +} + +std::unordered_map parseArgs(int argc, char* argv[]) +{ + std::unordered_map options; + + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg == "-datasize") { + if (i + 1 < argc) { + options["datasize"] = argv[++i]; + } else { + fprintf(stderr, "Error: -datasize option requires an argument.\n"); + exit(-1); + } + } else if (arg == "-help" || arg == "-h") { + exit(0); + } else { + fprintf(stderr, "Error: Unknown option %s\n", argv[i]); + exit(-1); + } + } + return options; +} + + +int main(int argc, char* argv[]) +{ + MPI_Init(&argc, &argv); + auto parsedArgs = parseArgs(argc, argv); + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + // get the local number of nodes with MPI + MPI_Comm shmcomm; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm); + int shmrank; + MPI_Comm_size(shmcomm, &shmrank); + nranksPerNode = shmrank; + MPI_Comm_free(&shmcomm); + + + int cudaNum = rankToLocalRank(rank); + CUDACHECK(cudaSetDevice(cudaNum)); + + if (rank == 0) + printf("Initializing MSCCL++\n"); + auto bootstrap = std::make_shared(rank, world_size); + mscclpp::UniqueId uniqueId; + if (rank == 0) + uniqueId = bootstrap->createUniqueId(); + MPI_Bcast(&uniqueId, sizeof(uniqueId), MPI_BYTE, 0, MPI_COMM_WORLD); + bootstrap->initialize(uniqueId); + mscclpp::Communicator comm(bootstrap); + + int* data_d; + int* data_h; + size_t dataSize = 1024 * 1024 * 1024; + if (parsedArgs.find("datasize") != parsedArgs.end()) { + dataSize = std::stoul(parsedArgs["datasize"]); + } + size_t nelemsPerGPU = dataSize / sizeof(int) / world_size; + + if (rank == 0) + printf("Initializing data for allgather 
test\n"); + initializeAndAllocateAllGatherData(rank, world_size, dataSize, nelemsPerGPU, &data_h, &data_d); + + if (rank == 0) + printf("Setting up the connection in MSCCL++\n"); + + MyProxyService proxyService; + setupProxyService(comm, proxyService, data_d, dataSize); + + if (rank == 0) + printf("Launching MSCCL++ proxy threads\n"); + proxyService.proxy.start(); + mscclpp::DeviceProxyFifo fifo = proxyService.proxy.fifo().deviceFifo(); + if (rank == 0) + printf("Testing the correctness of AllGather implementation\n"); + cudaStream_t stream; + CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + mscclpp::DeviceEpoch::DeviceHandle* deviceHandles; + + CUDACHECK(cudaMalloc(&deviceHandles, sizeof(mscclpp::DeviceEpoch::DeviceHandle) * world_size)); + for (int i = 0; i < world_size; ++i) { + auto handle = proxyService.deviceEpochs[i]->deviceHandle(); + CUDACHECK(cudaMemcpy(&deviceHandles[i], &handle, sizeof(mscclpp::DeviceEpoch::DeviceHandle), cudaMemcpyHostToDevice)); + } + + kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles); + CUDACHECK(cudaStreamSynchronize(stream)); + + CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost)); + + for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { + int val = i + 1; + if (data_h[i] != val) { + printf("oh uh! 
data_h[%ld] (%d) != val (%d)\n", i, data_h[i], val); + break; + } + } + + bootstrap->barrier(); + + printf("Rank %d succeeded!\n", rank); + +#ifdef MSCCLPP_USE_MPI_FOR_TESTS + MPI_Finalize(); +#endif + return 0; +} From 31851ad82c76b96a197dfa7cdeee01d8b382e12b Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 12 May 2023 06:11:12 +0000 Subject: [PATCH 130/135] host epoch removed --- test/allgather_test_host_offloading.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/allgather_test_host_offloading.cu b/test/allgather_test_host_offloading.cu index c7a80611..32f6b8c9 100644 --- a/test/allgather_test_host_offloading.cu +++ b/test/allgather_test_host_offloading.cu @@ -150,12 +150,13 @@ void setupProxyService(mscclpp::Communicator& comm, MyProxyService& proxyService mscclpp::Transport transport; if (rankToNode(r) == thisNode) { transport = mscclpp::Transport::CudaIpc; + proxyService.hostEpochs.emplace_back(nullptr); } else { transport = ibTransport; + proxyService.hostEpochs.emplace_back(std::make_shared(comm, proxyService.connections[r])); } // Connect with all other ranks proxyService.connections[r] = comm.connectOnSetup(r, 0, transport); - proxyService.hostEpochs.emplace_back(std::make_shared(comm, proxyService.connections[r])); proxyService.deviceEpochs.emplace_back(std::make_shared(comm, proxyService.connections[r])); comm.sendMemoryOnSetup(proxyService.localMemory, r, 0); @@ -197,6 +198,7 @@ std::unordered_map parseArgs(int argc, char* argv[]) int main(int argc, char* argv[]) { + sleep(10); MPI_Init(&argc, &argv); auto parsedArgs = parseArgs(argc, argv); @@ -258,7 +260,7 @@ int main(int argc, char* argv[]) CUDACHECK(cudaMemcpy(&deviceHandles[i], &handle, sizeof(mscclpp::DeviceEpoch::DeviceHandle), cudaMemcpyHostToDevice)); } - kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles); + // kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles); CUDACHECK(cudaStreamSynchronize(stream)); 
CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost)); From 113473a116c84061312f44df7ee5a50c86e39e51 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 12 May 2023 07:01:21 +0000 Subject: [PATCH 131/135] more progress --- test/allgather_test_host_offloading.cu | 96 ++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 7 deletions(-) diff --git a/test/allgather_test_host_offloading.cu b/test/allgather_test_host_offloading.cu index 32f6b8c9..29d61c0a 100644 --- a/test/allgather_test_host_offloading.cu +++ b/test/allgather_test_host_offloading.cu @@ -121,7 +121,17 @@ public: } mscclpp::ProxyHandlerResult handleTrigger(mscclpp::ProxyTrigger triggerRaw) { - // do something with it. + if (triggerRaw.fst == 1) { + int dataSizePerRank = dataSize / world_size; + for (int r = 0; r < world_size; ++r) { + if (r == rank) { + continue; + } + connections[r]->write(remoteMemories[r], rank*dataSizePerRank, localMemory, rank*dataSizePerRank, dataSizePerRank); + deviceEpochs[r]->signal(); + connections[r]->flush(); + } + } return mscclpp::ProxyHandlerResult::FlushFifoTailAndContinue; } mscclpp::Proxy proxy; @@ -129,11 +139,13 @@ public: mscclpp::RegisteredMemory localMemory; std::vector> hostEpochs; std::vector> deviceEpochs; - std::vector> connections; + std::vector> connections; + int dataSize; }; void setupProxyService(mscclpp::Communicator& comm, MyProxyService& proxyService, int* data_d, int dataSize) { + proxyService.dataSize = dataSize; int thisNode = rankToNode(rank); int cudaNum = rankToLocalRank(rank); std::string ibDevStr = "mlx5_ib" + std::to_string(cudaNum); @@ -150,13 +162,16 @@ void setupProxyService(mscclpp::Communicator& comm, MyProxyService& proxyService mscclpp::Transport transport; if (rankToNode(r) == thisNode) { transport = mscclpp::Transport::CudaIpc; - proxyService.hostEpochs.emplace_back(nullptr); } else { transport = ibTransport; - proxyService.hostEpochs.emplace_back(std::make_shared(comm, proxyService.connections[r])); } // 
Connect with all other ranks proxyService.connections[r] = comm.connectOnSetup(r, 0, transport); + if (rankToNode(r) == thisNode) { + proxyService.hostEpochs.emplace_back(nullptr); + } else { + proxyService.hostEpochs.emplace_back(std::make_shared(comm, proxyService.connections[r])); + } proxyService.deviceEpochs.emplace_back(std::make_shared(comm, proxyService.connections[r])); comm.sendMemoryOnSetup(proxyService.localMemory, r, 0); @@ -198,7 +213,7 @@ std::unordered_map parseArgs(int argc, char* argv[]) int main(int argc, char* argv[]) { - sleep(10); + // sleep(10); MPI_Init(&argc, &argv); auto parsedArgs = parseArgs(argc, argv); @@ -256,11 +271,13 @@ int main(int argc, char* argv[]) CUDACHECK(cudaMalloc(&deviceHandles, sizeof(mscclpp::DeviceEpoch::DeviceHandle) * world_size)); for (int i = 0; i < world_size; ++i) { + if (i == rank) + continue; auto handle = proxyService.deviceEpochs[i]->deviceHandle(); CUDACHECK(cudaMemcpy(&deviceHandles[i], &handle, sizeof(mscclpp::DeviceEpoch::DeviceHandle), cudaMemcpyHostToDevice)); } - // kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles); + kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles); CUDACHECK(cudaStreamSynchronize(stream)); CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost)); @@ -274,8 +291,73 @@ int main(int argc, char* argv[]) } bootstrap->barrier(); + if (rank == 0) + printf("Correctness test passed!\n"); + + double t0, t1, ms, time_in_us; + int iterwithoutcudagraph = 10; + if (rank == 0) + printf("Running %d iterations of the kernel without CUDA graph\n", iterwithoutcudagraph); + CUDACHECK(cudaStreamSynchronize(stream)); + bootstrap->barrier(); + t0 = getTime(); + for (int i = 0; i < iterwithoutcudagraph; ++i) { + kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles); + } + CUDACHECK(cudaStreamSynchronize(stream)); + bootstrap->barrier(); + t1 = getTime(); + ms = (t1 - t0) * 1000.0; + time_in_us = ms * 1000. 
/ (float)iterwithoutcudagraph; + printf("No Graph %d report: size %lu time: %f us/iter algBW %f GBps\n", rank, dataSize, time_in_us, + (double)(dataSize) / 1e9 / (time_in_us / 1e6)); + + // cudaGraph Capture + int cudagraphiter = 10; + if (rank == 0) + printf("Capturing %d iterations of the kernel in a CUDA graph\n", cudagraphiter); + cudaGraph_t graph; + cudaGraphExec_t instance; + cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); + for (int i = 0; i < cudagraphiter; ++i) { + kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles); + } + cudaStreamEndCapture(stream, &graph); + cudaGraphInstantiate(&instance, graph, NULL, NULL, 0); + + int cudagraphwarmup = 10; + if (rank == 0) + printf("Warming up %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphwarmup, + cudagraphiter); + for (int i = 0; i < cudagraphwarmup; ++i) { + cudaGraphLaunch(instance, stream); + } + CUDACHECK(cudaStreamSynchronize(stream)); + + // measure runtime + int cudagraphlaunch = 10; + if (rank == 0) + printf("Running %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphlaunch, + cudagraphiter); + bootstrap->barrier(); + t0 = getTime(); + for (int i = 0; i < cudagraphlaunch; ++i) { + cudaGraphLaunch(instance, stream); + } + CUDACHECK(cudaStreamSynchronize(stream)); + + t1 = getTime(); + ms = (t1 - t0) * 1000.0; + time_in_us = ms * 1000. 
/ (float)cudagraphlaunch / (float)cudagraphiter; + printf("Rank %d report: size %lu time: %f us/iter algBW %f GBps\n", rank, dataSize, time_in_us, + (double)(dataSize) / 1e9 / (time_in_us / 1e6)); + bootstrap->barrier(); + + if (rank == 0) + printf("Stopping MSCCL++ proxy threads\n"); + proxyService.proxy.stop(); + - printf("Rank %d succeeded!\n", rank); #ifdef MSCCLPP_USE_MPI_FOR_TESTS MPI_Finalize(); From 2691784b8805881cfcd963a746f07dafa37bc250 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 12 May 2023 20:21:58 +0000 Subject: [PATCH 132/135] working -- at least for single node --- src/communicator.cc | 12 ++++- src/connection.cc | 11 ++-- src/include/communicator.hpp | 4 ++ src/include/connection.hpp | 4 +- test/allgather_test_host_offloading.cu | 69 ++++++++++++++++---------- 5 files changed, 65 insertions(+), 35 deletions(-) diff --git a/src/communicator.cc b/src/communicator.cc index c324093f..b7b6923d 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -18,9 +18,15 @@ Communicator::Impl::Impl(std::shared_ptr bootstrap) : bootstrap_( INFO(MSCCLPP_INIT, "Host hash: %lx", hostHash); rankToHash_[bootstrap->getRank()] = hostHash; bootstrap->allGather(rankToHash_.data(), sizeof(uint64_t)); + + CUDATHROW(cudaStreamCreateWithFlags(&ipcStream_, cudaStreamNonBlocking)); } -Communicator::Impl::~Impl() { ibContexts_.clear(); } +Communicator::Impl::~Impl() { + ibContexts_.clear(); + + cudaStreamDestroy(ipcStream_); +} IbCtx* Communicator::Impl::getIbContext(Transport ibTransport) { // Find IB context or create it @@ -34,6 +40,8 @@ IbCtx* Communicator::Impl::getIbContext(Transport ibTransport) { } } +cudaStream_t Communicator::Impl::getIpcStream() { return ipcStream_; } + MSCCLPP_API_CPP Communicator::~Communicator() = default; MSCCLPP_API_CPP Communicator::Communicator(std::shared_ptr bootstrap) @@ -95,7 +103,7 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connectOnSetup(int rem << pimpl->rankToHash_[pimpl->bootstrap_->getRank()] << ")"; throw 
mscclpp::Error(ss.str(), ErrorCode::InvalidUsage); } - auto cudaIpcConn = std::make_shared(remoteRank, tag); + auto cudaIpcConn = std::make_shared(remoteRank, tag, pimpl->getIpcStream()); conn = cudaIpcConn; INFO(MSCCLPP_P2P, "Cuda IPC connection between rank %d(%lx) and remoteRank %d(%lx) created", pimpl->bootstrap_->getRank(), pimpl->rankToHash_[pimpl->bootstrap_->getRank()], remoteRank, diff --git a/src/connection.cc b/src/connection.cc index 10ca79ee..1fce9b89 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -30,11 +30,10 @@ int ConnectionBase::tag() { return tag_; } // CudaIpcConnection -CudaIpcConnection::CudaIpcConnection(int remoteRank, int tag) : ConnectionBase(remoteRank, tag) { - CUDATHROW(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); -} +CudaIpcConnection::CudaIpcConnection(int remoteRank, int tag, cudaStream_t stream) + : ConnectionBase(remoteRank, tag), stream_(stream) {} -CudaIpcConnection::~CudaIpcConnection() { cudaStreamDestroy(stream); } +CudaIpcConnection::~CudaIpcConnection() {} Transport CudaIpcConnection::transport() { return Transport::CudaIpc; } @@ -48,14 +47,14 @@ void CudaIpcConnection::write(RegisteredMemory dst, uint64_t dstOffset, Register char* dstPtr = (char*)dst.data(); char* srcPtr = (char*)src.data(); - CUDATHROW(cudaMemcpyAsync(dstPtr + dstOffset, srcPtr + srcOffset, size, cudaMemcpyDeviceToDevice, stream)); + CUDATHROW(cudaMemcpyAsync(dstPtr + dstOffset, srcPtr + srcOffset, size, cudaMemcpyDeviceToDevice, stream_)); INFO(MSCCLPP_P2P, "CudaIpcConnection write: from %p to %p, size %lu", srcPtr + srcOffset, dstPtr + dstOffset, size); // npkitCollectEntryEvent(conn, NPKIT_EVENT_DMA_SEND_DATA_ENTRY, (uint32_t)size); } void CudaIpcConnection::flush() { - CUDATHROW(cudaStreamSynchronize(stream)); + CUDATHROW(cudaStreamSynchronize(stream_)); // npkitCollectExitEvents(conn, NPKIT_EVENT_DMA_SEND_EXIT); } diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index eaf05a32..6461eb13 100644 --- 
a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -1,6 +1,8 @@ #ifndef MSCCL_COMMUNICATOR_HPP_ #define MSCCL_COMMUNICATOR_HPP_ +#include + #include #include #include @@ -17,6 +19,7 @@ struct Communicator::Impl { std::vector> connections_; std::vector> toSetup_; std::unordered_map> ibContexts_; + cudaStream_t ipcStream_; std::shared_ptr bootstrap_; std::vector rankToHash_; @@ -25,6 +28,7 @@ struct Communicator::Impl { ~Impl(); IbCtx* getIbContext(Transport ibTransport); + cudaStream_t getIpcStream(); }; } // namespace mscclpp diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 8516871a..3e9896ba 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -27,10 +27,10 @@ class ConnectionBase : public Connection, public Setuppable { }; class CudaIpcConnection : public ConnectionBase { - cudaStream_t stream; + cudaStream_t stream_; public: - CudaIpcConnection(int remoteRank, int tag); + CudaIpcConnection(int remoteRank, int tag, cudaStream_t stream); ~CudaIpcConnection(); diff --git a/test/allgather_test_host_offloading.cu b/test/allgather_test_host_offloading.cu index 29d61c0a..dfd0acdf 100644 --- a/test/allgather_test_host_offloading.cu +++ b/test/allgather_test_host_offloading.cu @@ -51,18 +51,22 @@ static double getTime(void) } -__global__ void kernel(int r, int nranks, mscclpp::DeviceProxyFifo fifo, mscclpp::DeviceEpoch::DeviceHandle* handles) +__global__ void kernel(int r, int nranks, mscclpp::DeviceProxyFifo fifo, mscclpp::DeviceEpoch::DeviceHandle* handles, int handleIndex) { int tid = threadIdx.x; if (tid != r) handles[tid].epochIncrement(); + __syncthreads(); + uint64_t tail; if (tid == 0){ mscclpp::ProxyTrigger trigger; - trigger.fst = 1; - fifo.push(trigger); + trigger.fst = handleIndex; + tail = fifo.push(trigger); } if (tid != r) handles[tid].wait(); + // if (tid == 0) + // while(*(volatile uint64_t*)fifo.tailReplica < tail) {}; } int rankToLocalRank(int rank) @@ -121,15 +125,15 @@ public: } 
mscclpp::ProxyHandlerResult handleTrigger(mscclpp::ProxyTrigger triggerRaw) { - if (triggerRaw.fst == 1) { + if (triggerRaw.fst > 0) { int dataSizePerRank = dataSize / world_size; - for (int r = 0; r < world_size; ++r) { - if (r == rank) { - continue; - } - connections[r]->write(remoteMemories[r], rank*dataSizePerRank, localMemory, rank*dataSizePerRank, dataSizePerRank); - deviceEpochs[r]->signal(); - connections[r]->flush(); + for (int r = 1; r < world_size; ++r) { + int nghr = (rank + r) % world_size; + connections[nghr]->write(remoteMemories[nghr], rank*dataSizePerRank, localMemory, rank*dataSizePerRank, dataSizePerRank); + if (triggerRaw.fst == 1) + deviceEpochs1[nghr]->signal(); + else + deviceEpochs2[nghr]->signal(); } } return mscclpp::ProxyHandlerResult::FlushFifoTailAndContinue; @@ -138,7 +142,8 @@ public: std::vector remoteMemories; mscclpp::RegisteredMemory localMemory; std::vector> hostEpochs; - std::vector> deviceEpochs; + std::vector> deviceEpochs1; + std::vector> deviceEpochs2; std::vector> connections; int dataSize; }; @@ -156,7 +161,8 @@ void setupProxyService(mscclpp::Communicator& comm, MyProxyService& proxyService for (int r = 0; r < world_size; ++r) { if (r == rank){ proxyService.hostEpochs.emplace_back(nullptr); - proxyService.deviceEpochs.emplace_back(nullptr); + proxyService.deviceEpochs1.emplace_back(nullptr); + proxyService.deviceEpochs2.emplace_back(nullptr); continue; } mscclpp::Transport transport; @@ -172,7 +178,8 @@ void setupProxyService(mscclpp::Communicator& comm, MyProxyService& proxyService } else { proxyService.hostEpochs.emplace_back(std::make_shared(comm, proxyService.connections[r])); } - proxyService.deviceEpochs.emplace_back(std::make_shared(comm, proxyService.connections[r])); + proxyService.deviceEpochs1.emplace_back(std::make_shared(comm, proxyService.connections[r])); + proxyService.deviceEpochs2.emplace_back(std::make_shared(comm, proxyService.connections[r])); comm.sendMemoryOnSetup(proxyService.localMemory, r, 0); 
remoteMemories[r] = comm.recvMemoryOnSetup(r, 0); @@ -267,17 +274,26 @@ int main(int argc, char* argv[]) printf("Testing the correctness of AllGather implementation\n"); cudaStream_t stream; CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - mscclpp::DeviceEpoch::DeviceHandle* deviceHandles; + mscclpp::DeviceEpoch::DeviceHandle* deviceHandles1; + mscclpp::DeviceEpoch::DeviceHandle* deviceHandles2; - CUDACHECK(cudaMalloc(&deviceHandles, sizeof(mscclpp::DeviceEpoch::DeviceHandle) * world_size)); + CUDACHECK(cudaMalloc(&deviceHandles1, sizeof(mscclpp::DeviceEpoch::DeviceHandle) * world_size)); for (int i = 0; i < world_size; ++i) { if (i == rank) continue; - auto handle = proxyService.deviceEpochs[i]->deviceHandle(); - CUDACHECK(cudaMemcpy(&deviceHandles[i], &handle, sizeof(mscclpp::DeviceEpoch::DeviceHandle), cudaMemcpyHostToDevice)); + auto handle = proxyService.deviceEpochs1[i]->deviceHandle(); + CUDACHECK(cudaMemcpy(&deviceHandles1[i], &handle, sizeof(mscclpp::DeviceEpoch::DeviceHandle), cudaMemcpyHostToDevice)); } - kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles); + CUDACHECK(cudaMalloc(&deviceHandles2, sizeof(mscclpp::DeviceEpoch::DeviceHandle) * world_size)); + for (int i = 0; i < world_size; ++i) { + if (i == rank) + continue; + auto handle = proxyService.deviceEpochs2[i]->deviceHandle(); + CUDACHECK(cudaMemcpy(&deviceHandles2[i], &handle, sizeof(mscclpp::DeviceEpoch::DeviceHandle), cudaMemcpyHostToDevice)); + } + + kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles1, 1); CUDACHECK(cudaStreamSynchronize(stream)); CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost)); @@ -302,13 +318,14 @@ int main(int argc, char* argv[]) bootstrap->barrier(); t0 = getTime(); for (int i = 0; i < iterwithoutcudagraph; ++i) { - kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles); + kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles1, 1); + 
kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles2, 2); } CUDACHECK(cudaStreamSynchronize(stream)); bootstrap->barrier(); t1 = getTime(); ms = (t1 - t0) * 1000.0; - time_in_us = ms * 1000. / (float)iterwithoutcudagraph; + time_in_us = ms * 1000. / (float)iterwithoutcudagraph / 2; printf("No Graph %d report: size %lu time: %f us/iter algBW %f GBps\n", rank, dataSize, time_in_us, (double)(dataSize) / 1e9 / (time_in_us / 1e6)); @@ -320,7 +337,8 @@ int main(int argc, char* argv[]) cudaGraphExec_t instance; cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); for (int i = 0; i < cudagraphiter; ++i) { - kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles); + kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles1, 1); + kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles2, 2); } cudaStreamEndCapture(stream, &graph); cudaGraphInstantiate(&instance, graph, NULL, NULL, 0); @@ -348,9 +366,10 @@ int main(int argc, char* argv[]) t1 = getTime(); ms = (t1 - t0) * 1000.0; - time_in_us = ms * 1000. / (float)cudagraphlaunch / (float)cudagraphiter; - printf("Rank %d report: size %lu time: %f us/iter algBW %f GBps\n", rank, dataSize, time_in_us, - (double)(dataSize) / 1e9 / (time_in_us / 1e6)); + time_in_us = ms * 1000. 
/ (float)cudagraphlaunch / (float)cudagraphiter / 2; + if (rank == 0) + printf("Rank %d report: size %lu time: %f us/iter algBW %f GBps\n", rank, dataSize, time_in_us, + (double)(dataSize) / 1e9 / (time_in_us / 1e6)); bootstrap->barrier(); if (rank == 0) From d58e698d514895120ee5f376b60d27dd96468c80 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 12 May 2023 21:23:01 +0000 Subject: [PATCH 133/135] Add headers to install and set default install dir --- CMakeLists.txt | 6 ++++++ include/CMakeLists.txt | 3 +++ 2 files changed, 9 insertions(+) create mode 100644 include/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f8ddeb2..3340524f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,10 @@ option(USE_MPI_FOR_TESTS "Use MPI for tests" ON) option(USE_NPKIT "Use NPKIT" ON) option(ALLOW_GDRCOPY "Use GDRCopy, if available" OFF) +if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + set (CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/installed" CACHE PATH "default install path" FORCE) +endif() + list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) find_package(CUDAToolkit REQUIRED) @@ -24,6 +28,7 @@ include_directories(${CUDAToolkit_INCLUDE_DIRS}) include(CTest) include(FetchContent) FetchContent_Declare(googletest URL https://github.com/google/googletest/archive/b796f7d44681514f58a683a3a71ff17c94edb0c1.zip) +option(INSTALL_GTEST OFF) FetchContent_MakeAvailable(googletest) include(GoogleTest) @@ -45,5 +50,6 @@ if(ALLOW_GDRCOPY AND GDRCOPY_FOUND) target_link_libraries(mscclpp PRIVATE MSCCLPP::gdrcopy) endif() +add_subdirectory(include) # This adds the public headers to install with mscclpp add_subdirectory(src) # This adds the sources to the mscclpp target add_subdirectory(test) diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt new file mode 100644 index 00000000..b5fa7984 --- /dev/null +++ b/include/CMakeLists.txt @@ -0,0 +1,3 @@ +file(GLOB_RECURSE HEADERS CONFIGURE_DEPENDS *.hpp) +target_sources(mscclpp PUBLIC FILE_SET HEADERS 
FILES ${HEADERS}) +install(TARGETS mscclpp FILE_SET HEADERS) From 8f2d7922edaaf5ab45a5338de436ec5d17a653a8 Mon Sep 17 00:00:00 2001 From: Olli Saarikivi Date: Fri, 12 May 2023 21:25:29 +0000 Subject: [PATCH 134/135] Change install dir --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3340524f..01354076 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ option(USE_NPKIT "Use NPKIT" ON) option(ALLOW_GDRCOPY "Use GDRCopy, if available" OFF) if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) - set (CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/installed" CACHE PATH "default install path" FORCE) + set (CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "default install path" FORCE) endif() list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) From 2a7b7459729eff43a0f0283a551939dbf75730a0 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 12 May 2023 22:42:22 +0000 Subject: [PATCH 135/135] fully working with double buffering --- src/include/utils.h | 2 +- test/allgather_test_host_offloading.cu | 222 +++++++++++++------------ 2 files changed, 120 insertions(+), 104 deletions(-) diff --git a/src/include/utils.h b/src/include/utils.h index 07a16684..f3318031 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -12,7 +12,7 @@ #include #include "alloc.h" -#include "mscclpp.h" +// #include "mscclpp.h" // int mscclppCudaCompCap(); diff --git a/test/allgather_test_host_offloading.cu b/test/allgather_test_host_offloading.cu index dfd0acdf..c0ced1f0 100644 --- a/test/allgather_test_host_offloading.cu +++ b/test/allgather_test_host_offloading.cu @@ -2,6 +2,7 @@ #include #include #include +#include #ifdef MSCCLPP_USE_MPI_FOR_TESTS #include "mpi.h" @@ -19,18 +20,8 @@ int world_size; // Propagate errors up -#define MSCCLPPCHECK(call) \ - do { \ - mscclppResult_t res = call; \ - if (res != mscclppSuccess && res != mscclppInProgress) { \ - /* Print the back trace*/ \ - 
printf("Failure at %s:%d -> %s\n", __FILE__, __LINE__, mscclppGetErrorString(res)); \ - return res; \ - } \ - } while (0) - // Check CUDA RT calls -#define CUDACHECK(cmd) \ +#define CUCHECK(cmd) \ do { \ cudaError_t err = cmd; \ if (err != cudaSuccess) { \ @@ -57,11 +48,12 @@ __global__ void kernel(int r, int nranks, mscclpp::DeviceProxyFifo fifo, mscclpp if (tid != r) handles[tid].epochIncrement(); __syncthreads(); - uint64_t tail; + // uint64_t tail; if (tid == 0){ mscclpp::ProxyTrigger trigger; trigger.fst = handleIndex; - tail = fifo.push(trigger); + fifo.push(trigger); + // tail = fifo.push(trigger); } if (tid != r) handles[tid].wait(); @@ -91,8 +83,8 @@ void print_usage(const char* prog) void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, size_t nelemsPerGPU, int** data_h, int** data_d) { - CUDACHECK(cudaMalloc(data_d, dataSize)); - CUDACHECK(cudaMemset(*data_d, 0, dataSize)); + CUCHECK(cudaMalloc(data_d, dataSize)); + CUCHECK(cudaMemset(*data_d, 0, dataSize)); *data_h = new int[nelemsPerGPU * world_size]; for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { @@ -103,97 +95,121 @@ void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSiz (*data_h)[i] = 0; } } - CUDACHECK(cudaMemcpy(*data_d, *data_h, dataSize, cudaMemcpyHostToDevice)); + CUCHECK(cudaMemcpy(*data_d, *data_h, dataSize, cudaMemcpyHostToDevice)); } class MyProxyService { private: - int deviceNumaNode; + int deviceNumaNode_; + mscclpp::Proxy proxy_; + std::vector remoteMemories_; + mscclpp::RegisteredMemory localMemory_; + std::vector> hostEpochs_; + std::vector> deviceEpochs1_; + std::vector> deviceEpochs2_; + std::vector> connections_; + int dataSize_; public: - MyProxyService() : remoteMemories(world_size), connections(world_size), - proxy([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) { - // int cudaDevice; - // CUDACHECK(cudaGetDevice(&cudaDevice)); - // getDeviceNumaNode(cudaDevice, 
&deviceNumaNode); + MyProxyService(mscclpp::Communicator& comm, int* data_d, int dataSize) : remoteMemories_(world_size), connections_(world_size), dataSize_(dataSize), + proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) { + int cudaDevice; + CUCHECK(cudaGetDevice(&cudaDevice)); + getDeviceNumaNode(cudaDevice, &deviceNumaNode_); + + int thisNode = rankToNode(rank); + int cudaNum = rankToLocalRank(rank); + std::string ibDevStr = "mlx5_ib" + std::to_string(cudaNum); + mscclpp::Transport ibTransport = mscclpp::getIBTransportByDeviceName(ibDevStr); + std::vector> remoteMemoriesFuture(world_size); + + + localMemory_ = comm.registerMemory(data_d, dataSize, mscclpp::Transport::CudaIpc | ibTransport); + for (int r = 0; r < world_size; ++r) { + if (r == rank){ + hostEpochs_.emplace_back(nullptr); + deviceEpochs1_.emplace_back(nullptr); + deviceEpochs2_.emplace_back(nullptr); + continue; + } + mscclpp::Transport transport; + if (rankToNode(r) == thisNode) { + transport = mscclpp::Transport::CudaIpc; + } else { + transport = ibTransport; + } + // Connect with all other ranks + connections_[r] = comm.connectOnSetup(r, 0, transport); + if (rankToNode(r) == thisNode) { + hostEpochs_.emplace_back(nullptr); + } else { + hostEpochs_.emplace_back(std::make_shared(comm, connections_[r])); + } + deviceEpochs1_.emplace_back(std::make_shared(comm, connections_[r])); + deviceEpochs2_.emplace_back(std::make_shared(comm, connections_[r])); + comm.sendMemoryOnSetup(localMemory_, r, 0); + + remoteMemoriesFuture[r] = comm.recvMemoryOnSetup(r, 0); + } + + comm.setup(); + + for (int r = 0; r < world_size; ++r) { + if (r == rank){ + continue; + } + remoteMemories_[r] = remoteMemoriesFuture[r].get(); + } } void bindThread() { - // if (deviceNumaNode >= 0) { - // numaBind(deviceNumaNode); - // INFO(MSCCLPP_INIT, "NUMA node of DeviceChannelService proxy thread is set to %d", deviceNumaNode); - // } + if (deviceNumaNode_ >= 0) { + 
numaBind(deviceNumaNode_); + } } mscclpp::ProxyHandlerResult handleTrigger(mscclpp::ProxyTrigger triggerRaw) { + static int flusher = 0; if (triggerRaw.fst > 0) { - int dataSizePerRank = dataSize / world_size; + int dataSizePerRank = dataSize_ / world_size; for (int r = 1; r < world_size; ++r) { int nghr = (rank + r) % world_size; - connections[nghr]->write(remoteMemories[nghr], rank*dataSizePerRank, localMemory, rank*dataSizePerRank, dataSizePerRank); + connections_[nghr]->write(remoteMemories_[nghr], rank*dataSizePerRank, localMemory_, rank*dataSizePerRank, dataSizePerRank); if (triggerRaw.fst == 1) - deviceEpochs1[nghr]->signal(); + deviceEpochs1_[nghr]->signal(); else - deviceEpochs2[nghr]->signal(); + deviceEpochs2_[nghr]->signal(); + if ((flusher % 64) == 0 && mscclpp::AllIBTransports.has(connections_[nghr]->transport())){ + // if we are using IB transport, we need a flush every once in a while + connections_[nghr]->flush(); + } } + flusher++; + } return mscclpp::ProxyHandlerResult::FlushFifoTailAndContinue; } - mscclpp::Proxy proxy; - std::vector remoteMemories; - mscclpp::RegisteredMemory localMemory; - std::vector> hostEpochs; - std::vector> deviceEpochs1; - std::vector> deviceEpochs2; - std::vector> connections; - int dataSize; + + void start(){ + proxy_.start(); + } + + void stop(){ + proxy_.stop(); + } + + mscclpp::HostProxyFifo& fifo(){ + return proxy_.fifo(); + } + + mscclpp::DeviceEpoch::DeviceHandle getDeviceHandle1(int r){ + return deviceEpochs1_[r]->deviceHandle(); + } + + mscclpp::DeviceEpoch::DeviceHandle getDeviceHandle2(int r){ + return deviceEpochs2_[r]->deviceHandle(); + } }; -void setupProxyService(mscclpp::Communicator& comm, MyProxyService& proxyService, int* data_d, int dataSize) -{ - proxyService.dataSize = dataSize; - int thisNode = rankToNode(rank); - int cudaNum = rankToLocalRank(rank); - std::string ibDevStr = "mlx5_ib" + std::to_string(cudaNum); - mscclpp::Transport ibTransport = mscclpp::getIBTransportByDeviceName(ibDevStr); - 
std::vector> remoteMemories(world_size); - - proxyService.localMemory = comm.registerMemory(data_d, dataSize, mscclpp::Transport::CudaIpc | ibTransport); - for (int r = 0; r < world_size; ++r) { - if (r == rank){ - proxyService.hostEpochs.emplace_back(nullptr); - proxyService.deviceEpochs1.emplace_back(nullptr); - proxyService.deviceEpochs2.emplace_back(nullptr); - continue; - } - mscclpp::Transport transport; - if (rankToNode(r) == thisNode) { - transport = mscclpp::Transport::CudaIpc; - } else { - transport = ibTransport; - } - // Connect with all other ranks - proxyService.connections[r] = comm.connectOnSetup(r, 0, transport); - if (rankToNode(r) == thisNode) { - proxyService.hostEpochs.emplace_back(nullptr); - } else { - proxyService.hostEpochs.emplace_back(std::make_shared(comm, proxyService.connections[r])); - } - proxyService.deviceEpochs1.emplace_back(std::make_shared(comm, proxyService.connections[r])); - proxyService.deviceEpochs2.emplace_back(std::make_shared(comm, proxyService.connections[r])); - comm.sendMemoryOnSetup(proxyService.localMemory, r, 0); - - remoteMemories[r] = comm.recvMemoryOnSetup(r, 0); - } - - comm.setup(); - for (int r = 0; r < world_size; ++r) { - if (r == rank){ - continue; - } - proxyService.remoteMemories[r] = remoteMemories[r].get(); - } -} - std::unordered_map parseArgs(int argc, char* argv[]) { std::unordered_map options; @@ -236,7 +252,7 @@ int main(int argc, char* argv[]) int cudaNum = rankToLocalRank(rank); - CUDACHECK(cudaSetDevice(cudaNum)); + CUCHECK(cudaSetDevice(cudaNum)); if (rank == 0) printf("Initializing MSCCL++\n"); @@ -263,40 +279,40 @@ int main(int argc, char* argv[]) if (rank == 0) printf("Setting up the connection in MSCCL++\n"); - MyProxyService proxyService; - setupProxyService(comm, proxyService, data_d, dataSize); + MyProxyService proxyService(comm, data_d, dataSize); + // setupProxyService(comm, proxyService, data_d, dataSize); if (rank == 0) printf("Launching MSCCL++ proxy threads\n"); - 
proxyService.proxy.start(); - mscclpp::DeviceProxyFifo fifo = proxyService.proxy.fifo().deviceFifo(); + proxyService.start(); + mscclpp::DeviceProxyFifo fifo = proxyService.fifo().deviceFifo(); if (rank == 0) printf("Testing the correctness of AllGather implementation\n"); cudaStream_t stream; - CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CUCHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); mscclpp::DeviceEpoch::DeviceHandle* deviceHandles1; mscclpp::DeviceEpoch::DeviceHandle* deviceHandles2; - CUDACHECK(cudaMalloc(&deviceHandles1, sizeof(mscclpp::DeviceEpoch::DeviceHandle) * world_size)); + CUCHECK(cudaMalloc(&deviceHandles1, sizeof(mscclpp::DeviceEpoch::DeviceHandle) * world_size)); for (int i = 0; i < world_size; ++i) { if (i == rank) continue; - auto handle = proxyService.deviceEpochs1[i]->deviceHandle(); - CUDACHECK(cudaMemcpy(&deviceHandles1[i], &handle, sizeof(mscclpp::DeviceEpoch::DeviceHandle), cudaMemcpyHostToDevice)); + auto handle = proxyService.getDeviceHandle1(i); + CUCHECK(cudaMemcpy(&deviceHandles1[i], &handle, sizeof(mscclpp::DeviceEpoch::DeviceHandle), cudaMemcpyHostToDevice)); } - CUDACHECK(cudaMalloc(&deviceHandles2, sizeof(mscclpp::DeviceEpoch::DeviceHandle) * world_size)); + CUCHECK(cudaMalloc(&deviceHandles2, sizeof(mscclpp::DeviceEpoch::DeviceHandle) * world_size)); for (int i = 0; i < world_size; ++i) { if (i == rank) continue; - auto handle = proxyService.deviceEpochs2[i]->deviceHandle(); - CUDACHECK(cudaMemcpy(&deviceHandles2[i], &handle, sizeof(mscclpp::DeviceEpoch::DeviceHandle), cudaMemcpyHostToDevice)); + auto handle = proxyService.getDeviceHandle2(i); + CUCHECK(cudaMemcpy(&deviceHandles2[i], &handle, sizeof(mscclpp::DeviceEpoch::DeviceHandle), cudaMemcpyHostToDevice)); } kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles1, 1); - CUDACHECK(cudaStreamSynchronize(stream)); + CUCHECK(cudaStreamSynchronize(stream)); - CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, 
cudaMemcpyDeviceToHost)); + CUCHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost)); for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { int val = i + 1; @@ -314,14 +330,14 @@ int main(int argc, char* argv[]) int iterwithoutcudagraph = 10; if (rank == 0) printf("Running %d iterations of the kernel without CUDA graph\n", iterwithoutcudagraph); - CUDACHECK(cudaStreamSynchronize(stream)); + CUCHECK(cudaStreamSynchronize(stream)); bootstrap->barrier(); t0 = getTime(); for (int i = 0; i < iterwithoutcudagraph; ++i) { kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles1, 1); kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles2, 2); } - CUDACHECK(cudaStreamSynchronize(stream)); + CUCHECK(cudaStreamSynchronize(stream)); bootstrap->barrier(); t1 = getTime(); ms = (t1 - t0) * 1000.0; @@ -350,7 +366,7 @@ int main(int argc, char* argv[]) for (int i = 0; i < cudagraphwarmup; ++i) { cudaGraphLaunch(instance, stream); } - CUDACHECK(cudaStreamSynchronize(stream)); + CUCHECK(cudaStreamSynchronize(stream)); // measure runtime int cudagraphlaunch = 10; @@ -362,7 +378,7 @@ int main(int argc, char* argv[]) for (int i = 0; i < cudagraphlaunch; ++i) { cudaGraphLaunch(instance, stream); } - CUDACHECK(cudaStreamSynchronize(stream)); + CUCHECK(cudaStreamSynchronize(stream)); t1 = getTime(); ms = (t1 - t0) * 1000.0; @@ -374,7 +390,7 @@ int main(int argc, char* argv[]) if (rank == 0) printf("Stopping MSCCL++ proxy threads\n"); - proxyService.proxy.stop(); + proxyService.stop();