From 0216ceb34e3ec3cfbe19466a48d3ec0aa7e1517d Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 6 Mar 2023 08:04:33 +0000 Subject: [PATCH] added todos --- src/init.cc | 1 + src/proxy.cc | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/src/init.cc b/src/init.cc index e7d420f8..1b9e6bff 100644 --- a/src/init.cc +++ b/src/init.cc @@ -301,6 +301,7 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) } else if (conn->transport == mscclppTransportIB) { MSCCLPPCHECK(mscclppIbConnectionSetupStart(&cInfo, conn)); } + // TODO(saemal): can we deadlock if there are too many outstanding sends? MSCCLPPCHECK(bootstrapSend(comm->bootstrap, conn->remoteRank, conn->devConn->tag, &cInfo, sizeof(cInfo))); } diff --git a/src/proxy.cc b/src/proxy.cc index da6180f0..0e473787 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -37,9 +37,11 @@ struct proxyArgs { int connIdx; }; +// TODO(saemal): we need to add a FIFO for each DMA engine void* mscclppProxyServiceP2P(void* _args) { struct proxyArgs *args = (struct proxyArgs *)_args; struct mscclppComm *comm = args->comm; + // TODO(saemal): we perhaps need a finite set of states for run instead of just 0 and 1 volatile int *run = args->run; struct mscclppConn *conn = &comm->conns[args->connIdx]; cudaStream_t stream = args->stream; @@ -49,6 +51,7 @@ void* mscclppProxyServiceP2P(void* _args) { mscclppTrigger trigger; // TODO(chhwang): find numa node // Current mapping is based on NDv4: GPU [0,1,2,3,4,5,6,7] -> NUMA [1,1,0,0,3,3,2,2] + // TODO(saemal): either ask the user or detect it automatically NumaBind((comm->cudaDev / 2) ^ 1); PROXYCUDACHECK(cudaSetDevice(comm->cudaDev)); @@ -89,6 +92,7 @@ void* mscclppProxyServiceP2P(void* _args) { #if (MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 0) +// TODO(saemal): we need to add a FIFO for each DMA engine void* mscclppProxyServiceIb(void* _args) { struct proxyArgs *args = (struct proxyArgs *)_args; struct mscclppComm *comm = args->comm; @@ -168,6 +172,7 @@ void* mscclppProxyServiceIb(void* _args) 
{ #else // MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 1 +// TODO(saemal): merge this with the function above void* mscclppProxyServiceIb(void* _args) { struct proxyArgs *args = (struct proxyArgs *)_args; struct mscclppComm *comm = args->comm;