diff --git a/src/init.cc b/src/init.cc index a2929e8e..1112c6e8 100644 --- a/src/init.cc +++ b/src/init.cc @@ -86,7 +86,7 @@ MSCCLPP_API(mscclppResult_t, mscclppCommInitRank, mscclppComm_t* comm, int nrank mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char* ipPortPair, int rank) { if (mscclppGdrCopy == NULL) { - MSCCLPPCHECK(initGdrCopy()); + // MSCCLPPCHECK(initGdrCopy()); } mscclppResult_t res = mscclppSuccess; @@ -162,7 +162,7 @@ MSCCLPP_API(mscclppResult_t, mscclppCommInitRankFromId, mscclppComm_t* comm, int mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank) { if (mscclppGdrCopy == NULL) { - MSCCLPPCHECK(initGdrCopy()); + // MSCCLPPCHECK(initGdrCopy()); } mscclppResult_t res = mscclppSuccess; @@ -213,7 +213,7 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm) struct mscclppConn* conn = &comm->conns[i]; if (conn->cpuProxyFlagGdrDesc) { // IB - MSCCLPPCHECK(mscclppGdrCudaFree(conn->cpuProxyFlagGdrDesc)); + // MSCCLPPCHECK(mscclppGdrCudaFree(conn->cpuProxyFlagGdrDesc)); } else if (conn->devConn->proxyEpochId) { // P2P MSCCLPPCHECK(mscclppCudaFree(conn->devConn->proxyEpochId)); @@ -401,8 +401,8 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void // If we couldn't find a matching context, create one if (proxyState == NULL) { MSCCLPPCHECK(mscclppCalloc(&proxyState, 1)); - MSCCLPPCHECK(mscclppGdrCudaCalloc(&proxyState->triggerFifo.hostPtr, &proxyState->triggerFifo.devPtr, - MSCCLPP_PROXY_FIFO_SIZE, &proxyState->triggerFifo.desc)); + // MSCCLPPCHECK(mscclppGdrCudaCalloc(&proxyState->triggerFifo.hostPtr, &proxyState->triggerFifo.devPtr, + // MSCCLPP_PROXY_FIFO_SIZE, &proxyState->triggerFifo.desc)); MSCCLPPCHECK(mscclppCudaHostCalloc(&proxyState->triggerFifo.hostPtr, MSCCLPP_PROXY_FIFO_SIZE)); proxyState->triggerFifo.devPtr = proxyState->triggerFifo.hostPtr; // MSCCLPPCHECK( @@ -497,7 +497,8 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output struct mscclppDevConn* devConn = conn->devConn; devConn->remoteBuff = NULL; devConn->remoteFlag = NULL; - MSCCLPPCHECK(mscclppGdrCudaCalloc(&conn->cpuProxyFlag, &devConn->proxyEpochId, 1, &conn->cpuProxyFlagGdrDesc)); + // MSCCLPPCHECK(mscclppGdrCudaCalloc(&conn->cpuProxyFlag, &devConn->proxyEpochId, 1, &conn->cpuProxyFlagGdrDesc)); + MSCCLPPCHECK(mscclppCudaCalloc(&devConn->proxyEpochId, 1)); struct mscclppIbContext* ibCtx = conn->ibCtx; if (conn->ibQp == NULL) { diff --git a/src/proxy.cc b/src/proxy.cc index f02d66be..22578254 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -122,7 +122,8 @@ void* mscclppProxyServiceP2P(void* _args) // Send completion: reset only the high 64 bits *(volatile uint64_t*)(&fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0; cachedFifoTail++; - PROXYCUDACHECK(cudaMemcpyAsync(fifoTailDevPtr, &cachedFifoTail, sizeof(uint64_t), cudaMemcpyHostToDevice, stream2)); + if (((cachedFifoTail % 4) == 0) || (trigger.fields.type & mscclppSync)) + PROXYCUDACHECK(cudaMemcpyAsync(fifoTailDevPtr, &cachedFifoTail, sizeof(uint64_t), cudaMemcpyHostToDevice, stream2)); *fifoTail = cachedFifoTail; } @@ -314,7 +315,8 @@ void* mscclppProxyServiceIb(void* _args) // Send completion: reset only the high 64 bits *(volatile uint64_t*)(&fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0; cachedFifoTail++; - PROXYCUDACHECK(cudaMemcpyAsync(fifoTailDevPtr, &cachedFifoTail, sizeof(uint64_t), cudaMemcpyHostToDevice, stream)); + if (((cachedFifoTail % 4) == 0) || (trigger.fields.type & mscclppSync)) + PROXYCUDACHECK(cudaMemcpyAsync(fifoTailDevPtr, &cachedFifoTail, sizeof(uint64_t), cudaMemcpyHostToDevice, stream)); *fifoTail = cachedFifoTail; #endif }