diff --git a/src/include/comm.h b/src/include/comm.h index b99cfad2..3cfd772c 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -165,8 +165,9 @@ struct mscclppConn { int remoteRank; int buffSize; mscclppTrigger *cpuTriggerFifo; - int* fifoHead; // indicates where CPU needs to read work elements. Write by CPU only, read by both - int* fifoTail; // indicates where GPU needs to write work elements. Write by GPU only, read by both + // fifoTail indicates where CPU needs to read the head of the fifo. only accessible by CPU + // No atomicity is required for fifoTail as only a single CPU thread accesses it. + int fifoTail; uint64_t *remoteProxyFlag; uint64_t *cpuProxyFlag; void *cpuTriggerFifoGdrDesc; diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index ba9c4f87..59494d10 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -79,6 +79,7 @@ struct mscclppDevConn { // // localBuff[srcOffset..srcOffset+size-1] <- remoteBuff[dstOffset..dstOffset+size-1] // virtual void pullRemoteBuff(size_t srcOffset, size_t dstOffset, size_t size); + int* triggerFifoHead; // indicates the tail of the fifo. only accessible by the gpu. for parallel, access use atomic mscclppTrigger* trigger; uint64_t* proxyFlag; }; diff --git a/src/init.cc b/src/init.cc index fcea06bd..2cb063a6 100644 --- a/src/init.cc +++ b/src/init.cc @@ -179,7 +179,10 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, mscclppDevConn* devConnOut, i conn->devConn->localBuff = localBuff; conn->devConn->localFlag = localFlag; conn->devConn->tag = tag; + + // TODO(saemal): these two should be shared for all P2P-DMA connections made from each GPU. Same for each IB driver. MSCCLPPCHECK(mscclppGdrCudaCalloc(&conn->cpuTriggerFifo, &conn->devConn->trigger, MSCCLPP_PROXY_FIFO_SIZE, &conn->cpuTriggerFifoGdrDesc)); + MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->triggerFifoHead, 1)); conn->ibCtx = NULL; conn->ibQp = NULL; diff --git a/src/proxy.cc b/src/proxy.cc index c7512a07..ee0c11e9 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -59,7 +59,7 @@ void* mscclppProxyServiceP2P(void* _args) { while (*run) { // Poll to see if we are ready to send anything - trigger.value = *(volatile uint64_t *)conn->cpuTriggerFifo; + trigger.value = *(volatile uint64_t *)(&conn->cpuTriggerFifo[conn->triggerFifoHead]); if (trigger.value == 0) continue; // Iterate over what send is needed @@ -79,6 +79,7 @@ void* mscclppProxyServiceP2P(void* _args) { // send completion volatile uint64_t *tmp = (volatile uint64_t *)conn->cpuTriggerFifo; *tmp = 0; + conn->triggerFifoHead++; } *run = 1; PROXYCUDACHECK(cudaStreamDestroy(stream)); diff --git a/tests/p2p_test.cu b/tests/p2p_test.cu index 1fb47e46..3e2bd964 100644 --- a/tests/p2p_test.cu +++ b/tests/p2p_test.cu @@ -42,7 +42,8 @@ __global__ void kernel(int rank, int world_size) volatile uint64_t *localFlag = devConn.localFlag; volatile uint64_t *remoteFlag = devConn.remoteFlag; volatile uint64_t *proxyFlag = devConn.proxyFlag; - volatile uint64_t *trig = (volatile uint64_t *)devConn.trigger; + volatile uint64_t *trig = (volatile uint64_t *)devConn.triggerFifo[devConn.triggerFifoHead]; + devConn.triggerFifoHead++; uint64_t baseFlag = *localFlag; if (threadIdx.x == 0) {