done with the design

2026-05-13 09:46:00 +00:00 · 2023-03-06 12:26:46 -08:00
parent b663469bcd
commit dced4c4c14
5 changed files with 11 additions and 4 deletions
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -165,8 +165,9 @@ struct mscclppConn {
  int remoteRank;
  int buffSize;
  mscclppTrigger *cpuTriggerFifo;
-  int* fifoHead; // indicates where CPU needs to read work elements. Write by CPU only, read by both
-  int* fifoTail; // indicates where GPU needs to write work elements. Write by GPU only, read by both
+  // fifoTail indicates where the CPU reads the head of the fifo. It is only accessed by the CPU.
+  // No atomicity is required for fifoTail because only a single CPU thread accesses it.
+  int fifoTail; 
  uint64_t *remoteProxyFlag;
  uint64_t *cpuProxyFlag;
  void *cpuTriggerFifoGdrDesc;
--- a/src/include/mscclpp.h
+++ b/src/include/mscclpp.h
@@ -79,6 +79,7 @@ struct mscclppDevConn {
  // // localBuff[srcOffset..srcOffset+size-1] <- remoteBuff[dstOffset..dstOffset+size-1]
  // virtual void pullRemoteBuff(size_t srcOffset, size_t dstOffset, size_t size);

+  int* triggerFifoHead; // indicates the tail of the fifo. Only accessible by the GPU; for parallel access, use atomics.
  mscclppTrigger* trigger;
  uint64_t* proxyFlag;
 };
--- a/src/init.cc
+++ b/src/init.cc
@@ -179,7 +179,10 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, mscclppDevConn* devConnOut, i
  conn->devConn->localBuff = localBuff;
  conn->devConn->localFlag = localFlag;
  conn->devConn->tag = tag;
+
+  // TODO(saemal): these two should be shared for all P2P-DMA connections made from each GPU. Same for each IB driver.
  MSCCLPPCHECK(mscclppGdrCudaCalloc(&conn->cpuTriggerFifo, &conn->devConn->trigger, MSCCLPP_PROXY_FIFO_SIZE, &conn->cpuTriggerFifoGdrDesc));
+  MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->triggerFifoHead, 1));

  conn->ibCtx = NULL;
  conn->ibQp = NULL;
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -59,7 +59,7 @@ void* mscclppProxyServiceP2P(void* _args) {

  while (*run) {
    // Poll to see if we are ready to send anything
-    trigger.value = *(volatile uint64_t *)conn->cpuTriggerFifo;
+    trigger.value = *(volatile uint64_t *)(&conn->cpuTriggerFifo[conn->triggerFifoHead]);
    if (trigger.value == 0) continue;

    // Iterate over what send is needed
@@ -79,6 +79,7 @@ void* mscclppProxyServiceP2P(void* _args) {
    // send completion
    volatile uint64_t *tmp = (volatile uint64_t *)conn->cpuTriggerFifo;
    *tmp = 0;
+    conn->triggerFifoHead++;
  }
  *run = 1;
  PROXYCUDACHECK(cudaStreamDestroy(stream));
--- a/tests/p2p_test.cu
+++ b/tests/p2p_test.cu
@@ -42,7 +42,8 @@ __global__ void kernel(int rank, int world_size)
  volatile uint64_t *localFlag = devConn.localFlag;
  volatile uint64_t *remoteFlag = devConn.remoteFlag;
  volatile uint64_t *proxyFlag = devConn.proxyFlag;
-  volatile uint64_t *trig = (volatile uint64_t *)devConn.trigger;
+  volatile uint64_t *trig = (volatile uint64_t *)devConn.triggerFifo[devConn.triggerFifoHead];
+  devConn.triggerFifoHead++;
  uint64_t baseFlag = *localFlag;

  if (threadIdx.x == 0) {