diff --git a/src/include/comm.h b/src/include/comm.h
index b99cfad2..3cfd772c 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -165,8 +165,9 @@ struct mscclppConn {
   int remoteRank;
   int buffSize;
   mscclppTrigger *cpuTriggerFifo;
-  int* fifoHead; // indicates where CPU needs to read work elements. Write by CPU only, read by both
-  int* fifoTail; // indicates where GPU needs to write work elements. Write by GPU only, read by both
+  // fifoTail indicates where CPU needs to read the head of the fifo. only accessible by CPU
+  // No atomicity is required for fifoTail as only a single CPU thread accesses it.
+  int fifoTail; 
   uint64_t *remoteProxyFlag;
   uint64_t *cpuProxyFlag;
   void *cpuTriggerFifoGdrDesc;
diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h
index ba9c4f87..59494d10 100644
--- a/src/include/mscclpp.h
+++ b/src/include/mscclpp.h
@@ -79,6 +79,7 @@ struct mscclppDevConn {
   // // localBuff[srcOffset..srcOffset+size-1] <- remoteBuff[dstOffset..dstOffset+size-1]
   // virtual void pullRemoteBuff(size_t srcOffset, size_t dstOffset, size_t size);
 
+  int* triggerFifoHead; // indicates the tail of the fifo. only accessible by the gpu. for parallel, access use atomic
   mscclppTrigger* trigger;
   uint64_t* proxyFlag;
 };
diff --git a/src/init.cc b/src/init.cc
index fcea06bd..2cb063a6 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -179,7 +179,10 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, mscclppDevConn* devConnOut, i
   conn->devConn->localBuff = localBuff;
   conn->devConn->localFlag = localFlag;
   conn->devConn->tag = tag;
+
+  // TODO(saemal): these two should be shared for all P2P-DMA connections made from each GPU. Same for each IB driver.
   MSCCLPPCHECK(mscclppGdrCudaCalloc(&conn->cpuTriggerFifo, &conn->devConn->trigger, MSCCLPP_PROXY_FIFO_SIZE, &conn->cpuTriggerFifoGdrDesc));
+  MSCCLPPCHECK(mscclppCudaCalloc(&conn->devConn->triggerFifoHead, 1));
 
   conn->ibCtx = NULL;
   conn->ibQp = NULL;
diff --git a/src/proxy.cc b/src/proxy.cc
index c7512a07..ee0c11e9 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -59,7 +59,7 @@ void* mscclppProxyServiceP2P(void* _args) {
 
   while (*run) {
     // Poll to see if we are ready to send anything
-    trigger.value = *(volatile uint64_t *)conn->cpuTriggerFifo;
+    trigger.value = *(volatile uint64_t *)(&conn->cpuTriggerFifo[conn->triggerFifoHead]);
     if (trigger.value == 0) continue;
 
     // Iterate over what send is needed
@@ -79,6 +79,7 @@ void* mscclppProxyServiceP2P(void* _args) {
     // send completion
     volatile uint64_t *tmp = (volatile uint64_t *)conn->cpuTriggerFifo;
     *tmp = 0;
+    conn->triggerFifoHead++;
   }
   *run = 1;
   PROXYCUDACHECK(cudaStreamDestroy(stream));
diff --git a/tests/p2p_test.cu b/tests/p2p_test.cu
index 1fb47e46..3e2bd964 100644
--- a/tests/p2p_test.cu
+++ b/tests/p2p_test.cu
@@ -42,7 +42,8 @@ __global__ void kernel(int rank, int world_size)
   volatile uint64_t *localFlag = devConn.localFlag;
   volatile uint64_t *remoteFlag = devConn.remoteFlag;
   volatile uint64_t *proxyFlag = devConn.proxyFlag;
-  volatile uint64_t *trig = (volatile uint64_t *)devConn.trigger;
+  volatile uint64_t *trig = (volatile uint64_t *)devConn.triggerFifo[devConn.triggerFifoHead];
+  devConn.triggerFifoHead++;
   uint64_t baseFlag = *localFlag;
 
   if (threadIdx.x == 0) {