From 0a707d84ec909f4d51198c19afaae98eeeb64abd Mon Sep 17 00:00:00 2001
From: Saeed Maleki <saemal@microsoft.com>
Date: Wed, 22 Mar 2023 02:19:49 +0000
Subject: [PATCH] new api works -- single node is not performant

---
 Makefile                |  2 +-
 src/include/mscclpp.h   | 45 ++++-------------------------------------
 tests/allgather_test.cu | 34 +++++++++++++++----------------
 3 files changed, 22 insertions(+), 59 deletions(-)

diff --git a/Makefile b/Makefile
index b0c78aa1..8306b656 100644
--- a/Makefile
+++ b/Makefile
@@ -116,7 +116,7 @@ LIBSONAME := $(LIBNAME).$(MSCCLPP_MAJOR)
 LIBTARGET := $(BUILDDIR)/$(LIBDIR)/$(LIBNAME).$(MSCCLPP_MAJOR).$(MSCCLPP_MINOR)
 
 TESTSDIR  := tests
-TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc p2p_test.cu allgather_test.cu allgather_test2.cu)
+TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test.cu)
 TESTSOBJS := $(patsubst %.cc,%.o,$(TESTSSRCS)) $(patsubst %.cu,%.o,$(TESTSSRCS))
 TESTSOBJTARGETS := $(TESTSOBJS:%=$(BUILDDIR)/$(OBJDIR)/%)
 TESTSBINS       := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(TESTSOBJS))
diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h
index c0f07be5..9d815245 100644
--- a/src/include/mscclpp.h
+++ b/src/include/mscclpp.h
@@ -103,7 +103,7 @@ struct mscclppDevConn {
   int tag;
 
   void* localBuff;
-  volatile uint64_t* sendEpochId;  // this is read and written by the GPU
+  uint64_t* sendEpochId;  // this is read and written by the GPU
   uint64_t recvEpochId;   // this is the copy of the remote epoch id.
 
   void* remoteBuff;
@@ -115,7 +115,7 @@ struct mscclppDevConn {
 #ifdef __CUDACC__
 
  __forceinline__ __device__ void increment(){
-   *sendEpochId += 1;
+   *(volatile uint64_t*)sendEpochId += 1;
  }
 
  __forceinline__ __device__ void put(uint64_t dataOffset, uint64_t dataSize){
@@ -132,12 +132,12 @@ struct mscclppDevConn {
   }
 
   __forceinline__ __device__ void sync(mscclppRequest_t req) {
-    while (*(volatile uint64_t *)triggerFifoTail <= req);
+    while (*(volatile uint64_t *)fifo.triggerFifoTail <= req);
   }
 
   __forceinline__ __device__ void wait(){
     recvEpochId++;
-    while (*proxyEpochId < recvEpochId);
+    while (*(volatile uint64_t*)proxyEpochId < recvEpochId);
   }
 #endif
 };
@@ -161,43 +161,6 @@ typedef enum { mscclppSuccess                 =  0,
 
 mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* uniqueId);
 
-/* Reduction operation selector */
-typedef enum { mscclppNumOps_dummy = 5 } mscclppRedOp_dummy_t;
-typedef enum { mscclppSum        = 0,
-               mscclppProd       = 1,
-               mscclppMax        = 2,
-               mscclppMin        = 3,
-               mscclppAvg        = 4,
-               /* mscclppNumOps: The number of built-in mscclppRedOp_t values. Also
-                * serves as the least possible value for dynamic mscclppRedOp_t's
-                * as constructed by mscclppRedOpCreate*** functions. */
-               mscclppNumOps     = 5,
-               /* mscclppMaxRedOp: The largest valid value for mscclppRedOp_t.
-                * It is defined to be the largest signed value (since compilers
-                * are permitted to use signed enums) that won't grow
-                * sizeof(mscclppRedOp_t) when compared to previous MSCCLPP versions to
-                * maintain ABI compatibility. */
-               mscclppMaxRedOp   = 0x7fffffff>>(32-8*sizeof(mscclppRedOp_dummy_t))
-             } mscclppRedOp_t;
-
-/* Data types */
-typedef enum { mscclppInt8       = 0, mscclppChar       = 0,
-               mscclppUint8      = 1,
-               mscclppInt32      = 2, mscclppInt        = 2,
-               mscclppUint32     = 3,
-               mscclppInt64      = 4,
-               mscclppUint64     = 5,
-               mscclppFloat16    = 6, mscclppHalf       = 6,
-               mscclppFloat32    = 7, mscclppFloat      = 7,
-               mscclppFloat64    = 8, mscclppDouble     = 8,
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-               mscclppBfloat16   = 9,
-               mscclppNumTypes   = 10
-#else
-               mscclppNumTypes   = 9
-#endif
-} mscclppDataType_t;
-
 /* Transport Types */
 typedef enum { mscclppTransportP2P = 0,
                mscclppTransportSHM = 1, // TODO(chhwang): not implemented yet
diff --git a/tests/allgather_test.cu b/tests/allgather_test.cu
index 4aa1c027..e3de578e 100644
--- a/tests/allgather_test.cu
+++ b/tests/allgather_test.cu
@@ -49,24 +49,24 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU)
   mscclppDevConn_t devConn = constDevConns[remoteRank];
 
   // volatile int *data = (volatile int *)devConn.localBuff;
-  volatile uint64_t *localFlag = devConn.localFlag;
-  volatile uint64_t *proxyFlag = devConn.proxyFlag;
+  // volatile uint64_t *localFlag = devConn.localFlag;
+  // volatile uint64_t *proxyFlag = devConn.proxyFlag;
 
-  uint64_t baseFlag = *localFlag;
+  // uint64_t baseFlag = *localFlag;
 
-  if (threadIdx.x == 0) {
-    *localFlag = baseFlag + 1;
-  }
+  // if (threadIdx.x == 0) {
+  //   *localFlag = baseFlag + 1;
+  // }
 
   // Each warp receives data from different ranks
 #if 1
   // push your data asynchronously
-  devConn.fifo.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
+  devConn.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
 
   // push with flag and sync to make sure the data is received
-  auto req = devConn.fifo.signal();
+  auto req = devConn.signal();
 
-  devConn.fifo.sync(req);
+  devConn.sync(req);
 
   devConn.wait();
   //while (*proxyFlag == baseFlag);
@@ -75,18 +75,18 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU)
   for (int i = 1; i < world_size; i++){
     __syncthreads();
     if (remoteRank != ((rank+i) % world_size)) continue;
-    // get a thread-local trigger and a request for waiting on it
-    mscclppTrigger_t trig;
-    mscclppRequest_t req = devConn.fifo.getTrigger(&trig);
+    // push your data asynchronously
+    devConn.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
 
-    // Trigger sending data, flag and synchronize after
-    devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
+    // push with flag and sync to make sure the data is received
+    auto req = devConn.signal();
+
+    devConn.sync(req);
 
-    // Wait on the request to make sure it is safe to reuse buffer and flag
-    devConn.fifo.waitTrigger(req);    
   }
+  devConn.wait();
   // Wait for receiving data from remote rank
-  while (*proxyFlag == baseFlag);
+  // while (*proxyFlag == baseFlag);
 #endif
 
 }