New API works -- single-node path is not yet performant

This commit is contained in:
Saeed Maleki
2023-03-22 02:19:49 +00:00
parent b75f9e6d8a
commit 0a707d84ec
3 changed files with 22 additions and 59 deletions

View File

@@ -116,7 +116,7 @@ LIBSONAME := $(LIBNAME).$(MSCCLPP_MAJOR)
LIBTARGET := $(BUILDDIR)/$(LIBDIR)/$(LIBNAME).$(MSCCLPP_MAJOR).$(MSCCLPP_MINOR)
TESTSDIR := tests
TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc p2p_test.cu allgather_test.cu allgather_test2.cu)
TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test.cu)
TESTSOBJS := $(patsubst %.cc,%.o,$(TESTSSRCS)) $(patsubst %.cu,%.o,$(TESTSSRCS))
TESTSOBJTARGETS := $(TESTSOBJS:%=$(BUILDDIR)/$(OBJDIR)/%)
TESTSBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(TESTSOBJS))

View File

@@ -103,7 +103,7 @@ struct mscclppDevConn {
int tag;
void* localBuff;
volatile uint64_t* sendEpochId; // this is read and written by the GPU
uint64_t* sendEpochId; // this is read and written by the GPU
uint64_t recvEpochId; // this is the copy of the remote epoch id.
void* remoteBuff;
@@ -115,7 +115,7 @@ struct mscclppDevConn {
#ifdef __CUDACC__
__forceinline__ __device__ void increment(){
*sendEpochId += 1;
*(volatile uint64_t*)sendEpochId += 1;
}
__forceinline__ __device__ void put(uint64_t dataOffset, uint64_t dataSize){
@@ -132,12 +132,12 @@ struct mscclppDevConn {
}
__forceinline__ __device__ void sync(mscclppRequest_t req) {
while (*(volatile uint64_t *)triggerFifoTail <= req);
while (*(volatile uint64_t *)fifo.triggerFifoTail <= req);
}
__forceinline__ __device__ void wait(){
recvEpochId++;
while (*proxyEpochId < recvEpochId);
while (*(volatile uint64_t*)proxyEpochId < recvEpochId);
}
#endif
};
@@ -161,43 +161,6 @@ typedef enum { mscclppSuccess = 0,
mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* uniqueId);
/* Reduction operation selector */
typedef enum { mscclppNumOps_dummy = 5 } mscclppRedOp_dummy_t;
typedef enum { mscclppSum = 0,
mscclppProd = 1,
mscclppMax = 2,
mscclppMin = 3,
mscclppAvg = 4,
/* mscclppNumOps: The number of built-in mscclppRedOp_t values. Also
* serves as the least possible value for dynamic mscclppRedOp_t's
* as constructed by mscclppRedOpCreate*** functions. */
mscclppNumOps = 5,
/* mscclppMaxRedOp: The largest valid value for mscclppRedOp_t.
* It is defined to be the largest signed value (since compilers
* are permitted to use signed enums) that won't grow
* sizeof(mscclppRedOp_t) when compared to previous MSCCLPP versions to
* maintain ABI compatibility. */
mscclppMaxRedOp = 0x7fffffff>>(32-8*sizeof(mscclppRedOp_dummy_t))
} mscclppRedOp_t;
/* Data types */
typedef enum { mscclppInt8 = 0, mscclppChar = 0,
mscclppUint8 = 1,
mscclppInt32 = 2, mscclppInt = 2,
mscclppUint32 = 3,
mscclppInt64 = 4,
mscclppUint64 = 5,
mscclppFloat16 = 6, mscclppHalf = 6,
mscclppFloat32 = 7, mscclppFloat = 7,
mscclppFloat64 = 8, mscclppDouble = 8,
#if defined(__CUDA_BF16_TYPES_EXIST__)
mscclppBfloat16 = 9,
mscclppNumTypes = 10
#else
mscclppNumTypes = 9
#endif
} mscclppDataType_t;
/* Transport Types */
typedef enum { mscclppTransportP2P = 0,
mscclppTransportSHM = 1, // TODO(chhwang): not implemented yet

View File

@@ -49,24 +49,24 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU)
mscclppDevConn_t devConn = constDevConns[remoteRank];
// volatile int *data = (volatile int *)devConn.localBuff;
volatile uint64_t *localFlag = devConn.localFlag;
volatile uint64_t *proxyFlag = devConn.proxyFlag;
// volatile uint64_t *localFlag = devConn.localFlag;
// volatile uint64_t *proxyFlag = devConn.proxyFlag;
uint64_t baseFlag = *localFlag;
// uint64_t baseFlag = *localFlag;
if (threadIdx.x == 0) {
*localFlag = baseFlag + 1;
}
// if (threadIdx.x == 0) {
// *localFlag = baseFlag + 1;
// }
// Each warp receives data from different ranks
#if 1
// push your data asynchronously
devConn.fifo.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
devConn.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
// push with flag and sync to make sure the data is received
auto req = devConn.fifo.signal();
auto req = devConn.signal();
devConn.fifo.sync(req);
devConn.sync(req);
devConn.wait();
//while (*proxyFlag == baseFlag);
@@ -75,18 +75,18 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU)
for (int i = 1; i < world_size; i++){
__syncthreads();
if (remoteRank != ((rank+i) % world_size)) continue;
// get a thread-local trigger and a request for waiting on it
mscclppTrigger_t trig;
mscclppRequest_t req = devConn.fifo.getTrigger(&trig);
// push your data asynchronously
devConn.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
// Trigger sending data, flag and synchronize after
devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
// push with flag and sync to make sure the data is received
auto req = devConn.signal();
devConn.sync(req);
// Wait on the request to make sure it is safe to reuse buffer and flag
devConn.fifo.waitTrigger(req);
}
devConn.wait();
// Wait for receiving data from remote rank
while (*proxyFlag == baseFlag);
// while (*proxyFlag == baseFlag);
#endif
}