From 0a707d84ec909f4d51198c19afaae98eeeb64abd Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 22 Mar 2023 02:19:49 +0000 Subject: [PATCH] new api works -- single node is not performant --- Makefile | 2 +- src/include/mscclpp.h | 45 ++++------------------------------------- tests/allgather_test.cu | 34 +++++++++++++++---------------- 3 files changed, 22 insertions(+), 59 deletions(-) diff --git a/Makefile b/Makefile index b0c78aa1..8306b656 100644 --- a/Makefile +++ b/Makefile @@ -116,7 +116,7 @@ LIBSONAME := $(LIBNAME).$(MSCCLPP_MAJOR) LIBTARGET := $(BUILDDIR)/$(LIBDIR)/$(LIBNAME).$(MSCCLPP_MAJOR).$(MSCCLPP_MINOR) TESTSDIR := tests -TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc p2p_test.cu allgather_test.cu allgather_test2.cu) +TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test.cu) TESTSOBJS := $(patsubst %.cc,%.o,$(TESTSSRCS)) $(patsubst %.cu,%.o,$(TESTSSRCS)) TESTSOBJTARGETS := $(TESTSOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) TESTSBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(TESTSOBJS)) diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index c0f07be5..9d815245 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -103,7 +103,7 @@ struct mscclppDevConn { int tag; void* localBuff; - volatile uint64_t* sendEpochId; // this is read and written by the GPU + uint64_t* sendEpochId; // this is read and written by the GPU uint64_t recvEpochId; // this is the copy of the remote epoch id. void* remoteBuff; @@ -115,7 +115,7 @@ struct mscclppDevConn { #ifdef __CUDACC__ __forceinline__ __device__ void increment(){ - *sendEpochId += 1; + *(volatile uint64_t*)sendEpochId += 1; } __forceinline__ __device__ void put(uint64_t dataOffset, uint64_t dataSize){ @@ -132,12 +132,12 @@ struct mscclppDevConn { } __forceinline__ __device__ void sync(mscclppRequest_t req) { - while (*(volatile uint64_t *)triggerFifoTail <= req); + while (*(volatile uint64_t *)fifo.triggerFifoTail <= req); } __forceinline__ __device__ void wait(){ recvEpochId++; - while (*proxyEpochId < recvEpochId); + while (*(volatile uint64_t*)proxyEpochId < recvEpochId); } #endif }; @@ -161,43 +161,6 @@ typedef enum { mscclppSuccess = 0, mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* uniqueId); -/* Reduction operation selector */ -typedef enum { mscclppNumOps_dummy = 5 } mscclppRedOp_dummy_t; -typedef enum { mscclppSum = 0, - mscclppProd = 1, - mscclppMax = 2, - mscclppMin = 3, - mscclppAvg = 4, - /* mscclppNumOps: The number of built-in mscclppRedOp_t values. Also - * serves as the least possible value for dynamic mscclppRedOp_t's - * as constructed by mscclppRedOpCreate*** functions. */ - mscclppNumOps = 5, - /* mscclppMaxRedOp: The largest valid value for mscclppRedOp_t. - * It is defined to be the largest signed value (since compilers - * are permitted to use signed enums) that won't grow - * sizeof(mscclppRedOp_t) when compared to previous MSCCLPP versions to - * maintain ABI compatibility. */ - mscclppMaxRedOp = 0x7fffffff>>(32-8*sizeof(mscclppRedOp_dummy_t)) - } mscclppRedOp_t; - -/* Data types */ -typedef enum { mscclppInt8 = 0, mscclppChar = 0, - mscclppUint8 = 1, - mscclppInt32 = 2, mscclppInt = 2, - mscclppUint32 = 3, - mscclppInt64 = 4, - mscclppUint64 = 5, - mscclppFloat16 = 6, mscclppHalf = 6, - mscclppFloat32 = 7, mscclppFloat = 7, - mscclppFloat64 = 8, mscclppDouble = 8, -#if defined(__CUDA_BF16_TYPES_EXIST__) - mscclppBfloat16 = 9, - mscclppNumTypes = 10 -#else - mscclppNumTypes = 9 -#endif -} mscclppDataType_t; - /* Transport Types */ typedef enum { mscclppTransportP2P = 0, mscclppTransportSHM = 1, // TODO(chhwang): not implemented yet diff --git a/tests/allgather_test.cu b/tests/allgather_test.cu index 4aa1c027..e3de578e 100644 --- a/tests/allgather_test.cu +++ b/tests/allgather_test.cu @@ -49,24 +49,24 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU) mscclppDevConn_t devConn = constDevConns[remoteRank]; // volatile int *data = (volatile int *)devConn.localBuff; - volatile uint64_t *localFlag = devConn.localFlag; - volatile uint64_t *proxyFlag = devConn.proxyFlag; + // volatile uint64_t *localFlag = devConn.localFlag; + // volatile uint64_t *proxyFlag = devConn.proxyFlag; - uint64_t baseFlag = *localFlag; + // uint64_t baseFlag = *localFlag; - if (threadIdx.x == 0) { - *localFlag = baseFlag + 1; - } + // if (threadIdx.x == 0) { + // *localFlag = baseFlag + 1; + // } // Each warp receives data from different ranks #if 1 // push your data asynchronously - devConn.fifo.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int)); + devConn.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int)); // push with flag and sync to make sure the data is received - auto req = devConn.fifo.signal(); + auto req = devConn.signal(); - devConn.fifo.sync(req); + devConn.sync(req); devConn.wait(); //while (*proxyFlag == baseFlag); @@ -75,18 +75,18 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU) for (int i = 1; i < world_size; i++){ __syncthreads(); if (remoteRank != ((rank+i) % world_size)) continue; - // get a thread-local trigger and a request for waiting on it - mscclppTrigger_t trig; - mscclppRequest_t req = devConn.fifo.getTrigger(&trig); + // push your data asynchronously + devConn.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int)); - // Trigger sending data, flag and synchronize after - devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int)); + // push with flag and sync to make sure the data is received + auto req = devConn.signal(); + + devConn.sync(req); - // Wait on the request to make sure it is safe to reuse buffer and flag - devConn.fifo.waitTrigger(req); } + devConn.wait(); // Wait for receiving data from remote rank - while (*proxyFlag == baseFlag); + // while (*proxyFlag == baseFlag); #endif }