New API works -- single-node path is not yet performant

This commit is contained in:
Saeed Maleki
2023-03-22 02:19:49 +00:00
parent b75f9e6d8a
commit 0a707d84ec
3 changed files with 22 additions and 59 deletions

View File

@@ -116,7 +116,7 @@ LIBSONAME := $(LIBNAME).$(MSCCLPP_MAJOR)
LIBTARGET := $(BUILDDIR)/$(LIBDIR)/$(LIBNAME).$(MSCCLPP_MAJOR).$(MSCCLPP_MINOR)
TESTSDIR := tests
TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc p2p_test.cu allgather_test.cu allgather_test2.cu)
TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test.cu)
TESTSOBJS := $(patsubst %.cc,%.o,$(TESTSSRCS)) $(patsubst %.cu,%.o,$(TESTSSRCS))
TESTSOBJTARGETS := $(TESTSOBJS:%=$(BUILDDIR)/$(OBJDIR)/%)
TESTSBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(TESTSOBJS))

View File

@@ -103,7 +103,7 @@ struct mscclppDevConn {
int tag;
void* localBuff;
volatile uint64_t* sendEpochId; // this is read and written by the GPU
uint64_t* sendEpochId; // this is read and written by the GPU
uint64_t recvEpochId; // this is the copy of the remote epoch id.
void* remoteBuff;
@@ -115,7 +115,7 @@ struct mscclppDevConn {
#ifdef __CUDACC__
__forceinline__ __device__ void increment(){
*sendEpochId += 1;
*(volatile uint64_t*)sendEpochId += 1;
}
__forceinline__ __device__ void put(uint64_t dataOffset, uint64_t dataSize){
@@ -132,12 +132,12 @@ struct mscclppDevConn {
}
__forceinline__ __device__ void sync(mscclppRequest_t req) {
while (*(volatile uint64_t *)triggerFifoTail <= req);
while (*(volatile uint64_t *)fifo.triggerFifoTail <= req);
}
__forceinline__ __device__ void wait(){
recvEpochId++;
while (*proxyEpochId < recvEpochId);
while (*(volatile uint64_t*)proxyEpochId < recvEpochId);
}
#endif
};
@@ -161,43 +161,6 @@ typedef enum { mscclppSuccess = 0,
mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* uniqueId);
/* Reduction operation selector */
typedef enum { mscclppNumOps_dummy = 5 } mscclppRedOp_dummy_t;
typedef enum { mscclppSum = 0,
mscclppProd = 1,
mscclppMax = 2,
mscclppMin = 3,
mscclppAvg = 4,
/* mscclppNumOps: The number of built-in mscclppRedOp_t values. Also
* serves as the least possible value for dynamic mscclppRedOp_t's
* as constructed by mscclppRedOpCreate*** functions. */
mscclppNumOps = 5,
/* mscclppMaxRedOp: The largest valid value for mscclppRedOp_t.
* It is defined to be the largest signed value (since compilers
* are permitted to use signed enums) that won't grow
* sizeof(mscclppRedOp_t) when compared to previous MSCCLPP versions to
* maintain ABI compatibility. */
mscclppMaxRedOp = 0x7fffffff>>(32-8*sizeof(mscclppRedOp_dummy_t))
} mscclppRedOp_t;
/* Data types */
typedef enum { mscclppInt8 = 0, mscclppChar = 0,
mscclppUint8 = 1,
mscclppInt32 = 2, mscclppInt = 2,
mscclppUint32 = 3,
mscclppInt64 = 4,
mscclppUint64 = 5,
mscclppFloat16 = 6, mscclppHalf = 6,
mscclppFloat32 = 7, mscclppFloat = 7,
mscclppFloat64 = 8, mscclppDouble = 8,
#if defined(__CUDA_BF16_TYPES_EXIST__)
mscclppBfloat16 = 9,
mscclppNumTypes = 10
#else
mscclppNumTypes = 9
#endif
} mscclppDataType_t;
/* Transport Types */
typedef enum { mscclppTransportP2P = 0,
mscclppTransportSHM = 1, // TODO(chhwang): not implemented yet

View File

@@ -49,24 +49,24 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU)
mscclppDevConn_t devConn = constDevConns[remoteRank];
// volatile int *data = (volatile int *)devConn.localBuff;
volatile uint64_t *localFlag = devConn.localFlag;
volatile uint64_t *proxyFlag = devConn.proxyFlag;
// volatile uint64_t *localFlag = devConn.localFlag;
// volatile uint64_t *proxyFlag = devConn.proxyFlag;
uint64_t baseFlag = *localFlag;
// uint64_t baseFlag = *localFlag;
if (threadIdx.x == 0) {
*localFlag = baseFlag + 1;
}
// if (threadIdx.x == 0) {
// *localFlag = baseFlag + 1;
// }
// Each warp receives data from different ranks
#if 1
// push your data asynchronously
devConn.fifo.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
devConn.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
// push with flag and sync to make sure the data is received
auto req = devConn.fifo.signal();
auto req = devConn.signal();
devConn.fifo.sync(req);
devConn.sync(req);
devConn.wait();
//while (*proxyFlag == baseFlag);
@@ -75,18 +75,18 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU)
for (int i = 1; i < world_size; i++){
__syncthreads();
if (remoteRank != ((rank+i) % world_size)) continue;
// get a thread-local trigger and a request for waiting on it
mscclppTrigger_t trig;
mscclppRequest_t req = devConn.fifo.getTrigger(&trig);
// push your data asynchronously
devConn.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
// Trigger sending data, flag and synchronize after
devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
// push with flag and sync to make sure the data is received
auto req = devConn.signal();
devConn.sync(req);
// Wait on the request to make sure it is safe to reuse buffer and flag
devConn.fifo.waitTrigger(req);
}
devConn.wait();
// Wait for receiving data from remote rank
while (*proxyFlag == baseFlag);
// while (*proxyFlag == baseFlag);
#endif
}