mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-13 09:46:00 +00:00
new api works -- single node is not performant
This commit is contained in:
2
Makefile
2
Makefile
@@ -116,7 +116,7 @@ LIBSONAME := $(LIBNAME).$(MSCCLPP_MAJOR)
|
||||
LIBTARGET := $(BUILDDIR)/$(LIBDIR)/$(LIBNAME).$(MSCCLPP_MAJOR).$(MSCCLPP_MINOR)
|
||||
|
||||
TESTSDIR := tests
|
||||
TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc p2p_test.cu allgather_test.cu allgather_test2.cu)
|
||||
TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test.cu)
|
||||
TESTSOBJS := $(patsubst %.cc,%.o,$(TESTSSRCS)) $(patsubst %.cu,%.o,$(TESTSSRCS))
|
||||
TESTSOBJTARGETS := $(TESTSOBJS:%=$(BUILDDIR)/$(OBJDIR)/%)
|
||||
TESTSBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(TESTSOBJS))
|
||||
|
||||
@@ -103,7 +103,7 @@ struct mscclppDevConn {
|
||||
int tag;
|
||||
|
||||
void* localBuff;
|
||||
volatile uint64_t* sendEpochId; // this is read and written by the GPU
|
||||
uint64_t* sendEpochId; // this is read and written by the GPU
|
||||
uint64_t recvEpochId; // this is the copy of the remote epoch id.
|
||||
|
||||
void* remoteBuff;
|
||||
@@ -115,7 +115,7 @@ struct mscclppDevConn {
|
||||
#ifdef __CUDACC__
|
||||
|
||||
__forceinline__ __device__ void increment(){
|
||||
*sendEpochId += 1;
|
||||
*(volatile uint64_t*)sendEpochId += 1;
|
||||
}
|
||||
|
||||
__forceinline__ __device__ void put(uint64_t dataOffset, uint64_t dataSize){
|
||||
@@ -132,12 +132,12 @@ struct mscclppDevConn {
|
||||
}
|
||||
|
||||
__forceinline__ __device__ void sync(mscclppRequest_t req) {
|
||||
while (*(volatile uint64_t *)triggerFifoTail <= req);
|
||||
while (*(volatile uint64_t *)fifo.triggerFifoTail <= req);
|
||||
}
|
||||
|
||||
__forceinline__ __device__ void wait(){
|
||||
recvEpochId++;
|
||||
while (*proxyEpochId < recvEpochId);
|
||||
while (*(volatile uint64_t*)proxyEpochId < recvEpochId);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
@@ -161,43 +161,6 @@ typedef enum { mscclppSuccess = 0,
|
||||
|
||||
mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* uniqueId);
|
||||
|
||||
/* Reduction operation selector */
|
||||
typedef enum { mscclppNumOps_dummy = 5 } mscclppRedOp_dummy_t;
|
||||
typedef enum { mscclppSum = 0,
|
||||
mscclppProd = 1,
|
||||
mscclppMax = 2,
|
||||
mscclppMin = 3,
|
||||
mscclppAvg = 4,
|
||||
/* mscclppNumOps: The number of built-in mscclppRedOp_t values. Also
|
||||
* serves as the least possible value for dynamic mscclppRedOp_t's
|
||||
* as constructed by mscclppRedOpCreate*** functions. */
|
||||
mscclppNumOps = 5,
|
||||
/* mscclppMaxRedOp: The largest valid value for mscclppRedOp_t.
|
||||
* It is defined to be the largest signed value (since compilers
|
||||
* are permitted to use signed enums) that won't grow
|
||||
* sizeof(mscclppRedOp_t) when compared to previous MSCCLPP versions to
|
||||
* maintain ABI compatibility. */
|
||||
mscclppMaxRedOp = 0x7fffffff>>(32-8*sizeof(mscclppRedOp_dummy_t))
|
||||
} mscclppRedOp_t;
|
||||
|
||||
/* Data types */
|
||||
typedef enum { mscclppInt8 = 0, mscclppChar = 0,
|
||||
mscclppUint8 = 1,
|
||||
mscclppInt32 = 2, mscclppInt = 2,
|
||||
mscclppUint32 = 3,
|
||||
mscclppInt64 = 4,
|
||||
mscclppUint64 = 5,
|
||||
mscclppFloat16 = 6, mscclppHalf = 6,
|
||||
mscclppFloat32 = 7, mscclppFloat = 7,
|
||||
mscclppFloat64 = 8, mscclppDouble = 8,
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
mscclppBfloat16 = 9,
|
||||
mscclppNumTypes = 10
|
||||
#else
|
||||
mscclppNumTypes = 9
|
||||
#endif
|
||||
} mscclppDataType_t;
|
||||
|
||||
/* Transport Types */
|
||||
typedef enum { mscclppTransportP2P = 0,
|
||||
mscclppTransportSHM = 1, // TODO(chhwang): not implemented yet
|
||||
|
||||
@@ -49,24 +49,24 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU)
|
||||
mscclppDevConn_t devConn = constDevConns[remoteRank];
|
||||
|
||||
// volatile int *data = (volatile int *)devConn.localBuff;
|
||||
volatile uint64_t *localFlag = devConn.localFlag;
|
||||
volatile uint64_t *proxyFlag = devConn.proxyFlag;
|
||||
// volatile uint64_t *localFlag = devConn.localFlag;
|
||||
// volatile uint64_t *proxyFlag = devConn.proxyFlag;
|
||||
|
||||
uint64_t baseFlag = *localFlag;
|
||||
// uint64_t baseFlag = *localFlag;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
*localFlag = baseFlag + 1;
|
||||
}
|
||||
// if (threadIdx.x == 0) {
|
||||
// *localFlag = baseFlag + 1;
|
||||
// }
|
||||
|
||||
// Each warp receives data from different ranks
|
||||
#if 1
|
||||
// push your data asynchronously
|
||||
devConn.fifo.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
|
||||
devConn.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
|
||||
|
||||
// push with flag and sync to make sure the data is received
|
||||
auto req = devConn.fifo.signal();
|
||||
auto req = devConn.signal();
|
||||
|
||||
devConn.fifo.sync(req);
|
||||
devConn.sync(req);
|
||||
|
||||
devConn.wait();
|
||||
//while (*proxyFlag == baseFlag);
|
||||
@@ -75,18 +75,18 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU)
|
||||
for (int i = 1; i < world_size; i++){
|
||||
__syncthreads();
|
||||
if (remoteRank != ((rank+i) % world_size)) continue;
|
||||
// get a thread-local trigger and a request for waiting on it
|
||||
mscclppTrigger_t trig;
|
||||
mscclppRequest_t req = devConn.fifo.getTrigger(&trig);
|
||||
// push your data asynchronously
|
||||
devConn.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
|
||||
|
||||
// Trigger sending data, flag and synchronize after
|
||||
devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
|
||||
// push with flag and sync to make sure the data is received
|
||||
auto req = devConn.signal();
|
||||
|
||||
devConn.sync(req);
|
||||
|
||||
// Wait on the request to make sure it is safe to reuse buffer and flag
|
||||
devConn.fifo.waitTrigger(req);
|
||||
}
|
||||
devConn.wait();
|
||||
// Wait for receiving data from remote rank
|
||||
while (*proxyFlag == baseFlag);
|
||||
// while (*proxyFlag == baseFlag);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user