mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-13 01:36:10 +00:00
cleanups
This commit is contained in:
@@ -177,231 +177,23 @@ struct mscclppConn {
|
||||
};
|
||||
|
||||
struct mscclppComm {
|
||||
// struct mscclppMemoryStack memPermanent, memScoped;
|
||||
// // List of destructors to run when comm is destructed
|
||||
// struct mscclppDestructor* destructorHead;
|
||||
|
||||
// struct mscclppChannel channels[MAXCHANNELS];
|
||||
// struct mscclppPeerInfo* peerInfo;
|
||||
// struct mscclppTopoSystem* topo;
|
||||
|
||||
struct mscclppConn conns[MAXCONNECTIONS];
|
||||
int nConns;
|
||||
|
||||
// mscclppNet_t* mscclppNet;
|
||||
// mscclppCollNet_t* mscclppCollNet;
|
||||
void* bootstrap;
|
||||
// // Bitmasks for mscclppTransportP2pSetup
|
||||
// uint64_t* connectSend;
|
||||
// uint64_t* connectRecv;
|
||||
|
||||
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
|
||||
|
||||
int rank; // my rank in the communicator
|
||||
int nRanks; // number of GPUs in communicator
|
||||
int cudaDev; // my cuda device index
|
||||
// int compCap; // compute capability of the GPU
|
||||
// int64_t busId; // my PCI bus ID in int format
|
||||
// cpu_set_t cpuAffinity; // CPU affinity of the GPU
|
||||
|
||||
// int node;
|
||||
// int nNodes;
|
||||
// int localRank;
|
||||
// int localRanks;
|
||||
// int maxLocalRanks;
|
||||
// int* rankToNode;
|
||||
// int* rankToLocalRank;
|
||||
// int* localRankToRank;
|
||||
// // localRanks and localRanktoRank for all nodes
|
||||
// struct mscclppNodeRanks* nodeRanks;
|
||||
|
||||
// bool checkPointers;
|
||||
// bool dmaBufSupport;
|
||||
|
||||
// // Counter for tracking CUDA launches (P2P and collectives included)
|
||||
// uint64_t opCount;
|
||||
// // Collective operation counter
|
||||
// uint64_t collOpCount;
|
||||
|
||||
// // Channels for collectives
|
||||
// int nChannels;
|
||||
// // Channels (per peer) for p2p
|
||||
// int p2pnChannels;
|
||||
// int p2pnChannelsPerPeer;
|
||||
// int p2pChannels[MAXCHANNELS];
|
||||
|
||||
// // Should this comm allocate LL buffers for network P2P connections?
|
||||
// bool allocP2pNetLLBuffers;
|
||||
|
||||
// // Buffer sizes
|
||||
// int buffSizes[MSCCLPP_NUM_PROTOCOLS];
|
||||
// int p2pChunkSize;
|
||||
|
||||
// // Algorithm/Protocols thresholds
|
||||
// ssize_t threadThresholds[MSCCLPP_NUM_ALGORITHMS][MSCCLPP_NUM_PROTOCOLS];
|
||||
// float latencies[MSCCLPP_NUM_FUNCTIONS][MSCCLPP_NUM_ALGORITHMS][MSCCLPP_NUM_PROTOCOLS];
|
||||
// float bandwidths[MSCCLPP_NUM_FUNCTIONS][MSCCLPP_NUM_ALGORITHMS][MSCCLPP_NUM_PROTOCOLS];
|
||||
// int maxThreads[MSCCLPP_NUM_ALGORITHMS][MSCCLPP_NUM_PROTOCOLS];
|
||||
|
||||
// /* This attribute can indicate the states of communicators and return code of
|
||||
// * asynchronous MSCCLPP operations. */
|
||||
// mscclppResult_t asyncResult;
|
||||
|
||||
// Flag to ask MSCCLPP kernels to abort
|
||||
volatile uint32_t *abortFlag;
|
||||
|
||||
// // Device side of the communicator (for cudaFree's)
|
||||
// struct mscclppDevComm* devComm; // actually = &mscclppDevCommAndChannels::comm
|
||||
|
||||
// // Operation pool.
|
||||
// int workFifoDepth; // size of workFifoHeap[], power of 2
|
||||
// struct mscclppWork* workFifoHeap;
|
||||
// struct mscclppWork* devWorkFifoHeap;
|
||||
// void* workFifoHeapGdrHandle;
|
||||
|
||||
// // Work completion notificaion
|
||||
// uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory
|
||||
// uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot.
|
||||
// uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels.
|
||||
|
||||
// // Intra-process sync
|
||||
// struct mscclppComm* intraComm0; // leader of intra-process comms (self possible)
|
||||
// struct mscclppComm* intraNext; // next of intra-process comms, intraComm0 is head
|
||||
// int intraRank;
|
||||
// int intraRanks;
|
||||
// uint32_t intraBarrierPhase;
|
||||
// char intraPad1[64 - sizeof(uint64_t)];
|
||||
// uint64_t intraBarrierCounter; // only used if this is intraComm0
|
||||
// char intraPad2[64 - sizeof(uint64_t)];
|
||||
// uint64_t intraBarrierGate; // only used if this is intraComm0
|
||||
|
||||
struct mscclppIbContext *ibContext[MSCCLPP_IB_MAX_DEVS];
|
||||
|
||||
// Last one is for P2P proxies.
|
||||
struct mscclppProxyState proxyState[MSCCLPP_IB_MAX_DEVS + 1];
|
||||
|
||||
// // Whether this communicator uses collNet
|
||||
// int collNetSupport;
|
||||
// int intraHighestTransportType;
|
||||
|
||||
// size_t channelSize; // User requested work size (bytes) for channel partitions
|
||||
|
||||
// // Internal streams
|
||||
// struct mscclppStrongStream deviceStream, hostStream;
|
||||
|
||||
// // pools backed by comm->memPermanent
|
||||
// struct mscclppMemoryPool memPool_mscclppProxyOp;
|
||||
// struct mscclppMemoryPool memPool_mscclppKernelPlan;
|
||||
// struct mscclppMemoryPool memPool_mscclppPointerList;
|
||||
// // Next comm in this thread's active mscclppGroup[Start|End](). Holds "0x1" when
|
||||
// // this comm is not yet in a group.
|
||||
// struct mscclppComm* groupNext;
|
||||
// // Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
|
||||
// struct mscclppComm* preconnectNext;
|
||||
// int persistentRefs; // number of persistent plan-lists capturing this comm
|
||||
// struct mscclppTasks tasks;
|
||||
|
||||
// // user-created reduction ops
|
||||
// int userRedOpCapacity, userRedOpFreeHead;
|
||||
// mscclppUserRedOp *userRedOps;
|
||||
|
||||
// // Queue of things for the main thread to do
|
||||
// struct mscclppIntruQueueMpsc<struct mscclppCommCallback, &mscclppCommCallback::next> callbackQueue;
|
||||
|
||||
// // List of kernel plans built form tasks.
|
||||
// struct mscclppIntruQueue<struct mscclppKernelPlan, &mscclppKernelPlan::next> planQueue;
|
||||
// // First of the unlaunched kernels in `planQueue`
|
||||
// struct mscclppKernelPlan* unlaunchedPlansHead;
|
||||
|
||||
// // communicator mode
|
||||
// int blocking;
|
||||
// // initState is to more conveniently reclaim resources when errors happen.
|
||||
// mscclppResult_t initState;
|
||||
// // flag to indicate if mscclppCommFinalize() is called
|
||||
// bool finalizeCalled;
|
||||
// // shared structures for finalization
|
||||
// int finalizeRankCnt;
|
||||
};
|
||||
|
||||
// enum mscclppLaunchMode {
|
||||
// mscclppLaunchModeInvalid=0,
|
||||
// mscclppLaunchModeParallel,
|
||||
// mscclppLaunchModeGroup
|
||||
// };
|
||||
// extern enum mscclppLaunchMode mscclppParamLaunchMode;
|
||||
|
||||
// void mscclppCommPushFree(struct mscclppComm* comm, void* buf);
|
||||
// void mscclppCommPushCudaFree(struct mscclppComm* comm, void* buf);
|
||||
// void mscclppCommPushCudaHostFree(struct mscclppComm* comm, void* buf);
|
||||
// void mscclppCommPushCudaGdrFree(struct mscclppComm* comm, void* handle);
|
||||
|
||||
// inline mscclppResult_t mscclppCommPollCallbacks(struct mscclppComm* comm, bool waitSome) {
|
||||
// mscclppResult_t result = mscclppSuccess;
|
||||
// struct mscclppCommCallback* cb = mscclppIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome);
|
||||
// while (cb != nullptr) {
|
||||
// struct mscclppCommCallback* next = cb->next;
|
||||
// mscclppResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb
|
||||
// if (res1 != mscclppSuccess) result = res1;
|
||||
// cb = next;
|
||||
// }
|
||||
// MSCCLPPCHECK(result);
|
||||
// return mscclppSuccess;
|
||||
// }
|
||||
|
||||
// inline void mscclppCommIntraBarrierIn(struct mscclppComm* comm, uint32_t x) {
|
||||
// int phase = comm->intraBarrierPhase;
|
||||
// if (comm->intraRanks == 1) {
|
||||
// // Release everyone (just me).
|
||||
// comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1);
|
||||
// } else {
|
||||
// struct mscclppComm* comm0 = comm->intraComm0;
|
||||
// uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE);
|
||||
// if (uint32_t(count) == uint32_t(comm->intraRanks)) {
|
||||
// // Reset.
|
||||
// __atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED);
|
||||
// // Release everyone.
|
||||
// __atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// // returns sum of x values contributed to mscclppCommIntraBarrierIn(comm, x)
|
||||
// inline uint32_t mscclppCommIntraBarrierOut(struct mscclppComm* comm) {
|
||||
// struct mscclppComm* comm0 = comm->intraComm0;
|
||||
// comm->intraBarrierPhase ^= 1;
|
||||
// uint32_t phase = comm->intraBarrierPhase;
|
||||
// uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
|
||||
// if ((gate & 1) != phase) {
|
||||
// uint64_t t0 = clockNano();
|
||||
// do {
|
||||
// // Spin vigorously for first 5us.
|
||||
// if (clockNano()-t0 >= 5*1000) sched_yield();
|
||||
// gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
|
||||
// } while ((gate & 1) != phase);
|
||||
// }
|
||||
// if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE);
|
||||
// return gate>>32;
|
||||
// }
|
||||
|
||||
// // Scrambles the bits of non-builtin values of mscclppRedOp_t according to the
|
||||
// // communicator memory address. Used to catch bugs so that integer handles
|
||||
// // associated with this communicator won't collide with handles of other
|
||||
// // communicatrs. This function is its own inverse.
|
||||
// static inline mscclppRedOp_t mscclppUserRedOpMangle(mscclppComm *comm, mscclppRedOp_t op) {
|
||||
// // Preserve the built-in values.
|
||||
// if(int(op) < int(mscclppNumOps))
|
||||
// return op;
|
||||
// uint64_t h = reinterpret_cast<uint64_t>(comm);
|
||||
// h ^= h >> 32;
|
||||
// h *= 0x9e3779b97f4a7c13u; // Knuth's 64-bit magical hash constant
|
||||
// h >>= 32; // h is now an excellent 32-bit hash of the comm pointer
|
||||
// h &= int(mscclppMaxRedOp); // mscclppMaxRedOp is a power of 2 minus 1
|
||||
// int op1 = int(h) ^ int(op);
|
||||
// // Since builtin values are preserved, we also have to preserve their preimage.
|
||||
// return op1 < int(mscclppNumOps) ? op : mscclppRedOp_t(op1);
|
||||
// }
|
||||
|
||||
// mscclppResult_t mscclppCommEnsureReady(mscclppComm_t comm);
|
||||
// mscclppResult_t mscclppCommSetAsyncError(mscclppComm_t comm, mscclppResult_t nextState);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -122,12 +122,12 @@ void* mscclppProxyServiceIb(void* _args) {
|
||||
SEND_STATE_INPROGRESS
|
||||
};
|
||||
int *sendState;
|
||||
uint64_t *currentProxyFlagVlaue;
|
||||
uint64_t *currentProxyFlagValue;
|
||||
if (mscclppCalloc((void **)&sendState, comm->nConns) != mscclppSuccess) {
|
||||
WARN("mscclppCalloc failed: errno %d", errno);
|
||||
return NULL;
|
||||
}
|
||||
if (mscclppCalloc((void **)&currentProxyFlagVlaue, comm->nConns) != mscclppSuccess) {
|
||||
if (mscclppCalloc((void **)&currentProxyFlagValue, comm->nConns) != mscclppSuccess) {
|
||||
WARN("mscclppCalloc failed: errno %d", errno);
|
||||
return NULL;
|
||||
}
|
||||
@@ -143,7 +143,7 @@ void* mscclppProxyServiceIb(void* _args) {
|
||||
for (int i = 0; i < (int)comm->nConns; ++i) {
|
||||
sendState[i] = SEND_STATE_INIT;
|
||||
struct mscclppConn *conn = &comm->conns[i];
|
||||
currentProxyFlagVlaue[i] = *conn->cpuProxyFlag;
|
||||
currentProxyFlagValue[i] = *conn->cpuProxyFlag;
|
||||
// Post recv
|
||||
if (conn->ibQp->postRecv(0) != 0) {
|
||||
WARN("postRecv failed: errno %d", errno);
|
||||
@@ -195,7 +195,7 @@ void* mscclppProxyServiceIb(void* _args) {
|
||||
}
|
||||
if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
|
||||
// TODO(chhwang): cpu flush
|
||||
*((volatile uint64_t *)conn->cpuProxyFlag) = ++currentProxyFlagVlaue[trigger.fields.connId];
|
||||
*((volatile uint64_t *)conn->cpuProxyFlag) = ++currentProxyFlagValue[trigger.fields.connId];
|
||||
// recv completion
|
||||
if (conn->ibQp->postRecv(wc->wr_id) != 0) {
|
||||
WARN("postRecv failed: errno %d", errno);
|
||||
|
||||
Reference in New Issue
Block a user