From 135520a14ae7cc38cb9cd22fe0d52dbaa75cd10e Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 14 Mar 2023 09:21:52 +0000 Subject: [PATCH] cleanups --- src/include/comm.h | 208 --------------------------------------------- src/proxy.cc | 8 +- 2 files changed, 4 insertions(+), 212 deletions(-) diff --git a/src/include/comm.h b/src/include/comm.h index 81449fe2..65435951 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -177,231 +177,23 @@ struct mscclppConn { }; struct mscclppComm { -// struct mscclppMemoryStack memPermanent, memScoped; -// // List of destructors to run when comm is destructed -// struct mscclppDestructor* destructorHead; - -// struct mscclppChannel channels[MAXCHANNELS]; -// struct mscclppPeerInfo* peerInfo; -// struct mscclppTopoSystem* topo; - struct mscclppConn conns[MAXCONNECTIONS]; int nConns; -// mscclppNet_t* mscclppNet; -// mscclppCollNet_t* mscclppCollNet; void* bootstrap; -// // Bitmasks for mscclppTransportP2pSetup -// uint64_t* connectSend; -// uint64_t* connectRecv; uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. 
int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator int cudaDev; // my cuda device index -// int compCap; // compute capability of the GPU -// int64_t busId; // my PCI bus ID in int format -// cpu_set_t cpuAffinity; // CPU affinity of the GPU - - // int node; - // int nNodes; - // int localRank; - // int localRanks; - // int maxLocalRanks; - // int* rankToNode; - // int* rankToLocalRank; - // int* localRankToRank; -// // localRanks and localRanktoRank for all nodes -// struct mscclppNodeRanks* nodeRanks; - -// bool checkPointers; -// bool dmaBufSupport; - -// // Counter for tracking CUDA launches (P2P and collectives included) -// uint64_t opCount; -// // Collective operation counter -// uint64_t collOpCount; - -// // Channels for collectives -// int nChannels; -// // Channels (per peer) for p2p -// int p2pnChannels; -// int p2pnChannelsPerPeer; -// int p2pChannels[MAXCHANNELS]; - -// // Should this comm allocate LL buffers for network P2P connections? -// bool allocP2pNetLLBuffers; - -// // Buffer sizes -// int buffSizes[MSCCLPP_NUM_PROTOCOLS]; -// int p2pChunkSize; - -// // Algorithm/Protocols thresholds -// ssize_t threadThresholds[MSCCLPP_NUM_ALGORITHMS][MSCCLPP_NUM_PROTOCOLS]; -// float latencies[MSCCLPP_NUM_FUNCTIONS][MSCCLPP_NUM_ALGORITHMS][MSCCLPP_NUM_PROTOCOLS]; -// float bandwidths[MSCCLPP_NUM_FUNCTIONS][MSCCLPP_NUM_ALGORITHMS][MSCCLPP_NUM_PROTOCOLS]; -// int maxThreads[MSCCLPP_NUM_ALGORITHMS][MSCCLPP_NUM_PROTOCOLS]; - -// /* This attribute can indicate the states of communicators and return code of -// * asynchronous MSCCLPP operations. */ -// mscclppResult_t asyncResult; // Flag to ask MSCCLPP kernels to abort volatile uint32_t *abortFlag; -// // Device side of the communicator (for cudaFree's) -// struct mscclppDevComm* devComm; // actually = &mscclppDevCommAndChannels::comm - -// // Operation pool. 
-// int workFifoDepth; // size of workFifoHeap[], power of 2 -// struct mscclppWork* workFifoHeap; -// struct mscclppWork* devWorkFifoHeap; -// void* workFifoHeapGdrHandle; - -// // Work completion notificaion -// uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory -// uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot. -// uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels. - -// // Intra-process sync -// struct mscclppComm* intraComm0; // leader of intra-process comms (self possible) -// struct mscclppComm* intraNext; // next of intra-process comms, intraComm0 is head -// int intraRank; -// int intraRanks; -// uint32_t intraBarrierPhase; -// char intraPad1[64 - sizeof(uint64_t)]; -// uint64_t intraBarrierCounter; // only used if this is intraComm0 -// char intraPad2[64 - sizeof(uint64_t)]; -// uint64_t intraBarrierGate; // only used if this is intraComm0 - struct mscclppIbContext *ibContext[MSCCLPP_IB_MAX_DEVS]; - // Last one is for P2P proxies. struct mscclppProxyState proxyState[MSCCLPP_IB_MAX_DEVS + 1]; - -// // Whether this communicator uses collNet -// int collNetSupport; -// int intraHighestTransportType; - -// size_t channelSize; // User requested work size (bytes) for channel partitions - -// // Internal streams -// struct mscclppStrongStream deviceStream, hostStream; - -// // pools backed by comm->memPermanent -// struct mscclppMemoryPool memPool_mscclppProxyOp; -// struct mscclppMemoryPool memPool_mscclppKernelPlan; -// struct mscclppMemoryPool memPool_mscclppPointerList; -// // Next comm in this thread's active mscclppGroup[Start|End](). Holds "0x1" when -// // this comm is not yet in a group. -// struct mscclppComm* groupNext; -// // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. 
-// struct mscclppComm* preconnectNext; -// int persistentRefs; // number of persistent plan-lists capturing this comm -// struct mscclppTasks tasks; - -// // user-created reduction ops -// int userRedOpCapacity, userRedOpFreeHead; -// mscclppUserRedOp *userRedOps; - -// // Queue of things for the main thread to do -// struct mscclppIntruQueueMpsc callbackQueue; - -// // List of kernel plans built form tasks. -// struct mscclppIntruQueue planQueue; -// // First of the unlaunched kernels in `planQueue` -// struct mscclppKernelPlan* unlaunchedPlansHead; - -// // communicator mode -// int blocking; -// // initState is to more conveniently reclaim resources when errors happen. -// mscclppResult_t initState; -// // flag to indicate if mscclppCommFinalize() is called -// bool finalizeCalled; -// // shared structures for finalization -// int finalizeRankCnt; }; -// enum mscclppLaunchMode { -// mscclppLaunchModeInvalid=0, -// mscclppLaunchModeParallel, -// mscclppLaunchModeGroup -// }; -// extern enum mscclppLaunchMode mscclppParamLaunchMode; - -// void mscclppCommPushFree(struct mscclppComm* comm, void* buf); -// void mscclppCommPushCudaFree(struct mscclppComm* comm, void* buf); -// void mscclppCommPushCudaHostFree(struct mscclppComm* comm, void* buf); -// void mscclppCommPushCudaGdrFree(struct mscclppComm* comm, void* handle); - -// inline mscclppResult_t mscclppCommPollCallbacks(struct mscclppComm* comm, bool waitSome) { -// mscclppResult_t result = mscclppSuccess; -// struct mscclppCommCallback* cb = mscclppIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome); -// while (cb != nullptr) { -// struct mscclppCommCallback* next = cb->next; -// mscclppResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb -// if (res1 != mscclppSuccess) result = res1; -// cb = next; -// } -// MSCCLPPCHECK(result); -// return mscclppSuccess; -// } - -// inline void mscclppCommIntraBarrierIn(struct mscclppComm* comm, uint32_t x) { -// int phase = comm->intraBarrierPhase; -// if 
(comm->intraRanks == 1) { -// // Release everyone (just me). -// comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1); -// } else { -// struct mscclppComm* comm0 = comm->intraComm0; -// uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE); -// if (uint32_t(count) == uint32_t(comm->intraRanks)) { -// // Reset. -// __atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED); -// // Release everyone. -// __atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE); -// } -// } -// } - -// // returns sum of x values contributed to mscclppCommIntraBarrierIn(comm, x) -// inline uint32_t mscclppCommIntraBarrierOut(struct mscclppComm* comm) { -// struct mscclppComm* comm0 = comm->intraComm0; -// comm->intraBarrierPhase ^= 1; -// uint32_t phase = comm->intraBarrierPhase; -// uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); -// if ((gate & 1) != phase) { -// uint64_t t0 = clockNano(); -// do { -// // Spin vigorously for first 5us. -// if (clockNano()-t0 >= 5*1000) sched_yield(); -// gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); -// } while ((gate & 1) != phase); -// } -// if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE); -// return gate>>32; -// } - -// // Scrambles the bits of non-builtin values of mscclppRedOp_t according to the -// // communicator memory address. Used to catch bugs so that integer handles -// // associated with this communicator won't collide with handles of other -// // communicatrs. This function is its own inverse. -// static inline mscclppRedOp_t mscclppUserRedOpMangle(mscclppComm *comm, mscclppRedOp_t op) { -// // Preserve the built-in values. 
-// if(int(op) < int(mscclppNumOps)) -// return op; -// uint64_t h = reinterpret_cast<uint64_t>(comm); -// h ^= h >> 32; -// h *= 0x9e3779b97f4a7c13u; // Knuth's 64-bit magical hash constant -// h >>= 32; // h is now an excellent 32-bit hash of the comm pointer -// h &= int(mscclppMaxRedOp); // mscclppMaxRedOp is a power of 2 minus 1 -// int op1 = int(h) ^ int(op); -// // Since builtin values are preserved, we also have to preserve their preimage. -// return op1 < int(mscclppNumOps) ? op : mscclppRedOp_t(op1); -// } - -// mscclppResult_t mscclppCommEnsureReady(mscclppComm_t comm); -// mscclppResult_t mscclppCommSetAsyncError(mscclppComm_t comm, mscclppResult_t nextState); - #endif diff --git a/src/proxy.cc b/src/proxy.cc index 81f2cf23..a46be2b6 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -122,12 +122,12 @@ void* mscclppProxyServiceIb(void* _args) { SEND_STATE_INPROGRESS }; int *sendState; - uint64_t *currentProxyFlagVlaue; + uint64_t *currentProxyFlagValue; if (mscclppCalloc((void **)&sendState, comm->nConns) != mscclppSuccess) { WARN("mscclppCalloc failed: errno %d", errno); return NULL; } - if (mscclppCalloc((void **)&currentProxyFlagVlaue, comm->nConns) != mscclppSuccess) { + if (mscclppCalloc((void **)&currentProxyFlagValue, comm->nConns) != mscclppSuccess) { WARN("mscclppCalloc failed: errno %d", errno); return NULL; } @@ -143,7 +143,7 @@ void* mscclppProxyServiceIb(void* _args) { for (int i = 0; i < (int)comm->nConns; ++i) { sendState[i] = SEND_STATE_INIT; struct mscclppConn *conn = &comm->conns[i]; - currentProxyFlagVlaue[i] = *conn->cpuProxyFlag; + currentProxyFlagValue[i] = *conn->cpuProxyFlag; // Post recv if (conn->ibQp->postRecv(0) != 0) { WARN("postRecv failed: errno %d", errno); @@ -195,7 +195,7 @@ void* mscclppProxyServiceIb(void* _args) { } if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { // TODO(chhwang): cpu flush - *((volatile uint64_t *)conn->cpuProxyFlag) = ++currentProxyFlagVlaue[trigger.fields.connId]; + *((volatile uint64_t *)conn->cpuProxyFlag) = 
++currentProxyFlagValue[trigger.fields.connId]; // recv completion if (conn->ibQp->postRecv(wc->wr_id) != 0) { WARN("postRecv failed: errno %d", errno);