cleanups

2026-05-13 01:36:10 +00:00 · 2023-03-14 09:21:52 +00:00
parent aacee9727b
commit 135520a14a
2 changed files with 4 additions and 212 deletions
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -177,231 +177,23 @@ struct mscclppConn {
 };

 struct mscclppComm {
-//   struct mscclppMemoryStack memPermanent, memScoped;
-//   // List of destructors to run when comm is destructed
-//   struct mscclppDestructor* destructorHead;
-
-//   struct mscclppChannel channels[MAXCHANNELS];
-//   struct mscclppPeerInfo* peerInfo;
-//   struct mscclppTopoSystem* topo;
-
  struct mscclppConn conns[MAXCONNECTIONS];
  int nConns;

-//   mscclppNet_t* mscclppNet;
-//   mscclppCollNet_t* mscclppCollNet;
  void* bootstrap;
-//   // Bitmasks for mscclppTransportP2pSetup
-//   uint64_t* connectSend;
-//   uint64_t* connectRecv;

  uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.

  int rank;    // my rank in the communicator
  int nRanks;  // number of GPUs in communicator
  int cudaDev; // my cuda device index
-//   int compCap; // compute capability of the GPU
-//   int64_t busId;   // my PCI bus ID in int format
-//   cpu_set_t cpuAffinity; // CPU affinity of the GPU
-
-  // int node;
-  // int nNodes;
-  // int localRank;
-  // int localRanks;
-  // int maxLocalRanks;
-  // int* rankToNode;
-  // int* rankToLocalRank;
-  // int* localRankToRank; 
-//   // localRanks and localRanktoRank for all nodes
-//   struct mscclppNodeRanks* nodeRanks;
-
-//   bool checkPointers;
-//   bool dmaBufSupport;
-
-//   // Counter for tracking CUDA launches (P2P and collectives included)
-//   uint64_t opCount;
-//   // Collective operation counter
-//   uint64_t collOpCount;
-
-//   // Channels for collectives
-//   int nChannels;
-//   // Channels (per peer) for p2p
-//   int p2pnChannels;
-//   int p2pnChannelsPerPeer;
-//   int p2pChannels[MAXCHANNELS];
-
-//   // Should this comm allocate LL buffers for network P2P connections?
-//   bool allocP2pNetLLBuffers;
-
-//   // Buffer sizes
-//   int buffSizes[MSCCLPP_NUM_PROTOCOLS];
-//   int p2pChunkSize;
-
-//   // Algorithm/Protocols thresholds
-//   ssize_t threadThresholds[MSCCLPP_NUM_ALGORITHMS][MSCCLPP_NUM_PROTOCOLS];
-//   float latencies[MSCCLPP_NUM_FUNCTIONS][MSCCLPP_NUM_ALGORITHMS][MSCCLPP_NUM_PROTOCOLS];
-//   float bandwidths[MSCCLPP_NUM_FUNCTIONS][MSCCLPP_NUM_ALGORITHMS][MSCCLPP_NUM_PROTOCOLS];
-//   int maxThreads[MSCCLPP_NUM_ALGORITHMS][MSCCLPP_NUM_PROTOCOLS];
-
-//   /* This attribute can indicate the states of communicators and return code of
-//    * asynchronous MSCCLPP operations. */
-//   mscclppResult_t asyncResult;

  // Flag to ask MSCCLPP kernels to abort
  volatile uint32_t *abortFlag;

-//   // Device side of the communicator (for cudaFree's)
-//   struct mscclppDevComm* devComm; // actually = &mscclppDevCommAndChannels::comm
-
-//   // Operation pool.
-//   int workFifoDepth; // size of workFifoHeap[], power of 2
-//   struct mscclppWork* workFifoHeap;
-//   struct mscclppWork* devWorkFifoHeap;
-//   void* workFifoHeapGdrHandle;
-
-//   // Work completion notificaion
-//   uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory
-//   uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot.
-//   uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels.
-
-//   // Intra-process sync
-//   struct mscclppComm* intraComm0; // leader of intra-process comms (self possible)
-//   struct mscclppComm* intraNext; // next of intra-process comms, intraComm0 is head
-//   int intraRank;
-//   int intraRanks;
-//   uint32_t intraBarrierPhase;
-//   char intraPad1[64 - sizeof(uint64_t)];
-//   uint64_t intraBarrierCounter; // only used if this is intraComm0
-//   char intraPad2[64 - sizeof(uint64_t)];
-//   uint64_t intraBarrierGate; // only used if this is intraComm0
-
  struct mscclppIbContext *ibContext[MSCCLPP_IB_MAX_DEVS];
-
  // Last one is for P2P proxies.
  struct mscclppProxyState proxyState[MSCCLPP_IB_MAX_DEVS + 1];
-
-//   // Whether this communicator uses collNet
-//   int collNetSupport;
-//   int intraHighestTransportType;
-
-//   size_t channelSize; // User requested work size (bytes) for channel partitions
-
-//   // Internal streams
-//   struct mscclppStrongStream deviceStream, hostStream;
-
-//   // pools backed by comm->memPermanent
-//   struct mscclppMemoryPool memPool_mscclppProxyOp;
-//   struct mscclppMemoryPool memPool_mscclppKernelPlan;
-//   struct mscclppMemoryPool memPool_mscclppPointerList;
-//   // Next comm in this thread's active mscclppGroup[Start|End](). Holds "0x1" when
-//   // this comm is not yet in a group.
-//   struct mscclppComm* groupNext;
-//   // Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
-//   struct mscclppComm* preconnectNext;
-//   int persistentRefs; // number of persistent plan-lists capturing this comm
-//   struct mscclppTasks tasks;
-
-//   // user-created reduction ops
-//   int userRedOpCapacity, userRedOpFreeHead;
-//   mscclppUserRedOp *userRedOps;
-
-//   // Queue of things for the main thread to do
-//   struct mscclppIntruQueueMpsc<struct mscclppCommCallback, &mscclppCommCallback::next> callbackQueue;
-
-//   // List of kernel plans built form tasks.
-//   struct mscclppIntruQueue<struct mscclppKernelPlan, &mscclppKernelPlan::next> planQueue;
-//   // First of the unlaunched kernels in `planQueue`
-//   struct mscclppKernelPlan* unlaunchedPlansHead;
-
-//   // communicator mode
-//   int blocking;
-//   // initState is to more conveniently reclaim resources when errors happen.
-//   mscclppResult_t initState;
-//   // flag to indicate if mscclppCommFinalize() is called
-//   bool finalizeCalled;
-//   // shared structures for finalization
-//   int finalizeRankCnt;
 };

-// enum mscclppLaunchMode {
-//   mscclppLaunchModeInvalid=0,
-//   mscclppLaunchModeParallel,
-//   mscclppLaunchModeGroup
-// };
-// extern enum mscclppLaunchMode mscclppParamLaunchMode;
-
-// void mscclppCommPushFree(struct mscclppComm* comm, void* buf);
-// void mscclppCommPushCudaFree(struct mscclppComm* comm, void* buf);
-// void mscclppCommPushCudaHostFree(struct mscclppComm* comm, void* buf);
-// void mscclppCommPushCudaGdrFree(struct mscclppComm* comm, void* handle);
-
-// inline mscclppResult_t mscclppCommPollCallbacks(struct mscclppComm* comm, bool waitSome) {
-//   mscclppResult_t result = mscclppSuccess;
-//   struct mscclppCommCallback* cb = mscclppIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome);
-//   while (cb != nullptr) {
-//     struct mscclppCommCallback* next = cb->next;
-//     mscclppResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb
-//     if (res1 != mscclppSuccess) result = res1;
-//     cb = next;
-//   }
-//   MSCCLPPCHECK(result);
-//   return mscclppSuccess;
-// }
-
-// inline void mscclppCommIntraBarrierIn(struct mscclppComm* comm, uint32_t x) {
-//   int phase = comm->intraBarrierPhase;
-//   if (comm->intraRanks == 1) {
-//     // Release everyone (just me).
-//     comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1);
-//   } else {
-//     struct mscclppComm* comm0 = comm->intraComm0;
-//     uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE);
-//     if (uint32_t(count) == uint32_t(comm->intraRanks)) {
-//       // Reset.
-//       __atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED);
-//       // Release everyone.
-//       __atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE);
-//     }
-//   }
-// }
-
-// // returns sum of x values contributed to mscclppCommIntraBarrierIn(comm, x)
-// inline uint32_t mscclppCommIntraBarrierOut(struct mscclppComm* comm) {
-//   struct mscclppComm* comm0 = comm->intraComm0;
-//   comm->intraBarrierPhase ^= 1;
-//   uint32_t phase = comm->intraBarrierPhase;
-//   uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
-//   if ((gate & 1) != phase) {
-//     uint64_t t0 = clockNano();
-//     do {
-//       // Spin vigorously for first 5us.
-//       if (clockNano()-t0 >= 5*1000) sched_yield();
-//       gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
-//     } while ((gate & 1) != phase);
-//   }
-//   if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE);
-//   return gate>>32;
-// }
-
-// // Scrambles the bits of non-builtin values of mscclppRedOp_t according to the
-// // communicator memory address. Used to catch bugs so that integer handles
-// // associated with this communicator won't collide with handles of other
-// // communicatrs. This function is its own inverse.
-// static inline mscclppRedOp_t mscclppUserRedOpMangle(mscclppComm *comm, mscclppRedOp_t op) {
-//   // Preserve the built-in values.
-//   if(int(op) < int(mscclppNumOps))
-//     return op;
-//   uint64_t h = reinterpret_cast<uint64_t>(comm);
-//   h ^= h >> 32;
-//   h *= 0x9e3779b97f4a7c13u; // Knuth's 64-bit magical hash constant
-//   h >>= 32; // h is now an excellent 32-bit hash of the comm pointer
-//   h &= int(mscclppMaxRedOp); // mscclppMaxRedOp is a power of 2 minus 1
-//   int op1 = int(h) ^ int(op);
-//   // Since builtin values are preserved, we also have to preserve their preimage.
-//   return op1 < int(mscclppNumOps) ? op : mscclppRedOp_t(op1);
-// }
-
-// mscclppResult_t mscclppCommEnsureReady(mscclppComm_t comm);
-// mscclppResult_t mscclppCommSetAsyncError(mscclppComm_t comm, mscclppResult_t nextState);
-
 #endif
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -122,12 +122,12 @@ void* mscclppProxyServiceIb(void* _args) {
    SEND_STATE_INPROGRESS
  };
  int *sendState;
-  uint64_t *currentProxyFlagVlaue;
+  uint64_t *currentProxyFlagValue;
  if (mscclppCalloc((void **)&sendState, comm->nConns) != mscclppSuccess) {
    WARN("mscclppCalloc failed: errno %d", errno);
    return NULL;
  }
-  if (mscclppCalloc((void **)&currentProxyFlagVlaue, comm->nConns) != mscclppSuccess) {
+  if (mscclppCalloc((void **)&currentProxyFlagValue, comm->nConns) != mscclppSuccess) {
    WARN("mscclppCalloc failed: errno %d", errno);
    return NULL;
  }
@@ -143,7 +143,7 @@ void* mscclppProxyServiceIb(void* _args) {
  for (int i = 0; i < (int)comm->nConns; ++i) {
    sendState[i] = SEND_STATE_INIT;
    struct mscclppConn *conn = &comm->conns[i];
-    currentProxyFlagVlaue[i] = *conn->cpuProxyFlag;
+    currentProxyFlagValue[i] = *conn->cpuProxyFlag;
    // Post recv
    if (conn->ibQp->postRecv(0) != 0) {
      WARN("postRecv failed: errno %d", errno);
@@ -195,7 +195,7 @@ void* mscclppProxyServiceIb(void* _args) {
        }
        if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
          // TODO(chhwang): cpu flush
-          *((volatile uint64_t *)conn->cpuProxyFlag) = ++currentProxyFlagVlaue[trigger.fields.connId];
+          *((volatile uint64_t *)conn->cpuProxyFlag) = ++currentProxyFlagValue[trigger.fields.connId];
          // recv completion
          if (conn->ibQp->postRecv(wc->wr_id) != 0) {
            WARN("postRecv failed: errno %d", errno);