link format correction

2026-04-19 22:39:11 +00:00 · 2023-03-27 20:40:15 +00:00
parent 0edb89dba2
commit 19bf369dc1
35 changed files with 1779 additions and 1432 deletions
--- a/python/src/_py_mscclpp.cpp
+++ b/python/src/_py_mscclpp.cpp
@@ -12,8 +12,8 @@ namespace nb = nanobind;
 using namespace nb::literals;

 // This is a poorman's substitute for std::format, which is a C++20 feature.
-template <typename... Args>
-std::string string_format(const std::string &format, Args... args) {
+template <typename... Args> std::string string_format(const std::string& format, Args... args)
+{
 // Shutup format warning.
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wformat-security"
@@ -40,46 +40,50 @@ std::string string_format(const std::string &format, Args... args) {
 }

 // Maybe return the value, maybe throw an exception.
-template <typename... Args>
-void checkResult(
-    mscclppResult_t status, const std::string &format, Args... args) {
+template <typename... Args> void checkResult(mscclppResult_t status, const std::string& format, Args... args)
+{
  switch (status) {
-    case mscclppSuccess:
-      return;
+  case mscclppSuccess:
+    return;

-    case mscclppUnhandledCudaError:
-    case mscclppSystemError:
-    case mscclppInternalError:
-    case mscclppRemoteError:
-    case mscclppInProgress:
-    case mscclppNumResults:
-      throw std::runtime_error(string_format(format, args...));
+  case mscclppUnhandledCudaError:
+  case mscclppSystemError:
+  case mscclppInternalError:
+  case mscclppRemoteError:
+  case mscclppInProgress:
+  case mscclppNumResults:
+    throw std::runtime_error(string_format(format, args...));

-    case mscclppInvalidArgument:
-    case mscclppInvalidUsage:
-    default:
-      throw std::invalid_argument(string_format(format, args...));
+  case mscclppInvalidArgument:
+  case mscclppInvalidUsage:
+  default:
+    throw std::invalid_argument(string_format(format, args...));
  }
 }

 // Maybe return the value, maybe throw an exception.
 template <typename Val, typename... Args>
-Val maybe(
-    mscclppResult_t status, Val val, const std::string &format, Args... args) {
+Val maybe(mscclppResult_t status, Val val, const std::string& format, Args... args)
+{
  checkResult(status, format, args...);
  return val;
 }

 // Wrapper around connection state.
-struct MscclppComm {
+struct MscclppComm
+{
  mscclppComm_t _handle;
  bool _is_open = false;

- public:
-  ~MscclppComm() { close(); }
+public:
+  ~MscclppComm()
+  {
+    close();
+  }

  // Close should be safe to call on a closed handle.
-  void close() {
+  void close()
+  {
    if (_is_open) {
      checkResult(mscclppCommDestroy(_handle), "Failed to close comm channel");
      _handle = 0;
@@ -87,176 +91,116 @@ struct MscclppComm {
    }
  }

-  void check_open() {
+  void check_open()
+  {
    if (!_is_open) {
      throw std::invalid_argument("MscclppComm is not open");
    }
  }
 };

-static const std::string DOC_MscclppUniqueId =
-    "MSCCLPP Unique Id; used by the MPI Interface";
+static const std::string DOC_MscclppUniqueId = "MSCCLPP Unique Id; used by the MPI Interface";

 static const std::string DOC_MscclppComm = "MSCCLPP Communications Handle";

-
-NB_MODULE(_py_mscclpp, m) {
+NB_MODULE(_py_mscclpp, m)
+{
  m.doc() = "Python bindings for MSCCLPP: which is not NCCL";

  m.attr("MSCCLPP_UNIQUE_ID_BYTES") = MSCCLPP_UNIQUE_ID_BYTES;

  nb::class_<mscclppUniqueId>(m, "MscclppUniqueId")
-      .def_ro_static("__doc__", &DOC_MscclppUniqueId)
-      .def_static(
-          "from_context",
-          []() {
-            mscclppUniqueId uniqueId;
-            return maybe(
-                mscclppGetUniqueId(&uniqueId),
-                uniqueId,
-                "Failed to get MSCCLP Unique Id.");
-          },
-          nb::call_guard<nb::gil_scoped_release>())
-      .def_static(
-          "from_bytes",
-          [](nb::bytes source) {
-            if (source.size() != MSCCLPP_UNIQUE_ID_BYTES) {
-              throw std::invalid_argument(string_format(
-                  "Requires exactly %d bytes; found %d",
-                  MSCCLPP_UNIQUE_ID_BYTES,
-                  source.size()));
-            }
+    .def_ro_static("__doc__", &DOC_MscclppUniqueId)
+    .def_static(
+      "from_context",
+      []() {
+        mscclppUniqueId uniqueId;
+        return maybe(mscclppGetUniqueId(&uniqueId), uniqueId, "Failed to get MSCCLP Unique Id.");
+      },
+      nb::call_guard<nb::gil_scoped_release>())
+    .def_static("from_bytes",
+                [](nb::bytes source) {
+                  if (source.size() != MSCCLPP_UNIQUE_ID_BYTES) {
+                    throw std::invalid_argument(
+                      string_format("Requires exactly %d bytes; found %d", MSCCLPP_UNIQUE_ID_BYTES, source.size()));
+                  }

-            mscclppUniqueId uniqueId;
-            std::memcpy(
-                uniqueId.internal, source.c_str(), sizeof(uniqueId.internal));
-            return uniqueId;
-          })
-      .def("bytes", [](mscclppUniqueId id) {
-        return nb::bytes(id.internal, sizeof(id.internal));
-      });
+                  mscclppUniqueId uniqueId;
+                  std::memcpy(uniqueId.internal, source.c_str(), sizeof(uniqueId.internal));
+                  return uniqueId;
+                })
+    .def("bytes", [](mscclppUniqueId id) { return nb::bytes(id.internal, sizeof(id.internal)); });

  nb::class_<MscclppComm>(m, "MscclppComm")
-      .def_ro_static("__doc__", &DOC_MscclppComm)
-      .def_static(
-          "init_rank_from_address",
-          [](const std::string &address, int rank, int world_size) {
-            MscclppComm comm = {0};
-            comm._is_open = true;
-            return maybe(
-                mscclppCommInitRank(
-                    &comm._handle, world_size, address.c_str(), rank),
-                comm,
-                "Failed to initialize comms: %s rank=%d world_size=%d",
-                address,
-                rank,
-                world_size);
-          },
-          nb::call_guard<nb::gil_scoped_release>(),
-          "address"_a,
-          "rank"_a,
-          "world_size"_a,
-          "Initialize comms given an IP address, rank, and world_size")
-      .def_static(
-          "init_rank_from_id",
-          [](const mscclppUniqueId &id, int rank, int world_size) {
-            MscclppComm comm = {0};
-            comm._is_open = true;
-            return maybe(
-                mscclppCommInitRankFromId(&comm._handle, world_size, id, rank),
-                comm,
-                "Failed to initialize comms: %02X%s rank=%d world_size=%d",
-                id.internal,
-                rank,
-                world_size);
-          },
-          nb::call_guard<nb::gil_scoped_release>(),
-          "id"_a,
-          "rank"_a,
-          "world_size"_a,
-          "Initialize comms given u UniqueID, rank, and world_size")
-      .def(
-          "opened",
-          [](MscclppComm &comm) { return comm._is_open; },
-          "Is this comm object opened?")
-      .def(
-          "closed",
-          [](MscclppComm &comm) { return !comm._is_open; },
-          "Is this comm object closed?")
-      .def(
-          "rank",
-          [](MscclppComm &comm) {
-            comm.check_open();
-            int rank;
-            return maybe(
-                mscclppCommRank(comm._handle, &rank),
-                rank,
-                "Failed to retrieve MSCCLPP rank");
-          },
-          nb::call_guard<nb::gil_scoped_release>(),
-          "The rank of this node.")
-      .def(
-          "size",
-          [](MscclppComm &comm) {
-            comm.check_open();
-            int size;
-            return maybe(
-                mscclppCommSize(comm._handle, &size),
-                size,
-                "Failed to retrieve MSCCLPP world size");
-          },
-          nb::call_guard<nb::gil_scoped_release>(),
-          "The world size of this node.")
-      .def(
-          "connection_setup",
-          [](MscclppComm &comm) {
-            comm.check_open();
-            return maybe(
-                mscclppConnectionSetup(comm._handle),
-                true,
-                "Failed to settup MSCCLPP connection");
-          },
-          nb::call_guard<nb::gil_scoped_release>(),
-          "Run connection setup for MSCCLPP.")
-      .def(
-          "launch_proxy",
-          [](MscclppComm &comm) {
-            comm.check_open();
-            return maybe(
-                mscclppProxyLaunch(comm._handle),
-                true,
-                "Failed to launch MSCCLPP proxy");
-          },
-          nb::call_guard<nb::gil_scoped_release>(),
-          "Start the MSCCLPP proxy.")
-      .def(
-          "stop_proxy",
-          [](MscclppComm &comm) {
-            comm.check_open();
-            return maybe(
-                mscclppProxyStop(comm._handle),
-                true,
-                "Failed to stop MSCCLPP proxy");
-          },
-          nb::call_guard<nb::gil_scoped_release>(),
-          "Start the MSCCLPP proxy.")
-      .def(
-          "close",
-          &MscclppComm::close,
-          nb::call_guard<nb::gil_scoped_release>())
-      .def(
-          "__del__",
-          &MscclppComm::close,
-          nb::call_guard<nb::gil_scoped_release>())
-      .def(
-          "bootstrap_all_gather",
-          [](MscclppComm &comm, void *data, int size) {
-            comm.check_open();
-            return maybe(
-                mscclppBootstrapAllGather(comm._handle, data, size),
-                true,
-                "Failed to stop MSCCLPP proxy");
-	  },
-          nb::call_guard<nb::gil_scoped_release>());
-
+    .def_ro_static("__doc__", &DOC_MscclppComm)
+    .def_static("init_rank_from_address",
+      [](const std::string& address, int rank, int world_size) {
+        MscclppComm comm = {0};
+        comm._is_open = true;
+        // %s is a printf-style conversion, so hand it a C string rather than a std::string object.
+        return maybe(mscclppCommInitRank(&comm._handle, world_size, address.c_str(), rank), comm,
+                     "Failed to initialize comms: %s rank=%d world_size=%d", address.c_str(), rank, world_size);
+      },
+      nb::call_guard<nb::gil_scoped_release>(), "address"_a, "rank"_a, "world_size"_a,
+      "Initialize comms given an IP address, rank, and world_size")
+    .def_static("init_rank_from_id",
+      [](const mscclppUniqueId& id, int rank, int world_size) {
+        MscclppComm comm = {0};
+        comm._is_open = true;
+        // The id is raw bytes, not a C string, so it is left out of the printf-style message.
+        return maybe(mscclppCommInitRankFromId(&comm._handle, world_size, id, rank), comm,
+                     "Failed to initialize comms from unique id: rank=%d world_size=%d", rank, world_size);
+      },
+      nb::call_guard<nb::gil_scoped_release>(), "id"_a, "rank"_a, "world_size"_a,
+      "Initialize comms given a UniqueID, rank, and world_size")
+    .def(
+      "opened", [](MscclppComm& comm) { return comm._is_open; }, "Is this comm object opened?")
+    .def(
+      "closed", [](MscclppComm& comm) { return !comm._is_open; }, "Is this comm object closed?")
+    .def(
+      "rank",
+      [](MscclppComm& comm) {
+        comm.check_open();
+        int rank;
+        return maybe(mscclppCommRank(comm._handle, &rank), rank, "Failed to retrieve MSCCLPP rank");
+      },
+      nb::call_guard<nb::gil_scoped_release>(), "The rank of this node.")
+    .def(
+      "size",
+      [](MscclppComm& comm) {
+        comm.check_open();
+        int size;
+        return maybe(mscclppCommSize(comm._handle, &size), size, "Failed to retrieve MSCCLPP world size");
+      },
+      nb::call_guard<nb::gil_scoped_release>(), "The world size of this node.")
+    .def(
+      "connection_setup",
+      [](MscclppComm& comm) {
+        comm.check_open();
+        return maybe(mscclppConnectionSetup(comm._handle), true, "Failed to settup MSCCLPP connection");
+      },
+      nb::call_guard<nb::gil_scoped_release>(), "Run connection setup for MSCCLPP.")
+    .def(
+      "launch_proxy",
+      [](MscclppComm& comm) {
+        comm.check_open();
+        return maybe(mscclppProxyLaunch(comm._handle), true, "Failed to launch MSCCLPP proxy");
+      },
+      nb::call_guard<nb::gil_scoped_release>(), "Start the MSCCLPP proxy.")
+    .def(
+      "stop_proxy",
+      [](MscclppComm& comm) {
+        comm.check_open();
+        return maybe(mscclppProxyStop(comm._handle), true, "Failed to stop MSCCLPP proxy");
+      },
+      nb::call_guard<nb::gil_scoped_release>(), "Stop the MSCCLPP proxy.")
+    .def("close", &MscclppComm::close, nb::call_guard<nb::gil_scoped_release>())
+    .def("__del__", &MscclppComm::close, nb::call_guard<nb::gil_scoped_release>())
+    .def(
+      "bootstrap_all_gather",
+      [](MscclppComm& comm, void* data, int size) {
+        comm.check_open();
+        return maybe(mscclppBootstrapAllGather(comm._handle, data, size), true, "Failed to run bootstrap all-gather");
+      },
+      nb::call_guard<nb::gil_scoped_release>(), "Run the MSCCLPP bootstrap all-gather on this comm.");
 }
--- a/src/bootstrap/bootstrap.cc
+++ b/src/bootstrap/bootstrap.cc
@@ -4,25 +4,27 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "mscclpp.h"
-#include "core.h"
-#include "utils.h"
 #include "bootstrap.h"
-#include <unistd.h>
+#include "core.h"
+#include "mscclpp.h"
+#include "utils.h"
 #include <sys/types.h>
+#include <unistd.h>

-struct bootstrapRootArgs {
+struct bootstrapRootArgs
+{
  struct mscclppSocket* listenSock;
  uint64_t magic;
 };

 /* Init functions */
-static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
+static char bootstrapNetIfName[MAX_IF_NAME_SIZE + 1];
 static union mscclppSocketAddress bootstrapNetIfAddr;
 static int bootstrapNetInitDone = 0;
 pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;

-mscclppResult_t bootstrapNetInit(const char* ip_port_pair) {
+mscclppResult_t bootstrapNetInit(const char* ip_port_pair)
+{
  if (bootstrapNetInitDone == 0) {
    pthread_mutex_lock(&bootstrapNetLock);
    if (bootstrapNetInitDone == 0) {
@@ -38,7 +40,8 @@ mscclppResult_t bootstrapNetInit(const char* ip_port_pair) {
          WARN("Invalid MSCCLPP_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
          return mscclppInvalidArgument;
        }
-        if (mscclppFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
+        if (mscclppFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE,
+                                            1) <= 0) {
          WARN("NET/Socket : No usable listening interface found");
          return mscclppSystemError;
        }
@@ -49,9 +52,9 @@ mscclppResult_t bootstrapNetInit(const char* ip_port_pair) {
          return mscclppInternalError;
        }
      }
-      char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2];
+      char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2];
      sprintf(line, " %s:", bootstrapNetIfName);
-      mscclppSocketToString(&bootstrapNetIfAddr, line+strlen(line));
+      mscclppSocketToString(&bootstrapNetIfAddr, line + strlen(line));
      INFO(MSCCLPP_INIT, "Bootstrap : Using%s", line);
      bootstrapNetInitDone = 1;
    }
@@ -61,15 +64,21 @@ mscclppResult_t bootstrapNetInit(const char* ip_port_pair) {
 }

 /* Socket Interface Selection type */
-enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
+enum bootstrapInterface_t
+{
+  findSubnetIf = -1,
+  dontCareIf = -2
+};

 // Additional sync functions
-static mscclppResult_t bootstrapNetSend(struct mscclppSocket* sock, void* data, int size) {
+static mscclppResult_t bootstrapNetSend(struct mscclppSocket* sock, void* data, int size)
+{
  MSCCLPPCHECK(mscclppSocketSend(sock, &size, sizeof(int)));
  MSCCLPPCHECK(mscclppSocketSend(sock, data, size));
  return mscclppSuccess;
 }
-static mscclppResult_t bootstrapNetRecv(struct mscclppSocket* sock, void* data, int size) {
+static mscclppResult_t bootstrapNetRecv(struct mscclppSocket* sock, void* data, int size)
+{
  int recvSize;
  MSCCLPPCHECK(mscclppSocketRecv(sock, &recvSize, sizeof(int)));
  if (recvSize > size) {
@@ -80,7 +89,8 @@ static mscclppResult_t bootstrapNetRecv(struct mscclppSocket* sock, void* data,
  return mscclppSuccess;
 }

-struct extInfo {
+struct extInfo
+{
  int rank;
  int nranks;
  union mscclppSocketAddress extAddressListenRoot;
@@ -89,7 +99,8 @@ struct extInfo {

 #include <sys/resource.h>

-static mscclppResult_t setFilesLimit() {
+static mscclppResult_t setFilesLimit()
+{
  struct rlimit filesLimit;
  SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit");
  filesLimit.rlim_cur = filesLimit.rlim_max;
@@ -97,16 +108,17 @@ static mscclppResult_t setFilesLimit() {
  return mscclppSuccess;
 }

-static void *bootstrapRoot(void* rargs) {
+static void* bootstrapRoot(void* rargs)
+{
  struct bootstrapRootArgs* args = (struct bootstrapRootArgs*)rargs;
  struct mscclppSocket* listenSock = args->listenSock;
  uint64_t magic = args->magic;
  mscclppResult_t res = mscclppSuccess;
  int nranks = 0, c = 0;
  struct extInfo info;
-  union mscclppSocketAddress *rankAddresses = NULL;
-  union mscclppSocketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
-  union mscclppSocketAddress *zero = NULL;
+  union mscclppSocketAddress* rankAddresses = NULL;
+  union mscclppSocketAddress* rankAddressesRoot = NULL; // for initial rank <-> root information exchange
+  union mscclppSocketAddress* zero = NULL;
  MSCCLPPCHECKGOTO(mscclppCalloc(&zero, 1), res, out);
  setFilesLimit();

@@ -136,21 +148,21 @@ static void *bootstrapRoot(void* rargs) {
    }

    // Save the connection handle for that rank
-    memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union mscclppSocketAddress));
-    memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union mscclppSocketAddress));
+    memcpy(rankAddressesRoot + info.rank, &info.extAddressListenRoot, sizeof(union mscclppSocketAddress));
+    memcpy(rankAddresses + info.rank, &info.extAddressListen, sizeof(union mscclppSocketAddress));

    ++c;
-    TRACE(MSCCLPP_INIT, "Received connect from rank %d total %d/%d",  info.rank, c, nranks);
+    TRACE(MSCCLPP_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks);
  } while (c < nranks);
  TRACE(MSCCLPP_INIT, "COLLECTED ALL %d HANDLES", nranks);

  // Send the connect handle for the next rank in the AllGather ring
-  for (int r=0; r<nranks; ++r) {
-    int next = (r+1) % nranks;
+  for (int r = 0; r < nranks; ++r) {
+    int next = (r + 1) % nranks;
    struct mscclppSocket sock;
-    MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, rankAddressesRoot+r, magic, mscclppSocketTypeBootstrap), res, out);
+    MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, rankAddressesRoot + r, magic, mscclppSocketTypeBootstrap), res, out);
    MSCCLPPCHECKGOTO(mscclppSocketConnect(&sock), res, out);
-    MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union mscclppSocketAddress)), res, out);
+    MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, rankAddresses + next, sizeof(union mscclppSocketAddress)), res, out);
    MSCCLPPCHECKGOTO(mscclppSocketClose(&sock), res, out);
  }
  TRACE(MSCCLPP_INIT, "SENT OUT ALL %d HANDLES", nranks);
@@ -160,16 +172,20 @@ out:
    mscclppSocketClose(listenSock);
    free(listenSock);
  }
-  if (rankAddresses) free(rankAddresses);
-  if (rankAddressesRoot) free(rankAddressesRoot);
-  if (zero) free(zero);
+  if (rankAddresses)
+    free(rankAddresses);
+  if (rankAddressesRoot)
+    free(rankAddressesRoot);
+  if (zero)
+    free(zero);
  free(rargs);

  TRACE(MSCCLPP_INIT, "DONE");
  return NULL;
 }

-mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle) {
+mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle)
+{
  struct mscclppSocket* listenSock;
  struct bootstrapRootArgs* args;
  pthread_t thread;
@@ -191,7 +207,8 @@ mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle) {
 // #include <netinet/in.h>
 // #include <arpa/inet.h>

-mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot, const char* ip_port_pair) {
+mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot, const char* ip_port_pair)
+{
  memset(handle, 0, sizeof(mscclppBootstrapHandle));
  const char* env = NULL;
  if (ip_port_pair) {
@@ -220,14 +237,16 @@ mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool
  return mscclppSuccess;
 }

-struct unexConn {
+struct unexConn
+{
  int peer;
  int tag;
  struct mscclppSocket sock;
  struct unexConn* next;
 };

-struct bootstrapState {
+struct bootstrapState
+{
  struct mscclppSocket listenSock;
  struct mscclppSocket ringRecvSocket;
  struct mscclppSocket ringSendSocket;
@@ -238,10 +257,11 @@ struct bootstrapState {
  int rank;
  int nranks;
  uint64_t magic;
-  volatile uint32_t *abortFlag;
+  volatile uint32_t* abortFlag;
 };

-mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscclppComm* comm) {
+mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscclppComm* comm)
+{
  int rank = comm->rank;
  int nranks = comm->nRanks;
  struct bootstrapState* state;
@@ -262,12 +282,14 @@ mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscc
  info.rank = rank;
  info.nranks = nranks;
  // Create socket for other ranks to contact me
-  MSCCLPPCHECK(mscclppSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag));
+  MSCCLPPCHECK(mscclppSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeBootstrap,
+                                 comm->abortFlag));
  MSCCLPPCHECK(mscclppSocketListen(&state->listenSock));
  MSCCLPPCHECK(mscclppSocketGetAddr(&state->listenSock, &info.extAddressListen));

  // Create socket for root to contact me
-  MSCCLPPCHECK(mscclppSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag));
+  MSCCLPPCHECK(
+    mscclppSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag));
  MSCCLPPCHECK(mscclppSocketListen(&listenSockRoot));
  MSCCLPPCHECK(mscclppSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot));

@@ -278,7 +300,7 @@ mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscc
    tv.tv_sec = msec / 1000;
    tv.tv_nsec = 1000000 * (msec % 1000);
    TRACE(MSCCLPP_INIT, "rank %d delaying connection to root by %ld msec", rank, msec);
-    (void) nanosleep(&tv, NULL);
+    (void)nanosleep(&tv, NULL);
  }

  // send info on my listening socket to root
@@ -294,7 +316,8 @@ mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscc
  MSCCLPPCHECK(mscclppSocketClose(&sock));
  MSCCLPPCHECK(mscclppSocketClose(&listenSockRoot));

-  MSCCLPPCHECK(mscclppSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag));
+  MSCCLPPCHECK(
+    mscclppSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, mscclppSocketTypeBootstrap, comm->abortFlag));
  MSCCLPPCHECK(mscclppSocketConnect(&state->ringSendSocket));
  // Accept the connect request from the previous rank in the AllGather ring
  MSCCLPPCHECK(mscclppSocketInit(&state->ringRecvSocket));
@@ -302,7 +325,7 @@ mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscc

  // AllGather all listen handlers
  MSCCLPPCHECK(mscclppCalloc(&state->peerCommAddresses, nranks));
-  MSCCLPPCHECK(mscclppSocketGetAddr(&state->listenSock, state->peerCommAddresses+rank));
+  MSCCLPPCHECK(mscclppSocketGetAddr(&state->listenSock, state->peerCommAddresses + rank));
  MSCCLPPCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union mscclppSocketAddress)));

  // Create the service proxy
@@ -310,9 +333,10 @@ mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscc

  // proxy is aborted through a message; don't set abortFlag
  MSCCLPPCHECK(mscclppCalloc(&proxySocket, 1));
-  MSCCLPPCHECK(mscclppSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeProxy, comm->abortFlag));
+  MSCCLPPCHECK(
+    mscclppSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, mscclppSocketTypeProxy, comm->abortFlag));
  MSCCLPPCHECK(mscclppSocketListen(proxySocket));
-  MSCCLPPCHECK(mscclppSocketGetAddr(proxySocket, state->peerProxyAddresses+rank));
+  MSCCLPPCHECK(mscclppSocketGetAddr(proxySocket, state->peerProxyAddresses + rank));
  MSCCLPPCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union mscclppSocketAddress)));
  // MSCCLPPCHECK(mscclppProxyInit(comm, proxySocket, state->peerProxyAddresses));

@@ -321,7 +345,8 @@ mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscc
  return mscclppSuccess;
 }

-mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size) {
+mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size)
+{
  struct bootstrapState* state = (struct bootstrapState*)commState;
  char* data = (char*)allData;
  int rank = state->rank;
@@ -333,26 +358,29 @@ mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size) {
   * At each step i receive data from (rank-i-1) from left
   * and send previous step's data from (rank-i) to right
   */
-  for (int i=0; i<nranks-1; i++) {
+  for (int i = 0; i < nranks - 1; i++) {
    size_t rslice = (rank - i - 1 + nranks) % nranks;
    size_t sslice = (rank - i + nranks) % nranks;

    // Send slice to the right
-    MSCCLPPCHECK(bootstrapNetSend(&state->ringSendSocket, data+sslice*size, size));
+    MSCCLPPCHECK(bootstrapNetSend(&state->ringSendSocket, data + sslice * size, size));
    // Recv slice from the left
-    MSCCLPPCHECK(bootstrapNetRecv(&state->ringRecvSocket, data+rslice*size, size));
+    MSCCLPPCHECK(bootstrapNetRecv(&state->ringRecvSocket, data + rslice * size, size));
  }

  TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
  return mscclppSuccess;
 }

-mscclppResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
+mscclppResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size)
+{
  mscclppResult_t ret = mscclppSuccess;
  struct bootstrapState* state = (struct bootstrapState*)commState;
  struct mscclppSocket sock;

-  MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, state->peerCommAddresses+peer, state->magic, mscclppSocketTypeBootstrap, state->abortFlag), ret, fail);
+  MSCCLPPCHECKGOTO(mscclppSocketInit(&sock, state->peerCommAddresses + peer, state->magic, mscclppSocketTypeBootstrap,
+                                     state->abortFlag),
+                   ret, fail);
  MSCCLPPCHECKGOTO(mscclppSocketConnect(&sock), ret, fail);
  MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail);
  MSCCLPPCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail);
@@ -365,8 +393,10 @@ fail:
  goto exit;
 }

-mscclppResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag) {
-  if (nranks == 1) return mscclppSuccess;
+mscclppResult_t bootstrapBarrier(void* commState, int* ranks, int rank, int nranks, int tag)
+{
+  if (nranks == 1)
+    return mscclppSuccess;
  TRACE(MSCCLPP_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag);

  /* Simple intra process barrier
@@ -375,7 +405,7 @@ mscclppResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nran
   * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988"
   */
  int data[1];
-  for (int mask=1; mask<nranks; mask<<=1) {
+  for (int mask = 1; mask < nranks; mask <<= 1) {
    int src = (rank - mask + nranks) % nranks;
    int dst = (rank + mask) % nranks;
    MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], tag, data, sizeof(data)));
@@ -386,23 +416,26 @@ mscclppResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nran
  return mscclppSuccess;
 }

-mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size) {
-  if (nranks == 1) return mscclppSuccess;
+mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int* ranks, int rank, int nranks, void* allData, int size)
+{
+  if (nranks == 1)
+    return mscclppSuccess;
  char* data = (char*)allData;
  TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - ENTER", rank, nranks, size);

-  for (int i=1; i<nranks; i++) {
+  for (int i = 1; i < nranks; i++) {
    int src = (rank - i + nranks) % nranks;
    int dst = (rank + i) % nranks;
-    MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], /*tag=*/i, data+rank*size, size));
-    MSCCLPPCHECK(bootstrapRecv(commState, ranks[src], /*tag=*/i, data+src*size, size));
+    MSCCLPPCHECK(bootstrapSend(commState, ranks[dst], /*tag=*/i, data + rank * size, size));
+    MSCCLPPCHECK(bootstrapRecv(commState, ranks[src], /*tag=*/i, data + src * size, size));
  }

  TRACE(MSCCLPP_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
  return mscclppSuccess;
 }

-mscclppResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock) {
+mscclppResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock)
+{
  // New unex
  struct unexConn* unex;
  MSCCLPPCHECK(mscclppCalloc(&unex, 1));
@@ -416,12 +449,15 @@ mscclppResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int ta
    state->unexpectedConnections = unex;
    return mscclppSuccess;
  }
-  while (list->next) list = list->next;
+  while (list->next)
+    list = list->next;
  list->next = unex;
  return mscclppSuccess;
 }

-mscclppResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock, int* found) {
+mscclppResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct mscclppSocket* sock,
+                                  int* found)
+{
  struct unexConn* elem = state->unexpectedConnections;
  struct unexConn* prev = NULL;
  *found = 0;
@@ -443,7 +479,8 @@ mscclppResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int ta
  return mscclppSuccess;
 }

-static void unexpectedFree(struct bootstrapState* state) {
+static void unexpectedFree(struct bootstrapState* state)
+{
  struct unexConn* elem = state->unexpectedConnections;
  struct unexConn* prev = NULL;

@@ -456,7 +493,8 @@ static void unexpectedFree(struct bootstrapState* state) {
 }

 // We can't know who we'll receive from, so we need to receive everything at once
-mscclppResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
+mscclppResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size)
+{
  mscclppResult_t ret = mscclppSuccess;
  struct bootstrapState* state = (struct bootstrapState*)commState;
  struct mscclppSocket sock;
@@ -490,7 +528,8 @@ fail:
  goto exit;
 }

-mscclppResult_t bootstrapClose(void* commState) {
+mscclppResult_t bootstrapClose(void* commState)
+{
  struct bootstrapState* state = (struct bootstrapState*)commState;
  if (state->unexpectedConnections != NULL) {
    unexpectedFree(state);
@@ -510,9 +549,11 @@ mscclppResult_t bootstrapClose(void* commState) {
  return mscclppSuccess;
 }

-mscclppResult_t bootstrapAbort(void* commState) {
+mscclppResult_t bootstrapAbort(void* commState)
+{
  struct bootstrapState* state = (struct bootstrapState*)commState;
-  if (commState == NULL) return mscclppSuccess;
+  if (commState == NULL)
+    return mscclppSuccess;
  MSCCLPPCHECK(mscclppSocketClose(&state->listenSock));
  MSCCLPPCHECK(mscclppSocketClose(&state->ringSendSocket));
  MSCCLPPCHECK(mscclppSocketClose(&state->ringRecvSocket));
--- a/src/bootstrap/socket.cc
+++ b/src/bootstrap/socket.cc
@@ -8,25 +8,30 @@
 #include "utils.h"
 #include <stdlib.h>

-#include <unistd.h>
 #include <ifaddrs.h>
 #include <net/if.h>
+#include <unistd.h>

-static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
+static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset,
+                                         int block, int* closed)
+{
  int bytes = 0;
  *closed = 0;
  char* data = (char*)ptr;
-  char line[SOCKET_NAME_MAXLEN+1];
+  char line[SOCKET_NAME_MAXLEN + 1];
  do {
-    if (op == MSCCLPP_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
-    if (op == MSCCLPP_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
+    if (op == MSCCLPP_SOCKET_RECV)
+      bytes = recv(sock->fd, data + (*offset), size - (*offset), block ? 0 : MSG_DONTWAIT);
+    if (op == MSCCLPP_SOCKET_SEND)
+      bytes = send(sock->fd, data + (*offset), size - (*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
    if (op == MSCCLPP_SOCKET_RECV && bytes == 0) {
      *closed = 1;
      return mscclppSuccess;
    }
    if (bytes == -1) {
      if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
-        WARN("socketProgressOpt: Call to recv from %s failed : %s", mscclppSocketToString(&sock->addr, line), strerror(errno));
+        WARN("socketProgressOpt: Call to recv from %s failed : %s", mscclppSocketToString(&sock->addr, line),
+             strerror(errno));
        return mscclppRemoteError;
      } else {
        bytes = 0;
@@ -41,18 +46,20 @@ static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, voi
  return mscclppSuccess;
 }

-static mscclppResult_t socketProgress(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset) {
+static mscclppResult_t socketProgress(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset)
+{
  int closed;
  MSCCLPPCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
  if (closed) {
-    char line[SOCKET_NAME_MAXLEN+1];
+    char line[SOCKET_NAME_MAXLEN + 1];
    WARN("socketProgress: Connection closed by remote peer %s", mscclppSocketToString(&sock->addr, line, 0));
    return mscclppRemoteError;
  }
  return mscclppSuccess;
 }

-static mscclppResult_t socketWait(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset) {
+static mscclppResult_t socketWait(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset)
+{
  while (*offset < size)
    MSCCLPPCHECK(socketProgress(op, sock, ptr, size, offset));
  return mscclppSuccess;
@@ -62,27 +69,34 @@ static mscclppResult_t socketWait(int op, struct mscclppSocket* sock, void* ptr,
 *
 * Output: "IPv4/IPv6 address<port>"
 */
-const char *mscclppSocketToString(union mscclppSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
-  if (buf == NULL || addr == NULL) return NULL;
-  struct sockaddr *saddr = &addr->sa;
-  if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
+const char* mscclppSocketToString(union mscclppSocketAddress* addr, char* buf, const int numericHostForm /*= 1*/)
+{
+  if (buf == NULL || addr == NULL)
+    return NULL;
+  struct sockaddr* saddr = &addr->sa;
+  if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) {
+    buf[0] = '\0';
+    return buf;
+  }
  char host[NI_MAXHOST], service[NI_MAXSERV];
  /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned.
   * (When not set, this will still happen in case the node's name cannot be determined.)
   */
  int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0);
-  (void) getnameinfo(saddr, sizeof(union mscclppSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag);
+  (void)getnameinfo(saddr, sizeof(union mscclppSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag);
  sprintf(buf, "%s<%s>", host, service);
  return buf;
 }

-static uint16_t socketToPort(union mscclppSocketAddress *addr) {
-  struct sockaddr *saddr = &addr->sa;
+static uint16_t socketToPort(union mscclppSocketAddress* addr)
+{
+  struct sockaddr* saddr = &addr->sa;
  return ntohs(saddr->sa_family == AF_INET ? addr->sin.sin_port : addr->sin6.sin6_port);
 }

 /* Allow the user to force the IPv4/IPv6 interface selection */
-static int envSocketFamily(void) {
+static int envSocketFamily(void)
+{
  int family = -1; // Family selection is not forced, will use first one found
  char* env = getenv("MSCCLPP_SOCKET_FAMILY");
  if (env == NULL)
@@ -91,35 +105,41 @@ static int envSocketFamily(void) {
  INFO(MSCCLPP_ENV, "MSCCLPP_SOCKET_FAMILY set by environment to %s", env);

  if (strcmp(env, "AF_INET") == 0)
-    family = AF_INET;  // IPv4
+    family = AF_INET; // IPv4
  else if (strcmp(env, "AF_INET6") == 0)
    family = AF_INET6; // IPv6
  return family;
 }

-static int findInterfaces(const char* prefixList, char* names, union mscclppSocketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
+static int findInterfaces(const char* prefixList, char* names, union mscclppSocketAddress* addrs, int sock_family,
+                          int maxIfNameSize, int maxIfs)
+{
 #ifdef ENABLE_TRACE
-  char line[SOCKET_NAME_MAXLEN+1];
+  char line[SOCKET_NAME_MAXLEN + 1];
 #endif
  struct netIf userIfs[MAX_IFS];
  bool searchNot = prefixList && prefixList[0] == '^';
-  if (searchNot) prefixList++;
+  if (searchNot)
+    prefixList++;
  bool searchExact = prefixList && prefixList[0] == '=';
-  if (searchExact) prefixList++;
+  if (searchExact)
+    prefixList++;
  int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);

  int found = 0;
  struct ifaddrs *interfaces, *interface;
  getifaddrs(&interfaces);
  for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) {
-    if (interface->ifa_addr == NULL) continue;
+    if (interface->ifa_addr == NULL)
+      continue;

    /* We only support IPv4 & IPv6 */
    int family = interface->ifa_addr->sa_family;
    if (family != AF_INET && family != AF_INET6)
      continue;

-    TRACE(MSCCLPP_INIT|MSCCLPP_NET,"Found interface %s:%s", interface->ifa_name, mscclppSocketToString((union mscclppSocketAddress *) interface->ifa_addr, line));
+    TRACE(MSCCLPP_INIT | MSCCLPP_NET, "Found interface %s:%s", interface->ifa_name,
+          mscclppSocketToString((union mscclppSocketAddress*)interface->ifa_addr, line));

    /* Allow the caller to force the socket family type */
    if (sock_family != -1 && family != sock_family)
@@ -128,7 +148,8 @@ static int findInterfaces(const char* prefixList, char* names, union mscclppSock
    /* We also need to skip IPv6 loopback interfaces */
    if (family == AF_INET6) {
      struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
-      if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
+      if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr))
+        continue;
    }

    // check against user specified interfaces
@@ -140,15 +161,18 @@ static int findInterfaces(const char* prefixList, char* names, union mscclppSock
    // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link
    bool duplicate = false;
    for (int i = 0; i < found; i++) {
-      if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
+      if (strcmp(interface->ifa_name, names + i * maxIfNameSize) == 0) {
+        duplicate = true;
+        break;
+      }
    }

    if (!duplicate) {
      // Store the interface name
-      strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
+      strncpy(names + found * maxIfNameSize, interface->ifa_name, maxIfNameSize);
      // Store the IP address
      int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
-      memcpy(addrs+found, interface->ifa_addr, salen);
+      memcpy(addrs + found, interface->ifa_addr, salen);
      found++;
    }
  }
@@ -157,7 +181,8 @@ static int findInterfaces(const char* prefixList, char* names, union mscclppSock
  return found;
 }

-static bool matchSubnet(struct ifaddrs local_if, union mscclppSocketAddress* remote) {
+static bool matchSubnet(struct ifaddrs local_if, union mscclppSocketAddress* remote)
+{
  /* Check family first */
  int family = local_if.ifa_addr->sa_family;
  if (family != remote->sa.sa_family) {
@@ -180,8 +205,8 @@ static bool matchSubnet(struct ifaddrs local_if, union mscclppSocketAddress* rem
    struct in6_addr& mask_in6 = mask->sin6_addr;
    struct in6_addr& remote_in6 = remote_addr.sin6_addr;
    bool same = true;
-    int len = 16;  //IPv6 address is 16 unsigned char
-    for (int c = 0; c < len; c++) {  //Network byte order is big-endian
+    int len = 16;                   // IPv6 address is 16 unsigned char
+    for (int c = 0; c < len; c++) { // Network byte order is big-endian
      char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
      char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
      if (c1 ^ c2) {
@@ -200,16 +225,19 @@ static bool matchSubnet(struct ifaddrs local_if, union mscclppSocketAddress* rem
  }
 }

-int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* localAddrs, union mscclppSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
+int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* localAddrs,
+                                    union mscclppSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs)
+{
 #ifdef ENABLE_TRACE
-  char line[SOCKET_NAME_MAXLEN+1];
+  char line[SOCKET_NAME_MAXLEN + 1];
 #endif
-  char line_a[SOCKET_NAME_MAXLEN+1];
+  char line_a[SOCKET_NAME_MAXLEN + 1];
  int found = 0;
  struct ifaddrs *interfaces, *interface;
  getifaddrs(&interfaces);
  for (interface = interfaces; interface && !found; interface = interface->ifa_next) {
-    if (interface->ifa_addr == NULL) continue;
+    if (interface->ifa_addr == NULL)
+      continue;

    /* We only support IPv4 & IPv6 */
    int family = interface->ifa_addr->sa_family;
@@ -223,14 +251,17 @@ int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* l

    // Store the local IP address
    int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
-    memcpy(localAddrs+found, interface->ifa_addr, salen);
+    memcpy(localAddrs + found, interface->ifa_addr, salen);

    // Store the interface name
-    strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
+    strncpy(ifNames + found * ifNameMaxSize, interface->ifa_name, ifNameMaxSize);

-    TRACE(MSCCLPP_INIT|MSCCLPP_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, mscclppSocketToString(localAddrs+found, line), mscclppSocketToString(remoteAddr, line_a));
+    TRACE(MSCCLPP_INIT | MSCCLPP_NET, "NET : Found interface %s:%s in the same subnet as remote address %s",
+          interface->ifa_name, mscclppSocketToString(localAddrs + found, line),
+          mscclppSocketToString(remoteAddr, line_a));
    found++;
-    if (found == maxIfs) break;
+    if (found == maxIfs)
+      break;
  }

  if (found == 0) {
@@ -240,7 +271,8 @@ int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* l
  return found;
 }

-mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, const char* ip_port_pair) {
+mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, const char* ip_port_pair)
+{
  if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
    WARN("Net : string is null");
    return mscclppInvalidArgument;
@@ -262,7 +294,7 @@ mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, c
    hints.ai_family = AF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;

-    if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
+    if ((rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
      WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
      return mscclppInvalidArgument;
    }
@@ -271,16 +303,16 @@ mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, c
    if (p->ai_family == AF_INET) {
      struct sockaddr_in& sin = ua->sin;
      memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
-      sin.sin_family = AF_INET;                        // IPv4
-      //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr));  // IP address
-      sin.sin_port = htons(ni.port);                   // port
+      sin.sin_family = AF_INET; // IPv4
+      // inet_pton(AF_INET, ni.prefix, &(sin.sin_addr));  // IP address
+      sin.sin_port = htons(ni.port); // port
    } else if (p->ai_family == AF_INET6) {
      struct sockaddr_in6& sin6 = ua->sin6;
      memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
-      sin6.sin6_family = AF_INET6;                     // IPv6
-      sin6.sin6_port = htons(ni.port);                 // port
-      sin6.sin6_flowinfo = 0;                          // needed by IPv6, but possibly obsolete
-      sin6.sin6_scope_id = 0;                          // should be global scope, set to 0
+      sin6.sin6_family = AF_INET6;     // IPv6
+      sin6.sin6_port = htons(ni.port); // port
+      sin6.sin6_flowinfo = 0;          // needed by IPv6, but possibly obsolete
+      sin6.sin6_scope_id = 0;          // should be global scope, set to 0
    } else {
      WARN("Net : unsupported IP family");
      return mscclppInvalidArgument;
@@ -291,35 +323,39 @@ mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, c
  } else {
    int i, j = -1, len = strlen(ip_port_pair);
    for (i = 1; i < len; i++) {
-      if (ip_port_pair[i] == '%') j = i;
-      if (ip_port_pair[i] == ']') break;
+      if (ip_port_pair[i] == '%')
+        j = i;
+      if (ip_port_pair[i] == ']')
+        break;
    }
    if (i == len) {
      WARN("Net : No valid [IPv6]:port pair found");
      return mscclppInvalidArgument;
    }
-    bool global_scope = (j == -1 ? true : false);     // If no % found, global scope; otherwise, link scope
+    bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope

    char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
    memset(ip_str, '\0', sizeof(ip_str));
    memset(port_str, '\0', sizeof(port_str));
    memset(if_name, '\0', sizeof(if_name));
-    strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1);
-    strncpy(port_str, ip_port_pair+i+2, len-i-1);
+    strncpy(ip_str, ip_port_pair + 1, global_scope ? i - 1 : j - 1);
+    strncpy(port_str, ip_port_pair + i + 2, len - i - 1);
    int port = atoi(port_str);
-    if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name
+    if (!global_scope)
+      strncpy(if_name, ip_port_pair + j + 1, i - j - 1); // If not global scope, we need the intf name

    struct sockaddr_in6& sin6 = ua->sin6;
-    sin6.sin6_family = AF_INET6;                       // IPv6
-    inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr));    // IP address
-    sin6.sin6_port = htons(port);                      // port
-    sin6.sin6_flowinfo = 0;                            // needed by IPv6, but possibly obsolete
-    sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name);  // 0 if global scope; intf index if link scope
+    sin6.sin6_family = AF_INET6;                                     // IPv6
+    inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr));                  // IP address
+    sin6.sin6_port = htons(port);                                    // port
+    sin6.sin6_flowinfo = 0;                                          // needed by IPv6, but possibly obsolete
+    sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope
  }
  return mscclppSuccess;
 }

-int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
+int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress* ifAddrs, int ifNameMaxSize, int maxIfs)
+{
  static int shownIfName = 0;
  int nIfs = 0;
  // Allow user to force the INET socket family selection
@@ -329,7 +365,8 @@ int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress *ifAddrs, in
  if (env && strlen(env) > 1) {
    INFO(MSCCLPP_ENV, "MSCCLPP_SOCKET_IFNAME set by environment to %s", env);
    // Specified by user : find or fail
-    if (shownIfName++ == 0) INFO(MSCCLPP_NET, "MSCCLPP_SOCKET_IFNAME set to %s", env);
+    if (shownIfName++ == 0)
+      INFO(MSCCLPP_NET, "MSCCLPP_SOCKET_IFNAME set to %s", env);
    nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
  } else {
    // Try to automatically pick the right one
@@ -347,15 +384,19 @@ int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress *ifAddrs, in
      }
    }
    // Then look for anything else (but not docker or lo)
-    if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+    if (nIfs == 0)
+      nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
    // Finally look for docker, then lo.
-    if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
-    if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+    if (nIfs == 0)
+      nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+    if (nIfs == 0)
+      nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
  }
  return nIfs;
 }

-mscclppResult_t mscclppSocketListen(struct mscclppSocket* sock) {
+mscclppResult_t mscclppSocketListen(struct mscclppSocket* sock)
+{
  if (sock == NULL) {
    WARN("mscclppSocketListen: pass NULL socket");
    return mscclppInvalidArgument;
@@ -383,8 +424,8 @@ mscclppResult_t mscclppSocketListen(struct mscclppSocket* sock) {
  SYSCHECK(getsockname(sock->fd, &sock->addr.sa, &size), "getsockname");

 #ifdef ENABLE_TRACE
-  char line[SOCKET_NAME_MAXLEN+1];
-  TRACE(MSCCLPP_INIT|MSCCLPP_NET,"Listening on socket %s", mscclppSocketToString(&sock->addr, line));
+  char line[SOCKET_NAME_MAXLEN + 1];
+  TRACE(MSCCLPP_INIT | MSCCLPP_NET, "Listening on socket %s", mscclppSocketToString(&sock->addr, line));
 #endif

  /* Put the socket in listen mode
@@ -395,17 +436,20 @@ mscclppResult_t mscclppSocketListen(struct mscclppSocket* sock) {
  return mscclppSuccess;
 }

-mscclppResult_t mscclppSocketGetAddr(struct mscclppSocket* sock, union mscclppSocketAddress* addr) {
+mscclppResult_t mscclppSocketGetAddr(struct mscclppSocket* sock, union mscclppSocketAddress* addr)
+{
  if (sock == NULL) {
    WARN("mscclppSocketGetAddr: pass NULL socket");
    return mscclppInvalidArgument;
  }
-  if (sock->state != mscclppSocketStateReady) return mscclppInternalError;
+  if (sock->state != mscclppSocketStateReady)
+    return mscclppInternalError;
  memcpy(addr, &sock->addr, sizeof(union mscclppSocketAddress));
  return mscclppSuccess;
 }

-static mscclppResult_t socketTryAccept(struct mscclppSocket* sock) {
+static mscclppResult_t socketTryAccept(struct mscclppSocket* sock)
+{
  socklen_t socklen = sizeof(union mscclppSocketAddress);
  sock->fd = accept(sock->acceptFd, &sock->addr.sa, &socklen);
  if (sock->fd != -1) {
@@ -416,19 +460,22 @@ static mscclppResult_t socketTryAccept(struct mscclppSocket* sock) {
  } else if (++sock->acceptRetries == RETRY_ACCEPT_TIMES) {
    WARN("socketTryAccept: exceeded retries (%d)", sock->acceptRetries);
    return mscclppRemoteError;
-  } else {  
+  } else {
    usleep(SLEEP_INT);
-    if (sock->acceptRetries % 1000 == 0) INFO(MSCCLPP_ALL, "socketTryAccept: Call to try accept returned %s, retrying", strerror(errno));
+    if (sock->acceptRetries % 1000 == 0)
+      INFO(MSCCLPP_ALL, "socketTryAccept: Call to try accept returned %s, retrying", strerror(errno));
  }
  return mscclppSuccess;
 }

-static mscclppResult_t socketFinalizeAccept(struct mscclppSocket* sock) {
+static mscclppResult_t socketFinalizeAccept(struct mscclppSocket* sock)
+{
  uint64_t magic;
  enum mscclppSocketType type;
  int received = 0;
  MSCCLPPCHECK(mscclppSocketProgress(MSCCLPP_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
-  if (received == 0) return mscclppSuccess;
+  if (received == 0)
+    return mscclppSuccess;
  MSCCLPPCHECK(socketWait(MSCCLPP_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
  if (magic != sock->magic) {
    WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic);
@@ -453,7 +500,8 @@ static mscclppResult_t socketFinalizeAccept(struct mscclppSocket* sock) {
  return mscclppSuccess;
 }

-static mscclppResult_t socketStartConnect(struct mscclppSocket* sock) {
+static mscclppResult_t socketStartConnect(struct mscclppSocket* sock)
+{
  /* blocking/non-blocking connect() is determined by asyncFlag. */
  int ret = connect(sock->fd, &sock->addr.sa, sock->salen);

@@ -470,7 +518,8 @@ static mscclppResult_t socketStartConnect(struct mscclppSocket* sock) {
      return mscclppRemoteError;
    }
    usleep(SLEEP_INT);
-    if (sock->refusedRetries % 1000 == 0) INFO(MSCCLPP_ALL, "Call to connect returned %s, retrying", strerror(errno));
+    if (sock->refusedRetries % 1000 == 0)
+      INFO(MSCCLPP_ALL, "Call to connect returned %s, retrying", strerror(errno));
    return mscclppSuccess;
  } else if (errno == ETIMEDOUT) {
    if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) {
@@ -481,14 +530,15 @@ static mscclppResult_t socketStartConnect(struct mscclppSocket* sock) {
    usleep(SLEEP_INT);
    return mscclppSuccess;
  } else {
-    char line[SOCKET_NAME_MAXLEN+1];
+    char line[SOCKET_NAME_MAXLEN + 1];
    sock->state = mscclppSocketStateError;
    WARN("socketStartConnect: Connect to %s failed : %s", mscclppSocketToString(&sock->addr, line), strerror(errno));
    return mscclppSystemError;
  }
 }

-static mscclppResult_t socketPollConnect(struct mscclppSocket* sock) {
+static mscclppResult_t socketPollConnect(struct mscclppSocket* sock)
+{
  struct pollfd pfd;
  int timeout = 1, ret;
  socklen_t rlen = sizeof(int);
@@ -497,7 +547,8 @@ static mscclppResult_t socketPollConnect(struct mscclppSocket* sock) {
  pfd.fd = sock->fd;
  pfd.events = POLLOUT;
  SYSCHECK(ret = poll(&pfd, 1, timeout), "poll");
-  if (ret == 0) return mscclppSuccess;
+  if (ret == 0)
+    return mscclppSuccess;

  /* check socket status */
  EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0);
@@ -511,7 +562,8 @@ static mscclppResult_t socketPollConnect(struct mscclppSocket* sock) {
      WARN("socketPollConnect: exceeded retries (%d)", sock->refusedRetries);
      return mscclppRemoteError;
    }
-    if (sock->refusedRetries % 1000 == 0) INFO(MSCCLPP_ALL, "Call to connect returned %s, retrying", strerror(errno));
+    if (sock->refusedRetries % 1000 == 0)
+      INFO(MSCCLPP_ALL, "Call to connect returned %s, retrying", strerror(errno));
    usleep(SLEEP_INT);

    close(sock->fd);
@@ -535,7 +587,8 @@ static mscclppResult_t socketPollConnect(struct mscclppSocket* sock) {
  return mscclppSuccess;
 }

-mscclppResult_t mscclppSocketPollConnect(struct mscclppSocket* sock) {
+mscclppResult_t mscclppSocketPollConnect(struct mscclppSocket* sock)
+{
  if (sock == NULL) {
    WARN("mscclppSocketPollConnect: pass NULL socket");
    return mscclppInvalidArgument;
@@ -544,10 +597,12 @@ mscclppResult_t mscclppSocketPollConnect(struct mscclppSocket* sock) {
  return mscclppSuccess;
 }

-static mscclppResult_t socketFinalizeConnect(struct mscclppSocket* sock) {
+static mscclppResult_t socketFinalizeConnect(struct mscclppSocket* sock)
+{
  int sent = 0;
  MSCCLPPCHECK(socketProgress(MSCCLPP_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
-  if (sent == 0) return mscclppSuccess;
+  if (sent == 0)
+    return mscclppSuccess;
  MSCCLPPCHECK(socketWait(MSCCLPP_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
  sent = 0;
  MSCCLPPCHECK(socketWait(MSCCLPP_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
@@ -555,7 +610,8 @@ static mscclppResult_t socketFinalizeConnect(struct mscclppSocket* sock) {
  return mscclppSuccess;
 }

-static mscclppResult_t socketProgressState(struct mscclppSocket* sock) {
+static mscclppResult_t socketProgressState(struct mscclppSocket* sock)
+{
  if (sock->state == mscclppSocketStateAccepting) {
    MSCCLPPCHECK(socketTryAccept(sock));
  }
@@ -591,9 +647,10 @@ static mscclppResult_t socketProgressState(struct mscclppSocket* sock) {
 //   return mscclppSuccess;
 // }

-mscclppResult_t mscclppSocketConnect(struct mscclppSocket* sock) {
+mscclppResult_t mscclppSocketConnect(struct mscclppSocket* sock)
+{
 #ifdef ENABLE_TRACE
-  char line[SOCKET_NAME_MAXLEN+1];
+  char line[SOCKET_NAME_MAXLEN + 1];
 #endif
  const int one = 1;

@@ -608,39 +665,40 @@ mscclppResult_t mscclppSocketConnect(struct mscclppSocket* sock) {

  if (sock->state != mscclppSocketStateInitialized) {
    WARN("mscclppSocketConnect: wrong socket state %d", sock->state);
-    if (sock->state == mscclppSocketStateError) return mscclppRemoteError;
+    if (sock->state == mscclppSocketStateError)
+      return mscclppRemoteError;
    return mscclppInternalError;
  }
-  TRACE(MSCCLPP_INIT|MSCCLPP_NET,"Connecting to socket %s", mscclppSocketToString(&sock->addr, line));
+  TRACE(MSCCLPP_INIT | MSCCLPP_NET, "Connecting to socket %s", mscclppSocketToString(&sock->addr, line));

  SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");

  sock->state = mscclppSocketStateConnecting;
  do {
    MSCCLPPCHECK(socketProgressState(sock));
-  } while (sock->asyncFlag == 0 &&
-      (sock->abortFlag == NULL || *sock->abortFlag == 0) &&
-      (sock->state == mscclppSocketStateConnecting ||
-       sock->state == mscclppSocketStateConnectPolling ||
-       sock->state == mscclppSocketStateConnected));
+  } while (sock->asyncFlag == 0 && (sock->abortFlag == NULL || *sock->abortFlag == 0) &&
+           (sock->state == mscclppSocketStateConnecting || sock->state == mscclppSocketStateConnectPolling ||
+            sock->state == mscclppSocketStateConnected));

-  if (sock->abortFlag && *sock->abortFlag != 0) return mscclppInternalError;
+  if (sock->abortFlag && *sock->abortFlag != 0)
+    return mscclppInternalError;

  switch (sock->state) {
-    case mscclppSocketStateConnecting:
-    case mscclppSocketStateConnectPolling:
-    case mscclppSocketStateConnected:
-    case mscclppSocketStateReady:
-      return mscclppSuccess;
-    case mscclppSocketStateError:
-      return mscclppSystemError;
-    default:
-      WARN("mscclppSocketConnect: wrong socket state %d", sock->state);
-      return mscclppInternalError;
+  case mscclppSocketStateConnecting:
+  case mscclppSocketStateConnectPolling:
+  case mscclppSocketStateConnected:
+  case mscclppSocketStateReady:
+    return mscclppSuccess;
+  case mscclppSocketStateError:
+    return mscclppSystemError;
+  default:
+    WARN("mscclppSocketConnect: wrong socket state %d", sock->state);
+    return mscclppInternalError;
  }
 }

-mscclppResult_t mscclppSocketAccept(struct mscclppSocket* sock, struct mscclppSocket* listenSock) {
+mscclppResult_t mscclppSocketAccept(struct mscclppSocket* sock, struct mscclppSocket* listenSock)
+{
  mscclppResult_t ret = mscclppSuccess;

  if (listenSock == NULL || sock == NULL) {
@@ -665,36 +723,38 @@ mscclppResult_t mscclppSocketAccept(struct mscclppSocket* sock, struct mscclppSo

  do {
    MSCCLPPCHECKGOTO(socketProgressState(sock), ret, exit);
-  } while (sock->asyncFlag == 0 &&
-      (sock->abortFlag == NULL || *sock->abortFlag == 0) &&
-      (sock->state == mscclppSocketStateAccepting ||
-       sock->state == mscclppSocketStateAccepted));
+  } while (sock->asyncFlag == 0 && (sock->abortFlag == NULL || *sock->abortFlag == 0) &&
+           (sock->state == mscclppSocketStateAccepting || sock->state == mscclppSocketStateAccepted));

-  if (sock->abortFlag && *sock->abortFlag != 0) return mscclppInternalError;
+  if (sock->abortFlag && *sock->abortFlag != 0)
+    return mscclppInternalError;

  switch (sock->state) {
-    case mscclppSocketStateAccepting:
-    case mscclppSocketStateAccepted:
-    case mscclppSocketStateReady:
-      ret = mscclppSuccess;
-      break;
-    case mscclppSocketStateError:
-      ret = mscclppSystemError;
-      break;
-    default:
-      WARN("mscclppSocketAccept: wrong socket state %d", sock->state);
-      ret = mscclppInternalError;
-      break;
+  case mscclppSocketStateAccepting:
+  case mscclppSocketStateAccepted:
+  case mscclppSocketStateReady:
+    ret = mscclppSuccess;
+    break;
+  case mscclppSocketStateError:
+    ret = mscclppSystemError;
+    break;
+  default:
+    WARN("mscclppSocketAccept: wrong socket state %d", sock->state);
+    ret = mscclppInternalError;
+    break;
  }

 exit:
  return ret;
 }

-mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocketAddress* addr, uint64_t magic, enum mscclppSocketType type, volatile uint32_t* abortFlag, int asyncFlag) {
+mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocketAddress* addr, uint64_t magic,
+                                  enum mscclppSocketType type, volatile uint32_t* abortFlag, int asyncFlag)
+{
  mscclppResult_t ret = mscclppSuccess;

-  if (sock == NULL) goto exit;
+  if (sock == NULL)
+    goto exit;
  sock->timedOutRetries = 0;
  sock->refusedRetries = 0;
  sock->acceptRetries = 0;
@@ -712,9 +772,9 @@ mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocke
    memcpy(&sock->addr, addr, sizeof(union mscclppSocketAddress));
    family = sock->addr.sa.sa_family;
    if (family != AF_INET && family != AF_INET6) {
-      char line[SOCKET_NAME_MAXLEN+1];
+      char line[SOCKET_NAME_MAXLEN + 1];
      WARN("mscclppSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
-          mscclppSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
+           mscclppSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
      ret = mscclppInternalError;
      goto fail;
    }
@@ -744,7 +804,8 @@ fail:
  goto exit;
 }

-mscclppResult_t mscclppSocketProgress(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset) {
+mscclppResult_t mscclppSocketProgress(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset)
+{
  if (sock == NULL) {
    WARN("mscclppSocketProgress: pass NULL socket");
    return mscclppInvalidArgument;
@@ -762,7 +823,8 @@ mscclppResult_t mscclppSocketProgress(int op, struct mscclppSocket* sock, void*
 //   return mscclppSuccess;
 // }

-mscclppResult_t mscclppSocketSend(struct mscclppSocket* sock, void* ptr, int size) {
+mscclppResult_t mscclppSocketSend(struct mscclppSocket* sock, void* ptr, int size)
+{
  int offset = 0;
  if (sock == NULL) {
    WARN("mscclppSocketSend: pass NULL socket");
@@ -776,7 +838,8 @@ mscclppResult_t mscclppSocketSend(struct mscclppSocket* sock, void* ptr, int siz
  return mscclppSuccess;
 }

-mscclppResult_t mscclppSocketRecv(struct mscclppSocket* sock, void* ptr, int size) {
+mscclppResult_t mscclppSocketRecv(struct mscclppSocket* sock, void* ptr, int size)
+{
  int offset = 0;
  if (sock == NULL) {
    WARN("mscclppSocketRecv: pass NULL socket");
@@ -805,9 +868,11 @@ mscclppResult_t mscclppSocketRecv(struct mscclppSocket* sock, void* ptr, int siz
 //   return mscclppSuccess;
 // }

-mscclppResult_t mscclppSocketClose(struct mscclppSocket* sock) {
+mscclppResult_t mscclppSocketClose(struct mscclppSocket* sock)
+{
  if (sock != NULL) {
-    if (sock->fd >= 0) close(sock->fd);
+    if (sock->fd >= 0)
+      close(sock->fd);
    sock->state = mscclppSocketStateClosed;
    sock->fd = -1;
  }
--- a/src/debug.cc
+++ b/src/debug.cc
@@ -4,27 +4,31 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "core.h"
 #include "debug.h"
-#include <stdlib.h>
+#include "core.h"
 #include <stdarg.h>
+#include <stdlib.h>
 #include <sys/syscall.h>

 int mscclppDebugLevel = -1;
 static int pid = -1;
 static char hostname[1024];
 thread_local int mscclppDebugNoWarn = 0;
-char mscclppLastError[1024] = ""; // Global string for the last error in human readable form
+char mscclppLastError[1024] = "";         // Global string for the last error in human readable form
 uint64_t mscclppDebugMask = MSCCLPP_INIT; // Default debug sub-system mask is INIT
-FILE *mscclppDebugFile = stdout;
+FILE* mscclppDebugFile = stdout;
 pthread_mutex_t mscclppDebugLock = PTHREAD_MUTEX_INITIALIZER;
 std::chrono::steady_clock::time_point mscclppEpoch;

 static __thread int tid = -1;

-void mscclppDebugInit() {
+void mscclppDebugInit()
+{
  pthread_mutex_lock(&mscclppDebugLock);
-  if (mscclppDebugLevel != -1) { pthread_mutex_unlock(&mscclppDebugLock); return; }
+  if (mscclppDebugLevel != -1) {
+    pthread_mutex_unlock(&mscclppDebugLock);
+    return;
+  }
  const char* mscclpp_debug = getenv("MSCCLPP_DEBUG");
  int tempNcclDebugLevel = -1;
  if (mscclpp_debug == NULL) {
@@ -48,10 +52,13 @@ void mscclppDebugInit() {
  char* mscclppDebugSubsysEnv = getenv("MSCCLPP_DEBUG_SUBSYS");
  if (mscclppDebugSubsysEnv != NULL) {
    int invert = 0;
-    if (mscclppDebugSubsysEnv[0] == '^') { invert = 1; mscclppDebugSubsysEnv++; }
+    if (mscclppDebugSubsysEnv[0] == '^') {
+      invert = 1;
+      mscclppDebugSubsysEnv++;
+    }
    mscclppDebugMask = invert ? ~0ULL : 0ULL;
-    char *mscclppDebugSubsys = strdup(mscclppDebugSubsysEnv);
-    char *subsys = strtok(mscclppDebugSubsys, ",");
+    char* mscclppDebugSubsys = strdup(mscclppDebugSubsysEnv);
+    char* subsys = strtok(mscclppDebugSubsys, ",");
    while (subsys != NULL) {
      uint64_t mask = 0;
      if (strcasecmp(subsys, "INIT") == 0) {
@@ -78,7 +85,10 @@ void mscclppDebugInit() {
        mask = MSCCLPP_ALL;
      }
      if (mask) {
-        if (invert) mscclppDebugMask &= ~mask; else mscclppDebugMask |= mask;
+        if (invert)
+          mscclppDebugMask &= ~mask;
+        else
+          mscclppDebugMask |= mask;
      }
      subsys = strtok(NULL, ",");
    }
@@ -96,32 +106,32 @@ void mscclppDebugInit() {
  const char* mscclppDebugFileEnv = getenv("MSCCLPP_DEBUG_FILE");
  if (tempNcclDebugLevel > MSCCLPP_LOG_VERSION && mscclppDebugFileEnv != NULL) {
    int c = 0;
-    char debugFn[PATH_MAX+1] = "";
-    char *dfn = debugFn;
+    char debugFn[PATH_MAX + 1] = "";
+    char* dfn = debugFn;
    while (mscclppDebugFileEnv[c] != '\0' && c < PATH_MAX) {
      if (mscclppDebugFileEnv[c++] != '%') {
-        *dfn++ = mscclppDebugFileEnv[c-1];
+        *dfn++ = mscclppDebugFileEnv[c - 1];
        continue;
      }
      switch (mscclppDebugFileEnv[c++]) {
-        case '%': // Double %
-          *dfn++ = '%';
-          break;
-        case 'h': // %h = hostname
-          dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
-          break;
-        case 'p': // %p = pid
-          dfn += snprintf(dfn, PATH_MAX, "%d", pid);
-          break;
-        default: // Echo everything we don't understand
-          *dfn++ = '%';
-          *dfn++ = mscclppDebugFileEnv[c-1];
-          break;
+      case '%': // Double %
+        *dfn++ = '%';
+        break;
+      case 'h': // %h = hostname
+        dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
+        break;
+      case 'p': // %p = pid
+        dfn += snprintf(dfn, PATH_MAX, "%d", pid);
+        break;
+      default: // Echo everything we don't understand
+        *dfn++ = '%';
+        *dfn++ = mscclppDebugFileEnv[c - 1];
+        break;
      }
    }
    *dfn = '\0';
    if (debugFn[0] != '\0') {
-      FILE *file = fopen(debugFn, "w");
+      FILE* file = fopen(debugFn, "w");
      if (file != nullptr) {
        setbuf(file, nullptr); // disable buffering
        mscclppDebugFile = file;
@@ -138,20 +148,27 @@ void mscclppDebugInit() {
 * Also exported to the dynamically loadable Net transport modules so
 * they can share the debugging mechanisms and output files
 */
-void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
-  if (__atomic_load_n(&mscclppDebugLevel, __ATOMIC_ACQUIRE) == -1) mscclppDebugInit();
-  if (mscclppDebugNoWarn != 0 && level == MSCCLPP_LOG_WARN) { level = MSCCLPP_LOG_INFO; flags = mscclppDebugNoWarn; }
+void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char* filefunc, int line, const char* fmt,
+                     ...)
+{
+  if (__atomic_load_n(&mscclppDebugLevel, __ATOMIC_ACQUIRE) == -1)
+    mscclppDebugInit();
+  if (mscclppDebugNoWarn != 0 && level == MSCCLPP_LOG_WARN) {
+    level = MSCCLPP_LOG_INFO;
+    flags = mscclppDebugNoWarn;
+  }

  // Save the last error (WARN) as a human readable string
  if (level == MSCCLPP_LOG_WARN) {
    pthread_mutex_lock(&mscclppDebugLock);
    va_list vargs;
    va_start(vargs, fmt);
-    (void) vsnprintf(mscclppLastError, sizeof(mscclppLastError), fmt, vargs);
+    (void)vsnprintf(mscclppLastError, sizeof(mscclppLastError), fmt, vargs);
    va_end(vargs);
    pthread_mutex_unlock(&mscclppDebugLock);
  }
-  if (mscclppDebugLevel < level || ((flags & mscclppDebugMask) == 0)) return;
+  if (mscclppDebugLevel < level || ((flags & mscclppDebugMask) == 0))
+    return;

  if (tid == -1) {
    tid = syscall(SYS_gettid);
@@ -165,23 +182,23 @@ void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char
  char buffer[1024];
  size_t len = 0;
  if (level == MSCCLPP_LOG_WARN) {
-    len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d MSCCLPP WARN ",
-                   hostname, pid, tid, cudaDev, filefunc, line);
+    len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d MSCCLPP WARN ", hostname, pid, tid, cudaDev, filefunc,
+                   line);
  } else if (level == MSCCLPP_LOG_INFO) {
    len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] MSCCLPP INFO ", hostname, pid, tid, cudaDev);
  } else if (level == MSCCLPP_LOG_TRACE && flags == MSCCLPP_CALL) {
    len = snprintf(buffer, sizeof(buffer), "%s:%d:%d MSCCLPP CALL ", hostname, pid, tid);
  } else if (level == MSCCLPP_LOG_TRACE) {
    auto delta = std::chrono::steady_clock::now() - mscclppEpoch;
-    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
-    len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d MSCCLPP TRACE ",
-                   hostname, pid, tid, cudaDev, timestamp, filefunc, line);
+    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count() * 1000;
+    len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d MSCCLPP TRACE ", hostname, pid, tid, cudaDev,
+                   timestamp, filefunc, line);
  }

  if (len) {
    va_list vargs;
    va_start(vargs, fmt);
-    len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
+    len += vsnprintf(buffer + len, sizeof(buffer) - len, fmt, vargs);
    va_end(vargs);
    buffer[len++] = '\n';
    fwrite(buffer, 1, len, mscclppDebugFile);
@@ -190,11 +207,13 @@ void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char

 MSCCLPP_PARAM(SetThreadName, "SET_THREAD_NAME", 0);

-void mscclppSetThreadName(pthread_t thread, const char *fmt, ...) {
+void mscclppSetThreadName(pthread_t thread, const char* fmt, ...)
+{
  // pthread_setname_np is nonstandard GNU extension
  // needs the following feature test macro
 #ifdef _GNU_SOURCE
-  if (mscclppParamSetThreadName() != 1) return;
+  if (mscclppParamSetThreadName() != 1)
+    return;
  char threadName[MSCCLPP_THREAD_NAMELEN];
  va_list vargs;
  va_start(vargs, fmt);
--- a/src/gdr.cc
+++ b/src/gdr.cc
@@ -3,11 +3,13 @@
 // Used to make the GDR library calls thread safe
 pthread_mutex_t gdrLock = PTHREAD_MUTEX_INITIALIZER;

-gdr_t wrap_gdr_open(void) {
+gdr_t wrap_gdr_open(void)
+{
  return gdr_open();
 }

-mscclppResult_t wrap_gdr_close(gdr_t g) {
+mscclppResult_t wrap_gdr_close(gdr_t g)
+{
  int ret = gdr_close(g);
  if (ret != 0) {
    WARN("gdr_close() failed: %d", ret);
@@ -16,7 +18,9 @@ mscclppResult_t wrap_gdr_close(gdr_t g) {
  return mscclppSuccess;
 }

-mscclppResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) {
+mscclppResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space,
+                                    gdr_mh_t* handle)
+{
  int ret;
  GDRLOCKCALL(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle), ret);
  if (ret != 0) {
@@ -26,7 +30,8 @@ mscclppResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, ui
  return mscclppSuccess;
 }

-mscclppResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) {
+mscclppResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle)
+{
  int ret;
  GDRLOCKCALL(gdr_unpin_buffer(g, handle), ret);
  if (ret != 0) {
@@ -36,7 +41,8 @@ mscclppResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) {
  return mscclppSuccess;
 }

-mscclppResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) {
+mscclppResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t* info)
+{
  int ret;
  GDRLOCKCALL(gdr_get_info(g, handle, info), ret);
  if (ret != 0) {
@@ -46,7 +52,8 @@ mscclppResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) {
  return mscclppSuccess;
 }

-mscclppResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
+mscclppResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void** va, size_t size)
+{
  int ret;
  GDRLOCKCALL(gdr_map(g, handle, va, size), ret);
  if (ret != 0) {
@@ -56,7 +63,8 @@ mscclppResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
  return mscclppSuccess;
 }

-mscclppResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
+mscclppResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void* va, size_t size)
+{
  int ret;
  GDRLOCKCALL(gdr_unmap(g, handle, va, size), ret);
  if (ret != 0) {
--- a/src/ib.cc
+++ b/src/ib.cc
@@ -2,23 +2,23 @@
 #include <cstdlib>
 #include <cstring>
 #include <malloc.h>
-#include <vector>
 #include <unistd.h>
+#include <vector>

-#include "debug.h"
 #include "alloc.h"
 #include "comm.h"
+#include "debug.h"
 #include "ib.h"

-static int getIbDevNumaNode(const char *ibDevPath)
+static int getIbDevNumaNode(const char* ibDevPath)
 {
  if (ibDevPath == NULL) {
    WARN("ibDevPath is NULL");
    return -1;
  }
-  const char *postfix = "/device/numa_node";
-  FILE *fp = NULL;
-  char *filePath = NULL;
+  const char* postfix = "/device/numa_node";
+  FILE* fp = NULL;
+  char* filePath = NULL;
  int node = -1;
  int res;
  if (mscclppCalloc(&filePath, strlen(ibDevPath) + strlen(postfix) + 1) != mscclppSuccess) {
@@ -52,16 +52,16 @@ exit:
  return node;
 }

-mscclppResult_t mscclppIbContextCreate(struct mscclppIbContext **ctx, const char *ibDevName)
+mscclppResult_t mscclppIbContextCreate(struct mscclppIbContext** ctx, const char* ibDevName)
 {
-  struct mscclppIbContext *_ctx;
+  struct mscclppIbContext* _ctx;
  MSCCLPPCHECK(mscclppCalloc(&_ctx, 1));

  std::vector<int> ports;

  int num;
-  const char *ibDevPath = NULL;
-  struct ibv_device **devices = ibv_get_device_list(&num);
+  const char* ibDevPath = NULL;
+  struct ibv_device** devices = ibv_get_device_list(&num);
  for (int i = 0; i < num; ++i) {
    if (strncmp(devices[i]->name, ibDevName, IBV_SYSFS_NAME_MAX) == 0) {
      _ctx->ctx = ibv_open_device(devices[i]);
@@ -96,8 +96,7 @@ mscclppResult_t mscclppIbContextCreate(struct mscclppIbContext **ctx, const char
    if (portAttr.state != IBV_PORT_ACTIVE) {
      continue;
    }
-    if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND &&
-      portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+    if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) {
      continue;
    }
    ports.push_back((int)i);
@@ -129,7 +128,7 @@ fail:
  return mscclppInternalError;
 }

-mscclppResult_t mscclppIbContextDestroy(struct mscclppIbContext *ctx)
+mscclppResult_t mscclppIbContextDestroy(struct mscclppIbContext* ctx)
 {
  for (int i = 0; i < ctx->nMrs; ++i) {
    if (ctx->mrs[i].mr) {
@@ -158,7 +157,7 @@ mscclppResult_t mscclppIbContextDestroy(struct mscclppIbContext *ctx)
  return mscclppSuccess;
 }

-mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext *ctx, struct mscclppIbQp **ibQp, int port/*=-1*/)
+mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext* ctx, struct mscclppIbQp** ibQp, int port /*=-1*/)
 {
  if (port < 0) {
    port = ctx->ports[0];
@@ -176,7 +175,7 @@ mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext *ctx, struct ms
    }
  }

-  struct ibv_cq *cq = ibv_create_cq(ctx->ctx, MSCCLPP_IB_CQ_SIZE, NULL, NULL, 0);
+  struct ibv_cq* cq = ibv_create_cq(ctx->ctx, MSCCLPP_IB_CQ_SIZE, NULL, NULL, 0);
  if (cq == NULL) {
    WARN("ibv_create_cq failed (errno %d)", errno);
    return mscclppInternalError;
@@ -193,7 +192,7 @@ mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext *ctx, struct ms
  qp_init_attr.cap.max_send_sge = 1;
  qp_init_attr.cap.max_recv_sge = 1;
  qp_init_attr.cap.max_inline_data = 0;
-  struct ibv_qp *qp = ibv_create_qp(ctx->pd, &qp_init_attr);
+  struct ibv_qp* qp = ibv_create_qp(ctx->pd, &qp_init_attr);
  if (qp == nullptr) {
    WARN("ibv_create_qp failed (errno %d)", errno);
    return mscclppInternalError;
@@ -219,7 +218,7 @@ mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext *ctx, struct ms
    WARN("too many QPs");
    return mscclppInternalError;
  }
-  struct mscclppIbQp *_ibQp = &ctx->qps[ctx->nQps - 1];
+  struct mscclppIbQp* _ibQp = &ctx->qps[ctx->nQps - 1];
  _ibQp->qp = qp;
  _ibQp->info.lid = port_attr.lid;
  _ibQp->info.port = port;
@@ -229,8 +228,8 @@ mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext *ctx, struct ms
  if (port_attr.link_layer != IBV_LINK_LAYER_INFINIBAND) {
    union ibv_gid gid;
    if (ibv_query_gid(ctx->ctx, port, 0, &gid) != 0) {
-        WARN("ibv_query_gid failed (errno %d)", errno);
-        return mscclppInternalError;
+      WARN("ibv_query_gid failed (errno %d)", errno);
+      return mscclppInternalError;
    }
    _ibQp->info.spn = gid.global.subnet_prefix;
  }
@@ -256,7 +255,8 @@ mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext *ctx, struct ms
  return mscclppSuccess;
 }

-mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext *ctx, void *buff, size_t size, struct mscclppIbMr **ibMr)
+mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext* ctx, void* buff, size_t size,
+                                           struct mscclppIbMr** ibMr)
 {
  if (size == 0) {
    WARN("invalid size: %zu", size);
@@ -271,8 +271,8 @@ mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext *ctx, void *b
  }
  uintptr_t addr = reinterpret_cast<uintptr_t>(buff) & -pageSize;
  size_t pages = (size + (reinterpret_cast<uintptr_t>(buff) - addr) + pageSize - 1) / pageSize;
-  struct ibv_mr *mr =
-    ibv_reg_mr(ctx->pd, reinterpret_cast<void *>(addr), pages * pageSize,
+  struct ibv_mr* mr =
+    ibv_reg_mr(ctx->pd, reinterpret_cast<void*>(addr), pages * pageSize,
               IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_RELAXED_ORDERING);
  if (mr == nullptr) {
    WARN("ibv_reg_mr failed (errno %d)", errno);
@@ -287,7 +287,7 @@ mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext *ctx, void *b
    WARN("too many MRs");
    return mscclppInternalError;
  }
-  struct mscclppIbMr *_ibMr = &ctx->mrs[ctx->nMrs - 1];
+  struct mscclppIbMr* _ibMr = &ctx->mrs[ctx->nMrs - 1];
  _ibMr->mr = mr;
  _ibMr->buff = buff;
  _ibMr->info.addr = (uint64_t)buff;
@@ -298,7 +298,7 @@ mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext *ctx, void *b

 //////////////////////////////////////////////////////////////////////////////

-int mscclppIbQp::rtr(const mscclppIbQpInfo *info)
+int mscclppIbQp::rtr(const mscclppIbQpInfo* info)
 {
  struct ibv_qp_attr qp_attr;
  std::memset(&qp_attr, 0, sizeof(struct ibv_qp_attr));
@@ -324,8 +324,8 @@ int mscclppIbQp::rtr(const mscclppIbQpInfo *info)
  qp_attr.ah_attr.src_path_bits = 0;
  qp_attr.ah_attr.port_num = info->port;
  return ibv_modify_qp(this->qp, &qp_attr,
-    IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
-      IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER);
+                       IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
+                         IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER);
 }

 int mscclppIbQp::rts()
@@ -339,19 +339,19 @@ int mscclppIbQp::rts()
  qp_attr.sq_psn = 0;
  qp_attr.max_rd_atomic = 1;
  return ibv_modify_qp(this->qp, &qp_attr,
-    IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN |
-      IBV_QP_MAX_QP_RD_ATOMIC);
+                       IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN |
+                         IBV_QP_MAX_QP_RD_ATOMIC);
 }

-int mscclppIbQp::stageSend(struct mscclppIbMr *ibMr, const mscclppIbMrInfo *info, uint32_t size,
-                           uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled)
+int mscclppIbQp::stageSend(struct mscclppIbMr* ibMr, const mscclppIbMrInfo* info, uint32_t size, uint64_t wrId,
+                           uint64_t srcOffset, uint64_t dstOffset, bool signaled)
 {
  if (this->wrn >= MSCCLPP_IB_MAX_SENDS) {
    return -1;
  }
  int wrn = this->wrn;
-  struct ibv_send_wr *wr_ = &this->wrs[wrn];
-  struct ibv_sge *sge_ = &this->sges[wrn];
+  struct ibv_send_wr* wr_ = &this->wrs[wrn];
+  struct ibv_sge* sge_ = &this->sges[wrn];
  // std::memset(wr_, 0, sizeof(struct ibv_send_wr));
  // std::memset(sge_, 0, sizeof(struct ibv_sge));
  wr_->wr_id = wrId;
@@ -372,8 +372,8 @@ int mscclppIbQp::stageSend(struct mscclppIbMr *ibMr, const mscclppIbMrInfo *info
  return this->wrn;
 }

-int mscclppIbQp::stageSendWithImm(struct mscclppIbMr *ibMr, const mscclppIbMrInfo *info, uint32_t size,
-                                  uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData)
+int mscclppIbQp::stageSendWithImm(struct mscclppIbMr* ibMr, const mscclppIbMrInfo* info, uint32_t size, uint64_t wrId,
+                                  uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData)
 {
  int wrn = this->stageSend(ibMr, info, size, wrId, srcOffset, dstOffset, signaled);
  this->wrs[wrn - 1].opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
@@ -387,7 +387,7 @@ int mscclppIbQp::postSend()
    return 0;
  }

-  struct ibv_send_wr *bad_wr;
+  struct ibv_send_wr* bad_wr;
  int ret = ibv_post_send(this->qp, this->wrs, &bad_wr);
  if (ret != 0) {
    return ret;
--- a/src/include/align.h
+++ b/src/include/align.h
@@ -7,38 +7,35 @@
 #ifndef NCCL_ALIGN_H_
 #define NCCL_ALIGN_H_

-#define DIVUP(x, y) \
-    (((x)+(y)-1)/(y))
+#define DIVUP(x, y) (((x) + (y)-1) / (y))

-#define ROUNDUP(x, y) \
-    (DIVUP((x), (y))*(y))
+#define ROUNDUP(x, y) (DIVUP((x), (y)) * (y))

-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
+#define ALIGN_SIZE(size, align) size = ((size + (align)-1) / (align)) * (align);

 #if !__CUDA_ARCH__
-  #ifndef __host__
-    #define __host__
-  #endif
-  #ifndef __device__
-    #define __device__
-  #endif
+#ifndef __host__
+#define __host__
+#endif
+#ifndef __device__
+#define __device__
+#endif
 #endif

-template<typename X, typename Y, typename Z = decltype(X()+Y())>
-__host__ __device__ constexpr Z divUp(X x, Y y) {
-  return (x+y-1)/y;
+template <typename X, typename Y, typename Z = decltype(X() + Y())> __host__ __device__ constexpr Z divUp(X x, Y y)
+{
+  return (x + y - 1) / y;
 }

-template<typename X, typename Y, typename Z = decltype(X()+Y())>
-__host__ __device__ constexpr Z roundUp(X x, Y y) {
-  return (x+y-1) - (x+y-1)%y;
+template <typename X, typename Y, typename Z = decltype(X() + Y())> __host__ __device__ constexpr Z roundUp(X x, Y y)
+{
+  return (x + y - 1) - (x + y - 1) % y;
 }

 // assumes second argument is a power of 2
-template<typename X, typename Z = decltype(X()+int())>
-__host__ __device__ constexpr Z alignUp(X x, int a) {
-  return (x+a-1) & Z(-a);
+template <typename X, typename Z = decltype(X() + int())> __host__ __device__ constexpr Z alignUp(X x, int a)
+{
+  return (x + a - 1) & Z(-a);
 }

 #endif
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -7,88 +7,94 @@
 #ifndef MSCCLPP_ALLOC_H_
 #define MSCCLPP_ALLOC_H_

-#include "mscclpp.h"
-#include "checks.h"
 #include "align.h"
+#include "checks.h"
+#include "mscclpp.h"
 #include "utils.h"
-#include <sys/mman.h>
-#include <unistd.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>

 uint64_t clockNano(); // from utils.h with which we have a circular dependency

-template <typename T>
-mscclppResult_t mscclppCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
+template <typename T> mscclppResult_t mscclppCudaHostCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line)
+{
  mscclppResult_t result = mscclppSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  *ptr = nullptr;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish);
-  memset(*ptr, 0, nelem*sizeof(T));
+  CUDACHECKGOTO(cudaHostAlloc(ptr, nelem * sizeof(T), cudaHostAllocMapped), result, finish);
+  memset(*ptr, 0, nelem * sizeof(T));
 finish:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T));
-  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
+  if (*ptr == nullptr)
+    WARN("Failed to CUDA host alloc %ld bytes", nelem * sizeof(T));
+  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
  return result;
 }
 #define mscclppCudaHostCalloc(...) mscclppCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)

-inline mscclppResult_t mscclppCudaHostFree(void* ptr) {
+inline mscclppResult_t mscclppCudaHostFree(void* ptr)
+{
  CUDACHECK(cudaFreeHost(ptr));
  return mscclppSuccess;
 }

-template <typename T>
-mscclppResult_t mscclppCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
-  void* p = malloc(nelem*sizeof(T));
+template <typename T> mscclppResult_t mscclppCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line)
+{
+  void* p = malloc(nelem * sizeof(T));
  if (p == NULL) {
-    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
+    WARN("Failed to malloc %ld bytes", nelem * sizeof(T));
    return mscclppSystemError;
  }
-  INFO(MSCCLPP_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p);
-  memset(p, 0, nelem*sizeof(T));
+  INFO(MSCCLPP_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), p);
+  memset(p, 0, nelem * sizeof(T));
  *ptr = (T*)p;
  return mscclppSuccess;
 }
 #define mscclppCalloc(...) mscclppCallocDebug(__VA_ARGS__, __FILE__, __LINE__)

-template <typename T>
-mscclppResult_t mscclppRealloc(T** ptr, size_t oldNelem, size_t nelem) {
-  if (nelem < oldNelem) return mscclppInternalError;
-  if (nelem == oldNelem) return mscclppSuccess;
+template <typename T> mscclppResult_t mscclppRealloc(T** ptr, size_t oldNelem, size_t nelem)
+{
+  if (nelem < oldNelem)
+    return mscclppInternalError;
+  if (nelem == oldNelem)
+    return mscclppSuccess;

  T* oldp = *ptr;
-  T* p = (T*)malloc(nelem*sizeof(T));
+  T* p = (T*)malloc(nelem * sizeof(T));
  if (p == NULL) {
-    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
+    WARN("Failed to malloc %ld bytes", nelem * sizeof(T));
    return mscclppSystemError;
  }
-  memcpy(p, oldp, oldNelem*sizeof(T));
+  memcpy(p, oldp, oldNelem * sizeof(T));
  free(oldp);
-  memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T));
+  memset(p + oldNelem, 0, (nelem - oldNelem) * sizeof(T));
  *ptr = (T*)p;
-  INFO(MSCCLPP_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr);
+  INFO(MSCCLPP_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem * sizeof(T), nelem * sizeof(T),
+       *ptr);
  return mscclppSuccess;
 }

-template <typename T>
-mscclppResult_t mscclppCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
+template <typename T> mscclppResult_t mscclppCudaMallocDebug(T** ptr, size_t nelem, const char* filefunc, int line)
+{
  mscclppResult_t result = mscclppSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  *ptr = nullptr;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
+  CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
 finish:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T));
-  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
+  if (*ptr == nullptr)
+    WARN("Failed to CUDA malloc %ld bytes", nelem * sizeof(T));
+  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
  return result;
 }
 #define mscclppCudaMalloc(...) mscclppCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__)

-template <typename T>
-mscclppResult_t mscclppCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
+template <typename T> mscclppResult_t mscclppCudaCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line)
+{
  mscclppResult_t result = mscclppSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  *ptr = nullptr;
@@ -96,36 +102,39 @@ mscclppResult_t mscclppCudaCallocDebug(T** ptr, size_t nelem, const char *filefu
  // Need a side stream so as not to interfere with graph capture.
  cudaStream_t stream;
  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-  CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
-  CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
+  CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
+  CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish);
  CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
  CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
 finish:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T));
-  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
+  if (*ptr == nullptr)
+    WARN("Failed to CUDA calloc %ld bytes", nelem * sizeof(T));
+  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
  return result;
 }
 #define mscclppCudaCalloc(...) mscclppCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)

 template <typename T>
-mscclppResult_t mscclppCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) {
+mscclppResult_t mscclppCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char* filefunc, int line)
+{
  mscclppResult_t result = mscclppSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  *ptr = nullptr;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
-  CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
+  CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
+  CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish);
 finish:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T));
-  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
+  if (*ptr == nullptr)
+    WARN("Failed to CUDA calloc async %ld bytes", nelem * sizeof(T));
+  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
  return result;
 }
 #define mscclppCudaCallocAsync(...) mscclppCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__)

-template <typename T>
-mscclppResult_t mscclppCudaMemcpy(T* dst, T* src, size_t nelem) {
+template <typename T> mscclppResult_t mscclppCudaMemcpy(T* dst, T* src, size_t nelem)
+{
  mscclppResult_t result = mscclppSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
@@ -140,19 +149,19 @@ finish:
  return result;
 }

-template <typename T>
-mscclppResult_t mscclppCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) {
+template <typename T> mscclppResult_t mscclppCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream)
+{
  mscclppResult_t result = mscclppSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish);
+  CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem * sizeof(T), cudaMemcpyDefault, stream), result, finish);
 finish:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  return result;
 }

-template <typename T>
-mscclppResult_t mscclppCudaFree(T* ptr) {
+template <typename T> mscclppResult_t mscclppCudaFree(T* ptr)
+{
  mscclppResult_t result = mscclppSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
@@ -165,12 +174,14 @@ finish:
 // Allocate memory to be potentially ibv_reg_mr'd. This needs to be
 // allocated on separate pages as those pages will be marked DONTFORK
 // and if they are shared, that could cause a crash in a child process
-inline mscclppResult_t mscclppIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
+inline mscclppResult_t mscclppIbMallocDebug(void** ptr, size_t size, const char* filefunc, int line)
+{
  size_t page_size = sysconf(_SC_PAGESIZE);
  void* p;
  int size_aligned = ROUNDUP(size, page_size);
  int ret = posix_memalign(&p, page_size, size_aligned);
-  if (ret != 0) return mscclppSystemError;
+  if (ret != 0)
+    return mscclppSystemError;
  memset(p, 0, size);
  *ptr = p;
  INFO(MSCCLPP_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr);
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -12,21 +12,24 @@

 #include "comm.h"

-struct mscclppBootstrapHandle {
+struct mscclppBootstrapHandle
+{
  uint64_t magic;
  union mscclppSocketAddress addr;
 };
-static_assert(sizeof(struct mscclppBootstrapHandle) <= sizeof(mscclppUniqueId), "Bootstrap handle is too large to fit inside MSCCLPP unique ID");
+static_assert(sizeof(struct mscclppBootstrapHandle) <= sizeof(mscclppUniqueId),
+              "Bootstrap handle is too large to fit inside MSCCLPP unique ID");

 mscclppResult_t bootstrapNetInit(const char* ip_port_pair = NULL);
 mscclppResult_t bootstrapCreateRoot(struct mscclppBootstrapHandle* handle);
-mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot = true, const char* ip_port_pair = NULL);
+mscclppResult_t bootstrapGetUniqueId(struct mscclppBootstrapHandle* handle, bool isRoot = true,
+                                     const char* ip_port_pair = NULL);
 mscclppResult_t bootstrapInit(struct mscclppBootstrapHandle* handle, struct mscclppComm* comm);
 mscclppResult_t bootstrapAllGather(void* commState, void* allData, int size);
 mscclppResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
 mscclppResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
-mscclppResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
-mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
+mscclppResult_t bootstrapBarrier(void* commState, int* ranks, int rank, int nranks, int tag);
+mscclppResult_t bootstrapIntraNodeAllGather(void* commState, int* ranks, int rank, int nranks, void* allData, int size);
 mscclppResult_t bootstrapClose(void* commState);
 mscclppResult_t bootstrapAbort(void* commState);
 #endif
--- a/src/include/checks.h
+++ b/src/include/checks.h
@@ -11,151 +11,174 @@
 #include <cuda_runtime.h>

 // Check CUDA RT calls
-#define CUDACHECK(cmd) do {                                 \
-    cudaError_t err = cmd;                                  \
-    if( err != cudaSuccess ) {                              \
-        WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
-        return mscclppUnhandledCudaError;                      \
-    }                                                       \
-} while(false)
+#define CUDACHECK(cmd)                                                                                                 \
+  do {                                                                                                                 \
+    cudaError_t err = cmd;                                                                                             \
+    if (err != cudaSuccess) {                                                                                          \
+      WARN("Cuda failure '%s'", cudaGetErrorString(err));                                                              \
+      return mscclppUnhandledCudaError;                                                                                \
+    }                                                                                                                  \
+  } while (false)

-#define CUDACHECKGOTO(cmd, res, label) do {                 \
-    cudaError_t err = cmd;                                  \
-    if( err != cudaSuccess ) {                              \
-        WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
-        res = mscclppUnhandledCudaError;                       \
-        goto label;                                         \
-    }                                                       \
-} while(false)
+#define CUDACHECKGOTO(cmd, res, label)                                                                                 \
+  do {                                                                                                                 \
+    cudaError_t err = cmd;                                                                                             \
+    if (err != cudaSuccess) {                                                                                          \
+      WARN("Cuda failure '%s'", cudaGetErrorString(err));                                                              \
+      res = mscclppUnhandledCudaError;                                                                                 \
+      goto label;                                                                                                      \
+    }                                                                                                                  \
+  } while (false)

 // Report failure but clear error and continue
-#define CUDACHECKIGNORE(cmd) do {  \
-    cudaError_t err = cmd;         \
-    if( err != cudaSuccess ) {     \
-        INFO(MSCCLPP_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \
-        (void) cudaGetLastError(); \
-    }                              \
-} while(false)
+#define CUDACHECKIGNORE(cmd)                                                                                           \
+  do {                                                                                                                 \
+    cudaError_t err = cmd;                                                                                             \
+    if (err != cudaSuccess) {                                                                                          \
+      INFO(MSCCLPP_ALL, "%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err));                       \
+      (void)cudaGetLastError();                                                                                        \
+    }                                                                                                                  \
+  } while (false)

 #include <errno.h>
 // Check system calls
-#define SYSCHECK(call, name) do { \
-  int retval; \
-  SYSCHECKVAL(call, name, retval); \
-} while (false)
+#define SYSCHECK(call, name)                                                                                           \
+  do {                                                                                                                 \
+    int retval;                                                                                                        \
+    SYSCHECKVAL(call, name, retval);                                                                                   \
+  } while (false)

-#define SYSCHECKVAL(call, name, retval) do { \
-  SYSCHECKSYNC(call, name, retval); \
-  if (retval == -1) { \
-    WARN("Call to " name " failed : %s", strerror(errno)); \
-    return mscclppSystemError; \
-  } \
-} while (false)
+#define SYSCHECKVAL(call, name, retval)                                                                                \
+  do {                                                                                                                 \
+    SYSCHECKSYNC(call, name, retval);                                                                                  \
+    if (retval == -1) {                                                                                                \
+      WARN("Call to " name " failed : %s", strerror(errno));                                                           \
+      return mscclppSystemError;                                                                                       \
+    }                                                                                                                  \
+  } while (false)

-#define SYSCHECKSYNC(call, name, retval) do { \
-  retval = call; \
-  if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
-    INFO(MSCCLPP_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
-  } else { \
-    break; \
-  } \
-} while(true)
+#define SYSCHECKSYNC(call, name, retval)                                                                               \
+  do {                                                                                                                 \
+    retval = call;                                                                                                     \
+    if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) {                                 \
+      INFO(MSCCLPP_ALL, "Call to " name " returned %s, retrying", strerror(errno));                                    \
+    } else {                                                                                                           \
+      break;                                                                                                           \
+    }                                                                                                                  \
+  } while (true)

-#define SYSCHECKGOTO(statement, res, label) do { \
-  if ((statement) == -1) {    \
-    /* Print the back trace*/ \
-    res = mscclppSystemError;    \
-    INFO(MSCCLPP_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    goto label; \
-  } \
-} while (0);
+#define SYSCHECKGOTO(statement, res, label)                                                                            \
+  do {                                                                                                                 \
+    if ((statement) == -1) {                                                                                           \
+      /* Print the back trace*/                                                                                        \
+      res = mscclppSystemError;                                                                                        \
+      INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res);                                                       \
+      goto label;                                                                                                      \
+    }                                                                                                                  \
+  } while (0);

-#define NEQCHECK(statement, value) do {   \
-  if ((statement) != value) {             \
-    /* Print the back trace*/             \
-    INFO(MSCCLPP_ALL,"%s:%d -> %d", __FILE__, __LINE__, mscclppSystemError);    \
-    return mscclppSystemError;     \
-  }                             \
-} while (0);
+#define NEQCHECK(statement, value)                                                                                     \
+  do {                                                                                                                 \
+    if ((statement) != value) {                                                                                        \
+      /* Print the back trace*/                                                                                        \
+      INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, mscclppSystemError);                                        \
+      return mscclppSystemError;                                                                                       \
+    }                                                                                                                  \
+  } while (0);

-#define NEQCHECKGOTO(statement, value, res, label) do { \
-  if ((statement) != value) { \
-    /* Print the back trace*/ \
-    res = mscclppSystemError;    \
-    INFO(MSCCLPP_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    goto label; \
-  } \
-} while (0);
+#define NEQCHECKGOTO(statement, value, res, label)                                                                     \
+  do {                                                                                                                 \
+    if ((statement) != value) {                                                                                        \
+      /* Print the back trace*/                                                                                        \
+      res = mscclppSystemError;                                                                                        \
+      INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res);                                                       \
+      goto label;                                                                                                      \
+    }                                                                                                                  \
+  } while (0);

-#define EQCHECK(statement, value) do {    \
-  if ((statement) == value) {             \
-    /* Print the back trace*/             \
-    INFO(MSCCLPP_ALL,"%s:%d -> %d", __FILE__, __LINE__, mscclppSystemError);    \
-    return mscclppSystemError;     \
-  }                             \
-} while (0);
+#define EQCHECK(statement, value)                                                                                      \
+  do {                                                                                                                 \
+    if ((statement) == value) {                                                                                        \
+      /* Print the back trace*/                                                                                        \
+      INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, mscclppSystemError);                                        \
+      return mscclppSystemError;                                                                                       \
+    }                                                                                                                  \
+  } while (0);

-#define EQCHECKGOTO(statement, value, res, label) do { \
-  if ((statement) == value) { \
-    /* Print the back trace*/ \
-    res = mscclppSystemError;    \
-    INFO(MSCCLPP_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    goto label; \
-  } \
-} while (0);
+#define EQCHECKGOTO(statement, value, res, label)                                                                      \
+  do {                                                                                                                 \
+    if ((statement) == value) {                                                                                        \
+      /* Print the back trace*/                                                                                        \
+      res = mscclppSystemError;                                                                                        \
+      INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res);                                                       \
+      goto label;                                                                                                      \
+    }                                                                                                                  \
+  } while (0);

 // Propagate errors up
-#define MSCCLPPCHECK(call) do { \
-  mscclppResult_t res = call; \
-  if (res != mscclppSuccess && res != mscclppInProgress) { \
-    /* Print the back trace*/ \
-    if (mscclppDebugNoWarn == 0) INFO(MSCCLPP_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    return res; \
-  } \
-} while (0);
+#define MSCCLPPCHECK(call)                                                                                             \
+  do {                                                                                                                 \
+    mscclppResult_t res = call;                                                                                        \
+    if (res != mscclppSuccess && res != mscclppInProgress) {                                                           \
+      /* Print the back trace*/                                                                                        \
+      if (mscclppDebugNoWarn == 0)                                                                                     \
+        INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res);                                                     \
+      return res;                                                                                                      \
+    }                                                                                                                  \
+  } while (0);

-#define MSCCLPPCHECKGOTO(call, res, label) do { \
-  res = call; \
-  if (res != mscclppSuccess && res != mscclppInProgress) { \
-    /* Print the back trace*/ \
-    if (mscclppDebugNoWarn == 0) INFO(MSCCLPP_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    goto label; \
-  } \
-} while (0);
+#define MSCCLPPCHECKGOTO(call, res, label)                                                                             \
+  do {                                                                                                                 \
+    res = call;                                                                                                        \
+    if (res != mscclppSuccess && res != mscclppInProgress) {                                                           \
+      /* Print the back trace*/                                                                                        \
+      if (mscclppDebugNoWarn == 0)                                                                                     \
+        INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res);                                                     \
+      goto label;                                                                                                      \
+    }                                                                                                                  \
+  } while (0);

-#define MSCCLPPWAIT(call, cond, abortFlagPtr) do {         \
-  volatile uint32_t* tmpAbortFlag = (abortFlagPtr);     \
-  mscclppResult_t res = call;                \
-  if (res != mscclppSuccess && res != mscclppInProgress) {               \
-    if (mscclppDebugNoWarn == 0) INFO(MSCCLPP_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    return mscclppInternalError;             \
-  }                                       \
-  if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
-} while (!(cond));
+#define MSCCLPPWAIT(call, cond, abortFlagPtr)                                                                          \
+  do {                                                                                                                 \
+    volatile uint32_t* tmpAbortFlag = (abortFlagPtr);                                                                  \
+    mscclppResult_t res = call;                                                                                        \
+    if (res != mscclppSuccess && res != mscclppInProgress) {                                                           \
+      if (mscclppDebugNoWarn == 0)                                                                                     \
+        INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res);                                                     \
+      return mscclppInternalError;                                                                                     \
+    }                                                                                                                  \
+    if (tmpAbortFlag)                                                                                                  \
+      NEQCHECK(*tmpAbortFlag, 0);                                                                                      \
+  } while (!(cond));

-#define MSCCLPPWAITGOTO(call, cond, abortFlagPtr, res, label) do { \
-  volatile uint32_t* tmpAbortFlag = (abortFlagPtr);             \
-  res = call;                             \
-  if (res != mscclppSuccess && res != mscclppInProgress) {               \
-    if (mscclppDebugNoWarn == 0) INFO(MSCCLPP_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    goto label;                           \
-  }                                       \
-  if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \
-} while (!(cond));
+#define MSCCLPPWAITGOTO(call, cond, abortFlagPtr, res, label)                                                          \
+  do {                                                                                                                 \
+    volatile uint32_t* tmpAbortFlag = (abortFlagPtr);                                                                  \
+    res = call;                                                                                                        \
+    if (res != mscclppSuccess && res != mscclppInProgress) {                                                           \
+      if (mscclppDebugNoWarn == 0)                                                                                     \
+        INFO(MSCCLPP_ALL, "%s:%d -> %d", __FILE__, __LINE__, res);                                                     \
+      goto label;                                                                                                      \
+    }                                                                                                                  \
+    if (tmpAbortFlag)                                                                                                  \
+      NEQCHECKGOTO(*tmpAbortFlag, 0, res, label);                                                                      \
+  } while (!(cond));

-#define MSCCLPPCHECKTHREAD(a, args) do { \
-  if (((args)->ret = (a)) != mscclppSuccess && (args)->ret != mscclppInProgress) { \
-    INFO(MSCCLPP_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \
-    return args; \
-  } \
-} while(0)
+#define MSCCLPPCHECKTHREAD(a, args)                                                                                    \
+  do {                                                                                                                 \
+    if (((args)->ret = (a)) != mscclppSuccess && (args)->ret != mscclppInProgress) {                                   \
+      INFO(MSCCLPP_INIT, "%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret);                               \
+      return args;                                                                                                     \
+    }                                                                                                                  \
+  } while (0)

-#define CUDACHECKTHREAD(a) do { \
-  if ((a) != cudaSuccess) { \
-    INFO(MSCCLPP_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
-    args->ret = mscclppUnhandledCudaError; \
-    return args; \
-  } \
-} while(0)
+#define CUDACHECKTHREAD(a)                                                                                             \
+  do {                                                                                                                 \
+    if ((a) != cudaSuccess) {                                                                                          \
+      args->ret = mscclppUnhandledCudaError; /* set first so the log below reports this code */                        \
+      INFO(MSCCLPP_INIT, "%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret);                                 \
+      return args;                                                                                                     \
+    }                                                                                                                  \
+  } while (0)

 #endif
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -7,8 +7,8 @@
 #ifndef MSCCLPP_COMM_H_
 #define MSCCLPP_COMM_H_

-#include "proxy.h"
 #include "ib.h"
+#include "proxy.h"

 // #define CACHE_LINE_SIZE 128
 // #define MEM_ALIGN 4096
@@ -21,43 +21,46 @@

 #define MAXCONNECTIONS 1024

-struct mscclppConn {
+struct mscclppConn
+{
  mscclppTransport_t transport;
  int remoteRank;
  uint64_t buffSize;
-  uint64_t *remoteProxyFlag;
-  uint64_t *cpuProxyFlag;
-  void *cpuProxyFlagGdrDesc;
-  struct mscclppDevConn *devConn;
-  struct mscclppIbContext *ibCtx;
-  struct mscclppIbQp *ibQp;
-  struct mscclppIbMr *ibBuffMr;
-  struct mscclppIbMr *ibLocalFlagMr;
-  struct mscclppIbMr *ibProxyFlagMr;
+  uint64_t* remoteProxyFlag;
+  uint64_t* cpuProxyFlag;
+  void* cpuProxyFlagGdrDesc;
+  struct mscclppDevConn* devConn;
+  struct mscclppIbContext* ibCtx;
+  struct mscclppIbQp* ibQp;
+  struct mscclppIbMr* ibBuffMr;
+  struct mscclppIbMr* ibLocalFlagMr;
+  struct mscclppIbMr* ibProxyFlagMr;
  struct mscclppIbMrInfo ibBuffMrInfo;
  struct mscclppIbMrInfo ibLocalFlagMrInfo;
  struct mscclppIbMrInfo ibProxyFlagMrInfo;
 };

-struct mscclppComm {
+struct mscclppComm
+{
  struct mscclppConn conns[MAXCONNECTIONS];
  struct mscclppDevConn devConns[MAXCONNECTIONS];
  int nConns;

  void* bootstrap;

-  uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
+  // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
+  uint64_t magic;

  int rank;    // my rank in the communicator
  int nRanks;  // number of GPUs in communicator
  int cudaDev; // my cuda device index

  // Flag to ask MSCCLPP kernels to abort
-  volatile uint32_t *abortFlag;
+  volatile uint32_t* abortFlag;

-  struct mscclppIbContext *ibContext[MSCCLPP_IB_MAX_DEVS];
+  struct mscclppIbContext* ibContext[MSCCLPP_IB_MAX_DEVS];
  cudaStream_t stream; // DMA engine stream for P2P
-  struct mscclppProxyState *proxyState[MSCCLPP_PROXY_MAX_NUM];
+  struct mscclppProxyState* proxyState[MSCCLPP_PROXY_MAX_NUM];
 };

 #endif
--- a/src/include/core.h
+++ b/src/include/core.h
@@ -7,32 +7,24 @@
 #ifndef MSCCLPP_CORE_H_
 #define MSCCLPP_CORE_H_

-#include <pthread.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <algorithm> // For std::min/std::max
-#include <stdio.h>
-#include <string.h>
-#include "mscclpp.h"
-#include "debug.h"
 #include "alloc.h"
+#include "debug.h"
+#include "mscclpp.h"
 #include "param.h"
+#include <algorithm> // For std::min/std::max
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>

 #ifdef PROFAPI
-#define MSCCLPP_API(ret, func, args...)        \
-    __attribute__ ((visibility("default"))) \
-    __attribute__ ((alias(#func)))          \
-    ret p##func (args);                     \
-    extern "C"                              \
-    __attribute__ ((visibility("default"))) \
-    __attribute__ ((weak))                  \
-    ret func(args)
+#define MSCCLPP_API(ret, func, args...)                                                                                \
+  __attribute__((visibility("default"))) __attribute__((alias(#func))) ret p##func(args);                              \
+  extern "C" __attribute__((visibility("default"))) __attribute__((weak)) ret func(args)
 #else
-#define MSCCLPP_API(ret, func, args...)        \
-    extern "C"                              \
-    __attribute__ ((visibility("default"))) \
-    ret func(args)
+#define MSCCLPP_API(ret, func, args...) extern "C" __attribute__((visibility("default"))) ret func(args)
 #endif // end PROFAPI

 #endif // end include guard
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -8,27 +8,49 @@
 #define MSCCLPP_DEBUG_H_

 #include "mscclpp.h"
-#include <stdio.h>
 #include <chrono>
+#include <stdio.h>
 #include <type_traits>

 #include <limits.h>
-#include <string.h>
 #include <pthread.h>
+#include <string.h>

 // Conform to pthread and NVTX standard
 #define MSCCLPP_THREAD_NAMELEN 16

-typedef enum {MSCCLPP_LOG_NONE=0, MSCCLPP_LOG_VERSION=1, MSCCLPP_LOG_WARN=2, MSCCLPP_LOG_INFO=3, MSCCLPP_LOG_ABORT=4, MSCCLPP_LOG_TRACE=5} mscclppDebugLogLevel;
-typedef enum {MSCCLPP_INIT=1, MSCCLPP_COLL=2, MSCCLPP_P2P=4, MSCCLPP_SHM=8, MSCCLPP_NET=16, MSCCLPP_GRAPH=32, MSCCLPP_TUNING=64, MSCCLPP_ENV=128, MSCCLPP_ALLOC=256, MSCCLPP_CALL=512, MSCCLPP_ALL=~0} mscclppDebugLogSubSys;
+typedef enum
+{
+  MSCCLPP_LOG_NONE = 0,
+  MSCCLPP_LOG_VERSION = 1,
+  MSCCLPP_LOG_WARN = 2,
+  MSCCLPP_LOG_INFO = 3,
+  MSCCLPP_LOG_ABORT = 4,
+  MSCCLPP_LOG_TRACE = 5
+} mscclppDebugLogLevel;
+typedef enum
+{
+  MSCCLPP_INIT = 1,
+  MSCCLPP_COLL = 2,
+  MSCCLPP_P2P = 4,
+  MSCCLPP_SHM = 8,
+  MSCCLPP_NET = 16,
+  MSCCLPP_GRAPH = 32,
+  MSCCLPP_TUNING = 64,
+  MSCCLPP_ENV = 128,
+  MSCCLPP_ALLOC = 256,
+  MSCCLPP_CALL = 512,
+  MSCCLPP_ALL = ~0
+} mscclppDebugLogSubSys;

 extern int mscclppDebugLevel;
 extern uint64_t mscclppDebugMask;
 extern pthread_mutex_t mscclppDebugLock;
-extern FILE *mscclppDebugFile;
+extern FILE* mscclppDebugFile;
 extern mscclppResult_t getHostName(char* hostname, int maxlen, const char delim);

-void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
+void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char* filefunc, int line, const char* fmt,
+                     ...) __attribute__((format(printf, 5, 6)));

 // Let code temporarily downgrade WARN into INFO
 extern thread_local int mscclppDebugNoWarn;
@@ -45,6 +67,6 @@ extern std::chrono::steady_clock::time_point mscclppEpoch;
 #define TRACE(...)
 #endif

-void mscclppSetThreadName(pthread_t thread, const char *fmt, ...);
+void mscclppSetThreadName(pthread_t thread, const char* fmt, ...);

 #endif
--- a/src/include/gdr.h
+++ b/src/include/gdr.h
@@ -1,53 +1,58 @@
 #ifndef MSCCLPP_GDR_H_
 #define MSCCLPP_GDR_H_

-#include "gdrapi.h"
-#include "debug.h"
-#include "checks.h"
 #include "align.h"
 #include "alloc.h"
+#include "checks.h"
+#include "debug.h"
+#include "gdrapi.h"

 // These can be used if the GDR library isn't thread safe
 #include <pthread.h>
 extern pthread_mutex_t gdrLock;
 #define GDRLOCK() pthread_mutex_lock(&gdrLock)
 #define GDRUNLOCK() pthread_mutex_unlock(&gdrLock)
-#define GDRLOCKCALL(cmd, ret) do {                      \
-    GDRLOCK();                                          \
-    ret = cmd;                                          \
-    GDRUNLOCK();                                        \
-} while(false)
+#define GDRLOCKCALL(cmd, ret)                                                                                          \
+  do {                                                                                                                 \
+    GDRLOCK();                                                                                                         \
+    ret = cmd;                                                                                                         \
+    GDRUNLOCK();                                                                                                       \
+  } while (false)

-#define GDRCHECK(cmd) do {                              \
-    int e;                                              \
-    /* GDRLOCKCALL(cmd, e); */                          \
-    e = cmd;                                            \
-    if( e != 0 ) {                                      \
-      WARN("GDRCOPY failure %d", e);                    \
-      return mscclppSystemError;                        \
-    }                                                   \
-} while(false)
+#define GDRCHECK(cmd)                                                                                                  \
+  do {                                                                                                                 \
+    int e;                                                                                                             \
+    /* GDRLOCKCALL(cmd, e); */                                                                                         \
+    e = cmd;                                                                                                           \
+    if (e != 0) {                                                                                                      \
+      WARN("GDRCOPY failure %d", e);                                                                                   \
+      return mscclppSystemError;                                                                                       \
+    }                                                                                                                  \
+  } while (false)

 gdr_t wrap_gdr_open(void);
 mscclppResult_t wrap_gdr_close(gdr_t g);
-mscclppResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
+mscclppResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space,
+                                    gdr_mh_t* handle);
 mscclppResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
-mscclppResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
-mscclppResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
-mscclppResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
+mscclppResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t* info);
+mscclppResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void** va, size_t size);
+mscclppResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void* va, size_t size);

 // Global GDR driver handle
 extern gdr_t mscclppGdrCopy;

-typedef struct gdr_mem_desc {
-  void *gdrDevMem;
-  void *gdrMap;
+typedef struct gdr_mem_desc
+{
+  void* gdrDevMem;
+  void* gdrMap;
  size_t gdrOffset;
  size_t gdrMapSize;
  gdr_mh_t gdrMh;
 } gdr_mem_desc_t;

-static gdr_t mscclppGdrInit() {
+static gdr_t mscclppGdrInit()
+{
  // int libMajor, libMinor, drvMajor, drvMinor;
  gdr_t handle = wrap_gdr_open();

@@ -68,13 +73,15 @@ static gdr_t mscclppGdrInit() {
  //     INFO(MSCCLPP_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor);
  // }
  return handle;
-// error:
-//   if (handle != NULL) (void) wrap_gdr_close(handle);
-//   return NULL;
+  // error:
+  //   if (handle != NULL) (void) wrap_gdr_close(handle);
+  //   return NULL;
 }

 template <typename T>
-mscclppResult_t mscclppGdrCudaCallocDebug(T** ptr, T** devPtr, size_t nelem, void** gdrDesc, const char *filefunc, int line) {
+mscclppResult_t mscclppGdrCudaCallocDebug(T** ptr, T** devPtr, size_t nelem, void** gdrDesc, const char* filefunc,
+                                          int line)
+{
  mscclppResult_t result = mscclppSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  *ptr = nullptr;
@@ -85,20 +92,20 @@ mscclppResult_t mscclppGdrCudaCallocDebug(T** ptr, T** devPtr, size_t nelem, voi
  gdr_info_t info;
  size_t mapSize;
  gdr_mh_t mh;
-  char *devMem;
-  void *gdrMap;
+  char* devMem;
+  void* gdrMap;
  ssize_t off;
  gdr_mem_desc_t* md;
  uint64_t alignedAddr;
  size_t align;

-  mapSize = sizeof(T)*nelem;
+  mapSize = sizeof(T) * nelem;

  // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
  ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
  // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
-  MSCCLPPCHECKGOTO(mscclppCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1), result, finish);
-  alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
+  MSCCLPPCHECKGOTO(mscclppCudaCalloc(&devMem, mapSize + GPU_PAGE_SIZE - 1), result, finish);
+  alignedAddr = (((uint64_t)devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
  align = alignedAddr - (uint64_t)devMem;
  MSCCLPPCHECKGOTO(wrap_gdr_pin_buffer(mscclppGdrCopy, alignedAddr, mapSize, 0, 0, &mh), result, finish);

@@ -113,29 +120,31 @@ mscclppResult_t mscclppGdrCudaCallocDebug(T** ptr, T** devPtr, size_t nelem, voi
  md->gdrDevMem = devMem;
  md->gdrMap = gdrMap;
  md->gdrMapSize = mapSize;
-  md->gdrOffset = off+align;
+  md->gdrOffset = off + align;
  md->gdrMh = mh;
  *gdrDesc = md;

-  *ptr = (T *)((char *)gdrMap+off);
-  if (devPtr) *devPtr = (T *)(devMem+off+align);
+  *ptr = (T*)((char*)gdrMap + off);
+  if (devPtr)
+    *devPtr = (T*)(devMem + off + align);

-  TRACE(mscclpp_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
-       md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
+  TRACE(mscclpp_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p", md->gdrDevMem,
+        md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);

  return mscclppSuccess;

 finish:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T));
-  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
+  if (*ptr == nullptr)
+    WARN("Failed to CUDA calloc %ld bytes", nelem * sizeof(T));
+  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
  return result;
 }
 #define mscclppGdrCudaCalloc(...) mscclppGdrCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)

-
-static mscclppResult_t mscclppGdrCudaFree(void* gdrDesc) {
-  gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrDesc;
+static mscclppResult_t mscclppGdrCudaFree(void* gdrDesc)
+{
+  gdr_mem_desc_t* md = (gdr_mem_desc_t*)gdrDesc;
  MSCCLPPCHECK(wrap_gdr_unmap(mscclppGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
  MSCCLPPCHECK(wrap_gdr_unpin_buffer(mscclppGdrCopy, md->gdrMh));
  CUDACHECK(cudaFree(md->gdrDevMem));
@@ -144,5 +153,4 @@ static mscclppResult_t mscclppGdrCudaFree(void* gdrDesc) {
  return mscclppSuccess;
 }

-
 #endif
--- a/src/include/ib.h
+++ b/src/include/ib.h
@@ -2,10 +2,10 @@
 #define MSCCLPP_IB_H_

 #include "mscclpp.h"
+#include <infiniband/verbs.h>
 #include <list>
 #include <memory>
 #include <string>
-#include <infiniband/verbs.h>

 #define MSCCLPP_IB_CQ_SIZE 1024
 #define MSCCLPP_IB_CQ_POLL_NUM 4
@@ -13,20 +13,23 @@
 #define MSCCLPP_IB_MAX_DEVS 8

 // MR info to be shared with the remote peer
-struct mscclppIbMrInfo {
+struct mscclppIbMrInfo
+{
  uint64_t addr;
  uint32_t rkey;
 };

 // IB memory region
-struct mscclppIbMr {
-  struct ibv_mr *mr;
-  void *buff;
+struct mscclppIbMr
+{
+  struct ibv_mr* mr;
+  void* buff;
  struct mscclppIbMrInfo info;
 };

 // QP info to be shared with the remote peer
-struct mscclppIbQpInfo {
+struct mscclppIbQpInfo
+{
  uint16_t lid;
  uint8_t port;
  uint8_t linkLayer;
@@ -36,44 +39,47 @@ struct mscclppIbQpInfo {
 };

 // IB queue pair
-struct mscclppIbQp {
-  struct ibv_qp *qp;
+struct mscclppIbQp
+{
+  struct ibv_qp* qp;
  struct mscclppIbQpInfo info;
-  struct ibv_send_wr *wrs;
-  struct ibv_sge *sges;
-  struct ibv_cq *cq;
-  struct ibv_wc *wcs;
+  struct ibv_send_wr* wrs;
+  struct ibv_sge* sges;
+  struct ibv_cq* cq;
+  struct ibv_wc* wcs;
  int wrn;

-  int rtr(const mscclppIbQpInfo *info);
+  int rtr(const mscclppIbQpInfo* info);
  int rts();
-  int stageSend(struct mscclppIbMr *ibMr, const mscclppIbMrInfo *info, uint32_t size,
-                uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled);
-  int stageSendWithImm(struct mscclppIbMr *ibMr, const mscclppIbMrInfo *info, uint32_t size,
-                       uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData);
+  int stageSend(struct mscclppIbMr* ibMr, const mscclppIbMrInfo* info, uint32_t size, uint64_t wrId, uint64_t srcOffset,
+                uint64_t dstOffset, bool signaled);
+  int stageSendWithImm(struct mscclppIbMr* ibMr, const mscclppIbMrInfo* info, uint32_t size, uint64_t wrId,
+                       uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData);
  int postSend();
  int postRecv(uint64_t wrId);
  int pollCq();
 };

 // Holds resources of a single IB device.
-struct mscclppIbContext {
+struct mscclppIbContext
+{
  int numaNode;
-  struct ibv_context *ctx;
-  struct ibv_pd *pd;
-  int *ports;
+  struct ibv_context* ctx;
+  struct ibv_pd* pd;
+  int* ports;
  int nPorts;
-  struct mscclppIbQp *qps;
+  struct mscclppIbQp* qps;
  int nQps;
  int maxQps;
-  struct mscclppIbMr *mrs;
+  struct mscclppIbMr* mrs;
  int nMrs;
  int maxMrs;
 };

-mscclppResult_t mscclppIbContextCreate(struct mscclppIbContext **ctx, const char *ibDevName);
-mscclppResult_t mscclppIbContextDestroy(struct mscclppIbContext *ctx);
-mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext *ctx, struct mscclppIbQp **ibQp, int port = -1);
-mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext *ctx, void *buff, size_t size, struct mscclppIbMr **ibMr);
+mscclppResult_t mscclppIbContextCreate(struct mscclppIbContext** ctx, const char* ibDevName);
+mscclppResult_t mscclppIbContextDestroy(struct mscclppIbContext* ctx);
+mscclppResult_t mscclppIbContextCreateQp(struct mscclppIbContext* ctx, struct mscclppIbQp** ibQp, int port = -1);
+mscclppResult_t mscclppIbContextRegisterMr(struct mscclppIbContext* ctx, void* buff, size_t size,
+                                           struct mscclppIbMr** ibMr);

 #endif
--- a/src/include/mscclpp.h
+++ b/src/include/mscclpp.h
@@ -17,47 +17,47 @@ extern "C" {
 /***************************************************************************************************************
 * A mscclppDevConn provides a zero-copy connection between two GPUs connected via P2P NVLink or InfiniBand.
 * The communication API is one-sided meaning that for every single data transfer, only one side
- * needs to execute unlike a two-sided communication stack such as NCCL where both sides 
+ * needs to execute unlike a two-sided communication stack such as NCCL where both sides
 * need to execute a send and a receive instruction, respectively, for every transfer.
- * 
- * A connection is uniquely identified by the (remoteRank, tag) pair at an endpoint.  
- * The two endpoints register buffers of the same size with the connection. 
- * 
+ *
+ * A connection is uniquely identified by the (remoteRank, tag) pair at an endpoint.
+ * The two endpoints register buffers of the same size with the connection.
+ *
 * The endpoints provide the remoteRank, tag, and the buffer when registering a connection with msccppConnect().
- * 
- * mscllppConnectionSetup() sets up all the registered connections. 
- * 
+ *
+ * mscclppConnectionSetup() sets up all the registered connections.
+ *
 ***************************************************************************************************************
- * A proxy thread running on the CPU is necessary to perform transfers using InfiniBand or the DMA engine. 
+ * A proxy thread running on the CPU is necessary to perform transfers using InfiniBand or the DMA engine.
 * The current implementation uses a single proxy thread per context - one IB connection or DMA engine per node.
- * Thus multiple threadblocks using different connections might use the same CPU proxy thread. 
- *  
+ * Thus multiple threadblocks using different connections might use the same CPU proxy thread.
+ *
 * Before using any of functionality of connections, mscclppProxyLaunch needs to be called to spawn the
 * proxy threads. There are currently two types of connections:
- * 
+ *
 * P2P via NVLink: the DMA engine can perform the copy between the buffers. DMA engine has higher latency
 * but has a higher bandwidth and costs no compute cycles on the GPU.
- * 
+ *
 * InfiniBand: the RDMA engine copies the data over MLX devices.
- * 
+ *
 ***************************************************************************************************************
 * At the runtime, a GPU kernel has access to a mscclppDevConn object that provides the following functions:
- * 
+ *
 * put(): the sender initiates a data transfer to the receiver.
- * 
+ *
 * signal(): the sender signals the receiver that data is ready to be consumed.
- * 
+ *
 * wait(): the reciever waits on the signal() to start reading the data.
- * 
+ *
 * The sender should not reuse the buffer till the signal returns.
 * The receiver should only access the data after the wait returns.
- *   
+ *
 * putWithSignal(): the sender initiates a data transfer and signals the receiver that data is ready to be consumed.
 * This is an optimized version of a put followed by a signal.
- * 
- * These functions hide the complexity of syncrhonization between the two GPUs and the CPU proxy thread. 
+ *
+ * These functions hide the complexity of synchronization between the two GPUs and the CPU proxy thread.
 * Example:
- * 
+ *
 * // sender GPU
 * devConn.put(data1)
 * // not OK to write to data1
@@ -67,43 +67,54 @@ extern "C" {
 * // not OK to write to data1, data2, data3         // not OK to read data1, data2, data3
 * devConn.signal() -------------------------------> devConn.wait()
 * // OK to write to data1, data2, data3             // OK to read data1, data2, data3
- * 
- * 
+ *
+ *
 * The two endpoint can concurrently use the same connection provided they are writing (puts) on different
- * indices in the registered buffer. 
+ * indices in the registered buffer.
 **************************************************************************************************************/
-struct mscclppDevConn {
+struct mscclppDevConn
+{
 #ifdef __CUDACC__
-  __forceinline__ __device__ void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize){
+  __forceinline__ __device__ void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize)
+  {
    fifo.push(mscclppData, dstDataOffset, srcDataOffset, dataSize);
  }

-  __forceinline__ __device__ void put(uint64_t dataOffset, uint64_t dataSize){
+  __forceinline__ __device__ void put(uint64_t dataOffset, uint64_t dataSize)
+  {
    put(dataOffset, dataOffset, dataSize);
  }

-  __forceinline__ __device__ void signal(){
+  __forceinline__ __device__ void signal()
+  {
    epochIncrement();
    uint64_t curFifoHead = fifo.push(mscclppFlag | mscclppSync, 0, 0, 1);
-    while (*(volatile uint64_t *)fifo.triggerFifoTail <= curFifoHead);
+    while (*(volatile uint64_t*)fifo.triggerFifoTail <= curFifoHead)
+      ;
  }

-  __forceinline__ __device__ void putWithSignal(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize){
+  __forceinline__ __device__ void putWithSignal(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize)
+  {
    epochIncrement();
    uint64_t curFifoHead = fifo.push(mscclppData | mscclppFlag | mscclppSync, dstDataOffset, srcDataOffset, dataSize);
-    while (*(volatile uint64_t *)fifo.triggerFifoTail <= curFifoHead);
+    while (*(volatile uint64_t*)fifo.triggerFifoTail <= curFifoHead)
+      ;
  }

-  __forceinline__ __device__ void putWithSignal(uint64_t dataOffset, uint64_t dataSize){
+  __forceinline__ __device__ void putWithSignal(uint64_t dataOffset, uint64_t dataSize)
+  {
    putWithSignal(dataOffset, dataOffset, dataSize);
  }

-  __forceinline__ __device__ void wait(){
+  __forceinline__ __device__ void wait()
+  {
    (*recvEpochId) += 1;
-    while (*(volatile uint64_t*)proxyEpochId < (*recvEpochId));
+    while (*(volatile uint64_t*)proxyEpochId < (*recvEpochId))
+      ;
  }

-  __forceinline__ __device__ void epochIncrement(){
+  __forceinline__ __device__ void epochIncrement()
+  {
    *(volatile uint64_t*)sendEpochId += 1;
  }

@@ -127,18 +138,24 @@ typedef struct mscclppComm* mscclppComm_t;
 typedef struct mscclppDevConn mscclppDevConn_t;

 #define MSCCLPP_UNIQUE_ID_BYTES 128
-typedef struct { char internal[MSCCLPP_UNIQUE_ID_BYTES]; } mscclppUniqueId;
+typedef struct
+{
+  char internal[MSCCLPP_UNIQUE_ID_BYTES];
+} mscclppUniqueId;

 /* Error type */
-typedef enum { mscclppSuccess                 =  0,
-               mscclppUnhandledCudaError      =  1,
-               mscclppSystemError             =  2,
-               mscclppInternalError           =  3,
-               mscclppInvalidArgument         =  4,
-               mscclppInvalidUsage            =  5,
-               mscclppRemoteError             =  6,
-               mscclppInProgress              =  7,
-               mscclppNumResults              =  8 } mscclppResult_t;
+typedef enum
+{
+  mscclppSuccess = 0,
+  mscclppUnhandledCudaError = 1,
+  mscclppSystemError = 2,
+  mscclppInternalError = 3,
+  mscclppInvalidArgument = 4,
+  mscclppInvalidUsage = 5,
+  mscclppRemoteError = 6,
+  mscclppInProgress = 7,
+  mscclppNumResults = 8
+} mscclppResult_t;

 /* Create a unique ID for communication. Only needs to be called by one process.
 * Use with mscclppCommInitRankFromId().
@@ -150,16 +167,18 @@ typedef enum { mscclppSuccess                 =  0,
 mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* uniqueId);

 /* Transport Types */
-typedef enum { mscclppTransportP2P = 0,
-               mscclppTransportSHM = 1, // TODO(chhwang): not implemented yet
-               mscclppTransportIB = 2,
+typedef enum
+{
+  mscclppTransportP2P = 0,
+  mscclppTransportSHM = 1, // TODO(chhwang): not implemented yet
+  mscclppTransportIB = 2,
 } mscclppTransport_t;

 /* Initialize a communicator. nranks processes with rank 0 to nranks-1 need to call this function.
- * 
+ *
 * Outputs:
 *   comm: the communicator to be initialized
- * 
+ *
 * Inputs:
 *   nranks:     number of ranks in the communicator
 *   ipPortPair: a string of the form "ip:port" that represents the address of the root process
@@ -169,10 +188,10 @@ mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char*

 /* Initialize a communicator from a given mscclppUniqueId. Same as mscclppCommInitRank() except that
 * id is provided by the user by calling mscclppGetUniqueId()
- * 
+ *
 * Outputs:
 *   comm: the communicator to be initialized
- * 
+ *
 * Inputs:
 *   nranks: number of ranks in the communicator
 *   id:     the unique ID to be used for communication
@@ -181,10 +200,10 @@ mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char*
 mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank);

 /* Ring-based AllGather through the bootstrap socket.
- * 
+ *
 * Outputs:
 *   comm: the communicator
- * 
+ *
 * Inputs:
 *   data: data array to be gathered where `[r*size, (r+1)*size)` is the data for rank `r`
 *   size: data size per rank
@@ -192,26 +211,26 @@ mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, msccl
 mscclppResult_t mscclppBootstrapAllGather(mscclppComm_t comm, void* data, int size);

 /* Destroy a communicator.
- * 
+ *
 * Inputs:
 *   comm: the communicator to be destroyed
 */
 mscclppResult_t mscclppCommDestroy(mscclppComm_t comm);

 /* Return the string for the given error code.
- * 
+ *
 * Ouput:
 *   returns the string
- * 
+ *
 * Inputs:
 *   result: the error code that this function needs to translate
 */
-const char*  mscclppGetErrorString(mscclppResult_t result);
+const char* mscclppGetErrorString(mscclppResult_t result);

 /* Connect to a remote rank. This function only prepares metadata for connection. The actual connection
 * is made by a following call of mscclppConnectionSetup(). Note that this function is two-way and a connection
 * from rank i to remote rank j needs to have a counterpart from rank j to rank i.
- * 
+ *
 * Inputs:
 *   comm:          the communicator
 *   remoteRank:    the rank of the remote process
@@ -223,11 +242,11 @@ const char*  mscclppGetErrorString(mscclppResult_t result);
 *   ibDev:         the name of the IB device to be used. Expects a null for mscclppTransportP2P.
 */
 mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void* localBuff, uint64_t buffSize,
-                               mscclppTransport_t transportType, const char *ibDev=0);
+                               mscclppTransport_t transportType, const char* ibDev = 0);

 /* Establish all connections declared by mscclppConnect(). This function must be called after all mscclppConnect()
 * calls are made. This function ensures that all remote ranks are ready to communicate when it returns.
- * 
+ *
 * Inputs:
 *   comm: the communicator
 */
@@ -235,22 +254,22 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm);

 /* Return an array of mscclppDevConn_t and the number of connections created by mscclppConnectionSetup().
 * The order of connections matches the order of mscclppConnect() calls.
- * 
+ *
 * Outputs:
 *   devConns: the array of mscclppDevConn_t. Each mscclppDevConn_t corresponds to a mscclppConnect() call in the
 *             order of the calls.
 *   nConns:   the number of connections
- * 
+ *
 * Inputs:
 *   comm: the communicator
 */
 mscclppResult_t mscclppGetAllDeviceConnections(mscclppComm_t comm, mscclppDevConn_t** devConns, int* nConns);

 /* Return the mscclppDevConn_t corresponding to a given tag and a remoteRank.
- * 
+ *
 * Outputs:
 *   devConn: the mscclppDevConn_t corresponding to the given tag
- * 
+ *
 * Inputs:
 *   comm:       the communicator
 *   tag:        the tag of the connection
@@ -261,34 +280,34 @@ mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, i
 /* Launch proxy threads for all connections created by mscclppConnectionSetup(). This function is supposed to be called
 * before starting a kernel that uses mscclppDevConn_t. Up to two proxy threads are launched for each (GPU + IB) pair
 * (one for P2P NVLink and one for InfiniBand).
- * 
+ *
 * Inputs:
 *  comm: the communicator
 */
 mscclppResult_t mscclppProxyLaunch(mscclppComm_t comm);

 /* Stop all proxy threads.
- * 
+ *
 * Inputs:
 *  comm: the communicator
 */
 mscclppResult_t mscclppProxyStop(mscclppComm_t comm);

 /* Return the rank of the calling process.
- * 
+ *
 * Outputs:
 *   rank: the rank of the calling process
- * 
+ *
 * Inputs:
 *   comm: the communicator
 */
 mscclppResult_t mscclppCommRank(mscclppComm_t comm, int* rank);

 /* Return the number of ranks of the communicator.
- * 
+ *
 * Outputs:
 *   size: the number of ranks of the communicator
- * 
+ *
 * Inputs:
 *   comm: the communicator
 */
--- a/src/include/mscclppfifo.h
+++ b/src/include/mscclppfifo.h
@@ -7,9 +7,12 @@
 extern "C" {
 #endif

-typedef enum : uint64_t { mscclppData = 0x1,
-                          mscclppFlag = 0x2,
-                          mscclppSync = 0x4} mscclppTriggerType_t;
+typedef enum : uint64_t
+{
+  mscclppData = 0x1,
+  mscclppFlag = 0x2,
+  mscclppSync = 0x4
+} mscclppTriggerType_t;

 #define MSCCLPP_BITS_SIZE 32
 #define MSCCLPP_BITS_OFFSET 32
@@ -19,34 +22,38 @@ typedef enum : uint64_t { mscclppData = 0x1,
 // the summation of number of bits must be 128 or less
 union alignas(16) mscclppTrigger {
  uint64_t value[2];
-  struct {
+  struct
+  {
    // first 64 bits: value[0]
-    uint64_t dataSize      : MSCCLPP_BITS_SIZE;
+    uint64_t dataSize : MSCCLPP_BITS_SIZE;
    uint64_t srcDataOffset : MSCCLPP_BITS_OFFSET;
-    uint64_t               : (64-MSCCLPP_BITS_SIZE-MSCCLPP_BITS_OFFSET); // ensure 64-bit alignment
+    uint64_t : (64 - MSCCLPP_BITS_SIZE - MSCCLPP_BITS_OFFSET); // ensure 64-bit alignment
    // second 64 bits: value[1]
    uint64_t dstDataOffset : MSCCLPP_BITS_OFFSET;
-    uint64_t connId        : MSCCLPP_BITS_CONNID;
-    uint64_t type          : MSCCLPP_BITS_TYPE;
-    uint64_t               : (64-MSCCLPP_BITS_OFFSET-MSCCLPP_BITS_CONNID-MSCCLPP_BITS_TYPE); // ensure 64-bit alignment
+    uint64_t connId : MSCCLPP_BITS_CONNID;
+    uint64_t type : MSCCLPP_BITS_TYPE;
+    uint64_t : (64 - MSCCLPP_BITS_OFFSET - MSCCLPP_BITS_CONNID - MSCCLPP_BITS_TYPE); // ensure 64-bit alignment
  } fields;
 };

 typedef mscclppTrigger* mscclppTrigger_t;

-struct mscclppConcurrentFifo {
+struct mscclppConcurrentFifo
+{
 #ifdef __CUDACC__

-  __forceinline__ __device__ uint64_t push(uint64_t type, uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize){
-    uint64_t curFifoHead = atomicAdd((unsigned long long int*)this->triggerFifoHead,1);
-    while (curFifoHead >= MSCCLPP_PROXY_FIFO_SIZE + *((volatile uint64_t*)this->triggerFifoTail));
-    while (*(volatile uint64_t*)&this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0);
+  __forceinline__ __device__ uint64_t push(uint64_t type, uint64_t dstDataOffset, uint64_t srcDataOffset,
+                                           uint64_t dataSize)
+  {
+    uint64_t curFifoHead = atomicAdd((unsigned long long int*)this->triggerFifoHead, 1);
+    while (curFifoHead >= MSCCLPP_PROXY_FIFO_SIZE + *((volatile uint64_t*)this->triggerFifoTail))
+      ;
+    while (*(volatile uint64_t*)&this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE] != 0)
+      ;
    uint64_t* valptr = (uint64_t*)&(this->triggerFifo[curFifoHead % MSCCLPP_PROXY_FIFO_SIZE].value);
-    asm volatile(
-      "st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(valptr),
-      "l"((srcDataOffset << MSCCLPP_BITS_SIZE) + dataSize),
-      "l"((((type << MSCCLPP_BITS_CONNID) + this->connId) << MSCCLPP_BITS_OFFSET) + dstDataOffset)
-    );
+    asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(valptr),
+                 "l"((srcDataOffset << MSCCLPP_BITS_SIZE) + dataSize),
+                 "l"((((type << MSCCLPP_BITS_CONNID) + this->connId) << MSCCLPP_BITS_OFFSET) + dstDataOffset));
    return curFifoHead;
  }

--- a/src/include/npkit/npkit.h
+++ b/src/include/npkit/npkit.h
@@ -9,8 +9,9 @@
 #include "npkit/npkit_event.h"
 #include "npkit/npkit_struct.h"

-class NpKit {
- public:
+class NpKit
+{
+public:
  static const uint64_t kNumGpuEventBuffers = 512;

  static const uint64_t kNumCpuEventBuffers = 32;
@@ -24,7 +25,8 @@ class NpKit {
  static NpKitEventCollectContext* GetGpuEventCollectContexts();

  static inline __device__ void CollectGpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp,
-                                                NpKitEventCollectContext* ctx) {
+                                                NpKitEventCollectContext* ctx)
+  {
    uint64_t event_buffer_head = ctx->event_buffer_head;
    if (event_buffer_head < kMaxNumGpuEventsPerBuffer) {
      NpKitEvent& event = ctx->event_buffer[event_buffer_head];
@@ -40,7 +42,7 @@ class NpKit {

  static uint64_t* GetCpuTimestamp();

- private:
+private:
  static void CpuTimestampUpdateThread();

  // 64K * 512 * 16B = 512MB per GPU
--- a/src/include/npkit/npkit_event.h
+++ b/src/include/npkit/npkit_event.h
@@ -1,22 +1,22 @@
 #ifndef NPKIT_EVENT_H_
 #define NPKIT_EVENT_H_

-#define NPKIT_EVENT_INVALID         0x0
+#define NPKIT_EVENT_INVALID 0x0

-#define NPKIT_EVENT_TIME_SYNC_GPU   0x1
-#define NPKIT_EVENT_TIME_SYNC_CPU   0x2
+#define NPKIT_EVENT_TIME_SYNC_GPU 0x1
+#define NPKIT_EVENT_TIME_SYNC_CPU 0x2

 #define NPKIT_EVENT_SM_REDUCE_ENTRY 0x3
-#define NPKIT_EVENT_SM_REDUCE_EXIT  0x4
+#define NPKIT_EVENT_SM_REDUCE_EXIT 0x4

-#define NPKIT_EVENT_IB_SEND_ENTRY   0x5
-#define NPKIT_EVENT_IB_SEND_EXIT    0x6
-#define NPKIT_EVENT_IB_RECV_ENTRY   0x7
-#define NPKIT_EVENT_IB_RECV_EXIT    0x8
+#define NPKIT_EVENT_IB_SEND_ENTRY 0x5
+#define NPKIT_EVENT_IB_SEND_EXIT 0x6
+#define NPKIT_EVENT_IB_RECV_ENTRY 0x7
+#define NPKIT_EVENT_IB_RECV_EXIT 0x8

-#define NPKIT_EVENT_DMA_SEND_ENTRY  0x9
-#define NPKIT_EVENT_DMA_SEND_EXIT   0xA
-#define NPKIT_EVENT_DMA_RECV_ENTRY  0xB
-#define NPKIT_EVENT_DMA_RECV_EXIT   0xC
+#define NPKIT_EVENT_DMA_SEND_ENTRY 0x9
+#define NPKIT_EVENT_DMA_SEND_EXIT 0xA
+#define NPKIT_EVENT_DMA_RECV_ENTRY 0xB
+#define NPKIT_EVENT_DMA_RECV_EXIT 0xC

 #endif
--- a/src/include/npkit/npkit_struct.h
+++ b/src/include/npkit/npkit_struct.h
@@ -7,7 +7,8 @@

 union NpKitEvent {
  uint64_t bits[2];
-  struct {
+  struct
+  {
    uint64_t type : 8;
    uint64_t size : 32;
    uint64_t rsvd : 24;
@@ -15,7 +16,8 @@ union NpKitEvent {
  } fields;
 };

-struct NpKitEventCollectContext {
+struct NpKitEventCollectContext
+{
  NpKitEvent* event_buffer;
  uint64_t event_buffer_head;
 };
--- a/src/include/param.h
+++ b/src/include/param.h
@@ -15,15 +15,16 @@ void initEnv();

 void mscclppLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);

-#define MSCCLPP_PARAM(name, env, deftVal) \
-  int64_t mscclppParam##name() { \
-    constexpr int64_t uninitialized = INT64_MIN; \
-    static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \
-    static int64_t cache = uninitialized; \
-    if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \
-      mscclppLoadParam("MSCCLPP_" env, deftVal, uninitialized, &cache); \
-    } \
-    return cache; \
+#define MSCCLPP_PARAM(name, env, deftVal)                                                                              \
+  int64_t mscclppParam##name()                                                                                         \
+  {                                                                                                                    \
+    constexpr int64_t uninitialized = INT64_MIN;                                                                       \
+    static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value.");                       \
+    static int64_t cache = uninitialized;                                                                              \
+    if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) {                         \
+      mscclppLoadParam("MSCCLPP_" env, deftVal, uninitialized, &cache);                                                \
+    }                                                                                                                  \
+    return cache;                                                                                                      \
  }

 #endif
--- a/src/include/proxy.h
+++ b/src/include/proxy.h
@@ -1,27 +1,29 @@
 #ifndef MSCCLPP_PROXY_H_
 #define MSCCLPP_PROXY_H_

-#include "mscclpp.h"
 #include "comm.h"
-#include <pthread.h>
+#include "mscclpp.h"
 #include <cuda_runtime.h>
+#include <pthread.h>

 #define MSCCLPP_PROXY_MAX_NUM (MSCCLPP_IB_MAX_DEVS + 1) // One is for a P2P proxy.

-typedef enum {
+typedef enum
+{
  MSCCLPP_PROXY_RUN_STATE_IDLE = 0,
  MSCCLPP_PROXY_RUN_STATE_RUNNING,
  MSCCLPP_PROXY_RUN_STATE_EXITING,
 } mscclppProxyRunState_t;

-template <typename T>
-struct mscclppGDRState {
+template <typename T> struct mscclppGDRState
+{
  T* hostPtr;
  T* devPtr;
  void* desc;
 };

-struct mscclppProxyState {
+struct mscclppProxyState
+{
  mscclppTransport_t transportType;
  pthread_t thread;
  mscclppProxyRunState_t run;
@@ -31,8 +33,8 @@ struct mscclppProxyState {
  mscclppGDRState<uint64_t> fifoHead;
  mscclppGDRState<uint64_t> fifoTail;

-  struct mscclppIbContext *ibContext; // For IB connection only
-  cudaStream_t stream; // for P2P DMA engine only
+  struct mscclppIbContext* ibContext; // For IB connection only
+  cudaStream_t stream;                // for P2P DMA engine only
 };

 mscclppResult_t mscclppProxyCreate(struct mscclppComm* comm);
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -8,21 +8,21 @@
 #define MSCCLPP_SOCKET_H_

 #include "mscclpp.h"
-#include <sys/socket.h>
 #include <arpa/inet.h>
-#include <netinet/tcp.h>
-#include <netdb.h>
 #include <fcntl.h>
+#include <netdb.h>
+#include <netinet/tcp.h>
 #include <poll.h>
 #include <stddef.h>
+#include <sys/socket.h>

 #define MAX_IFS 16
 #define MAX_IF_NAME_SIZE 16
-#define SLEEP_INT            1000 // connection retry sleep interval in usec
-#define RETRY_REFUSED_TIMES   2e4 // connection refused retry times before reporting a timeout (20 sec)
-#define RETRY_TIMEDOUT_TIMES    3 // connection timed out retry times (each one can take 20s)
-#define RETRY_ACCEPT_TIMES    2e4 // connection accept retry times (each one can take 20s)
-#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
+#define SLEEP_INT 1000          // connection retry sleep interval in usec
+#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
+#define RETRY_TIMEDOUT_TIMES 3  // connection timed out retry times (each one can take 20s)
+#define RETRY_ACCEPT_TIMES 2e4  // connection accept retry times (each one can take 20s)
+#define SOCKET_NAME_MAXLEN (NI_MAXHOST + NI_MAXSERV)
 #define MSCCLPP_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL

 /* Common socket address storage structure for IPv4/IPv6 */
@@ -32,7 +32,8 @@ union mscclppSocketAddress {
  struct sockaddr_in6 sin6;
 };

-enum mscclppSocketState {
+enum mscclppSocketState
+{
  mscclppSocketStateNone = 0,
  mscclppSocketStateInitialized = 1,
  mscclppSocketStateAccepting = 2,
@@ -46,7 +47,8 @@ enum mscclppSocketState {
  mscclppSocketStateNum = 10
 };

-enum mscclppSocketType {
+enum mscclppSocketType
+{
  mscclppSocketTypeUnknown = 0,
  mscclppSocketTypeBootstrap = 1,
  mscclppSocketTypeProxy = 2,
@@ -54,7 +56,8 @@ enum mscclppSocketType {
  mscclppSocketTypeNetIb = 4
 };

-struct mscclppSocket {
+struct mscclppSocket
+{
  int fd;
  int acceptFd;
  int timedOutRetries;
@@ -69,13 +72,17 @@ struct mscclppSocket {
  enum mscclppSocketType type;
 };

-const char *mscclppSocketToString(union mscclppSocketAddress *addr, char *buf, const int numericHostForm = 1);
+const char* mscclppSocketToString(union mscclppSocketAddress* addr, char* buf, const int numericHostForm = 1);
 mscclppResult_t mscclppSocketGetAddrFromString(union mscclppSocketAddress* ua, const char* ip_port_pair);
-int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* localAddrs, union mscclppSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
-int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
+int mscclppFindInterfaceMatchSubnet(char* ifNames, union mscclppSocketAddress* localAddrs,
+                                    union mscclppSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
+int mscclppFindInterfaces(char* ifNames, union mscclppSocketAddress* ifAddrs, int ifNameMaxSize, int maxIfs);

 // Initialize a socket
-mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocketAddress* addr = NULL, uint64_t magic = MSCCLPP_SOCKET_MAGIC, enum mscclppSocketType type = mscclppSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
+mscclppResult_t mscclppSocketInit(struct mscclppSocket* sock, union mscclppSocketAddress* addr = NULL,
+                                  uint64_t magic = MSCCLPP_SOCKET_MAGIC,
+                                  enum mscclppSocketType type = mscclppSocketTypeUnknown,
+                                  volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
 // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
 mscclppResult_t mscclppSocketListen(struct mscclppSocket* sock);
 mscclppResult_t mscclppSocketGetAddr(struct mscclppSocket* sock, union mscclppSocketAddress* addr);
@@ -83,7 +90,8 @@ mscclppResult_t mscclppSocketGetAddr(struct mscclppSocket* sock, union mscclppSo
 mscclppResult_t mscclppSocketConnect(struct mscclppSocket* sock);
 // Return socket connection state.
 // mscclppResult_t mscclppSocketReady(struct mscclppSocket* sock, int *running);
-// Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
+// Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side
+// IP/port in sock->addr.
 mscclppResult_t mscclppSocketAccept(struct mscclppSocket* sock, struct mscclppSocket* ulistenSock);
 // mscclppResult_t mscclppSocketGetFd(struct mscclppSocket* sock, int* fd);
 // mscclppResult_t mscclppSocketSetFd(int fd, struct mscclppSocket* sock);
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -7,13 +7,13 @@
 #ifndef MSCCLPP_UTILS_H_
 #define MSCCLPP_UTILS_H_

-#include "mscclpp.h"
 #include "alloc.h"
 #include "checks.h"
+#include "mscclpp.h"
+#include <new>
+#include <sched.h>
 #include <stdint.h>
 #include <time.h>
-#include <sched.h>
-#include <new>

 // int mscclppCudaCompCap();

@@ -21,7 +21,7 @@
 mscclppResult_t int64ToBusId(int64_t id, char* busId);
 mscclppResult_t busIdToInt64(const char* busId, int64_t* id);

-mscclppResult_t getBusId(int cudaDev, int64_t *busId);
+mscclppResult_t getBusId(int cudaDev, int64_t* busId);

 mscclppResult_t getHostName(char* hostname, int maxlen, const char delim);
 uint64_t getHash(const char* string, int n);
@@ -29,7 +29,8 @@ uint64_t getHostHash();
 uint64_t getPidHash();
 mscclppResult_t getRandomData(void* buffer, size_t bytes);

-struct netIf {
+struct netIf
+{
  char prefix[64];
  int port;
 };
@@ -37,27 +38,33 @@ struct netIf {
 int parseStringList(const char* string, struct netIf* ifList, int maxList);
 bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);

-static long log2i(long n) {
- long l = 0;
- while (n>>=1) l++;
- return l;
+static long log2i(long n)
+{
+  long l = 0;
+  while (n >>= 1)
+    l++;
+  return l;
 }

-inline uint64_t clockNano() {
+inline uint64_t clockNano()
+{
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
-  return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec;
+  return uint64_t(ts.tv_sec) * 1000 * 1000 * 1000 + ts.tv_nsec;
 }

 /* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else
 * return -1 */
-inline mscclppResult_t getRandomData(void* buffer, size_t bytes) {
+inline mscclppResult_t getRandomData(void* buffer, size_t bytes)
+{
  mscclppResult_t ret = mscclppSuccess;
  if (bytes > 0) {
    const size_t one = 1UL;
    FILE* fp = fopen("/dev/urandom", "r");
-    if (buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one) ret = mscclppSystemError;
-    if (fp) fclose(fp);
+    if (buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one)
+      ret = mscclppSystemError;
+    if (fp)
+      fclose(fp);
  }
  return ret;
 }
@@ -252,7 +259,6 @@ inline mscclppResult_t getRandomData(void* buffer, size_t bytes) {
 //   me->topFrame = *me->topFrame.below; // C++ struct assignment
 // }

-
 ////////////////////////////////////////////////////////////////////////////////

 // struct mscclppMemoryPool {
@@ -441,7 +447,8 @@ inline mscclppResult_t getRandomData(void* buffer, size_t bytes) {
 //         uintptr_t expected = sleeping ? 0x1 : 0x0;
 //         uintptr_t desired = 0x1;
 //         me->waiting = waitSignal; // release done by successful compare exchange
-//         if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
+//         if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE,
+//         __ATOMIC_RELAXED)) {
 //           sleeping = true;
 //           pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex);
 //         }
@@ -471,7 +478,8 @@ inline mscclppResult_t getRandomData(void* buffer, size_t bytes) {
 // template<typename T, T *T::*next>
 // T* mscclppIntruQueueMpscAbandon(mscclppIntruQueueMpsc<T,next>* me) {
 //   uintptr_t expected = 0x0;
-//   if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
+//   if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED,
+//   __ATOMIC_RELAXED)) {
 //     return nullptr;
 //   } else {
 //     int spins = 0;
--- a/src/init.cc
+++ b/src/init.cc
@@ -1,17 +1,18 @@
-#include "mscclpp.h"
 #include "bootstrap.h"
 #include "core.h"
 #include "gdr.h"
+#include "mscclpp.h"
 #include <map>
 #include <sstream>
 #if defined(ENABLE_NPKIT)
 #include "npkit/npkit.h"
 #endif

-static uint64_t hashUniqueId(mscclppUniqueId const &id) {
-  char const *bytes = (char const*)&id;
+static uint64_t hashUniqueId(mscclppUniqueId const& id)
+{
+  char const* bytes = (char const*)&id;
  uint64_t h = 0xdeadbeef;
-  for(int i=0; i < (int)sizeof(mscclppUniqueId); i++) {
+  for (int i = 0; i < (int)sizeof(mscclppUniqueId); i++) {
    h ^= h >> 32;
    h *= 0x8db3db47fa2994ad;
    h += bytes[i];
@@ -25,7 +26,8 @@ static bool initialized = false;

 gdr_t mscclppGdrCopy = NULL;

-mscclppResult_t initGdrCopy() {
+mscclppResult_t initGdrCopy()
+{
  mscclppGdrCopy = mscclppGdrInit();
  if (mscclppGdrCopy == NULL) {
    WARN("GDR init failed");
@@ -34,8 +36,10 @@ mscclppResult_t initGdrCopy() {
  return mscclppSuccess;
 }

-static mscclppResult_t mscclppInit() {
-  if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE)) return mscclppSuccess;
+static mscclppResult_t mscclppInit()
+{
+  if (__atomic_load_n(&initialized, __ATOMIC_ACQUIRE))
+    return mscclppSuccess;
  pthread_mutex_lock(&initLock);
  if (!initialized) {
    // initEnv();
@@ -62,22 +66,25 @@ static std::string mscclppShmFileName(mscclppComm_t comm, int rank)
 }

 MSCCLPP_API(mscclppResult_t, mscclppGetUniqueId, mscclppUniqueId* out);
-mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* out) {
+mscclppResult_t mscclppGetUniqueId(mscclppUniqueId* out)
+{
  MSCCLPPCHECK(mscclppInit());
-//   mscclppCHECK(PtrCheck(out, "GetUniqueId", "out"));
+  //   mscclppCHECK(PtrCheck(out, "GetUniqueId", "out"));
  mscclppResult_t res = bootstrapGetUniqueId((struct mscclppBootstrapHandle*)out);
  TRACE_CALL("mscclppGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out));
  return res;
 }

 MSCCLPP_API(mscclppResult_t, mscclppBootstrapAllGather, mscclppComm_t comm, void* data, int size);
-mscclppResult_t mscclppBootstrapAllGather(mscclppComm_t comm, void* data, int size){
+mscclppResult_t mscclppBootstrapAllGather(mscclppComm_t comm, void* data, int size)
+{
  MSCCLPPCHECK(bootstrapAllGather(comm->bootstrap, data, size));
  return mscclppSuccess;
 }

 MSCCLPP_API(mscclppResult_t, mscclppCommInitRank, mscclppComm_t* comm, int nranks, const char* ipPortPair, int rank);
-mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char* ipPortPair, int rank) {
+mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char* ipPortPair, int rank)
+{
  if (mscclppGdrCopy == NULL) {
    MSCCLPPCHECK(initGdrCopy());
  }
@@ -99,7 +106,7 @@ mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char*
  MSCCLPPCHECK(bootstrapGetUniqueId(&handle, rank == 0, ipPortPair));
  _comm->magic = handle.magic;

-  MSCCLPPCHECKGOTO(mscclppCudaHostCalloc((uint32_t **)&_comm->abortFlag, 1), res, fail);
+  MSCCLPPCHECKGOTO(mscclppCudaHostCalloc((uint32_t**)&_comm->abortFlag, 1), res, fail);
  MSCCLPPCHECK(bootstrapInit(&handle, _comm));

 #if defined(ENABLE_NPKIT)
@@ -142,15 +149,18 @@ mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char*
  return res;
 fail:
  if (_comm) {
-    if (_comm->abortFlag) mscclppCudaHostFree((void *)_comm->abortFlag);
+    if (_comm->abortFlag)
+      mscclppCudaHostFree((void*)_comm->abortFlag);
    free(_comm);
  }
-  if (comm) *comm = NULL;
+  if (comm)
+    *comm = NULL;
  return res;
 }

 MSCCLPP_API(mscclppResult_t, mscclppCommInitRankFromId, mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank);
-mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank) {
+mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank)
+{
  if (mscclppGdrCopy == NULL) {
    MSCCLPPCHECK(initGdrCopy());
  }
@@ -168,7 +178,7 @@ mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, msccl
  MSCCLPPCHECK(bootstrapNetInit());
  _comm->magic = handle->magic;

-  MSCCLPPCHECKGOTO(mscclppCudaHostCalloc((uint32_t **)&_comm->abortFlag, 1), res, fail);
+  MSCCLPPCHECKGOTO(mscclppCudaHostCalloc((uint32_t**)&_comm->abortFlag, 1), res, fail);
  MSCCLPPCHECK(bootstrapInit(handle, _comm));

 #if defined(ENABLE_NPKIT)
@@ -180,15 +190,18 @@ mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, msccl
  return res;
 fail:
  if (_comm) {
-    if (_comm->abortFlag) mscclppCudaHostFree((void *)_comm->abortFlag);
+    if (_comm->abortFlag)
+      mscclppCudaHostFree((void*)_comm->abortFlag);
    free(_comm);
  }
-  if (comm) *comm = NULL;
+  if (comm)
+    *comm = NULL;
  return res;
 }

 MSCCLPP_API(mscclppResult_t, mscclppCommDestroy, mscclppComm_t comm);
-mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){
+mscclppResult_t mscclppCommDestroy(mscclppComm_t comm)
+{
 #if defined(ENABLE_NPKIT)
  const char* npkitDumpDir = nullptr;
 #endif
@@ -197,7 +210,7 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){
    return mscclppSuccess;

  for (int i = 0; i < comm->nConns; ++i) {
-    struct mscclppConn *conn = &comm->conns[i];
+    struct mscclppConn* conn = &comm->conns[i];
    if (conn->cpuProxyFlagGdrDesc) {
      // IB
      MSCCLPPCHECK(mscclppGdrCudaFree(conn->cpuProxyFlagGdrDesc));
@@ -208,7 +221,7 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){
  }

  for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) {
-    struct mscclppProxyState *proxyState = comm->proxyState[i];
+    struct mscclppProxyState* proxyState = comm->proxyState[i];
    if (proxyState) {
      MSCCLPPCHECK(mscclppGdrCudaFree(proxyState->triggerFifo.desc));
      MSCCLPPCHECK(mscclppGdrCudaFree(proxyState->fifoHead.desc));
@@ -227,9 +240,9 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){
    }
  }

-  for (int i = 0; i < comm->nConns; i++){
-    struct mscclppConn *conn = &comm->conns[i];
-    if (conn){
+  for (int i = 0; i < comm->nConns; i++) {
+    struct mscclppConn* conn = &comm->conns[i];
+    if (conn) {
      MSCCLPPCHECK(mscclppCudaFree(conn->devConn->sendEpochId));
      MSCCLPPCHECK(mscclppCudaFree(conn->devConn->recvEpochId));
    }
@@ -238,7 +251,7 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){
  if (comm->bootstrap)
    MSCCLPPCHECK(bootstrapClose(comm->bootstrap));

-  mscclppCudaHostFree((void *)comm->abortFlag);
+  mscclppCudaHostFree((void*)comm->abortFlag);
  free(comm);

 #if defined(ENABLE_NPKIT)
@@ -256,24 +269,36 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){
 }

 MSCCLPP_API(const char*, mscclppGetErrorString, mscclppResult_t code);
-const char* mscclppGetErrorString(mscclppResult_t code) {
+const char* mscclppGetErrorString(mscclppResult_t code)
+{
  switch (code) {
-    case mscclppSuccess                : return "no error";
-    case mscclppUnhandledCudaError     : return "unhandled cuda error";
-    case mscclppSystemError            : return "unhandled system error";
-    case mscclppInternalError          : return "internal error";
-    case mscclppInvalidArgument        : return "invalid argument";
-    case mscclppInvalidUsage           : return "invalid usage";
-    case mscclppRemoteError            : return "remote process exited or there was a network error";
-    case mscclppInProgress             : return "MSCCL++ operation in progress";
-    default                            : return "unknown result code";
+  case mscclppSuccess:
+    return "no error";
+  case mscclppUnhandledCudaError:
+    return "unhandled cuda error";
+  case mscclppSystemError:
+    return "unhandled system error";
+  case mscclppInternalError:
+    return "internal error";
+  case mscclppInvalidArgument:
+    return "invalid argument";
+  case mscclppInvalidUsage:
+    return "invalid usage";
+  case mscclppRemoteError:
+    return "remote process exited or there was a network error";
+  case mscclppInProgress:
+    return "MSCCL++ operation in progress";
+  default:
+    return "unknown result code";
  }
 }

-MSCCLPP_API(mscclppResult_t, mscclppGetDeviceConnection, mscclppComm_t comm, int remoteRank, int tag, mscclppDevConn_t** devConn);
-mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, int tag, mscclppDevConn_t** devConn){
-  for (int i = 0; i < comm->nConns; i++){
-    if (comm->devConns[i].remoteRank == remoteRank && comm->devConns[i].tag == tag){
+MSCCLPP_API(mscclppResult_t, mscclppGetDeviceConnection, mscclppComm_t comm, int remoteRank, int tag,
+            mscclppDevConn_t** devConn);
+mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, int tag, mscclppDevConn_t** devConn)
+{
+  for (int i = 0; i < comm->nConns; i++) {
+    if (comm->devConns[i].remoteRank == remoteRank && comm->devConns[i].tag == tag) {
      *devConn = &comm->devConns[i];
      return mscclppSuccess;
    }
@@ -282,8 +307,8 @@ mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, i
  return mscclppInvalidArgument;
 }

-
-MSCCLPP_API(mscclppResult_t, mscclppGetAllDeviceConnections, mscclppComm_t comm, mscclppDevConn_t** devConns, int* nConns);
+MSCCLPP_API(mscclppResult_t, mscclppGetAllDeviceConnections, mscclppComm_t comm, mscclppDevConn_t** devConns,
+            int* nConns);
 mscclppResult_t mscclppGetAllDeviceConnections(mscclppComm_t comm, mscclppDevConn_t** devConns, int* nConns)
 {
  *nConns = comm->nConns;
@@ -291,17 +316,16 @@ mscclppResult_t mscclppGetAllDeviceConnections(mscclppComm_t comm, mscclppDevCon
  return mscclppSuccess;
 }

-
-MSCCLPP_API(mscclppResult_t, mscclppConnect, mscclppComm_t comm, int remoteRank, int tag, 
-            void* localBuff, uint64_t buffSize, mscclppTransport_t transportType, const char *ibDev);
+MSCCLPP_API(mscclppResult_t, mscclppConnect, mscclppComm_t comm, int remoteRank, int tag, void* localBuff,
+            uint64_t buffSize, mscclppTransport_t transportType, const char* ibDev);
 mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void* localBuff, uint64_t buffSize,
-                               mscclppTransport_t transportType, const char *ibDev)
+                               mscclppTransport_t transportType, const char* ibDev)
 {
  if (comm->nConns == MAXCONNECTIONS) {
    WARN("Too many connections made");
    return mscclppInternalError;
  }
-  struct mscclppConn *conn = &comm->conns[comm->nConns];
+  struct mscclppConn* conn = &comm->conns[comm->nConns];
  conn->transport = transportType;
  conn->buffSize = buffSize;

@@ -333,12 +357,12 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void
    }
    // Set the ib context for this conn
    conn->ibCtx = comm->ibContext[ibDevIdx];
-  } else if (transportType == mscclppTransportP2P){
+  } else if (transportType == mscclppTransportP2P) {
    // Check if a DMA context/stream exists
-    if (comm->stream == NULL){
+    if (comm->stream == NULL) {
      CUDACHECK(cudaStreamCreateWithFlags(&comm->stream, cudaStreamNonBlocking));
    }
-  } else if (transportType == mscclppTransportSHM){
+  } else if (transportType == mscclppTransportSHM) {
    WARN("Shared memory interconnection is not implemented yet!");
    return mscclppInternalError;
  } else {
@@ -346,44 +370,44 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void
    return mscclppInvalidUsage;
  }

-
  // Find/create a proxy state for the given connection
-  struct mscclppProxyState *proxyState = NULL;
+  struct mscclppProxyState* proxyState = NULL;
  // First see if there is a matching context
  // If not, find the first empty proxy
  int firstEmptyProxyIndex = -1;
  for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) {
-    struct mscclppProxyState *curProxy = comm->proxyState[i];
-    if (curProxy && (curProxy->transportType == transportType)){
-      if ((transportType == mscclppTransportIB && curProxy->ibContext == conn->ibCtx) || (transportType == mscclppTransportP2P)){
+    struct mscclppProxyState* curProxy = comm->proxyState[i];
+    if (curProxy && (curProxy->transportType == transportType)) {
+      if ((transportType == mscclppTransportIB && curProxy->ibContext == conn->ibCtx) ||
+          (transportType == mscclppTransportP2P)) {
        proxyState = curProxy;
        break; // we found the matching context
      }
    }
-    if (curProxy == NULL && firstEmptyProxyIndex == -1){
+    if (curProxy == NULL && firstEmptyProxyIndex == -1) {
      firstEmptyProxyIndex = i;
    }
  }

-  if (proxyState == NULL && firstEmptyProxyIndex == -1){
+  if (proxyState == NULL && firstEmptyProxyIndex == -1) {
    WARN("Too many proxies have been allocated!");
    return mscclppInvalidUsage;
  }

  // If we couldn't find a matching context, create one
-  if (proxyState == NULL){
+  if (proxyState == NULL) {
    MSCCLPPCHECK(mscclppCalloc(&proxyState, 1));
    MSCCLPPCHECK(mscclppGdrCudaCalloc(&proxyState->triggerFifo.hostPtr, &proxyState->triggerFifo.devPtr,
                                      MSCCLPP_PROXY_FIFO_SIZE, &proxyState->triggerFifo.desc));
-    MSCCLPPCHECK(mscclppGdrCudaCalloc(&proxyState->fifoHead.hostPtr, &proxyState->fifoHead.devPtr,
-                                      1, &proxyState->fifoHead.desc));
-    MSCCLPPCHECK(mscclppGdrCudaCalloc(&proxyState->fifoTail.hostPtr, &proxyState->fifoTail.devPtr,
-                                      1, &proxyState->fifoTail.desc));
+    MSCCLPPCHECK(
+      mscclppGdrCudaCalloc(&proxyState->fifoHead.hostPtr, &proxyState->fifoHead.devPtr, 1, &proxyState->fifoHead.desc));
+    MSCCLPPCHECK(
+      mscclppGdrCudaCalloc(&proxyState->fifoTail.hostPtr, &proxyState->fifoTail.devPtr, 1, &proxyState->fifoTail.desc));

-    if (transportType == mscclppTransportIB){
+    if (transportType == mscclppTransportIB) {
      proxyState->ibContext = conn->ibCtx;
      proxyState->stream = NULL;
-    } else if (transportType == mscclppTransportP2P){
+    } else if (transportType == mscclppTransportP2P) {
      proxyState->ibContext = NULL;
      proxyState->stream = comm->stream;
    }
@@ -395,8 +419,8 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void
    WARN("Proxy allocation failed!");
    return mscclppInternalError;
  }
-  
-  struct mscclppDevConn *devConn = &comm->devConns[comm->nConns];
+
+  struct mscclppDevConn* devConn = &comm->devConns[comm->nConns];

  conn->devConn = devConn;
  conn->devConn->localBuff = localBuff;
@@ -415,7 +439,8 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void
  return mscclppSuccess;
 }

-struct connInfo {
+struct connInfo
+{
  cudaIpcMemHandle_t handleBuff;
  cudaIpcMemHandle_t handleFlag;
  cudaIpcMemHandle_t handleProxyFlag;
@@ -425,12 +450,13 @@ struct connInfo {
  mscclppIbMrInfo infoProxyFlagMr;
 };

-mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*output*/, struct mscclppConn* conn /*input*/){
-  if (connInfo == NULL || conn == NULL){
+mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*output*/, struct mscclppConn* conn /*input*/)
+{
+  if (connInfo == NULL || conn == NULL) {
    WARN("connInfo or connection cannot be null");
    return mscclppInternalError;
  }
-  struct mscclppDevConn *devConn = conn->devConn;
+  struct mscclppDevConn* devConn = conn->devConn;
  MSCCLPPCHECK(mscclppCudaCalloc(&devConn->proxyEpochId, 1));
  CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleProxyFlag, devConn->proxyEpochId));
  CUDACHECK(cudaIpcGetMemHandle(&connInfo->handleBuff, devConn->localBuff));
@@ -438,28 +464,33 @@ mscclppResult_t mscclppP2pConnectionSetupStart(struct connInfo* connInfo /*outpu
  return mscclppSuccess;
 }

-mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/, struct mscclppConn* conn /*output*/){
-  if (connInfo == NULL || conn == NULL){
+mscclppResult_t mscclppP2pConnectionSetupEnd(struct connInfo* connInfo /*input*/, struct mscclppConn* conn /*output*/)
+{
+  if (connInfo == NULL || conn == NULL) {
    WARN("ipcHandles or connection cannot be null");
    return mscclppInternalError;
  }
-  CUDACHECK(cudaIpcOpenMemHandle((void**)&conn->devConn->remoteBuff, connInfo->handleBuff, cudaIpcMemLazyEnablePeerAccess));
-  CUDACHECK(cudaIpcOpenMemHandle((void**)&conn->devConn->remoteFlag, connInfo->handleFlag, cudaIpcMemLazyEnablePeerAccess));
-  CUDACHECK(cudaIpcOpenMemHandle((void**)&conn->remoteProxyFlag, connInfo->handleProxyFlag, cudaIpcMemLazyEnablePeerAccess));
+  CUDACHECK(
+    cudaIpcOpenMemHandle((void**)&conn->devConn->remoteBuff, connInfo->handleBuff, cudaIpcMemLazyEnablePeerAccess));
+  CUDACHECK(
+    cudaIpcOpenMemHandle((void**)&conn->devConn->remoteFlag, connInfo->handleFlag, cudaIpcMemLazyEnablePeerAccess));
+  CUDACHECK(
+    cudaIpcOpenMemHandle((void**)&conn->remoteProxyFlag, connInfo->handleProxyFlag, cudaIpcMemLazyEnablePeerAccess));
  return mscclppSuccess;
 }

-mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output*/, struct mscclppConn* conn /*input*/){
-  if (connInfo == NULL || conn == NULL){
+mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output*/, struct mscclppConn* conn /*input*/)
+{
+  if (connInfo == NULL || conn == NULL) {
    WARN("connInfo or connection cannot be null");
    return mscclppInternalError;
  }
-  struct mscclppDevConn *devConn = conn->devConn;
+  struct mscclppDevConn* devConn = conn->devConn;
  devConn->remoteBuff = NULL;
  devConn->remoteFlag = NULL;
  MSCCLPPCHECK(mscclppGdrCudaCalloc(&conn->cpuProxyFlag, &devConn->proxyEpochId, 1, &conn->cpuProxyFlagGdrDesc));

-  struct mscclppIbContext *ibCtx = conn->ibCtx;
+  struct mscclppIbContext* ibCtx = conn->ibCtx;
  if (conn->ibQp == NULL) {
    MSCCLPPCHECK(mscclppIbContextCreateQp(ibCtx, &conn->ibQp));
  }
@@ -474,8 +505,9 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output
  return mscclppSuccess;
 }

-mscclppResult_t mscclppIbConnectionSetupEnd(struct connInfo* connInfo /*input*/, struct mscclppConn* conn /*output*/){
-  if (connInfo == NULL || conn == NULL){
+mscclppResult_t mscclppIbConnectionSetupEnd(struct connInfo* connInfo /*input*/, struct mscclppConn* conn /*output*/)
+{
+  if (connInfo == NULL || conn == NULL) {
    WARN("ipcHandles or connection cannot be null");
    return mscclppInternalError;
  }
@@ -498,7 +530,7 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm)
 {
  // Send info to peers
  for (int i = 0; i < comm->nConns; ++i) {
-    struct mscclppConn *conn = &comm->conns[i];
+    struct mscclppConn* conn = &comm->conns[i];

    struct connInfo cInfo;
    if (conn->transport == mscclppTransportP2P) {
@@ -512,7 +544,7 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm)

  // Recv info from peers
  for (int i = 0; i < comm->nConns; ++i) {
-    struct mscclppConn *conn = &comm->conns[i];
+    struct mscclppConn* conn = &comm->conns[i];
    struct connInfo cInfo;
    MSCCLPPCHECK(bootstrapRecv(comm->bootstrap, conn->devConn->remoteRank, conn->devConn->tag, &cInfo, sizeof(cInfo)));
    if (conn->transport == mscclppTransportP2P) {
--- a/src/misc/npkit.cc
+++ b/src/misc/npkit.cc
@@ -17,7 +17,8 @@ uint64_t* NpKit::cpu_timestamp_ = nullptr;
 std::thread* NpKit::cpu_timestamp_update_thread_ = nullptr;
 volatile bool NpKit::cpu_timestamp_update_thread_should_stop_ = false;

-void NpKit::CpuTimestampUpdateThread() {
+void NpKit::CpuTimestampUpdateThread()
+{
  uint64_t init_system_clock = std::chrono::system_clock::now().time_since_epoch().count();
  uint64_t init_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
  uint64_t curr_steady_clock = 0;
@@ -28,7 +29,8 @@ void NpKit::CpuTimestampUpdateThread() {
  }
 }

-mscclppResult_t NpKit::Init(int rank) {
+mscclppResult_t NpKit::Init(int rank)
+{
  uint64_t i = 0;
  NpKitEventCollectContext ctx;
  ctx.event_buffer_head = 0;
@@ -61,7 +63,8 @@ mscclppResult_t NpKit::Init(int rank) {
  return mscclppSuccess;
 }

-mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
+mscclppResult_t NpKit::Dump(const std::string& dump_dir)
+{
  uint64_t i = 0;
  std::string dump_file_path;

@@ -74,7 +77,7 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
    dump_file_path += std::to_string(i);
    auto cpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
    cpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[i]),
-        cpu_collect_contexts_[i].event_buffer_head * sizeof(NpKitEvent));
+                         cpu_collect_contexts_[i].event_buffer_head * sizeof(NpKitEvent));
    cpu_trace_file.close();
  }

@@ -106,7 +109,7 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
    MSCCLPPCHECK(mscclppCudaMemcpy(cpu_collect_contexts_, gpu_collect_contexts_ + i, 1));
    auto gpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
    gpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[0]),
-        cpu_collect_contexts_[0].event_buffer_head * sizeof(NpKitEvent));
+                         cpu_collect_contexts_[0].event_buffer_head * sizeof(NpKitEvent));
    gpu_trace_file.close();
  }

@@ -126,7 +129,8 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
  return mscclppSuccess;
 }

-mscclppResult_t NpKit::Shutdown() {
+mscclppResult_t NpKit::Shutdown()
+{
  uint64_t i = 0;

  // Stop CPU timestamp updating thread
@@ -153,11 +157,13 @@ mscclppResult_t NpKit::Shutdown() {
  return mscclppSuccess;
 }

-NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() {
+NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts()
+{
  return gpu_collect_contexts_;
 }

-void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id) {
+void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id)
+{
  uint64_t event_buffer_head = cpu_collect_contexts_[channel_id].event_buffer_head;
  if (event_buffer_head < kMaxNumCpuEventsPerBuffer) {
    NpKitEvent& event = cpu_collect_contexts_[channel_id].event_buffer[event_buffer_head];
@@ -169,6 +175,7 @@ void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t
  }
 }

-uint64_t* NpKit::GetCpuTimestamp() {
+uint64_t* NpKit::GetCpuTimestamp()
+{
  return cpu_timestamp_;
 }
--- a/src/param.cc
+++ b/src/param.cc
@@ -9,48 +9,56 @@

 #include <algorithm>
 #include <errno.h>
+#include <pthread.h>
+#include <pwd.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <unistd.h>
-#include <pthread.h>
-#include <pwd.h>

-const char* userHomeDir() {
-  struct passwd *pwUser = getpwuid(getuid());
+const char* userHomeDir()
+{
+  struct passwd* pwUser = getpwuid(getuid());
  return pwUser == NULL ? NULL : pwUser->pw_dir;
 }

-void setEnvFile(const char* fileName) {
-  FILE * file = fopen(fileName, "r");
-  if (file == NULL) return;
+void setEnvFile(const char* fileName)
+{
+  FILE* file = fopen(fileName, "r");
+  if (file == NULL)
+    return;

-  char *line = NULL;
+  char* line = NULL;
  char envVar[1024];
  char envValue[1024];
  size_t n = 0;
  ssize_t read;
  while ((read = getline(&line, &n, file)) != -1) {
-    if (line[read-1] == '\n') line[read-1] = '\0';
-    int s=0; // Env Var Size
-    while (line[s] != '\0' && line[s] != '=') s++;
-    if (line[s] == '\0') continue;
-    strncpy(envVar, line, std::min(1023,s));
+    if (line[read - 1] == '\n')
+      line[read - 1] = '\0';
+    int s = 0; // Env Var Size
+    while (line[s] != '\0' && line[s] != '=')
+      s++;
+    if (line[s] == '\0')
+      continue;
+    strncpy(envVar, line, std::min(1023, s));
    envVar[s] = '\0';
    s++;
-    strncpy(envValue, line+s, 1023);
-    envValue[1023]='\0';
+    strncpy(envValue, line + s, 1023);
+    envValue[1023] = '\0';
    setenv(envVar, envValue, 0);
-    //printf("%s : %s->%s\n", fileName, envVar, envValue);
+    // printf("%s : %s->%s\n", fileName, envVar, envValue);
  }
-  if (line) free(line);
+  if (line)
+    free(line);
  fclose(file);
 }

-void initEnv() {
+void initEnv()
+{
  char confFilePath[1024];
-  const char * userDir = userHomeDir();
+  const char* userDir = userHomeDir();
  if (userDir) {
    sprintf(confFilePath, "%s/.mscclpp.conf", userDir);
    setEnvFile(confFilePath);
@@ -59,7 +67,8 @@ void initEnv() {
  setEnvFile(confFilePath);
 }

-void mscclppLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) {
+void mscclppLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache)
+{
  static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
  pthread_mutex_lock(&mutex);
  if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) {
@@ -70,9 +79,9 @@ void mscclppLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, i
      value = strtoll(str, nullptr, 0);
      if (errno) {
        value = deftVal;
-        INFO(MSCCLPP_ALL,"Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal);
+        INFO(MSCCLPP_ALL, "Invalid value %s for %s, using default %lld.", str, env, (long long)deftVal);
      } else {
-        INFO(MSCCLPP_ALL,"%s set by environment to %lld.", env, (long long)value);
+        INFO(MSCCLPP_ALL, "%s set by environment to %lld.", env, (long long)value);
      }
    }
    __atomic_store_n(cache, value, __ATOMIC_RELAXED);
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -1,16 +1,16 @@
-#include "comm.h"
-#include "socket.h"
-#include "debug.h"
 #include "alloc.h"
-#include "ib.h"
 #include "checks.h"
+#include "comm.h"
+#include "debug.h"
+#include "ib.h"
+#include "socket.h"

 #include <emmintrin.h>
-#include <sys/syscall.h>
-#include <numa.h>
 #include <map>
-#include <vector>
+#include <numa.h>
+#include <sys/syscall.h>
 #include <thread>
+#include <vector>

 #if defined(ENABLE_NPKIT)
 #include "npkit/npkit.h"
@@ -20,13 +20,13 @@
 // TODO(chhwang): verify if MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 0 is useful, otherwise delete this option.
 #define MSCCLPP_PROXY_FLAG_SET_BY_RDMA 1

-#define PROXYCUDACHECK(cmd) \
-  do { \
-    cudaError_t err = cmd; \
-    if (err != cudaSuccess) { \
-      WARN("CUDA error from proxy: %s", cudaGetErrorString(err)); \
-      return NULL; \
-    } \
+#define PROXYCUDACHECK(cmd)                                                                                            \
+  do {                                                                                                                 \
+    cudaError_t err = cmd;                                                                                             \
+    if (err != cudaSuccess) {                                                                                          \
+      WARN("CUDA error from proxy: %s", cudaGetErrorString(err));                                                      \
+      return NULL;                                                                                                     \
+    }                                                                                                                  \
  } while (false)

 static void NumaBind(int node)
@@ -37,24 +37,27 @@ static void NumaBind(int node)
  numa_bind_compat(&mask);
 }

-struct proxyArgs {
+struct proxyArgs
+{
  struct mscclppComm* comm;
-  struct mscclppProxyState *proxyState;
+  struct mscclppProxyState* proxyState;
  cudaStream_t stream;
 };

-static void readTrigger(mscclppTrigger *dst, mscclppTrigger *src) {
-  __m128i xmm0 = _mm_load_si128((__m128i *)src);
-  _mm_store_si128((__m128i *)dst, xmm0);
+static void readTrigger(mscclppTrigger* dst, mscclppTrigger* src)
+{
+  __m128i xmm0 = _mm_load_si128((__m128i*)src);
+  _mm_store_si128((__m128i*)dst, xmm0);
 }

-void* mscclppProxyServiceP2P(void* _args) {
-  struct proxyArgs *args = (struct proxyArgs *)_args;
-  struct mscclppComm *comm = args->comm;
-  volatile mscclppProxyRunState_t *run = &args->proxyState->run;
-  mscclppTrigger *fifo = args->proxyState->triggerFifo.hostPtr;
-  volatile uint64_t *fifoTail = args->proxyState->fifoTail.hostPtr;
-  volatile uint64_t *fifoHead = args->proxyState->fifoHead.hostPtr;
+void* mscclppProxyServiceP2P(void* _args)
+{
+  struct proxyArgs* args = (struct proxyArgs*)_args;
+  struct mscclppComm* comm = args->comm;
+  volatile mscclppProxyRunState_t* run = &args->proxyState->run;
+  mscclppTrigger* fifo = args->proxyState->triggerFifo.hostPtr;
+  volatile uint64_t* fifoTail = args->proxyState->fifoTail.hostPtr;
+  volatile uint64_t* fifoHead = args->proxyState->fifoHead.hostPtr;

  cudaStream_t stream = args->proxyState->stream;
  free(_args);
@@ -73,32 +76,36 @@ void* mscclppProxyServiceP2P(void* _args) {
    if (runCheckCounter-- == 0) {
      runCheckCounter = MSCCLPP_PROXY_RUN_STATE_CHECK_PERIOD;
      // Check if we need to exit
-      if (*run != MSCCLPP_PROXY_RUN_STATE_RUNNING) break;
+      if (*run != MSCCLPP_PROXY_RUN_STATE_RUNNING)
+        break;
    }
    // Poll to see if we are ready to send anything
-    if (cachedFifoTail == *fifoHead) continue; // no need trigger
+    if (cachedFifoTail == *fifoHead)
+      continue; // no need trigger
    readTrigger(&trigger, &fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]);
-    if (trigger.value[0] == 0) continue; // there is one in progreess
+    if (trigger.value[0] == 0)
+      continue; // there is one in progreess
    // there is a trigger value ready to be consumed
-    
-    struct mscclppConn *conn = &comm->conns[trigger.fields.connId];
+
+    struct mscclppConn* conn = &comm->conns[trigger.fields.connId];

    // Iterate over what send is needed
-    if (trigger.fields.type & mscclppData){
-      void *srcBuff = (void *)((char *)conn->devConn->localBuff + trigger.fields.srcDataOffset);
-      void *dstBuff = (void *)((char *)conn->devConn->remoteBuff + trigger.fields.dstDataOffset);
+    if (trigger.fields.type & mscclppData) {
+      void* srcBuff = (void*)((char*)conn->devConn->localBuff + trigger.fields.srcDataOffset);
+      void* dstBuff = (void*)((char*)conn->devConn->remoteBuff + trigger.fields.dstDataOffset);
      PROXYCUDACHECK(cudaMemcpyAsync(dstBuff, srcBuff, trigger.fields.dataSize, cudaMemcpyDeviceToDevice, stream));
    }
    if (trigger.fields.type & mscclppFlag) {
-      PROXYCUDACHECK(cudaMemcpyAsync(conn->remoteProxyFlag, conn->devConn->sendEpochId, sizeof(uint64_t), cudaMemcpyDeviceToDevice, stream));
+      PROXYCUDACHECK(cudaMemcpyAsync(conn->remoteProxyFlag, conn->devConn->sendEpochId, sizeof(uint64_t),
+                                     cudaMemcpyDeviceToDevice, stream));
    }
    // Wait for completion
-    if (trigger.fields.type & mscclppSync){
+    if (trigger.fields.type & mscclppSync) {
      PROXYCUDACHECK(cudaStreamSynchronize(stream));
    }

    // Send completion: reset only the high 64 bits
-    *(volatile uint64_t *)(&fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
+    *(volatile uint64_t*)(&fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
    cachedFifoTail++;
    *fifoTail = cachedFifoTail;
  }
@@ -112,28 +119,30 @@ void* mscclppProxyServiceP2P(void* _args) {
  return NULL;
 }

-void* mscclppProxyServiceIb(void* _args) {
-  struct proxyArgs *args = (struct proxyArgs *)_args;
-  struct mscclppComm *comm = args->comm;
-  struct mscclppIbContext *ibCtx = args->proxyState->ibContext;
-  volatile mscclppProxyRunState_t *run = &args->proxyState->run;
-  mscclppTrigger *fifo = args->proxyState->triggerFifo.hostPtr;
-  volatile uint64_t *fifoTail = args->proxyState->fifoTail.hostPtr;
-  volatile uint64_t *fifoHead = args->proxyState->fifoHead.hostPtr;
+void* mscclppProxyServiceIb(void* _args)
+{
+  struct proxyArgs* args = (struct proxyArgs*)_args;
+  struct mscclppComm* comm = args->comm;
+  struct mscclppIbContext* ibCtx = args->proxyState->ibContext;
+  volatile mscclppProxyRunState_t* run = &args->proxyState->run;
+  mscclppTrigger* fifo = args->proxyState->triggerFifo.hostPtr;
+  volatile uint64_t* fifoTail = args->proxyState->fifoTail.hostPtr;
+  volatile uint64_t* fifoHead = args->proxyState->fifoHead.hostPtr;
  free(_args);

 #if (MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 0)
-  enum {
+  enum
+  {
    SEND_STATE_INIT,
    SEND_STATE_INPROGRESS
  };
-  int *sendState;
-  uint64_t *currentProxyFlagValue;
-  if (mscclppCalloc((void **)&sendState, comm->nConns) != mscclppSuccess) {
+  int* sendState;
+  uint64_t* currentProxyFlagValue;
+  if (mscclppCalloc((void**)&sendState, comm->nConns) != mscclppSuccess) {
    WARN("mscclppCalloc failed: errno %d", errno);
    return NULL;
  }
-  if (mscclppCalloc((void **)&currentProxyFlagValue, comm->nConns) != mscclppSuccess) {
+  if (mscclppCalloc((void**)&currentProxyFlagValue, comm->nConns) != mscclppSuccess) {
    WARN("mscclppCalloc failed: errno %d", errno);
    return NULL;
  }
@@ -148,7 +157,7 @@ void* mscclppProxyServiceIb(void* _args) {
 #if (MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 0)
  for (int i = 0; i < (int)comm->nConns; ++i) {
    sendState[i] = SEND_STATE_INIT;
-    struct mscclppConn *conn = &comm->conns[i];
+    struct mscclppConn* conn = &comm->conns[i];
    currentProxyFlagValue[i] = *conn->cpuProxyFlag;
    // Post recv
    if (conn->ibQp->postRecv(0) != 0) {
@@ -163,17 +172,19 @@ void* mscclppProxyServiceIb(void* _args) {
    if (runCheckCounter-- == 0) {
      runCheckCounter = MSCCLPP_PROXY_RUN_STATE_CHECK_PERIOD;
      // Check if we need to exit
-      if (*run != MSCCLPP_PROXY_RUN_STATE_RUNNING) break;
+      if (*run != MSCCLPP_PROXY_RUN_STATE_RUNNING)
+        break;
    }

 #if (MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 0)
-    struct mscclppConn *conn = &comm->conns[trigger.fields.connId];
+    struct mscclppConn* conn = &comm->conns[trigger.fields.connId];
    // Try send
    if (sendState[trigger.fields.connId] == SEND_STATE_INIT) {
      if (trigger.value[0] != 0) {
        // Do send
        conn->ibQp->stageSendWithImm(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)trigger.fields.dataSize,
-                                     /*wrId=*/0, /*offset=*/trigger.fields.dataOffset, /*signaled=*/true, /*immData=*/0);
+                                     /*wrId=*/0, /*offset=*/trigger.fields.dataOffset, /*signaled=*/true,
+                                     /*immData=*/0);
        int ret;
        if ((ret = conn->ibQp->postSend()) != 0) {
          // Return value is errno.
@@ -189,7 +200,7 @@ void* mscclppProxyServiceIb(void* _args) {
      WARN("rank %d pollCq failed: errno %d", rank, errno);
    } else {
      for (int i = 0; i < wcNum; ++i) {
-        struct ibv_wc *wc = &conn->ibQp->wcs[i];
+        struct ibv_wc* wc = &conn->ibQp->wcs[i];
        if (wc->status != IBV_WC_SUCCESS) {
          WARN("rank %d wc status %d", rank, wc->status);
          continue;
@@ -200,7 +211,7 @@ void* mscclppProxyServiceIb(void* _args) {
        }
        if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
          // TODO(chhwang): cpu flush
-          *((volatile uint64_t *)conn->cpuProxyFlag) = ++currentProxyFlagValue[trigger.fields.connId];
+          *((volatile uint64_t*)conn->cpuProxyFlag) = ++currentProxyFlagValue[trigger.fields.connId];
          // recv completion
          if (conn->ibQp->postRecv(wc->wr_id) != 0) {
            WARN("postRecv failed: errno %d", errno);
@@ -208,7 +219,7 @@ void* mscclppProxyServiceIb(void* _args) {
          // WARN("rank %d recv completion", rank);
        } else if (wc->opcode == IBV_WC_RDMA_WRITE) {
          // send completion
-          *(volatile uint64_t *)(&fifo[fifoTail]) = 0;
+          *(volatile uint64_t*)(&fifo[fifoTail]) = 0;
          fifoTail = (fifoTail + 1) % MSCCLPP_PROXY_FIFO_SIZE;
          sendState[trigger.fields.connId] = SEND_STATE_INIT;
          // WARN("rank %d send completion", rank);
@@ -217,21 +228,24 @@ void* mscclppProxyServiceIb(void* _args) {
    }
 #else // (MSCCLPP_PROXY_FLAG_SET_BY_RDMA == 1)
    // Poll to see if we are ready to send anything
-    if (cachedFifoTail == *fifoHead) continue; // no need trigger
+    if (cachedFifoTail == *fifoHead)
+      continue; // no need trigger
    readTrigger(&trigger, &fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]);
-    if (trigger.value[0] == 0) continue; // there is one in progreess
+    if (trigger.value[0] == 0)
+      continue; // there is one in progreess
    // there is a trigger value ready to be consumed

-    struct mscclppConn *conn = &comm->conns[trigger.fields.connId];
+    struct mscclppConn* conn = &comm->conns[trigger.fields.connId];

    if (trigger.fields.type & mscclppData) {
      conn->ibQp->stageSend(conn->ibBuffMr, &conn->ibBuffMrInfo, (uint32_t)trigger.fields.dataSize,
-                            /*wrId=*/0, /*srcOffset=*/trigger.fields.srcDataOffset, /*dstOffset=*/trigger.fields.dstDataOffset,
+                            /*wrId=*/0, /*srcOffset=*/trigger.fields.srcDataOffset,
+                            /*dstOffset=*/trigger.fields.dstDataOffset,
                            /*signaled=*/false);
 #if defined(ENABLE_NPKIT)
-      NpKit::CollectCpuEvent(
-        NPKIT_EVENT_IB_SEND_ENTRY, (uint32_t)trigger.fields.dataSize, 0 /* inflight request differentiator */,
-        *(volatile uint64_t*)NpKit::GetCpuTimestamp(), trigger.fields.connId /* event collection context index */);
+      NpKit::CollectCpuEvent(NPKIT_EVENT_IB_SEND_ENTRY, (uint32_t)trigger.fields.dataSize,
+                             0 /* inflight request differentiator */, *(volatile uint64_t*)NpKit::GetCpuTimestamp(),
+                             trigger.fields.connId /* event collection context index */);
 #endif
    }
    if (trigger.fields.type & mscclppFlag) {
@@ -255,7 +269,7 @@ void* mscclppProxyServiceIb(void* _args) {
          continue;
        }
        for (int i = 0; i < wcNum; ++i) {
-          struct ibv_wc *wc = &conn->ibQp->wcs[i];
+          struct ibv_wc* wc = &conn->ibQp->wcs[i];
          if (wc->status != IBV_WC_SUCCESS) {
            WARN("rank %d wc status %d", rank, wc->status);
            continue;
@@ -268,9 +282,10 @@ void* mscclppProxyServiceIb(void* _args) {
            // send completion
            waiting = false;
 #if defined(ENABLE_NPKIT)
-            NpKit::CollectCpuEvent(
-              NPKIT_EVENT_IB_SEND_EXIT, (uint32_t)trigger.fields.dataSize, 0 /* inflight request differentiator */,
-              *(volatile uint64_t*)NpKit::GetCpuTimestamp(), trigger.fields.connId /* event collection context index */);
+            NpKit::CollectCpuEvent(NPKIT_EVENT_IB_SEND_EXIT, (uint32_t)trigger.fields.dataSize,
+                                   0 /* inflight request differentiator */,
+                                   *(volatile uint64_t*)NpKit::GetCpuTimestamp(),
+                                   trigger.fields.connId /* event collection context index */);
 #endif
            break;
          }
@@ -279,22 +294,23 @@ void* mscclppProxyServiceIb(void* _args) {
    }

    // Send completion: reset only the high 64 bits
-    *(volatile uint64_t *)(&fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
+    *(volatile uint64_t*)(&fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
    cachedFifoTail++;
    *fifoTail = cachedFifoTail;
 #endif
  }

-  //TODO(saemal): we need to wait for completion of wc here too
+  // TODO(saemal): we need to wait for completion of wc here too

  *run = MSCCLPP_PROXY_RUN_STATE_IDLE;
  // WARN("Proxy exits: rank %d", rank);
  return NULL;
 }

-void* mscclppProxyService(void* _args) {
-  struct proxyArgs *args = (struct proxyArgs *)_args;
-  void *ret;
+void* mscclppProxyService(void* _args)
+{
+  struct proxyArgs* args = (struct proxyArgs*)_args;
+  void* ret;
  if (args->proxyState->ibContext == NULL) {
    ret = mscclppProxyServiceP2P(_args);
  } else {
@@ -303,12 +319,14 @@ void* mscclppProxyService(void* _args) {
  return ret;
 }

-mscclppResult_t mscclppProxyCreate(struct mscclppComm* comm) {
+mscclppResult_t mscclppProxyCreate(struct mscclppComm* comm)
+{
  for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) {
-    struct mscclppProxyState *proxyState = comm->proxyState[i];
-    if (proxyState == NULL) break;
+    struct mscclppProxyState* proxyState = comm->proxyState[i];
+    if (proxyState == NULL)
+      break;

-    struct proxyArgs *args;
+    struct proxyArgs* args;
    MSCCLPPCHECK(mscclppCalloc(&args, 1));
    args->comm = comm;
    args->proxyState = proxyState;
@@ -324,12 +342,14 @@ mscclppResult_t mscclppProxyCreate(struct mscclppComm* comm) {
  return mscclppSuccess;
 }

-mscclppResult_t mscclppProxyDestroy(struct mscclppComm* comm) {
+mscclppResult_t mscclppProxyDestroy(struct mscclppComm* comm)
+{
  for (int i = 0; i < MSCCLPP_PROXY_MAX_NUM; ++i) {
-    struct mscclppProxyState *proxyState = comm->proxyState[i];
-    if (proxyState == NULL) break;
+    struct mscclppProxyState* proxyState = comm->proxyState[i];
+    if (proxyState == NULL)
+      break;

-    volatile int *run = (volatile int *)&proxyState->run;
+    volatile int* run = (volatile int*)&proxyState->run;
    if (*run == MSCCLPP_PROXY_RUN_STATE_IDLE) {
      continue;
    }
--- a/src/utils.cc
+++ b/src/utils.cc
@@ -21,22 +21,24 @@
 //   return ccMajor*10+ccMinor;
 // }

-mscclppResult_t int64ToBusId(int64_t id, char* busId) {
+mscclppResult_t int64ToBusId(int64_t id, char* busId)
+{
  sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
  return mscclppSuccess;
 }

-mscclppResult_t busIdToInt64(const char* busId, int64_t* id) {
-  char hexStr[17];  // Longest possible int64 hex string + null terminator.
+mscclppResult_t busIdToInt64(const char* busId, int64_t* id)
+{
+  char hexStr[17]; // Longest possible int64 hex string + null terminator.
  int hexOffset = 0;
  for (int i = 0; hexOffset < sizeof(hexStr) - 1; i++) {
    char c = busId[i];
-    if (c == '.' || c == ':') continue;
-    if ((c >= '0' && c <= '9') ||
-        (c >= 'A' && c <= 'F') ||
-        (c >= 'a' && c <= 'f')) {
+    if (c == '.' || c == ':')
+      continue;
+    if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) {
      hexStr[hexOffset++] = busId[i];
-    } else break;
+    } else
+      break;
  }
  hexStr[hexOffset] = '\0';
  *id = strtol(hexStr, NULL, 16);
@@ -44,7 +46,8 @@ mscclppResult_t busIdToInt64(const char* busId, int64_t* id) {
 }

 // Convert a logical cudaDev index to the NVML device minor number
-mscclppResult_t getBusId(int cudaDev, int64_t *busId) {
+mscclppResult_t getBusId(int cudaDev, int64_t* busId)
+{
  // On most systems, the PCI bus ID comes back as in the 0000:00:00.0
  // format. Still need to allocate proper space in case PCI domain goes
  // higher.
@@ -54,18 +57,21 @@ mscclppResult_t getBusId(int cudaDev, int64_t *busId) {
  return mscclppSuccess;
 }

-mscclppResult_t getHostName(char* hostname, int maxlen, const char delim) {
+mscclppResult_t getHostName(char* hostname, int maxlen, const char delim)
+{
  if (gethostname(hostname, maxlen) != 0) {
    strncpy(hostname, "unknown", maxlen);
    return mscclppSystemError;
  }
  int i = 0;
-  while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++;
+  while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen - 1))
+    i++;
  hostname[i] = '\0';
  return mscclppSuccess;
 }

-uint64_t getHash(const char* string, int n) {
+uint64_t getHash(const char* string, int n)
+{
  // Based on DJB2a, result = result * 33 ^ char
  uint64_t result = 5381;
  for (int c = 0; c < n; c++) {
@@ -83,23 +89,24 @@ uint64_t getHash(const char* string, int n) {
 * This string can be overridden by using the MSCCLPP_HOSTID env var.
 */
 #define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
-uint64_t getHostHash(void) {
+uint64_t getHostHash(void)
+{
  char hostHash[1024];
-  char *hostId;
+  char* hostId;

  // Fall back is the full hostname if something fails
-  (void) getHostName(hostHash, sizeof(hostHash), '\0');
+  (void)getHostName(hostHash, sizeof(hostHash), '\0');
  int offset = strlen(hostHash);

  if ((hostId = getenv("MSCCLPP_HOSTID")) != NULL) {
    INFO(MSCCLPP_ENV, "MSCCLPP_HOSTID set by environment to %s", hostId);
    strncpy(hostHash, hostId, sizeof(hostHash));
  } else {
-    FILE *file = fopen(HOSTID_FILE, "r");
+    FILE* file = fopen(HOSTID_FILE, "r");
    if (file != NULL) {
-      char *p;
+      char* p;
      if (fscanf(file, "%ms", &p) == 1) {
-        strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
+        strncpy(hostHash + offset, p, sizeof(hostHash) - offset - 1);
        free(p);
      }
    }
@@ -107,9 +114,9 @@ uint64_t getHostHash(void) {
  }

  // Make sure the string is terminated
-  hostHash[sizeof(hostHash)-1]='\0';
+  hostHash[sizeof(hostHash) - 1] = '\0';

-  TRACE(MSCCLPP_INIT,"unique hostname '%s'", hostHash);
+  TRACE(MSCCLPP_INIT, "unique hostname '%s'", hostHash);

  return getHash(hostHash, strlen(hostHash));
 }
@@ -120,22 +127,26 @@ uint64_t getHostHash(void) {
 *
 * $$ $(readlink /proc/self/ns/pid)
 */
-uint64_t getPidHash(void) {
+uint64_t getPidHash(void)
+{
  char pname[1024];
  // Start off with our pid ($$)
-  sprintf(pname, "%ld", (long) getpid());
+  sprintf(pname, "%ld", (long)getpid());
  int plen = strlen(pname);
-  int len = readlink("/proc/self/ns/pid", pname+plen, sizeof(pname)-1-plen);
-  if (len < 0) len = 0;
+  int len = readlink("/proc/self/ns/pid", pname + plen, sizeof(pname) - 1 - plen);
+  if (len < 0)
+    len = 0;

-  pname[plen+len]='\0';
-  TRACE(MSCCLPP_INIT,"unique PID '%s'", pname);
+  pname[plen + len] = '\0';
+  TRACE(MSCCLPP_INIT, "unique PID '%s'", pname);

  return getHash(pname, strlen(pname));
 }

-int parseStringList(const char* string, struct netIf* ifList, int maxList) {
-  if (!string) return 0;
+int parseStringList(const char* string, struct netIf* ifList, int maxList)
+{
+  if (!string)
+    return 0;

  const char* ptr = string;

@@ -147,15 +158,18 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) {
    if (c == ':') {
      if (ifC > 0) {
        ifList[ifNum].prefix[ifC] = '\0';
-        ifList[ifNum].port = atoi(ptr+1);
-        ifNum++; ifC = 0;
+        ifList[ifNum].port = atoi(ptr + 1);
+        ifNum++;
+        ifC = 0;
      }
-      while (c != ',' && c != '\0') c = *(++ptr);
+      while (c != ',' && c != '\0')
+        c = *(++ptr);
    } else if (c == ',' || c == '\0') {
      if (ifC > 0) {
        ifList[ifNum].prefix[ifC] = '\0';
        ifList[ifNum].port = -1;
-        ifNum++; ifC = 0;
+        ifNum++;
+        ifC = 0;
      }
    } else {
      ifList[ifNum].prefix[ifC] = c;
@@ -166,27 +180,32 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) {
  return ifNum;
 }

-static bool matchIf(const char* string, const char* ref, bool matchExact) {
+static bool matchIf(const char* string, const char* ref, bool matchExact)
+{
  // Make sure to include '\0' in the exact case
  int matchLen = matchExact ? strlen(string) + 1 : strlen(ref);
  return strncmp(string, ref, matchLen) == 0;
 }

-static bool matchPort(const int port1, const int port2) {
-  if (port1 == -1) return true;
-  if (port2 == -1) return true;
-  if (port1 == port2) return true;
+static bool matchPort(const int port1, const int port2)
+{
+  if (port1 == -1)
+    return true;
+  if (port2 == -1)
+    return true;
+  if (port1 == port2)
+    return true;
  return false;
 }

-
-bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) {
+bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact)
+{
  // Make an exception for the case where no user list is defined
-  if (listSize == 0) return true;
+  if (listSize == 0)
+    return true;

-  for (int i=0; i<listSize; i++) {
-    if (matchIf(string, ifList[i].prefix, matchExact)
-        && matchPort(port, ifList[i].port)) {
+  for (int i = 0; i < listSize; i++) {
+    if (matchIf(string, ifList[i].prefix, matchExact) && matchPort(port, ifList[i].port)) {
      return true;
    }
  }
@@ -262,14 +281,13 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz
 //     me->topFrame.unhunks = proxy;
 //     mallocSize = size;
 //     proxy->obj = malloc(mallocSize);
-//     INFO(MSCCLPP_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
-//     if (proxy->obj == nullptr) goto malloc_exhausted;
-//     return proxy->obj;
+//     INFO(MSCCLPP_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long
+//     long)mallocSize); if (proxy->obj == nullptr) goto malloc_exhausted; return proxy->obj;
 //   }

 // malloc_exhausted:
-//   WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize);
-//   abort();
+//   WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long
+//   long)mallocSize); abort();
 // }

 // void mscclppMemoryStackDestruct(struct mscclppMemoryStack* me) {
--- a/tests/allgather_test.cu
+++ b/tests/allgather_test.cu
@@ -3,36 +3,36 @@
 #ifdef MSCCLPP_USE_MPI_FOR_TESTS
 #include "mpi.h"
 #endif // MSCCLPP_USE_MPI_FOR_TESTS
+#include <iostream>
 #include <stdio.h>
 #include <stdlib.h>
-#include <unistd.h>
 #include <string>
-#include <iostream>
+#include <unistd.h>
 #include <unordered_map>

-
 static int nranksPerNode = 8;

 // Propagate errors up

-#define MSCCLPPCHECK(call) do { \
-  mscclppResult_t res = call; \
-  if (res != mscclppSuccess && res != mscclppInProgress) { \
-    /* Print the back trace*/ \
-    printf("Failure at %s:%d -> %s\n", __FILE__, __LINE__, mscclppGetErrorString(res));    \
-    return res; \
-  } \
-} while (0)
-
+#define MSCCLPPCHECK(call)                                                                                             \
+  do {                                                                                                                 \
+    mscclppResult_t res = call;                                                                                        \
+    if (res != mscclppSuccess && res != mscclppInProgress) {                                                           \
+      /* Print the back trace*/                                                                                        \
+      printf("Failure at %s:%d -> %s\n", __FILE__, __LINE__, mscclppGetErrorString(res));                              \
+      return res;                                                                                                      \
+    }                                                                                                                  \
+  } while (0)

 // Check CUDA RT calls
-#define CUDACHECK(cmd) do {                                   \
-    cudaError_t err = cmd;                                    \
-    if( err != cudaSuccess ) {                                \
-        printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
-        exit(EXIT_FAILURE);                                   \
-    }                                                         \
-} while(false)
+#define CUDACHECK(cmd)                                                                                                 \
+  do {                                                                                                                 \
+    cudaError_t err = cmd;                                                                                             \
+    if (err != cudaSuccess) {                                                                                          \
+      printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err));                                \
+      exit(EXIT_FAILURE);                                                                                              \
+    }                                                                                                                  \
+  } while (false)

 // Measure current time in second.
 static double getTime(void)
@@ -47,33 +47,36 @@ static double getTime(void)

 __constant__ mscclppDevConn_t constDevConns[16];

-__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU){
+__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU)
+{
  // this allgather is really simple and implemented as an alltoall

  // this thread's role is a sender role
  // put your data asynchronously
-  devConn.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
+  devConn.put(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int));
  // make sure everyone is put their data before some thread randomly blocks everyone else in signal
  __syncthreads();
  // push with flag and sync to make sure the data is received
  devConn.signal();
-  
+
  // this thread's role is a receiver role. wait on the semaphore to make sure the data is ready
  devConn.wait();
 }

-__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU){
+__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU)
+{
  // this allgather algorithm works as follows:
  // Step 1: GPU rank i sends data to GPU rank (i+1) % world_size
  // Step 2: GPU rank i waits for data from GPU rank (i+2) % world_size
  // ...
  // This order is much better for DMA engine for NVLinks

-  for (int i = 1; i < world_size; i++){
+  for (int i = 1; i < world_size; i++) {
    __syncthreads();
-    if (remoteRank != ((rank+i) % world_size)) continue;
+    if (remoteRank != ((rank + i) % world_size))
+      continue;
    // put your data to GPU (rank+i) % world_size and signal all in one call
-    devConn.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
+    devConn.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int));
  }
  // all connections wait for the signal from the sender
  devConn.wait();
@@ -82,7 +85,8 @@ __device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, i
 __global__ void kernel(int rank, int world_size, int nelemsPerGPU, int kernel)
 {
  // only use a single thread from each warp
-  if (threadIdx.x % 32 != 0) return;
+  if (threadIdx.x % 32 != 0)
+    return;

  // find the mapping between remoteRank and devConns
  int warpId = threadIdx.x / 32;
@@ -106,7 +110,7 @@ int rankToNode(int rank)
  return rank / nranksPerNode;
 }

-void print_usage(const char *prog)
+void print_usage(const char* prog)
 {
 #ifdef MSCCLPP_USE_MPI_FOR_TESTS
  printf("usage: %s IP:PORT [rank nranks]\n", prog);
@@ -115,15 +119,16 @@ void print_usage(const char *prog)
 #endif
 }

-void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, int nelemsPerGPU, int** data_h, int **data_d)
+void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, int nelemsPerGPU, int** data_h,
+                                        int** data_d)
 {
  CUDACHECK(cudaMalloc(data_d, dataSize));
  CUDACHECK(cudaMemset(*data_d, 0, dataSize));

-  *data_h = new int[nelemsPerGPU*world_size];
-  for (int i = 0; i < nelemsPerGPU*world_size; i++){
+  *data_h = new int[nelemsPerGPU * world_size];
+  for (int i = 0; i < nelemsPerGPU * world_size; i++) {
    int val = i + 1;
-    if (i / nelemsPerGPU == rank){
+    if (i / nelemsPerGPU == rank) {
      (*data_h)[i] = val;
    } else {
      (*data_h)[i] = 0;
@@ -132,16 +137,18 @@ void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSiz
  CUDACHECK(cudaMemcpy(*data_d, *data_h, dataSize, cudaMemcpyHostToDevice));
 }

-mscclppResult_t setupMscclppConnections(int rank, int world_size, mscclppComm_t comm, int* data_d, size_t dataSize){
+mscclppResult_t setupMscclppConnections(int rank, int world_size, mscclppComm_t comm, int* data_d, size_t dataSize)
+{
  int thisNode = rankToNode(rank);
  int cudaNum = rankToLocalRank(rank);
  std::string ibDevStr = "mlx5_ib" + std::to_string(cudaNum);

  for (int r = 0; r < world_size; ++r) {
-    if (r == rank) continue;
+    if (r == rank)
+      continue;
    mscclppTransport_t transportType;
    const char* ibDev = ibDevStr.c_str();
-    if (rankToNode(r) == thisNode){
+    if (rankToNode(r) == thisNode) {
      ibDev = NULL;
      transportType = mscclppTransportP2P;
    } else {
@@ -153,7 +160,7 @@ mscclppResult_t setupMscclppConnections(int rank, int world_size, mscclppComm_t

  MSCCLPPCHECK(mscclppConnectionSetup(comm));

-  mscclppDevConn_t *devConns;
+  mscclppDevConn_t* devConns;
  int nCons;
  MSCCLPPCHECK(mscclppGetAllDeviceConnections(comm, &devConns, &nCons));

@@ -162,36 +169,39 @@ mscclppResult_t setupMscclppConnections(int rank, int world_size, mscclppComm_t
  return mscclppSuccess;
 }

-void printUsage(const char* prog, bool isMpi) {
-  if (isMpi){
+void printUsage(const char* prog, bool isMpi)
+{
+  if (isMpi) {
    std::string st = "you are using MPI for this test\n";
    st += "two possilbe usages are:\n";
    st += "> " + std::string(prog) + "\n";
    st += "or\n";
-    st += "> " + std::string(prog) + " -ip_port [ip:port]\n";      
+    st += "> " + std::string(prog) + " -ip_port [ip:port]\n";
    printf("%s", st.c_str());
  } else {
    std::string st = "you are NOT using MPI for this test\n";
    st += "the only possible usage:\n";
    st += "> " + std::string(prog) + " -ip_port [ip:port] -rank [rank] -nranks [nranks]\n";
-    printf("%s", st.c_str());        
+    printf("%s", st.c_str());
  }
 }

-std::unordered_map<std::string, std::string> parseArgs(int argc, const char* argv[], bool isMpi) {
+std::unordered_map<std::string, std::string> parseArgs(int argc, const char* argv[], bool isMpi)
+{
  std::unordered_map<std::string, std::string> options;

  for (int i = 1; i < argc; i++) {
    std::string arg = argv[i];
    if (arg == "-rankspernode") {
-      if (isMpi){
+      if (isMpi) {
        fprintf(stderr, "Error: -rankspernode should not be specified with MPI.\n");
        exit(-1);
      }
      if (i + 1 < argc) {
        options["rankspernode"] = argv[++i];
      } else {
-        fprintf(stderr, "Error: -rankspernode option requires an argument.\n");;
+        fprintf(stderr, "Error: -rankspernode option requires an argument.\n");
+        ;
        exit(-1);
      }
    } else if (arg == "-kernel") {
@@ -209,7 +219,7 @@ std::unordered_map<std::string, std::string> parseArgs(int argc, const char* arg
        exit(-1);
      }
    } else if (arg == "-rank") {
-      if (isMpi){
+      if (isMpi) {
        fprintf(stderr, "Error: -rank should not be specified with MPI.\n");
        exit(-1);
      }
@@ -220,7 +230,7 @@ std::unordered_map<std::string, std::string> parseArgs(int argc, const char* arg
        exit(-1);
      }
    } else if (arg == "-nranks") {
-      if (isMpi){
+      if (isMpi) {
        fprintf(stderr, "Error: -nranks should not be specified with MPI.\n");
        exit(-1);
      }
@@ -248,8 +258,7 @@ std::unordered_map<std::string, std::string> parseArgs(int argc, const char* arg
  return options;
 }

-
-int main(int argc, const char *argv[])
+int main(int argc, const char* argv[])
 {
  bool isMpi = false;
 #ifdef MSCCLPP_USE_MPI_FOR_TESTS
@@ -266,8 +275,7 @@ int main(int argc, const char *argv[])
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
  // get the local number of nodes with MPI
  MPI_Comm shmcomm;
-  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
-                      MPI_INFO_NULL, &shmcomm);
+  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
  int shmrank;
  MPI_Comm_size(shmcomm, &shmrank);
  nranksPerNode = shmrank;
@@ -300,29 +308,33 @@ int main(int argc, const char *argv[])
  int cudaNum = rankToLocalRank(rank);
  CUDACHECK(cudaSetDevice(cudaNum));

-  if (rank == 0) printf("Initializing MSCCL++\n");
+  if (rank == 0)
+    printf("Initializing MSCCL++\n");
  mscclppComm_t comm;
  MSCCLPPCHECK(mscclppCommInitRank(&comm, world_size, ip_port, rank));

-  int *data_d;
-  int *data_h;
-  size_t dataSize = 1024*1024*1024;
+  int* data_d;
+  int* data_h;
+  size_t dataSize = 1024 * 1024 * 1024;
  if (parsedArgs.find("datasize") != parsedArgs.end()) {
    dataSize = std::stoi(parsedArgs["datasize"]);
  }
  int nelemsPerGPU = dataSize / sizeof(int) / world_size;

-  if (rank == 0) printf("Initializing data for allgather test\n");
+  if (rank == 0)
+    printf("Initializing data for allgather test\n");
  initializeAndAllocateAllGatherData(rank, world_size, dataSize, nelemsPerGPU, &data_h, &data_d);

-  if (rank == 0) printf("Setting up the connection in MSCCL++\n");
+  if (rank == 0)
+    printf("Setting up the connection in MSCCL++\n");
  MSCCLPPCHECK(setupMscclppConnections(rank, world_size, comm, data_d, dataSize));

-  if (rank == 0) printf("Launching MSCCL++ proxy threads\n");
+  if (rank == 0)
+    printf("Launching MSCCL++ proxy threads\n");
  MSCCLPPCHECK(mscclppProxyLaunch(comm));

-
-  if (rank == 0) printf("Testing the correctness of AllGather implementation\n");
+  if (rank == 0)
+    printf("Testing the correctness of AllGather implementation\n");
  cudaStream_t stream;
  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  CUDACHECK(cudaDeviceSynchronize());
@@ -331,9 +343,9 @@ int main(int argc, const char *argv[])
  CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost));
  CUDACHECK(cudaDeviceSynchronize());

-  for (int i = 0; i < nelemsPerGPU*world_size; i++){
+  for (int i = 0; i < nelemsPerGPU * world_size; i++) {
    int val = i + 1;
-    if (data_h[i] != val){
+    if (data_h[i] != val) {
      printf("oh uh! data_h[%d] (%d) != val (%d)\n", i, data_h[i], val);
      break;
    }
@@ -341,11 +353,13 @@ int main(int argc, const char *argv[])
  int tmp[16];
  // A simple barrier
  MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int)));
-  if (rank == 0) printf("Successfully checked the correctness\n");
+  if (rank == 0)
+    printf("Successfully checked the correctness\n");

  // Perf test
  int iterwithoutcudagraph = 10;
-  if (rank == 0) printf("Running %d iterations of the kernel without CUDA graph\n", iterwithoutcudagraph);
+  if (rank == 0)
+    printf("Running %d iterations of the kernel without CUDA graph\n", iterwithoutcudagraph);
  for (int i = 0; i < iterwithoutcudagraph; ++i) {
    kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nelemsPerGPU, kernelNum);
  }
@@ -354,43 +368,51 @@ int main(int argc, const char *argv[])

  // cudaGraph Capture
  int cudagraphiter = 10;
-  if (rank == 0) printf("Capturing %d iterations of the kernel in a CUDA graph\n", cudagraphiter);
+  if (rank == 0)
+    printf("Capturing %d iterations of the kernel in a CUDA graph\n", cudagraphiter);
  cudaGraph_t graph;
  cudaGraphExec_t instance;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  for (int i = 0; i < cudagraphiter; ++i) {
-  	kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nelemsPerGPU, kernelNum);
+    kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nelemsPerGPU, kernelNum);
  }
  cudaStreamEndCapture(stream, &graph);
  cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);

  int cudagraphwarmup = 10;
-  if (rank == 0) printf("Warming up %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphwarmup, cudagraphiter);
+  if (rank == 0)
+    printf("Warming up %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphwarmup,
+           cudagraphiter);
  for (int i = 0; i < cudagraphwarmup; ++i) {
-	  cudaGraphLaunch(instance, stream);
+    cudaGraphLaunch(instance, stream);
  }
  CUDACHECK(cudaStreamSynchronize(stream));

-  // measure runtime 
+  // measure runtime
  int cudagraphlaunch = 10;
-  if (rank == 0) printf("Running %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphlaunch, cudagraphiter);
+  if (rank == 0)
+    printf("Running %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphlaunch,
+           cudagraphiter);
  MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int)));
  double t0 = getTime();
  for (int i = 0; i < cudagraphlaunch; ++i) {
-     cudaGraphLaunch(instance, stream);
+    cudaGraphLaunch(instance, stream);
  }
  CUDACHECK(cudaStreamSynchronize(stream));

  double t1 = getTime();
-  float ms = (t1-t0)*1000.0;
-  double time_in_us = ms * 1000. / (float) cudagraphlaunch / (float) cudagraphiter;
-  printf("Rank %d report: size %lu time: %f us/iter algBW %f GBps\n", rank, dataSize, time_in_us, (double) (dataSize) / 1e9 /(time_in_us/1e6));
+  float ms = (t1 - t0) * 1000.0;
+  double time_in_us = ms * 1000. / (float)cudagraphlaunch / (float)cudagraphiter;
+  printf("Rank %d report: size %lu time: %f us/iter algBW %f GBps\n", rank, dataSize, time_in_us,
+         (double)(dataSize) / 1e9 / (time_in_us / 1e6));
  MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int)));

-  if (rank == 0) printf("Stopping MSCCL++ proxy threads\n");
+  if (rank == 0)
+    printf("Stopping MSCCL++ proxy threads\n");
  MSCCLPPCHECK(mscclppProxyStop(comm));

-  if (rank == 0) printf("Destroying MSCCL++ communicator\n");
+  if (rank == 0)
+    printf("Destroying MSCCL++ communicator\n");
  MSCCLPPCHECK(mscclppCommDestroy(comm));
  printf("Rank %d succeeded!\n", rank);

--- a/tests/allgather_test2.cu
+++ b/tests/allgather_test2.cu
@@ -4,19 +4,20 @@
 #endif // MSCCLPP_USE_MPI_FOR_TESTS
 #include <stdio.h>
 #include <stdlib.h>
-#include <unistd.h>
 #include <string>
+#include <unistd.h>

 #define RANKS_PER_NODE 8

 // Check CUDA RT calls
-#define CUDACHECK(cmd) do {                                   \
-    cudaError_t err = cmd;                                    \
-    if( err != cudaSuccess ) {                                \
-        printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
-        exit(EXIT_FAILURE);                                   \
-    }                                                         \
-} while(false)
+#define CUDACHECK(cmd)                                                                                                 \
+  do {                                                                                                                 \
+    cudaError_t err = cmd;                                                                                             \
+    if (err != cudaSuccess) {                                                                                          \
+      printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err));                                \
+      exit(EXIT_FAILURE);                                                                                              \
+    }                                                                                                                  \
+  } while (false)

 // Measure current time in second.
 static double getTime(void)
@@ -33,43 +34,52 @@ __constant__ mscclppDevConn_t constDevConns[16];

 __global__ void kernel(int rank, int world_size, int nelemsPerGPU)
 {
-  if (threadIdx.x % 32 != 0) return;
+  if (threadIdx.x % 32 != 0)
+    return;

  int warpId = threadIdx.x / 32;
  bool isIB = false;
-  if (warpId >= world_size-1) isIB = true;
-  if (isIB) warpId = warpId - (world_size-1);
+  if (warpId >= world_size - 1)
+    isIB = true;
+  if (isIB)
+    warpId = warpId - (world_size - 1);
  int remoteRank = (warpId < rank) ? warpId : warpId + 1;
  mscclppDevConn_t devConn = constDevConns[remoteRank];
-  if (isIB) devConn = constDevConns[remoteRank + world_size];
+  if (isIB)
+    devConn = constDevConns[remoteRank + world_size];

-  // Each warp receives data from different ranks
+    // Each warp receives data from different ranks
 #if 1

  // Trigger sending data, flag and synchronize after
-  devConn.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU*sizeof(int));
+  devConn.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int));

  devConn.wait();

 #else
-  for (int i = 1; i < world_size; i++){
+  for (int i = 1; i < world_size; i++) {
    __syncthreads();
-    if (remoteRank != ((rank+i) % world_size)) continue;
+    if (remoteRank != ((rank + i) % world_size))
+      continue;

    // Trigger sending data, flag and synchronize after
-    int ibPortion = nelemsPerGPU/12;//nelemsPerGPU/12;
+    int ibPortion = nelemsPerGPU / 12; // nelemsPerGPU/12;
    if (isIB)
-      devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync, rank * nelemsPerGPU * sizeof(int) + (nelemsPerGPU - ibPortion)*sizeof(int), rank * nelemsPerGPU * sizeof(int) + (nelemsPerGPU - ibPortion)*sizeof(int), ibPortion*sizeof(int));
-    else 
-      devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync, rank * nelemsPerGPU * sizeof(int), rank * nelemsPerGPU * sizeof(int), (nelemsPerGPU-ibPortion)*sizeof(int));
+      devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync,
+                              rank * nelemsPerGPU * sizeof(int) + (nelemsPerGPU - ibPortion) * sizeof(int),
+                              rank * nelemsPerGPU * sizeof(int) + (nelemsPerGPU - ibPortion) * sizeof(int),
+                              ibPortion * sizeof(int));
+    else
+      devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync, rank * nelemsPerGPU * sizeof(int),
+                              rank * nelemsPerGPU * sizeof(int), (nelemsPerGPU - ibPortion) * sizeof(int));
    // Wait on the request to make sure it is safe to reuse buffer and flag
-    auto req = devConn.fifo.putWithSignal(dataOffset, dataSize); 
-    devConn.fifo.sync(req);    
+    auto req = devConn.fifo.putWithSignal(dataOffset, dataSize);
+    devConn.fifo.sync(req);
  }
  // Wait for receiving data from remote rank
-  while (*proxyFlag == baseFlag);
+  while (*proxyFlag == baseFlag)
+    ;
 #endif
-
 }

 int rankToLocalRank(int rank)
@@ -108,7 +118,7 @@ int cudaNumToIbNum(int cudaNum)
  return ibNum;
 }

-void print_usage(const char *prog)
+void print_usage(const char* prog)
 {
 #ifdef MSCCLPP_USE_MPI_FOR_TESTS
  printf("usage: %s IP:PORT [rank nranks]\n", prog);
@@ -117,14 +127,14 @@ void print_usage(const char *prog)
 #endif
 }

-int main(int argc, const char *argv[])
+int main(int argc, const char* argv[])
 {
 #ifdef MSCCLPP_USE_MPI_FOR_TESTS
  if (argc != 2 && argc != 4) {
    print_usage(argv[0]);
    return -1;
  }
-  const char *ip_port = argv[1];
+  const char* ip_port = argv[1];
  int rank;
  int world_size;
  if (argc == 4) {
@@ -140,7 +150,7 @@ int main(int argc, const char *argv[])
    print_usage(argv[0]);
    return -1;
  }
-  const char *ip_port = argv[1];
+  const char* ip_port = argv[1];
  int rank = atoi(argv[2]);
  int world_size = atoi(argv[3]);
 #endif
@@ -155,19 +165,19 @@ int main(int argc, const char *argv[])
  mscclppComm_t comm;
  MSCCLPPCHECK(mscclppCommInitRank(&comm, world_size, rank, ip_port));

-  int *data_d;
-  uint64_t *flag_d;
-  size_t data_size = 1536*1024*1024;
+  int* data_d;
+  uint64_t* flag_d;
+  size_t data_size = 1536 * 1024 * 1024;
  int nelemsPerGPU = data_size / sizeof(int) / world_size;
  CUDACHECK(cudaMalloc(&data_d, data_size));
  CUDACHECK(cudaMalloc(&flag_d, sizeof(uint64_t)));
  CUDACHECK(cudaMemset(data_d, 0, data_size));
  CUDACHECK(cudaMemset(flag_d, 0, sizeof(uint64_t)));

-  int* data_h = new int[nelemsPerGPU*world_size];
-  for (int i = 0; i < nelemsPerGPU*world_size; i++){
+  int* data_h = new int[nelemsPerGPU * world_size];
+  for (int i = 0; i < nelemsPerGPU * world_size; i++) {
    int val = i + 1;
-    if (i / nelemsPerGPU == rank){
+    if (i / nelemsPerGPU == rank) {
      data_h[i] = val;
    } else {
      data_h[i] = 0;
@@ -177,7 +187,8 @@ int main(int argc, const char *argv[])

  mscclppDevConn_t devConns[16];
  for (int r = 0; r < world_size; ++r) {
-    if (r == rank) continue;
+    if (r == rank)
+      continue;
    mscclppTransport_t transportType;
    const char* ibDev = NULL;
    transportType = mscclppTransportP2P;
@@ -185,12 +196,14 @@ int main(int argc, const char *argv[])
    MSCCLPPCHECK(mscclppConnect(comm, &devConns[r], r, 0, data_d, data_size, flag_d, transportType, ibDev));
  }
  for (int r = 0; r < world_size; ++r) {
-    if (r == rank) continue;
+    if (r == rank)
+      continue;
    mscclppTransport_t transportType;
    const char* ibDev = ibDevStr.c_str();
    transportType = mscclppTransportIB;
    // Connect with all other ranks
-    MSCCLPPCHECK(mscclppConnect(comm, &devConns[r+world_size], r, 0, data_d, data_size, flag_d, transportType, ibDev));
+    MSCCLPPCHECK(
+      mscclppConnect(comm, &devConns[r + world_size], r, 0, data_d, data_size, flag_d, transportType, ibDev));
  }

  MSCCLPPCHECK(mscclppConnectionSetup(comm));
@@ -202,16 +215,15 @@ int main(int argc, const char *argv[])
  cudaStream_t stream;
  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

-
  CUDACHECK(cudaDeviceSynchronize());
-  kernel<<<1, 32 * 2*(world_size - 1), 0, stream>>>(rank, world_size, nelemsPerGPU);
+  kernel<<<1, 32 * 2 * (world_size - 1), 0, stream>>>(rank, world_size, nelemsPerGPU);
  CUDACHECK(cudaDeviceSynchronize());
  CUDACHECK(cudaMemcpy(data_h, data_d, data_size, cudaMemcpyDeviceToHost));
  CUDACHECK(cudaDeviceSynchronize());

-  for (int i = 0; i < nelemsPerGPU*world_size; i++){
+  for (int i = 0; i < nelemsPerGPU * world_size; i++) {
    int val = i + 1;
-    if (data_h[i] != val){
+    if (data_h[i] != val) {
      printf("oh uh things went wrong! data_h[%d] (%d) != val (%d)\n", i, data_h[i], val);
      break;
    }
@@ -219,11 +231,11 @@ int main(int argc, const char *argv[])
  int tmp[16];
  MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int)));

-//   // Perf test
-//   cudaEvent_t ev_start;
-//   cudaEvent_t ev_end;
-//   CUDACHECK(cudaEventCreate(&ev_start));
-//   CUDACHECK(cudaEventCreate(&ev_end));
+  //   // Perf test
+  //   cudaEvent_t ev_start;
+  //   cudaEvent_t ev_end;
+  //   CUDACHECK(cudaEventCreate(&ev_start));
+  //   CUDACHECK(cudaEventCreate(&ev_end));

  // warm up
  // int warmupiter = 1000;
@@ -239,33 +251,34 @@ int main(int argc, const char *argv[])
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  int cudagraphiter = 10;
  for (int i = 0; i < cudagraphiter; ++i) {
-  	kernel<<<1, 32 * 2*(world_size - 1), 0, stream>>>(rank, world_size, nelemsPerGPU);
+    kernel<<<1, 32 * 2 * (world_size - 1), 0, stream>>>(rank, world_size, nelemsPerGPU);
  }
  cudaStreamEndCapture(stream, &graph);
  cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);

  int cudagraphwarmup = 10;
  for (int i = 0; i < cudagraphwarmup; ++i) {
-	  cudaGraphLaunch(instance, stream);
+    cudaGraphLaunch(instance, stream);
  }
  CUDACHECK(cudaStreamSynchronize(stream));

-  // measure runtime 
-//  CUDACHECK(cudaEventRecord(ev_start, stream));
+  // measure runtime
+  //  CUDACHECK(cudaEventRecord(ev_start, stream));
  double t0 = getTime();
  int cudagraphlaunch = 10;
  for (int i = 0; i < cudagraphlaunch; ++i) {
-  // kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
-     cudaGraphLaunch(instance, stream);
+    // kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
+    cudaGraphLaunch(instance, stream);
  }
-//  CUDACHECK(cudaEventRecord(ev_end, stream));
+  //  CUDACHECK(cudaEventRecord(ev_end, stream));
  CUDACHECK(cudaStreamSynchronize(stream));

  double t1 = getTime();
-  float ms = (t1-t0)*1000.0;
-//  CUDACHECK(cudaEventElapsedTime(&ms, ev_start, ev_end));
-  double time_in_us = ms * 1000. / (float) cudagraphlaunch / (float) cudagraphiter;
-  printf("rank: %d, time: %f us/iter algBW %f\n", rank, time_in_us, (double) (data_size) / 1024./1024./1024./(time_in_us/1e6));
+  float ms = (t1 - t0) * 1000.0;
+  //  CUDACHECK(cudaEventElapsedTime(&ms, ev_start, ev_end));
+  double time_in_us = ms * 1000. / (float)cudagraphlaunch / (float)cudagraphiter;
+  printf("rank: %d, time: %f us/iter algBW %f\n", rank, time_in_us,
+         (double)(data_size) / 1024. / 1024. / 1024. / (time_in_us / 1e6));

  MSCCLPPCHECK(mscclppBootstrapAllGather(comm, tmp, sizeof(int)));
  MSCCLPPCHECK(mscclppProxyStop(comm));
--- a/tests/allreduce_allpairs_test.cu
+++ b/tests/allreduce_allpairs_test.cu
@@ -1,45 +1,49 @@
 #include "mscclpp.h"
+#include <cuda/barrier>
 #include <tuple>
 #include <vector>
-#include <cuda/barrier>

 #include "common.h"

-#define MSCCLPPCHECK(call) do { \
-  mscclppResult_t res = call; \
-  if (res != mscclppSuccess && res != mscclppInProgress) { \
-  /* Print the back trace*/ \
-  printf("Failure at %s:%d -> %d\n", __FILE__, __LINE__, res);    \
-  return res; \
-  } \
-} while (0);
+#define MSCCLPPCHECK(call)                                                                                             \
+  do {                                                                                                                 \
+    mscclppResult_t res = call;                                                                                        \
+    if (res != mscclppSuccess && res != mscclppInProgress) {                                                           \
+      /* Print the back trace*/                                                                                        \
+      printf("Failure at %s:%d -> %d\n", __FILE__, __LINE__, res);                                                     \
+      return res;                                                                                                      \
+    }                                                                                                                  \
+  } while (0);

-#define CUDACHECK(cmd) do { \
-  cudaError_t err = cmd; \
-  if( err != cudaSuccess ) { \
-    printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
-    exit(EXIT_FAILURE); \
-  } \
-} while(false)
+#define CUDACHECK(cmd)                                                                                                 \
+  do {                                                                                                                 \
+    cudaError_t err = cmd;                                                                                             \
+    if (err != cudaSuccess) {                                                                                          \
+      printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err));                                \
+      exit(EXIT_FAILURE);                                                                                              \
+    }                                                                                                                  \
+  } while (false)

-struct Volume {
+struct Volume
+{
  size_t offset;
  size_t size;
 };

-__host__ __device__ Volume chunkVolume(size_t totalSize, size_t totalChunks, size_t chunkIdx, size_t chunkCount) {
+__host__ __device__ Volume chunkVolume(size_t totalSize, size_t totalChunks, size_t chunkIdx, size_t chunkCount)
+{
  size_t remainder = totalSize % totalChunks;
  size_t smallChunk = totalSize / totalChunks;
  size_t largeChunk = smallChunk + 1;
  size_t numLargeChunks = chunkIdx < remainder ? remainder - chunkIdx : 0;
  size_t numSmallChunks = chunkCount - numLargeChunks;
-  size_t offset = (remainder - numLargeChunks) * largeChunk +
-                  (chunkIdx > remainder ? chunkIdx - remainder : 0) * smallChunk;
+  size_t offset =
+    (remainder - numLargeChunks) * largeChunk + (chunkIdx > remainder ? chunkIdx - remainder : 0) * smallChunk;
  return Volume{offset, numLargeChunks * largeChunk + numSmallChunks * smallChunk};
 }

-template<class T, void (*reduce)(T*,T*,size_t)>
-struct AllreduceAllpairs {
+template <class T, void (*reduce)(T*, T*, size_t)> struct AllreduceAllpairs
+{
  int rank;
  int nRanks;
  T* userData;
@@ -50,7 +54,8 @@ struct AllreduceAllpairs {
  uint64_t* connFlags;
  cuda::barrier<cuda::thread_scope_device>* barrier;

-  __device__ void run(int idx) {
+  __device__ void run(int idx)
+  {
    int myPeer = peerRank(idx, rank);
    mscclppDevConn_t phase1SendConn = conns[phase1SendConnIdx(myPeer)];
    mscclppDevConn_t phase1RecvConn = conns[phase1RecvConnIdx(myPeer)];
@@ -92,59 +97,70 @@ struct AllreduceAllpairs {
    Volume srcVolume2 = chunkVolume(userSize, nRanks, rank, 1);
    send(phase2Conn, srcVolume2.offset, srcVolume2.offset, srcVolume2.size);
    recv(phase2Conn);
-
  }

-  __device__ void send(mscclppDevConn_t& conn, size_t srcOffset, size_t dstOffset, size_t size) {
+  __device__ void send(mscclppDevConn_t& conn, size_t srcOffset, size_t dstOffset, size_t size)
+  {
    if (threadIdx.x == 0) {
-      volatile uint64_t *localFlag = conn.localFlag;
+      volatile uint64_t* localFlag = conn.localFlag;
      *localFlag = 1; // 1 is used to signal the send

      mscclppTrigger_t trigger;
      auto request = conn.fifo.getTrigger(&trigger);
-      conn.fifo.setTrigger(trigger, mscclppData | mscclppFlag, srcOffset * sizeof(T), dstOffset * sizeof(T), size * sizeof(T));
+      conn.fifo.setTrigger(trigger, mscclppData | mscclppFlag, srcOffset * sizeof(T), dstOffset * sizeof(T),
+                           size * sizeof(T));
    }
    __syncthreads();
  }

-  __device__ void recv(mscclppDevConn_t& conn) {
+  __device__ void recv(mscclppDevConn_t& conn)
+  {
    if (threadIdx.x == 0) {
-      volatile uint64_t *proxyFlag = conn.proxyFlag;
-      while (*proxyFlag != 1) {}
+      volatile uint64_t* proxyFlag = conn.proxyFlag;
+      while (*proxyFlag != 1) {
+      }
      *proxyFlag = 0;
    }
    __syncthreads();
  }

-  __host__ __device__ int numPeers() {
+  __host__ __device__ int numPeers()
+  {
    return nRanks - 1;
  }

-  __host__ __device__ int numBlocks() {
+  __host__ __device__ int numBlocks()
+  {
    return numPeers();
  }

-  __host__ __device__ int peerIdx(int peerRank, int myRank) {
+  __host__ __device__ int peerIdx(int peerRank, int myRank)
+  {
    return peerRank < myRank ? peerRank : peerRank - 1;
  }

-  __host__ __device__ int peerRank(int peerIdx, int myRank) {
+  __host__ __device__ int peerRank(int peerIdx, int myRank)
+  {
    return peerIdx < myRank ? peerIdx : peerIdx + 1;
  }

-  __host__ __device__ int phase1SendConnIdx(int peerRank) {
+  __host__ __device__ int phase1SendConnIdx(int peerRank)
+  {
    return peerIdx(peerRank, rank) * 3;
  }

-  __host__ __device__ int phase1RecvConnIdx(int peerRank) {
+  __host__ __device__ int phase1RecvConnIdx(int peerRank)
+  {
    return peerIdx(peerRank, rank) * 3 + 1;
  }

-  __host__ __device__ int phase2ConnIdx(int peerRank) {
+  __host__ __device__ int phase2ConnIdx(int peerRank)
+  {
    return peerIdx(peerRank, rank) * 3 + 2;
  }

-  void freeGPUResources() {
+  void freeGPUResources()
+  {
    if (scratch)
      CUDACHECK(cudaFree(scratch));
    scratch = nullptr;
@@ -160,16 +176,16 @@ struct AllreduceAllpairs {
  }
 };

-// The builder class encapsulates the 
-template<class T, void (*reduce)(T*,T*,size_t)>
-class AllreduceAllpairsBuilder {
+// The builder class encapsulates the
+template <class T, void (*reduce)(T*, T*, size_t)> class AllreduceAllpairsBuilder
+{
  AllreduceAllpairs<T, reduce> d;
  std::vector<mscclppDevConn_t> hostConns;

 public:
-
  // The constructor is called after the user has allocated the buffer to be allreduced
-  AllreduceAllpairsBuilder(T* data, size_t size) {
+  AllreduceAllpairsBuilder(T* data, size_t size)
+  {
    d.userData = data;
    d.userSize = size;
    d.scratch = nullptr;
@@ -179,7 +195,8 @@ public:
  }

  // connect is called after rank initialization but before connection setup
-  mscclppResult_t connect(mscclppComm_t comm) {
+  mscclppResult_t connect(mscclppComm_t comm)
+  {
    MSCCLPPCHECK(mscclppCommRank(comm, &d.rank));
    MSCCLPPCHECK(mscclppCommSize(comm, &d.nRanks));

@@ -195,47 +212,55 @@ public:
      if (peer != d.rank) {
        int sendTag = d.rank < peer ? 0 : 1;
        int recvTag = d.rank < peer ? 1 : 0;
-        MSCCLPPCHECK(mscclppConnect(comm, hostConns.data() + d.phase1SendConnIdx(peer), peer, d.userData, d.userSize * sizeof(T), d.connFlags + 0, sendTag, mscclppTransportP2P, nullptr));
-        MSCCLPPCHECK(mscclppConnect(comm, hostConns.data() + d.phase1RecvConnIdx(peer), peer, d.scratch, d.scratchSize * sizeof(T), d.connFlags + 1, recvTag, mscclppTransportP2P, nullptr));
-        MSCCLPPCHECK(mscclppConnect(comm, hostConns.data() + d.phase2ConnIdx(peer), peer, d.userData, d.userSize * sizeof(T), d.connFlags + 2, 2, mscclppTransportP2P, nullptr));
+        MSCCLPPCHECK(mscclppConnect(comm, hostConns.data() + d.phase1SendConnIdx(peer), peer, d.userData,
+                                    d.userSize * sizeof(T), d.connFlags + 0, sendTag, mscclppTransportP2P, nullptr));
+        MSCCLPPCHECK(mscclppConnect(comm, hostConns.data() + d.phase1RecvConnIdx(peer), peer, d.scratch,
+                                    d.scratchSize * sizeof(T), d.connFlags + 1, recvTag, mscclppTransportP2P, nullptr));
+        MSCCLPPCHECK(mscclppConnect(comm, hostConns.data() + d.phase2ConnIdx(peer), peer, d.userData,
+                                    d.userSize * sizeof(T), d.connFlags + 2, 2, mscclppTransportP2P, nullptr));
      }
    }

    return mscclppSuccess;
  }

-  // finishSetup is called after connection setup and returns an algorithm object that is ready to be passed to a GPU kernel
-  AllreduceAllpairs<T, reduce> finishSetup() {
+  // finishSetup is called after connection setup and returns an algorithm object that is ready to be passed to a GPU
+  // kernel
+  AllreduceAllpairs<T, reduce> finishSetup()
+  {
    CUDACHECK(cudaMalloc(&d.conns, hostConns.size() * sizeof(mscclppDevConn_t)));
-    CUDACHECK(cudaMemcpy(d.conns, hostConns.data(), hostConns.size() * sizeof(mscclppDevConn_t), cudaMemcpyHostToDevice));
+    CUDACHECK(
+      cudaMemcpy(d.conns, hostConns.data(), hostConns.size() * sizeof(mscclppDevConn_t), cudaMemcpyHostToDevice));
    CUDACHECK(cudaMalloc(&d.barrier, sizeof(cuda::barrier<cuda::thread_scope_device>)));
    cuda::barrier<cuda::thread_scope_device> initBarrier(d.numBlocks());
-    CUDACHECK(cudaMemcpy(d.barrier, &initBarrier, sizeof(cuda::barrier<cuda::thread_scope_device>), cudaMemcpyHostToDevice));
+    CUDACHECK(
+      cudaMemcpy(d.barrier, &initBarrier, sizeof(cuda::barrier<cuda::thread_scope_device>), cudaMemcpyHostToDevice));
    return d;
  }
 };

-template<class T>
-__device__ void reduceSum(T* dst, T* src, size_t size) {
+template <class T> __device__ void reduceSum(T* dst, T* src, size_t size)
+{
  for (int i = threadIdx.x; i < size; i += blockDim.x) {
    dst[i] += src[i];
  }
 }

-template<class T>
-__global__ void init(T* data, size_t size, int rank) {
+template <class T> __global__ void init(T* data, size_t size, int rank)
+{
  for (int i = threadIdx.x; i < size; i += blockDim.x) {
    data[i] = rank;
  }
 }

 // The main test kernel
-template<class T>
-__global__ void testKernel(AllreduceAllpairs<T, reduceSum> d) {
+template <class T> __global__ void testKernel(AllreduceAllpairs<T, reduceSum> d)
+{
  d.run(blockIdx.x);
 }

-int main(int argc, const char *argv[]) {
+int main(int argc, const char* argv[])
+{
 #ifdef MSCCLPP_USE_MPI_FOR_TESTS
  MPI_Init(NULL, NULL);
 #endif
@@ -246,14 +271,14 @@ int main(int argc, const char *argv[]) {
  CUDACHECK(cudaSetDevice(rank));

  // Allocate and initialize 1 MB of data
-  int *data;
+  int* data;
  size_t dataSize = 1024 * 1024 / sizeof(int);
  CUDACHECK(cudaMalloc(&data, dataSize * sizeof(int)));
  init<<<1, 256>>>(data, dataSize, rank);
-  
+
  // Create the collective
  AllreduceAllpairsBuilder<int, reduceSum> builder(data, dataSize);
-  
+
  // Create the communicator
  mscclppComm_t comm;
  MSCCLPPCHECK(mscclppCommInitRank(&comm, world_size, rank, ip_port));
@@ -268,7 +293,7 @@ int main(int argc, const char *argv[]) {

  // Run the collective
  testKernel<<<allreduce.numBlocks(), 256>>>(allreduce);
-  
+
  // Wait for kernel to finish
  CUDACHECK(cudaDeviceSynchronize());

--- a/tests/bootstrap_test.cc
+++ b/tests/bootstrap_test.cc
@@ -4,19 +4,20 @@
 #endif
 #include <stdio.h>
 #include <stdlib.h>
-#include <unistd.h>
 #include <string>
+#include <unistd.h>

-#define MSCCLPPCHECK(call) do { \
-  mscclppResult_t res = call; \
-  if (res != mscclppSuccess && res != mscclppInProgress) { \
-    /* Print the back trace*/ \
-    printf("Failure at %s:%d -> %d\n", __FILE__, __LINE__, res);    \
-    return res; \
-  } \
-} while (0);
+#define MSCCLPPCHECK(call)                                                                                             \
+  do {                                                                                                                 \
+    mscclppResult_t res = call;                                                                                        \
+    if (res != mscclppSuccess && res != mscclppInProgress) {                                                           \
+      /* Print the back trace*/                                                                                        \
+      printf("Failure at %s:%d -> %d\n", __FILE__, __LINE__, res);                                                     \
+      return res;                                                                                                      \
+    }                                                                                                                  \
+  } while (0);

-void print_usage(const char *prog)
+void print_usage(const char* prog)
 {
 #ifdef MSCCLPP_USE_MPI_FOR_TESTS
  std::string st = "you are using MPI for this test\n";
@@ -33,7 +34,7 @@ void print_usage(const char *prog)
 #endif
 }

-int main(int argc, const char *argv[])
+int main(int argc, const char* argv[])
 {
  if (argc >= 2 && (std::string(argv[1]) == "-h" || std::string(argv[1]) == "--help")) {
    print_usage(argv[0]);
@@ -48,7 +49,7 @@ int main(int argc, const char *argv[])
  MPI_Init(NULL, NULL);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
-  const char *ip_port;
+  const char* ip_port;
  if (argc == 2)
    ip_port = argv[1];
  else
@@ -58,7 +59,7 @@ int main(int argc, const char *argv[])
    print_usage(argv[0]);
    return -1;
  }
-  const char *ip_port = argv[1];
+  const char* ip_port = argv[1];
  rank = atoi(argv[2]);
  world_size = atoi(argv[3]);
 #endif
@@ -70,7 +71,8 @@ int main(int argc, const char *argv[])
  } else {
 #ifdef MSCCLPP_USE_MPI_FOR_TESTS
    mscclppUniqueId id;
-    if (rank == 0) MSCCLPPCHECK(mscclppGetUniqueId(&id));
+    if (rank == 0)
+      MSCCLPPCHECK(mscclppGetUniqueId(&id));
    MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
    MSCCLPPCHECK(mscclppCommInitRankFromId(&comm, world_size, id, rank));
 #else
@@ -80,7 +82,7 @@ int main(int argc, const char *argv[])
  }

  // allocate some test buffer
-  int *buf = (int *)calloc(world_size, sizeof(int));
+  int* buf = (int*)calloc(world_size, sizeof(int));
  if (buf == nullptr) {
    printf("calloc failed\n");
    return -1;
@@ -101,7 +103,7 @@ int main(int argc, const char *argv[])
  MSCCLPPCHECK(mscclppCommDestroy(comm));

 #ifdef MSCCLPP_USE_MPI_FOR_TESTS
-    MPI_Finalize();
+  MPI_Finalize();
 #endif

  printf("Rank %d Succeeded\n", rank);
--- a/tests/common.h
+++ b/tests/common.h
@@ -8,7 +8,7 @@
 #include "mpi.h"
 #endif // MSCCLPP_USE_MPI_FOR_TESTS

-void print_usage(const char *prog)
+void print_usage(const char* prog)
 {
 #ifdef MSCCLPP_USE_MPI_FOR_TESTS
  printf("usage: %s IP:PORT [rank nranks]\n", prog);
@@ -17,7 +17,8 @@ void print_usage(const char *prog)
 #endif
 }

-void parse_arguments(int argc, const char *argv[], const char** ip_port, int* rank, int* world_size) {
+void parse_arguments(int argc, const char* argv[], const char** ip_port, int* rank, int* world_size)
+{
 #ifdef MSCCLPP_USE_MPI_FOR_TESTS
  if (argc != 2 && argc != 4) {
    print_usage(argv[0]);
--- a/tests/p2p_test.cu
+++ b/tests/p2p_test.cu
@@ -1,8 +1,8 @@
 #include "mscclpp.h"
 #include <stdio.h>
 #include <stdlib.h>
-#include <unistd.h>
 #include <string>
+#include <unistd.h>

 #include "common.h"

@@ -10,23 +10,25 @@
 #define USE_DMA_FOR_P2P 1
 #define TEST_CONN_TYPE 0 // 0: P2P(for local)+IB(for remote), 1: IB-Only

-#define MSCCLPPCHECK(call) do { \
-  mscclppResult_t res = call; \
-  if (res != mscclppSuccess && res != mscclppInProgress) { \
-    /* Print the back trace*/ \
-    printf("Failure at %s:%d -> %d\n", __FILE__, __LINE__, res);    \
-    return res; \
-  } \
-} while (0);
+#define MSCCLPPCHECK(call)                                                                                             \
+  do {                                                                                                                 \
+    mscclppResult_t res = call;                                                                                        \
+    if (res != mscclppSuccess && res != mscclppInProgress) {                                                           \
+      /* Print the back trace*/                                                                                        \
+      printf("Failure at %s:%d -> %d\n", __FILE__, __LINE__, res);                                                     \
+      return res;                                                                                                      \
+    }                                                                                                                  \
+  } while (0);

 // Check CUDA RT calls
-#define CUDACHECK(cmd) do {                                   \
-    cudaError_t err = cmd;                                    \
-    if( err != cudaSuccess ) {                                \
-        printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
-        exit(EXIT_FAILURE);                                   \
-    }                                                         \
-} while(false)
+#define CUDACHECK(cmd)                                                                                                 \
+  do {                                                                                                                 \
+    cudaError_t err = cmd;                                                                                             \
+    if (err != cudaSuccess) {                                                                                          \
+      printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err));                                \
+      exit(EXIT_FAILURE);                                                                                              \
+    }                                                                                                                  \
+  } while (false)

 // Measure current time in second.
 static double getTime(void)
@@ -43,17 +45,18 @@ __constant__ mscclppDevConn_t constDevConns[16];

 __global__ void kernel(int rank, int world_size)
 {
-  if (threadIdx.x % 32 != 0) return;
+  if (threadIdx.x % 32 != 0)
+    return;

  int warpId = threadIdx.x / 32;
  int remoteRank = (warpId < rank) ? warpId : warpId + 1;
  mscclppDevConn_t devConn = constDevConns[remoteRank];
-  volatile int *data = (volatile int *)devConn.localBuff;
-  volatile uint64_t *localFlag = devConn.localFlag;
+  volatile int* data = (volatile int*)devConn.localBuff;
+  volatile uint64_t* localFlag = devConn.localFlag;
 #if (USE_DMA_FOR_P2P == 0)
-  volatile uint64_t *remoteFlag = devConn.remoteFlag;
+  volatile uint64_t* remoteFlag = devConn.remoteFlag;
 #endif
-  volatile uint64_t *proxyFlag = devConn.proxyFlag;
+  volatile uint64_t* proxyFlag = devConn.proxyFlag;

  uint64_t baseFlag = *localFlag;

@@ -83,7 +86,8 @@ __global__ void kernel(int rank, int world_size)
  devConn.fifo.sync(req);

  // Wait for receiving data from remote rank
-  while (*proxyFlag == baseFlag) {}
+  while (*proxyFlag == baseFlag) {
+  }

 #else // USE_DMA_FOR_P2P == 0

@@ -95,13 +99,15 @@ __global__ void kernel(int rank, int world_size)
    devConn.setTrigger(trig, mscclppFlag | mscclppData, rank * sizeof(int), sizeof(int));

    // Wait for receiving data from remote rank
-    while (*proxyFlag == baseFlag) {}
+    while (*proxyFlag == baseFlag) {
+    }
  } else { // P2P
    // Directly read data
-    volatile int *remoteData = (volatile int *)devConn.remoteBuff;
+    volatile int* remoteData = (volatile int*)devConn.remoteBuff;

    // Wait until the remote data is set
-    while (*remoteFlag == baseFlag) {}
+    while (*remoteFlag == baseFlag) {
+    }

    // Read remote data
    data[remoteRank] = remoteData[remoteRank];
@@ -146,7 +152,7 @@ int cudaNumToIbNum(int cudaNum)
  return ibNum;
 }

-int main(int argc, const char *argv[])
+int main(int argc, const char* argv[])
 {
 #ifdef MSCCLPP_USE_MPI_FOR_TESTS
  MPI_Init(NULL, NULL);
@@ -165,8 +171,8 @@ int main(int argc, const char *argv[])
  mscclppComm_t comm;
  MSCCLPPCHECK(mscclppCommInitRank(&comm, world_size, rank, ip_port));

-  int *data_d;
-  uint64_t *flag_d;
+  int* data_d;
+  uint64_t* flag_d;
  size_t data_size = sizeof(int) * world_size;
  CUDACHECK(cudaMalloc(&data_d, data_size));
  CUDACHECK(cudaMalloc(&flag_d, sizeof(uint64_t)));
@@ -174,9 +180,10 @@ int main(int argc, const char *argv[])
  CUDACHECK(cudaMemset(flag_d, 0, sizeof(uint64_t)));

  for (int r = 0; r < world_size; ++r) {
-    if (r == rank) continue;
+    if (r == rank)
+      continue;
    mscclppTransport_t transportType = mscclppTransportIB;
-    const char *ibDev = ibDevStr.c_str();
+    const char* ibDev = ibDevStr.c_str();
 #if (TEST_CONN_TYPE == 0) // P2P+IB
    if (rankToNode(r) == thisNode) {
      transportType = mscclppTransportP2P;
@@ -191,7 +198,7 @@ int main(int argc, const char *argv[])

  MSCCLPPCHECK(mscclppProxyLaunch(comm));

-  mscclppDevConn_t *devConns;
+  mscclppDevConn_t* devConns;
  int nCons;
  MSCCLPPCHECK(mscclppGetAllDeviceConnections(comm, &devConns, &nCons));

@@ -204,7 +211,7 @@ int main(int argc, const char *argv[])
  CUDACHECK(cudaDeviceSynchronize());

  // Read results from GPU
-  int *buf = (int *)calloc(world_size, sizeof(int));
+  int* buf = (int*)calloc(world_size, sizeof(int));
  if (buf == nullptr) {
    printf("calloc failed\n");
    return -1;
@@ -230,9 +237,9 @@ int main(int argc, const char *argv[])

  // warm up
  // int warmupiter = 10;
-//  for (int i = 0; i < warmupiter; ++i) {
-//    kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
-//  }
+  //  for (int i = 0; i < warmupiter; ++i) {
+  //    kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
+  //  }

  // cudaGraph Capture
  cudaGraph_t graph;
@@ -240,32 +247,32 @@ int main(int argc, const char *argv[])
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  int cudagraphiter = 100;
  for (int i = 0; i < cudagraphiter; ++i) {
-  	kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
+    kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
  }
  cudaStreamEndCapture(stream, &graph);
  cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);

  int cudagraphwarmup = 10;
  for (int i = 0; i < cudagraphwarmup; ++i) {
-	  cudaGraphLaunch(instance, stream);
+    cudaGraphLaunch(instance, stream);
  }
  CUDACHECK(cudaStreamSynchronize(stream));

-  // measure runtime 
-//  CUDACHECK(cudaEventRecord(ev_start, stream));
+  // measure runtime
+  //  CUDACHECK(cudaEventRecord(ev_start, stream));
  double t0 = getTime();
  int cudagraphlaunch = 10;
  for (int i = 0; i < cudagraphlaunch; ++i) {
-  // kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
-     cudaGraphLaunch(instance, stream);
+    // kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size);
+    cudaGraphLaunch(instance, stream);
  }
-//  CUDACHECK(cudaEventRecord(ev_end, stream));
+  //  CUDACHECK(cudaEventRecord(ev_end, stream));
  CUDACHECK(cudaStreamSynchronize(stream));

  double t1 = getTime();
-  float ms = (t1-t0)*1000.0;
-//  CUDACHECK(cudaEventElapsedTime(&ms, ev_start, ev_end));
-  printf("rank: %d, time: %f us/iter\n", rank, ms * 1000. / (float) cudagraphlaunch / (float) cudagraphiter);
+  float ms = (t1 - t0) * 1000.0;
+  //  CUDACHECK(cudaEventElapsedTime(&ms, ev_start, ev_end));
+  printf("rank: %d, time: %f us/iter\n", rank, ms * 1000. / (float)cudagraphlaunch / (float)cudagraphiter);

  MSCCLPPCHECK(mscclppProxyStop(comm));