Fix cpplint error in main branch (#740)

Fix the legacy cpplint error in main branch. --------- Co-authored-by: Qinghua Zhou <qinghuahzhou@microsoft.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Binyang Li <binyli@microsoft.com>
2026-04-20 06:49:29 +00:00 · 2026-02-06 01:25:12 +08:00
parent dc747b1522
commit 620378b4fb
14 changed files with 75 additions and 75 deletions
--- a/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu
+++ b/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu
@@ -9,7 +9,7 @@
 #include <sstream>

 template <typename... Args>
-void log(Args &&...args) {
+void log(Args&&... args) {
  std::stringstream ss;
  (ss << ... << args);
  ss << std::endl;
@@ -23,7 +23,7 @@ __device__ void spin_cycles(unsigned long long cycles) {
  }
 }

-__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
+__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
  if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
    for (int i = 0; i < iter; ++i) {
      devHandle->relaxedWait();
@@ -34,7 +34,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in
  }
 }

-__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
+__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
  if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
    for (int i = 0; i < iter; ++i) {
      devHandle->relaxedSignal();
@@ -88,7 +88,7 @@ int main() {
  mscclpp::Semaphore sema0(/*localSemaphoreStub*/ semaStub0, /*remoteSemaphoreStub*/ semaStub1);
  mscclpp::BaseMemoryChannel memChan0(sema0);
  mscclpp::BaseMemoryChannelDeviceHandle memChanHandle0 = memChan0.deviceHandle();
-  void *devHandle0;
+  void* devHandle0;
  MSCCLPP_CUDATHROW(cudaMalloc(&devHandle0, sizeof(mscclpp::BaseMemoryChannelDeviceHandle)));
  MSCCLPP_CUDATHROW(cudaMemcpy(devHandle0, &memChanHandle0, sizeof(memChanHandle0), cudaMemcpyHostToDevice));

@@ -98,14 +98,14 @@ int main() {
  mscclpp::Semaphore sema1(/*localSemaphoreStub*/ semaStub1, /*remoteSemaphoreStub*/ semaStub0);
  mscclpp::BaseMemoryChannel memChan1(sema1);
  mscclpp::BaseMemoryChannelDeviceHandle memChanHandle1 = memChan1.deviceHandle();
-  void *devHandle1;
+  void* devHandle1;
  MSCCLPP_CUDATHROW(cudaMalloc(&devHandle1, sizeof(mscclpp::BaseMemoryChannelDeviceHandle)));
  MSCCLPP_CUDATHROW(cudaMemcpy(devHandle1, &memChanHandle1, sizeof(memChanHandle1), cudaMemcpyHostToDevice));

  log("GPU 0: Launching gpuKernel0 ...");

  MSCCLPP_CUDATHROW(cudaSetDevice(0));
-  gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle0), iter);
+  gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle0), iter);
  MSCCLPP_CUDATHROW(cudaGetLastError());

  log("GPU 1: Launching gpuKernel1 ...");
@@ -115,7 +115,7 @@ int main() {
  MSCCLPP_CUDATHROW(cudaEventCreate(&start));
  MSCCLPP_CUDATHROW(cudaEventCreate(&end));
  MSCCLPP_CUDATHROW(cudaEventRecord(start));
-  gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle1), iter);
+  gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle1), iter);
  MSCCLPP_CUDATHROW(cudaGetLastError());
  MSCCLPP_CUDATHROW(cudaEventRecord(end));
  MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
--- a/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu
+++ b/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu
@@ -14,7 +14,7 @@
 #define PORT_NUMBER "50505"

 template <typename... Args>
-void log(Args &&...args) {
+void log(Args&&... args) {
  std::stringstream ss;
  (ss << ... << args);
  ss << std::endl;
@@ -50,7 +50,7 @@ __device__ void spin_cycles(unsigned long long cycles) {
  }
 }

-__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
+__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
  if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
    for (int i = 0; i < iter; ++i) {
      devHandle->relaxedWait();
@@ -61,7 +61,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in
  }
 }

-__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
+__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
  if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
    for (int i = 0; i < iter; ++i) {
      devHandle->relaxedSignal();
@@ -115,14 +115,14 @@ void worker(int gpuId) {

  mscclpp::BaseMemoryChannel memChan(sema);
  auto memChanHandle = memChan.deviceHandle();
-  void *devHandle;
+  void* devHandle;
  MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle)));
  MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice));

  log("GPU ", gpuId, ": Launching a GPU kernel ...");

  if (gpuId == 0) {
-    gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle), iter);
+    gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle), iter);
    MSCCLPP_CUDATHROW(cudaGetLastError());
    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
  } else {
@@ -130,7 +130,7 @@ void worker(int gpuId) {
    MSCCLPP_CUDATHROW(cudaEventCreate(&start));
    MSCCLPP_CUDATHROW(cudaEventCreate(&end));
    MSCCLPP_CUDATHROW(cudaEventRecord(start));
-    gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle), iter);
+    gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle), iter);
    MSCCLPP_CUDATHROW(cudaGetLastError());
    MSCCLPP_CUDATHROW(cudaEventRecord(end));
    MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
--- a/examples/tutorials/03-memory-channel/bidir_memory_channel.cu
+++ b/examples/tutorials/03-memory-channel/bidir_memory_channel.cu
@@ -16,7 +16,7 @@
 #define PORT_NUMBER "50505"

 template <typename... Args>
-void log(Args &&...args) {
+void log(Args&&... args) {
  std::stringstream ss;
  (ss << ... << args);
  ss << std::endl;
@@ -47,7 +47,7 @@ int wait_process(int pid) {

 __device__ mscclpp::DeviceSyncer devSyncer;

-__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
+__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid == 0) {
    devHandle->relaxedSignal();
@@ -65,7 +65,7 @@ __global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si
  }
 }

-__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
+__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid == 0) {
    devHandle->relaxedSignal();
@@ -79,7 +79,7 @@ __global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si
  devHandle->get(srcOffset, dstOffset, copyBytes, /*threadId*/ tid, /*numThreads*/ blockDim.x * gridDim.x);
 }

-__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank,
+__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank,
                                     uint32_t flag) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid == 0) {
@@ -95,7 +95,7 @@ __global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHand
  devHandle->unpackPackets(pktBufOffset, dstOffset, copyBytes, tid, blockDim.x * gridDim.x, flag);
 }

-void worker(int myRank, int gpuId, const std::string &ipPort) {
+void worker(int myRank, int gpuId, const std::string& ipPort) {
  MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
  const int remoteRank = myRank == 0 ? 1 : 0;
  const int nRanks = 2;
@@ -132,8 +132,8 @@ void worker(int myRank, int gpuId, const std::string &ipPort) {
  auto memChanHandle = memChan.deviceHandle();
  auto memPktChanHandle = memPktChan.deviceHandle();

-  void *devHandle;
-  void *devPktHandle;
+  void* devHandle;
+  void* devPktHandle;
  MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle)));
  MSCCLPP_CUDATHROW(cudaMalloc(&devPktHandle, sizeof(memPktChanHandle)));
  MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice));
@@ -145,18 +145,18 @@ void worker(int myRank, int gpuId, const std::string &ipPort) {
  std::function<void(size_t)> kernels[3];

  kernels[0] = [&](size_t copyBytes) {
-    bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devHandle),
-                                            copyBytes, myRank);
+    bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devHandle), copyBytes,
+                                            myRank);
  };

  kernels[1] = [&](size_t copyBytes) {
-    bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devHandle),
-                                            copyBytes, myRank);
+    bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devHandle), copyBytes,
+                                            myRank);
  };

  kernels[2] = [&](size_t copyBytes) {
    static uint32_t flag = 1;
-    bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devPktHandle),
+    bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devPktHandle),
                                                  copyBytes, myRank, flag++);
  };

@@ -215,7 +215,7 @@ void worker(int myRank, int gpuId, const std::string &ipPort) {
  bootstrap->barrier();
 }

-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
  if (argc == 1) {
    int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); });
    int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); });
@@ -241,7 +241,7 @@ int main(int argc, char **argv) {
    try {
      rank = std::stoi(argv[2]);
      gpuId = std::stoi(argv[3]);
-    } catch (const std::exception &) {
+    } catch (const std::exception&) {
      log("Error: rank and gpu_id must be valid integers.");
      return -1;
    }
--- a/examples/tutorials/04-port-channel/bidir_port_channel.cu
+++ b/examples/tutorials/04-port-channel/bidir_port_channel.cu
@@ -16,7 +16,7 @@
 #define PORT_NUMBER "50505"

 template <typename... Args>
-void log(Args &&...args) {
+void log(Args&&... args) {
  std::stringstream ss;
  (ss << ... << args);
  ss << std::endl;
@@ -45,7 +45,7 @@ int wait_process(int pid) {
  return -1;
 }

-__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
+__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid == 0) {
    devHandle->signal();
@@ -58,7 +58,7 @@ __global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size
  }
 }

-void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport transport) {
+void worker(int rank, int gpuId, const std::string& ipPort, mscclpp::Transport transport) {
  MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
  const int myRank = rank;
  const int remoteRank = myRank == 0 ? 1 : 0;
@@ -90,7 +90,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t

  auto portChanHandle = portChan.deviceHandle();

-  void *devHandle;
+  void* devHandle;
  MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(portChanHandle)));
  MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &portChanHandle, sizeof(portChanHandle), cudaMemcpyHostToDevice));

@@ -100,7 +100,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t
  std::function<void(size_t)> kernels[1];

  kernels[0] = [&](size_t copyBytes) {
-    bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast<mscclpp::PortChannelDeviceHandle *>(devHandle), copyBytes,
+    bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast<mscclpp::PortChannelDeviceHandle*>(devHandle), copyBytes,
                                        myRank);
  };

@@ -166,7 +166,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t
  bootstrap->barrier();
 }

-mscclpp::Transport parseTransport(const std::string &transportStr) {
+mscclpp::Transport parseTransport(const std::string& transportStr) {
  if (transportStr == "CudaIpc") return mscclpp::Transport::CudaIpc;
  if (transportStr == "IB0") return mscclpp::Transport::IB0;
  if (transportStr == "IB1") return mscclpp::Transport::IB1;
@@ -180,7 +180,7 @@ mscclpp::Transport parseTransport(const std::string &transportStr) {
  throw std::runtime_error("Unknown transport: " + transportStr);
 }

-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
  if (argc == 1) {
    int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); });
    int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); });
--- a/include/mscclpp/assert_device.hpp
+++ b/include/mscclpp/assert_device.hpp
@@ -19,11 +19,11 @@
 #else  // defined(DEBUG_BUILD)

 #if defined(MSCCLPP_DEVICE_HIP)
-extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
-                                         const char *__function);
+extern "C" __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line,
+                                         const char* __function);
 #else   // !defined(MSCCLPP_DEVICE_HIP)
-extern "C" __host__ __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
-                                                  const char *__function) __THROW;
+extern "C" __host__ __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line,
+                                                  const char* __function) __THROW;
 #endif  // !defined(MSCCLPP_DEVICE_HIP)

 /// Assert a condition on the device and print a message if the condition is false.
--- a/python/csrc/error_py.cpp
+++ b/python/csrc/error_py.cpp
@@ -11,17 +11,17 @@ using namespace mscclpp;

 #define REGISTER_EXCEPTION_TRANSLATOR(name_)                                                                         \
  nb::register_exception_translator(                                                                                 \
-      [](const std::exception_ptr &p, void *payload) {                                                               \
+      [](const std::exception_ptr& p, void* payload) {                                                               \
        try {                                                                                                        \
          std::rethrow_exception(p);                                                                                 \
-        } catch (const name_ &e) {                                                                                   \
-          PyErr_SetObject(reinterpret_cast<PyObject *>(payload),                                                     \
+        } catch (const name_& e) {                                                                                   \
+          PyErr_SetObject(reinterpret_cast<PyObject*>(payload),                                                      \
                          PyTuple_Pack(2, PyLong_FromLong(long(e.getErrorCode())), PyUnicode_FromString(e.what()))); \
        }                                                                                                            \
      },                                                                                                             \
      m.attr(#name_).ptr());

-void register_error(nb::module_ &m) {
+void register_error(nb::module_& m) {
  nb::enum_<ErrorCode>(m, "CppErrorCode")
      .value("SystemError", ErrorCode::SystemError)
      .value("InternalError", ErrorCode::InternalError)
--- a/python/csrc/npkit_py.cpp
+++ b/python/csrc/npkit_py.cpp
@@ -8,7 +8,7 @@

 namespace nb = nanobind;

-void register_npkit(nb::module_ &m) {
+void register_npkit(nb::module_& m) {
  nb::module_ sub_m = m.def_submodule("cpp_npkit", "NPKit functions");
  sub_m.def("init", &NpKit::Init);
  sub_m.def("dump", &NpKit::Dump);
--- a/python/csrc/numa_py.cpp
+++ b/python/csrc/numa_py.cpp
@@ -6,7 +6,7 @@ int getDeviceNumaNode(int cudaDev);
 void numaBind(int node);
 };  // namespace mscclpp

-void register_numa(nb::module_ &m) {
+void register_numa(nb::module_& m) {
  nb::module_ sub_m = m.def_submodule("cpp_numa", "numa functions");
  sub_m.def("get_device_numa_node", &mscclpp::getDeviceNumaNode);
  sub_m.def("numa_bind", &mscclpp::numaBind);
--- a/src/core/context.cc
+++ b/src/core/context.cc
@@ -23,14 +23,14 @@ void CudaIpcStream::setStreamIfNeeded() {
  }
 }

-void CudaIpcStream::memcpyD2D(void *dst, const void *src, size_t nbytes) {
+void CudaIpcStream::memcpyD2D(void* dst, const void* src, size_t nbytes) {
  CudaDeviceGuard deviceGuard(deviceId_);
  setStreamIfNeeded();
  MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, *stream_));
  dirty_ = true;
 }

-void CudaIpcStream::memcpyH2D(void *dst, const void *src, size_t nbytes) {
+void CudaIpcStream::memcpyH2D(void* dst, const void* src, size_t nbytes) {
  CudaDeviceGuard deviceGuard(deviceId_);
  setStreamIfNeeded();
  MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyHostToDevice, *stream_));
@@ -48,7 +48,7 @@ void CudaIpcStream::sync() {

 Context::Impl::Impl() {}

-IbCtx *Context::Impl::getIbContext(Transport ibTransport) {
+IbCtx* Context::Impl::getIbContext(Transport ibTransport) {
  // Find IB context or create it
  auto it = ibContexts_.find(ibTransport);
  if (it == ibContexts_.end()) {
@@ -70,7 +70,7 @@ MSCCLPP_API_CPP Context::Context() : pimpl_(std::make_unique<Impl>()) {}

 MSCCLPP_API_CPP Context::~Context() = default;

-MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void *ptr, size_t size, TransportFlags transports) {
+MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void* ptr, size_t size, TransportFlags transports) {
  return RegisteredMemory(std::make_shared<RegisteredMemory::Impl>(ptr, size, transports, *pimpl_));
 }

@@ -78,7 +78,7 @@ MSCCLPP_API_CPP Endpoint Context::createEndpoint(EndpointConfig config) {
  return Endpoint(std::make_shared<Endpoint::Impl>(config, *pimpl_));
 }

-MSCCLPP_API_CPP Connection Context::connect(const Endpoint &localEndpoint, const Endpoint &remoteEndpoint) {
+MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) {
  if (localEndpoint.device().type == DeviceType::GPU && localEndpoint.device().id < 0) {
    throw Error("No GPU device ID provided for local endpoint", ErrorCode::InvalidUsage);
  }
--- a/src/core/include/context.hpp
+++ b/src/core/include/context.hpp
@@ -24,9 +24,9 @@ class CudaIpcStream {
 public:
  CudaIpcStream(int deviceId);

-  void memcpyD2D(void *dst, const void *src, size_t nbytes);
+  void memcpyD2D(void* dst, const void* src, size_t nbytes);

-  void memcpyH2D(void *dst, const void *src, size_t nbytes);
+  void memcpyH2D(void* dst, const void* src, size_t nbytes);

  void sync();

@@ -44,7 +44,7 @@ struct Context::Impl {

  Impl();

-  IbCtx *getIbContext(Transport ibTransport);
+  IbCtx* getIbContext(Transport ibTransport);
  std::shared_ptr<uint64_t> getToken();
 };

--- a/src/core/include/gpu_ipc_mem.hpp
+++ b/src/core/include/gpu_ipc_mem.hpp
@@ -46,7 +46,7 @@ struct GpuIpcMemHandle {
    char handle[64];
  } fabric;

-  static void deleter(GpuIpcMemHandle *handle);
+  static void deleter(GpuIpcMemHandle* handle);

  // We make GpuIpcMemHandle trivially copyable for easy serialization,
  // and thus it cannot have explicit destructors.
@@ -61,7 +61,7 @@ struct GpuIpcMemHandle {
    using Base::Base;

    // Allow implicit conversion from Base
-    UniquePtr(Base &&other) : Base(std::move(other)) {}
+    UniquePtr(Base&& other) : Base(std::move(other)) {}
  };

  static UniquePtr create(const CUdeviceptr ptr);
@@ -70,7 +70,7 @@ struct GpuIpcMemHandle {

 using UniqueGpuIpcMemHandle = GpuIpcMemHandle::UniquePtr;

-std::ostream &operator<<(std::ostream &os, const GpuIpcMemHandle::TypeFlags &typeFlags);
+std::ostream& operator<<(std::ostream& os, const GpuIpcMemHandle::TypeFlags& typeFlags);

 static_assert(std::is_trivially_copyable_v<GpuIpcMemHandle>);

@@ -82,7 +82,7 @@ class GpuIpcMem : public std::enable_shared_from_this<GpuIpcMem> {
  /// Create a GpuIpcMem instance from a GpuIpcMemHandle.
  /// @param handle The handle to import.
  /// @return A shared_ptr to the created GpuIpcMem instance.
-  static std::shared_ptr<GpuIpcMem> create(const GpuIpcMemHandle &handle);
+  static std::shared_ptr<GpuIpcMem> create(const GpuIpcMemHandle& handle);

  ~GpuIpcMem();

@@ -102,7 +102,7 @@ class GpuIpcMem : public std::enable_shared_from_this<GpuIpcMem> {
  std::shared_ptr<void> mapMulticast(int numDevices, size_t mcOffset, CUdeviceptr bufferAddr, size_t bufferSize);

 private:
-  GpuIpcMem(const GpuIpcMemHandle &handle);
+  GpuIpcMem(const GpuIpcMemHandle& handle);

  GpuIpcMemHandle handle_;
  CUmemGenericAllocationHandle allocHandle_;
--- a/src/core/include/ibverbs_wrapper.hpp
+++ b/src/core/include/ibverbs_wrapper.hpp
@@ -12,12 +12,12 @@ namespace mscclpp {

 struct IBVerbs {
 private:
-  static void *dlsym(const std::string &symbol, bool allowReturnNull = false);
+  static void* dlsym(const std::string& symbol, bool allowReturnNull = false);

 public:
 #define REGISTER_IBV_FUNC_WITH_NAME(name__, func__)                                          \
  template <typename... Args>                                                                \
-  static inline auto(name__)(Args && ...args) {                                              \
+  static inline auto(name__)(Args && ... args) {                                             \
    static_assert(sizeof(&::func__) > 0, #func__ " is expected be a function, not a macro"); \
    static decltype(&::func__) impl = nullptr;                                               \
    if (!impl) impl = reinterpret_cast<decltype(impl)>(IBVerbs::dlsym(#func__));             \
@@ -46,7 +46,7 @@ struct IBVerbs {
  REGISTER_IBV_FUNC(ibv_wc_status_str)

  static bool isDmabufSupported();
-  static struct ibv_mr *ibv_reg_dmabuf_mr(struct ibv_pd *, uint64_t, size_t, uint64_t, int, int);
+  static struct ibv_mr* ibv_reg_dmabuf_mr(struct ibv_pd*, uint64_t, size_t, uint64_t, int, int);

  ///
  /// Below is for cases where the API (may be / is) a macro. Refer to `infiniband/verbs.h`.
@@ -57,8 +57,8 @@ struct IBVerbs {
 #else  // defined(ibv_get_device_list)
 #undef ibv_get_device_list
  REGISTER_IBV_FUNC(ibv_static_providers)
-  static inline struct ibv_device **ibv_get_device_list(int *num_devices) {
-    using FuncType = struct ibv_device **(*)(int *);
+  static inline struct ibv_device** ibv_get_device_list(int* num_devices) {
+    using FuncType = struct ibv_device** (*)(int*);
    static FuncType impl = nullptr;
    if (!impl) impl = reinterpret_cast<FuncType>(IBVerbs::dlsym("ibv_get_device_list"));
    IBVerbs::ibv_static_providers(NULL, _RDMA_STATIC_PREFIX(RDMA_STATIC_PROVIDERS), NULL);
@@ -67,21 +67,21 @@ struct IBVerbs {
 #endif  // defined(ibv_get_device_list)

 #undef ibv_query_port
-  static inline int ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) {
+  static inline int ibv_query_port(struct ibv_context* context, uint8_t port_num, struct ibv_port_attr* port_attr) {
    static decltype(&::ibv_query_port) impl = nullptr;
    if (!impl) impl = reinterpret_cast<decltype(impl)>(IBVerbs::dlsym("ibv_query_port"));
-    struct verbs_context *vctx = verbs_get_ctx_op(context, query_port);
+    struct verbs_context* vctx = verbs_get_ctx_op(context, query_port);
    if (!vctx) {
      int rc;
      ::memset(port_attr, 0, sizeof(*port_attr));
-      rc = impl(context, port_num, (struct _compat_ibv_port_attr *)port_attr);
+      rc = impl(context, port_num, (struct _compat_ibv_port_attr*)port_attr);
      return rc;
    }
    return vctx->query_port(context, port_num, port_attr, sizeof(*port_attr));
  }

 #undef ibv_reg_mr
-  static inline struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) {
+  static inline struct ibv_mr* ibv_reg_mr(struct ibv_pd* pd, void* addr, size_t length, int access) {
    static decltype(&::ibv_reg_mr) impl = nullptr;
    static decltype(&::ibv_reg_mr_iova2) impl_iova2 = nullptr;
    int is_access_const = __builtin_constant_p(((int)(access)&IBV_ACCESS_OPTIONAL_RANGE) == 0);
@@ -98,11 +98,11 @@ struct IBVerbs {
  /// Below is for cases where the API (may be / is) a static function. Refer to `infiniband/verbs.h`.
  ///

-  static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
+  static inline int ibv_post_send(struct ibv_qp* qp, struct ibv_send_wr* wr, struct ibv_send_wr** bad_wr) {
    return qp->context->ops.post_send(qp, wr, bad_wr);
  }

-  static inline int ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc) {
+  static inline int ibv_poll_cq(struct ibv_cq* cq, int num_entries, struct ibv_wc* wc) {
    return cq->context->ops.poll_cq(cq, num_entries, wc);
  }
 };
--- a/src/ext/nccl/audit-shim/audit_nccl.cc
+++ b/src/ext/nccl/audit-shim/audit_nccl.cc
@@ -8,11 +8,11 @@

 extern "C" __attribute__((visibility("default"))) unsigned int la_version(unsigned int) { return LAV_CURRENT; }

-extern "C" __attribute__((visibility("default"))) char *la_objsearch(const char *name, uintptr_t *, unsigned int) {
-  const char *library = "libmscclpp_nccl.so";
+extern "C" __attribute__((visibility("default"))) char* la_objsearch(const char* name, uintptr_t*, unsigned int) {
+  const char* library = "libmscclpp_nccl.so";
  if (strcmp(name, "libnccl.so.2") && strcmp(name, "libnccl.so") && strcmp(name, "librccl.so") &&
      strcmp(name, "librccl.so.1")) {
-    return (char *)name;
+    return (char*)name;
  }
-  return (char *)library;
+  return (char*)library;
 }
--- a/test/unit/local_channel_tests.cu
+++ b/test/unit/local_channel_tests.cu
@@ -12,10 +12,10 @@

 __constant__ mscclpp::PortChannelDeviceHandle gPortChannel;

-__global__ void kernelLocalPortChannelTest(void *dst, void *src, size_t bytes, int *ret) {
+__global__ void kernelLocalPortChannelTest(void* dst, void* src, size_t bytes, int* ret) {
  if (blockIdx.x == 0) {
    // sender
-    int *ptr = reinterpret_cast<int *>(src);
+    int* ptr = reinterpret_cast<int*>(src);
    for (size_t idx = threadIdx.x; idx < bytes / sizeof(int); idx += blockDim.x) {
      ptr[idx] = MAGIC_CONST;
    }
@@ -29,7 +29,7 @@ __global__ void kernelLocalPortChannelTest(void *dst, void *src, size_t bytes, i
      gPortChannel.wait();
    }
    __syncthreads();
-    int *ptr = reinterpret_cast<int *>(dst);
+    int* ptr = reinterpret_cast<int*>(dst);
    for (size_t idx = threadIdx.x; idx < bytes / sizeof(int); idx += blockDim.x) {
      if (ptr[idx] != MAGIC_CONST) {
        *ret = 1;  // Error: value mismatch