Remove alloc.h and beef up cuda_utils.hpp (#82)

2026-05-11 17:00:22 +00:00 · 2023-05-24 01:34:18 -07:00
parent 216373eab2
commit 457c422791
16 changed files with 129 additions and 260 deletions
--- a/include/mscclpp/cuda_utils.hpp
+++ b/include/mscclpp/cuda_utils.hpp
@@ -10,6 +10,8 @@

 namespace mscclpp {

+// A RAII guard that will cudaThreadExchangeStreamCaptureMode to cudaStreamCaptureModeRelaxed on construction and
+// restore the previous mode on destruction. This is helpful when we want to avoid CUDA graph capture.
 struct AvoidCudaGraphCaptureGuard {
  AvoidCudaGraphCaptureGuard() : mode_(cudaStreamCaptureModeRelaxed) {
    MSCCLPP_CUDATHROW(cudaThreadExchangeStreamCaptureMode(&mode_));
@@ -18,6 +20,7 @@ struct AvoidCudaGraphCaptureGuard {
  cudaStreamCaptureMode mode_;
 };

+// A RAII wrapper around cudaStream_t that will call cudaStreamDestroy on destruction.
 struct CudaStreamWithFlags {
  CudaStreamWithFlags(unsigned int flags) { MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream_, flags)); }
  ~CudaStreamWithFlags() { cudaStreamDestroy(stream_); }
@@ -48,7 +51,7 @@ T* cudaHostCalloc(size_t nelem) {
 }

 template <class T, T*(alloc)(size_t), class Deleter, class Memory>
-Memory safeMake(size_t nelem) {
+Memory safeAlloc(size_t nelem) {
  T* ptr = nullptr;
  try {
    ptr = alloc(nelem);
@@ -63,46 +66,98 @@ Memory safeMake(size_t nelem) {

 }  // namespace detail

+// A deleter that calls cudaFree for use with std::unique_ptr/std::shared_ptr.
 template <class T>
 struct CudaDeleter {
-  void operator()(T* ptr) {
+  using TPtrOrArray = std::conditional_t<std::is_array_v<T>, T, T*>;
+  void operator()(TPtrOrArray ptr) {
    AvoidCudaGraphCaptureGuard cgcGuard;
    MSCCLPP_CUDATHROW(cudaFree(ptr));
  }
 };

+// A deleter that calls cudaFreeHost for use with std::unique_ptr/std::shared_ptr.
 template <class T>
 struct CudaHostDeleter {
-  void operator()(T* ptr) {
+  using TPtrOrArray = std::conditional_t<std::is_array_v<T>, T, T*>;
+  void operator()(TPtrOrArray ptr) {
    AvoidCudaGraphCaptureGuard cgcGuard;
    MSCCLPP_CUDATHROW(cudaFreeHost(ptr));
  }
 };

+// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
 template <class T>
-std::shared_ptr<T> makeSharedCuda(size_t count = 1) {
-  return detail::safeMake<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
+std::shared_ptr<T> allocSharedCuda(size_t count = 1) {
+  return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
 }

 template <class T>
 using UniqueCudaPtr = std::unique_ptr<T, CudaDeleter<T>>;

+// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out.
 template <class T>
-UniqueCudaPtr<T> makeUniqueCuda(size_t count = 1) {
-  return detail::safeMake<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
+UniqueCudaPtr<T> allocUniqueCuda(size_t count = 1) {
+  return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
 }

+// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::shared_ptr to it.
+template <class T, typename... Args>
+std::shared_ptr<T> makeSharedCudaHost(Args&&... args) {
+  auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, std::shared_ptr<T>>(1);
+  new (ptr.get()) T(std::forward<Args>(args)...);
+  return ptr;
+}
+
+// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a
+// std::shared_ptr to it.
 template <class T>
-std::shared_ptr<T> makeSharedCudaHost(size_t count = 1) {
-  return detail::safeMake<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, std::shared_ptr<T>>(count);
+std::shared_ptr<T[]> makeSharedCudaHost(size_t count) {
+  using TElem = std::remove_extent_t<T>;
+  auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<TElem>, std::shared_ptr<T[]>>(count);
+  for (size_t i = 0; i < count; ++i) {
+    new (&ptr[i]) TElem();
+  }
+  return ptr;
 }

 template <class T>
 using UniqueCudaHostPtr = std::unique_ptr<T, CudaHostDeleter<T>>;

+// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::unique_ptr to it.
+template <class T, typename... Args, std::enable_if_t<false == std::is_array_v<T>, bool> = true>
+UniqueCudaHostPtr<T> makeUniqueCudaHost(Args&&... args) {
+  auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(1);
+  new (ptr.get()) T(std::forward<Args>(args)...);
+  return ptr;
+}
+
+// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a
+// std::unique_ptr to it.
+template <class T, std::enable_if_t<true == std::is_array_v<T>, bool> = true>
+UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count) {
+  using TElem = std::remove_extent_t<T>;
+  auto ptr = detail::safeAlloc<TElem, detail::cudaHostCalloc<TElem>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(count);
+  for (size_t i = 0; i < count; ++i) {
+    new (&ptr[i]) TElem();
+  }
+  return ptr;
+}
+
+// Asynchronous cudaMemcpy without capture into a CUDA graph.
 template <class T>
-UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count = 1) {
-  return detail::safeMake<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(count);
+void memcpyCudaAsync(T* dst, const T* src, size_t count, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream));
+}
+
+// Synchronous cudaMemcpy without capture into a CUDA graph.
+template <class T>
+void memcpyCuda(T* dst, const T* src, size_t count, cudaMemcpyKind kind = cudaMemcpyDefault) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  CudaStreamWithFlags stream(cudaStreamNonBlocking);
+  MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream));
+  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
 }

 }  // namespace mscclpp
--- a/src/bootstrap/bootstrap.cc
+++ b/src/bootstrap/bootstrap.cc
@@ -11,6 +11,7 @@
 #include <vector>

 #include "api.h"
+#include "checks.h"
 #include "checks_internal.hpp"
 #include "socket.h"
 #include "utils.h"
--- a/src/bootstrap/socket.cc
+++ b/src/bootstrap/socket.cc
@@ -6,12 +6,16 @@

 #include "socket.h"

+#include <errno.h>
 #include <ifaddrs.h>
 #include <net/if.h>
 #include <stdlib.h>
+#include <string.h>
 #include <unistd.h>

+#include "checks.h"
 #include "config.h"
+#include "debug.h"
 #include "utils.h"

 static mscclppResult_t socketProgressOpt(int op, struct mscclppSocket* sock, void* ptr, int size, int* offset,
--- a/src/epoch.cc
+++ b/src/epoch.cc
@@ -5,7 +5,7 @@
 namespace mscclpp {

 MSCCLPP_API_CPP DeviceEpoch::DeviceEpoch(Communicator& communicator, std::shared_ptr<Connection> connection)
-    : BaseEpoch(connection, makeUniqueCuda<EpochIds>(), makeUniqueCuda<uint64_t>()) {
+    : BaseEpoch(connection, allocUniqueCuda<EpochIds>(), allocUniqueCuda<uint64_t>()) {
  setup(communicator);
 }

--- a/src/fifo.cc
+++ b/src/fifo.cc
@@ -11,7 +11,7 @@
 namespace mscclpp {

 struct HostProxyFifo::Impl {
-  UniqueCudaHostPtr<ProxyTrigger> triggers;
+  UniqueCudaHostPtr<ProxyTrigger[]> triggers;
  UniqueCudaPtr<uint64_t> head;
  UniqueCudaPtr<uint64_t> tailReplica;

@@ -28,9 +28,9 @@ struct HostProxyFifo::Impl {
  CudaStreamWithFlags stream;

  Impl()
-      : triggers(makeUniqueCudaHost<ProxyTrigger>(MSCCLPP_PROXY_FIFO_SIZE)),
-        head(makeUniqueCuda<uint64_t>(1)),
-        tailReplica(makeUniqueCuda<uint64_t>(1)),
+      : triggers(makeUniqueCudaHost<ProxyTrigger[]>(MSCCLPP_PROXY_FIFO_SIZE)),
+        head(allocUniqueCuda<uint64_t>()),
+        tailReplica(allocUniqueCuda<uint64_t>()),
        hostTail(0),
        stream(cudaStreamNonBlocking) {}
 };
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -1,182 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef MSCCLPP_ALLOC_H_
-#define MSCCLPP_ALLOC_H_
-
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <unistd.h>
-
-#include "align.h"
-#include "checks.h"
-#include "mscclpp.h"
-#include "utils.h"
-
-template <typename T>
-mscclppResult_t mscclppCudaHostCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
-  mscclppResult_t result = mscclppSuccess;
-  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
-  *ptr = nullptr;
-  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  CUDACHECKGOTO(cudaHostAlloc(ptr, nelem * sizeof(T), cudaHostAllocMapped | cudaHostAllocWriteCombined), result,
-                finish);
-  memset(*ptr, 0, nelem * sizeof(T));
-finish:
-  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem * sizeof(T));
-  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
-  return result;
-}
-#define mscclppCudaHostCalloc(...) mscclppCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
-
-inline mscclppResult_t mscclppCudaHostFree(void* ptr) {
-  CUDACHECK(cudaFreeHost(ptr));
-  return mscclppSuccess;
-}
-
-template <typename T>
-mscclppResult_t mscclppCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
-  void* p = malloc(nelem * sizeof(T));
-  if (p == NULL) {
-    WARN("Failed to malloc %ld bytes", nelem * sizeof(T));
-    return mscclppSystemError;
-  }
-  INFO(MSCCLPP_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), p);
-  memset(p, 0, nelem * sizeof(T));
-  *ptr = (T*)p;
-  return mscclppSuccess;
-}
-#define mscclppCalloc(...) mscclppCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
-
-template <typename T>
-mscclppResult_t mscclppRealloc(T** ptr, size_t oldNelem, size_t nelem) {
-  if (nelem < oldNelem) return mscclppInternalError;
-  if (nelem == oldNelem) return mscclppSuccess;
-
-  T* oldp = *ptr;
-  T* p = (T*)malloc(nelem * sizeof(T));
-  if (p == NULL) {
-    WARN("Failed to malloc %ld bytes", nelem * sizeof(T));
-    return mscclppSystemError;
-  }
-  memcpy(p, oldp, oldNelem * sizeof(T));
-  free(oldp);
-  memset(p + oldNelem, 0, (nelem - oldNelem) * sizeof(T));
-  *ptr = (T*)p;
-  INFO(MSCCLPP_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem * sizeof(T), nelem * sizeof(T),
-       *ptr);
-  return mscclppSuccess;
-}
-
-template <typename T>
-mscclppResult_t mscclppCudaMallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
-  mscclppResult_t result = mscclppSuccess;
-  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
-  *ptr = nullptr;
-  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
-finish:
-  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem * sizeof(T));
-  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
-  return result;
-}
-#define mscclppCudaMalloc(...) mscclppCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
-
-template <typename T>
-mscclppResult_t mscclppCudaCallocDebug(T** ptr, size_t nelem, const char* filefunc, int line) {
-  mscclppResult_t result = mscclppSuccess;
-  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
-  *ptr = nullptr;
-  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  // Need a side stream so as not to interfere with graph capture.
-  cudaStream_t stream;
-  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-  CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
-  CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish);
-  CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
-  CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
-finish:
-  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem * sizeof(T));
-  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
-  return result;
-}
-#define mscclppCudaCalloc(...) mscclppCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
-
-template <typename T>
-mscclppResult_t mscclppCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char* filefunc,
-                                            int line) {
-  mscclppResult_t result = mscclppSuccess;
-  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
-  *ptr = nullptr;
-  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  CUDACHECKGOTO(cudaMalloc(ptr, nelem * sizeof(T)), result, finish);
-  CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem * sizeof(T), stream), result, finish);
-finish:
-  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem * sizeof(T));
-  INFO(MSCCLPP_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem * sizeof(T), *ptr);
-  return result;
-}
-#define mscclppCudaCallocAsync(...) mscclppCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__)
-
-template <typename T>
-mscclppResult_t mscclppCudaMemcpy(T* dst, T* src, size_t nelem) {
-  mscclppResult_t result = mscclppSuccess;
-  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
-  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  // Need a side stream so as not to interfere with graph capture.
-  cudaStream_t stream;
-  CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish);
-  MSCCLPPCHECKGOTO(mscclppCudaMemcpyAsync(dst, src, nelem, stream), result, finish);
-  CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
-  CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
-finish:
-  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  return result;
-}
-
-template <typename T>
-mscclppResult_t mscclppCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) {
-  mscclppResult_t result = mscclppSuccess;
-  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
-  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem * sizeof(T), cudaMemcpyDefault, stream), result, finish);
-finish:
-  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  return result;
-}
-
-template <typename T>
-mscclppResult_t mscclppCudaFree(T* ptr) {
-  mscclppResult_t result = mscclppSuccess;
-  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
-  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  CUDACHECKGOTO(cudaFree(ptr), result, finish);
-finish:
-  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  return result;
-}
-
-// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
-// allocated on separate pages as those pages will be marked DONTFORK
-// and if they are shared, that could cause a crash in a child process
-inline mscclppResult_t mscclppIbMallocDebug(void** ptr, size_t size, const char* filefunc, int line) {
-  size_t page_size = sysconf(_SC_PAGESIZE);
-  void* p;
-  int size_aligned = ROUNDUP(size, page_size);
-  int ret = posix_memalign(&p, page_size, size_aligned);
-  if (ret != 0) return mscclppSystemError;
-  memset(p, 0, size);
-  *ptr = p;
-  INFO(MSCCLPP_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr);
-  return mscclppSuccess;
-}
-#define mscclppIbMalloc(...) mscclppIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
-
-#endif
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -8,11 +8,11 @@
 #define MSCCLPP_UTILS_H_

 #include <stdint.h>
+#include <stdio.h>

 #include <chrono>

-#include "alloc.h"
-// #include "mscclpp.h"
+#include "mscclpp.h"

 // int mscclppCudaCompCap();

--- a/src/npkit/npkit.cc
+++ b/src/npkit/npkit.cc
@@ -6,49 +6,43 @@
 #include <chrono>
 #include <fstream>

-#include "alloc.h"
-
 uint64_t NpKit::rank_ = 0;

-NpKitEvent** NpKit::gpu_event_buffers_ = nullptr;
-NpKitEvent** NpKit::cpu_event_buffers_ = nullptr;
+std::vector<mscclpp::UniqueCudaPtr<NpKitEvent>> NpKit::gpu_event_buffers_;
+std::vector<std::unique_ptr<NpKitEvent[]>> NpKit::cpu_event_buffers_;

-NpKitEventCollectContext* NpKit::gpu_collect_contexts_ = nullptr;
-NpKitEventCollectContext* NpKit::cpu_collect_contexts_ = nullptr;
+mscclpp::UniqueCudaPtr<NpKitEventCollectContext> NpKit::gpu_collect_contexts_;
+std::unique_ptr<NpKitEventCollectContext[]> NpKit::cpu_collect_contexts_;
 uint64_t NpKit::cpu_base_system_timestamp_ = 0;
 uint64_t NpKit::cpu_base_steady_timestamp_ = 0;

-mscclppResult_t NpKit::Init(int rank) {
+void NpKit::Init(int rank) {
  uint64_t i = 0;
  NpKitEventCollectContext ctx;
  ctx.event_buffer_head = 0;
  rank_ = rank;

  // Init event data structures
-  MSCCLPPCHECK(mscclppCalloc(&gpu_event_buffers_, kNumGpuEventBuffers));
-  MSCCLPPCHECK(mscclppCudaCalloc(&gpu_collect_contexts_, kNumGpuEventBuffers));
+  gpu_collect_contexts_ = mscclpp::allocUniqueCuda<NpKitEventCollectContext>(kNumGpuEventBuffers);
  for (i = 0; i < kNumGpuEventBuffers; i++) {
-    MSCCLPPCHECK(mscclppCudaCalloc(gpu_event_buffers_ + i, kMaxNumGpuEventsPerBuffer));
-    ctx.event_buffer = gpu_event_buffers_[i];
-    MSCCLPPCHECK(mscclppCudaMemcpy(gpu_collect_contexts_ + i, &ctx, 1));
+    gpu_event_buffers_.emplace_back(mscclpp::allocUniqueCuda<NpKitEvent>(kMaxNumGpuEventsPerBuffer));
+    ctx.event_buffer = gpu_event_buffers_[i].get();
+    mscclpp::memcpyCuda(gpu_collect_contexts_.get() + i, &ctx, 1);
  }

-  MSCCLPPCHECK(mscclppCalloc(&cpu_event_buffers_, kNumCpuEventBuffers));
-  MSCCLPPCHECK(mscclppCalloc(&cpu_collect_contexts_, kNumCpuEventBuffers));
+  cpu_collect_contexts_ = std::make_unique<NpKitEventCollectContext[]>(kNumCpuEventBuffers);
  for (i = 0; i < kNumCpuEventBuffers; i++) {
-    MSCCLPPCHECK(mscclppCalloc(cpu_event_buffers_ + i, kMaxNumCpuEventsPerBuffer));
-    ctx.event_buffer = cpu_event_buffers_[i];
+    cpu_event_buffers_.emplace_back(std::make_unique<NpKitEvent[]>(kMaxNumCpuEventsPerBuffer));
+    ctx.event_buffer = cpu_event_buffers_[i].get();
    cpu_collect_contexts_[i] = ctx;
  }

  // Init timestamp
  cpu_base_system_timestamp_ = std::chrono::system_clock::now().time_since_epoch().count();
  cpu_base_steady_timestamp_ = std::chrono::steady_clock::now().time_since_epoch().count();
-
-  return mscclppSuccess;
 }

-mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
+void NpKit::Dump(const std::string& dump_dir) {
  uint64_t i = 0;
  std::string dump_file_path;

@@ -60,7 +54,7 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
    dump_file_path += "_channel_";
    dump_file_path += std::to_string(i);
    auto cpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
-    cpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[i]),
+    cpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[i].get()),
                         cpu_collect_contexts_[i].event_buffer_head * sizeof(NpKitEvent));
    cpu_trace_file.close();
  }
@@ -89,10 +83,10 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
    dump_file_path += std::to_string(rank_);
    dump_file_path += "_buf_";
    dump_file_path += std::to_string(i);
-    MSCCLPPCHECK(mscclppCudaMemcpy(cpu_event_buffers_[0], gpu_event_buffers_[i], kMaxNumGpuEventsPerBuffer));
-    MSCCLPPCHECK(mscclppCudaMemcpy(cpu_collect_contexts_, gpu_collect_contexts_ + i, 1));
+    mscclpp::memcpyCuda(cpu_event_buffers_[0].get(), gpu_event_buffers_[i].get(), kMaxNumGpuEventsPerBuffer);
+    mscclpp::memcpyCuda(cpu_collect_contexts_.get(), gpu_collect_contexts_.get() + i, 1);
    auto gpu_trace_file = std::fstream(dump_file_path, std::ios::out | std::ios::binary);
-    gpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[0]),
+    gpu_trace_file.write(reinterpret_cast<char*>(cpu_event_buffers_[0].get()),
                         cpu_collect_contexts_[0].event_buffer_head * sizeof(NpKitEvent));
    gpu_trace_file.close();
  }
@@ -103,37 +97,27 @@ mscclppResult_t NpKit::Dump(const std::string& dump_dir) {
  dump_file_path += std::to_string(rank_);
  cudaDeviceProp dev_prop;
  int dev;
-  CUDACHECK(cudaGetDevice(&dev));
-  CUDACHECK(cudaGetDeviceProperties(&dev_prop, dev));
+  MSCCLPP_CUDATHROW(cudaGetDevice(&dev));
+  MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&dev_prop, dev));
  std::string clock_rate_str = std::to_string(dev_prop.clockRate);
  auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out);
  gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length());
  gpu_clock_rate_file.close();
-
-  return mscclppSuccess;
 }

-mscclppResult_t NpKit::Shutdown() {
+void NpKit::Shutdown() {
  uint64_t i = 0;

  // Free CPU event data structures
-  for (i = 0; i < kNumCpuEventBuffers; i++) {
-    free(cpu_event_buffers_[i]);
-  }
-  free(cpu_event_buffers_);
-  free(cpu_collect_contexts_);
+  cpu_event_buffers_.clear();
+  cpu_collect_contexts_.reset();

  // Free GPU event data structures
-  for (i = 0; i < kNumGpuEventBuffers; i++) {
-    CUDACHECK(cudaFree(gpu_event_buffers_[i]));
-  }
-  free(gpu_event_buffers_);
-  CUDACHECK(cudaFree(gpu_collect_contexts_));
-
-  return mscclppSuccess;
+  gpu_event_buffers_.clear();
+  gpu_collect_contexts_.reset();
 }

-NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() { return gpu_collect_contexts_; }
+NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() { return gpu_collect_contexts_.get(); }

 void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id) {
  uint64_t event_buffer_head = cpu_collect_contexts_[channel_id].event_buffer_head;
--- a/src/npkit/npkit.h
+++ b/src/npkit/npkit.h
@@ -1,9 +1,10 @@
 #ifndef NPKIT_H_
 #define NPKIT_H_

+#include <mscclpp/cuda_utils.hpp>
 #include <string>
+#include <vector>

-#include "mscclpp.h"
 #include "npkit_event.h"
 #include "npkit_struct.h"

@@ -13,11 +14,11 @@ class NpKit {

  static const uint64_t kNumCpuEventBuffers = 32;

-  static mscclppResult_t Init(int rank);
+  static void Init(int rank);

-  static mscclppResult_t Dump(const std::string& dump_dir);
+  static void Dump(const std::string& dump_dir);

-  static mscclppResult_t Shutdown();
+  static void Shutdown();

  static NpKitEventCollectContext* GetGpuEventCollectContexts();

@@ -47,11 +48,11 @@ class NpKit {
  // 64K * 2 (send/recv) * (512/32) = 2M, 2M * 32 * 16B = 1GB per CPU
  static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21;

-  static NpKitEvent** gpu_event_buffers_;
-  static NpKitEvent** cpu_event_buffers_;
+  static std::vector<mscclpp::UniqueCudaPtr<NpKitEvent>> gpu_event_buffers_;
+  static std::vector<std::unique_ptr<NpKitEvent[]>> cpu_event_buffers_;

-  static NpKitEventCollectContext* gpu_collect_contexts_;
-  static NpKitEventCollectContext* cpu_collect_contexts_;
+  static mscclpp::UniqueCudaPtr<NpKitEventCollectContext> gpu_collect_contexts_;
+  static std::unique_ptr<NpKitEventCollectContext[]> cpu_collect_contexts_;

  static uint64_t cpu_base_system_timestamp_;
  static uint64_t cpu_base_steady_timestamp_;
--- a/src/registered_memory.cc
+++ b/src/registered_memory.cc
@@ -6,6 +6,7 @@

 #include "api.h"
 #include "checks_internal.hpp"
+#include "debug.h"
 #include "utils.h"

 namespace mscclpp {
--- a/src/utils.cc
+++ b/src/utils.cc
@@ -6,12 +6,16 @@

 #include "utils.h"

+#include <cuda_runtime.h>
 #include <numa.h>
 #include <stdlib.h>
+#include <unistd.h>

 #include <memory>
 #include <string>

+#include "checks.h"
+
 // Get current Compute Capability
 // int mscclppCudaCompCap() {
 //   int cudaDev;
--- a/test/allgather_test_host_offloading.cu
+++ b/test/allgather_test_host_offloading.cu
@@ -2,6 +2,7 @@
 #include <mscclpp/fifo.hpp>
 #include <mscclpp/proxy.hpp>
 #include <mscclpp/epoch.hpp>
+#include <mscclpp/checks.hpp>
 #include <utils.h>

 #ifdef MSCCLPP_USE_MPI_FOR_TESTS
@@ -392,9 +393,9 @@ int main(int argc, char* argv[])
    printf("Stopping MSCCL++ proxy threads\n");
  proxyService.stop();

-  CUDACHECK(cudaFree(data_d));
-  CUDACHECK(cudaFree(deviceHandles1));
-  CUDACHECK(cudaFree(deviceHandles2));
+  MSCCLPP_CUDATHROW(cudaFree(data_d));
+  MSCCLPP_CUDATHROW(cudaFree(deviceHandles1));
+  MSCCLPP_CUDATHROW(cudaFree(deviceHandles2));

 #ifdef MSCCLPP_USE_MPI_FOR_TESTS
  MPI_Finalize();
--- a/test/ib_test.cc
+++ b/test/ib_test.cc
@@ -34,7 +34,7 @@ int main(int argc, const char* argv[])
  CUDACHECK(cudaSetDevice(cudaDevId));

  int nelem = 1;
-  auto data = mscclpp::makeUniqueCuda<int>(nelem);
+  auto data = mscclpp::allocUniqueCuda<int>(nelem);

  std::shared_ptr<mscclpp::Bootstrap> bootstrap(new mscclpp::Bootstrap(isSend, 2));
  bootstrap->initialize(ipPortPair);
--- a/test/mscclpp-test/allgather_test.cu
+++ b/test/mscclpp-test/allgather_test.cu
@@ -200,7 +200,7 @@ class AllGatherTestEngine : public BaseTestEngine {
 };

 void AllGatherTestEngine::allocateBuffer() {
-  sendBuff_ = mscclpp::makeSharedCuda<int>(args_.maxBytes / sizeof(int));
+  sendBuff_ = mscclpp::allocSharedCuda<int>(args_.maxBytes / sizeof(int));
  expectedBuff_ = std::shared_ptr<int[]>(new int[args_.maxBytes / sizeof(int)]);
 }

--- a/test/mscclpp-test/sendrecv_test.cu
+++ b/test/mscclpp-test/sendrecv_test.cu
@@ -119,8 +119,8 @@ class SendRecvTestEngine : public BaseTestEngine {
 };

 void SendRecvTestEngine::allocateBuffer() {
-  std::shared_ptr<int> sendBuff = mscclpp::makeSharedCuda<int>(args_.maxBytes / sizeof(int));
-  std::shared_ptr<int> recvBuff = mscclpp::makeSharedCuda<int>(args_.maxBytes / sizeof(int));
+  std::shared_ptr<int> sendBuff = mscclpp::allocSharedCuda<int>(args_.maxBytes / sizeof(int));
+  std::shared_ptr<int> recvBuff = mscclpp::allocSharedCuda<int>(args_.maxBytes / sizeof(int));
  devicePtrs_.push_back(sendBuff);
  devicePtrs_.push_back(recvBuff);

--- a/test/unit/cuda_memory_tests.cc
+++ b/test/unit/cuda_memory_tests.cc
@@ -2,11 +2,11 @@
 #include <mscclpp/cuda_utils.hpp>

 TEST(CudaMemoryTest, Shared) {
-  auto p1 = mscclpp::makeSharedCuda<uint32_t>();
-  auto p2 = mscclpp::makeSharedCuda<int64_t>(5);
+  auto p1 = mscclpp::allocSharedCuda<uint32_t>();
+  auto p2 = mscclpp::allocSharedCuda<int64_t>(5);
 }

 TEST(CudaMemoryTest, Unique) {
-  auto p1 = mscclpp::makeUniqueCuda<uint32_t>();
-  auto p2 = mscclpp::makeUniqueCuda<int64_t>(5);
+  auto p1 = mscclpp::allocUniqueCuda<uint32_t>();
+  auto p2 = mscclpp::allocUniqueCuda<int64_t>(5);
 }