Remove alloc.h and beef up cuda_utils.hpp (#82)

2026-05-24 14:54:51 +00:00 · 2023-05-24 01:34:18 -07:00
parent 216373eab2
commit 457c422791
16 changed files with 129 additions and 260 deletions
--- a/include/mscclpp/cuda_utils.hpp
+++ b/include/mscclpp/cuda_utils.hpp
@@ -10,6 +10,8 @@

 namespace mscclpp {

+// A RAII guard that calls cudaThreadExchangeStreamCaptureMode to switch to cudaStreamCaptureModeRelaxed on
+// construction and restores the previous mode on destruction. This is helpful when we want to avoid CUDA graph capture.
 struct AvoidCudaGraphCaptureGuard {
  AvoidCudaGraphCaptureGuard() : mode_(cudaStreamCaptureModeRelaxed) {
    MSCCLPP_CUDATHROW(cudaThreadExchangeStreamCaptureMode(&mode_));
@@ -18,6 +20,7 @@ struct AvoidCudaGraphCaptureGuard {
  cudaStreamCaptureMode mode_;
 };

+// A RAII wrapper around cudaStream_t that will call cudaStreamDestroy on destruction.
 struct CudaStreamWithFlags {
  CudaStreamWithFlags(unsigned int flags) { MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream_, flags)); }
  ~CudaStreamWithFlags() { cudaStreamDestroy(stream_); }
@@ -48,7 +51,7 @@ T* cudaHostCalloc(size_t nelem) {
 }

 template <class T, T*(alloc)(size_t), class Deleter, class Memory>
-Memory safeMake(size_t nelem) {
+Memory safeAlloc(size_t nelem) {
  T* ptr = nullptr;
  try {
    ptr = alloc(nelem);
@@ -63,46 +66,98 @@ Memory safeMake(size_t nelem) {

 }  // namespace detail

+// A deleter that calls cudaFree for use with std::unique_ptr/std::shared_ptr.
 template <class T>
 struct CudaDeleter {
-  void operator()(T* ptr) {
+  using TPtrOrArray = std::conditional_t<std::is_array_v<T>, T, T*>;
+  void operator()(TPtrOrArray ptr) {
    AvoidCudaGraphCaptureGuard cgcGuard;
    MSCCLPP_CUDATHROW(cudaFree(ptr));
  }
 };

+// A deleter that calls cudaFreeHost for use with std::unique_ptr/std::shared_ptr.
 template <class T>
 struct CudaHostDeleter {
-  void operator()(T* ptr) {
+  using TPtrOrArray = std::conditional_t<std::is_array_v<T>, T, T*>;
+  void operator()(TPtrOrArray ptr) {
    AvoidCudaGraphCaptureGuard cgcGuard;
    MSCCLPP_CUDATHROW(cudaFreeHost(ptr));
  }
 };

+// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out.
 template <class T>
-std::shared_ptr<T> makeSharedCuda(size_t count = 1) {
-  return detail::safeMake<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
+std::shared_ptr<T> allocSharedCuda(size_t count = 1) {
+  return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
 }

 template <class T>
 using UniqueCudaPtr = std::unique_ptr<T, CudaDeleter<T>>;

+// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out.
 template <class T>
-UniqueCudaPtr<T> makeUniqueCuda(size_t count = 1) {
-  return detail::safeMake<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
+UniqueCudaPtr<T> allocUniqueCuda(size_t count = 1) {
+  return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, UniqueCudaPtr<T>>(count);
 }

+// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::shared_ptr to it.
+template <class T, typename... Args>
+std::shared_ptr<T> makeSharedCudaHost(Args&&... args) {
+  auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, std::shared_ptr<T>>(1);
+  new (ptr.get()) T(std::forward<Args>(args)...);
+  return ptr;
+}
+
+// Allocates an array of objects of type T with cudaHostAlloc, default constructs each element and returns a
+// std::shared_ptr to it.
 template <class T>
-std::shared_ptr<T> makeSharedCudaHost(size_t count = 1) {
-  return detail::safeMake<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, std::shared_ptr<T>>(count);
+std::shared_ptr<T[]> makeSharedCudaHost(size_t count) {
+  using TElem = std::remove_extent_t<T>;
+  auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<TElem>, std::shared_ptr<T[]>>(count);
+  for (size_t i = 0; i < count; ++i) {
+    new (&ptr[i]) TElem();
+  }
+  return ptr;
 }

 template <class T>
 using UniqueCudaHostPtr = std::unique_ptr<T, CudaHostDeleter<T>>;

+// Allocates memory with cudaHostAlloc, constructs an object of type T in it and returns a std::unique_ptr to it.
+template <class T, typename... Args, std::enable_if_t<false == std::is_array_v<T>, bool> = true>
+UniqueCudaHostPtr<T> makeUniqueCudaHost(Args&&... args) {
+  auto ptr = detail::safeAlloc<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(1);
+  new (ptr.get()) T(std::forward<Args>(args)...);
+  return ptr;
+}
+
+// Allocates an array of count objects with cudaHostAlloc, default constructs each element and returns a
+// std::unique_ptr to it. T must be an array type (e.g. U[]); the elements have type std::remove_extent_t<T>.
+template <class T, std::enable_if_t<true == std::is_array_v<T>, bool> = true>
+UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count) {
+  using TElem = std::remove_extent_t<T>;
+  auto ptr = detail::safeAlloc<TElem, detail::cudaHostCalloc<TElem>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(count);
+  for (size_t i = 0; i < count; ++i) {
+    new (&ptr[i]) TElem();
+  }
+  return ptr;
+}
+
+// Asynchronous cudaMemcpy of count elements of type T (not bytes) on the given stream, without capture into a CUDA graph.
 template <class T>
-UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count = 1) {
-  return detail::safeMake<T, detail::cudaHostCalloc<T>, CudaHostDeleter<T>, UniqueCudaHostPtr<T>>(count);
+void memcpyCudaAsync(T* dst, const T* src, size_t count, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream));
+}
+
+// Synchronous cudaMemcpy of count elements of type T (via a temporary stream) without capture into a CUDA graph.
+template <class T>
+void memcpyCuda(T* dst, const T* src, size_t count, cudaMemcpyKind kind = cudaMemcpyDefault) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  CudaStreamWithFlags stream(cudaStreamNonBlocking);
+  MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, count * sizeof(T), kind, stream));
+  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
 }

 }  // namespace mscclpp